apps-beta / functions.py
felix-weiland's picture
Upload 2 files
e80e1f0
import requests, re
import pandas as pd
import numpy as np
import urllib
from app_store_scraper import AppStore
from google_play_scraper import Sort, reviews_all
from tqdm.notebook import tqdm
def search_apps(term, country, media='software', limit=100, timeout=30):
    """Query the iTunes Search API and return the hits as a DataFrame.

    Args:
        term: Search phrase sent as the ``term`` query parameter.
        country: Storefront code sent as the ``country`` query parameter;
            also stored in the ``market`` column of the result.
        media: Value of the ``media`` query parameter.
        limit: Value of the ``limit`` query parameter.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests`` can block indefinitely.

    Returns:
        A DataFrame with one row per hit and the columns ``name``,
        ``average_rating``, ``rating_count``, ``price``, ``currency``,
        ``genre``, ``release_date``, ``latest_version_release``,
        ``description``, ``release_notes``, ``screenshots``, ``languages``,
        ``id``, ``seller``, ``seller_url``, ``link``, ``market`` and
        ``search_term``. Fields absent from a hit are left as NaN.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no answer arrives within ``timeout`` seconds.
    """
    base_url = 'https://itunes.apple.com/search'
    params = {
        'term': term,
        'country': country,
        'media': media,
        'limit': limit,
    }
    response = requests.get(base_url, params=params, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    results = response.json()['results']

    # API field name -> output column name, in output order. An explicit
    # mapping keeps the rename correct even if the selection list is edited.
    field_names = {
        'trackName': 'name',
        'averageUserRating': 'average_rating',
        'userRatingCount': 'rating_count',
        'price': 'price',
        'currency': 'currency',
        'primaryGenreName': 'genre',
        'releaseDate': 'release_date',
        'currentVersionReleaseDate': 'latest_version_release',
        'description': 'description',
        'releaseNotes': 'release_notes',
        'screenshotUrls': 'screenshots',
        'languageCodesISO2A': 'languages',
        'trackId': 'id',
        'sellerName': 'seller',
        'sellerUrl': 'seller_url',
        'trackViewUrl': 'link',
    }

    def _date_only(value):
        # Keep the part before "T" of a timestamp string; pass missing
        # values (NaN) through untouched instead of crashing on .split().
        return value.split("T")[0] if isinstance(value, str) else value

    data = pd.DataFrame(results, columns=list(field_names))
    # to_numeric makes rounding safe when the column is empty or object-typed.
    data["averageUserRating"] = pd.to_numeric(
        data["averageUserRating"], errors="coerce"
    ).round(1)
    data["market"] = country
    data["search_term"] = term
    data["releaseDate"] = data["releaseDate"].apply(_date_only)
    data["currentVersionReleaseDate"] = data["currentVersionReleaseDate"].apply(_date_only)

    return data.rename(columns=field_names)
def init_search(term, countries, media='software', limit=100):
    """Run ``search_apps`` for every term/country pair and merge the results.

    Args:
        term: One or more search phrases separated by ``", "``.
        countries: One or more storefront codes separated by ``", "``.
        media: Forwarded to ``search_apps``.
        limit: Forwarded to ``search_apps``.

    Returns:
        A single DataFrame with a fresh 0..n-1 index. The ``languages`` and
        ``screenshots`` list columns are flattened to comma-separated
        strings, and rows that are identical in every column except
        ``search_term`` are collapsed to their first occurrence.
    """
    terms = term.split(", ")
    markets = countries.split(", ")

    def _join_list(value):
        # Flatten a list of strings; leave missing values (NaN) as they are,
        # because ', '.join(NaN) would raise TypeError.
        return ', '.join(value) if isinstance(value, (list, tuple)) else value

    # Collect the per-query frames and concatenate once at the end; growing
    # a DataFrame inside the loop copies all accumulated rows every time.
    frames = []
    for country in markets:
        for t in terms:
            temp = search_apps(t, country, media=media, limit=limit)
            temp['languages'] = temp['languages'].apply(_join_list)
            temp['screenshots'] = temp['screenshots'].apply(_join_list)
            frames.append(temp)

    data = pd.concat(frames, ignore_index=True)
    # The same app can be returned for several search terms in one market;
    # keep only the first such hit.
    data = data.drop_duplicates(subset=data.columns.difference(['search_term']))
    return data.reset_index(drop=True)
def to_csv(data):
    """Serialise ``data`` to CSV (without the index) as UTF-8 encoded bytes."""
    return data.to_csv(index=False).encode('utf-8')
def extract_name_id(url):
    """Extract the app slug and numeric id from an App Store URL.

    Looks for a ``/app/<name>/id<id>`` segment anywhere in ``url``.

    Args:
        url: The URL string to inspect.

    Returns:
        ``(name, app_id)`` where ``name`` is the percent-decoded slug and
        ``app_id`` is the text following ``id`` up to the next ``/`` or
        ``?``; ``(None, None)`` when the URL has no such segment.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` is loaded.
    from urllib.parse import unquote

    match = re.search(r'/app/([^/]+)/id([^/?]+)', url)
    if match is None:
        return None, None
    name = unquote(match.group(1))
    app_id = match.group(2)
    return name, app_id
def get_dev_comments(row):
    """Return ``row["body"]`` when available, otherwise ``row`` unchanged.

    Used to reduce a developer-response value to its text. Values that
    cannot be indexed (e.g. NaN or None) or that lack a ``"body"`` entry
    are passed through as-is.
    """
    try:
        return row["body"]
    except (TypeError, LookupError):
        # TypeError: value is not subscriptable; LookupError: no "body" key.
        # Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
        return row
# Columns every per-store frame is reduced to.
_REVIEW_COLUMNS = ['application', 'os', 'date', 'userName', 'review', 'rating',
                   'developerResponse', 'app_id']

# Column order of the combined result ("os" and "application" first).
_OUTPUT_COLUMNS = ['os', 'application', 'date', 'userName', 'review', 'rating',
                   'developerResponse', 'app_id']

# Google Play field name -> common column name.
_PLAY_RENAME = {
    "at": "date",
    "content": "review",
    "replyContent": "developerResponse",
    "score": "rating",
}


def _fetch_app_store_reviews(client, app_name, app_id, country, after=None):
    """Return one App Store app's reviews as a frame, or None if there are none."""
    store = AppStore(country=country, app_name=app_name, app_id=app_id)
    # Random pause of 5-7 seconds passed to the scraper as its sleep interval.
    pause = np.random.randint(5, 8)
    # Only pass ``after`` when a real timestamp exists; a missing/NaN cutoff
    # is not a datetime (NOTE(review): the scraper appears to reject
    # non-datetime values -- confirm against the installed version).
    if after is None:
        store.review(sleep=pause)
    else:
        store.review(sleep=pause, after=after)

    if len(store.reviews) == 0:
        return None

    # ``store.reviews`` is used as a list of per-review records.
    ios = pd.DataFrame(store.reviews)
    ios["application"] = client
    ios["os"] = "iOS"
    ios["app_id"] = app_id
    # Reduce a developer response to its text, or add the column when no
    # review in this batch had one.
    if "developerResponse" in ios.columns:
        ios["developerResponse"] = ios["developerResponse"].apply(get_dev_comments)
    else:
        ios["developerResponse"] = np.nan
    return ios[_REVIEW_COLUMNS]


def _fetch_google_play_reviews(client, package, lang, after=None):
    """Return one Google Play app's reviews as a frame, or None if there are none."""
    android = pd.DataFrame(
        reviews_all(package, sleep_milliseconds=10, lang=lang, sort=Sort.NEWEST)
    )
    if after is not None and not android.empty:
        android = android[android["at"] > after]
    # An empty result has no columns, so the column selection below would
    # raise KeyError; report "nothing collected" instead.
    if android.empty:
        return None

    android = android.rename(columns=_PLAY_RENAME)
    android["os"] = "Android"
    android["application"] = client
    android["app_id"] = package
    return android[_REVIEW_COLUMNS]


def fetch_data(items, *, appstore_country='se', play_lang='sv'):
    """Collect App Store and Google Play reviews for a set of clients.

    Args:
        items: Iterable of ``(client, info)`` pairs. ``info["iOS"]`` is a
            mapping of App Store app name to app id and ``info["Android"]``
            is a Google Play app identifier or ``None``.
        appstore_country: Country passed to the App Store scraper.
        play_lang: Language passed to the Google Play scraper.

    Returns:
        A DataFrame with the columns ``os``, ``application``, ``date``,
        ``userName``, ``review``, ``rating``, ``developerResponse`` and
        ``app_id``; each source frame keeps its own index. When nothing was
        collected, an empty frame with only ``os`` and ``application``.

    Notes:
        * Android rows carry the Google Play identifier in ``app_id``.
          Previously they reused whatever ``app_id`` was left over from the
          iOS loop, which was wrong and raised NameError for clients
          without iOS apps.
        * The "only fetch newer reviews" cutoff is tracked per
          ``(store, client, app id)``, so several apps belonging to one
          client no longer filter each other's reviews.
    """
    frames = []
    # Newest review date already collected, keyed by (store, client, app id).
    newest = {}

    for client, info in tqdm(items):
        #### APPSTORE ####
        for app_name, app_id in info["iOS"].items():
            key = ("iOS", client, app_id)
            ios = _fetch_app_store_reviews(
                client, app_name, app_id, appstore_country, after=newest.get(key)
            )
            if ios is not None:
                frames.append(ios)
                newest[key] = ios["date"].max()
                print(f"Collected {len(ios)} items from {client} from AppStore")

        #### GOOGLE PLAY ####
        package = info["Android"]
        if package is not None:
            key = ("Android", client, package)
            android = _fetch_google_play_reviews(
                client, package, play_lang, after=newest.get(key)
            )
            count = 0
            if android is not None:
                frames.append(android)
                newest[key] = android["date"].max()
                count = len(android)
            print(f"Collected {count} items from {client} from Google Play")

    if not frames:
        return pd.DataFrame(columns=["os", "application"])
    # Concatenate once instead of re-copying the growing frame per app.
    return pd.concat(frames)[_OUTPUT_COLUMNS]