# Spaces: Build error
# File size: 5,793 Bytes
# e80e1f0
import requests, re
import pandas as pd
import numpy as np
import urllib
from app_store_scraper import AppStore
from google_play_scraper import Sort, reviews_all
from tqdm.notebook import tqdm
def search_apps(term, country, media='software', limit=100):
    """Search the iTunes Search endpoint and return the hits as a DataFrame.

    Args:
        term: Search phrase sent as the ``term`` query parameter.
        country: Storefront code sent as the ``country`` query parameter;
            also stored in the ``market`` column of the result.
        media: Value of the ``media`` query parameter.
        limit: Value of the ``limit`` query parameter.

    Returns:
        A DataFrame with one row per result and the columns ``name``,
        ``average_rating``, ``rating_count``, ``price``, ``currency``,
        ``genre``, ``release_date``, ``latest_version_release``,
        ``description``, ``release_notes``, ``screenshots``, ``languages``,
        ``id``, ``seller``, ``seller_url``, ``link``, ``market`` and
        ``search_term``. Fields absent from a result are left as NaN.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If the endpoint does not answer within 30 seconds.
    """
    base_url = 'https://itunes.apple.com/search'
    params = {
        'term': term,
        'country': country,
        'media': media,
        'limit': limit,
    }
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(base_url, params=params, timeout=30)
    response.raise_for_status()
    results = response.json().get('results', [])

    # Passing `columns` both selects and orders the fields; fields missing
    # from every result still show up as all-NaN columns.
    data = pd.DataFrame(results, columns=[
        'trackName', 'averageUserRating', 'userRatingCount',
        'price', 'currency', 'primaryGenreName', 'releaseDate',
        'currentVersionReleaseDate', 'description', "releaseNotes", "screenshotUrls",
        'languageCodesISO2A', 'trackId', 'sellerName', "sellerUrl", 'trackViewUrl',
    ])

    ratings = pd.to_numeric(data["averageUserRating"], errors="coerce")
    data["averageUserRating"] = ratings.round(1)
    data["market"] = country
    data["search_term"] = term

    def date_part(stamp):
        # Keep only the part before "T"; leave missing values (NaN) untouched
        # instead of failing on them.
        return stamp.split("T")[0] if isinstance(stamp, str) else stamp

    data["releaseDate"] = data["releaseDate"].apply(date_part)
    data["currentVersionReleaseDate"] = data["currentVersionReleaseDate"].apply(date_part)

    data.columns = [
        'name', 'average_rating', 'rating_count', 'price', 'currency',
        'genre', 'release_date', 'latest_version_release', 'description', 'release_notes',
        "screenshots", 'languages', 'id', 'seller', 'seller_url', 'link', 'market', 'search_term',
    ]
    return data
def init_search(term, countries, media='software', limit=100):
    """Run ``search_apps`` for every term/country pair and merge the results.

    Args:
        term: Comma-separated search terms, e.g. ``"bank, budget"``.
        countries: Comma-separated country codes, e.g. ``"se, us"``.
        media: Forwarded to ``search_apps``.
        limit: Forwarded to ``search_apps``.

    Returns:
        One DataFrame with the rows of all searches. The ``languages`` and
        ``screenshots`` cells are flattened to ``", "``-joined strings, rows
        that are identical in every column except ``search_term`` are
        dropped, and the index is renumbered from 0.
    """
    # Split on the bare comma and strip, so "se,us" and "se, us" behave alike.
    terms = [piece.strip() for piece in term.split(",")]
    markets = [piece.strip() for piece in countries.split(",")]

    def flatten(value):
        # List cells become one string; a missing cell (NaN) becomes "" rather
        # than crashing str.join.
        if isinstance(value, (list, tuple)):
            return ', '.join(str(item) for item in value)
        if isinstance(value, str):
            return value
        return ''

    frames = []
    for country in markets:
        for single_term in terms:
            temp = search_apps(single_term, country, media=media, limit=limit)
            temp['languages'] = temp['languages'].apply(flatten)
            temp['screenshots'] = temp['screenshots'].apply(flatten)
            frames.append(temp)

    # Concatenate once instead of growing a frame inside the loop.
    data = pd.concat(frames, ignore_index=True)
    data = data.drop_duplicates(subset=data.columns.difference(['search_term']))
    return data.reset_index(drop=True)
def to_csv(data):
    """Serialize *data* to CSV text without the index and return UTF-8 bytes."""
    return data.to_csv(index=False).encode('utf-8')
def extract_name_id(url):
    """Extract the app name and id from a URL containing ``/app/<name>/id<id>``.

    Args:
        url: URL string to inspect.

    Returns:
        ``(name, app_id)`` where ``name`` is the percent-decoded path segment
        after ``/app/`` and ``app_id`` is the text following ``id`` up to the
        next ``/`` or ``?``. ``(None, None)`` when *url* is not a string or
        does not contain the pattern.
    """
    # Import the submodule explicitly: a plain `import urllib` does not
    # guarantee that `urllib.parse` is loaded.
    from urllib.parse import unquote

    if not isinstance(url, str):
        return None, None

    pattern = r'/app/([^/]+)/id([^/?]+)'
    match = re.search(pattern, url)
    if match is None:
        return None, None
    return unquote(match.group(1)), match.group(2)
def get_dev_comments(row):
    """Return ``row["body"]`` when available, otherwise *row* unchanged.

    Args:
        row: Usually a mapping with a ``"body"`` key; any other value
            (for example NaN or None) is passed through as-is.

    Returns:
        The value stored under ``"body"``, or *row* itself when it cannot be
        indexed with ``"body"``.
    """
    try:
        return row["body"]
    except (KeyError, TypeError, IndexError):
        # Not subscriptable, or no "body" entry: hand the value back untouched.
        return row
def _latest_review_date(data, client, os_name, app_id):
    """Return the newest stored review date for one app, or None if there is none."""
    if "date" not in data.columns or "app_id" not in data.columns:
        return None
    mask = (
        (data["application"] == client)
        & (data["os"] == os_name)
        & (data["app_id"] == app_id)
    )
    newest = data.loc[mask, "date"].max()
    if pd.isna(newest):
        return None
    # Hand back a plain datetime so it can be passed on as a cut-off date.
    return pd.Timestamp(newest).to_pydatetime()


def fetch_data(items):
    """Collect App Store and Google Play reviews for every client in *items*.

    Args:
        items: Iterable of ``(client, info)`` pairs. ``info["iOS"]`` maps
            app names to App Store ids, and ``info["Android"]`` is the id
            passed to ``reviews_all``, or None when there is no Android app.

    Returns:
        A DataFrame with the columns ``os``, ``application``, ``date``,
        ``userName``, ``review``, ``rating``, ``developerResponse`` and
        ``app_id``. Only ``os`` and ``application`` exist when nothing was
        collected. The index is not renumbered across apps.
    """
    review_columns = ['application', 'os', 'date', 'userName', 'review', 'rating',
                      'developerResponse', 'app_id']
    android_rename = {
        "at": "date",
        "content": "review",
        "replyContent": "developerResponse",
        "score": "rating",
    }

    data = pd.DataFrame(columns=["os", "application"])
    for client, info in tqdm(items):
        #### APPSTORE ####
        for app_name, app_id in (info["iOS"] or {}).items():
            store = AppStore(country='se', app_name=app_name, app_id=app_id)
            # If this app already has rows in `data`, only fetch newer reviews.
            # The lookup is per app so that a second app of the same client is
            # not cut off at the first app's newest review.
            last_date = _latest_review_date(data, client, "iOS", app_id)
            store.review(sleep=np.random.randint(5, 8), after=last_date)

            if store.reviews:
                # Each review is a dict; expand the dicts into columns.
                ios = pd.DataFrame(list(store.reviews))
                ios["application"] = client
                ios["os"] = "iOS"
                ios["app_id"] = app_id
                # Developer responses are nested; keep only their text body.
                if "developerResponse" in ios.columns:
                    ios["developerResponse"] = ios["developerResponse"].apply(get_dev_comments)
                else:
                    ios["developerResponse"] = np.nan
                ios = ios[review_columns]
                data = pd.concat([data, ios])
                print(f"Collected {len(ios)} items from {client} from AppStore")

        #### GOOGLE PLAY ####
        android_id = info["Android"]
        if android_id is not None:
            fetched = reviews_all(android_id, sleep_milliseconds=10, lang='sv', sort=Sort.NEWEST)
            android = pd.DataFrame(fetched)
            # An empty result has no columns at all, so skip it entirely.
            if not android.empty:
                last_date = _latest_review_date(data, client, "Android", android_id)
                if last_date is not None:
                    android = android[android["at"] > last_date]
                android = android.rename(columns=android_rename)
                android["os"] = "Android"
                android["application"] = client
                # Use the Android id itself, not the id left over from the
                # iOS loop above.
                android["app_id"] = android_id
                android = android[review_columns]
                data = pd.concat([data, android])
                print(f"Collected {len(android)} items from {client} from Google Play")
    return data