"""Helpers for searching the iTunes store and collecting app reviews.

The module offers three groups of functionality:

* ``search_apps`` / ``init_search`` query the iTunes search endpoint and
  return the hits as pandas DataFrames.
* ``to_csv`` / ``extract_name_id`` are small formatting utilities.
* ``fetch_data`` collects reviews through ``app_store_scraper`` and
  ``google_play_scraper`` into one DataFrame.
"""

import re
import urllib
import urllib.parse

import numpy as np
import pandas as pd
import requests
from app_store_scraper import AppStore
from google_play_scraper import Sort, reviews_all
from tqdm.notebook import tqdm

ITUNES_SEARCH_URL = 'https://itunes.apple.com/search'

# Seconds to wait for the search endpoint before giving up; without a
# timeout ``requests.get`` can block indefinitely.
REQUEST_TIMEOUT = 30

# Fields read from every search hit, mapped to the column names used in the
# returned DataFrame. The insertion order defines the column order.
_SEARCH_COLUMNS = {
    'trackName': 'name',
    'averageUserRating': 'average_rating',
    'userRatingCount': 'rating_count',
    'price': 'price',
    'currency': 'currency',
    'primaryGenreName': 'genre',
    'releaseDate': 'release_date',
    'currentVersionReleaseDate': 'latest_version_release',
    'description': 'description',
    'releaseNotes': 'release_notes',
    'screenshotUrls': 'screenshots',
    'languageCodesISO2A': 'languages',
    'trackId': 'id',
    'sellerName': 'seller',
    'sellerUrl': 'seller_url',
    'trackViewUrl': 'link',
}

# Column layout shared by the iOS and Android review frames.
_REVIEW_COLUMNS = [
    'application', 'os', 'date', 'userName',
    'review', 'rating', 'developerResponse', 'app_id',
]

# Google Play field names -> the names used in the combined review frame.
_ANDROID_RENAME = {
    "at": "date",
    "content": "review",
    "replyContent": "developerResponse",
    "score": "rating",
}


def _date_part(value):
    """Return the part of *value* before the first ``T``; pass non-strings through."""
    if isinstance(value, str):
        return value.split("T")[0]
    return value


def _join_list(value):
    """Join a list/tuple into a ``', '`` separated string; pass other values through."""
    if isinstance(value, (list, tuple)):
        return ', '.join(value)
    return value


def search_apps(term, country, media='software', limit=100):
    """Search the iTunes store and return the hits as a DataFrame.

    Args:
        term: Search term sent to the endpoint.
        country: Country/storefront code sent to the endpoint; also stored
            in the ``market`` column of the result.
        media: Media type sent to the endpoint.
        limit: Maximum number of results requested.

    Returns:
        A DataFrame with the renamed columns from ``_SEARCH_COLUMNS`` plus
        ``market`` and ``search_term``. Fields missing from a hit are NaN.

    Raises:
        requests.RequestException: On network failure or timeout.
        KeyError: If the JSON response has no ``results`` entry.
    """
    params = {
        'term': term,
        'country': country,
        'media': media,
        'limit': limit,
    }
    response = requests.get(ITUNES_SEARCH_URL, params=params, timeout=REQUEST_TIMEOUT)
    results = response.json()['results']

    # Restricting to the known columns makes missing fields show up as NaN
    # instead of changing the shape of the frame.
    data = pd.DataFrame(results, columns=list(_SEARCH_COLUMNS))

    # Coerce first so an empty (object-dtype) column can still be rounded.
    data["averageUserRating"] = pd.to_numeric(
        data["averageUserRating"], errors="coerce"
    ).round(1)
    data["market"] = country
    data["search_term"] = term

    # Keep only the date part of the timestamps. Hits without a date carry
    # NaN here, which previously crashed on ``.split``.
    for column in ("releaseDate", "currentVersionReleaseDate"):
        data[column] = data[column].apply(_date_part)

    return data.rename(columns=_SEARCH_COLUMNS)


def init_search(term, countries, media='software', limit=100):
    """Run ``search_apps`` for every term/country combination.

    Args:
        term: One or more search terms separated by ``", "``.
        countries: One or more country codes separated by ``", "``.
        media: Forwarded to ``search_apps``.
        limit: Forwarded to ``search_apps``.

    Returns:
        A single DataFrame with all hits. Rows that are identical in every
        column except ``search_term`` are dropped (first occurrence kept)
        and the index is reset.
    """
    terms = term.split(", ")
    markets = countries.split(", ")

    frames = []
    for country in markets:
        for single_term in terms:
            hits = search_apps(single_term, country, media=media, limit=limit)
            # Lists are flattened to strings so the rows become hashable for
            # drop_duplicates; missing values (NaN) are left untouched.
            hits['languages'] = hits['languages'].apply(_join_list)
            hits['screenshots'] = hits['screenshots'].apply(_join_list)
            frames.append(hits)

    if not frames:
        return pd.DataFrame()

    # One concat at the end instead of growing the frame inside the loop.
    data = pd.concat(frames, ignore_index=True)
    dedupe_on = [column for column in data.columns if column != 'search_term']
    data = data.drop_duplicates(subset=dedupe_on)
    return data.reset_index(drop=True)


def to_csv(data):
    """Serialise *data* to CSV (without the index) and return UTF-8 bytes."""
    csv = data.to_csv(index=False)
    return csv.encode('utf-8')


def extract_name_id(url):
    """Extract the app name and id from a URL of the form ``.../app/<name>/id<id>``.

    Args:
        url: The URL to inspect.

    Returns:
        ``(name, app_id)`` where ``name`` is URL-decoded and ``app_id`` is
        the text following ``id``; ``(None, None)`` if the URL does not match.
    """
    pattern = r'/app/([^/]+)/id([^/?]+)'
    match = re.search(pattern, url)
    if not match:
        return None, None
    name = urllib.parse.unquote(match.group(1))
    app_id = match.group(2)
    return name, app_id


def get_dev_comments(row):
    """Return ``row["body"]`` when available, otherwise *row* unchanged.

    Values that cannot be indexed with ``"body"`` (for example NaN for
    reviews without a developer response) are returned as they are.
    """
    try:
        return row["body"]
    except (TypeError, KeyError, IndexError):
        return row


def _latest_review_date(data, client, os_name):
    """Return the newest ``date`` stored for *client* on *os_name*, or ``None``."""
    if "date" not in data.columns:
        # Nothing has been collected yet, so there is no cut-off date.
        return None
    mask = (data["application"] == client) & (data["os"] == os_name)
    latest = data.loc[mask, "date"].max()
    if pd.isna(latest):
        return None
    return latest


def fetch_data(items, country='se', lang='sv'):
    """Collect App Store and Google Play reviews for a set of clients.

    Args:
        items: Iterable of ``(client, info)`` pairs. ``info["iOS"]`` is a
            mapping of app name to app id; ``info["Android"]`` is the
            Google Play app id or ``None``.
        country: Country passed to ``AppStore``.
        lang: Language passed to ``reviews_all``.

    Returns:
        A DataFrame with the columns in ``_REVIEW_COLUMNS``, one row per
        review. The index is not reset between sources.
    """
    data = pd.DataFrame(columns=["os", "application"])

    for client, info in tqdm(items):
        #### APPSTORE ####
        for app_name, app_id in info["iOS"].items():
            reviews = AppStore(country=country, app_name=app_name, app_id=app_id)

            # If reviews for this client were already collected, only ask
            # for newer ones.
            # NOTE(review): `data` starts empty on every call, so a cut-off
            # only exists when the same client contributes several apps or
            # appears more than once in `items` - confirm that is intended.
            last_date = _latest_review_date(data, client, "iOS")
            pause = np.random.randint(5, 8)
            if last_date is not None:
                reviews.review(sleep=pause, after=last_date)
            else:
                reviews.review(sleep=pause)

            if len(reviews.reviews) > 0:
                # One row per review, one column per review field.
                ios = pd.DataFrame(list(reviews.reviews))

                ios["application"] = client
                ios["os"] = "iOS"
                ios["app_id"] = app_id

                # Reduce the developer response to its text body when present.
                if "developerResponse" in ios.columns:
                    ios["developerResponse"] = ios["developerResponse"].apply(get_dev_comments)
                else:
                    ios["developerResponse"] = np.nan

                ios = ios[_REVIEW_COLUMNS]
                data = pd.concat([data, ios])
                print(f"Collected {len(ios)} items from {client} from AppStore")

        #### GOOGLE PLAY ####
        android_id = info["Android"]
        if android_id is not None:
            android = reviews_all(android_id, sleep_milliseconds=10, lang=lang, sort=Sort.NEWEST)
            android = pd.DataFrame(android)

            if android.empty:
                # No reviews means no "at" column; selecting it would raise.
                print(f"Collected 0 items from {client} from Google Play")
                continue

            last_date = _latest_review_date(data, client, "Android")
            if last_date is not None:
                android = android[android["at"] > last_date]

            android = android.rename(columns=_ANDROID_RENAME)
            android["os"] = "Android"
            android["application"] = client
            # Use the Google Play id; the previous code reused the iOS loop
            # variable and so stored the wrong id (or raised NameError when
            # the client had no iOS apps).
            android["app_id"] = android_id
            android = android[_REVIEW_COLUMNS]
            data = pd.concat([data, android])
            print(f"Collected {len(android)} items from {client} from Google Play")

    return data