Spaces:
Build error
Build error
| import requests, re | |
| import pandas as pd | |
| import numpy as np | |
| import urllib | |
| from app_store_scraper import AppStore | |
| from google_play_scraper import Sort, reviews_all | |
| from tqdm.notebook import tqdm | |
def search_apps(term, country, media='software', limit=100, timeout=30):
    """Query the iTunes Search API and return the hits as a DataFrame.

    Args:
        term: Search phrase, sent as the ``term`` query parameter and stored
            in the ``search_term`` column of every row.
        country: Storefront code, sent as the ``country`` query parameter and
            stored in the ``market`` column of every row.
        media: Value of the ``media`` query parameter.
        limit: Value of the ``limit`` query parameter.
        timeout: Seconds to wait for the HTTP response. Without a timeout a
            stalled connection would block the caller forever.

    Returns:
        pandas.DataFrame with one row per search result and the columns
        ``name, average_rating, rating_count, price, currency, genre,
        release_date, latest_version_release, description, release_notes,
        screenshots, languages, id, seller, seller_url, link, market,
        search_term``. Fields missing from a result are left as NaN.

    Raises:
        requests.HTTPError: If the API answers with a non-success status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
        KeyError: If the JSON payload has no ``results`` entry.
    """
    # API field name -> column name used by the rest of this module.
    # The insertion order is also the column order of the returned frame.
    column_names = {
        'trackName': 'name',
        'averageUserRating': 'average_rating',
        'userRatingCount': 'rating_count',
        'price': 'price',
        'currency': 'currency',
        'primaryGenreName': 'genre',
        'releaseDate': 'release_date',
        'currentVersionReleaseDate': 'latest_version_release',
        'description': 'description',
        'releaseNotes': 'release_notes',
        'screenshotUrls': 'screenshots',
        'languageCodesISO2A': 'languages',
        'trackId': 'id',
        'sellerName': 'seller',
        'sellerUrl': 'seller_url',
        'trackViewUrl': 'link',
    }

    response = requests.get(
        'https://itunes.apple.com/search',
        params={
            'term': term,
            'country': country,
            'media': media,
            'limit': limit,
        },
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    results = response.json()['results']

    # Restricting to the known fields keeps the schema stable even when the
    # API adds fields or omits some of them for a given app.
    data = pd.DataFrame(results, columns=list(column_names))
    data["averageUserRating"] = np.round(data["averageUserRating"], 1)
    data["market"] = country
    data["search_term"] = term

    def _date_part(value):
        # Timestamps look like "<date>T<time>"; keep the date. Non-strings
        # (NaN for a missing field) are passed through instead of crashing.
        return value.split("T")[0] if isinstance(value, str) else value

    for column in ("releaseDate", "currentVersionReleaseDate"):
        data[column] = data[column].apply(_date_part)

    return data.rename(columns=column_names)
def init_search(term, countries, media='software', limit=100):
    """Run ``search_apps`` for every term/country pair and merge the results.

    Args:
        term: Comma-separated search terms, e.g. ``"bank, budget"``.
        countries: Comma-separated storefront codes, e.g. ``"se, no"``.
        media: Passed through to ``search_apps``.
        limit: Passed through to ``search_apps``.

    Returns:
        pandas.DataFrame with the rows of all searches, a fresh 0..n-1 index,
        and rows removed that are identical in everything except
        ``search_term`` (the same app found through several terms is kept
        once, under the first term that found it).
    """
    # Split on the comma alone and trim each piece, so "se,no" and "se, no"
    # are treated the same instead of producing one bogus "se,no" market.
    terms = [piece.strip() for piece in term.split(",")]
    markets = [piece.strip() for piece in countries.split(",")]

    def _join_list(value):
        # List-valued cells are flattened to text so rows can be compared by
        # drop_duplicates; NaN (field missing for this app) is left alone.
        return ', '.join(value) if isinstance(value, (list, tuple)) else value

    frames = []
    for country in markets:
        for t in terms:
            temp = search_apps(t, country, media=media, limit=limit)
            temp['languages'] = temp['languages'].apply(_join_list)
            temp['screenshots'] = temp['screenshots'].apply(_join_list)
            frames.append(temp)

    # One concat at the end avoids re-copying the accumulated rows on every
    # iteration.
    data = pd.concat(frames, ignore_index=True)
    compare_on = data.columns.difference(['search_term']).tolist()
    data = data.drop_duplicates(subset=compare_on)
    return data.reset_index(drop=True)
def to_csv(data):
    """Serialize ``data`` to CSV text without the index, as UTF-8 bytes."""
    return data.to_csv(index=False).encode('utf-8')
def extract_name_id(url):
    """Pull the app name and id out of a URL containing ``/app/<name>/id<id>``.

    Args:
        url: URL string to inspect.

    Returns:
        ``(name, app_id)``: the percent-decoded name segment and the id as a
        string. ``(None, None)`` if ``url`` is not a string or does not
        contain the pattern.
    """
    # Imported here so the function does not depend on some other module
    # having loaded the ``urllib.parse`` submodule as a side effect.
    from urllib.parse import unquote

    if not isinstance(url, str):
        return None, None

    # The id stops at the next path separator, query string or fragment.
    match = re.search(r'/app/([^/]+)/id([^/?#]+)', url)
    if match is None:
        return None, None

    name = unquote(match.group(1))
    app_id = match.group(2)
    return name, app_id
def get_dev_comments(row):
    """Return ``row["body"]`` when available, otherwise ``row`` unchanged.

    Used to reduce a developer-response cell to its text. Values that cannot
    be indexed with ``"body"`` (for example NaN or None) or that have no such
    entry are passed through untouched.
    """
    try:
        return row["body"]
    except (TypeError, KeyError, IndexError):
        # Only "no body here" is handled; anything else (KeyboardInterrupt
        # included) must not be swallowed.
        return row
# Column layout shared by the iOS and Android review frames.
_REVIEW_COLUMNS = ['application', 'os', 'date', 'userName', 'review', 'rating',
                   'developerResponse', 'app_id']


def _latest_review_date(data, client, os_name, app_id):
    """Return the newest review date already collected for one app, or None."""
    if not {"date", "app_id"}.issubset(data.columns):
        # Nothing has been collected yet.
        return None
    collected = data[
        (data["application"] == client)
        & (data["os"] == os_name)
        & (data["app_id"] == app_id)
    ]
    dates = collected["date"].dropna()
    return None if dates.empty else dates.max()


def _collect_ios(client, app_name, app_id, data, country):
    """Fetch App Store reviews for one app; return a frame, or None if empty."""
    scraper = AppStore(country=country, app_name=app_name, app_id=app_id)

    # If this app was collected earlier in the run, only ask for newer reviews.
    last_date = _latest_review_date(data, client, "iOS", app_id)
    pause = np.random.randint(5, 8)
    if last_date is None:
        scraper.review(sleep=pause)
    else:
        scraper.review(sleep=pause, after=last_date)

    if not scraper.reviews:
        return None

    # scraper.reviews is a list of dicts, one per review.
    ios = pd.DataFrame(scraper.reviews)
    ios["application"] = client
    ios["os"] = "iOS"
    ios["app_id"] = app_id
    # The developer response, when present, is a nested value; keep its text.
    if "developerResponse" in ios.columns:
        ios["developerResponse"] = ios["developerResponse"].apply(get_dev_comments)
    else:
        ios["developerResponse"] = np.nan
    return ios[_REVIEW_COLUMNS]


def _collect_android(client, package, data, lang):
    """Fetch Google Play reviews for one app as a frame (possibly empty)."""
    fetched = reviews_all(package, sleep_milliseconds=10, lang=lang, sort=Sort.NEWEST)
    android = pd.DataFrame(fetched)
    if android.empty:
        # No reviews at all: the frame has no columns, so there is nothing to
        # rename or select. Return an empty frame with the expected layout.
        return pd.DataFrame(columns=_REVIEW_COLUMNS)

    last_date = _latest_review_date(data, client, "Android", package)
    if last_date is not None:
        android = android[android["at"] > last_date]

    android = android.rename(columns={
        "at": "date",
        "content": "review",
        "replyContent": "developerResponse",
        "score": "rating",
    })
    android["os"] = "Android"
    android["application"] = client
    # The Google Play package name identifies the Android app.
    android["app_id"] = package
    return android[_REVIEW_COLUMNS]


def fetch_data(items, country='se', lang='sv'):
    """Collect App Store and Google Play reviews for a set of clients.

    Args:
        items: Iterable of ``(client, info)`` pairs. ``info["iOS"]`` is a
            mapping of App Store app name to app id. ``info["Android"]`` is
            the Google Play package name, or None if there is no Android app.
        country: App Store storefront passed to the App Store scraper.
        lang: Language code passed to the Google Play scraper.

    Returns:
        pandas.DataFrame with the columns ``os, application, date, userName,
        review, rating, developerResponse, app_id``, one row per review. The
        index is not reset, so index labels repeat across apps.
    """
    data = pd.DataFrame(columns=["os", "application"])
    for client, info in tqdm(items):
        #### APPSTORE ####
        for app_name, app_id in info["iOS"].items():
            ios = _collect_ios(client, app_name, app_id, data, country)
            if ios is not None:
                data = pd.concat([data, ios])
                print(f"Collected {len(ios)} items from {client} from AppStore")

        #### GOOGLE PLAY ####
        package = info.get("Android")
        if package is not None:
            android = _collect_android(client, package, data, lang)
            data = pd.concat([data, android])
            print(f"Collected {len(android)} items from {client} from Google Play")
    return data