File size: 5,793 Bytes
e80e1f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import requests, re
import pandas as pd
import numpy as np
import urllib
from app_store_scraper import AppStore
from google_play_scraper import Sort, reviews_all
from tqdm.notebook import tqdm

def search_apps(term, country, media='software', limit=100):
    """Query the iTunes Search API and return the hits as a DataFrame.

    Args:
        term: Search phrase sent as the ``term`` query parameter.
        country: Storefront code sent as the ``country`` query parameter;
            also stored in the ``market`` column of the result.
        media: Value of the ``media`` query parameter.
        limit: Value of the ``limit`` query parameter.

    Returns:
        A DataFrame with one row per result and the columns ``name``,
        ``average_rating``, ``rating_count``, ``price``, ``currency``,
        ``genre``, ``release_date``, ``latest_version_release``,
        ``description``, ``release_notes``, ``screenshots``, ``languages``,
        ``id``, ``seller``, ``seller_url``, ``link``, ``market`` and
        ``search_term``. Fields absent from a result are left as NaN.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within 30 seconds.
    """
    base_url = 'https://itunes.apple.com/search'
    params = {
        'term': term,
        'country': country,
        'media': media,
        'limit': limit,
    }
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(base_url, params=params, timeout=30)
    response.raise_for_status()
    results = response.json().get('results', [])

    # API field -> output column. Renaming by name (instead of overwriting
    # ``data.columns`` positionally) keeps the two lists from drifting apart.
    column_names = {
        'trackName': 'name',
        'averageUserRating': 'average_rating',
        'userRatingCount': 'rating_count',
        'price': 'price',
        'currency': 'currency',
        'primaryGenreName': 'genre',
        'releaseDate': 'release_date',
        'currentVersionReleaseDate': 'latest_version_release',
        'description': 'description',
        'releaseNotes': 'release_notes',
        'screenshotUrls': 'screenshots',
        'languageCodesISO2A': 'languages',
        'trackId': 'id',
        'sellerName': 'seller',
        'sellerUrl': 'seller_url',
        'trackViewUrl': 'link',
    }

    def date_part(value):
        # Keep only the text before "T"; non-string values (e.g. NaN for a
        # missing field) pass through instead of raising.
        return value.split("T")[0] if isinstance(value, str) else value

    data = pd.DataFrame(results, columns=list(column_names))
    data["averageUserRating"] = pd.to_numeric(
        data["averageUserRating"], errors="coerce"
    ).round(1)
    for date_column in ("releaseDate", "currentVersionReleaseDate"):
        data[date_column] = data[date_column].apply(date_part)

    data = data.rename(columns=column_names)
    data["market"] = country
    data["search_term"] = term

    return data


def init_search(term, countries, media='software', limit=100):
    """Run ``search_apps`` for every term/country pair and merge the results.

    Args:
        term: Comma-separated search terms, e.g. ``"bank, wallet"``.
        countries: Comma-separated storefront codes, e.g. ``"se, us"``.
        media: Passed through to ``search_apps``.
        limit: Passed through to ``search_apps``.

    Returns:
        One DataFrame with the rows of all searches. The ``languages`` and
        ``screenshots`` columns are flattened to comma-separated strings, and
        rows that are identical in every column except ``search_term`` are
        dropped. The index is reset.
    """
    # Surrounding whitespace is stripped so "a,b" and "a, b" behave the same.
    terms = [part.strip() for part in term.split(",")]
    markets = [part.strip() for part in countries.split(",")]

    def join_values(value):
        # Lists become "a, b, c"; anything else (e.g. NaN when the API omitted
        # the field) is passed through instead of raising in str.join.
        if isinstance(value, (list, tuple)):
            return ', '.join(value)
        return value

    frames = []
    for country in markets:
        for single_term in terms:
            hits = search_apps(single_term, country, media=media, limit=limit)
            hits['languages'] = hits['languages'].apply(join_values)
            hits['screenshots'] = hits['screenshots'].apply(join_values)
            frames.append(hits)

    # Concatenate once; concatenating inside the loop re-copies every
    # previously collected row on each iteration.
    data = pd.concat(frames, ignore_index=True)

    data = data.drop_duplicates(subset=data.columns.difference(['search_term']))
    data = data.reset_index(drop=True)

    return data

def to_csv(data):
    """Serialise *data* as CSV text without the index and return UTF-8 bytes."""
    return data.to_csv(index=False).encode('utf-8')

def extract_name_id(url):
    """Extract the app name and id from a URL of the form ``.../app/<name>/id<id>``.

    Args:
        url: URL string to inspect.

    Returns:
        A ``(name, app_id)`` tuple of strings, with percent-escapes in the
        name decoded, or ``(None, None)`` when the URL does not contain the
        ``/app/<name>/id<id>`` segment.
    """
    # Imported explicitly: a bare ``import urllib`` does not guarantee that the
    # ``urllib.parse`` submodule has been loaded.
    from urllib.parse import unquote

    # The id stops at "/", "?" or "#" so neither a query string nor a
    # fragment ends up inside it.
    match = re.search(r'/app/([^/]+)/id([^/?#]+)', url)
    if match is None:
        return None, None

    name = unquote(match.group(1))
    app_id = match.group(2)
    return name, app_id
    
def get_dev_comments(row):
    """Return ``row["body"]`` when available, otherwise ``row`` unchanged.

    Used to flatten a developer-response value: when ``row`` supports the
    ``"body"`` lookup that entry is returned; when it does not (no such key,
    or a value that cannot be indexed by a string, such as NaN) the input is
    handed back as-is.
    """
    try:
        return row["body"]
    except (KeyError, TypeError, IndexError):
        # Only lookup failures are expected here; a bare ``except`` would also
        # swallow KeyboardInterrupt and SystemExit.
        return row
    
def _latest_review_date(data, client, os_name, app_id):
    """Return the newest stored review date for one app, or None if none is stored."""
    if not {"date", "app_id"}.issubset(data.columns):
        # Nothing has been collected yet, so there is no date to resume from.
        return None

    stored = data[
        (data["application"] == client)
        & (data["os"] == os_name)
        & (data["app_id"] == app_id)
    ]
    latest = stored["date"].max()
    # ``max()`` of an empty selection is NaN/NaT; report that as "no date".
    return None if pd.isna(latest) else latest


def fetch_data(items, country='se', lang='sv'):
    """Collect App Store and Google Play reviews for a set of clients.

    Args:
        items: Iterable of ``(client, info)`` pairs. ``info["iOS"]`` maps an
            App Store app name to its app id (an empty or ``None`` value
            means no iOS apps); ``info["Android"]`` is a Google Play app id,
            or ``None`` when the client has no Android app.
        country: Storefront passed to ``AppStore``. Not forwarded to Google
            Play, which keeps its library default.
        lang: Language passed to ``reviews_all`` for Google Play.

    Returns:
        A DataFrame with the columns ``application``, ``os``, ``date``,
        ``userName``, ``review``, ``rating``, ``developerResponse`` and
        ``app_id`` (only ``os`` and ``application`` when nothing was
        collected). The index is not reset between concatenated batches.
    """
    review_columns = ['application', 'os', 'date', 'userName', 'review',
                      'rating', 'developerResponse', 'app_id']
    # Google Play field names -> the names used for App Store reviews.
    rename = {
        "at": "date",
        "content": "review",
        "replyContent": "developerResponse",
        "score": "rating",
    }

    data = pd.DataFrame(columns=["os", "application"])

    for client, info in tqdm(items):

        #### APPSTORE ####
        for app_name, app_id in (info["iOS"] or {}).items():

            reviews = AppStore(country=country, app_name=app_name, app_id=app_id)

            # If this app was already fetched, only ask for newer reviews.
            # The lookup is per app, so a second app of the same client does
            # not inherit the first app's cut-off date and lose older reviews.
            last_date = _latest_review_date(data, client, "iOS", app_id)
            pause = np.random.randint(5, 8)
            if last_date is None:
                reviews.review(sleep=pause)
            else:
                reviews.review(sleep=pause, after=last_date)

            if not reviews.reviews:
                continue

            # ``reviews.reviews`` is a list of dicts: one row per review.
            ios = pd.DataFrame(reviews.reviews)

            ios["application"] = client
            ios["os"] = "iOS"
            ios["app_id"] = app_id

            # If there is a developer response, keep only its text body.
            if "developerResponse" in ios.columns:
                ios["developerResponse"] = ios["developerResponse"].apply(get_dev_comments)
            else:
                ios["developerResponse"] = np.nan

            ios = ios[review_columns]

            data = pd.concat([data, ios])
            print(f"Collected {len(ios)} items from {client} from AppStore")

        #### GOOGLE PLAY ####
        android_id = info["Android"]
        if android_id is None:
            continue

        android = pd.DataFrame(
            reviews_all(android_id, sleep_milliseconds=10, lang=lang, sort=Sort.NEWEST)
        )

        if android.empty:
            # No reviews means no columns to rename or select below.
            print(f"Collected 0 items from {client} from Google Play")
            continue

        last_date = _latest_review_date(data, client, "Android", android_id)
        if last_date is not None:
            android = android[android["at"] > last_date]

        android = android.rename(columns=rename)
        android["os"] = "Android"
        android["application"] = client
        # Tag rows with the Google Play id, not the id left over from the
        # iOS loop above.
        android["app_id"] = android_id
        android = android[review_columns]

        data = pd.concat([data, android])
        print(f"Collected {len(android)} items from {client} from Google Play")

    return data