mylesai committed on
Commit
1c76922
·
verified ·
1 Parent(s): aaadc48

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +178 -144
app.py CHANGED
@@ -6,111 +6,115 @@ import time
6
 
7
  RAPIDAPI_API_KEY = os.environ['RAPIDAPI_API_KEY']
8
 
 
 
 
9
  def scrape_instagram(user_name):
10
-
11
  url = "https://instagram-scraper-api2.p.rapidapi.com/v1/info"
12
  print(user_name)
13
 
14
- querystring = {"username_or_id_or_url":f"{user_name}"}
15
 
16
  headers = {
17
- "x-rapidapi-key": f"{RAPIDAPI_API_KEY}",
18
- "x-rapidapi-host": "instagram-scraper-api2.p.rapidapi.com"
19
  }
20
 
21
- response = requests.get(url, headers=headers, params=querystring)
22
-
23
- if response.status_code != 200:
24
- print(f"Failed to fetch profile: {response.status_code}")
25
- return {} # Return an empty dictionary if the request fails
26
-
27
- response_json = response.json()
28
- if 'data' not in response_json:
29
- print("No data found in response")
30
- return {} # Return an empty dictionary if there is no data in the response
31
-
32
- response_data = response_json['data']
33
- print(response_data)
34
-
35
- profile_info = {
36
- 'bio': response_data.get('biography', ''),
37
- 'follower_count': response_data.get('follower_count', 0),
38
- 'following_count': response_data.get('following_count', 0),
39
- 'bio_links': [item['url'] for item in response_data.get('bio_links', [])],
40
- 'full_name': response_data.get('full_name', ''),
41
- 'username': response_data.get('username', ''),
42
- 'num_posts': response_data.get('media_count', 0),
43
- 'profile_id': response_data.get('profile_pic_id', ''),
44
- 'email': response_data.get('biography_email', ''),
45
- 'badge': response_data.get('account_badges', []),
46
- 'category': response_data.get('category', ''),
47
- 'phone_number': response_data.get('contact_phone_number', ''),
48
- 'city_name': response_data.get('location_data', {}).get('city_name', ''),
49
- 'country': '',
50
- 'date_joined': ''
51
- }
52
 
53
- return profile_info
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
- def get_insta_info(df):
56
- # Add new columns to the DataFrame
57
- df['Bio'] = ''
58
- df['Follower Count'] = 0
59
- df['Following Count'] = 0
60
- df['Bio Links'] = ''
61
- df['Full Name'] = ''
62
- df['Username'] = ''
63
- df['Num Posts'] = 0
64
- df['Profile ID'] = ''
65
- df['Email'] = ''
66
- df['Badge'] = ''
67
- df['Category'] = ''
68
- df['Phone Number'] = ''
69
- df['City Name'] = ''
70
- df['Country'] = ''
71
- df['Date Joined'] = ''
72
 
 
73
  def get_insta_info(df, progress=gr.Progress()):
74
  # Add new columns to the DataFrame
75
- df['Bio'] = ''
76
- df['Follower Count'] = 0
77
- df['Following Count'] = 0
78
- df['Bio Links'] = ''
79
- df['Full Name'] = ''
80
- df['Username'] = ''
81
- df['Num Posts'] = 0
82
- df['Profile ID'] = ''
83
- df['Email'] = ''
84
- df['Badge'] = ''
85
- df['Category'] = ''
86
- df['Phone Number'] = ''
87
- df['City Name'] = ''
88
- df['Country'] = ''
89
- df['Date Joined'] = ''
90
 
91
  links = df['Links'].values
92
  print(links)
93
 
94
  for i in progress.tqdm(range(len(links)), desc='Scraping...'):
95
- time.sleep(1)
96
- profile_info = scrape_instagram(links[i])
97
-
98
- if profile_info: # Only populate if profile_info is not empty
99
- df.at[i, 'Bio'] = profile_info['bio']
100
- df.at[i, 'Follower Count'] = profile_info['follower_count']
101
- df.at[i, 'Following Count'] = profile_info['following_count']
102
- df.at[i, 'Bio Links'] = ', '.join(profile_info['bio_links'])
103
- df.at[i, 'Full Name'] = profile_info['full_name']
104
- df.at[i, 'Username'] = profile_info['username']
105
- df.at[i, 'Num Posts'] = profile_info['num_posts']
106
- df.at[i, 'Profile ID'] = profile_info['profile_id']
107
- df.at[i, 'Email'] = profile_info['email']
108
- df.at[i, 'Badge'] = ', '.join(profile_info['badge'])
109
- df.at[i, 'Category'] = profile_info['category']
110
- df.at[i, 'Phone Number'] = profile_info['phone_number']
111
- df.at[i, 'City Name'] = profile_info['city_name']
112
- df.at[i, 'Country'] = profile_info['country']
113
- df.at[i, 'Date Joined'] = profile_info['date_joined']
 
 
 
 
 
 
 
 
 
114
 
115
  return df
116
 
@@ -127,88 +131,118 @@ def scrape_linkedins(links):
127
  "x-rapidapi-user": "usama"
128
  }
129
 
130
-
131
  # Initialize an empty list to store the dictionaries
132
  profile_info_list = []
133
-
134
- response = requests.post(url, json=payload, headers=headers)
135
- responses = response.json()['data']
136
- for response_item in responses:
137
- response_data = response_item.get('data', {})
138
-
139
- # Use get() method with default empty strings for missing fields
140
- profile_info = {
141
- 'link': response_item.get('entry', ''),
142
- 'full_name': response_data.get('fullName', ''),
143
- 'headline': response_data.get('headline', ''),
144
- 'connections': response_data.get('followers', ''), # or 'connections' based on availability
145
- 'country': response_data.get('addressCountryOnly', ''),
146
- 'address': response_data.get('addressWithoutCountry', ''),
147
- 'about': response_data.get('about', ''),
148
- 'current_role': (f"{response_data.get('experiences', [{}])[0].get('title', '')} at "
149
- f"{response_data.get('experiences', [{}])[0].get('subtitle', '')}"),
150
- 'all_roles': [f"{item.get('title', '')} at {item.get('subtitle', '')}" for item in response_data.get('experiences', [{}])],
151
- 'education': (f"{response_data.get('educations', [{}])[0].get('subtitle', '')} at "
152
- f"{response_data.get('educations', [{}])[0].get('title', '')}"),
153
- 'all_education': [f"{item.get('subtitle', '')} at {item.get('title', '')}" for item in response_data.get('educations', [{}])]
154
- }
155
 
156
- # Append the dictionary to the list
157
- profile_info_list.append(profile_info)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
  return profile_info_list
160
 
161
  # Function to populate DataFrame with LinkedIn information
162
  def get_LI_info(df, progress=gr.Progress()):
163
- links = df['Links'].tolist()
164
- profile_info_list = scrape_linkedins(links)
 
 
 
 
165
 
166
  # Create a dictionary for quick lookup based on the link
167
  profile_info_dict = {info['link']: info for info in profile_info_list if info}
168
 
169
  # Add new columns to the DataFrame
170
- df['Full Name'] = ''
171
- df['Headline'] = ''
172
- df['Connections'] = ''
173
- df['Country'] = ''
174
- df['Address'] = ''
175
- df['About'] = ''
176
- df['Current Role'] = ''
177
- df['All Roles'] = ''
178
- df['Most Recent Education'] = ''
179
- df['All Education'] = ''
180
 
181
  # Populate the DataFrame by matching the Link values
182
  for index, row in progress.tqdm(df.iterrows(), desc='Scraping...'):
183
- link = row['Links']
184
- if link in profile_info_dict:
185
- profile_info = profile_info_dict[link]
186
- df.at[index, 'Full Name'] = profile_info['full_name']
187
- df.at[index, 'Headline'] = profile_info['headline']
188
- df.at[index, 'Connections'] = profile_info['connections']
189
- df.at[index, 'Country'] = profile_info['country']
190
- df.at[index, 'Address'] = profile_info['address']
191
- df.at[index, 'About'] = profile_info['about']
192
- df.at[index, 'Current Role'] = profile_info['current_role']
193
- df.at[index, 'All Roles'] = profile_info['all_roles']
194
- df.at[index, 'Most Recent Education'] = profile_info['education']
195
- df.at[index, 'All Education'] = profile_info['all_education']
196
-
 
 
 
 
197
 
198
  return df
199
 
200
 
201
- def get_scrape_data(csv_file, social_media, password):
202
  if password != os.environ['DASHBOARD_PASSWORD']:
203
  raise gr.Error('Incorrect Password')
204
- df = pd.read_csv(csv_file.name)
 
 
 
 
 
 
 
 
 
 
 
 
205
  if social_media == 'LinkedIn':
206
- output_df = get_LI_info(df)
207
  elif social_media == 'Instagram':
208
- output_df = get_insta_info(df)
 
209
  print(output_df.head(2))
210
  file_name = f'./{social_media}_output.csv'
211
- output_df.to_csv(f'./{social_media}_output.csv')
212
  completion_status = "Done"
213
  return completion_status, gr.DownloadButton(label='Download Scraped Data', value=file_name, visible=True), output_df
214
 
@@ -221,7 +255,7 @@ with gr.Blocks() as block:
221
  """)
222
  with gr.Column(visible=True):
223
  password = gr.Textbox(label='Enter Password')
224
- csv_file = gr.File(label='Input CSV File (must be CSV File)')
225
  social_media = gr.Radio(choices=['LinkedIn', 'Instagram'], label='Which Social Media?', info = 'Which Social Media do you want to scrape from?')
226
  con_gen_btn = gr.Button('Scrape')
227
  status = gr.Textbox(label='Completion Status')
 
6
 
7
  RAPIDAPI_API_KEY = os.environ['RAPIDAPI_API_KEY']
8
 
9
+ import requests
10
+
11
# Function to scrape Instagram profile
def scrape_instagram(user_name):
    """Fetch profile details for one Instagram account via RapidAPI.

    Args:
        user_name: Username, numeric id, or profile URL accepted by the
            instagram-scraper-api2 endpoint.

    Returns:
        dict: Normalized profile fields (bio, follower_count, ...), or an
        empty dict when the request, JSON decoding, or payload lookup fails.
    """
    url = "https://instagram-scraper-api2.p.rapidapi.com/v1/info"
    print(user_name)

    querystring = {"username_or_id_or_url": f"{user_name}"}

    headers = {
        "x-rapidapi-key": f"{RAPIDAPI_API_KEY}",
        "x-rapidapi-host": "instagram-scraper-api2.p.rapidapi.com"
    }

    try:
        # Timeout keeps one hung request from stalling the whole scrape loop.
        response = requests.get(url, headers=headers, params=querystring, timeout=30)
        response.raise_for_status()  # Raise HTTPError for bad responses

        response_json = response.json()
        if 'data' not in response_json:
            print("No data found in response")
            return {}  # Return an empty dictionary if there is no data in the response

        response_data = response_json['data']
        print(response_data)

        profile_info = {
            'bio': response_data.get('biography', ''),
            'follower_count': response_data.get('follower_count', 0),
            'following_count': response_data.get('following_count', 0),
            'bio_links': [item['url'] for item in response_data.get('bio_links', [])],
            'full_name': response_data.get('full_name', ''),
            'username': response_data.get('username', ''),
            'num_posts': response_data.get('media_count', 0),
            'profile_id': response_data.get('profile_pic_id', ''),
            'email': response_data.get('biography_email', ''),
            'badge': response_data.get('account_badges', []),
            'category': response_data.get('category', ''),
            'phone_number': response_data.get('contact_phone_number', ''),
            'city_name': response_data.get('location_data', {}).get('city_name', ''),
            # Not provided by this endpoint; kept so downstream columns exist.
            'country': '',
            'date_joined': ''
        }

        return profile_info

    # BUG FIX: HTTPError, ConnectionError, and Timeout are all subclasses of
    # RequestException; listing them AFTER it (as before) made those handlers
    # unreachable dead code. One RequestException handler covers them all.
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
    except ValueError as e:
        print(f"JSON decode error: {e}")
    except KeyError as e:
        print(f"Key error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    return {}  # Return an empty dictionary if an error occurs
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
# Function to populate DataFrame with Instagram information
def get_insta_info(df, progress=gr.Progress()):
    """Scrape every username in df['Links'] and fill profile columns in place.

    Args:
        df: DataFrame with a 'Links' column of Instagram usernames/URLs and a
            default RangeIndex (positions are used with df.at).
        progress: Gradio progress tracker used to render the scraping bar.

    Returns:
        The same DataFrame, with profile columns populated where scraping
        succeeded (failed rows keep the empty defaults).
    """
    # Map DataFrame column -> key in the dict returned by scrape_instagram.
    column_map = {
        'Bio': 'bio',
        'Follower Count': 'follower_count',
        'Following Count': 'following_count',
        'Bio Links': 'bio_links',
        'Full Name': 'full_name',
        'Username': 'username',
        'Num Posts': 'num_posts',
        'Profile ID': 'profile_id',
        'Email': 'email',
        'Badge': 'badge',
        'Category': 'category',
        'Phone Number': 'phone_number',
        'City Name': 'city_name',
        'Country': 'country',
        'Date Joined': 'date_joined',
    }

    # Add new columns to the DataFrame without clobbering existing ones.
    for column in column_map:
        if column not in df.columns:
            df[column] = ''

    links = df['Links'].values
    print(links)

    for i in progress.tqdm(range(len(links)), desc='Scraping...'):
        try:
            time.sleep(1)  # throttle requests to respect the API rate limit
            profile_info = scrape_instagram(links[i])

            if profile_info:  # empty dict means the scrape failed; keep defaults
                for column, key in column_map.items():
                    value = profile_info[key]
                    # List-valued fields are flattened for CSV-friendly cells.
                    if column == 'Bio Links':
                        value = ', '.join(value)
                    elif column == 'Badge':
                        value = ', '.join(str(badge) for badge in value)
                    df.at[i, column] = value
        # BUG FIX (dead code removed): scrape_instagram() traps its own
        # RequestException/ValueError/KeyError and returns {}, so the former
        # specific handlers here could never fire. Only truly unexpected
        # failures (e.g. malformed values while joining) land here.
        except Exception as e:
            print(f"An unexpected error occurred for link {links[i]}: {e}")

    return df
120
 
 
131
  "x-rapidapi-user": "usama"
132
  }
133
 
 
134
  # Initialize an empty list to store the dictionaries
135
  profile_info_list = []
136
+
137
+ try:
138
+ response = requests.post(url, json=payload, headers=headers)
139
+ response.raise_for_status() # Raise HTTPError for bad responses
140
+ data = response.json()
141
+
142
+ if 'data' not in data:
143
+ raise ValueError("Missing 'data' in response")
144
+
145
+ responses = data['data']
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
+ for response_item in responses:
148
+ response_data = response_item.get('data', {})
149
+
150
+ # Use get() method with default empty strings for missing fields
151
+ profile_info = {
152
+ 'link': response_item.get('entry', ''),
153
+ 'full_name': response_data.get('fullName', ''),
154
+ 'headline': response_data.get('headline', ''),
155
+ 'connections': response_data.get('followers', ''), # or 'connections' based on availability
156
+ 'country': response_data.get('addressCountryOnly', ''),
157
+ 'address': response_data.get('addressWithoutCountry', ''),
158
+ 'about': response_data.get('about', ''),
159
+ 'current_role': (f"{response_data.get('experiences', [{}])[0].get('title', '')} at "
160
+ f"{response_data.get('experiences', [{}])[0].get('subtitle', '')}"),
161
+ 'all_roles': [f"{item.get('title', '')} at {item.get('subtitle', '')}" for item in response_data.get('experiences', [{}])],
162
+ 'education': (f"{response_data.get('educations', [{}])[0].get('subtitle', '')} at "
163
+ f"{response_data.get('educations', [{}])[0].get('title', '')}"),
164
+ 'all_education': [f"{item.get('subtitle', '')} at {item.get('title', '')}" for item in response_data.get('educations', [{}])]
165
+ }
166
+
167
+ # Append the dictionary to the list
168
+ profile_info_list.append(profile_info)
169
+
170
+ except requests.exceptions.RequestException as e:
171
+ print(f"Request error: {e}")
172
+ except ValueError as e:
173
+ print(f"Value error: {e}")
174
+ except KeyError as e:
175
+ print(f"Key error: {e}")
176
+ except Exception as e:
177
+ print(f"An unexpected error occurred: {e}")
178
 
179
  return profile_info_list
180
 
181
# Function to populate DataFrame with LinkedIn information
def get_LI_info(df, progress=gr.Progress()):
    """Scrape LinkedIn profiles for every URL in df['Links'] and merge results.

    Args:
        df: DataFrame with a 'Links' column of LinkedIn profile URLs.
        progress: Gradio progress tracker used to render the scraping bar.

    Returns:
        The same DataFrame with profile columns populated; returned unchanged
        if the batch scrape itself fails.
    """
    try:
        links = df['Links'].tolist()
        profile_info_list = scrape_linkedins(links)
    except Exception as e:
        print(f"Error scraping LinkedIn profiles: {e}")
        return df

    # Create a dictionary for quick lookup based on the link.
    profile_info_dict = {info['link']: info for info in profile_info_list if info}

    # Map DataFrame column -> key in each scraped profile dict.
    field_map = {
        'Full Name': 'full_name',
        'Headline': 'headline',
        'Connections': 'connections',
        'Country': 'country',
        'Address': 'address',
        'About': 'about',
        'Current Role': 'current_role',
        'All Roles': 'all_roles',
        'Most Recent Education': 'education',
        'All Education': 'all_education',
    }

    # Add new columns to the DataFrame without clobbering existing ones.
    for column in field_map:
        if column not in df.columns:
            df[column] = ''

    # Populate the DataFrame by matching the Link values.
    for index, row in progress.tqdm(df.iterrows(), desc='Scraping...'):
        # BUG FIX: bind 'link' before the try so the except message below
        # cannot itself raise NameError when row['Links'] fails.
        link = None
        try:
            link = row['Links']
            if link in profile_info_dict:
                profile_info = profile_info_dict[link]
                for column, key in field_map.items():
                    df.at[index, column] = profile_info.get(key, '')
            else:
                print(f"Profile information for link {link} not found.")
        except Exception as e:
            print(f"Error processing row {index} with link {link}: {e}")

    return df
220
 
221
 
222
def get_scrape_data(csv_files, social_media, password):
    """Validate the password, merge the uploaded CSVs, scrape, and save results.

    Args:
        csv_files: List of uploaded file objects (each has a .name path).
        social_media: 'LinkedIn' or 'Instagram' — selects the scraper.
        password: Dashboard password, checked against DASHBOARD_PASSWORD.

    Returns:
        Tuple of (completion status string, visible DownloadButton pointing at
        the written CSV, the scraped DataFrame).

    Raises:
        gr.Error: On a wrong password or an unsupported platform choice.
    """
    if password != os.environ['DASHBOARD_PASSWORD']:
        raise gr.Error('Incorrect Password')

    # Read each uploaded CSV and concatenate into a single DataFrame.
    dataframes = [pd.read_csv(csv_file.name) for csv_file in csv_files]
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Process the combined DataFrame based on the social media platform.
    if social_media == 'LinkedIn':
        output_df = get_LI_info(combined_df)
    elif social_media == 'Instagram':
        output_df = get_insta_info(combined_df)
    else:
        # BUG FIX: previously fell through with output_df undefined, crashing
        # with NameError below instead of reporting a usable error.
        raise gr.Error(f'Unsupported platform: {social_media}')

    print(output_df.head(2))
    file_name = f'./{social_media}_output.csv'
    output_df.to_csv(file_name)
    completion_status = "Done"
    return completion_status, gr.DownloadButton(label='Download Scraped Data', value=file_name, visible=True), output_df
248
 
 
255
  """)
256
  with gr.Column(visible=True):
257
  password = gr.Textbox(label='Enter Password')
258
+ csv_file = gr.File(label='Input CSV File (must be CSV File)', file_count='multiple')
259
  social_media = gr.Radio(choices=['LinkedIn', 'Instagram'], label='Which Social Media?', info = 'Which Social Media do you want to scrape from?')
260
  con_gen_btn = gr.Button('Scrape')
261
  status = gr.Textbox(label='Completion Status')