import requests
import pandas as pd
import gradio as gr
import os
import time
import math
RAPIDAPI_API_KEY = os.environ['RAPIDAPI_API_KEY']
import requests
# Function to scrape Instagram profile
def scrape_instagram(user_name):
    """Fetch profile metadata for a single Instagram account via RapidAPI.

    Args:
        user_name: Username, numeric id, or profile URL accepted by the
            "instagram-scraper-api2" endpoint.

    Returns:
        dict: Normalised profile fields (bio, follower_count, ...), or an
        empty dict when the request fails or the response has no 'data'.
    """
    url = "https://instagram-scraper-api2.p.rapidapi.com/v1/info"
    print(user_name)
    querystring = {"username_or_id_or_url": f"{user_name}"}
    headers = {
        "x-rapidapi-key": f"{RAPIDAPI_API_KEY}",
        "x-rapidapi-host": "instagram-scraper-api2.p.rapidapi.com"
    }
    try:
        # Bound the request so a stalled connection cannot hang the scrape loop.
        response = requests.get(url, headers=headers, params=querystring, timeout=30)
        response.raise_for_status()  # Raise HTTPError for bad responses
        response_json = response.json()
        if 'data' not in response_json:
            print("No data found in response")
            return {}  # Return an empty dictionary if there is no data in the response
        response_data = response_json['data']
        print(response_data)
        profile_info = {
            'bio': response_data.get('biography', ''),
            'follower_count': response_data.get('follower_count', 0),
            'following_count': response_data.get('following_count', 0),
            'bio_links': [item['url'] for item in response_data.get('bio_links', [])],
            'full_name': response_data.get('full_name', ''),
            'username': response_data.get('username', ''),
            'num_posts': response_data.get('media_count', 0),
            'profile_id': response_data.get('profile_pic_id', ''),
            'email': response_data.get('biography_email', ''),
            'badge': response_data.get('account_badges', []),
            'category': response_data.get('category', ''),
            'phone_number': response_data.get('contact_phone_number', ''),
            'city_name': response_data.get('location_data', {}).get('city_name', ''),
            'country': '',      # not provided by this endpoint
            'date_joined': ''   # not provided by this endpoint
        }
        return profile_info
    # NOTE: HTTPError, ConnectionError and Timeout are subclasses of
    # RequestException, so the specific handlers must come FIRST — the
    # original listed them after RequestException, making them unreachable.
    except requests.exceptions.HTTPError as e:
        print(f"HTTP error: {e}")
    except requests.exceptions.ConnectionError as e:
        print(f"Connection error: {e}")
    except requests.exceptions.Timeout as e:
        print(f"Timeout error: {e}")
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
    except ValueError as e:
        print(f"JSON decode error: {e}")
    except KeyError as e:
        print(f"Key error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    return {}  # Return an empty dictionary if an error occurs
# Function to populate DataFrame with Instagram information
def get_insta_info(df, progress=gr.Progress()):
    """Scrape every Instagram username in df['Links'] and fill profile columns.

    Args:
        df: DataFrame with a 'Links' column of Instagram usernames/ids/URLs.
            Assumes a default RangeIndex (callers concat with
            ignore_index=True) since rows are addressed positionally.
        progress: Gradio progress tracker used to drive the UI progress bar.

    Returns:
        The same DataFrame, with one column per scraped profile field.
    """
    # Single source of truth: DataFrame column -> key in the dict
    # returned by scrape_instagram (the original duplicated this mapping
    # between the column list and the per-column assignments).
    column_map = {
        'Bio': 'bio',
        'Follower Count': 'follower_count',
        'Following Count': 'following_count',
        'Bio Links': 'bio_links',
        'Full Name': 'full_name',
        'Username': 'username',
        'Num Posts': 'num_posts',
        'Profile ID': 'profile_id',
        'Email': 'email',
        'Badge': 'badge',
        'Category': 'category',
        'Phone Number': 'phone_number',
        'City Name': 'city_name',
        'Country': 'country',
        'Date Joined': 'date_joined',
    }
    for column in column_map:
        if column not in df.columns:
            df[column] = ''
    links = df['Links'].values
    print(links)
    for i in progress.tqdm(range(len(links)), desc='Scraping...'):
        try:
            time.sleep(1)  # Throttle requests to the API
            profile_info = scrape_instagram(links[i])
            if profile_info:  # Only populate if profile_info is not empty
                for column, key in column_map.items():
                    value = profile_info[key]
                    # List-valued fields (bio_links, badge) become
                    # comma-separated strings for CSV-friendly cells.
                    if isinstance(value, list):
                        value = ', '.join(str(v) for v in value)
                    df.at[i, column] = value
        except Exception as e:
            # scrape_instagram already swallows request/JSON errors and
            # returns {}, so this only catches truly unexpected failures;
            # keep scraping the remaining links either way.
            print(f"An unexpected error occurred for link {links[i]}: {e}")
    return df
# Function to scrape LinkedIn profiles
def scrape_linkedins(links):
    """Bulk-scrape LinkedIn profiles via RapidAPI, 100 links per request.

    Args:
        links: List of LinkedIn profile URLs.

    Returns:
        list[dict]: One normalised profile dict per scraped link. On error,
        whatever was collected before the failure is returned.
    """
    url = "https://linkedin-bulk-data-scraper.p.rapidapi.com/profiles"
    headers = {
        "x-rapidapi-key": f"{RAPIDAPI_API_KEY}",
        "x-rapidapi-host": "linkedin-bulk-data-scraper.p.rapidapi.com",
        "Content-Type": "application/json",
        "x-rapidapi-user": "usama"
    }
    profile_info_list = []
    chunk_size = 100  # the bulk endpoint accepts at most 100 links per call
    num_chunks = math.ceil(len(links) / chunk_size)
    try:
        for i in range(num_chunks):
            chunk = links[i * chunk_size:(i + 1) * chunk_size]
            payload = {"links": chunk}
            # Generous timeout: the provider scrapes up to 100 profiles per call.
            response = requests.post(url, json=payload, headers=headers, timeout=300)
            response.raise_for_status()  # Raise HTTPError for bad responses
            data = response.json()
            if 'data' not in data:
                raise ValueError("Missing 'data' in response")
            for response_item in data['data']:
                response_data = response_item.get('data', {})
                # BUG FIX: dict.get's default only applies when the key is
                # MISSING. A present-but-empty 'experiences'/'educations'
                # list made `[0]` raise IndexError in the original, aborting
                # the whole batch. `or [{}]` covers both cases.
                experiences = response_data.get('experiences') or [{}]
                educations = response_data.get('educations') or [{}]
                profile_info = {
                    'link': response_item.get('entry', ''),
                    'full_name': response_data.get('fullName', ''),
                    'headline': response_data.get('headline', ''),
                    'connections': response_data.get('followers', ''),  # or 'connections' based on availability
                    'country': response_data.get('addressCountryOnly', ''),
                    'address': response_data.get('addressWithoutCountry', ''),
                    'about': response_data.get('about', ''),
                    'current_role': (f"{experiences[0].get('title', '')} at "
                                     f"{experiences[0].get('subtitle', '')}"),
                    'all_roles': [f"{item.get('title', '')} at {item.get('subtitle', '')}" for item in experiences],
                    'education': (f"{educations[0].get('subtitle', '')} at "
                                  f"{educations[0].get('title', '')}"),
                    'all_education': [f"{item.get('subtitle', '')} at {item.get('title', '')}" for item in educations]
                }
                profile_info_list.append(profile_info)
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
    except ValueError as e:
        print(f"Value error: {e}")
    except KeyError as e:
        print(f"Key error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    return profile_info_list
# Function to populate DataFrame with LinkedIn information
def get_LI_info(df, progress=gr.Progress()):
    """Scrape every LinkedIn URL in df['Links'] and fill profile columns.

    Args:
        df: DataFrame with a 'Links' column of LinkedIn profile URLs.
        progress: Gradio progress tracker used to drive the UI progress bar.

    Returns:
        The same DataFrame, with one column per scraped profile field.
        Returned unchanged if the bulk scrape itself fails.
    """
    try:
        links = df['Links'].tolist()
        profile_info_list = scrape_linkedins(links)
    except Exception as e:
        print(f"Error scraping LinkedIn profiles: {e}")
        return df
    # Quick lookup: scraped link -> profile dict.
    profile_info_dict = {info['link']: info for info in profile_info_list if info}
    # Single source of truth: DataFrame column -> key in the profile dict.
    column_map = {
        'Full Name': 'full_name',
        'Headline': 'headline',
        'Connections': 'connections',
        'Country': 'country',
        'Address': 'address',
        'About': 'about',
        'Current Role': 'current_role',
        'All Roles': 'all_roles',
        'Most Recent Education': 'education',
        'All Education': 'all_education',
    }
    for column in column_map:
        if column not in df.columns:
            df[column] = ''
    # Populate the DataFrame by matching the Link values.
    for index, row in progress.tqdm(df.iterrows(), desc='Scraping...'):
        # Bind `link` before the try so the except message can always use it
        # (the original could hit NameError if row['Links'] itself raised).
        link = row.get('Links', '')
        try:
            profile_info = profile_info_dict.get(link)
            if profile_info:
                for column, key in column_map.items():
                    df.at[index, column] = profile_info.get(key, '')
            else:
                print(f"Profile information for link {link} not found.")
        except Exception as e:
            print(f"Error processing row {index} with link {link}: {e}")
    return df
def get_scrape_data(csv_files, social_media, password):
    """Gradio click handler: authenticate, merge uploaded CSVs, scrape, save.

    Args:
        csv_files: List of uploaded file objects; each must be a CSV with a
            'Links' column.
        social_media: Either 'LinkedIn' or 'Instagram'.
        password: Dashboard password, checked against DASHBOARD_PASSWORD.

    Returns:
        Tuple of (status message, visible DownloadButton pointing at the
        written CSV, scraped DataFrame).

    Raises:
        gr.Error: On a wrong password or an unsupported platform value.
    """
    if password != os.environ['DASHBOARD_PASSWORD']:
        raise gr.Error('Incorrect Password')
    # Read each uploaded CSV and stack them into a single DataFrame.
    dataframes = [pd.read_csv(csv_file.name) for csv_file in csv_files]
    combined_df = pd.concat(dataframes, ignore_index=True)
    # Dispatch to the scraper for the chosen platform.
    if social_media == 'LinkedIn':
        output_df = get_LI_info(combined_df)
    elif social_media == 'Instagram':
        output_df = get_insta_info(combined_df)
    else:
        # Guard: the original fell through here with `output_df` unbound,
        # crashing with NameError instead of a readable message.
        raise gr.Error(f'Unsupported platform: {social_media}')
    print(output_df.head(2))
    file_name = f'./{social_media}_output.csv'
    # index=False keeps the pandas row index out of the downloadable CSV.
    output_df.to_csv(file_name, index=False)
    completion_status = "Done"
    return completion_status, gr.DownloadButton(label='Download Scraped Data', value=file_name, visible=True), output_df
# --- Gradio UI wiring -------------------------------------------------------
with gr.Blocks() as block:
    # Fixed: the original read "This dashboard is scrapes data from Linkedin".
    gr.Markdown("""
# Social Media Scraper Dashboard
This dashboard scrapes data from LinkedIn and Instagram \n
Link to Data Analysis Platform: https://ai-data-analyst.streamlit.app/
""")
    with gr.Column(visible=True):
        # Inputs
        password = gr.Textbox(label='Enter Password')
        csv_file = gr.File(label='Input CSV File (must be CSV File)', file_count='multiple')
        social_media = gr.Radio(choices=['LinkedIn', 'Instagram'], label='Which Social Media?', info='Which Social Media do you want to scrape from?')
        con_gen_btn = gr.Button('Scrape')
        # Outputs
        status = gr.Textbox(label='Completion Status')
        data = gr.DataFrame(label='Scraped Data')
        download_btn = gr.DownloadButton(label='Download Content', visible=False)
    # Wire the Scrape button to the handler.
    con_gen_btn.click(get_scrape_data, inputs=[csv_file, social_media, password], outputs=[status, download_btn, data])

# Allow up to 5 concurrent scrape jobs, then start the app.
block.queue(default_concurrency_limit=5)
block.launch()
|