|
|
import requests |
|
|
import pandas as pd |
|
|
import gradio as gr |
|
|
import os |
|
|
import time |
|
|
import math |
|
|
|
|
|
RAPIDAPI_API_KEY = os.environ['RAPIDAPI_API_KEY'] |
|
|
|
|
|
import requests |
|
|
|
|
|
|
|
|
def scrape_instagram(user_name):
    """Fetch public profile information for one Instagram account via RapidAPI.

    Args:
        user_name: Instagram username, numeric ID, or profile URL.

    Returns:
        dict: Normalized profile fields (bio, follower_count, bio_links, ...),
        or an empty dict when the request fails or the response has no 'data'.
    """
    url = "https://instagram-scraper-api2.p.rapidapi.com/v1/info"
    print(user_name)

    querystring = {"username_or_id_or_url": f"{user_name}"}

    headers = {
        "x-rapidapi-key": f"{RAPIDAPI_API_KEY}",
        "x-rapidapi-host": "instagram-scraper-api2.p.rapidapi.com"
    }

    try:
        # timeout so a hung connection cannot block the dashboard forever
        response = requests.get(url, headers=headers, params=querystring, timeout=30)
        response.raise_for_status()

        response_json = response.json()
        if 'data' not in response_json:
            print("No data found in response")
            return {}

        response_data = response_json['data']
        print(response_data)

        profile_info = {
            'bio': response_data.get('biography', ''),
            'follower_count': response_data.get('follower_count', 0),
            'following_count': response_data.get('following_count', 0),
            # .get('url', '') guards against a bio_link entry missing its 'url' key
            'bio_links': [item.get('url', '') for item in response_data.get('bio_links', [])],
            'full_name': response_data.get('full_name', ''),
            'username': response_data.get('username', ''),
            'num_posts': response_data.get('media_count', 0),
            'profile_id': response_data.get('profile_pic_id', ''),
            'email': response_data.get('biography_email', ''),
            'badge': response_data.get('account_badges', []),
            'category': response_data.get('category', ''),
            'phone_number': response_data.get('contact_phone_number', ''),
            'city_name': response_data.get('location_data', {}).get('city_name', ''),
            # not provided by this endpoint; kept so downstream columns stay aligned
            'country': '',
            'date_joined': ''
        }

        return profile_info

    # Subclasses (HTTPError, ConnectionError, Timeout) must be caught BEFORE
    # RequestException; the original order made these handlers unreachable.
    except requests.exceptions.HTTPError as e:
        print(f"HTTP error: {e}")
    except requests.exceptions.ConnectionError as e:
        print(f"Connection error: {e}")
    except requests.exceptions.Timeout as e:
        print(f"Timeout error: {e}")
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
    except ValueError as e:
        print(f"JSON decode error: {e}")
    except KeyError as e:
        print(f"Key error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    return {}
|
|
|
|
|
|
|
|
|
|
|
def get_insta_info(df, progress=gr.Progress()):
    """Scrape Instagram profile data for every entry in df['Links'].

    Ensures the output columns exist, then fills them row by row (in place)
    with the data returned by scrape_instagram.

    Args:
        df: DataFrame with a 'Links' column of usernames/IDs/URLs.
        progress: Gradio progress tracker (injected by the UI; the default
            instance is the documented Gradio idiom).

    Returns:
        The same DataFrame with the scraped columns populated.
    """
    # Output column -> key in the dict returned by scrape_instagram.
    # Insertion order matches the original column creation order.
    column_map = {
        'Bio': 'bio',
        'Follower Count': 'follower_count',
        'Following Count': 'following_count',
        'Bio Links': 'bio_links',
        'Full Name': 'full_name',
        'Username': 'username',
        'Num Posts': 'num_posts',
        'Profile ID': 'profile_id',
        'Email': 'email',
        'Badge': 'badge',
        'Category': 'category',
        'Phone Number': 'phone_number',
        'City Name': 'city_name',
        'Country': 'country',
        'Date Joined': 'date_joined'
    }

    for column in column_map:
        if column not in df.columns:
            df[column] = ''

    links = df['Links'].values
    print(links)

    for i in progress.tqdm(range(len(links)), desc='Scraping...'):
        # df.at takes an index LABEL; using positional i would silently create
        # new rows whenever df does not carry a default RangeIndex.
        row_label = df.index[i]
        try:
            time.sleep(1)  # throttle to stay under the API rate limit
            profile_info = scrape_instagram(links[i])

            if profile_info:
                for column, key in column_map.items():
                    value = profile_info[key]
                    if column == 'Bio Links':
                        value = ', '.join(value)
                    elif column == 'Badge':
                        value = ', '.join(str(badge) for badge in value)
                    df.at[row_label, column] = value
        except requests.exceptions.RequestException as e:
            print(f"Request error for link {links[i]}: {e}")
        except ValueError as e:
            print(f"JSON decode error for link {links[i]}: {e}")
        except KeyError as e:
            print(f"Key error for link {links[i]}: {e}")
        except Exception as e:
            print(f"An unexpected error occurred for link {links[i]}: {e}")

    return df
|
|
|
|
|
|
|
|
def scrape_linkedins(links):
    """Scrape LinkedIn profile data in bulk via RapidAPI.

    Profiles are requested in chunks of 100 against the batch endpoint.
    Failures are isolated per chunk: one bad batch is logged and skipped
    instead of aborting the remaining chunks.

    Args:
        links: List of LinkedIn profile URLs.

    Returns:
        list[dict]: One normalized profile dict per successfully parsed
        profile; empty list if everything failed.
    """
    url = "https://linkedin-bulk-data-scraper.p.rapidapi.com/profiles"

    headers = {
        "x-rapidapi-key": f"{RAPIDAPI_API_KEY}",
        "x-rapidapi-host": "linkedin-bulk-data-scraper.p.rapidapi.com",
        "Content-Type": "application/json",
        "x-rapidapi-user": "usama"
    }

    def _first_or_empty(items):
        # 'experiences'/'educations' can be present but EMPTY; indexing [0]
        # on an empty list raised IndexError in the original.
        return items[0] if items else {}

    profile_info_list = []
    chunk_size = 100

    num_chunks = math.ceil(len(links) / chunk_size)

    for i in range(num_chunks):
        chunk = links[i * chunk_size:(i + 1) * chunk_size]
        payload = {"links": chunk}

        # try/except per chunk: the original wrapped the whole loop, so one
        # failure discarded every remaining chunk's results.
        try:
            response = requests.post(url, json=payload, headers=headers, timeout=120)
            response.raise_for_status()
            data = response.json()

            if 'data' not in data:
                raise ValueError("Missing 'data' in response")

            responses = data['data']

            for response_item in responses:
                response_data = response_item.get('data', {})
                experiences = response_data.get('experiences', [{}])
                educations = response_data.get('educations', [{}])
                current = _first_or_empty(experiences)
                latest_edu = _first_or_empty(educations)

                profile_info = {
                    'link': response_item.get('entry', ''),
                    'full_name': response_data.get('fullName', ''),
                    'headline': response_data.get('headline', ''),
                    # API field is named 'followers'; exposed as 'connections'
                    'connections': response_data.get('followers', ''),
                    'country': response_data.get('addressCountryOnly', ''),
                    'address': response_data.get('addressWithoutCountry', ''),
                    'about': response_data.get('about', ''),
                    'current_role': (f"{current.get('title', '')} at "
                                     f"{current.get('subtitle', '')}"),
                    'all_roles': [f"{item.get('title', '')} at {item.get('subtitle', '')}" for item in experiences],
                    'education': (f"{latest_edu.get('subtitle', '')} at "
                                  f"{latest_edu.get('title', '')}"),
                    'all_education': [f"{item.get('subtitle', '')} at {item.get('title', '')}" for item in educations]
                }

                profile_info_list.append(profile_info)

        except requests.exceptions.RequestException as e:
            print(f"Request error: {e}")
        except ValueError as e:
            print(f"Value error: {e}")
        except KeyError as e:
            print(f"Key error: {e}")
        except Exception as e:
            print(f"An unexpected error occurred: {e}")

    return profile_info_list
|
|
|
|
|
|
|
|
|
|
|
def get_LI_info(df, progress=gr.Progress()):
    """Scrape LinkedIn profile data for every entry in df['Links'].

    Fetches all profiles in one bulk call, then fills the output columns
    row by row (in place), matching rows to results by profile URL.

    Args:
        df: DataFrame with a 'Links' column of LinkedIn profile URLs.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        The same DataFrame with the scraped columns populated.
    """
    try:
        links = df['Links'].tolist()
        profile_info_list = scrape_linkedins(links)
    except Exception as e:
        print(f"Error scraping LinkedIn profiles: {e}")
        return df

    # Index results by the URL the API echoed back ('entry') for O(1) lookup.
    profile_info_dict = {info['link']: info for info in profile_info_list if info}

    # Output column -> key in the per-profile dict from scrape_linkedins.
    column_map = {
        'Full Name': 'full_name',
        'Headline': 'headline',
        'Connections': 'connections',
        'Country': 'country',
        'Address': 'address',
        'About': 'about',
        'Current Role': 'current_role',
        'All Roles': 'all_roles',
        'Most Recent Education': 'education',
        'All Education': 'all_education'
    }

    for column in column_map:
        if column not in df.columns:
            df[column] = ''

    for index, row in progress.tqdm(df.iterrows(), desc='Scraping...'):
        # Bind before the try: if row['Links'] itself raises, the except
        # message below would otherwise hit an unbound 'link' (NameError).
        link = None
        try:
            link = row['Links']
            if link in profile_info_dict:
                profile_info = profile_info_dict[link]
                for column, key in column_map.items():
                    df.at[index, column] = profile_info.get(key, '')
            else:
                print(f"Profile information for link {link} not found.")
        except Exception as e:
            print(f"Error processing row {index} with link {link}: {e}")

    return df
|
|
|
|
|
|
|
|
def get_scrape_data(csv_files, social_media, password):
    """Gradio click handler: validate password, combine CSVs, scrape, export.

    Args:
        csv_files: Uploaded file objects, each a CSV with a 'Links' column.
        social_media: 'LinkedIn' or 'Instagram' (radio selection).
        password: Dashboard password, checked against DASHBOARD_PASSWORD.

    Returns:
        Tuple of (status string, DownloadButton pointing at the output CSV,
        scraped DataFrame).

    Raises:
        gr.Error: On a wrong password or when no platform was selected.
    """
    if password != os.environ['DASHBOARD_PASSWORD']:
        raise gr.Error('Incorrect Password')

    dataframes = []
    for csv_file in csv_files:
        df = pd.read_csv(csv_file.name)
        dataframes.append(df)

    # ignore_index gives the combined frame a clean RangeIndex for row updates
    combined_df = pd.concat(dataframes, ignore_index=True)

    if social_media == 'LinkedIn':
        output_df = get_LI_info(combined_df)
    elif social_media == 'Instagram':
        output_df = get_insta_info(combined_df)
    else:
        # Previously fell through with output_df unbound -> NameError when
        # the radio was left unselected; fail with a clear UI error instead.
        raise gr.Error('Please select a social media platform')

    print(output_df.head(2))
    file_name = f'./{social_media}_output.csv'
    # index=False: don't write the RangeIndex as a stray unnamed column
    output_df.to_csv(file_name, index=False)
    completion_status = "Done"
    return completion_status, gr.DownloadButton(label='Download Scraped Data', value=file_name, visible=True), output_df
|
|
|
|
|
|
|
|
# --- Gradio UI: password-gated scraper dashboard ---
with gr.Blocks() as block:
    # User-facing header (fixed grammar: "is scrapes" -> "scrapes").
    gr.Markdown("""
    # Social Media Scraper Dashboard
    This dashboard scrapes data from LinkedIn and Instagram \n
    Link to Data Analysis Platform: https://ai-data-analyst.streamlit.app/
    """)
    with gr.Column(visible=True):
        password = gr.Textbox(label='Enter Password')
        csv_file = gr.File(label='Input CSV File (must be CSV File)', file_count='multiple')
        social_media = gr.Radio(choices=['LinkedIn', 'Instagram'], label='Which Social Media?', info='Which Social Media do you want to scrape from?')
        con_gen_btn = gr.Button('Scrape')
        status = gr.Textbox(label='Completion Status')
        data = gr.DataFrame(label='Scraped Data')
        download_btn = gr.DownloadButton(label='Download Content', visible=False)

    # Wire the Scrape button to the handler; outputs fill status, the
    # download button, and the results table.
    con_gen_btn.click(get_scrape_data, inputs=[csv_file, social_media, password], outputs=[status, download_btn, data])

# Allow up to 5 concurrent scrape jobs through the request queue.
block.queue(default_concurrency_limit=5)
block.launch()
|
|
|