mylesai committed on
Commit
1c76922
·
verified ·
1 Parent(s): aaadc48

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +178 -144
app.py CHANGED
@@ -6,111 +6,115 @@ import time
6
 
7
  RAPIDAPI_API_KEY = os.environ['RAPIDAPI_API_KEY']
8
 
 
 
 
9
  def scrape_instagram(user_name):
10
-
11
  url = "https://instagram-scraper-api2.p.rapidapi.com/v1/info"
12
  print(user_name)
13
 
14
- querystring = {"username_or_id_or_url":f"{user_name}"}
15
 
16
  headers = {
17
- "x-rapidapi-key": f"{RAPIDAPI_API_KEY}",
18
- "x-rapidapi-host": "instagram-scraper-api2.p.rapidapi.com"
19
  }
20
 
21
- response = requests.get(url, headers=headers, params=querystring)
22
-
23
- if response.status_code != 200:
24
- print(f"Failed to fetch profile: {response.status_code}")
25
- return {} # Return an empty dictionary if the request fails
26
-
27
- response_json = response.json()
28
- if 'data' not in response_json:
29
- print("No data found in response")
30
- return {} # Return an empty dictionary if there is no data in the response
31
-
32
- response_data = response_json['data']
33
- print(response_data)
34
-
35
- profile_info = {
36
- 'bio': response_data.get('biography', ''),
37
- 'follower_count': response_data.get('follower_count', 0),
38
- 'following_count': response_data.get('following_count', 0),
39
- 'bio_links': [item['url'] for item in response_data.get('bio_links', [])],
40
- 'full_name': response_data.get('full_name', ''),
41
- 'username': response_data.get('username', ''),
42
- 'num_posts': response_data.get('media_count', 0),
43
- 'profile_id': response_data.get('profile_pic_id', ''),
44
- 'email': response_data.get('biography_email', ''),
45
- 'badge': response_data.get('account_badges', []),
46
- 'category': response_data.get('category', ''),
47
- 'phone_number': response_data.get('contact_phone_number', ''),
48
- 'city_name': response_data.get('location_data', {}).get('city_name', ''),
49
- 'country': '',
50
- 'date_joined': ''
51
- }
52
 
53
- return profile_info
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
- def get_insta_info(df):
56
- # Add new columns to the DataFrame
57
- df['Bio'] = ''
58
- df['Follower Count'] = 0
59
- df['Following Count'] = 0
60
- df['Bio Links'] = ''
61
- df['Full Name'] = ''
62
- df['Username'] = ''
63
- df['Num Posts'] = 0
64
- df['Profile ID'] = ''
65
- df['Email'] = ''
66
- df['Badge'] = ''
67
- df['Category'] = ''
68
- df['Phone Number'] = ''
69
- df['City Name'] = ''
70
- df['Country'] = ''
71
- df['Date Joined'] = ''
72
 
 
73
  def get_insta_info(df, progress=gr.Progress()):
74
  # Add new columns to the DataFrame
75
- df['Bio'] = ''
76
- df['Follower Count'] = 0
77
- df['Following Count'] = 0
78
- df['Bio Links'] = ''
79
- df['Full Name'] = ''
80
- df['Username'] = ''
81
- df['Num Posts'] = 0
82
- df['Profile ID'] = ''
83
- df['Email'] = ''
84
- df['Badge'] = ''
85
- df['Category'] = ''
86
- df['Phone Number'] = ''
87
- df['City Name'] = ''
88
- df['Country'] = ''
89
- df['Date Joined'] = ''
90
 
91
  links = df['Links'].values
92
  print(links)
93
 
94
  for i in progress.tqdm(range(len(links)), desc='Scraping...'):
95
- time.sleep(1)
96
- profile_info = scrape_instagram(links[i])
97
-
98
- if profile_info: # Only populate if profile_info is not empty
99
- df.at[i, 'Bio'] = profile_info['bio']
100
- df.at[i, 'Follower Count'] = profile_info['follower_count']
101
- df.at[i, 'Following Count'] = profile_info['following_count']
102
- df.at[i, 'Bio Links'] = ', '.join(profile_info['bio_links'])
103
- df.at[i, 'Full Name'] = profile_info['full_name']
104
- df.at[i, 'Username'] = profile_info['username']
105
- df.at[i, 'Num Posts'] = profile_info['num_posts']
106
- df.at[i, 'Profile ID'] = profile_info['profile_id']
107
- df.at[i, 'Email'] = profile_info['email']
108
- df.at[i, 'Badge'] = ', '.join(profile_info['badge'])
109
- df.at[i, 'Category'] = profile_info['category']
110
- df.at[i, 'Phone Number'] = profile_info['phone_number']
111
- df.at[i, 'City Name'] = profile_info['city_name']
112
- df.at[i, 'Country'] = profile_info['country']
113
- df.at[i, 'Date Joined'] = profile_info['date_joined']
 
 
 
 
 
 
 
 
 
114
 
115
  return df
116
 
@@ -127,88 +131,118 @@ def scrape_linkedins(links):
127
  "x-rapidapi-user": "usama"
128
  }
129
 
130
-
131
  # Initialize an empty list to store the dictionaries
132
  profile_info_list = []
133
-
134
- response = requests.post(url, json=payload, headers=headers)
135
- responses = response.json()['data']
136
- for response_item in responses:
137
- response_data = response_item.get('data', {})
138
-
139
- # Use get() method with default empty strings for missing fields
140
- profile_info = {
141
- 'link': response_item.get('entry', ''),
142
- 'full_name': response_data.get('fullName', ''),
143
- 'headline': response_data.get('headline', ''),
144
- 'connections': response_data.get('followers', ''), # or 'connections' based on availability
145
- 'country': response_data.get('addressCountryOnly', ''),
146
- 'address': response_data.get('addressWithoutCountry', ''),
147
- 'about': response_data.get('about', ''),
148
- 'current_role': (f"{response_data.get('experiences', [{}])[0].get('title', '')} at "
149
- f"{response_data.get('experiences', [{}])[0].get('subtitle', '')}"),
150
- 'all_roles': [f"{item.get('title', '')} at {item.get('subtitle', '')}" for item in response_data.get('experiences', [{}])],
151
- 'education': (f"{response_data.get('educations', [{}])[0].get('subtitle', '')} at "
152
- f"{response_data.get('educations', [{}])[0].get('title', '')}"),
153
- 'all_education': [f"{item.get('subtitle', '')} at {item.get('title', '')}" for item in response_data.get('educations', [{}])]
154
- }
155
 
156
- # Append the dictionary to the list
157
- profile_info_list.append(profile_info)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
  return profile_info_list
160
 
161
  # Function to populate DataFrame with LinkedIn information
162
  def get_LI_info(df, progress=gr.Progress()):
163
- links = df['Links'].tolist()
164
- profile_info_list = scrape_linkedins(links)
 
 
 
 
165
 
166
  # Create a dictionary for quick lookup based on the link
167
  profile_info_dict = {info['link']: info for info in profile_info_list if info}
168
 
169
  # Add new columns to the DataFrame
170
- df['Full Name'] = ''
171
- df['Headline'] = ''
172
- df['Connections'] = ''
173
- df['Country'] = ''
174
- df['Address'] = ''
175
- df['About'] = ''
176
- df['Current Role'] = ''
177
- df['All Roles'] = ''
178
- df['Most Recent Education'] = ''
179
- df['All Education'] = ''
180
 
181
  # Populate the DataFrame by matching the Link values
182
  for index, row in progress.tqdm(df.iterrows(), desc='Scraping...'):
183
- link = row['Links']
184
- if link in profile_info_dict:
185
- profile_info = profile_info_dict[link]
186
- df.at[index, 'Full Name'] = profile_info['full_name']
187
- df.at[index, 'Headline'] = profile_info['headline']
188
- df.at[index, 'Connections'] = profile_info['connections']
189
- df.at[index, 'Country'] = profile_info['country']
190
- df.at[index, 'Address'] = profile_info['address']
191
- df.at[index, 'About'] = profile_info['about']
192
- df.at[index, 'Current Role'] = profile_info['current_role']
193
- df.at[index, 'All Roles'] = profile_info['all_roles']
194
- df.at[index, 'Most Recent Education'] = profile_info['education']
195
- df.at[index, 'All Education'] = profile_info['all_education']
196
-
 
 
 
 
197
 
198
  return df
199
 
200
 
201
- def get_scrape_data(csv_file, social_media, password):
202
  if password != os.environ['DASHBOARD_PASSWORD']:
203
  raise gr.Error('Incorrect Password')
204
- df = pd.read_csv(csv_file.name)
 
 
 
 
 
 
 
 
 
 
 
 
205
  if social_media == 'LinkedIn':
206
- output_df = get_LI_info(df)
207
  elif social_media == 'Instagram':
208
- output_df = get_insta_info(df)
 
209
  print(output_df.head(2))
210
  file_name = f'./{social_media}_output.csv'
211
- output_df.to_csv(f'./{social_media}_output.csv')
212
  completion_status = "Done"
213
  return completion_status, gr.DownloadButton(label='Download Scraped Data', value=file_name, visible=True), output_df
214
 
@@ -221,7 +255,7 @@ with gr.Blocks() as block:
221
  """)
222
  with gr.Column(visible=True):
223
  password = gr.Textbox(label='Enter Password')
224
- csv_file = gr.File(label='Input CSV File (must be CSV File)')
225
  social_media = gr.Radio(choices=['LinkedIn', 'Instagram'], label='Which Social Media?', info = 'Which Social Media do you want to scrape from?')
226
  con_gen_btn = gr.Button('Scrape')
227
  status = gr.Textbox(label='Completion Status')
 
6
 
7
  RAPIDAPI_API_KEY = os.environ['RAPIDAPI_API_KEY']
8
 
9
+ import requests
10
+
11
# Function to scrape Instagram profile
def scrape_instagram(user_name):
    """Fetch profile details for one Instagram account via RapidAPI.

    Args:
        user_name: Username, numeric id, or profile URL accepted by the
            instagram-scraper-api2 endpoint.

    Returns:
        dict: Normalized profile fields (bio, follower_count, ...), or an
        empty dict when the request, JSON decoding, or payload lookup fails.
    """
    url = "https://instagram-scraper-api2.p.rapidapi.com/v1/info"
    print(user_name)

    querystring = {"username_or_id_or_url": f"{user_name}"}

    headers = {
        "x-rapidapi-key": f"{RAPIDAPI_API_KEY}",
        "x-rapidapi-host": "instagram-scraper-api2.p.rapidapi.com"
    }

    try:
        # Timeout keeps one hung request from stalling the whole scrape loop.
        response = requests.get(url, headers=headers, params=querystring, timeout=30)
        response.raise_for_status()  # Raise HTTPError for bad responses

        response_json = response.json()
        if 'data' not in response_json:
            print("No data found in response")
            return {}  # Return an empty dictionary if there is no data in the response

        response_data = response_json['data']
        print(response_data)

        profile_info = {
            'bio': response_data.get('biography', ''),
            'follower_count': response_data.get('follower_count', 0),
            'following_count': response_data.get('following_count', 0),
            'bio_links': [item['url'] for item in response_data.get('bio_links', [])],
            'full_name': response_data.get('full_name', ''),
            'username': response_data.get('username', ''),
            'num_posts': response_data.get('media_count', 0),
            'profile_id': response_data.get('profile_pic_id', ''),
            'email': response_data.get('biography_email', ''),
            'badge': response_data.get('account_badges', []),
            'category': response_data.get('category', ''),
            'phone_number': response_data.get('contact_phone_number', ''),
            'city_name': response_data.get('location_data', {}).get('city_name', ''),
            # Not provided by this endpoint; kept so downstream columns exist.
            'country': '',
            'date_joined': ''
        }

        return profile_info

    # BUG FIX: HTTPError, ConnectionError, and Timeout are all subclasses of
    # RequestException; listing them AFTER it (as before) made those handlers
    # unreachable dead code. One RequestException handler covers them all.
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
    except ValueError as e:
        print(f"JSON decode error: {e}")
    except KeyError as e:
        print(f"Key error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    return {}  # Return an empty dictionary if an error occurs
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
# Function to populate DataFrame with Instagram information
def get_insta_info(df, progress=gr.Progress()):
    """Scrape every username in df['Links'] and fill profile columns in place.

    Args:
        df: DataFrame with a 'Links' column of Instagram usernames/URLs and a
            default RangeIndex (positions are used with df.at).
        progress: Gradio progress tracker used to render the scraping bar.

    Returns:
        The same DataFrame, with profile columns populated where scraping
        succeeded (failed rows keep the empty defaults).
    """
    # Map DataFrame column -> key in the dict returned by scrape_instagram.
    column_map = {
        'Bio': 'bio',
        'Follower Count': 'follower_count',
        'Following Count': 'following_count',
        'Bio Links': 'bio_links',
        'Full Name': 'full_name',
        'Username': 'username',
        'Num Posts': 'num_posts',
        'Profile ID': 'profile_id',
        'Email': 'email',
        'Badge': 'badge',
        'Category': 'category',
        'Phone Number': 'phone_number',
        'City Name': 'city_name',
        'Country': 'country',
        'Date Joined': 'date_joined',
    }

    # Add new columns to the DataFrame without clobbering existing ones.
    for column in column_map:
        if column not in df.columns:
            df[column] = ''

    links = df['Links'].values
    print(links)

    for i in progress.tqdm(range(len(links)), desc='Scraping...'):
        try:
            time.sleep(1)  # throttle requests to respect the API rate limit
            profile_info = scrape_instagram(links[i])

            if profile_info:  # empty dict means the scrape failed; keep defaults
                for column, key in column_map.items():
                    value = profile_info[key]
                    # List-valued fields are flattened for CSV-friendly cells.
                    if column == 'Bio Links':
                        value = ', '.join(value)
                    elif column == 'Badge':
                        value = ', '.join(str(badge) for badge in value)
                    df.at[i, column] = value
        # BUG FIX (dead code removed): scrape_instagram() traps its own
        # RequestException/ValueError/KeyError and returns {}, so the former
        # specific handlers here could never fire. Only truly unexpected
        # failures (e.g. malformed values while joining) land here.
        except Exception as e:
            print(f"An unexpected error occurred for link {links[i]}: {e}")

    return df
120
 
 
131
  "x-rapidapi-user": "usama"
132
  }
133
 
 
134
  # Initialize an empty list to store the dictionaries
135
  profile_info_list = []
136
+
137
+ try:
138
+ response = requests.post(url, json=payload, headers=headers)
139
+ response.raise_for_status() # Raise HTTPError for bad responses
140
+ data = response.json()
141
+
142
+ if 'data' not in data:
143
+ raise ValueError("Missing 'data' in response")
144
+
145
+ responses = data['data']
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
+ for response_item in responses:
148
+ response_data = response_item.get('data', {})
149
+
150
+ # Use get() method with default empty strings for missing fields
151
+ profile_info = {
152
+ 'link': response_item.get('entry', ''),
153
+ 'full_name': response_data.get('fullName', ''),
154
+ 'headline': response_data.get('headline', ''),
155
+ 'connections': response_data.get('followers', ''), # or 'connections' based on availability
156
+ 'country': response_data.get('addressCountryOnly', ''),
157
+ 'address': response_data.get('addressWithoutCountry', ''),
158
+ 'about': response_data.get('about', ''),
159
+ 'current_role': (f"{response_data.get('experiences', [{}])[0].get('title', '')} at "
160
+ f"{response_data.get('experiences', [{}])[0].get('subtitle', '')}"),
161
+ 'all_roles': [f"{item.get('title', '')} at {item.get('subtitle', '')}" for item in response_data.get('experiences', [{}])],
162
+ 'education': (f"{response_data.get('educations', [{}])[0].get('subtitle', '')} at "
163
+ f"{response_data.get('educations', [{}])[0].get('title', '')}"),
164
+ 'all_education': [f"{item.get('subtitle', '')} at {item.get('title', '')}" for item in response_data.get('educations', [{}])]
165
+ }
166
+
167
+ # Append the dictionary to the list
168
+ profile_info_list.append(profile_info)
169
+
170
+ except requests.exceptions.RequestException as e:
171
+ print(f"Request error: {e}")
172
+ except ValueError as e:
173
+ print(f"Value error: {e}")
174
+ except KeyError as e:
175
+ print(f"Key error: {e}")
176
+ except Exception as e:
177
+ print(f"An unexpected error occurred: {e}")
178
 
179
  return profile_info_list
180
 
181
# Function to populate DataFrame with LinkedIn information
def get_LI_info(df, progress=gr.Progress()):
    """Scrape LinkedIn profiles for every URL in df['Links'] and merge results.

    Args:
        df: DataFrame with a 'Links' column of LinkedIn profile URLs.
        progress: Gradio progress tracker used to render the scraping bar.

    Returns:
        The same DataFrame with profile columns populated; returned unchanged
        if the batch scrape itself fails.
    """
    try:
        links = df['Links'].tolist()
        profile_info_list = scrape_linkedins(links)
    except Exception as e:
        print(f"Error scraping LinkedIn profiles: {e}")
        return df

    # Create a dictionary for quick lookup based on the link.
    profile_info_dict = {info['link']: info for info in profile_info_list if info}

    # Map DataFrame column -> key in each scraped profile dict.
    field_map = {
        'Full Name': 'full_name',
        'Headline': 'headline',
        'Connections': 'connections',
        'Country': 'country',
        'Address': 'address',
        'About': 'about',
        'Current Role': 'current_role',
        'All Roles': 'all_roles',
        'Most Recent Education': 'education',
        'All Education': 'all_education',
    }

    # Add new columns to the DataFrame without clobbering existing ones.
    for column in field_map:
        if column not in df.columns:
            df[column] = ''

    # Populate the DataFrame by matching the Link values.
    for index, row in progress.tqdm(df.iterrows(), desc='Scraping...'):
        # BUG FIX: bind 'link' before the try so the except message below
        # cannot itself raise NameError when row['Links'] fails.
        link = None
        try:
            link = row['Links']
            if link in profile_info_dict:
                profile_info = profile_info_dict[link]
                for column, key in field_map.items():
                    df.at[index, column] = profile_info.get(key, '')
            else:
                print(f"Profile information for link {link} not found.")
        except Exception as e:
            print(f"Error processing row {index} with link {link}: {e}")

    return df
220
 
221
 
222
def get_scrape_data(csv_files, social_media, password):
    """Validate the password, merge the uploaded CSVs, scrape, and save results.

    Args:
        csv_files: List of uploaded file objects (each has a .name path).
        social_media: 'LinkedIn' or 'Instagram' — selects the scraper.
        password: Dashboard password, checked against DASHBOARD_PASSWORD.

    Returns:
        Tuple of (completion status string, visible DownloadButton pointing at
        the written CSV, the scraped DataFrame).

    Raises:
        gr.Error: On a wrong password or an unsupported platform choice.
    """
    if password != os.environ['DASHBOARD_PASSWORD']:
        raise gr.Error('Incorrect Password')

    # Read each uploaded CSV and concatenate into a single DataFrame.
    dataframes = [pd.read_csv(csv_file.name) for csv_file in csv_files]
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Process the combined DataFrame based on the social media platform.
    if social_media == 'LinkedIn':
        output_df = get_LI_info(combined_df)
    elif social_media == 'Instagram':
        output_df = get_insta_info(combined_df)
    else:
        # BUG FIX: previously fell through with output_df undefined, crashing
        # with NameError below instead of reporting a usable error.
        raise gr.Error(f'Unsupported platform: {social_media}')

    print(output_df.head(2))
    file_name = f'./{social_media}_output.csv'
    output_df.to_csv(file_name)
    completion_status = "Done"
    return completion_status, gr.DownloadButton(label='Download Scraped Data', value=file_name, visible=True), output_df
248
 
 
255
  """)
256
  with gr.Column(visible=True):
257
  password = gr.Textbox(label='Enter Password')
258
+ csv_file = gr.File(label='Input CSV File (must be CSV File)', file_count='multiple')
259
  social_media = gr.Radio(choices=['LinkedIn', 'Instagram'], label='Which Social Media?', info = 'Which Social Media do you want to scrape from?')
260
  con_gen_btn = gr.Button('Scrape')
261
  status = gr.Textbox(label='Completion Status')