Spaces:

mylesai
/

scraper

Paused

App Files Files Community

mylesai committed on Jun 23, 2024

Commit

f62fe01

verified ·

1 Parent(s): 1c76922

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -30

app.py CHANGED Viewed

@@ -122,8 +122,6 @@ def get_insta_info(df, progress=gr.Progress()):
 def scrape_linkedins(links):
     url = "https://linkedin-bulk-data-scraper.p.rapidapi.com/profiles"
-    payload = {"links": links}
     headers = {
         "x-rapidapi-key": f"{RAPIDAPI_API_KEY}",
         "x-rapidapi-host": "linkedin-bulk-data-scraper.p.rapidapi.com",
@@ -133,39 +131,47 @@ def scrape_linkedins(links):
     # Initialize an empty list to store the dictionaries
     profile_info_list = []
     try:
-        response = requests.post(url, json=payload, headers=headers)
-        response.raise_for_status()  # Raise HTTPError for bad responses
-        data = response.json()
-        if 'data' not in data:
-            raise ValueError("Missing 'data' in response")
-        responses = data['data']
-        for response_item in responses:
-            response_data = response_item.get('data', {})
-            # Use get() method with default empty strings for missing fields
-            profile_info = {
-                'link': response_item.get('entry', ''),
-                'full_name': response_data.get('fullName', ''),
-                'headline': response_data.get('headline', ''),
-                'connections': response_data.get('followers', ''),  # or 'connections' based on availability
-                'country': response_data.get('addressCountryOnly', ''),
-                'address': response_data.get('addressWithoutCountry', ''),
-                'about': response_data.get('about', ''),
-                'current_role': (f"{response_data.get('experiences', [{}])[0].get('title', '')} at "
-                                 f"{response_data.get('experiences', [{}])[0].get('subtitle', '')}"),
-                'all_roles': [f"{item.get('title', '')} at {item.get('subtitle', '')}" for item in response_data.get('experiences', [{}])],
-                'education': (f"{response_data.get('educations', [{}])[0].get('subtitle', '')} at "
-                              f"{response_data.get('educations', [{}])[0].get('title', '')}"),
-                'all_education': [f"{item.get('subtitle', '')} at {item.get('title', '')}" for item in response_data.get('educations', [{}])]
-            }
-            # Append the dictionary to the list
-            profile_info_list.append(profile_info)
     except requests.exceptions.RequestException as e:
         print(f"Request error: {e}")
@@ -178,6 +184,7 @@ def scrape_linkedins(links):
     return profile_info_list
 # Function to populate DataFrame with LinkedIn information
 def get_LI_info(df, progress=gr.Progress()):
     try:

 def scrape_linkedins(links):
     url = "https://linkedin-bulk-data-scraper.p.rapidapi.com/profiles"
     headers = {
         "x-rapidapi-key": f"{RAPIDAPI_API_KEY}",
         "x-rapidapi-host": "linkedin-bulk-data-scraper.p.rapidapi.com",
     # Initialize an empty list to store the dictionaries
     profile_info_list = []
+    chunk_size = 100
+    # Calculate the number of chunks needed
+    num_chunks = math.ceil(len(links) / chunk_size)
     try:
+        for i in range(num_chunks):
+            chunk = links[i * chunk_size:(i + 1) * chunk_size]
+            payload = {"links": chunk}
+            response = requests.post(url, json=payload, headers=headers)
+            response.raise_for_status()  # Raise HTTPError for bad responses
+            data = response.json()
+            if 'data' not in data:
+                raise ValueError("Missing 'data' in response")
+            responses = data['data']
+            for response_item in responses:
+                response_data = response_item.get('data', {})
+                # Use get() method with default empty strings for missing fields
+                profile_info = {
+                    'link': response_item.get('entry', ''),
+                    'full_name': response_data.get('fullName', ''),
+                    'headline': response_data.get('headline', ''),
+                    'connections': response_data.get('followers', ''),  # or 'connections' based on availability
+                    'country': response_data.get('addressCountryOnly', ''),
+                    'address': response_data.get('addressWithoutCountry', ''),
+                    'about': response_data.get('about', ''),
+                    'current_role': (f"{response_data.get('experiences', [{}])[0].get('title', '')} at "
+                                     f"{response_data.get('experiences', [{}])[0].get('subtitle', '')}"),
+                    'all_roles': [f"{item.get('title', '')} at {item.get('subtitle', '')}" for item in response_data.get('experiences', [{}])],
+                    'education': (f"{response_data.get('educations', [{}])[0].get('subtitle', '')} at "
+                                  f"{response_data.get('educations', [{}])[0].get('title', '')}"),
+                    'all_education': [f"{item.get('subtitle', '')} at {item.get('title', '')}" for item in response_data.get('educations', [{}])]
+                }
+                # Append the dictionary to the list
+                profile_info_list.append(profile_info)
     except requests.exceptions.RequestException as e:
         print(f"Request error: {e}")
     return profile_info_list
 # Function to populate DataFrame with LinkedIn information
 def get_LI_info(df, progress=gr.Progress()):
     try: