Spaces:

Almaatla
/

Standard_Intelligence_Dev

Sleeping

App Files Files Community

heymenn committed on Apr 15, 2024

Commit

718f3ed

verified ·

1 Parent(s): ffe8e3b

Update scrape_3gpp.py

Browse files

Files changed (1) hide show

scrape_3gpp.py +21 -44

scrape_3gpp.py CHANGED Viewed

@@ -76,50 +76,27 @@ def scrape(url, folder_name, status_list, sorted_files, progress=gr.Progress()):
     df = pd.DataFrame()  # Initialize df to ensure it's always defined
     excel_file = "guide_status.xlsx"
-    # Try to process the Excel file if provided and valid
-    if excel_file and os.path.exists(excel_file):
-        try:
-            df = pd.read_excel(excel_file)
-            print(f"Initial DataFrame size: {len(df)}")
-            if 'TDoc Status' in df.columns and status_list:
-                df = df[df['TDoc Status'].isin(status_list)]
-                print(f"Filtered DataFrame size: {len(df)}")
-            if not df.empty:
-                if 'TDoc' in df.columns and not df['TDoc'].isnull().all():
-                    status_filenames = [f"{url}{row['TDoc']}.zip" for index, row in df.iterrows()]
-                elif 'URL' in df.columns and not df['URL'].isnull().all():
-                    status_filenames = df['URL'].tolist()
-                print(f"Filenames from Excel: {status_filenames}")
-        except Exception as e:
-            print(f"Error reading Excel file: {e}")
-    # If no valid Excel file is given or no status_filenames are found, download zip files directly from the URL
-    if not excel_file or not status_filenames:
-        print("Downloading zip files directly from the URL...")
-        response = requests.get(url)
-        soup = BeautifulSoup(response.content, 'html.parser')
-        # Select all zip files
-        zip_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.zip') ]
-        sorted_files_tab = []
-        # Check if the user selected some filters
-        if len(sorted_files) != 0:
-            for link in zip_links:
-                for file in sorted_files:
-                    if file in link:
-                        sorted_files_tab.append(link)
-        if len(sorted_files_tab) != 0:
-            zip_links = sorted_files_tab
-        # Construct absolute URLs for zip files
-        status_filenames = [url + link if not link.startswith('http') else link for link in zip_links]
-        print(f"Filenames from URL: {status_filenames}")
     download_directory = folder_name
     if not os.path.exists(download_directory):

     df = pd.DataFrame()  # Initialize df to ensure it's always defined
     excel_file = "guide_status.xlsx"
+    print("Downloading zip files directly from the URL...")
+    response = requests.get(url)
+    soup = BeautifulSoup(response.content, 'html.parser')
+    # Select all zip files
+    zip_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.zip') ]
+    sorted_files_tab = []
+    # Check if the user selected some filters
+    if len(sorted_files) != 0:
+        for link in zip_links:
+            for file in sorted_files:
+                if file in link:
+                    sorted_files_tab.append(link)
+    if len(sorted_files_tab) != 0:
+        zip_links = sorted_files_tab
+    # Construct absolute URLs for zip files
+    status_filenames = [url + link if not link.startswith('http') else link for link in zip_links]
+    print(f"Filenames from URL: {status_filenames}")
     download_directory = folder_name
     if not os.path.exists(download_directory):