Spaces:
Sleeping
Sleeping
Update scrape_3gpp.py
Browse files- scrape_3gpp.py +36 -106
scrape_3gpp.py
CHANGED
|
@@ -65,135 +65,65 @@ def extract_statuses(url):
|
|
| 65 |
def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
|
| 66 |
filenames = []
|
| 67 |
status_filenames = []
|
| 68 |
-
|
| 69 |
-
excel_file_path = "guide_status.xlsx" # Hardcoded path to the Excel file
|
| 70 |
|
| 71 |
-
if os.path.exists(
|
| 72 |
try:
|
| 73 |
-
df = pd.read_excel(
|
| 74 |
print(f"Initial DataFrame size: {len(df)}")
|
| 75 |
|
| 76 |
-
if 'TDoc Status' in df.columns:
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
else:
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
status_filenames = df['URL'].tolist()
|
| 94 |
-
else:
|
| 95 |
-
print("No valid 'File' or 'URL' entries found for the filtered statuses.")
|
| 96 |
-
|
| 97 |
-
print(f"Filenames: {status_filenames}")
|
| 98 |
else:
|
| 99 |
-
print("
|
| 100 |
-
|
| 101 |
-
except Exception as e:
|
| 102 |
-
print(f"Error reading Excel file: {e}")
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
if excel_file and os.path.exists(excel_file):
|
| 107 |
-
try:
|
| 108 |
-
df = pd.read_excel(excel_file)
|
| 109 |
|
| 110 |
-
# If 'Actions' in df.columns and filter based on it, and construct URLs from 'TDoc' or 'URL' columns
|
| 111 |
-
if 'Actions' in df.columns:
|
| 112 |
-
df = df[df['Actions'] == 'x']
|
| 113 |
-
|
| 114 |
-
elif 'File' in df.columns:
|
| 115 |
-
filenames = [f"{url}{row['File']}.zip" for index, row in df.iterrows()]
|
| 116 |
-
elif 'URL' in df.columns:
|
| 117 |
-
filenames = df['URL'].tolist()
|
| 118 |
except Exception as e:
|
| 119 |
print(f"Error reading Excel file: {e}")
|
| 120 |
-
# Optionally, handle the error or return a message if needed
|
| 121 |
|
| 122 |
-
# If no Excel file is provided or found, or if it lacks 'TDoc'/'URL', the function can still continue with predefined URLs or other logic
|
| 123 |
download_directory = folder_name
|
| 124 |
if not os.path.exists(download_directory):
|
| 125 |
os.makedirs(download_directory)
|
| 126 |
|
| 127 |
pourcentss = 0.05
|
| 128 |
-
print(f'filenames: {status_filenames}')
|
| 129 |
-
if not filenames and not status_filenames:
|
| 130 |
-
print("No Excel file provided, or no valid URLs found in the file.")
|
| 131 |
-
# You can either return here or continue with other predefined logic
|
| 132 |
-
response = requests.get(url)
|
| 133 |
-
|
| 134 |
-
# Analyser le contenu HTML de la page
|
| 135 |
-
soup = BeautifulSoup(response.content, "html.parser")
|
| 136 |
-
|
| 137 |
-
# Trouver tous les balises <a> avec des attributs href (liens)
|
| 138 |
-
links = soup.find_all("a", href=True)
|
| 139 |
-
|
| 140 |
-
# Filtrer les liens se terminant par ".zip"
|
| 141 |
-
zip_links = [link['href'] for link in links if link['href'].endswith('.zip')]
|
| 142 |
-
|
| 143 |
-
# Télécharger chaque fichier zip
|
| 144 |
-
for zip_link in zip_links:
|
| 145 |
-
progress(pourcentss,desc='Downloading')
|
| 146 |
-
pourcentss+=0.4/len(df)
|
| 147 |
-
# Construire l'URL absolue du fichier zip
|
| 148 |
-
absolute_url = urljoin(url, zip_link)
|
| 149 |
-
|
| 150 |
-
# Extraire le nom de fichier de l'URL
|
| 151 |
-
filename = os.path.basename(absolute_url)
|
| 152 |
|
| 153 |
-
|
| 154 |
-
|
|
|
|
| 155 |
|
| 156 |
-
|
| 157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
r.raise_for_status()
|
| 159 |
with open(save_path, 'wb') as f:
|
| 160 |
for chunk in r.iter_content(chunk_size=8192):
|
| 161 |
f.write(chunk)
|
|
|
|
|
|
|
|
|
|
| 162 |
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
for file_url in status_filenames:
|
| 166 |
-
filename = os.path.basename(file_url)
|
| 167 |
-
save_path = os.path.join(download_directory, filename)
|
| 168 |
-
progress(pourcentss,desc='Downloading')
|
| 169 |
-
pourcentss+=0.4/len(df)
|
| 170 |
-
try:
|
| 171 |
-
with requests.get(file_url, stream=True) as r:
|
| 172 |
-
r.raise_for_status()
|
| 173 |
-
with open(save_path, 'wb') as f:
|
| 174 |
-
for chunk in r.iter_content(chunk_size=8192):
|
| 175 |
-
f.write(chunk)
|
| 176 |
-
except requests.exceptions.HTTPError as e:
|
| 177 |
-
print(f"skipped file: {file_url}: {e}")
|
| 178 |
|
| 179 |
-
else:
|
| 180 |
-
# Proceed with downloading files using the filenames list
|
| 181 |
-
for file_url in filenames:
|
| 182 |
-
filename = os.path.basename(file_url)
|
| 183 |
-
save_path = os.path.join(download_directory, filename)
|
| 184 |
-
progress(pourcentss,desc='Downloading')
|
| 185 |
-
pourcentss+=0.4/len(df)
|
| 186 |
-
try:
|
| 187 |
-
with requests.get(file_url, stream=True) as r:
|
| 188 |
-
r.raise_for_status()
|
| 189 |
-
with open(save_path, 'wb') as f:
|
| 190 |
-
for chunk in r.iter_content(chunk_size=8192):
|
| 191 |
-
f.write(chunk)
|
| 192 |
-
except requests.exceptions.HTTPError as e:
|
| 193 |
-
print(f"HTTP error occurred: {file_url}: {e}")
|
| 194 |
-
return False, "Il n'y a pas de colonne action ou alors celle ci n'est pas bien écrite, format attendu: 'Actions'"
|
| 195 |
-
|
| 196 |
-
return True, len(df)
|
| 197 |
|
| 198 |
|
| 199 |
|
|
|
|
| 65 |
def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
    """Download 3GPP .zip documents listed in an Excel file.

    Reads *excel_file*, optionally filters rows by the 'TDoc Status'
    values in *status_list*, builds download URLs from the 'TDoc' or
    'URL' column, and streams each .zip into *folder_name*.

    Parameters
    ----------
    url : str
        Base URL prefixed to each 'TDoc' identifier to form a .zip link.
    excel_file : str | None
        Path to the Excel file describing the documents to fetch.
    folder_name : str
        Directory the downloads are written to (created if missing).
    status_list : list[str]
        'TDoc Status' values to keep; an empty list keeps every row.
    progress : gr.Progress
        Gradio progress reporter updated while downloading.

    Returns
    -------
    tuple[bool, int]
        (False, 0) when no downloadable URLs were found, otherwise
        (True, number_of_files_attempted).
    """
    status_filenames = []
    df = pd.DataFrame()  # Ensure df is always defined even if reading fails

    # Guard against excel_file being None/"" before touching the filesystem
    if excel_file and os.path.exists(excel_file):
        try:
            df = pd.read_excel(excel_file)
            print(f"Initial DataFrame size: {len(df)}")

            if 'TDoc Status' in df.columns and status_list:
                df = df[df['TDoc Status'].isin(status_list)]
                print(f"Filtered DataFrame size: {len(df)}")
            else:
                # An empty status_list means "keep all statuses"
                print("No filtering applied based on TDoc Status")

            if not df.empty:
                # Prefer the 'TDoc' column; fall back to explicit 'URL's.
                # NaN cells are dropped so we never build a ".../nan.zip" link.
                if 'TDoc' in df.columns and not df['TDoc'].isnull().all():
                    status_filenames = [
                        f"{url}{tdoc}.zip" for tdoc in df['TDoc'].dropna()
                    ]
                elif 'URL' in df.columns and not df['URL'].isnull().all():
                    status_filenames = df['URL'].dropna().tolist()
                else:
                    print("No valid 'TDoc' or 'URL' entries found.")

                print(f"Filenames: {status_filenames}")
            else:
                print("DataFrame is empty after filtering.")
        except Exception as e:
            # Best-effort: a malformed workbook must not crash the UI
            print(f"Error reading Excel file: {e}")

    download_directory = folder_name
    os.makedirs(download_directory, exist_ok=True)

    pourcentss = 0.05

    if not status_filenames:
        print("No Excel file provided, or no valid URLs found in the file.")
        return False, 0

    # Stream each file to disk in 8 KiB chunks, advancing the progress bar.
    for file_url in status_filenames:
        filename = os.path.basename(file_url)
        save_path = os.path.join(download_directory, filename)
        progress(pourcentss, desc='Downloading')
        # Spread 40% of the bar across the actual number of files
        pourcentss += 0.4 / len(status_filenames)
        try:
            with requests.get(file_url, stream=True) as r:
                r.raise_for_status()
                with open(save_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
        except requests.exceptions.HTTPError as e:
            # Skip unreachable documents instead of aborting the whole batch
            print(f"HTTP error occurred: {file_url}: {e}")

    return True, len(status_filenames)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
|
| 129 |
|