Spaces:

Almaatla
/

Standard_Intelligence_Dev

Sleeping

App Files Files Community

heymenn committed on Apr 12, 2024

Commit

103847d

verified ·

1 Parent(s): 7a48811

Add the sorting of files (#2)

Browse files

- Add the sorting of files (621a4e205f9990bac2cf0a0f044a2c82f5d99f4f)

Files changed (1) hide show

scrape_3gpp.py +54 -4

scrape_3gpp.py CHANGED Viewed

@@ -69,17 +69,23 @@ from bs4 import BeautifulSoup
 import pandas as pd
 import gradio as gr
-def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
     filenames = []
     status_filenames = []
     df = pd.DataFrame()  # Initialize df to ensure it's always defined
     # Try to process the Excel file if provided and valid
     if excel_file and os.path.exists(excel_file):
         try:
             df = pd.read_excel(excel_file)
             print(f"Initial DataFrame size: {len(df)}")
             if 'TDoc Status' in df.columns and status_list:
                 df = df[df['TDoc Status'].isin(status_list)]
                 print(f"Filtered DataFrame size: {len(df)}")
@@ -99,7 +105,20 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
         print("Downloading zip files directly from the URL...")
         response = requests.get(url)
         soup = BeautifulSoup(response.content, 'html.parser')
-        zip_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.zip')]
         # Construct absolute URLs for zip files
         status_filenames = [url + link if not link.startswith('http') else link for link in zip_links]
@@ -111,12 +130,19 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
     pourcentss = 0.05
     # Proceed with downloading files
     for file_url in status_filenames:
         filename = os.path.basename(file_url)
         save_path = os.path.join(download_directory, filename)
         progress(pourcentss, desc='Downloading')
         pourcentss += 0.4 / max(len(status_filenames), 1)  # Ensure non-zero division
         try:
             with requests.get(file_url, stream=True) as r:
                 r.raise_for_status()
@@ -243,8 +269,32 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
     temp_excel = nom_status
     progress(0.0,desc='Downloading')
-    result, count = scrape(url, excel_file, folder_name, status_list)
     if result:
         print("Success")
     else:

 import pandas as pd
 import gradio as gr
+def scrape(url, excel_file, folder_name, status_list, sorted_files, progress=gr.Progress()):
+    print("ENTERING SCRAPE FUNCTION")
     filenames = []
     status_filenames = []
     df = pd.DataFrame()  # Initialize df to ensure it's always defined
     # Try to process the Excel file if provided and valid
+    print(f"WE ARE TESTING IF OS.PATH.EXISTS WITH THIS FILE : {excel_file}")
     if excel_file and os.path.exists(excel_file):
         try:
             df = pd.read_excel(excel_file)
             print(f"Initial DataFrame size: {len(df)}")
+            print(f"WE ARE TRYING TO LOOK AT status_list : {status_list}")
+            print(f"WE ARE TRYING TO LOOK AT df.columns : {df.columns.tolist()}")
             if 'TDoc Status' in df.columns and status_list:
                 df = df[df['TDoc Status'].isin(status_list)]
                 print(f"Filtered DataFrame size: {len(df)}")
         print("Downloading zip files directly from the URL...")
         response = requests.get(url)
         soup = BeautifulSoup(response.content, 'html.parser')
+        # Select all zip files
+        zip_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.zip') ]
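+        # If the user selected status filters (sorted_files is non-empty), keep only the zip links whose href contains one of those file names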
+        if len(sorted_files) != 0:
+            sorted_files_tab = []
+            for link in zip_links:
+                for file in sorted_files:
+                    if file in link:
+                        sorted_files_tab.append(link)
+        if len(sorted_files_tab) != 0:
+            zip_links = sorted_files_tab
         # Construct absolute URLs for zip files
         status_filenames = [url + link if not link.startswith('http') else link for link in zip_links]
     pourcentss = 0.05
     # Proceed with downloading files
     for file_url in status_filenames:
         filename = os.path.basename(file_url)
         save_path = os.path.join(download_directory, filename)
         progress(pourcentss, desc='Downloading')
         pourcentss += 0.4 / max(len(status_filenames), 1)  # Ensure non-zero division
         try:
             with requests.get(file_url, stream=True) as r:
                 r.raise_for_status()
     temp_excel = nom_status
     progress(0.0,desc='Downloading')
+    #Sorting files, downloading only files which have the status selected by the user
+    sorted_files = []
+    try:
+        guide_file = 'guide.xlsx'
+        if os.path.exists(guide_file):
+            dfStatus = pd.read_excel(guide_file)
+            # Look if the user selected some filter status
+            if len(dfStatus['TDoc Status'].unique().tolist()) != len (status_list):
+                keys_statuses_filename = dfStatus['TDoc'].tolist()
+                values_unique_statuses = dfStatus['TDoc Status'].tolist()
+                doc_statuses = dict(zip(keys_statuses_filename, values_unique_statuses))
+                for key in doc_statuses.keys():
+                    if doc_statuses[key] in status_list:
+                        sorted_files.append(key)
+            print(sorted_files)
+    except Exception as e:
+        print(f"Not able to retrieve informations from 'guide.xlsx' ")
+    result, count = scrape(url, excel_file, folder_name, status_list, sorted_files)
     if result:
         print("Success")
     else: