Spaces:

Almaatla
/

Standard_Intelligence_Dev

Sleeping

App Files Files Community

MaksG committed on Mar 4, 2024

Commit

4f012ae

verified ·

1 Parent(s): 7854dd4

Update scrape_3gpp.py

Browse files

Files changed (1) hide show

scrape_3gpp.py +21 -8

scrape_3gpp.py CHANGED Viewed

@@ -7,7 +7,7 @@ import numpy as np
 import zipfile
 import textract
-def scrape(url, excel_file, folder_name):
     filenames = []
     # Check if the excel_file argument is provided and if the file exists.
     if excel_file and os.path.exists(excel_file):
@@ -45,9 +45,15 @@ def scrape(url, excel_file, folder_name):
         # Filtrer les liens se terminant par ".zip"
         zip_links = [link['href'] for link in links if link['href'].endswith('.zip')]
         # Télécharger chaque fichier zip
         for zip_link in zip_links:
             # Construire l'URL absolue du fichier zip
             absolute_url = urljoin(url, zip_link)
@@ -85,6 +91,7 @@ def scrape(url, excel_file, folder_name):
     return True, "Téléchargement terminé !"
 def extractZip(folder_name):
     # Répertoire où les fichiers zip sont déjà téléchargés
     download_directory = folder_name
@@ -147,25 +154,27 @@ def excel3gpp(url):
         print(f'Excel file downloaded and saved as: {filepath}')
 def replace_line_breaks(text):
     """Return ``text`` with every newline replaced by the two-character marker "/n".

     ``remod_text`` applies the opposite substitution. The marker is a plain
     slash followed by ``n`` (not a backslash escape), so text that already
     contains "/n" is indistinguishable from an encoded newline afterwards.
     """
     newline = "\n"
     marker = "/n"
     return text.replace(newline, marker)
 def remod_text(text):
     """Return ``text`` with every "/n" marker turned back into a real newline.

     This is the reverse substitution of ``replace_line_breaks``. Any literal
     "/n" sequence in the input is converted too, including ones that were
     never an encoded newline (e.g. "w/neighbor").
     """
     marker = "/n"
     newline = "\n"
     return text.replace(marker, newline)
-def extractionPrincipale(url, excel_file=None):
     folder_name = url.split("/")[-2]
     result, message = scrape(url, excel_file, folder_name)
     if result:
         print("Success:", message)
     else:
         return(None, message)
     extractZip(folder_name)
     excel3gpp(url)
     extract_directory = folder_name +" extraction"
     categories = {
@@ -180,7 +189,8 @@ def extractionPrincipale(url, excel_file=None):
         "ppt": ["URL", "File", "Type", "Title", "Source", "Content"],
         "pptx": ["URL", "File", "Type", "Title", "Source", "Content"]
     }
     data = []
     errors_count = 0
     pre_title_section = None
@@ -188,6 +198,9 @@ def extractionPrincipale(url, excel_file=None):
         folder_path = os.path.join(extract_directory, folder)
         if os.path.isdir(folder_path):
             for file in os.listdir(folder_path):
                 if file == "__MACOSX":
                     continue
                 file_path = os.path.join(folder_path, file)
@@ -340,7 +353,7 @@ def extractionPrincipale(url, excel_file=None):
-    new_df_columns = ["URL", "File", "Type", "title", "Source", "Status", "Content"]  # Create a DataFrame with the updated data
     new_df = pd.DataFrame(data, columns=new_df_columns)
     try:
         old_df = pd.read_excel(excel_file)

 import zipfile
 import textract
+def scrape(url, excel_file, folder_name,progress=gr.Progress()):
     filenames = []
     # Check if the excel_file argument is provided and if the file exists.
     if excel_file and os.path.exists(excel_file):
         # Filtrer les liens se terminant par ".zip"
         zip_links = [link['href'] for link in links if link['href'].endswith('.zip')]
+        download_num = 0
+        pourcentss = 0.1
         # Télécharger chaque fichier zip
         for zip_link in zip_links:
+            if download_num%10 == 0:
+              pourcentss = pourcentss + download_num/500
+              progress(pourcentss,desc='Telechargement')
+              download_num = 0
+            download_num+=1
             # Construire l'URL absolue du fichier zip
             absolute_url = urljoin(url, zip_link)
     return True, "Téléchargement terminé !"
 def extractZip(folder_name):
     # Répertoire où les fichiers zip sont déjà téléchargés
     download_directory = folder_name
         print(f'Excel file downloaded and saved as: {filepath}')
 def replace_line_breaks(text):
     """Return ``text`` with every newline replaced by the two-character marker "/n".

     ``remod_text`` applies the opposite substitution. The marker is a plain
     slash followed by ``n`` (not a backslash escape), so text that already
     contains "/n" is indistinguishable from an encoded newline afterwards.
     """
     newline = "\n"
     marker = "/n"
     return text.replace(newline, marker)
 def remod_text(text):
     """Return ``text`` with every "/n" marker turned back into a real newline.

     This is the reverse substitution of ``replace_line_breaks``. Any literal
     "/n" sequence in the input is converted too, including ones that were
     never an encoded newline (e.g. "w/neighbor").
     """
     marker = "/n"
     newline = "\n"
     return text.replace(marker, newline)
+def extractionPrincipale(url, excel_file=None,progress=gr.Progress()):
     folder_name = url.split("/")[-2]
+    progress(0.1,desc='Telechargement')
     result, message = scrape(url, excel_file, folder_name)
     if result:
         print("Success:", message)
     else:
         return(None, message)
+    progress(0.4,desc='Extraction')
     extractZip(folder_name)
+    progress(0.5,desc='Extraction 2')
     excel3gpp(url)
+    progress(0.6,desc='Mise en forme Excel')
     extract_directory = folder_name +" extraction"
     categories = {
         "ppt": ["URL", "File", "Type", "Title", "Source", "Content"],
         "pptx": ["URL", "File", "Type", "Title", "Source", "Content"]
     }
+    nouv=0
+    num=0.6
     data = []
     errors_count = 0
     pre_title_section = None
         folder_path = os.path.join(extract_directory, folder)
         if os.path.isdir(folder_path):
             for file in os.listdir(folder_path):
+                num=num + nouv/400
+                progress(num,desc='Mise en forme Excel')
+                nouv+=1
                 if file == "__MACOSX":
                     continue
                 file_path = os.path.join(folder_path, file)
+    new_df_columns = ["URL", "File", "Type", "Title", "Source", "Status", "Content"]  # Create a DataFrame with the updated data
     new_df = pd.DataFrame(data, columns=new_df_columns)
     try:
         old_df = pd.read_excel(excel_file)