Spaces:

Almaatla
/

Standard_Intelligence_Dev

Sleeping

App Files Community

MaksG committed on Mar 11, 2024

Commit

396839f

verified ·

1 Parent(s): 0226df2

Update scrape_3gpp.py

Browse files

Files changed (1) hide show

scrape_3gpp.py +39 -44

scrape_3gpp.py CHANGED Viewed

@@ -9,31 +9,6 @@ import textract
 import gradio as gr
-def count_links(url):
-    # Define common file extensions for downloadable content
-    file_extensions = ('.zip')
-    try:
-        # Send a HTTP request to the URL
-        response = requests.get(url)
-        response.raise_for_status()  # Raise an exception for HTTP errors
-        # Parse the HTML content of the page
-        soup = BeautifulSoup(response.text, 'html.parser')
-        # Find all <a> tags in the HTML
-        links = soup.find_all('a')
-        # Count the number of links that point to downloadable files
-        count = sum(1 for link in links if any(link.get('href', '').endswith(ext) for ext in file_extensions))
-        return count
-    except requests.RequestException as e:
-        print(f"Error fetching the page: {e}")
-        return None
 def browse_folder(url):
     if url.lower().endswith(('docs', 'docs/')):
       return gr.update(choices=[])
@@ -87,11 +62,11 @@ def extract_statuses(url):
                 return []
-def scrape(url, excel_file, folder_name, status_list,count, progress=gr.Progress()):
     filenames = []
     status_filenames = []
     # Check if the excel_file argument is provided and if the file exists.
-    excel_file_path = 'guide_status.xlsx'  # Hardcoded path to the Excel file
     if os.path.exists(excel_file_path):
         try:
@@ -141,7 +116,7 @@ def scrape(url, excel_file, folder_name, status_list,count, progress=gr.Progress
     download_directory = folder_name
     if not os.path.exists(download_directory):
         os.makedirs(download_directory)
     pourcentss = 0.05
     print(f'filenames: {status_filenames}')
     if not filenames and not status_filenames:
@@ -157,11 +132,11 @@ def scrape(url, excel_file, folder_name, status_list,count, progress=gr.Progress
         # Filtrer les liens se terminant par ".zip"
         zip_links = [link['href'] for link in links if link['href'].endswith('.zip')]
         # Télécharger chaque fichier zip
         for zip_link in zip_links:
             progress(pourcentss,desc='Downloading')
-            pourcentss+=0.4/count
             # Construire l'URL absolue du fichier zip
             absolute_url = urljoin(url, zip_link)
@@ -184,7 +159,7 @@ def scrape(url, excel_file, folder_name, status_list,count, progress=gr.Progress
           filename = os.path.basename(file_url)
           save_path = os.path.join(download_directory, filename)
           progress(pourcentss,desc='Downloading')
-          pourcentss+=0.4/count
           try:
               with requests.get(file_url, stream=True) as r:
                   r.raise_for_status()
@@ -210,14 +185,19 @@ def scrape(url, excel_file, folder_name, status_list,count, progress=gr.Progress
               print(f"HTTP error occurred: {file_url}: {e}")
               return False, "Il n'y a pas de colonne action ou alors celle ci n'est pas bien écrite, format attendu: 'Actions'"
-    return True, "Téléchargement terminé !"
-def extractZip(folder_name):
     # Répertoire où les fichiers zip sont déjà téléchargés
-    download_directory = folder_name
-    extract_directory = folder_name + " extraction"  # Répertoire où le contenu des fichiers zip sera extrait
     # Extraire le contenu de tous les fichiers zip dans le répertoire de téléchargement
     for zip_file in os.listdir(download_directory):
@@ -233,6 +213,7 @@ def extractZip(folder_name):
                     os.makedirs(extract_dir)
                 # Extraire le contenu du fichier zip
                 with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                     zip_ref.extractall(extract_dir)
@@ -242,6 +223,7 @@ def extractZip(folder_name):
     print("Toutes les extractions sont terminées !")
 def excel3gpp(url):
     response = requests.get(url)
     response.raise_for_status()  # This will raise an exception if there's an error
@@ -263,12 +245,16 @@ def excel3gpp(url):
         excel_response.raise_for_status()
         # Define the path where you want to save the file
-        filename = excel_url.split('/')[-1]
-        filepath = os.path.join('path_to_save_directory', filename)  # Replace 'path_to_save_directory' with your desired path
         # Write the content of the Excel file to a local file
         # Write the content of the Excel file to a local file named 'guide.xlsx'
-        filepath = 'guide.xlsx'  # Directly specify the filename
         with open(filepath, 'wb') as f:
             f.write(excel_response.content)
@@ -300,24 +286,32 @@ def update_excel(data, excel_file, url):
         print(f"Error updating Excel file: {e}")
 def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
-    folder_name = 'nom provisoire'
-    temp_excel = url.split("/")[-2] + "_status.xlsx"
     progress(0.0,desc='Downloading')
-    count = count_links(url)
-    result, message = scrape(url, excel_file, folder_name, status_list)
     if result:
         print("Success:", message)
     else:
         return(None, message)
     progress(0.4,desc='Extraction')
-    extractZip(folder_name)
     progress(0.5,desc='Extraction 2')
     excel3gpp(url)
     progress(0.6,desc='Creating Excel File')
-    extract_directory = folder_name +" extraction"
     categories = {
         "Other": ["URL", "File", "Type", "Title", "Source", "Content"],
         "CR": ["URL", "File", "Type", "Title", "Source", "Content"],
@@ -518,3 +512,4 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
     file_name = temp_excel
     # Save the updated DataFrame to Excel
     return file_name, "Téléchargement réussi"

 import gradio as gr
 def browse_folder(url):
     if url.lower().endswith(('docs', 'docs/')):
       return gr.update(choices=[])
                 return []
+def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
     filenames = []
     status_filenames = []
     # Check if the excel_file argument is provided and if the file exists.
+    excel_file_path = "guide_status.xlsx"  # Hardcoded path to the Excel file
     if os.path.exists(excel_file_path):
         try:
     download_directory = folder_name
     if not os.path.exists(download_directory):
         os.makedirs(download_directory)
     pourcentss = 0.05
     print(f'filenames: {status_filenames}')
     if not filenames and not status_filenames:
         # Filtrer les liens se terminant par ".zip"
         zip_links = [link['href'] for link in links if link['href'].endswith('.zip')]
         # Télécharger chaque fichier zip
         for zip_link in zip_links:
             progress(pourcentss,desc='Downloading')
+            pourcentss+=0.4/len(df)
             # Construire l'URL absolue du fichier zip
             absolute_url = urljoin(url, zip_link)
           filename = os.path.basename(file_url)
           save_path = os.path.join(download_directory, filename)
           progress(pourcentss,desc='Downloading')
+          pourcentss+=0.4/len(df)
           try:
               with requests.get(file_url, stream=True) as r:
                   r.raise_for_status()
               print(f"HTTP error occurred: {file_url}: {e}")
               return False, "Il n'y a pas de colonne action ou alors celle ci n'est pas bien écrite, format attendu: 'Actions'"
+    return True, "Téléchargement terminé !", len(df)
+def extractZip(url):
     # Répertoire où les fichiers zip sont déjà téléchargés
+    nom_extract = url.split("/")[-3] + "_extraction"
+    if os.path.exists(nom_extract):
+        shutil.rmtree(nom_extract)
+    extract_directory = nom_extract
+    download_directory = url.split("/")[-3] + "_downloads"
+      # Répertoire où le contenu des fichiers zip sera extrait
     # Extraire le contenu de tous les fichiers zip dans le répertoire de téléchargement
     for zip_file in os.listdir(download_directory):
                     os.makedirs(extract_dir)
                 # Extraire le contenu du fichier zip
                 with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                     zip_ref.extractall(extract_dir)
     print("Toutes les extractions sont terminées !")
 def excel3gpp(url):
     response = requests.get(url)
     response.raise_for_status()  # This will raise an exception if there's an error
         excel_response.raise_for_status()
         # Define the path where you want to save the file
+        # Replace 'path_to_save_directory' with your desired path
         # Write the content of the Excel file to a local file
         # Write the content of the Excel file to a local file named 'guide.xlsx'
+        nom_guide = 'guide.xlsx'  # Directly specify the filename
+        if os.path.exists(nom_guide):
+            os.remove(nom_guide)
+        filepath = nom_guide
         with open(filepath, 'wb') as f:
             f.write(excel_response.content)
         print(f"Error updating Excel file: {e}")
 def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
+    nom_download = url.split("/")[-3] + "_downloads"
+    if os.path.exists(nom_download):
+        shutil.rmtree(nom_download)
+    folder_name = nom_download
+    nom_status = url.split("/")[-3] + "_status.xlsx"
+    if os.path.exists(nom_status):
+        os.remove(nom_status)
+    temp_excel = nom_status
     progress(0.0,desc='Downloading')
+    result, message, count = scrape(url, excel_file, folder_name, status_list)
     if result:
         print("Success:", message)
     else:
         return(None, message)
     progress(0.4,desc='Extraction')
+    extractZip(url)
     progress(0.5,desc='Extraction 2')
     excel3gpp(url)
     progress(0.6,desc='Creating Excel File')
+    extract_directory = url.split("/")[-3] + "_extraction"
     categories = {
         "Other": ["URL", "File", "Type", "Title", "Source", "Content"],
         "CR": ["URL", "File", "Type", "Title", "Source", "Content"],
     file_name = temp_excel
     # Save the updated DataFrame to Excel
     return file_name, "Téléchargement réussi"