Spaces:
Sleeping
Sleeping
Update scrape_3gpp.py
Browse files- scrape_3gpp.py +7 -7
scrape_3gpp.py
CHANGED
|
@@ -67,7 +67,8 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
|
|
| 67 |
status_filenames = []
|
| 68 |
df = pd.DataFrame() # Initialize df to ensure it's always defined
|
| 69 |
|
| 70 |
-
if
|
|
|
|
| 71 |
try:
|
| 72 |
df = pd.read_excel(excel_file)
|
| 73 |
print(f"Initial DataFrame size: {len(df)}")
|
|
@@ -76,7 +77,6 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
|
|
| 76 |
df = df[df['TDoc Status'].isin(status_list)]
|
| 77 |
print(f"Filtered DataFrame size: {len(df)}")
|
| 78 |
else:
|
| 79 |
-
# If status_list is empty, consider all statuses
|
| 80 |
print("No filtering applied based on TDoc Status")
|
| 81 |
|
| 82 |
if not df.empty:
|
|
@@ -86,13 +86,15 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
|
|
| 86 |
status_filenames = df['URL'].tolist()
|
| 87 |
else:
|
| 88 |
print("No valid 'TDoc' or 'URL' entries found.")
|
| 89 |
-
|
| 90 |
print(f"Filenames: {status_filenames}")
|
| 91 |
else:
|
| 92 |
print("DataFrame is empty after filtering.")
|
| 93 |
|
| 94 |
except Exception as e:
|
| 95 |
print(f"Error reading Excel file: {e}")
|
|
|
|
|
|
|
| 96 |
|
| 97 |
download_directory = folder_name
|
| 98 |
if not os.path.exists(download_directory):
|
|
@@ -109,8 +111,7 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
|
|
| 109 |
filename = os.path.basename(file_url)
|
| 110 |
save_path = os.path.join(download_directory, filename)
|
| 111 |
progress(pourcentss, desc='Downloading')
|
| 112 |
-
|
| 113 |
-
pourcentss += 0.4 / len(status_filenames)
|
| 114 |
try:
|
| 115 |
with requests.get(file_url, stream=True) as r:
|
| 116 |
r.raise_for_status()
|
|
@@ -119,14 +120,13 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
|
|
| 119 |
f.write(chunk)
|
| 120 |
except requests.exceptions.HTTPError as e:
|
| 121 |
print(f"HTTP error occurred: {file_url}: {e}")
|
| 122 |
-
# Decide how you want to handle HTTP errors (e.g., skip this file, stop the process, etc.)
|
| 123 |
|
| 124 |
-
# Ensure correct return value, especially if the function should indicate success/failure and the number of processed files
|
| 125 |
return True, len(status_filenames)
|
| 126 |
|
| 127 |
|
| 128 |
|
| 129 |
|
|
|
|
| 130 |
def extractZip(url):
|
| 131 |
# Répertoire où les fichiers zip sont déjà téléchargés
|
| 132 |
nom_extract = url.split("/")[-3] + "_extraction"
|
|
|
|
| 67 |
status_filenames = []
|
| 68 |
df = pd.DataFrame() # Initialize df to ensure it's always defined
|
| 69 |
|
| 70 |
+
# Only proceed if excel_file is not None and it exists
|
| 71 |
+
if excel_file and os.path.exists(excel_file):
|
| 72 |
try:
|
| 73 |
df = pd.read_excel(excel_file)
|
| 74 |
print(f"Initial DataFrame size: {len(df)}")
|
|
|
|
| 77 |
df = df[df['TDoc Status'].isin(status_list)]
|
| 78 |
print(f"Filtered DataFrame size: {len(df)}")
|
| 79 |
else:
|
|
|
|
| 80 |
print("No filtering applied based on TDoc Status")
|
| 81 |
|
| 82 |
if not df.empty:
|
|
|
|
| 86 |
status_filenames = df['URL'].tolist()
|
| 87 |
else:
|
| 88 |
print("No valid 'TDoc' or 'URL' entries found.")
|
| 89 |
+
|
| 90 |
print(f"Filenames: {status_filenames}")
|
| 91 |
else:
|
| 92 |
print("DataFrame is empty after filtering.")
|
| 93 |
|
| 94 |
except Exception as e:
|
| 95 |
print(f"Error reading Excel file: {e}")
|
| 96 |
+
else:
|
| 97 |
+
print("No valid excel_file path provided.")
|
| 98 |
|
| 99 |
download_directory = folder_name
|
| 100 |
if not os.path.exists(download_directory):
|
|
|
|
| 111 |
filename = os.path.basename(file_url)
|
| 112 |
save_path = os.path.join(download_directory, filename)
|
| 113 |
progress(pourcentss, desc='Downloading')
|
| 114 |
+
pourcentss += 0.4 / len(status_filenames) if status_filenames else 1 # Adjust to prevent division by zero
|
|
|
|
| 115 |
try:
|
| 116 |
with requests.get(file_url, stream=True) as r:
|
| 117 |
r.raise_for_status()
|
|
|
|
| 120 |
f.write(chunk)
|
| 121 |
except requests.exceptions.HTTPError as e:
|
| 122 |
print(f"HTTP error occurred: {file_url}: {e}")
|
|
|
|
| 123 |
|
|
|
|
| 124 |
return True, len(status_filenames)
|
| 125 |
|
| 126 |
|
| 127 |
|
| 128 |
|
| 129 |
+
|
| 130 |
def extractZip(url):
|
| 131 |
# Répertoire où les fichiers zip sont déjà téléchargés
|
| 132 |
nom_extract = url.split("/")[-3] + "_extraction"
|