Spaces:
Sleeping
Sleeping
Update scrape_3gpp.py
Browse files- scrape_3gpp.py +36 -106
scrape_3gpp.py
CHANGED
|
@@ -65,135 +65,65 @@ def extract_statuses(url):
|
|
| 65 |
def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
|
| 66 |
filenames = []
|
| 67 |
status_filenames = []
|
| 68 |
-
|
| 69 |
-
excel_file_path = "guide_status.xlsx" # Hardcoded path to the Excel file
|
| 70 |
|
| 71 |
-
if os.path.exists(
|
| 72 |
try:
|
| 73 |
-
df = pd.read_excel(
|
| 74 |
print(f"Initial DataFrame size: {len(df)}")
|
| 75 |
|
| 76 |
-
if 'TDoc Status' in df.columns:
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
else:
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
status_filenames = df['URL'].tolist()
|
| 94 |
-
else:
|
| 95 |
-
print("No valid 'File' or 'URL' entries found for the filtered statuses.")
|
| 96 |
-
|
| 97 |
-
print(f"Filenames: {status_filenames}")
|
| 98 |
else:
|
| 99 |
-
print("
|
| 100 |
-
|
| 101 |
-
except Exception as e:
|
| 102 |
-
print(f"Error reading Excel file: {e}")
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
if excel_file and os.path.exists(excel_file):
|
| 107 |
-
try:
|
| 108 |
-
df = pd.read_excel(excel_file)
|
| 109 |
|
| 110 |
-
# If 'Actions' in df.columns and filter based on it, and construct URLs from 'TDoc' or 'URL' columns
|
| 111 |
-
if 'Actions' in df.columns:
|
| 112 |
-
df = df[df['Actions'] == 'x']
|
| 113 |
-
|
| 114 |
-
elif 'File' in df.columns:
|
| 115 |
-
filenames = [f"{url}{row['File']}.zip" for index, row in df.iterrows()]
|
| 116 |
-
elif 'URL' in df.columns:
|
| 117 |
-
filenames = df['URL'].tolist()
|
| 118 |
except Exception as e:
|
| 119 |
print(f"Error reading Excel file: {e}")
|
| 120 |
-
# Optionally, handle the error or return a message if needed
|
| 121 |
|
| 122 |
-
# If no Excel file is provided or found, or if it lacks 'TDoc'/'URL', the function can still continue with predefined URLs or other logic
|
| 123 |
download_directory = folder_name
|
| 124 |
if not os.path.exists(download_directory):
|
| 125 |
os.makedirs(download_directory)
|
| 126 |
|
| 127 |
pourcentss = 0.05
|
| 128 |
-
print(f'filenames: {status_filenames}')
|
| 129 |
-
if not filenames and not status_filenames:
|
| 130 |
-
print("No Excel file provided, or no valid URLs found in the file.")
|
| 131 |
-
# You can either return here or continue with other predefined logic
|
| 132 |
-
response = requests.get(url)
|
| 133 |
-
|
| 134 |
-
# Analyser le contenu HTML de la page
|
| 135 |
-
soup = BeautifulSoup(response.content, "html.parser")
|
| 136 |
-
|
| 137 |
-
# Trouver tous les balises <a> avec des attributs href (liens)
|
| 138 |
-
links = soup.find_all("a", href=True)
|
| 139 |
-
|
| 140 |
-
# Filtrer les liens se terminant par ".zip"
|
| 141 |
-
zip_links = [link['href'] for link in links if link['href'].endswith('.zip')]
|
| 142 |
-
|
| 143 |
-
# Télécharger chaque fichier zip
|
| 144 |
-
for zip_link in zip_links:
|
| 145 |
-
progress(pourcentss,desc='Downloading')
|
| 146 |
-
pourcentss+=0.4/len(df)
|
| 147 |
-
# Construire l'URL absolue du fichier zip
|
| 148 |
-
absolute_url = urljoin(url, zip_link)
|
| 149 |
-
|
| 150 |
-
# Extraire le nom de fichier de l'URL
|
| 151 |
-
filename = os.path.basename(absolute_url)
|
| 152 |
|
| 153 |
-
|
| 154 |
-
|
|
|
|
| 155 |
|
| 156 |
-
|
| 157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
r.raise_for_status()
|
| 159 |
with open(save_path, 'wb') as f:
|
| 160 |
for chunk in r.iter_content(chunk_size=8192):
|
| 161 |
f.write(chunk)
|
|
|
|
|
|
|
|
|
|
| 162 |
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
for file_url in status_filenames:
|
| 166 |
-
filename = os.path.basename(file_url)
|
| 167 |
-
save_path = os.path.join(download_directory, filename)
|
| 168 |
-
progress(pourcentss,desc='Downloading')
|
| 169 |
-
pourcentss+=0.4/len(df)
|
| 170 |
-
try:
|
| 171 |
-
with requests.get(file_url, stream=True) as r:
|
| 172 |
-
r.raise_for_status()
|
| 173 |
-
with open(save_path, 'wb') as f:
|
| 174 |
-
for chunk in r.iter_content(chunk_size=8192):
|
| 175 |
-
f.write(chunk)
|
| 176 |
-
except requests.exceptions.HTTPError as e:
|
| 177 |
-
print(f"skipped file: {file_url}: {e}")
|
| 178 |
|
| 179 |
-
else:
|
| 180 |
-
# Proceed with downloading files using the filenames list
|
| 181 |
-
for file_url in filenames:
|
| 182 |
-
filename = os.path.basename(file_url)
|
| 183 |
-
save_path = os.path.join(download_directory, filename)
|
| 184 |
-
progress(pourcentss,desc='Downloading')
|
| 185 |
-
pourcentss+=0.4/len(df)
|
| 186 |
-
try:
|
| 187 |
-
with requests.get(file_url, stream=True) as r:
|
| 188 |
-
r.raise_for_status()
|
| 189 |
-
with open(save_path, 'wb') as f:
|
| 190 |
-
for chunk in r.iter_content(chunk_size=8192):
|
| 191 |
-
f.write(chunk)
|
| 192 |
-
except requests.exceptions.HTTPError as e:
|
| 193 |
-
print(f"HTTP error occurred: {file_url}: {e}")
|
| 194 |
-
return False, "Il n'y a pas de colonne action ou alors celle ci n'est pas bien écrite, format attendu: 'Actions'"
|
| 195 |
-
|
| 196 |
-
return True, len(df)
|
| 197 |
|
| 198 |
|
| 199 |
|
|
|
|
| 65 |
def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
    """Download 3GPP .zip documents listed in an Excel file.

    Reads *excel_file*, optionally filters rows by the 'TDoc Status'
    values in *status_list*, builds download URLs from the 'TDoc' or
    'URL' column, and streams each .zip into *folder_name*.

    Parameters
    ----------
    url : str
        Base URL prefixed to each 'TDoc' identifier to form a .zip link.
    excel_file : str | None
        Path to the Excel file describing the documents to fetch.
    folder_name : str
        Directory the downloads are written to (created if missing).
    status_list : list[str]
        'TDoc Status' values to keep; an empty list keeps every row.
    progress : gr.Progress
        Gradio progress reporter updated while downloading.

    Returns
    -------
    tuple[bool, int]
        (False, 0) when no downloadable URLs were found, otherwise
        (True, number_of_files_attempted).
    """
    status_filenames = []
    df = pd.DataFrame()  # Ensure df is always defined even if reading fails

    # Guard against excel_file being None/"" before touching the filesystem
    if excel_file and os.path.exists(excel_file):
        try:
            df = pd.read_excel(excel_file)
            print(f"Initial DataFrame size: {len(df)}")

            if 'TDoc Status' in df.columns and status_list:
                df = df[df['TDoc Status'].isin(status_list)]
                print(f"Filtered DataFrame size: {len(df)}")
            else:
                # An empty status_list means "keep all statuses"
                print("No filtering applied based on TDoc Status")

            if not df.empty:
                # Prefer the 'TDoc' column; fall back to explicit 'URL's.
                # NaN cells are dropped so we never build a ".../nan.zip" link.
                if 'TDoc' in df.columns and not df['TDoc'].isnull().all():
                    status_filenames = [
                        f"{url}{tdoc}.zip" for tdoc in df['TDoc'].dropna()
                    ]
                elif 'URL' in df.columns and not df['URL'].isnull().all():
                    status_filenames = df['URL'].dropna().tolist()
                else:
                    print("No valid 'TDoc' or 'URL' entries found.")

                print(f"Filenames: {status_filenames}")
            else:
                print("DataFrame is empty after filtering.")
        except Exception as e:
            # Best-effort: a malformed workbook must not crash the UI
            print(f"Error reading Excel file: {e}")

    download_directory = folder_name
    os.makedirs(download_directory, exist_ok=True)

    pourcentss = 0.05

    if not status_filenames:
        print("No Excel file provided, or no valid URLs found in the file.")
        return False, 0

    # Stream each file to disk in 8 KiB chunks, advancing the progress bar.
    for file_url in status_filenames:
        filename = os.path.basename(file_url)
        save_path = os.path.join(download_directory, filename)
        progress(pourcentss, desc='Downloading')
        # Spread 40% of the bar across the actual number of files
        pourcentss += 0.4 / len(status_filenames)
        try:
            with requests.get(file_url, stream=True) as r:
                r.raise_for_status()
                with open(save_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
        except requests.exceptions.HTTPError as e:
            # Skip unreachable documents instead of aborting the whole batch
            print(f"HTTP error occurred: {file_url}: {e}")

    return True, len(status_filenames)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
|
| 129 |
|