Spaces:
Sleeping
Sleeping
Update scrape_3gpp.py
Browse files- scrape_3gpp.py +39 -44
scrape_3gpp.py
CHANGED
|
@@ -9,31 +9,6 @@ import textract
|
|
| 9 |
import gradio as gr
|
| 10 |
|
| 11 |
|
| 12 |
-
def count_links(url):
    """Count the links on the page at *url* that point to downloadable files.

    The page is fetched with ``requests``, parsed with BeautifulSoup's
    ``html.parser``, and every ``<a>`` tag whose ``href`` ends with one of
    the extensions in ``file_extensions`` is counted.

    Args:
        url: Address of the HTML page to inspect.

    Returns:
        The number of matching links, or ``None`` if the request failed
        (the error is printed, not raised).
    """
    # Must be a real tuple. The previous ``('.zip')`` was just the string
    # '.zip', so iterating over it compared each href against '.', 'z', 'i'
    # and 'p' individually and counted far too many links.
    file_extensions = ('.zip',)

    # Only the network call can raise RequestException, so keep the try
    # body limited to it.
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for HTTP errors
    except requests.RequestException as e:
        print(f"Error fetching the page: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # str.endswith accepts a tuple of suffixes, so no inner loop is needed.
    # ``or ''`` guards against an <a> tag whose href is missing or empty.
    return sum(
        1
        for link in soup.find_all('a')
        if (link.get('href') or '').endswith(file_extensions)
    )
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
def browse_folder(url):
|
| 38 |
if url.lower().endswith(('docs', 'docs/')):
|
| 39 |
return gr.update(choices=[])
|
|
@@ -87,11 +62,11 @@ def extract_statuses(url):
|
|
| 87 |
return []
|
| 88 |
|
| 89 |
|
| 90 |
-
def scrape(url, excel_file, folder_name, status_list,
|
| 91 |
filenames = []
|
| 92 |
status_filenames = []
|
| 93 |
# Check if the excel_file argument is provided and if the file exists.
|
| 94 |
-
excel_file_path =
|
| 95 |
|
| 96 |
if os.path.exists(excel_file_path):
|
| 97 |
try:
|
|
@@ -141,7 +116,7 @@ def scrape(url, excel_file, folder_name, status_list,count, progress=gr.Progress
|
|
| 141 |
download_directory = folder_name
|
| 142 |
if not os.path.exists(download_directory):
|
| 143 |
os.makedirs(download_directory)
|
| 144 |
-
|
| 145 |
pourcentss = 0.05
|
| 146 |
print(f'filenames: {status_filenames}')
|
| 147 |
if not filenames and not status_filenames:
|
|
@@ -157,11 +132,11 @@ def scrape(url, excel_file, folder_name, status_list,count, progress=gr.Progress
|
|
| 157 |
|
| 158 |
# Filtrer les liens se terminant par ".zip"
|
| 159 |
zip_links = [link['href'] for link in links if link['href'].endswith('.zip')]
|
| 160 |
-
|
| 161 |
# Télécharger chaque fichier zip
|
| 162 |
for zip_link in zip_links:
|
| 163 |
progress(pourcentss,desc='Downloading')
|
| 164 |
-
pourcentss+=0.4/
|
| 165 |
# Construire l'URL absolue du fichier zip
|
| 166 |
absolute_url = urljoin(url, zip_link)
|
| 167 |
|
|
@@ -184,7 +159,7 @@ def scrape(url, excel_file, folder_name, status_list,count, progress=gr.Progress
|
|
| 184 |
filename = os.path.basename(file_url)
|
| 185 |
save_path = os.path.join(download_directory, filename)
|
| 186 |
progress(pourcentss,desc='Downloading')
|
| 187 |
-
pourcentss+=0.4/
|
| 188 |
try:
|
| 189 |
with requests.get(file_url, stream=True) as r:
|
| 190 |
r.raise_for_status()
|
|
@@ -210,14 +185,19 @@ def scrape(url, excel_file, folder_name, status_list,count, progress=gr.Progress
|
|
| 210 |
print(f"HTTP error occurred: {file_url}: {e}")
|
| 211 |
return False, "Il n'y a pas de colonne action ou alors celle ci n'est pas bien écrite, format attendu: 'Actions'"
|
| 212 |
|
| 213 |
-
return True, "Téléchargement terminé !"
|
| 214 |
|
| 215 |
|
| 216 |
|
| 217 |
-
def extractZip(
|
| 218 |
# Répertoire où les fichiers zip sont déjà téléchargés
|
| 219 |
-
|
| 220 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
|
| 222 |
# Extraire le contenu de tous les fichiers zip dans le répertoire de téléchargement
|
| 223 |
for zip_file in os.listdir(download_directory):
|
|
@@ -233,6 +213,7 @@ def extractZip(folder_name):
|
|
| 233 |
os.makedirs(extract_dir)
|
| 234 |
|
| 235 |
# Extraire le contenu du fichier zip
|
|
|
|
| 236 |
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
| 237 |
zip_ref.extractall(extract_dir)
|
| 238 |
|
|
@@ -242,6 +223,7 @@ def extractZip(folder_name):
|
|
| 242 |
|
| 243 |
print("Toutes les extractions sont terminées !")
|
| 244 |
|
|
|
|
| 245 |
def excel3gpp(url):
|
| 246 |
response = requests.get(url)
|
| 247 |
response.raise_for_status() # This will raise an exception if there's an error
|
|
@@ -263,12 +245,16 @@ def excel3gpp(url):
|
|
| 263 |
excel_response.raise_for_status()
|
| 264 |
|
| 265 |
# Define the path where you want to save the file
|
| 266 |
-
|
| 267 |
-
filepath = os.path.join('path_to_save_directory', filename) # Replace 'path_to_save_directory' with your desired path
|
| 268 |
|
| 269 |
# Write the content of the Excel file to a local file
|
| 270 |
# Write the content of the Excel file to a local file named 'guide.xlsx'
|
| 271 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
|
| 273 |
with open(filepath, 'wb') as f:
|
| 274 |
f.write(excel_response.content)
|
|
@@ -300,24 +286,32 @@ def update_excel(data, excel_file, url):
|
|
| 300 |
print(f"Error updating Excel file: {e}")
|
| 301 |
|
| 302 |
def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
|
| 303 |
-
|
| 304 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
progress(0.0,desc='Downloading')
|
| 306 |
-
|
| 307 |
-
result, message = scrape(url, excel_file, folder_name, status_list)
|
| 308 |
if result:
|
| 309 |
print("Success:", message)
|
| 310 |
else:
|
| 311 |
return(None, message)
|
| 312 |
|
| 313 |
progress(0.4,desc='Extraction')
|
| 314 |
-
extractZip(
|
| 315 |
progress(0.5,desc='Extraction 2')
|
| 316 |
excel3gpp(url)
|
| 317 |
progress(0.6,desc='Creating Excel File')
|
| 318 |
|
| 319 |
|
| 320 |
-
extract_directory =
|
| 321 |
categories = {
|
| 322 |
"Other": ["URL", "File", "Type", "Title", "Source", "Content"],
|
| 323 |
"CR": ["URL", "File", "Type", "Title", "Source", "Content"],
|
|
@@ -518,3 +512,4 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
|
|
| 518 |
file_name = temp_excel
|
| 519 |
# Save the updated DataFrame to Excel
|
| 520 |
return file_name, "Téléchargement réussi"
|
|
|
|
|
|
| 9 |
import gradio as gr
|
| 10 |
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
def browse_folder(url):
|
| 13 |
if url.lower().endswith(('docs', 'docs/')):
|
| 14 |
return gr.update(choices=[])
|
|
|
|
| 62 |
return []
|
| 63 |
|
| 64 |
|
| 65 |
+
def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
|
| 66 |
filenames = []
|
| 67 |
status_filenames = []
|
| 68 |
# Check if the excel_file argument is provided and if the file exists.
|
| 69 |
+
excel_file_path = "guide_status.xlsx" # Hardcoded path to the Excel file
|
| 70 |
|
| 71 |
if os.path.exists(excel_file_path):
|
| 72 |
try:
|
|
|
|
| 116 |
download_directory = folder_name
|
| 117 |
if not os.path.exists(download_directory):
|
| 118 |
os.makedirs(download_directory)
|
| 119 |
+
|
| 120 |
pourcentss = 0.05
|
| 121 |
print(f'filenames: {status_filenames}')
|
| 122 |
if not filenames and not status_filenames:
|
|
|
|
| 132 |
|
| 133 |
# Filtrer les liens se terminant par ".zip"
|
| 134 |
zip_links = [link['href'] for link in links if link['href'].endswith('.zip')]
|
| 135 |
+
|
| 136 |
# Télécharger chaque fichier zip
|
| 137 |
for zip_link in zip_links:
|
| 138 |
progress(pourcentss,desc='Downloading')
|
| 139 |
+
pourcentss+=0.4/len(df)
|
| 140 |
# Construire l'URL absolue du fichier zip
|
| 141 |
absolute_url = urljoin(url, zip_link)
|
| 142 |
|
|
|
|
| 159 |
filename = os.path.basename(file_url)
|
| 160 |
save_path = os.path.join(download_directory, filename)
|
| 161 |
progress(pourcentss,desc='Downloading')
|
| 162 |
+
pourcentss+=0.4/len(df)
|
| 163 |
try:
|
| 164 |
with requests.get(file_url, stream=True) as r:
|
| 165 |
r.raise_for_status()
|
|
|
|
| 185 |
print(f"HTTP error occurred: {file_url}: {e}")
|
| 186 |
return False, "Il n'y a pas de colonne action ou alors celle ci n'est pas bien écrite, format attendu: 'Actions'"
|
| 187 |
|
| 188 |
+
return True, "Téléchargement terminé !", len(df)
|
| 189 |
|
| 190 |
|
| 191 |
|
| 192 |
+
def extractZip(url):
|
| 193 |
# Répertoire où les fichiers zip sont déjà téléchargés
|
| 194 |
+
nom_extract = url.split("/")[-3] + "_extraction"
|
| 195 |
+
if os.path.exists(nom_extract):
|
| 196 |
+
shutil.rmtree(nom_extract)
|
| 197 |
+
extract_directory = nom_extract
|
| 198 |
+
|
| 199 |
+
download_directory = url.split("/")[-3] + "_downloads"
|
| 200 |
+
# Répertoire où le contenu des fichiers zip sera extrait
|
| 201 |
|
| 202 |
# Extraire le contenu de tous les fichiers zip dans le répertoire de téléchargement
|
| 203 |
for zip_file in os.listdir(download_directory):
|
|
|
|
| 213 |
os.makedirs(extract_dir)
|
| 214 |
|
| 215 |
# Extraire le contenu du fichier zip
|
| 216 |
+
|
| 217 |
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
| 218 |
zip_ref.extractall(extract_dir)
|
| 219 |
|
|
|
|
| 223 |
|
| 224 |
print("Toutes les extractions sont terminées !")
|
| 225 |
|
| 226 |
+
|
| 227 |
def excel3gpp(url):
|
| 228 |
response = requests.get(url)
|
| 229 |
response.raise_for_status() # This will raise an exception if there's an error
|
|
|
|
| 245 |
excel_response.raise_for_status()
|
| 246 |
|
| 247 |
# Define the path where you want to save the file
|
| 248 |
+
# Replace 'path_to_save_directory' with your desired path
|
|
|
|
| 249 |
|
| 250 |
# Write the content of the Excel file to a local file
|
| 251 |
# Write the content of the Excel file to a local file named 'guide.xlsx'
|
| 252 |
+
|
| 253 |
+
nom_guide = 'guide.xlsx' # Directly specify the filename
|
| 254 |
+
if os.path.exists(nom_guide):
|
| 255 |
+
os.remove(nom_guide)
|
| 256 |
+
filepath = nom_guide
|
| 257 |
+
|
| 258 |
|
| 259 |
with open(filepath, 'wb') as f:
|
| 260 |
f.write(excel_response.content)
|
|
|
|
| 286 |
print(f"Error updating Excel file: {e}")
|
| 287 |
|
| 288 |
def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
|
| 289 |
+
nom_download = url.split("/")[-3] + "_downloads"
|
| 290 |
+
if os.path.exists(nom_download):
|
| 291 |
+
shutil.rmtree(nom_download)
|
| 292 |
+
folder_name = nom_download
|
| 293 |
+
|
| 294 |
+
nom_status = url.split("/")[-3] + "_status.xlsx"
|
| 295 |
+
if os.path.exists(nom_status):
|
| 296 |
+
os.remove(nom_status)
|
| 297 |
+
temp_excel = nom_status
|
| 298 |
+
|
| 299 |
progress(0.0,desc='Downloading')
|
| 300 |
+
|
| 301 |
+
result, message, count = scrape(url, excel_file, folder_name, status_list)
|
| 302 |
if result:
|
| 303 |
print("Success:", message)
|
| 304 |
else:
|
| 305 |
return(None, message)
|
| 306 |
|
| 307 |
progress(0.4,desc='Extraction')
|
| 308 |
+
extractZip(url)
|
| 309 |
progress(0.5,desc='Extraction 2')
|
| 310 |
excel3gpp(url)
|
| 311 |
progress(0.6,desc='Creating Excel File')
|
| 312 |
|
| 313 |
|
| 314 |
+
extract_directory = url.split("/")[-3] + "_extraction"
|
| 315 |
categories = {
|
| 316 |
"Other": ["URL", "File", "Type", "Title", "Source", "Content"],
|
| 317 |
"CR": ["URL", "File", "Type", "Title", "Source", "Content"],
|
|
|
|
| 512 |
file_name = temp_excel
|
| 513 |
# Save the updated DataFrame to Excel
|
| 514 |
return file_name, "Téléchargement réussi"
|
| 515 |
+
|