Spaces:
Sleeping
Sleeping
Update scrape_3gpp.py
Browse files
- scrape_3gpp.py +24 -25
scrape_3gpp.py
CHANGED
|
@@ -426,33 +426,32 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
|
|
| 426 |
status = ""
|
| 427 |
data.append([url+ "/" + folder + '.zip', folder , category, title, source,status, contenu])
|
| 428 |
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 437 |
else:
|
| 438 |
-
print(f"
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
if guide_df is not None:
|
| 442 |
-
tdoc_source_map = {row['TDoc']: row['Source'] for index, row in guide_df.iterrows()}
|
| 443 |
-
# Use tdoc_source_map as needed
|
| 444 |
-
else:
|
| 445 |
-
print("Error: guide_df is not initialized. Exiting function.")
|
| 446 |
-
return
|
| 447 |
|
| 448 |
-
|
| 449 |
-
tdoc_status_map = {row['TDoc']: row['TDoc Status'] for index, row in guide_df.iterrows()}
|
| 450 |
-
# Update the 'Source' in your data based on matching 'Nom du fichier' with 'TDoc'
|
| 451 |
-
for item in data:
|
| 452 |
-
nom_du_fichier = item[1] # Assuming 'Nom du fichier' is the first item in your data list
|
| 453 |
-
if nom_du_fichier in tdoc_source_map:
|
| 454 |
-
item[4] = tdoc_source_map[nom_du_fichier] # Update the 'Source' field, assuming it's the fourth item
|
| 455 |
-
item[5] = tdoc_status_map[nom_du_fichier]
|
| 456 |
|
| 457 |
|
| 458 |
processed_count += 1
|
|
|
|
| 426 |
status = ""
|
| 427 |
data.append([url+ "/" + folder + '.zip', folder , category, title, source,status, contenu])
|
| 428 |
|
| 429 |
+
guide_file = 'guide.xlsx'
|
| 430 |
+
if os.path.exists(guide_file):
|
| 431 |
+
# If guide.xlsx exists, proceed with operations that require it
|
| 432 |
+
try:
|
| 433 |
+
guide_df = pd.read_excel(guide_file, usecols=['Source', 'TDoc', 'TDoc Status'])
|
| 434 |
+
# Continue with the operations that require guide.xlsx
|
| 435 |
+
# For example, reading the file, processing the data, etc.
|
| 436 |
+
tdoc_source_map = {row['TDoc']: row['Source'] for index, row in guide_df.iterrows()}
|
| 437 |
+
tdoc_status_map = {row['TDoc']: row['TDoc Status'] for index, row in guide_df.iterrows()}
|
| 438 |
+
# Update the 'Source' in your data based on matching 'Nom du fichier' with 'TDoc'
|
| 439 |
+
for item in data:
|
| 440 |
+
nom_du_fichier = item[1] # 'Nom du fichier' (the folder name) is the second element of each data row (index 1)
|
| 441 |
+
if nom_du_fichier in tdoc_source_map:
|
| 442 |
+
item[4] = tdoc_source_map[nom_du_fichier] # Update the 'Source' field, the fifth element of the row (index 4)
|
| 443 |
+
item[5] = tdoc_status_map[nom_du_fichier]
|
| 444 |
+
# Your code that depends on guide.xlsx goes here
|
| 445 |
+
|
| 446 |
+
except Exception as e:
|
| 447 |
+
print(f"An error occurred while processing {guide_file}: {e}")
|
| 448 |
+
# Handle any errors that arise during processing
|
| 449 |
else:
|
| 450 |
+
print(f"File {guide_file} not found. Skipping operations that require this file.")
|
| 451 |
+
# Since guide.xlsx is not found, skip the related operations
|
| 452 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 453 |
|
| 454 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 455 |
|
| 456 |
|
| 457 |
processed_count += 1
|