Spaces:
Sleeping
Sleeping
Update scrape_3gpp.py
Browse files- scrape_3gpp.py +7 -35
scrape_3gpp.py
CHANGED
|
@@ -228,7 +228,7 @@ def update_excel(data, excel_file, url):
|
|
| 228 |
temp_df = pd.DataFrame(data, columns=new_df_columns)
|
| 229 |
|
| 230 |
try:
|
| 231 |
-
#
|
| 232 |
if os.path.exists(excel_file):
|
| 233 |
old_df = pd.read_excel(excel_file)
|
| 234 |
df = pd.concat([old_df, temp_df], axis=0, ignore_index=True)
|
|
@@ -242,7 +242,7 @@ def update_excel(data, excel_file, url):
|
|
| 242 |
|
| 243 |
def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
|
| 244 |
folder_name = 'nom provisoire'
|
| 245 |
-
temp_excel = '
|
| 246 |
progress(0.0,desc='Telechargement')
|
| 247 |
result, message = scrape(url, excel_file, folder_name, status_list)
|
| 248 |
if result:
|
|
@@ -279,7 +279,7 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
|
|
| 279 |
pre_title_section = None
|
| 280 |
|
| 281 |
try:
|
| 282 |
-
df = pd.read_excel(
|
| 283 |
except Exception as e:
|
| 284 |
print(f"Initializing a new DataFrame because: {e}")
|
| 285 |
df = pd.DataFrame(columns=["URL", "File", "Type", "Title", "Source", "Status", "Content"])
|
|
@@ -450,39 +450,11 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
|
|
| 450 |
print(f"Updated after processing {processed_count} files.")
|
| 451 |
data = [] # Clear the data list after updating
|
| 452 |
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
try:
|
| 458 |
-
old_df = pd.read_excel(excel_file)
|
| 459 |
-
|
| 460 |
-
# Check if 'Actions' column exists in the old DataFrame
|
| 461 |
-
if 'Actions' in old_df.columns:
|
| 462 |
-
# Assuming you want to update 'Content' in old_df for matching 'TDoc' values in 'File'
|
| 463 |
-
for index, new_row in new_df.iterrows():
|
| 464 |
-
# Find matching rows in old_df where 'TDoc' matches 'File' from new_df
|
| 465 |
-
match_indices = old_df[old_df['TDoc'] == new_row['File']].index
|
| 466 |
-
# Update 'Content' in old_df for matching rows
|
| 467 |
-
for i in match_indices:
|
| 468 |
-
old_df.at[i, 'Content'] = new_row['Content']
|
| 469 |
-
old_df.at[i, 'URL'] = new_row['URL']
|
| 470 |
-
|
| 471 |
-
df = old_df
|
| 472 |
-
###placer la colonne content en 4eme position
|
| 473 |
-
# current_columns = df.columns.tolist()
|
| 474 |
-
# current_columns.remove('URL')
|
| 475 |
-
# # Insert 'Content' at the desired position
|
| 476 |
-
# new_columns_order = current_columns[:1] + ['URL'] + current_columns[3:]
|
| 477 |
-
# df = df[new_columns_order]
|
| 478 |
-
else:
|
| 479 |
-
# If 'Actions' column doesn't exist, simply concatenate the DataFrames
|
| 480 |
-
df = pd.concat([old_df, new_df], axis=0, ignore_index=True)
|
| 481 |
-
except Exception as e:
|
| 482 |
-
print("The provided excel file seems invalid:", e)
|
| 483 |
-
df = new_df
|
| 484 |
|
| 485 |
file_name = url.split("/")[-2] + ".xlsx"
|
| 486 |
# Save the updated DataFrame to Excel
|
| 487 |
-
df.to_excel(file_name, index=False)
|
| 488 |
return file_name, "Téléchargement réussi"
|
|
|
|
| 228 |
temp_df = pd.DataFrame(data, columns=new_df_columns)
|
| 229 |
|
| 230 |
try:
|
| 231 |
+
# Check if the Excel file already exists and append data to it
|
| 232 |
if os.path.exists(excel_file):
|
| 233 |
old_df = pd.read_excel(excel_file)
|
| 234 |
df = pd.concat([old_df, temp_df], axis=0, ignore_index=True)
|
|
|
|
| 242 |
|
| 243 |
def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
|
| 244 |
folder_name = 'nom provisoire'
|
| 245 |
+
temp_excel = 'temporaire.xlsx'
|
| 246 |
progress(0.0,desc='Telechargement')
|
| 247 |
result, message = scrape(url, excel_file, folder_name, status_list)
|
| 248 |
if result:
|
|
|
|
| 279 |
pre_title_section = None
|
| 280 |
|
| 281 |
try:
|
| 282 |
+
df = pd.read_excel(temp_excel)
|
| 283 |
except Exception as e:
|
| 284 |
print(f"Initializing a new DataFrame because: {e}")
|
| 285 |
df = pd.DataFrame(columns=["URL", "File", "Type", "Title", "Source", "Status", "Content"])
|
|
|
|
| 450 |
print(f"Updated after processing {processed_count} files.")
|
| 451 |
data = [] # Clear the data list after updating
|
| 452 |
|
| 453 |
+
if data:
|
| 454 |
+
# This final call ensures that any remaining data is processed and saved.
|
| 455 |
+
update_excel(data, temp_excel, url)
|
| 456 |
+
print(f"Final update after processing all files.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 457 |
|
| 458 |
file_name = url.split("/")[-2] + ".xlsx"
|
| 459 |
# Save the updated DataFrame to Excel
|
|
|
|
| 460 |
return file_name, "Téléchargement réussi"
|