Spaces:
Sleeping
Sleeping
Update scrape_3gpp.py
Browse files- scrape_3gpp.py +38 -18
scrape_3gpp.py
CHANGED
|
@@ -9,6 +9,31 @@ import textract
|
|
| 9 |
import gradio as gr
|
| 10 |
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
def browse_folder(url):
|
| 13 |
if url.lower().endswith(('docs', 'docs/')):
|
| 14 |
return gr.update(choices=[])
|
|
@@ -62,7 +87,7 @@ def extract_statuses(url):
|
|
| 62 |
return []
|
| 63 |
|
| 64 |
|
| 65 |
-
def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
|
| 66 |
filenames = []
|
| 67 |
status_filenames = []
|
| 68 |
# Check if the excel_file argument is provided and if the file exists.
|
|
@@ -116,8 +141,8 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
|
|
| 116 |
download_directory = folder_name
|
| 117 |
if not os.path.exists(download_directory):
|
| 118 |
os.makedirs(download_directory)
|
| 119 |
-
|
| 120 |
-
pourcentss = 0.
|
| 121 |
print(f'filenames: {status_filenames}')
|
| 122 |
if not filenames and not status_filenames:
|
| 123 |
print("No Excel file provided, or no valid URLs found in the file.")
|
|
@@ -135,11 +160,8 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
|
|
| 135 |
|
| 136 |
# Télécharger chaque fichier zip
|
| 137 |
for zip_link in zip_links:
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
progress(pourcentss,desc='Telechargement')
|
| 141 |
-
download_num = 0
|
| 142 |
-
download_num+=1
|
| 143 |
# Construire l'URL absolue du fichier zip
|
| 144 |
absolute_url = urljoin(url, zip_link)
|
| 145 |
|
|
@@ -161,11 +183,8 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
|
|
| 161 |
for file_url in status_filenames:
|
| 162 |
filename = os.path.basename(file_url)
|
| 163 |
save_path = os.path.join(download_directory, filename)
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
progress(pourcentss,desc='Telechargement')
|
| 167 |
-
download_num = 0
|
| 168 |
-
download_num+=1
|
| 169 |
try:
|
| 170 |
with requests.get(file_url, stream=True) as r:
|
| 171 |
r.raise_for_status()
|
|
@@ -283,7 +302,8 @@ def update_excel(data, excel_file, url):
|
|
| 283 |
def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
|
| 284 |
folder_name = 'nom provisoire'
|
| 285 |
temp_excel = url.split("/")[-2] + "_status.xlsx"
|
| 286 |
-
progress(0.0,desc='
|
|
|
|
| 287 |
result, message = scrape(url, excel_file, folder_name, status_list)
|
| 288 |
if result:
|
| 289 |
print("Success:", message)
|
|
@@ -294,7 +314,7 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
|
|
| 294 |
extractZip(folder_name)
|
| 295 |
progress(0.5,desc='Extraction 2')
|
| 296 |
excel3gpp(url)
|
| 297 |
-
progress(0.6,desc='
|
| 298 |
|
| 299 |
|
| 300 |
extract_directory = folder_name +" extraction"
|
|
@@ -311,7 +331,7 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
|
|
| 311 |
"pptx": ["URL", "File", "Type", "Title", "Source", "Content"]
|
| 312 |
}
|
| 313 |
|
| 314 |
-
|
| 315 |
data = []
|
| 316 |
errors_count = 0
|
| 317 |
processed_count = 0 # Counter for processed files
|
|
@@ -328,8 +348,8 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
|
|
| 328 |
folder_path = os.path.join(extract_directory, folder)
|
| 329 |
if os.path.isdir(folder_path):
|
| 330 |
for file in os.listdir(folder_path):
|
| 331 |
-
|
| 332 |
-
|
| 333 |
|
| 334 |
|
| 335 |
if file == "__MACOSX":
|
|
|
|
| 9 |
import gradio as gr
|
| 10 |
|
| 11 |
|
| 12 |
+
def count_links(url):
    """Count the anchors on the page at *url* whose href ends with ``.zip``.

    Args:
        url: Address of the HTML page to inspect.

    Returns:
        The number of ``<a>`` tags whose ``href`` ends with one of the
        downloadable-file extensions, or ``None`` if the page could not be
        fetched. Callers that divide by the result must handle both ``None``
        and ``0``.
    """
    # Must be a real tuple: the previous ``('.zip')`` was just the string
    # '.zip', so iterating over it tested the single characters '.', 'z',
    # 'i' and 'p' and over-counted links such as 'index.php'.
    file_extensions = ('.zip',)

    try:
        # A timeout keeps an unresponsive server from blocking forever;
        # requests.Timeout is a RequestException, so it is handled below.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise an exception for HTTP errors
    except requests.RequestException as e:
        print(f"Error fetching the page: {e}")
        return None

    # Parse the HTML content of the page and collect every <a> tag.
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a')

    # str.endswith accepts a tuple, so one call covers every extension.
    return sum(
        1 for link in links
        if link.get('href', '').endswith(file_extensions)
    )
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
|
| 37 |
def browse_folder(url):
|
| 38 |
if url.lower().endswith(('docs', 'docs/')):
|
| 39 |
return gr.update(choices=[])
|
|
|
|
| 87 |
return []
|
| 88 |
|
| 89 |
|
| 90 |
+
def scrape(url, excel_file, folder_name, status_list,count, progress=gr.Progress()):
|
| 91 |
filenames = []
|
| 92 |
status_filenames = []
|
| 93 |
# Check if the excel_file argument is provided and if the file exists.
|
|
|
|
| 141 |
download_directory = folder_name
|
| 142 |
if not os.path.exists(download_directory):
|
| 143 |
os.makedirs(download_directory)
|
| 144 |
+
|
| 145 |
+
pourcentss = 0.05
|
| 146 |
print(f'filenames: {status_filenames}')
|
| 147 |
if not filenames and not status_filenames:
|
| 148 |
print("No Excel file provided, or no valid URLs found in the file.")
|
|
|
|
| 160 |
|
| 161 |
# Télécharger chaque fichier zip
|
| 162 |
for zip_link in zip_links:
|
| 163 |
+
progress(pourcentss,desc='Downloading')
|
| 164 |
+
pourcentss+=0.4/count
|
|
|
|
|
|
|
|
|
|
| 165 |
# Construire l'URL absolue du fichier zip
|
| 166 |
absolute_url = urljoin(url, zip_link)
|
| 167 |
|
|
|
|
| 183 |
for file_url in status_filenames:
|
| 184 |
filename = os.path.basename(file_url)
|
| 185 |
save_path = os.path.join(download_directory, filename)
|
| 186 |
+
progress(pourcentss,desc='Downloading')
|
| 187 |
+
pourcentss+=0.4/count
|
|
|
|
|
|
|
|
|
|
| 188 |
try:
|
| 189 |
with requests.get(file_url, stream=True) as r:
|
| 190 |
r.raise_for_status()
|
|
|
|
| 302 |
def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
|
| 303 |
folder_name = 'nom provisoire'
|
| 304 |
temp_excel = url.split("/")[-2] + "_status.xlsx"
|
| 305 |
+
progress(0.0,desc='Downloading')
|
| 306 |
+
count = count_links(url)
|
| 307 |
result, message = scrape(url, excel_file, folder_name, status_list)
|
| 308 |
if result:
|
| 309 |
print("Success:", message)
|
|
|
|
| 314 |
extractZip(folder_name)
|
| 315 |
progress(0.5,desc='Extraction 2')
|
| 316 |
excel3gpp(url)
|
| 317 |
+
progress(0.6,desc='Creating Excel File')
|
| 318 |
|
| 319 |
|
| 320 |
extract_directory = folder_name +" extraction"
|
|
|
|
| 331 |
"pptx": ["URL", "File", "Type", "Title", "Source", "Content"]
|
| 332 |
}
|
| 333 |
|
| 334 |
+
pourcents2=0.6
|
| 335 |
data = []
|
| 336 |
errors_count = 0
|
| 337 |
processed_count = 0 # Counter for processed files
|
|
|
|
| 348 |
folder_path = os.path.join(extract_directory, folder)
|
| 349 |
if os.path.isdir(folder_path):
|
| 350 |
for file in os.listdir(folder_path):
|
| 351 |
+
progress(pourcents2,desc='Creating Excel File')
|
| 352 |
+
pourcents2+=0.4/count
|
| 353 |
|
| 354 |
|
| 355 |
if file == "__MACOSX":
|