import os
import time
import requests
import csv
from bs4 import BeautifulSoup
# Check whether the page contains sub-categories (i.e. no document <table>)
def has_subcategories(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.find('table') is None
# Append one row of document metadata to the CSV file
def save_to_csv(data, csv_file):
    file_exists = os.path.exists(csv_file)
    with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Write headers only if the file is new or empty
        if not file_exists or os.stat(csv_file).st_size == 0:
            writer.writerow(["Id", "Catégorie", "Nom du document", "Lien", "Langue"])
        writer.writerow(data)
# Read the last ID from the CSV file and return the next one to use
def get_last_id(csv_file):
    if not os.path.exists(csv_file):
        return 1  # Start at 1 if the file does not exist
    with open(csv_file, mode='r', newline='', encoding='utf-8') as file:
        rows = list(csv.reader(file))
    # Guard against an empty file or a file containing only the header row
    if len(rows) < 2:
        return 1
    return int(rows[-1][0]) + 1  # Return the next ID
# Scrape PDF metadata from the HTML table on a category page
def scrape_pdfs(url, category, id_counter, language, max_docs=None, processed_count=0):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')
    if not table:
        return id_counter, processed_count
    for row in table.find_all('tr'):
        if max_docs is not None and processed_count >= max_docs:
            return id_counter, processed_count
        first_td = row.find('td')
        if first_td:
            link = first_td.find('a')
            if link and 'href' in link.attrs:
                pdf_link = link['href']
                title = link.get_text().strip()
                pdf_data = [id_counter, category, title, f'https://adala.justice.gov.ma{pdf_link}', language]
                save_to_csv(pdf_data, 'dataset/docs_metadata.csv')
                print(f"-> Added: {pdf_data}")
                id_counter += 1
                processed_count += 1
                time.sleep(1)  # Be polite to the server between requests
    return id_counter, processed_count
# Main function: recursively explores sub-categories and scrapes the PDFs
def scrape_documents(url, base_folder, id_counter, language, max_docs=None, processed_count=0):
    # NOTE: base_folder is currently unused; the category name is re-derived
    # from the page's <h2> heading below.
    if max_docs is not None and processed_count >= max_docs:
        return id_counter, processed_count
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    h2 = soup.find('h2')
    category_name = h2.get_text().strip() if h2 else "Documents"
    if has_subcategories(url):
        main = soup.find('main')
        if main:
            sections = main.find_all('section')
            if sections:
                # Sub-category links live in the <ul> of the last <section>
                last_section = sections[-1]
                ul = last_section.find('ul')
                if ul:
                    for a in ul.find_all('a'):
                        if max_docs is not None and processed_count >= max_docs:
                            break
                        if 'href' in a.attrs:
                            subcategory_name = a.get_text().strip().replace("Parcourir", "").strip()
                            sub_link = a['href']
                            full_url = f'https://adala.justice.gov.ma{sub_link}'
                            id_counter, processed_count = scrape_documents(
                                full_url, subcategory_name, id_counter, language, max_docs, processed_count)
    else:
        id_counter, processed_count = scrape_pdfs(
            url, category_name, id_counter, language, max_docs, processed_count)
    return id_counter, processed_count
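
# Example entry point: a minimal sketch of how the scraper might be launched.
# The starting URL path, the "Documents" root label, and the "fr" language tag
# are illustrative assumptions, not values taken from the original script.
if __name__ == "__main__":
    csv_file = 'dataset/docs_metadata.csv'
    os.makedirs(os.path.dirname(csv_file), exist_ok=True)  # Ensure the output folder exists
    start_id = get_last_id(csv_file)  # Resume numbering from a previous run, if any
    start_url = 'https://adala.justice.gov.ma/example-category'  # Hypothetical category page
    scrape_documents(start_url, "Documents", start_id, "fr", max_docs=10)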