import csv
import os
import time

import requests
from bs4 import BeautifulSoup

BASE_URL = 'https://adala.justice.gov.ma'
CSV_FILE = 'dataset/docs_metadata.csv'


# Check whether a page contains subcategories: category pages with
# subcategories have no document table.
def has_subcategories(soup):
    return soup.find('table') is None


# Append one row of document metadata to the CSV file.
def save_to_csv(data, csv_file):
    os.makedirs(os.path.dirname(csv_file) or '.', exist_ok=True)
    write_header = not os.path.exists(csv_file) or os.stat(csv_file).st_size == 0
    with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Write headers only if the file is new or empty
        if write_header:
            writer.writerow(["Id", "Catégorie", "Nom du document", "Lien", "Langue"])
        writer.writerow(data)


# Return the next ID based on the last row of the CSV file.
def get_last_id(csv_file):
    if not os.path.exists(csv_file):
        return 1  # Start at 1 if the file does not exist
    with open(csv_file, mode='r', newline='', encoding='utf-8') as file:
        rows = list(csv.reader(file))
    if len(rows) < 2:
        return 1  # Empty file or header-only file: no data rows yet
    return int(rows[-1][0]) + 1  # ID following the last recorded one


# Scrape the metadata of the PDFs listed in a page's HTML table.
def scrape_pdfs(url, category, id_counter, language, max_docs=None, processed_count=0):
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')
    if not table:
        return id_counter, processed_count
    for row in table.find_all('tr'):
        if max_docs is not None and processed_count >= max_docs:
            return id_counter, processed_count
        first_td = row.find('td')
        if not first_td:
            continue
        link = first_td.find('a')
        if link and 'href' in link.attrs:
            pdf_link = link['href']
            title = link.get_text().strip()
            pdf_data = [id_counter, category, title, f'{BASE_URL}{pdf_link}', language]
            save_to_csv(pdf_data, CSV_FILE)
            print(f"-> Added: {pdf_data}")
            id_counter += 1
            processed_count += 1
            time.sleep(1)  # Be polite to the server between requests
    return id_counter, processed_count


# Main function: recursively explore subcategories and scrape their PDFs.
def scrape_documents(url, category, id_counter, language, max_docs=None, processed_count=0):
    if max_docs is not None and processed_count >= max_docs:
        return id_counter, processed_count
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    h2 = soup.find('h2')
    # Prefer the page's own heading; fall back to the name passed by the caller
    category_name = h2.get_text().strip() if h2 else category
    if has_subcategories(soup):
        main = soup.find('main')
        sections = main.find_all('section') if main else []
        if sections:
            ul = sections[-1].find('ul')  # Subcategory links live in the last section
            if ul:
                for a in ul.find_all('a'):
                    if max_docs is not None and processed_count >= max_docs:
                        break
                    if 'href' in a.attrs:
                        subcategory_name = a.get_text().strip().replace("Parcourir", "").strip()
                        full_url = f"{BASE_URL}{a['href']}"
                        id_counter, processed_count = scrape_documents(
                            full_url, subcategory_name, id_counter, language,
                            max_docs, processed_count)
    else:
        id_counter, processed_count = scrape_pdfs(
            url, category_name, id_counter, language, max_docs, processed_count)
    return id_counter, processed_count
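

# A minimal usage sketch. The entry-point URL path, the top-level category
# label, and the language code below are illustrative assumptions, not
# values confirmed by the scraper above; replace them with real ones.
if __name__ == '__main__':
    # Hypothetical starting point on the adala.justice.gov.ma portal.
    start_url = f'{BASE_URL}/fr/textes-juridiques'  # assumed path, not verified
    next_id = get_last_id(CSV_FILE)  # Resume numbering from the existing CSV
    # max_docs=10 caps the run for a quick test; drop it for a full crawl.
    next_id, total = scrape_documents(
        start_url, 'Documents', next_id, 'fr', max_docs=10)
    print(f'Done: {total} document(s) recorded, next id is {next_id}.')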