Spaces:
Sleeping
Sleeping
| #this script is to run the scraper for documents | |
| from src.scraping.scrape_docs import scrape_documents, get_last_id | |
| from src.configs.config import METADATA_FILE, MAX_DOCS | |
| import os | |
# Default scraping jobs used when no custom URL is provided:
# each entry pairs a source URL with the language of its documents.
DEFAULT_SCRAPE_JOBS = [
    {
        "url": "https://adala.justice.gov.ma/fr/resources/338",
        "language": "fr"
    },
    {
        "url": "https://adala.justice.gov.ma/fr/resources/1",
        "language": "ar"
    }
]
def get_user_input():
    """Prompt until the user answers 'oui' or 'non'; return that answer lowercased."""
    prompt = "Voulez-vous utiliser les URLs par défaut ? (oui/non): "
    while True:
        answer = input(prompt).lower()
        if answer not in ('oui', 'non'):
            # Invalid answer: tell the user and ask again.
            print("Veuillez répondre par 'oui' ou 'non'")
            continue
        return answer
def get_custom_url():
    """Ask the user for a URL and its language ('fr' or 'ar'); return a job dict."""
    target = input("Veuillez entrer l'URL à scraper: ")
    while True:
        chosen = input("Veuillez choisir la langue (fr/ar): ").lower()
        if chosen == 'fr' or chosen == 'ar':
            # Same shape as the entries of DEFAULT_SCRAPE_JOBS.
            return {"url": target, "language": chosen}
        print("La langue doit être 'fr' ou 'ar'")
def get_max_docs():
    """Ask how many documents to scrape.

    Returns the number as an int, or None to scrape everything
    (empty input, or any value that is not a valid integer).
    """
    raw = input("Combien de documents voulez-vous scraper ? (laisser vide pour tout): ").strip()
    if not raw:
        # Blank answer means "no limit".
        return None
    try:
        return int(raw)
    except ValueError:
        # Deliberate best-effort: bad input falls back to scraping everything.
        print("Entrée invalide. Tous les documents seront scrapés.")
        return None
if __name__ == "__main__":
    # Resume document numbering from the last id recorded in the metadata file.
    starting_id = get_last_id(METADATA_FILE)

    # Interactive URL selection (get_user_input / get_custom_url) is currently
    # disabled; the script always runs the default jobs with the configured cap.
    # (The previous commented-out selection code was removed — it was a dead
    # bare-string expression evaluated at runtime for no effect.)
    scrape_jobs = DEFAULT_SCRAPE_JOBS
    max_docs = MAX_DOCS

    processed_total = 0
    for job in scrape_jobs:
        lang = job["language"]
        url = job["url"]
        # Documents are saved under dataset/<language>/.
        target_folder = os.path.join("dataset", lang)
        print(f"\nDémarrage du scraping pour {lang.upper()} — {url}")
        # scrape_documents returns the updated id counter and the running total,
        # so the cap is enforced across jobs, not per job.
        starting_id, processed_total = scrape_documents(
            url, target_folder, starting_id, lang,
            max_docs=max_docs,
            processed_count=processed_total
        )
        # Stop early if the document cap was reached across multiple jobs.
        if max_docs is not None and processed_total >= max_docs:
            break
    print(f"\n✅ Scraping terminé — {processed_total} document(s) scrapé(s).")