# RAG_APP/src/scripts/0_docs_metadata.py
# Runs the scraper for documents and records their metadata.
from src.scraping.scrape_docs import scrape_documents, get_last_id
from src.configs.config import METADATA_FILE, MAX_DOCS
import os
# Default scraping jobs: one French and one Arabic resource page.
DEFAULT_SCRAPE_JOBS = [
    {"url": "https://adala.justice.gov.ma/fr/resources/338", "language": "fr"},
    {"url": "https://adala.justice.gov.ma/fr/resources/1", "language": "ar"},
]
def get_user_input():
    """Ask whether to use the default URLs; loop until the answer is 'oui' or 'non'."""
    while True:
        answer = input("Voulez-vous utiliser les URLs par défaut ? (oui/non): ").lower()
        if answer == 'oui' or answer == 'non':
            return answer
        # Re-prompt on anything else.
        print("Veuillez répondre par 'oui' ou 'non'")
def get_custom_url():
    """Prompt for a URL and a language code ('fr' or 'ar'); return a scrape-job dict."""
    target = input("Veuillez entrer l'URL à scraper: ")
    language = ""
    while language not in ('fr', 'ar'):
        language = input("Veuillez choisir la langue (fr/ar): ").lower()
        if language not in ('fr', 'ar'):
            print("La langue doit être 'fr' ou 'ar'")
    return {"url": target, "language": language}
def get_max_docs():
    """Ask how many documents to scrape; returns an int, or None for no limit."""
    raw = input("Combien de documents voulez-vous scraper ? (laisser vide pour tout): ").strip()
    # An empty answer means "scrape everything".
    if not raw:
        return None
    try:
        return int(raw)
    except ValueError:
        print("Entrée invalide. Tous les documents seront scrapés.")
        return None
if __name__ == "__main__":
starting_id = get_last_id(METADATA_FILE)
'''user_choice = get_user_input()
if user_choice == 'oui':
scrape_jobs = DEFAULT_SCRAPE_JOBS
else:
scrape_jobs = [get_custom_url()]'''
scrape_jobs = DEFAULT_SCRAPE_JOBS
max_docs = MAX_DOCS
processed_total = 0
for job in scrape_jobs:
lang = job["language"]
url = job["url"]
target_folder = os.path.join("dataset", lang)
print(f"\nDémarrage du scraping pour {lang.upper()}{url}")
starting_id, processed_total = scrape_documents(
url, target_folder, starting_id, lang,
max_docs=max_docs,
processed_count=processed_total
)
# Stop early if max reached across multiple jobs
if max_docs is not None and processed_total >= max_docs:
break
print(f"\n✅ Scraping terminé — {processed_total} document(s) scrapé(s).")