|
|
|
|
|
|
|
|
import json
import logging
import os
import threading
import uuid
from datetime import datetime
from pathlib import Path

import gradio as gr

from script_brvm import (
    download_and_extract_pdfs,
    initialize_model_pipeline,
    process_single_pdf,
    upload_results_to_hf_single,
)
|
|
|
|
|
|
|
|
# --- Hugging Face source: repo and archive name handed to
# download_and_extract_pdfs ------------------------------------------------
HF_DATASET_PDFS_REPO_ID = "lamekemal/brvm-reports-pdfs"
ZIP_FILENAME_IN_DATASET = "brvm_reports.zip"

# --- Hugging Face destination: repo and token handed to
# upload_results_to_hf_single ----------------------------------------------
HF_DATASET_JSON_REPO_ID = "lamekemal/brvm-reports-json"
# Read from the environment; this is None when HF_TOKEN is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")

# --- Local working directories (relative to the current working directory) --
LOCAL_PDF_FOLDER = Path("brvm_reports_extracted")
LOCAL_CACHE_DIR = Path("./hf_cache")
# Each processing run writes its JSON files to a sub-folder of this directory.
LOCAL_JSON_OUTPUT_BASE_FOLDER = Path("brvm_json_outputs")

# --- Mutable module state ----------------------------------------------------
# Set by load_model(); stays None until then.
extractor_pipeline = None
# File names of the PDFs handled so far, appended by the background worker.
processed_files = []
|
|
|
|
|
def load_model():
    """Build the extraction pipeline and publish it as module state.

    Calls ``initialize_model_pipeline()`` and stores whatever it returns in
    the module-level ``extractor_pipeline`` variable, replacing any previous
    value. Returns ``None``.
    """
    global extractor_pipeline

    pipeline = initialize_model_pipeline()
    extractor_pipeline = pipeline
|
|
|
|
|
def start_background_processing(status_box):
    """Start one PDF extraction run in a background thread.

    The worker calls ``download_and_extract_pdfs`` to obtain the PDF paths,
    creates a per-run output folder under ``LOCAL_JSON_OUTPUT_BASE_FOLDER``
    and then, for every PDF:

    1. runs ``process_single_pdf`` with the module-level
       ``extractor_pipeline``;
    2. writes the result to ``<run folder>/<pdf stem>.json``;
    3. passes the result to ``upload_results_to_hf_single``;
    4. appends the PDF file name to the module-level ``processed_files``.

    A failure on one PDF is logged and the run moves on to the next file
    instead of aborting.

    Args:
        status_box: Optional progress sink. When it exposes a callable
            ``update`` attribute, that callable is invoked after each file
            with ``value=`` set to the newline-joined ``processed_files``.
            Any other object is ignored: Gradio hands event handlers the
            *value* of components listed in ``inputs`` (a plain ``str`` for
            a Textbox), and calling ``.update`` on it would raise and kill
            the worker. Progress is always available via ``processed_files``.

    Returns:
        The started ``threading.Thread``, so callers may ``join`` it.
    """
    log = logging.getLogger(__name__)

    # Resolve the optional notifier once, up front, so the worker loop never
    # trips over an object that has no usable ``update``.
    notify = getattr(status_box, "update", None)
    if not callable(notify):
        notify = None

    def background_task():
        try:
            pdf_files = download_and_extract_pdfs(
                HF_DATASET_PDFS_REPO_ID,
                ZIP_FILENAME_IN_DATASET,
                LOCAL_PDF_FOLDER,
                LOCAL_CACHE_DIR,
            )
        except Exception:
            # Thread boundary: nothing upstream can handle this, so record
            # it and end the run cleanly.
            log.exception("Could not download/extract the PDF archive")
            return

        # Timestamp plus a short random suffix keeps run folders unique.
        run_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + "_" + uuid.uuid4().hex[:8]
        local_output_dir = LOCAL_JSON_OUTPUT_BASE_FOLDER / run_id
        local_output_dir.mkdir(parents=True, exist_ok=True)

        for pdf_path in pdf_files:
            try:
                result = process_single_pdf(pdf_path, extractor_pipeline)

                output_json_path = local_output_dir / f"{pdf_path.stem}.json"
                with open(output_json_path, "w", encoding="utf-8") as f:
                    json.dump(result, f, ensure_ascii=False, indent=2)

                upload_results_to_hf_single(result, HF_DATASET_JSON_REPO_ID, HF_TOKEN)
            except Exception:
                # One broken PDF or failed upload must not cancel the rest.
                log.exception("Failed to process %s; skipping it", pdf_path)
                continue

            processed_files.append(pdf_path.name)

            if notify is not None:
                try:
                    notify(value="\n".join(processed_files))
                except Exception:
                    # Progress display is best-effort only.
                    log.exception("Status update failed after %s", pdf_path)

    # NOTE(review): every call starts an independent run; nothing here stops
    # two runs from overlapping if the caller invokes this twice.
    thread = threading.Thread(target=background_task)
    thread.start()
    return thread
|
|
|
|
|
def launch_processing(status_box):
    """Button handler: start the background run and acknowledge it.

    Args:
        status_box: Forwarded unchanged to ``start_background_processing``.

    Returns:
        A fixed confirmation message; it does not wait for the run to finish.
    """
    confirmation = "✅ Traitement lancé."
    start_background_processing(status_box)
    return confirmation
|
|
|
|
|
# Build the UI. Components must be created inside the Blocks context so they
# are attached to ``demo``.
with gr.Blocks() as demo:
    gr.Markdown("# 📊 Extraction BRVM automatisée")
    gr.Markdown("Le modèle est chargé au démarrage. Cliquez sur le bouton pour lancer le traitement des bulletins.")

    status_box = gr.Textbox(label="Fichiers traités", lines=20)
    launch_button = gr.Button("🚀 Lancer le traitement")
    refresh_button = gr.Button("🔄 Actualiser la liste")

    # Listing status_box in ``inputs`` gives launch_processing the textbox's
    # current text (not the component); its return value replaces that text.
    launch_button.click(launch_processing, inputs=[status_box], outputs=[status_box])

    # The worker thread cannot push updates to the browser on its own, so
    # progress is pulled on demand: render the shared ``processed_files``
    # list through a regular event handler.
    refresh_button.click(
        lambda: "\n".join(processed_files),
        inputs=None,
        outputs=[status_box],
    )

# Load the model before serving so the first click finds a ready pipeline.
load_model()
demo.launch()
|
|
|