# app.py
import gradio as gr
import threading
import json
from pathlib import Path
from datetime import datetime
import uuid
import os
from script_brvm import (
initialize_model_pipeline,
download_and_extract_pdfs,
process_single_pdf,
upload_results_to_hf_single
)
# ---------- CONFIGURATION ----------
# Input side: dataset repo, archive name and local folders handed to
# download_and_extract_pdfs().
HF_DATASET_PDFS_REPO_ID = "lamekemal/brvm-reports-pdfs"
ZIP_FILENAME_IN_DATASET = "brvm_reports.zip"
LOCAL_PDF_FOLDER = Path("brvm_reports_extracted")
LOCAL_CACHE_DIR = Path("./hf_cache")

# Output side: dataset repo handed to upload_results_to_hf_single() and the
# local parent folder under which each run creates its own sub-directory.
HF_DATASET_JSON_REPO_ID = "lamekemal/brvm-reports-json"
LOCAL_JSON_OUTPUT_BASE_FOLDER = Path("brvm_json_outputs")

# Token read from the environment; None when HF_TOKEN is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Module-level state shared between the UI code and the worker thread.
extractor_pipeline = None  # assigned by load_model()
processed_files = []  # file names appended after each PDF has been handled
def load_model():
    """Create the extraction pipeline and store it in ``extractor_pipeline``.

    The pipeline is built by ``initialize_model_pipeline()`` and kept in the
    module-level variable so the background worker can reuse it for every PDF.
    """
    global extractor_pipeline
    pipeline = initialize_model_pipeline()
    extractor_pipeline = pipeline
def start_background_processing(status_box):
    """Run the download / extract / upload pipeline on a worker thread.

    The worker downloads and unpacks the PDF archive, then for each PDF:
    runs ``process_single_pdf`` with the module-level ``extractor_pipeline``,
    writes the result as JSON into a fresh per-run folder under
    ``LOCAL_JSON_OUTPUT_BASE_FOLDER``, uploads it with
    ``upload_results_to_hf_single`` and appends the file name to the
    module-level ``processed_files`` list.

    Args:
        status_box: Whatever the caller received for the status textbox. If
            it exposes a callable ``update`` attribute, that is invoked with
            ``value=<newline-joined processed file names>`` after each PDF;
            otherwise it is ignored and progress is only visible through
            ``processed_files``.

    Returns:
        The started ``threading.Thread`` so callers may ``join()`` it.
        Callers that ignore the return value are unaffected.
    """

    def background_task():
        pdf_files = download_and_extract_pdfs(
            HF_DATASET_PDFS_REPO_ID,
            ZIP_FILENAME_IN_DATASET,
            LOCAL_PDF_FOLDER,
            LOCAL_CACHE_DIR,
        )

        # Timestamp plus a short random suffix keeps runs in separate folders.
        run_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + "_" + uuid.uuid4().hex[:8]
        local_output_dir = LOCAL_JSON_OUTPUT_BASE_FOLDER / run_id
        local_output_dir.mkdir(parents=True, exist_ok=True)

        # A Gradio event listener that lists the textbox in ``inputs`` passes
        # the textbox's current *value* (a str) to the handler, and recent
        # Gradio components no longer have an ``update`` method. Calling
        # ``status_box.update`` unconditionally therefore raised
        # AttributeError after the first PDF and killed the worker, so the
        # call is made only when it is actually available.
        push_status = getattr(status_box, "update", None)
        if not callable(push_status):
            push_status = None

        for pdf_path in pdf_files:
            result = process_single_pdf(pdf_path, extractor_pipeline)

            output_json_path = local_output_dir / f"{pdf_path.stem}.json"
            with open(output_json_path, "w", encoding="utf-8") as f:
                json.dump(result, f, ensure_ascii=False, indent=2)

            upload_results_to_hf_single(result, HF_DATASET_JSON_REPO_ID, HF_TOKEN)

            processed_files.append(pdf_path.name)
            if push_status is not None:
                push_status(value="\n".join(processed_files))

    thread = threading.Thread(target=background_task)
    thread.start()
    return thread
def launch_processing(status_box):
    """Button handler: start the background run and report that it started.

    Args:
        status_box: Forwarded unchanged to ``start_background_processing``.

    Returns:
        A fixed confirmation message for display in the UI.
    """
    start_background_processing(status_box)
    confirmation = "✅ Traitement lancé."
    return confirmation
# Build the page: a title, a short explanation, the status textbox and the
# button that triggers launch_processing().
with gr.Blocks() as demo:
    gr.Markdown(value="# 📊 Extraction BRVM automatisée")
    gr.Markdown(
        value="Le modèle est chargé au démarrage. Cliquez sur le bouton pour lancer le traitement des bulletins."
    )
    status_box = gr.Textbox(lines=20, label="Fichiers traités")
    launch_button = gr.Button(value="🚀 Lancer le traitement")
    # The handler's return value is written back into the same textbox.
    launch_button.click(
        fn=launch_processing,
        inputs=[status_box],
        outputs=[status_box],
    )

# Load the model before serving so extractor_pipeline is set when the first
# click arrives.
load_model()
demo.launch()
|