File size: 2,438 Bytes
ad83998
e8e2397
ad83998
 
1c25b7b
 
2e1ee31
f32ba89
ad83998
1c25b7b
ad83998
 
 
 
 
 
b0efaca
ad83998
e8e2397
f32ba89
e8e2397
 
ad83998
 
 
2e1ee31
ad83998
 
e8e2397
ad83998
 
 
e8e2397
ad83998
 
 
 
 
 
 
e8e2397
2e1ee31
ad83998
 
 
e8e2397
ad83998
 
e8e2397
ad83998
 
 
e8e2397
ad83998
e8e2397
ad83998
 
e8e2397
ad83998
 
e8e2397
ad83998
 
 
e8e2397
ad83998
 
 
e8e2397
ad83998
 
 
1c25b7b
ad83998
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# app.py

import gradio as gr
import threading
import json
from pathlib import Path
from datetime import datetime
import uuid
import os

from script_brvm import (
    initialize_model_pipeline,
    download_and_extract_pdfs,
    process_single_pdf,
    upload_results_to_hf_single
)

# ---------- CONFIGURATION ----------
# Input side: dataset repo and archive name handed to
# download_and_extract_pdfs, plus the local extraction and cache folders.
HF_DATASET_PDFS_REPO_ID = "lamekemal/brvm-reports-pdfs"
ZIP_FILENAME_IN_DATASET = "brvm_reports.zip"
LOCAL_PDF_FOLDER = Path("brvm_reports_extracted")
LOCAL_CACHE_DIR = Path("./hf_cache")

# Output side: dataset repo receiving the uploaded results, and the local
# base folder under which each run creates its own JSON sub-folder.
HF_DATASET_JSON_REPO_ID = "lamekemal/brvm-reports-json"
LOCAL_JSON_OUTPUT_BASE_FOLDER = Path("brvm_json_outputs")

# Token read from the environment; None when HF_TOKEN is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Mutable module state shared by the functions below.
extractor_pipeline = None  # rebound by load_model()
processed_files = []  # names of the PDFs handled so far

def load_model():
    """Initialize the extraction pipeline and keep it in module state.

    Rebinds the module-level ``extractor_pipeline`` to whatever
    ``initialize_model_pipeline()`` returns.
    """
    global extractor_pipeline
    pipeline = initialize_model_pipeline()
    extractor_pipeline = pipeline

def start_background_processing(status_box):
    """Start the PDF extraction batch in a background thread.

    The worker downloads and extracts the PDF archive, then for each PDF runs
    the extraction pipeline, writes the result as JSON into a per-run folder,
    uploads it to the JSON dataset and appends the file name to
    ``processed_files``.

    Args:
        status_box: Object supplied by the caller. After each processed file
            its ``update(value=...)`` method is called when it has one. When
            this function is reached through a Gradio event wired with
            ``inputs=[status_box]``, the argument is the textbox's current
            value (a ``str``), which has no ``update``; the call is then
            skipped instead of crashing the worker.

    Returns:
        The started ``threading.Thread``, so callers may ``join()`` it.
    """
    import logging

    logger = logging.getLogger(__name__)

    # Resolve the optional status callback once, up front.
    push_status = getattr(status_box, "update", None)
    if not callable(push_status):
        push_status = None

    def background_task():
        pdf_files = download_and_extract_pdfs(
            HF_DATASET_PDFS_REPO_ID,
            ZIP_FILENAME_IN_DATASET,
            LOCAL_PDF_FOLDER,
            LOCAL_CACHE_DIR
        )

        # Timestamp plus a short random suffix: each run gets its own folder.
        run_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + "_" + uuid.uuid4().hex[:8]
        local_output_dir = LOCAL_JSON_OUTPUT_BASE_FOLDER / run_id
        local_output_dir.mkdir(parents=True, exist_ok=True)

        for pdf_path in pdf_files:
            try:
                result = process_single_pdf(pdf_path, extractor_pipeline)

                output_json_path = local_output_dir / f"{pdf_path.stem}.json"
                with open(output_json_path, "w", encoding="utf-8") as f:
                    json.dump(result, f, ensure_ascii=False, indent=2)

                upload_results_to_hf_single(result, HF_DATASET_JSON_REPO_ID, HF_TOKEN)
            except Exception:
                # One failing report must not abort the remaining ones.
                logger.exception("Failed to process %s", pdf_path)
                continue

            processed_files.append(pdf_path.name)
            if push_status is not None:
                push_status(value="\n".join(processed_files))

    thread = threading.Thread(target=background_task)
    thread.start()
    return thread

def launch_processing(status_box):
    """Click handler: start the background processing and confirm it.

    Args:
        status_box: Forwarded unchanged to ``start_background_processing``.

    Returns:
        The confirmation message string.
    """
    confirmation = "✅ Traitement lancé."
    start_background_processing(status_box)
    return confirmation

# Build the UI: a title, a short explanation, a status textbox and a button
# whose click handler is launch_processing.
with gr.Blocks() as demo:
    gr.Markdown(value="# 📊 Extraction BRVM automatisée")
    gr.Markdown(
        value="Le modèle est chargé au démarrage. Cliquez sur le bouton pour lancer le traitement des bulletins."
    )

    status_box = gr.Textbox(lines=20, label="Fichiers traités")
    launch_button = gr.Button(value="🚀 Lancer le traitement")
    launch_button.click(
        fn=launch_processing,
        inputs=[status_box],
        outputs=[status_box],
    )

# Load the model before the app starts serving.
load_model()
demo.launch()