File size: 5,223 Bytes
ad83998
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# script_brvm.py

# Standard library
import json
import os
import shutil
import zipfile
from datetime import datetime
from pathlib import Path

# Third-party
import fitz  # PyMuPDF
import torch
from huggingface_hub import hf_hub_download, HfApi
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# ---------- CONFIGURATION ----------
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
PROMPT_TEMPLATE = """
[INST]Tu es un expert en analyse de données financières de la BRVM. Extrais les informations du texte suivant et retourne-les sous la forme d'un objet JSON unique et valide. Ta réponse doit commencer par `{{` et se terminer par `}}`. N'inclus aucun texte, explication ou formatage en dehors de l'objet JSON.

**Texte du bulletin à analyser :**
{texte_pdf}[/INST]
"""

MAX_NEW_TOKENS = 8192

# ---------- MODÈLE ----------
def initialize_model_pipeline():
    try:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )

        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            quantization_config=bnb_config,
            device_map="auto",
            torch_dtype=torch.bfloat16,
            trust_remote_code=True
        )

        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        extractor_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
        )
        return extractor_pipeline
    except Exception as e:
        print(f"Erreur chargement modèle : {e}")
        return None

# ---------- PDF ----------
def download_and_extract_pdfs(repo_id, zip_filename, target_folder, cache_folder):
    try:
        local_zip_path = hf_hub_download(
            repo_id=repo_id,
            filename=zip_filename,
            repo_type="dataset",
            cache_dir=cache_folder
        )

        target_folder.mkdir(parents=True, exist_ok=True)
        extracted_files = []

        with zipfile.ZipFile(local_zip_path, 'r') as z:
            for member in z.infolist():
                if member.is_dir() or member.filename.startswith('/') or '..' in member.filename:
                    continue
                if member.filename.lower().endswith('.pdf'):
                    target_path = target_folder / Path(member.filename).name
                    with z.open(member) as source, open(target_path, "wb") as target:
                        target.write(source.read())
                    extracted_files.append(target_path)

        return extracted_files
    except Exception as e:
        print(f"Erreur extraction PDF : {e}")
        return []

def extract_text_from_pdf(pdf_path):
    try:
        with fitz.open(pdf_path) as doc:
            return "\n".join(page.get_text() for page in doc)
    except Exception as e:
        print(f"Erreur lecture PDF : {e}")
        return ""

# ---------- TRAITEMENT ----------
def parse_json_from_model_output(raw_output):
    """Extract and parse the JSON object embedded in a model response.

    Everything up to the last "[/INST]" marker is discarded, then the span
    from the first '{' to the last '}' is parsed as JSON.

    Returns:
        The parsed dict on success; otherwise a diagnostic dict of the form
        {"error": "ParsingFailed", "details": ..., "raw_output": ...}.
    """
    try:
        tail = raw_output.split("[/INST]")[-1].strip()
        opening = tail.find('{')
        closing = tail.rfind('}')
        if opening == -1 or closing <= opening:
            raise ValueError("Accolades JSON non trouvées.")
        return json.loads(tail[opening:closing + 1])
    except Exception as e:
        return {"error": "ParsingFailed", "details": str(e), "raw_output": raw_output}

def process_single_pdf(pdf_path, pipeline):
    text = extract_text_from_pdf(pdf_path)
    if not text.strip():
        return {"error": "PDF vide", "source_file": pdf_path.name}

    prompt = PROMPT_TEMPLATE.format(texte_pdf=text[:30000])
    try:
        response = pipeline(
            prompt,
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=0.2,
            do_sample=False,
            return_full_text=False,
            pad_token_id=pipeline.tokenizer.eos_token_id
        )
        raw_output = response[0]['generated_text']
        data = parse_json_from_model_output(f"[INST]{prompt}[/INST]{raw_output}")
        data['source_file'] = pdf_path.name
        return data
    except Exception as e:
        return {"error": "PipelineError", "details": str(e), "source_file": pdf_path.name}

# ---------- UPLOAD ----------
def upload_results_to_hf_single(result, repo_id, hf_token):
    if not hf_token:
        print("HF_TOKEN manquant.")
        return
    try:
        api = HfApi(token=hf_token)
        temp_path = Path("temp_result.json")
        with open(temp_path, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)

        api.upload_file(
            path_or_fileobj=str(temp_path),
            repo_id=repo_id,
            repo_type="dataset",
            path_in_repo=f"{result['source_file']}.json",
            commit_message=f"Upload {result['source_file']}"
        )
        temp_path.unlink()
    except Exception as e:
        print(f"Erreur upload : {e}")