# brvm / script_brvm.py — BRVM bulletin extraction script
# (Hugging Face Space artifact; author lamekemal/helioc, commit ad83998)
# script_brvm.py
# Standard library
import json
import os
import shutil
import zipfile
from datetime import datetime
from pathlib import Path

# Third-party
import fitz  # PyMuPDF
import torch
from huggingface_hub import HfApi, hf_hub_download
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# ---------- CONFIGURATION ----------
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
PROMPT_TEMPLATE = """
[INST]Tu es un expert en analyse de données financières de la BRVM. Extrais les informations du texte suivant et retourne-les sous la forme d'un objet JSON unique et valide. Ta réponse doit commencer par `{{` et se terminer par `}}`. N'inclus aucun texte, explication ou formatage en dehors de l'objet JSON.
**Texte du bulletin à analyser :**
{texte_pdf}[/INST]
"""
MAX_NEW_TOKENS = 8192
# ---------- MODÈLE ----------
def initialize_model_pipeline():
    """Load the 4-bit quantized Mistral model and build a generation pipeline.

    Returns:
        A transformers ``text-generation`` pipeline on success, or ``None``
        when loading fails (the error is printed rather than raised).
    """
    try:
        # NF4 4-bit quantization so the 7B model fits on a single GPU.
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )

        tok = AutoTokenizer.from_pretrained(MODEL_ID)
        lm = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            quantization_config=quant_config,
            device_map="auto",
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
        )

        # Mistral ships without a pad token; reuse EOS so padded generation works.
        if tok.pad_token is None:
            tok.pad_token = tok.eos_token

        return pipeline(
            "text-generation",
            model=lm,
            tokenizer=tok,
        )
    except Exception as exc:
        print(f"Erreur chargement modèle : {exc}")
        return None
# ---------- PDF ----------
def download_and_extract_pdfs(repo_id, zip_filename, target_folder, cache_folder):
    """Download a ZIP from a HF dataset repo and extract the PDFs it contains.

    Args:
        repo_id: Hugging Face dataset repository id.
        zip_filename: Name of the ZIP file inside the repository.
        target_folder: ``Path`` where PDFs are written (flattened — archive
            subfolders are dropped, only base names are kept).
        cache_folder: Cache directory passed to ``hf_hub_download``.

    Returns:
        List of ``Path`` objects for the extracted PDFs; ``[]`` on any error
        (the error is printed, not raised).
    """
    try:
        local_zip_path = hf_hub_download(
            repo_id=repo_id,
            filename=zip_filename,
            repo_type="dataset",
            cache_dir=cache_folder
        )
        target_folder.mkdir(parents=True, exist_ok=True)
        extracted_files = []
        with zipfile.ZipFile(local_zip_path, 'r') as z:
            for member in z.infolist():
                # Zip-slip guard: skip directories, absolute paths, traversal.
                if member.is_dir() or member.filename.startswith('/') or '..' in member.filename:
                    continue
                if not member.filename.lower().endswith('.pdf'):
                    continue
                # Flatten: keep only the base name of each archive entry.
                target_path = target_folder / Path(member.filename).name
                with z.open(member) as source, open(target_path, "wb") as target:
                    # Stream in chunks instead of loading each whole PDF into
                    # memory (original did target.write(source.read())).
                    shutil.copyfileobj(source, target)
                extracted_files.append(target_path)
        return extracted_files
    except Exception as e:
        print(f"Erreur extraction PDF : {e}")
        return []
def extract_text_from_pdf(pdf_path):
    """Return the full text of a PDF, pages joined by newlines.

    Returns "" when the file cannot be opened or read (error is printed).
    """
    try:
        with fitz.open(pdf_path) as doc:
            pages = [page.get_text() for page in doc]
        return "\n".join(pages)
    except Exception as exc:
        print(f"Erreur lecture PDF : {exc}")
        return ""
# ---------- TRAITEMENT ----------
def parse_json_from_model_output(raw_output):
    """Extract the outermost JSON object from a model completion.

    The text after the last ``[/INST]`` tag is scanned for the first ``{``
    and the last ``}``; the enclosed span is parsed with ``json.loads``.

    Returns:
        The parsed dict on success, otherwise an error dict with keys
        ``error``, ``details`` and ``raw_output`` (the full original input).
    """
    try:
        answer = raw_output.split("[/INST]")[-1].strip()
        first = answer.find('{')
        last = answer.rfind('}')
        # Fail when no opening brace, or no closing brace after it.
        if first == -1 or last <= first:
            raise ValueError("Accolades JSON non trouvées.")
        return json.loads(answer[first:last + 1])
    except Exception as exc:
        return {"error": "ParsingFailed", "details": str(exc), "raw_output": raw_output}
def process_single_pdf(pdf_path, pipeline):
    """Run the extraction prompt on one PDF and return the parsed JSON.

    Args:
        pdf_path: ``Path`` of the bulletin PDF to process.
        pipeline: transformers text-generation pipeline. NOTE(review): the
            parameter name shadows ``transformers.pipeline`` imported at
            module level; kept unchanged for caller compatibility.

    Returns:
        dict of extracted data with a ``source_file`` key added, or an
        error dict (``PDF vide`` / ``PipelineError``).
    """
    text = extract_text_from_pdf(pdf_path)
    if not text.strip():
        return {"error": "PDF vide", "source_file": pdf_path.name}
    # Truncate the bulletin text to keep the prompt within the context window.
    prompt = PROMPT_TEMPLATE.format(texte_pdf=text[:30000])
    try:
        response = pipeline(
            prompt,
            max_new_tokens=MAX_NEW_TOKENS,
            # Greedy decoding; temperature removed — it is ignored when
            # do_sample=False and only triggers a transformers warning.
            do_sample=False,
            return_full_text=False,
            pad_token_id=pipeline.tokenizer.eos_token_id
        )
        raw_output = response[0]['generated_text']
        # return_full_text=False already strips the prompt, so the completion
        # can be parsed directly — the original's re-wrapping in [INST] tags
        # was redundant (the parser keeps the text after the last [/INST]).
        data = parse_json_from_model_output(raw_output)
        data['source_file'] = pdf_path.name
        return data
    except Exception as e:
        return {"error": "PipelineError", "details": str(e), "source_file": pdf_path.name}
# ---------- UPLOAD ----------
def upload_results_to_hf_single(result, repo_id, hf_token):
    """Upload one extraction result as ``<source_file>.json`` to a HF dataset.

    Args:
        result: dict that must contain a ``source_file`` key.
        repo_id: Target Hugging Face dataset repository.
        hf_token: HF access token; when falsy, nothing is uploaded.
    """
    if not hf_token:
        print("HF_TOKEN manquant.")
        return
    temp_path = Path("temp_result.json")
    try:
        api = HfApi(token=hf_token)
        with open(temp_path, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        api.upload_file(
            path_or_fileobj=str(temp_path),
            repo_id=repo_id,
            repo_type="dataset",
            path_in_repo=f"{result['source_file']}.json",
            commit_message=f"Upload {result['source_file']}"
        )
    except Exception as e:
        print(f"Erreur upload : {e}")
    finally:
        # Always remove the scratch file — the original only unlinked on
        # success, leaking temp_result.json when upload_file raised.
        temp_path.unlink(missing_ok=True)