File size: 5,223 Bytes
ad83998 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
# script_brvm.py
import fitz # PyMuPDF
import json
from pathlib import Path
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import os
from huggingface_hub import hf_hub_download, HfApi
from datetime import datetime
import zipfile
# ---------- CONFIGURATION ----------
# Instruction-tuned model used for extraction (loaded 4-bit in initialize_model_pipeline).
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
# French extraction prompt. Note: `{{` / `}}` are literal braces once .format()
# is applied; {texte_pdf} is the only substitution slot (filled in process_single_pdf).
PROMPT_TEMPLATE = """
[INST]Tu es un expert en analyse de données financières de la BRVM. Extrais les informations du texte suivant et retourne-les sous la forme d'un objet JSON unique et valide. Ta réponse doit commencer par `{{` et se terminer par `}}`. N'inclus aucun texte, explication ou formatage en dehors de l'objet JSON.
**Texte du bulletin à analyser :**
{texte_pdf}[/INST]
"""
# Upper bound on tokens generated per pipeline call.
MAX_NEW_TOKENS = 8192
# ---------- MODÈLE ----------
def initialize_model_pipeline():
    """Load MODEL_ID quantized to 4-bit (NF4) and wrap it in a text-generation pipeline.

    Returns:
        A Hugging Face ``pipeline`` ready for generation, or ``None`` if
        loading fails (the error is printed, not raised).
    """
    try:
        quant_cfg = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )
        tok = AutoTokenizer.from_pretrained(MODEL_ID)
        llm = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            quantization_config=quant_cfg,
            device_map="auto",
            torch_dtype=torch.bfloat16,
            # NOTE(review): trust_remote_code shouldn't be needed for Mistral;
            # kept as-is to preserve behavior — confirm before removing.
            trust_remote_code=True,
        )
        # Mistral ships without a pad token; reuse EOS so batching/padding works.
        if tok.pad_token is None:
            tok.pad_token = tok.eos_token
        return pipeline("text-generation", model=llm, tokenizer=tok)
    except Exception as e:
        print(f"Erreur chargement modèle : {e}")
        return None
# ---------- PDF ----------
def download_and_extract_pdfs(repo_id, zip_filename, target_folder, cache_folder):
    """Download a zip from a HF dataset repo and extract its PDFs into target_folder.

    Args:
        repo_id: Hugging Face dataset repository id.
        zip_filename: Name of the zip file inside the repo.
        target_folder: pathlib.Path where PDFs are written (created if missing).
        cache_folder: Cache directory passed to hf_hub_download.

    Returns:
        List of extracted PDF paths; empty list on any error (error is printed).
    """
    try:
        archive_path = hf_hub_download(
            repo_id=repo_id,
            filename=zip_filename,
            repo_type="dataset",
            cache_dir=cache_folder,
        )
        target_folder.mkdir(parents=True, exist_ok=True)
        pdf_paths = []
        with zipfile.ZipFile(archive_path, 'r') as archive:
            for entry in archive.infolist():
                # Skip directories and zip-slip style names (absolute or parent-relative).
                if entry.is_dir() or entry.filename.startswith('/') or '..' in entry.filename:
                    continue
                if not entry.filename.lower().endswith('.pdf'):
                    continue
                # Flatten the archive layout: keep only the file name.
                destination = target_folder / Path(entry.filename).name
                destination.write_bytes(archive.read(entry))
                pdf_paths.append(destination)
        return pdf_paths
    except Exception as e:
        print(f"Erreur extraction PDF : {e}")
        return []
def extract_text_from_pdf(pdf_path):
    """Return the full text of a PDF, pages joined by newlines; "" on error."""
    try:
        with fitz.open(pdf_path) as document:
            pages = [page.get_text() for page in document]
        return "\n".join(pages)
    except Exception as e:
        print(f"Erreur lecture PDF : {e}")
        return ""
# ---------- TRAITEMENT ----------
def parse_json_from_model_output(raw_output):
    """Extract the JSON object from a model completion.

    Takes the text after the last ``[/INST]`` marker, finds the first ``{``
    and parses one complete JSON object starting there.

    Fix: the previous ``find('{')``/``rfind('}')`` slice broke whenever the
    model emitted prose containing ``}`` after the JSON object (the slice then
    included the trailing prose and ``json.loads`` failed).
    ``JSONDecoder.raw_decode`` stops at the end of the first valid object and
    ignores anything after it.

    Args:
        raw_output: Full prompt+completion text (or completion alone).

    Returns:
        The parsed dict on success, otherwise
        ``{"error": "ParsingFailed", "details": ..., "raw_output": ...}``.
    """
    try:
        generated_text = raw_output.split("[/INST]")[-1].strip()
        start_index = generated_text.find('{')
        if start_index == -1:
            raise ValueError("Accolades JSON non trouvées.")
        # raw_decode parses exactly one JSON value and tolerates trailing text.
        parsed, _ = json.JSONDecoder().raw_decode(generated_text[start_index:])
        return parsed
    except Exception as e:
        return {"error": "ParsingFailed", "details": str(e), "raw_output": raw_output}
def process_single_pdf(pdf_path, pipeline):
    """Run the extraction model on one PDF and return the parsed JSON data.

    Args:
        pdf_path: pathlib.Path of the PDF bulletin.
        pipeline: Hugging Face text-generation pipeline
            (see initialize_model_pipeline).

    Returns:
        The extracted data dict with a ``source_file`` key added, or an
        error dict (``error`` + ``source_file`` keys) on empty PDF or
        pipeline failure.

    Fix: dropped ``temperature=0.2`` — with ``do_sample=False`` generation is
    greedy and temperature is ignored (transformers emits a warning about the
    unused flag); passing it only obscured the intended deterministic decoding.
    """
    text = extract_text_from_pdf(pdf_path)
    if not text.strip():
        return {"error": "PDF vide", "source_file": pdf_path.name}
    # Truncate very long bulletins so prompt + generation fit the context window.
    prompt = PROMPT_TEMPLATE.format(texte_pdf=text[:30000])
    try:
        response = pipeline(
            prompt,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,  # greedy, deterministic decoding
            return_full_text=False,
            pad_token_id=pipeline.tokenizer.eos_token_id
        )
        raw_output = response[0]['generated_text']
        # Wrap with markers so the parser's split on [/INST] isolates raw_output.
        data = parse_json_from_model_output(f"[INST]{prompt}[/INST]{raw_output}")
        data['source_file'] = pdf_path.name
        return data
    except Exception as e:
        return {"error": "PipelineError", "details": str(e), "source_file": pdf_path.name}
# ---------- UPLOAD ----------
def upload_results_to_hf_single(result, repo_id, hf_token):
    """Upload one result dict as ``<source_file>.json`` to a HF dataset repo.

    Writes the dict to a local scratch file, uploads it, then deletes the
    scratch file. No-op (with a message) when hf_token is falsy; upload
    errors are printed, not raised.

    Args:
        result: Dict to serialize; must contain a ``source_file`` key.
        repo_id: Target Hugging Face dataset repository id.
        hf_token: Hugging Face API token.

    Fix: the scratch file was only unlinked on the success path, so any
    ``upload_file`` failure leaked ``temp_result.json``; cleanup now runs
    in a ``finally`` block.
    """
    if not hf_token:
        print("HF_TOKEN manquant.")
        return
    try:
        api = HfApi(token=hf_token)
        temp_path = Path("temp_result.json")
        with open(temp_path, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        try:
            api.upload_file(
                path_or_fileobj=str(temp_path),
                repo_id=repo_id,
                repo_type="dataset",
                path_in_repo=f"{result['source_file']}.json",
                commit_message=f"Upload {result['source_file']}"
            )
        finally:
            # Remove the scratch file even when the upload raises.
            temp_path.unlink(missing_ok=True)
    except Exception as e:
        print(f"Erreur upload : {e}")
|