ZedLow committed on
Commit
f2863bc
·
verified ·
1 Parent(s): 97ef6d2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +162 -218
app.py CHANGED
@@ -1,240 +1,184 @@
1
- import os
2
- import sys
3
- import json
4
- import time
5
- import logging
6
- import torch
7
  import spaces
 
8
  import gradio as gr
 
9
  import torch.nn.functional as F
10
  from PIL import Image
11
  from transformers import AutoModel, AutoTokenizer, Qwen2VLForConditionalGeneration, AutoProcessor, AutoModelForSequenceClassification
12
  from qwen_vl_utils import process_vision_info
13
 
14
- # --- LOGGING ---
15
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
16
- logger = logging.getLogger(__name__)
17
-
18
- # --- CONFIGURATION (MODE STABLE) ---
19
- CONFIG = {
20
- # On reste sur le 1.5B : C'est le SEUL qui ne fait pas crasher ZeroGPU au démarrage
21
- "embedding_model": "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
22
- "rerank_model": "BAAI/bge-reranker-v2-m3",
23
- "vision_model": "Qwen/Qwen2-VL-2B-Instruct",
24
- "data_path": "data/dataset.json",
25
- "allowed_image_dir": "data",
26
- "max_embed_len": 2048,
27
- "max_rerank_len": 512
28
- }
29
-
30
- # --- PATCH ---
31
- def apply_patches():
32
- import transformers
33
- if not hasattr(transformers.PreTrainedModel, "all_tied_weights_keys"):
34
- setattr(transformers.PreTrainedModel, "all_tied_weights_keys", {})
35
-
36
- # --- ENGINE CLASS ---
37
- class FinancialAnalystEngine:
38
- def __init__(self):
39
- logger.info("🏗️ Initializing Engine...")
40
- apply_patches()
41
-
42
- self.dataset = []
43
- self.doc_embeddings = None
44
- self.load_data()
45
-
46
- logger.info("🔹 Loading Models (CPU Mode for Stability)...")
47
-
48
- # 1. Chargement CPU (Vital pour ne pas avoir l'erreur "No CUDA")
49
- self.embed_tokenizer = AutoTokenizer.from_pretrained(CONFIG["embedding_model"], trust_remote_code=False)
50
- self.embed_model = AutoModel.from_pretrained(CONFIG["embedding_model"], trust_remote_code=False, torch_dtype=torch.float16).eval()
51
-
52
- self.rerank_tokenizer = AutoTokenizer.from_pretrained(CONFIG["rerank_model"])
53
- self.rerank_model = AutoModelForSequenceClassification.from_pretrained(CONFIG["rerank_model"], torch_dtype=torch.float16).eval()
54
-
55
- self.vision_processor = AutoProcessor.from_pretrained(CONFIG["vision_model"])
56
- # Pas de flash_attention_2 ici, c'est ça qui causait ton autre crash
57
- self.vision_model = Qwen2VLForConditionalGeneration.from_pretrained(CONFIG["vision_model"], torch_dtype=torch.float16).eval()
58
-
59
- # 2. Indexation immédiate
60
- self.index_documents()
61
-
62
- logger.info("🚀 Engine Ready.")
63
-
64
- def load_data(self):
65
- try:
66
- with open(CONFIG["data_path"], "r", encoding="utf-8") as f:
67
- self.dataset = json.load(f)
68
- logger.info(f"📂 Dataset loaded: {len(self.dataset)} documents.")
69
- except Exception as e:
70
- logger.error(f"❌ Failed to load dataset: {e}")
71
- self.dataset = []
72
-
73
- def validate_image_path(self, path):
74
- clean_path = os.path.abspath(path)
75
- allowed_path = os.path.abspath(CONFIG["allowed_image_dir"])
76
- if not clean_path.startswith(allowed_path):
77
- return None
78
- return clean_path
79
-
80
- def last_token_pool(self, last_hidden_states, attention_mask):
81
- left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
82
- if left_padding:
83
- return last_hidden_states[:, -1]
84
- else:
85
- sequence_lengths = attention_mask.sum(dim=1) - 1
86
- batch_size = last_hidden_states.shape[0]
87
- # Sécurité pour éviter les erreurs de device
88
- sequence_lengths = sequence_lengths.to(last_hidden_states.device)
89
- return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
90
-
91
- def index_documents(self):
92
- if not self.dataset: return
93
- logger.info("⚙️ Indexing documents...")
94
- texts = [d.get('text', '') for d in self.dataset]
95
- embeddings = []
96
- batch_size = 4
97
-
98
- with torch.no_grad():
99
- for i in range(0, len(texts), batch_size):
100
- batch = texts[i : i + batch_size]
101
- inputs = self.embed_tokenizer(
102
- batch, max_length=CONFIG["max_embed_len"], padding=True, truncation=True, return_tensors="pt"
103
- )
104
- outputs = self.embed_model(**inputs)
105
- emb = self.last_token_pool(outputs.last_hidden_state, inputs['attention_mask'])
106
- emb = F.normalize(emb, p=2, dim=1)
107
- embeddings.append(emb)
108
-
109
- if embeddings:
110
- self.doc_embeddings = torch.cat(embeddings, dim=0)
111
- logger.info(f"✅ Indexing complete. Shape: {self.doc_embeddings.shape}")
112
-
113
- def pipeline(self, query):
114
- start_time = time.time()
115
-
116
- # ZeroGPU active le GPU ici. On vérifie s'il est là.
117
- device = "cuda" if torch.cuda.is_available() else "cpu"
118
-
119
- # Transfert des modèles vers le GPU (Just-in-Time)
120
- self.embed_model.to(device)
121
- self.rerank_model.to(device)
122
- self.vision_model.to(device)
123
- if self.doc_embeddings is not None:
124
- self.doc_embeddings = self.doc_embeddings.to(device)
125
-
126
- if not self.dataset or self.doc_embeddings is None:
127
- return [], "System not initialized.", ""
128
-
129
- # === 1. RETRIEVAL ===
130
- query_prompt = f"Instruct: Given a user query, retrieve relevant passages that answer the query.\nQuery: {query}"
131
-
132
- with torch.no_grad():
133
- q_inputs = self.embed_tokenizer([query_prompt], max_length=CONFIG["max_embed_len"], truncation=True, return_tensors="pt").to(device)
134
- q_out = self.embed_model(**q_inputs)
135
- q_emb = self.last_token_pool(q_out.last_hidden_state, q_inputs['attention_mask'])
136
- q_emb = F.normalize(q_emb, p=2, dim=1)
137
 
138
- scores = (q_emb @ self.doc_embeddings.T).squeeze(0)
139
- top_k_indices = torch.topk(scores, k=min(10, len(scores))).indices.tolist()
140
-
141
- # === 2. RERANKING ===
142
- pairs = [[query, self.dataset[idx]['text']] for idx in top_k_indices]
 
 
143
 
144
- with torch.no_grad():
145
- r_inputs = self.rerank_tokenizer(pairs, padding=True, truncation=True, max_length=CONFIG["max_rerank_len"], return_tensors="pt").to(device)
146
- r_scores = self.rerank_model(**r_inputs, return_dict=True).logits.view(-1).float()
147
- top_3_indices_local = torch.topk(r_scores, k=min(3, len(r_scores))).indices.tolist()
148
-
149
- # === 3. CONTEXT & IMAGES (LE FIX ANTI-HALLUCINATION) ===
150
- images_content = []
151
- gallery_data = []
152
- sources_md = "### 📚 Verified Sources\n\n"
153
 
154
- for rank, idx_local in enumerate(top_3_indices_local):
155
- global_idx = top_k_indices[idx_local]
156
- doc = self.dataset[global_idx]
157
- score = r_scores[idx_local].item()
158
-
159
- valid_path = self.validate_image_path(doc['image_path'])
160
- if not valid_path: continue
161
 
162
- try:
163
- img = Image.open(valid_path)
164
-
165
- # --- LE FIX EST ICI ---
166
- # On écrit en GROS le nom du document pour l'IA
167
- doc_name = doc.get('doc_name', 'Unknown Document')
168
- doc_section = doc.get('section', 'Unknown Section')
169
-
170
- context_header = (
171
- f"\n--- DOCUMENT {rank+1} METADATA ---\n"
172
- f"FILE NAME: {doc_name}\n" # Ex: Microsoft_2023_Report
173
- f"SECTION: {doc_section}\n"
174
- f"RELEVANCE: {score:.2f}\n"
175
- "---------------------------\n"
176
- )
177
-
178
- images_content.append({"type": "text", "text": context_header})
179
- images_content.append({"type": "image", "image": img})
180
-
181
- gallery_data.append((img, f"{doc_name}"))
182
- sources_md += f"**{rank+1}. {doc_name}** - *{doc_section}* (Score: {score:.2f})\n"
183
- except Exception as e:
184
- logger.error(f"Image load error: {e}")
185
- continue
186
-
187
- # === 4. GENERATION ===
188
- # Prompt Strict pour forcer la lecture du header
189
- system_prompt = (
190
- "You are a strict financial data extraction engine. "
191
- "Analyze the provided images to answer the user query.\n"
192
- "CRITICAL RULES:\n"
193
- "1. Read the 'DOCUMENT METADATA' provided before each image.\n"
194
- "2. If the user asks about 'Microsoft', ONLY use images labeled as Microsoft/MSFT.\n"
195
- "3. If the user asks about 'Apple', ONLY use images labeled as Apple/AAPL.\n"
196
- "4. Do not mix data between companies.\n"
197
- "Output format:\n- **Answer**: [Direct Answer]\n- **Evidence**: [Quote]\n- **Context**: [Year/Company]"
198
- )
199
 
200
- messages = [
201
- {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
202
- {"role": "user", "content": images_content + [{"type": "text", "text": f"Query: {query}"}]}
203
- ]
 
 
 
 
 
 
 
 
 
 
204
 
205
- text_input = self.vision_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
206
- inputs = self.vision_processor(text=[text_input], images=process_vision_info(messages)[0], padding=True, return_tensors="pt").to(device)
 
207
 
208
- generated_ids = self.vision_model.generate(**inputs, max_new_tokens=512, temperature=0.1)
209
- generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
210
- response = self.vision_processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
211
-
212
- logger.info(f"⏱️ Total Latency: {time.time() - start_time:.2f}s")
213
- return gallery_data, sources_md, response
214
-
215
- # --- INSTANTIATION ---
216
- engine = FinancialAnalystEngine()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
 
218
- # --- UI ---
219
- @spaces.GPU(duration=60)
220
- def run_query(query):
221
- return engine.pipeline(query)
222
 
223
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
224
- gr.Markdown("# ⚡ AI Financial Analyst (Safe Mode)")
 
225
 
226
  with gr.Row():
227
- inp = gr.Textbox(label="Question", placeholder="Ex: What is the Operating Income for Microsoft?", scale=4)
228
- btn = gr.Button("Analyze", variant="primary", scale=1)
229
-
230
  with gr.Row():
231
- with gr.Column(scale=2):
232
- out_gallery = gr.Gallery(label="Documents", columns=3, height=400)
233
- with gr.Column(scale=1):
234
- out_meta = gr.Markdown(label="Sources")
235
- out_resp = gr.Markdown(label="Answer")
236
-
237
- btn.click(run_query, inp, [out_gallery, out_meta, out_resp])
238
 
239
  if __name__ == "__main__":
240
  demo.launch()
 
 
 
 
 
 
 
1
  import spaces
2
+ import torch
3
  import gradio as gr
4
+ import json
5
  import torch.nn.functional as F
6
  from PIL import Image
7
  from transformers import AutoModel, AutoTokenizer, Qwen2VLForConditionalGeneration, AutoProcessor, AutoModelForSequenceClassification
8
  from qwen_vl_utils import process_vision_info
9
 
10
# --- CONFIGURATION ---
print("🚀 Démarrage RAG Finance (Mode Multi-View : 3 Images)...")

# --- 1. DATA ---
# Load the pre-built page index. An unreadable or missing file degrades to an
# empty dataset instead of crashing the Space at import time.
try:
    with open("data/dataset.json", "r", encoding="utf-8") as f:
        dataset = json.load(f)
except (OSError, json.JSONDecodeError):  # was a bare `except:` — too broad
    dataset = []
    print("⚠️ Index vide.")

# --- 2. MODELS ---
# A. EMBEDDING: GTE-Qwen2-7B (the heavy model that previously caused memory crashes)
EMBED_MODEL_ID = "Alibaba-NLP/gte-Qwen2-7B-instruct"
print(f"🔹 Chargement Embedder : {EMBED_MODEL_ID}")

embed_tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID, trust_remote_code=False)
embed_model = AutoModel.from_pretrained(
    EMBED_MODEL_ID,
    trust_remote_code=False,
    torch_dtype=torch.bfloat16,
    # NOTE(review): flash_attention_2 fails at load time when no GPU is
    # visible yet (ZeroGPU attaches the GPU lazily) — confirm before keeping.
    attn_implementation="flash_attention_2",
    device_map="auto",
).eval()  # inference only: disable dropout / training-mode layers

# B. RERANKER
RERANK_MODEL_ID = "BAAI/bge-reranker-v2-m3"
print(f"⚖️ Chargement Reranker : {RERANK_MODEL_ID}")
rerank_tokenizer = AutoTokenizer.from_pretrained(RERANK_MODEL_ID)
rerank_model = AutoModelForSequenceClassification.from_pretrained(
    RERANK_MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
).eval()

# C. VISION
GEN_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
print(f"👁️ Chargement Vision : {GEN_MODEL_ID}")
gen_model = Qwen2VLForConditionalGeneration.from_pretrained(
    GEN_MODEL_ID,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
).eval()
gen_processor = AutoProcessor.from_pretrained(GEN_MODEL_ID)
56
+
57
# --- 3. HELPERS ---
def last_token_pool(last_hidden_states, attention_mask):
    """Reduce a batch of hidden states to one vector per sequence.

    Returns the hidden state of each sequence's last *attended* token — the
    pooling scheme the GTE-Qwen2 embedding models expect.

    last_hidden_states: float tensor of shape (batch, seq, hidden).
    attention_mask: 0/1 tensor of shape (batch, seq).
    """
    # With left padding, every sequence's final position is attended, so the
    # mask's last column sums to the batch size and we can slice directly.
    if attention_mask[:, -1].sum() == attention_mask.shape[0]:
        return last_hidden_states[:, -1]
    # Right padding: gather each row at its own last attended index.
    lengths = attention_mask.sum(dim=1) - 1
    rows = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[rows, lengths]
66
+
67
# --- 4. PIPELINE ---
@spaces.GPU
def retrieve_and_answer(query):
    """Full RAG pass: embed query -> retrieve top-10 -> rerank top-3 -> VLM answer.

    Returns (gallery_preview, meta_info_markdown, answer_text) matching the
    three Gradio outputs. Runs on GPU via the ZeroGPU decorator.
    """
    print(f"⚡ Question : {query}")

    if not dataset:
        return None, "Base vide", "Pas de document"

    # 1. RETRIEVAL — keep only documents with non-empty text, remembering
    # their position in `dataset` so we can recover image metadata later.
    # NOTE(review): document embeddings are recomputed on EVERY query; caching
    # them at startup is the single biggest latency win available here.
    valid_docs = [
        {'text': text, 'original_index': i}
        for i, doc in enumerate(dataset)
        if (text := doc.get('text', '').strip())
    ]

    query_text = f"Instruct: Given a user query, retrieve relevant passages that answer the query.\nQuery: {query}"

    with torch.no_grad():
        q_inputs = embed_tokenizer([query_text], max_length=8192, padding=True,
                                   truncation=True, return_tensors='pt').to(embed_model.device)
        q_emb = last_token_pool(embed_model(**q_inputs).last_hidden_state,
                                q_inputs['attention_mask'])
        q_emb = F.normalize(q_emb, p=2, dim=1)

        # Embed documents one at a time (batch size 1) to bound peak memory on
        # the 7B embedder. Explicitly inside no_grad so activations are freed.
        doc_texts = [d['text'] for d in valid_docs]
        d_embeddings_list = []
        for i in range(0, len(doc_texts), 1):
            d_inputs = embed_tokenizer(doc_texts[i:i + 1], max_length=8192, padding=True,
                                       truncation=True, return_tensors='pt').to(embed_model.device)
            batch_emb = last_token_pool(embed_model(**d_inputs).last_hidden_state,
                                        d_inputs['attention_mask'])
            d_embeddings_list.append(F.normalize(batch_emb, p=2, dim=1))

    d_emb_final = torch.cat(d_embeddings_list, dim=0)
    # Cosine similarity (both sides are L2-normalized), then top-10 candidates.
    scores = (q_emb @ d_emb_final.T).squeeze(0)
    top_k_indices = torch.topk(scores, k=min(10, len(scores))).indices.tolist()

    # 2. RERANKING — cross-encoder rescoring of the retrieval candidates.
    pairs = [[query, valid_docs[idx]['text']] for idx in top_k_indices]

    with torch.no_grad():
        r_inputs = rerank_tokenizer(pairs, padding=True, truncation=True,
                                    return_tensors='pt', max_length=8192).to(rerank_model.device)
        r_scores = rerank_model(**r_inputs, return_dict=True).logits.view(-1).float()
    top_3_indices_local = torch.topk(r_scores, k=min(3, len(r_scores))).indices.tolist()

    # 3. IMAGE CONTEXT
    images_content = []
    gallery_preview = []
    meta_info = ""

    for rank, idx_local in enumerate(top_3_indices_local):
        # idx_local indexes the reranked top-10; map back through top_k_indices
        # (position in valid_docs) to the document's index in `dataset`.
        doc = dataset[valid_docs[top_k_indices[idx_local]]['original_index']]
        score = r_scores[idx_local].item()

        try:
            img = Image.open(doc['image_path'])
        except (OSError, KeyError) as e:
            # Was a bare `except: continue` that hid all errors; the broad
            # catch also masked a missing 'doc_name' AFTER the image had been
            # appended, leaving images_content/gallery in inconsistent state.
            print(f"⚠️ Image skipped: {e}")
            continue

        # KNOWN LIMITATION (hallucination source): the VLM is only told
        # "Image N", never which company/document the page belongs to.
        images_content.append({"type": "text", "text": f"Image {rank+1} (Pertinence: {score:.2f}):\n"})
        images_content.append({"type": "image", "image": img})
        gallery_preview.append((img, f"Page {rank+1} - Score {score:.2f}"))
        meta_info += f"- **Image {rank+1}:** {doc.get('doc_name', 'Unknown Document')} (Score: {score:.2f})\n"

    # 4. GENERATION
    system_prompt = (
        "You are an expert financial analyst examining 3 pages of a report. "
        "Your goal is to answer the user question using ONLY the provided images."
    )

    user_content = images_content + [{"type": "text", "text": f"\nUser Question: {query}"}]

    messages = [
        {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
        {"role": "user", "content": user_content}
    ]

    text_input = gen_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = gen_processor(
        text=[text_input],
        images=image_inputs,
        padding=True,
        return_tensors="pt",
    ).to(gen_model.device)

    with torch.no_grad():  # generation needs no gradients either
        generated_ids = gen_model.generate(**inputs, max_new_tokens=768)
    # Strip the prompt tokens so only the newly generated answer is decoded.
    generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    response = gen_processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    return gallery_preview, meta_info, response
 
 
 
167
 
168
# --- 5. UI ---
# Single-page Gradio app: one question box, one button, three result panes.
with gr.Blocks(title="RAG Finance") as demo:
    gr.Markdown("# 🚀 RAG Finance (Version Originale Instable)")

    # Input row: free-text question plus the trigger button.
    with gr.Row():
        question_box = gr.Textbox(label="Question")
        analyze_btn = gr.Button("Analyser", variant="primary")

    # Output row: retrieved page images, their source list, and the answer.
    with gr.Row():
        pages_gallery = gr.Gallery(label="Pages")
        sources_panel = gr.Markdown(label="Sources")
        answer_panel = gr.Markdown(label="Réponse")

    analyze_btn.click(
        retrieve_and_answer,
        inputs=question_box,
        outputs=[pages_gallery, sources_panel, answer_panel],
    )


if __name__ == "__main__":
    demo.launch()