ignaciaginting committed on
Commit
4c8d2d0
·
verified ·
1 Parent(s): 6387027

check the colpali first

Browse files
Files changed (1) hide show
  1. app.py +58 -67
app.py CHANGED
@@ -1,95 +1,86 @@
1
  import streamlit as st
2
- import fitz # PyMuPDF
3
  import torch
4
  from PIL import Image
5
- import io
6
- from sentence_transformers import SentenceTransformer, util
7
  from transformers.utils.import_utils import is_flash_attn_2_available
8
  from colpali_engine.models import ColQwen2, ColQwen2Processor
9
 
10
  # -----------------------------
11
- # Load models
12
  # -----------------------------
13
  @st.cache_resource
14
- def load_models():
15
- text_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
16
- colpali_model = ColQwen2.from_pretrained(
17
- "vidore/colqwen2-v1.0",
18
  torch_dtype=torch.bfloat16,
19
  device_map="cuda:0" if torch.cuda.is_available() else "cpu",
20
- attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None
21
  ).eval()
22
- colpali_processor = ColQwen2Processor.from_pretrained("vidore/colqwen2-v1.0")
23
- return text_model, colpali_model, colpali_processor
24
 
25
- text_model, colpali_model, colpali_processor = load_models()
26
 
27
- # -----------------------------
28
- # UI Elements
29
- # -----------------------------
30
- st.title("πŸ“„ Chat with Your Financial Report (PDF + Table + Image)")
31
- pdf_file = st.file_uploader("Upload your PDF", type="pdf")
32
- use_colpali = st.checkbox("Enable ColPali (for image tables)", value=True)
33
 
34
  # -----------------------------
35
- # Process PDF
36
  # -----------------------------
37
- if pdf_file:
38
- doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
39
- text_chunks = []
40
  images = []
41
-
42
  for page in doc:
43
- blocks = page.get_text("blocks")
44
- for block in blocks:
45
- if block[4].strip():
46
- text_chunks.append(block[4].strip())
47
-
48
- # Extract images if ColPali is enabled
49
- if use_colpali:
50
- for img_index, img in enumerate(page.get_images(full=True)):
51
- xref = img[0]
52
- base_image = doc.extract_image(xref)
53
- image_bytes = base_image["image"]
54
- image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
55
- images.append(image)
56
 
57
- # Embed all text chunks
58
- text_embeddings = text_model.encode(text_chunks, convert_to_tensor=True)
 
 
 
 
 
59
 
60
- if use_colpali and images:
61
- image_inputs = colpali_processor.process_images(images).to(colpali_model.device)
62
- with torch.no_grad():
63
- image_embeddings = colpali_model(**image_inputs)
64
- else:
65
- image_embeddings = None
 
 
 
 
66
 
67
- # -----------------------------
68
- # Chat Interface
69
- # -----------------------------
70
- user_query = st.text_input("Ask a question about your PDF:")
71
 
72
- if user_query:
73
- st.write("πŸ” Searching for answers...")
 
 
 
 
74
 
75
- # Text-based search
76
- query_embedding = text_model.encode(user_query, convert_to_tensor=True)
77
- top_text_hits = util.semantic_search(query_embedding, text_embeddings, top_k=3)[0]
78
 
79
- st.markdown("### πŸ“ Top Text Answers")
80
- for hit in top_text_hits:
81
- score = hit['score']
82
- chunk = text_chunks[hit['corpus_id']]
83
- st.markdown(f"**Score:** {score:.4f}\n\n{chunk}")
84
 
85
- # Image-based search (ColPali)
86
- if use_colpali and image_embeddings is not None:
87
- query_vec = colpali_processor.process_queries([user_query]).to(colpali_model.device)
88
  with torch.no_grad():
89
- query_embedding_img = colpali_model(**query_vec)
 
 
 
 
 
 
90
 
91
- scores = colpali_processor.score_multi_vector(query_embedding_img, image_embeddings)
92
- top_k = torch.topk(scores, k=min(3, len(images)))
93
- st.markdown("### πŸ–ΌοΈ Top Image/Table Matches")
94
- for idx, score in zip(top_k.indices, top_k.values):
95
- st.image(images[idx], caption=f"Similarity Score: {score.item():.4f}", use_column_width=True)
 
1
  import streamlit as st
 
2
  import torch
3
  from PIL import Image
4
+ import fitz # PyMuPDF
 
5
  from transformers.utils.import_utils import is_flash_attn_2_available
6
  from colpali_engine.models import ColQwen2, ColQwen2Processor
7
 
8
  # -----------------------------
9
+ # Load ColPali Model
10
  # -----------------------------
11
@st.cache_resource
def load_colpali():
    """Load the ColQwen2 retrieval model and its processor.

    Decorated with ``st.cache_resource`` so Streamlit reruns reuse the
    already-loaded weights instead of downloading/initializing again.

    Returns:
        tuple: ``(model, processor)`` — the model in eval mode, placed on
        GPU when one is available, otherwise on CPU.
    """
    checkpoint = "vidore/colqwen2-v1.0"
    # Pick the best available execution path up front.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    attn = "flash_attention_2" if is_flash_attn_2_available() else None
    model = ColQwen2.from_pretrained(
        checkpoint,
        torch_dtype=torch.bfloat16,
        device_map=device,
        attn_implementation=attn,
    ).eval()
    processor = ColQwen2Processor.from_pretrained(checkpoint)
    return model, processor
22
 
23
# -----------------------------
# One-time setup: model handle and page chrome
# -----------------------------
colpali_model, colpali_processor = load_colpali()

st.title("πŸ” Visual PDF Search with ColPali")

# Uploaded file (or None until the user picks one); gates the whole app below.
pdf_file = st.file_uploader("Upload a PDF", type="pdf")
 
 
 
 
27
 
28
  # -----------------------------
29
+ # Convert PDF to image
30
  # -----------------------------
31
# -----------------------------
# Convert PDF to image
# -----------------------------
def render_pdf_page_as_image(doc, zoom=2.0):
    """Rasterize every page of an open PyMuPDF document to a PIL RGB image.

    Args:
        doc: an opened ``fitz.Document``.
        zoom: scale factor applied on both axes (2.0 roughly doubles the
            default 72 dpi); higher values give sharper crops at the cost
            of memory.

    Returns:
        list: one ``PIL.Image.Image`` per page; empty for an empty document.
    """
    # The zoom transform does not depend on the page, so build it once
    # instead of re-creating it on every iteration (was inside the loop).
    mat = fitz.Matrix(zoom, zoom)
    images = []
    for page in doc:
        pix = page.get_pixmap(matrix=mat)
        # pix.samples is a flat width*height*3 RGB byte buffer.
        images.append(Image.frombytes("RGB", [pix.width, pix.height], pix.samples))
    return images
 
 
 
 
 
 
 
 
39
 
40
+ # -----------------------------
41
+ # Chunk image into pieces
42
+ # -----------------------------
43
# -----------------------------
# Chunk image into pieces
# -----------------------------
def chunk_image(image, rows=2, cols=2):
    """Split an image into a rows x cols grid of 512x512 tiles.

    Each grid cell is cropped and resized to 512x512 (aspect ratio is not
    preserved). The last row/column extends to the image edge, so the
    remainder pixels from integer division are no longer silently dropped
    when the dimensions are not divisible by ``rows``/``cols``.

    Args:
        image: a PIL image (anything exposing ``.size``, ``.crop``, ``.resize``).
        rows: number of horizontal bands to cut.
        cols: number of vertical bands to cut.

    Returns:
        list: ``rows * cols`` resized crops in row-major order.
    """
    width, height = image.size
    chunk_width = width // cols
    chunk_height = height // rows

    chunks = []
    for r in range(rows):
        top = r * chunk_height
        # Last band absorbs the division remainder so the full image is covered.
        bottom = height if r == rows - 1 else top + chunk_height
        for c in range(cols):
            left = c * chunk_width
            right = width if c == cols - 1 else left + chunk_width
            chunks.append(image.crop((left, top, right, bottom)).resize((512, 512)))
    return chunks
58
 
59
if pdf_file:
    # Open the uploaded PDF from its in-memory bytes. Rendering is eager,
    # so the document handle can be closed immediately afterwards via the
    # context manager (fix: the original never closed the fitz.Document).
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        images = render_pdf_page_as_image(doc)

    if not images:
        st.warning("Failed to read content from the PDF.")
    else:
        # Tile every page into a 2x2 grid so retrieval can point at a
        # sub-region (e.g. one table) rather than a whole page.
        all_chunks = []
        for image in images:
            all_chunks.extend(chunk_image(image, rows=2, cols=2))

        user_query = st.text_input("What are you looking for in the document?")

        if user_query:
            # Preprocess the tiles and the query and move them to the
            # model's device.
            batch_images = colpali_processor.process_images(all_chunks).to(colpali_model.device)
            batch_queries = colpali_processor.process_queries([user_query]).to(colpali_model.device)

            # Inference only — no gradients needed.
            with torch.no_grad():
                image_embeddings = colpali_model(**batch_images)
                query_embeddings = colpali_model(**batch_queries)

            # NOTE(review): scores appears to be (n_queries, n_chunks); with a
            # single query the flattened argmax equals the chunk index — confirm
            # against colpali_engine's score_multi_vector contract.
            scores = colpali_processor.score_multi_vector(query_embeddings, image_embeddings)
            best_idx = torch.argmax(scores).item()
            best_image = all_chunks[best_idx]
            best_score = scores[0, best_idx].item()

            st.markdown("### πŸ” Most Relevant Image Chunk")
            st.image(best_image, caption=f"Score: {best_score:.4f}", use_column_width=True)