arasuezofis committed on
Commit
1e56cd8
·
verified ·
1 Parent(s): 55b1735

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +222 -102
app.py CHANGED
@@ -1,6 +1,13 @@
 
 
 
 
 
 
1
  import io
2
- import time
3
- from typing import List, Tuple, Optional
 
4
 
5
  import gradio as gr
6
  import torch
@@ -12,95 +19,168 @@ from transformers import (
12
  TextIteratorStreamer,
13
  )
14
 
15
- MODEL_ID = "HuggingFaceTB/SmolVLM-Instruct-250M" # 250M instruct variant
16
- # If you ever need to swap models (e.g., 256M/500M), just change the ID.
17
-
18
- # Load once at startup
19
- device = "cuda" if torch.cuda.is_available() else "cpu"
20
- dtype = torch.float16 if device == "cuda" else torch.float32
21
 
22
  processor = AutoProcessor.from_pretrained(MODEL_ID)
23
- model = AutoModelForVision2Seq.from_pretrained(MODEL_ID, torch_dtype=dtype)
24
- model.to(device)
25
- model.eval()
26
 
27
  SYSTEM_PROMPT = (
28
- "You are an invoice assistant. Answer strictly based on the uploaded document. "
29
- "If asked for fields (invoice number, date, totals, etc.), extract them from the image."
30
  )
31
 
32
- def pdf_to_images(pdf_bytes: bytes, max_pages: int = 5, dpi: int = 216) -> List[Image.Image]:
33
- """
34
- Render first N pages of a PDF to PIL images (RGB).
35
- """
 
36
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
37
- images = []
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  for i, page in enumerate(doc):
39
  if i >= max_pages:
40
  break
41
- # Render page
42
- pix = page.get_pixmap(matrix=fitz.Matrix(dpi/72, dpi/72))
43
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
44
  images.append(img)
45
  return images
46
 
47
- def ensure_images(file: Optional[gr.File]) -> List[Image.Image]:
 
48
  """
49
- Accepts a PDF/PNG/JPEG and returns a list of PIL images.
50
- - PDF => multiple images (page picker will handle selection)
51
- - PNG/JPG => single image
 
 
52
  """
53
- if file is None:
54
  return []
55
- mime = file.mime_type or ""
56
- data = file.read()
57
 
58
- if "pdf" in mime or (file.name and file.name.lower().endswith(".pdf")):
59
- return pdf_to_images(data, max_pages=8)
60
- # Image path
61
- img = Image.open(io.BytesIO(data)).convert("RGB")
62
- return [img]
63
 
64
- def generate_reply(images: List[Image.Image], user_text: str, chat_history: List[Tuple[str, str]]):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  """
66
- Stream a reply grounded on chosen image(s) + chat history.
67
- We only keep a compact history to stay lean on memory.
68
  """
69
- # Build multimodal messages per transformers' chat template
70
- # Format: [{"role":"system","content":...}, {"role":"user","content":[text, image, ...]}, ...]
71
- messages = [{"role": "system", "content": SYSTEM_PROMPT}]
 
72
 
73
- # Keep only last 4 exchanges to avoid context bloat
74
- trimmed = chat_history[-4:] if chat_history else []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
  for u, a in trimmed:
77
  messages.append({"role": "user", "content": u})
78
  messages.append({"role": "assistant", "content": a})
79
 
80
- # Add the current turn with images
81
- multimodal_content = []
82
- if images:
83
- # SmolVLM supports multiple images; push them before the text question
84
- for im in images:
85
- multimodal_content.append(im)
86
  if user_text.strip():
87
- multimodal_content.append(user_text.strip())
88
 
89
- messages.append({"role": "user", "content": multimodal_content})
 
 
 
 
 
 
 
 
90
 
91
- # Tokenize with chat template
92
- inputs = processor.apply_chat_template(
93
  messages,
94
  add_generation_prompt=True,
95
  tokenize=True,
96
  return_tensors="pt"
97
- ).to(device)
98
 
99
- # Vision inputs: processor handles images separately
100
- vision_inputs = processor(images=images, return_tensors="pt").to(device)
101
 
102
- # Merge text & vision inputs
103
- model_inputs = {**inputs, **vision_inputs}
104
 
105
  streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
106
  gen_kwargs = dict(
@@ -111,86 +191,126 @@ def generate_reply(images: List[Image.Image], user_text: str, chat_history: List
111
  temperature=0.0,
112
  )
113
 
114
- # Non-blocking generation
115
  import threading
116
- thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
117
- thread.start()
118
 
119
  partial = ""
120
  for token in streamer:
121
  partial += token
122
  yield partial
123
 
124
- def start_chat(file, page_index):
125
- # Convert to images and preselect a page
126
- imgs = ensure_images(file)
 
 
 
127
  if not imgs:
128
- return gr.update(choices=[], value=None), None, "No file loaded yet."
 
 
 
 
 
 
129
 
130
  choices = [f"Page {i+1}" for i in range(len(imgs))]
131
- value = choices[min(page_index, len(imgs)-1)] if page_index is not None else choices[0]
132
- return gr.update(choices=choices, value=value), imgs, "Document ready. Select a page and ask questions."
 
 
 
 
 
 
 
 
133
 
134
  def page_picker_changed(pages_dropdown, images_state):
135
  if not images_state:
136
- return None
137
- idx = max(0, int(pages_dropdown.split()[-1]) - 1)
138
- return images_state[idx]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
 
140
- with gr.Blocks(title="Invoice Chat (SmolVLM-250M)") as demo:
141
- gr.Markdown("# Invoice Chat • SmolVLM-Instruct-250M\nAsk questions grounded on your uploaded invoice.")
142
  with gr.Row():
143
  with gr.Column(scale=1):
144
- file = gr.File(label="Upload invoice (PDF/PNG/JPEG)")
145
- pages = gr.Dropdown(label="Select page (for PDFs)", choices=[], value=None)
146
- load_btn = gr.Button("Prepare Document")
 
 
 
 
 
 
147
  with gr.Column(scale=2):
148
  image_view = gr.Image(label="Current page/image", interactive=False)
149
- chatbot = gr.Chatbot(height=380)
150
- user_box = gr.Textbox(label="Your question", placeholder="e.g., What is the invoice number and total?")
151
- ask_btn = gr.Button("Ask")
152
 
153
- # Hidden states
 
 
 
 
 
 
 
154
  images_state = gr.State([])
155
  selected_img_state = gr.State(None)
156
 
157
- # Wire events
158
  load_btn.click(
159
  start_chat,
160
  inputs=[file, gr.State(0)],
161
- outputs=[pages, images_state, gr.Textbox(visible=False)]
 
 
 
 
 
 
 
162
  )
163
- pages.change(page_picker_changed, inputs=[pages, images_state], outputs=[image_view])
164
-
165
- def chat(user_text, history, images_state, image_view):
166
- if not user_text.strip():
167
- return gr.update(), history
168
- # Choose the selected image; if none, fall back to first
169
- sel_img = None
170
- if image_view is not None and isinstance(image_view, dict) and image_view.get("image"):
171
- # gr.Image returns a dict in some contexts; handle robustly
172
- sel_img = Image.open(image_view["image"]).convert("RGB")
173
- elif images_state:
174
- sel_img = images_state[0]
175
-
176
- if sel_img is None:
177
- history = history + [(user_text, "Please upload a document first.")]
178
- return gr.update(value=history), history
179
-
180
- stream = generate_reply([sel_img], user_text, history)
181
- acc = ""
182
- for chunk in stream:
183
- acc = chunk
184
- yield history + [(user_text, acc)], history + [(user_text, acc)]
185
 
 
186
  ask_btn.click(
187
  chat,
188
- inputs=[user_box, chatbot, images_state, image_view],
189
  outputs=[chatbot, chatbot]
190
  )
191
  user_box.submit(
192
  chat,
193
- inputs=[user_box, chatbot, images_state, image_view],
194
  outputs=[chatbot, chatbot]
195
  )
196
 
 
1
+ # app.py
2
+ # ------------------------------------------------------------
3
+ # Invoice Chat • SmolVLM-Instruct-250M
4
+ # Operationalized for Hugging Face Spaces (Gradio SDK)
5
+ # ------------------------------------------------------------
6
+
7
  import io
8
+ import os
9
+ import re
10
+ from typing import List, Tuple, Optional, Union
11
 
12
  import gradio as gr
13
  import torch
 
19
  TextIteratorStreamer,
20
  )
21
 
22
+ # -----------------------------
23
+ # Model bootstrap (lean & mean)
24
+ # -----------------------------
25
+ MODEL_ID = "HuggingFaceTB/SmolVLM-Instruct-250M"
26
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
27
+ DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
28
 
29
  processor = AutoProcessor.from_pretrained(MODEL_ID)
30
+ model = AutoModelForVision2Seq.from_pretrained(MODEL_ID, torch_dtype=DTYPE)
31
+ model.to(DEVICE).eval()
 
32
 
33
  SYSTEM_PROMPT = (
34
+ "You are an invoice assistant. Respond ONLY using details visible in the uploaded document. "
35
+ "If a field (invoice number, date, totals, tax, vendor, etc.) is not clearly visible, say so."
36
  )
37
 
38
+ # -----------------------------
39
+ # Utilities
40
+ # -----------------------------
41
def pdf_to_images_from_bytes(pdf_bytes: bytes, max_pages: int = 8, dpi: int = 216) -> List[Image.Image]:
    """Render the first *max_pages* pages of an in-memory PDF as PIL RGB images.

    Args:
        pdf_bytes: Raw PDF file contents.
        max_pages: Upper bound on the number of pages rendered.
        dpi: Render resolution; PDF-native is 72 dpi, so dpi/72 is the zoom.

    Returns:
        List of PIL.Image.Image objects in page order (possibly empty).
    """
    images: List[Image.Image] = []
    # Context manager closes the document handle deterministically
    # (the original leaked it).
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for i, page in enumerate(doc):
            if i >= max_pages:
                break
            pix = page.get_pixmap(matrix=fitz.Matrix(dpi / 72, dpi / 72))
            images.append(Image.frombytes("RGB", [pix.width, pix.height], pix.samples))
    return images
52
+
53
+
54
def pdf_to_images_from_path(path: str, max_pages: int = 8, dpi: int = 216) -> List[Image.Image]:
    """Render the first *max_pages* pages of a PDF file as PIL RGB images.

    Args:
        path: Filesystem path to the PDF.
        max_pages: Upper bound on the number of pages rendered.
        dpi: Render resolution; dpi/72 is the zoom applied to the 72-dpi page.

    Returns:
        List of PIL.Image.Image objects in page order (possibly empty).
    """
    images: List[Image.Image] = []
    # Context manager closes the document handle deterministically
    # (the original leaked it).
    with fitz.open(path) as doc:
        for i, page in enumerate(doc):
            if i >= max_pages:
                break
            pix = page.get_pixmap(matrix=fitz.Matrix(dpi / 72, dpi / 72))
            images.append(Image.frombytes("RGB", [pix.width, pix.height], pix.samples))
    return images
65
 
66
+
67
def ensure_images(file_val: Optional[Union[str, dict, bytes, io.BytesIO]]) -> List[Image.Image]:
    """Normalize a Gradio File value into a list of PIL images.

    Handles the input shapes different Gradio versions produce:
      - str tempfile path
      - dict with 'name'/'path' (and sometimes inline 'data' bytes)
      - raw bytes / BytesIO

    Returns:
        PDFs => one image per page (up to the renderer's page cap);
        PNG/JPEG => a single-image list; anything unusable => [].
    """
    if not file_val:
        return []

    # Normalize the value to either a filesystem path or raw bytes.
    path: Optional[str] = None
    raw_bytes: Optional[bytes] = None

    if isinstance(file_val, str) and os.path.exists(file_val):
        path = file_val
    elif isinstance(file_val, dict):
        # Gradio sometimes passes {'name': '/tmp/..', 'orig_name': 'x.pdf', ...}
        maybe_path = file_val.get("name") or file_val.get("path")
        if isinstance(maybe_path, str) and os.path.exists(maybe_path):
            path = maybe_path
        else:
            data = file_val.get("data")
            if isinstance(data, (bytes, bytearray)):
                raw_bytes = bytes(data)
    elif isinstance(file_val, (bytes, bytearray)):
        raw_bytes = bytes(file_val)
    elif isinstance(file_val, io.BytesIO):
        raw_bytes = file_val.getvalue()

    if path:
        # Sniff the magic header as well as the extension so a PDF uploaded
        # without a .pdf suffix is still rendered page-by-page (the bytes
        # branch below already did this; the path branch previously did not).
        with open(path, "rb") as f:
            head = f.read(5)
        if path.lower().endswith(".pdf") or head == b"%PDF-":
            return pdf_to_images_from_path(path)
        with open(path, "rb") as f:
            return [Image.open(io.BytesIO(f.read())).convert("RGB")]

    if raw_bytes:
        if raw_bytes[:5] == b"%PDF-":
            return pdf_to_images_from_bytes(raw_bytes)
        return [Image.open(io.BytesIO(raw_bytes)).convert("RGB")]

    # Fallback: nothing usable
    return []
121
+
122
+
123
def parse_page_selection(value, num_pages: int) -> int:
    """Coerce a page-picker value ('Page 3', '3', 3, 'pg-2', ...) to a 0-based index.

    The result is clamped to [0, num_pages - 1]; unusable input maps to 0.
    """
    if num_pages <= 0 or value is None:
        return 0

    if isinstance(value, int):
        index = value - 1
    else:
        # Pull the first run of digits out of whatever string shape we got.
        match = re.search(r"(\d+)", str(value).strip())
        index = (int(match.group(1)) - 1) if match else 0

    return min(max(index, 0), num_pages - 1)
141
+
142
+
143
def build_messages(history: List[Tuple[str, str]], user_text: str, images: List[Image.Image]):
    """Assemble chat-template messages: system prompt, trimmed history, current turn.

    Only the last four exchanges are kept so the prompt stays compact.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]

    for prev_user, prev_assistant in (history or [])[-4:]:
        messages.append({"role": "user", "content": prev_user})
        messages.append({"role": "assistant", "content": prev_assistant})

    # Images go before the text question in the multimodal turn.
    content = list(images)
    question = user_text.strip()
    if question:
        content.append(question)

    messages.append({"role": "user", "content": content})
    return messages
163
+
164
+
165
+ def generate_reply(images: List[Image.Image], user_text: str, chat_history: List[Tuple[str, str]]):
166
+ """
167
+ Stream a model reply grounded on provided images + user question + compact chat history.
168
+ """
169
+ messages = build_messages(chat_history, user_text, images)
170
 
171
+ # Text context
172
+ text_inputs = processor.apply_chat_template(
173
  messages,
174
  add_generation_prompt=True,
175
  tokenize=True,
176
  return_tensors="pt"
177
+ ).to(DEVICE)
178
 
179
+ # Vision tensors
180
+ vision_inputs = processor(images=images, return_tensors="pt").to(DEVICE)
181
 
182
+ # Merge dicts
183
+ model_inputs = {**text_inputs, **vision_inputs}
184
 
185
  streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
186
  gen_kwargs = dict(
 
191
  temperature=0.0,
192
  )
193
 
 
194
  import threading
195
+ t = threading.Thread(target=model.generate, kwargs=gen_kwargs)
196
+ t.start()
197
 
198
  partial = ""
199
  for token in streamer:
200
  partial += token
201
  yield partial
202
 
203
+
204
+ # -----------------------------
205
+ # Gradio UI Orchestration
206
+ # -----------------------------
207
def start_chat(file_val, page_index):
    """Load an uploaded file, populate the page dropdown, and preview a page.

    Returns (dropdown update, images list, preview image, status string).
    """
    imgs = ensure_images(file_val)
    if not imgs:
        # Nothing usable: clear the dropdown and the preview.
        return (
            gr.update(choices=[], value=None),
            [],
            None,
            "No file loaded. Please upload a PDF/PNG/JPEG.",
        )

    choices = [f"Page {i+1}" for i in range(len(imgs))]
    if page_index is None:
        idx = 0
    else:
        idx = min(max(int(page_index), 0), len(imgs) - 1)

    return (
        gr.update(choices=choices, value=choices[idx]),
        imgs,
        imgs[idx],
        "Document ready. Select a page and ask questions.",
    )
228
+
229
 
230
def page_picker_changed(pages_dropdown, images_state):
    """Sync the preview image and the selected-image state to the dropdown."""
    if not images_state:
        # No document loaded: clear the preview, leave the state untouched.
        return None, gr.update()
    chosen = images_state[parse_page_selection(pages_dropdown, len(images_state))]
    return chosen, chosen  # for preview and selected state
236
+
237
+
238
def chat(user_text, history, images_state, selected_img):
    """Stream an answer about the selected page into the chatbot.

    NOTE: this function contains ``yield``, so Python treats it as a
    generator. A bare ``return value`` inside a generator only sets
    StopIteration.value and never reaches the UI — every exit path must
    *yield* its outputs first (this was the bug being fixed here).
    """
    if not user_text or not user_text.strip():
        # Nothing asked; leave the chatbot unchanged.
        yield gr.update(), history
        return

    # Prefer the explicitly selected page; fall back to the first page.
    sel_img = selected_img if selected_img is not None else (images_state[0] if images_state else None)
    if sel_img is None:
        history = history + [(user_text, "Please upload a document first.")]
        yield gr.update(value=history), history
        return

    # Stream tokens; each yield re-renders the in-progress assistant turn.
    stream = generate_reply([sel_img], user_text, history)
    acc = ""
    for chunk in stream:
        acc = chunk
        yield history + [(user_text, acc)], history + [(user_text, acc)]
254
+
255
+
256
# -----------------------------
# App definition
# -----------------------------
with gr.Blocks(title="Invoice Chat • SmolVLM-250M") as demo:
    gr.Markdown(
        "## Invoice Chat • SmolVLM-Instruct-250M\n"
        "Upload a PDF/PNG/JPEG, pick a page, and interrogate the document. "
        "This is a CPU-friendly, low-latency experience designed for rapid insight capture."
    )

    with gr.Row():
        # Left column: upload + page-selection controls.
        with gr.Column(scale=1):
            file = gr.File(label="Upload invoice (PDF / PNG / JPEG)")
            pages = gr.Dropdown(
                label="Select page (for PDFs)",
                choices=[],
                value=None,
                allow_custom_value=True,  # set False to hard-lock to dropdown values
                info="Type a page number (e.g., 2) or choose from the list."
            )
            load_btn = gr.Button("Prepare Document", variant="primary")
        # Right column: preview of the currently selected page.
        with gr.Column(scale=2):
            image_view = gr.Image(label="Current page/image", interactive=False)

    # Chat area. NOTE(review): diff rendering makes the nesting ambiguous —
    # placed at Blocks level (full width, below the row); confirm intended layout.
    chatbot = gr.Chatbot(height=400)
    user_box = gr.Textbox(
        label="Your question",
        placeholder="e.g., What is the invoice number and total with tax?",
    )
    ask_btn = gr.Button("Ask", variant="primary")

    # Hidden session state
    images_state = gr.State([])
    selected_img_state = gr.State(None)

    # Wire up events
    load_btn.click(
        start_chat,
        inputs=[file, gr.State(0)],
        # NOTE(review): the trailing gr.Textbox(visible=False) is created inline
        # purely to absorb the status string — it is never shown; confirm intended.
        outputs=[pages, images_state, image_view, gr.Textbox(visible=False)]
    )

    # When the page dropdown changes, update both preview and the selected image state
    pages.change(
        page_picker_changed,
        inputs=[pages, images_state],
        outputs=[image_view, selected_img_state]
    )

    # Ask flows (streaming): button click and Enter-in-textbox run the same handler.
    ask_btn.click(
        chat,
        inputs=[user_box, chatbot, images_state, selected_img_state],
        outputs=[chatbot, chatbot]
    )
    user_box.submit(
        chat,
        inputs=[user_box, chatbot, images_state, selected_img_state],
        outputs=[chatbot, chatbot]
    )
316