Update app.py
app.py
CHANGED
@@ -0,0 +1,198 @@
import io
import threading
from typing import List, Optional, Tuple

import gradio as gr
import torch
from PIL import Image
import fitz  # PyMuPDF
from transformers import (
    AutoProcessor,
    AutoModelForVision2Seq,
    TextIteratorStreamer,
)

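# Assumed dependencies (this commit does not pin them): the imports above map
# to the pip packages gradio, torch, transformers, pillow, and pymupdf, so a
# requirements.txt listing those five should be enough for the Space to build.
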
MODEL_ID = "HuggingFaceTB/SmolVLM-256M-Instruct"  # smallest SmolVLM instruct checkpoint (~250M params)
# If you ever need to swap models (e.g., the 500M variant), just change the ID.

# Load once at startup
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForVision2Seq.from_pretrained(MODEL_ID, torch_dtype=dtype)
model.to(device)
model.eval()

SYSTEM_PROMPT = (
    "You are an invoice assistant. Answer strictly based on the uploaded document. "
    "If asked for fields (invoice number, date, totals, etc.), extract them from the image."
)

def pdf_to_images(pdf_bytes: bytes, max_pages: int = 5, dpi: int = 216) -> List[Image.Image]:
    """
    Render the first `max_pages` pages of a PDF to PIL images (RGB).
    """
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    images = []
    for i, page in enumerate(doc):
        if i >= max_pages:
            break
        # Render the page at the requested DPI (PDF user space is 72 dpi)
        pix = page.get_pixmap(matrix=fitz.Matrix(dpi / 72, dpi / 72))
        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
        images.append(img)
    doc.close()
    return images

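# Example usage (hypothetical file name, for illustration only):
#   with open("invoice.pdf", "rb") as f:
#       pages = pdf_to_images(f.read(), max_pages=2)
#   pages[0].size  # -> (width, height); at 216 dpi a US-letter page is ~(1836, 2376)
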
def ensure_images(file_path: Optional[str]) -> List[Image.Image]:
    """
    Accepts a PDF/PNG/JPEG upload and returns a list of PIL images.
    - PDF => one image per rendered page (the page picker handles selection)
    - PNG/JPG => a single image
    Note: recent Gradio versions pass the gr.File value as a temp-file path
    (str), not a file object, so we read the bytes ourselves.
    """
    if file_path is None:
        return []
    with open(file_path, "rb") as f:
        data = f.read()

    if file_path.lower().endswith(".pdf"):
        return pdf_to_images(data, max_pages=8)
    # Image path
    img = Image.open(io.BytesIO(data)).convert("RGB")
    return [img]

def generate_reply(images: List[Image.Image], user_text: str, chat_history: List[Tuple[str, str]]):
    """
    Stream a reply grounded on the chosen image(s) plus recent chat history.
    Only a compact history window is kept to stay lean on memory.
    """
    # Build multimodal messages in the shape transformers' chat template expects:
    # each content field is a list of {"type": "image"} / {"type": "text", ...} parts.
    messages = [{"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]}]

    # Keep only the last 4 exchanges to avoid context bloat
    trimmed = chat_history[-4:] if chat_history else []

    for u, a in trimmed:
        messages.append({"role": "user", "content": [{"type": "text", "text": u}]})
        messages.append({"role": "assistant", "content": [{"type": "text", "text": a}]})

    # Add the current turn: image placeholders first, then the text question.
    # SmolVLM supports multiple images; each {"type": "image"} part is matched
    # positionally with an entry in the `images` list handed to the processor.
    multimodal_content = [{"type": "image"} for _ in images]
    if user_text.strip():
        multimodal_content.append({"type": "text", "text": user_text.strip()})

    messages.append({"role": "user", "content": multimodal_content})

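    # For reference, a single-turn `messages` list ends up shaped like this
    # (illustrative sketch with a made-up question):
    #   [{"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
    #    {"role": "user", "content": [{"type": "image"},
    #                                 {"type": "text", "text": "What is the total?"}]}]
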
    # Render the chat template to a prompt string, then let the processor
    # tokenize the text and preprocess the images together in one call, so
    # image tokens and pixel values stay aligned.
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    model_inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)

    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = dict(
        **model_inputs,
        streamer=streamer,
        max_new_tokens=512,
        do_sample=False,  # greedy decoding; temperature is ignored when sampling is off
    )

    # Run generation on a worker thread so tokens can stream as they arrive
    thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()

    partial = ""
    for token in streamer:
        partial += token
        yield partial
    thread.join()

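# The function above is a generator: callers iterate it and get the reply
# accumulated so far on each step. A minimal (hypothetical) smoke test:
#   for text_so_far in generate_reply([some_pil_image], "What is the total?", []):
#       print(text_so_far)
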
def start_chat(file, page_index):
    # Convert the upload to images and preselect a page
    imgs = ensure_images(file)
    if not imgs:
        return gr.update(choices=[], value=None), [], "No file loaded yet."

    choices = [f"Page {i+1}" for i in range(len(imgs))]
    value = choices[min(page_index, len(imgs) - 1)] if page_index is not None else choices[0]
    return gr.update(choices=choices, value=value), imgs, "Document ready. Select a page and ask questions."

def page_picker_changed(pages_dropdown, images_state):
    if not pages_dropdown or not images_state:
        return None
    # Dropdown values look like "Page 3"; map back to a 0-based index
    idx = max(0, int(pages_dropdown.split()[-1]) - 1)
    return images_state[min(idx, len(images_state) - 1)]

with gr.Blocks(title="Invoice Chat (SmolVLM-256M)") as demo:
    gr.Markdown("# Invoice Chat • SmolVLM-256M-Instruct\nAsk questions grounded on your uploaded invoice.")
    with gr.Row():
        with gr.Column(scale=1):
            file = gr.File(label="Upload invoice (PDF/PNG/JPEG)")
            pages = gr.Dropdown(label="Select page (for PDFs)", choices=[], value=None)
            load_btn = gr.Button("Prepare Document")
            status = gr.Textbox(label="Status", interactive=False)
        with gr.Column(scale=2):
            image_view = gr.Image(label="Current page/image", type="pil", interactive=False)
            chatbot = gr.Chatbot(height=380)
            user_box = gr.Textbox(label="Your question", placeholder="e.g., What is the invoice number and total?")
            ask_btn = gr.Button("Ask")

    # Hidden states
    images_state = gr.State([])
    selected_img_state = gr.State(None)  # reserved; not wired up yet

    # Wire events
    load_btn.click(
        start_chat,
        inputs=[file, gr.State(0)],
        outputs=[pages, images_state, status],
    )
    pages.change(page_picker_changed, inputs=[pages, images_state], outputs=[image_view])

    def chat(user_text, history, images_state, image_view):
        if not user_text.strip():
            # Generator handlers must yield (not return a value) to update outputs
            yield history
            return
        # Prefer the page currently shown; fall back to the first page
        sel_img = None
        if image_view is not None:
            # With type="pil", gr.Image hands the handler a PIL image directly
            sel_img = image_view
        elif images_state:
            sel_img = images_state[0]

        if sel_img is None:
            yield history + [(user_text, "Please upload a document first.")]
            return

        # Stream the model's reply, re-rendering the transcript on each chunk
        acc = ""
        for chunk in generate_reply([sel_img], user_text, history):
            acc = chunk
            yield history + [(user_text, acc)]

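    # Gradio treats generator handlers as streaming: each `yield` pushes an
    # updated chat transcript to the UI, so the reply appears incrementally.
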
    ask_btn.click(
        chat,
        inputs=[user_box, chatbot, images_state, image_view],
        outputs=[chatbot],
    )
    user_box.submit(
        chat,
        inputs=[user_box, chatbot, images_state, image_view],
        outputs=[chatbot],
    )

if __name__ == "__main__":
    demo.launch()
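# To try this locally (assuming the dependencies noted at the top are
# installed): run `python app.py` and open the printed localhost URL. A
# Gradio Space executes app.py directly, so the same __main__ guard applies.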