Update app.py
app.py CHANGED
@@ -1,13 +1,12 @@
 # app.py
 # ------------------------------------------------------------
-# Invoice Chat • SmolVLM-Instruct-250M
-# Gradio Space with robust page picker + safe streaming chat
+# Invoice Chat • SmolVLM-Instruct-250M (messages-mode, streaming)
 # ------------------------------------------------------------

 import io
 import os
 import re
-from typing import List,
+from typing import List, Optional, Union

 import gradio as gr
 import torch
@@ -16,7 +15,7 @@ import fitz # PyMuPDF
 from transformers import (
     AutoProcessor,
     AutoTokenizer,
-    AutoModelForImageTextToText, #
+    AutoModelForImageTextToText,  # modern replacement for AutoModelForVision2Seq
     TextIteratorStreamer,
 )

@@ -24,17 +23,12 @@
 # Model bootstrap
 # -----------------------------
 MODEL_ID = "HuggingFaceTB/SmolVLM-Instruct-250M"
-
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

-# Tokenizer has the chat template
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
-# Processor handles vision tensors
 processor = AutoProcessor.from_pretrained(MODEL_ID)
-
-model = AutoModelForImageTextToText.from_pretrained(MODEL_ID, dtype=DTYPE)
-model.to(DEVICE).eval()
+model = AutoModelForImageTextToText.from_pretrained(MODEL_ID, dtype=DTYPE).to(DEVICE).eval()

 SYSTEM_PROMPT = (
     "You are an invoice assistant. Respond ONLY using details visible in the uploaded document. "
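Note on the merged bootstrap line: dtype= is the newer name for the precision kwarg in transformers. If the Space pins an older release, the long-standing torch_dtype= spelling loads the same weights (a hedged equivalent, not part of this commit):

    # Hedged equivalent for older transformers releases (torch_dtype kwarg):
    model = AutoModelForImageTextToText.from_pretrained(MODEL_ID, torch_dtype=DTYPE)
    model = model.to(DEVICE).eval()  # move to the detected device, disable dropout for inference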
@@ -68,15 +62,10 @@ def pdf_to_images_from_path(path: str, max_pages: int = 8, dpi: int = 216) -> List[Image.Image]:

 def ensure_images(file_val: Optional[Union[str, dict, bytes, io.BytesIO]]) -> List[Image.Image]:
     """
-    Accept PDF/PNG/JPEG
-    - str path (tempfile path)
-    - dict with 'name'/'path' or 'data'
-    - bytes / BytesIO
-    Returns a list of PIL images. PDFs => multi-image; PNG/JPEG => single image.
+    Accept PDF/PNG/JPEG (path/dict/bytes/BytesIO) and return a list of PIL images.
     """
     if not file_val:
         return []
-
     path: Optional[str] = None
     raw_bytes: Optional[bytes] = None

@@ -95,11 +84,8 @@ def ensure_images(file_val: Optional[Union[str, dict, bytes, io.BytesIO]]) -> List[Image.Image]:
     elif isinstance(file_val, io.BytesIO):
         raw_bytes = file_val.getvalue()

-    def is_pdf_name(name: str) -> bool:
-        return name.lower().endswith(".pdf")
-
     if path:
-        if
+        if path.lower().endswith(".pdf"):
             return pdf_to_images_from_path(path)
         with open(path, "rb") as f:
             img = Image.open(io.BytesIO(f.read())).convert("RGB")
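The hunks call pdf_to_images_from_path but its body sits outside the diff. A minimal sketch of what such a helper could look like, assuming PyMuPDF's load_page/get_pixmap API (fitz is imported per the hunk header above; the actual implementation in app.py may differ):

    def pdf_to_images_from_path(path: str, max_pages: int = 8, dpi: int = 216) -> List[Image.Image]:
        # Rasterize up to max_pages pages at the given dpi, returned as RGB PIL images.
        images = []
        with fitz.open(path) as doc:
            for i in range(min(max_pages, doc.page_count)):
                pix = doc.load_page(i).get_pixmap(dpi=dpi)
                images.append(Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB"))
        return images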
@@ -115,34 +101,28 @@ def ensure_images(file_val: Optional[Union[str, dict, bytes, io.BytesIO]]) -> List[Image.Image]:

 def parse_page_selection(value, num_pages: int) -> int:
     """
-    Accept 'Page 3', '3', 3, 'pg-2', etc. Return safe 0-based index
+    Accept 'Page 3', '3', 3, 'pg-2', etc. Return safe 0-based index.
     """
-    if num_pages <= 0:
-        return 0
-    if value is None:
+    if num_pages <= 0 or value is None:
         return 0
     if isinstance(value, int):
         idx = value - 1
     else:
-
-        m = re.search(r"(\d+)", s)
+        m = re.search(r"(\d+)", str(value).strip())
         idx = int(m.group(1)) - 1 if m else 0
     return max(0, min(num_pages - 1, idx))

 def build_messages(history_msgs: list, user_text: str, images: List[Image.Image]):
     """
-    Compose the
+    Compose the model prompt using OpenAI-style messages:
     - system prompt
-    - trimmed
-    - current user turn
+    - trimmed prior messages
+    - current user turn (images + text)
     """
     messages = [{"role": "system", "content": SYSTEM_PROMPT}]
-
-    # Keep last 8 messages to stay lean
-    trimmed = history_msgs[-8:] if history_msgs else []
+    trimmed = history_msgs[-8:] if history_msgs else []  # keep the window tight
     messages.extend(trimmed)

-    # Current user turn: images first, then text
     multimodal = []
     for im in images:
         multimodal.append(im)
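For reference, the repaired parser now covers the inputs its docstring advertises (illustrative calls):

    parse_page_selection("Page 3", num_pages=5)  # -> 2 (0-based)
    parse_page_selection("pg-2", num_pages=5)    # -> 1
    parse_page_selection(99, num_pages=5)        # -> 4 (clamped to the last page)
    parse_page_selection(None, num_pages=5)      # -> 0 (defaults to the first page)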
@@ -155,46 +135,40 @@ def build_messages(history_msgs: list, user_text: str, images: List[Image.Image]):
 # -----------------------------
 # Core generation (streaming)
 # -----------------------------
-def generate_reply(images: List[Image.Image], user_text: str,
+def generate_reply(images: List[Image.Image], user_text: str, history_msgs: list):
     """
     Stream a model reply grounded on provided images + user question + compact chat history.
-    - Build prompt
-    - Vision tensors via processor (
-    -
+    - Build prompt text (chat template) -> tokenize (dict)
+    - Vision tensors via processor (dict)
+    - Allow-list kwargs to model.generate
     """
-    messages = build_messages(
+    messages = build_messages(history_msgs, user_text, images)

-    # 1) Build prompt
+    # 1) Build prompt as TEXT (not tokens)
     prompt_text = tokenizer.apply_chat_template(
         messages,
         add_generation_prompt=True,
-        tokenize=False,
+        tokenize=False,
     )

-    # 2) Tokenize
+    # 2) Tokenize → mapping with input_ids/attention_mask
     text_inputs = tokenizer(prompt_text, return_tensors="pt").to(DEVICE)

-    # 3) Vision tensors
+    # 3) Vision tensors (pixel_values)
     vision_inputs = processor(images=images, return_tensors="pt").to(DEVICE)

-    # 4) Allow-list only the keys generate() expects
     model_inputs = {
         "input_ids": text_inputs["input_ids"],
-        # attention_mask may or may not exist depending on tokenizer; include if present
         **({"attention_mask": text_inputs["attention_mask"]} if "attention_mask" in text_inputs else {}),
-        # vision inputs
         **({"pixel_values": vision_inputs["pixel_values"]} if "pixel_values" in vision_inputs else {}),
     }

-    # 5) Streamer uses the same tokenizer
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-
     gen_kwargs = dict(
         **model_inputs,
         streamer=streamer,
         max_new_tokens=512,
-        do_sample=False,
-        # NOTE: some I2T models ignore temperature/top_p; avoid passing unsupported flags
+        do_sample=False,  # keep deterministic for enterprise-grade UX
     )

     import threading
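The hunk stops at import threading; the consumer loop that pairs with TextIteratorStreamer is outside the diff. A sketch of the standard pattern it presumably follows (assuming the tail of generate_reply is unchanged by this commit):

    # Run generate() on a worker thread; iterate the streamer on the calling thread.
    thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()
    acc = ""
    for piece in streamer:
        acc += piece
        yield acc  # cumulative text, matching acc = chunk in chat()
    thread.join()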
@@ -241,19 +215,16 @@ def chat(user_text, history_msgs, images_state, selected_img):

     sel_img = selected_img if selected_img is not None else (images_state[0] if images_state else None)
     if sel_img is None:
-        # push a system-style nudge
         history_msgs = history_msgs + [
             {"role": "user", "content": user_text},
-            {"role": "assistant", "content": "Please upload a document first."}
+            {"role": "assistant", "content": "Please upload a document first."},
         ]
         return gr.update(value=history_msgs), history_msgs

-    # Stream the assistant reply
     stream = generate_reply([sel_img], user_text, history_msgs)
     acc = ""
     for chunk in stream:
         acc = chunk
-        # do incremental streaming by replacing the last assistant message
         yield (
             history_msgs + [
                 {"role": "user", "content": user_text},
@@ -262,11 +233,9 @@
         history_msgs + [
             {"role": "user", "content": user_text},
             {"role": "assistant", "content": acc},
-        ]
+        ],
     )

-
-
 # -----------------------------
 # App definition
 # -----------------------------
@@ -284,14 +253,14 @@ with gr.Blocks(title="Invoice Chat • SmolVLM-250M") as demo:
         choices=[],
         value=None,
         allow_custom_value=True,
-        info="Type a page number (e.g., 2) or choose from the list."
+        info="Type a page number (e.g., 2) or choose from the list.",
     )
     load_btn = gr.Button("Prepare Document", variant="primary")
     with gr.Column(scale=2):
         image_view = gr.Image(label="Current page/image", interactive=False)

-    #
-    chatbot = gr.Chatbot(height=400, type="
+    # ✅ messages mode (no more tuples warnings)
+    chatbot = gr.Chatbot(height=400, type="messages")
     user_box = gr.Textbox(
         label="Your question",
         placeholder="e.g., What is the invoice number and total with tax?",
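With type="messages", Gradio's Chatbot expects history entries as OpenAI-style role/content dicts, which matches what chat() yields (shape shown with placeholder values):

    history = [
        {"role": "user", "content": "What is the total with tax?"},
        {"role": "assistant", "content": "..."},  # placeholder reply
    ]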
@@ -306,22 +275,22 @@ with gr.Blocks(title="Invoice Chat • SmolVLM-250M") as demo:
     load_btn.click(
         start_chat,
         inputs=[file, gr.State(0)],
-        outputs=[pages, images_state, image_view, gr.Textbox(visible=False)]
+        outputs=[pages, images_state, image_view, gr.Textbox(visible=False)],
     )
     pages.change(
         page_picker_changed,
         inputs=[pages, images_state],
-        outputs=[image_view, selected_img_state]
+        outputs=[image_view, selected_img_state],
     )
     ask_btn.click(
         chat,
         inputs=[user_box, chatbot, images_state, selected_img_state],
-        outputs=[chatbot, chatbot]
+        outputs=[chatbot, chatbot],
     )
     user_box.submit(
         chat,
         inputs=[user_box, chatbot, images_state, selected_img_state],
-        outputs=[chatbot, chatbot]
+        outputs=[chatbot, chatbot],
     )

 if __name__ == "__main__":