Spaces:

arasuezofis
/

documentbasedresponse

Sleeping

App Files Files Community

arasuezofis committed on Oct 13, 2025

Commit

9037c59

verified ·

1 Parent(s): 3a1ba6d

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -49

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 # app.py
 # ------------------------------------------------------------
 # Invoice Chat • SmolVLM-Instruct-250M
-# Gradio Space with resilient page picker + streaming chat
 # ------------------------------------------------------------
 import io
@@ -16,7 +16,7 @@ import fitz  # PyMuPDF
 from transformers import (
     AutoProcessor,
     AutoTokenizer,
-    AutoModelForVision2Seq,
     TextIteratorStreamer,
 )
@@ -26,13 +26,14 @@ from transformers import (
 MODEL_ID = "HuggingFaceTB/SmolVLM-Instruct-250M"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-# float16 only if CUDA is available; on CPU use float32
 DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
-# Load tokenizer (has the chat template), processor (images), and model
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
 processor = AutoProcessor.from_pretrained(MODEL_ID)
-model = AutoModelForVision2Seq.from_pretrained(MODEL_ID, torch_dtype=DTYPE)
 model.to(DEVICE).eval()
 SYSTEM_PROMPT = (
@@ -44,7 +45,6 @@ SYSTEM_PROMPT = (
 # Utilities
 # -----------------------------
 def pdf_to_images_from_bytes(pdf_bytes: bytes, max_pages: int = 8, dpi: int = 216) -> List[Image.Image]:
-    """Render first N pages of a PDF (in-memory) as PIL RGB images."""
     doc = fitz.open(stream=pdf_bytes, filetype="pdf")
     images: List[Image.Image] = []
     for i, page in enumerate(doc):
@@ -55,9 +55,7 @@ def pdf_to_images_from_bytes(pdf_bytes: bytes, max_pages: int = 8, dpi: int = 21
         images.append(img)
     return images
 def pdf_to_images_from_path(path: str, max_pages: int = 8, dpi: int = 216) -> List[Image.Image]:
-    """Render first N pages of a PDF (file path) as PIL RGB images."""
     doc = fitz.open(path)
     images: List[Image.Image] = []
     for i, page in enumerate(doc):
@@ -68,12 +66,11 @@ def pdf_to_images_from_path(path: str, max_pages: int = 8, dpi: int = 216) -> Li
         images.append(img)
     return images
 def ensure_images(file_val: Optional[Union[str, dict, bytes, io.BytesIO]]) -> List[Image.Image]:
     """
-    Accept PDF/PNG/JPEG via Gradio File. Handles multiple shapes of input:
       - str path (tempfile path)
-      - dict with 'name' or 'path'
       - bytes / BytesIO
     Returns a list of PIL images. PDFs => multi-image; PNG/JPEG => single image.
     """
@@ -98,7 +95,6 @@ def ensure_images(file_val: Optional[Union[str, dict, bytes, io.BytesIO]]) -> Li
     elif isinstance(file_val, io.BytesIO):
         raw_bytes = file_val.getvalue()
-    # PDF vs Image
     def is_pdf_name(name: str) -> bool:
         return name.lower().endswith(".pdf")
@@ -117,35 +113,28 @@ def ensure_images(file_val: Optional[Union[str, dict, bytes, io.BytesIO]]) -> Li
     return []
 def parse_page_selection(value, num_pages: int) -> int:
     """
     Accept 'Page 3', '3', 3, 'pg-2', etc. Return safe 0-based index clamped to [0, num_pages-1].
-    Defaults to 0 if unusable.
     """
     # With no pages there is nothing to clamp against, so fall back to index 0.
     if num_pages <= 0:
         return 0
     # No selection made: fall back to the first page.
     if value is None:
         return 0
     if isinstance(value, int):
         # Integers are read as 1-based page numbers.
         idx = value - 1
     else:
         # Any other value is stringified; the first run of digits found in it
         # is read as a 1-based page number. No digits at all -> index 0.
         s = str(value).strip()
         m = re.search(r"(\d+)", s)
         idx = int(m.group(1)) - 1 if m else 0
     # Clamp into the valid range so out-of-range requests never index past the ends.
     return max(0, min(num_pages - 1, idx))
 def build_messages(history: List[Tuple[str, str]], user_text: str, images: List[Image.Image]):
     """
-    Construct chat-format messages compatible with tokenizer.apply_chat_template.
-    We trim the history to avoid runaway context growth.
     """
     messages = [{"role": "system", "content": SYSTEM_PROMPT}]
     trimmed = history[-4:] if history else []
     for u, a in trimmed:
         messages.append({"role": "user", "content": u})
         messages.append({"role": "assistant", "content": a})
@@ -155,45 +144,52 @@ def build_messages(history: List[Tuple[str, str]], user_text: str, images: List[
         multimodal.append(im)
     if user_text.strip():
         multimodal.append(user_text.strip())
     messages.append({"role": "user", "content": multimodal})
     return messages
 def generate_reply(images: List[Image.Image], user_text: str, chat_history: List[Tuple[str, str]]):
     """
     Stream a model reply grounded on provided images + user question + compact chat history.
-    Key fix: build text with chat template (string), then tokenize to get a dict.
     """
     messages = build_messages(chat_history, user_text, images)
-    # 1) Get the chat prompt as TEXT (not tokens)
     prompt_text = tokenizer.apply_chat_template(
         messages,
         add_generation_prompt=True,
-        tokenize=False,               # <-- IMPORTANT: return string
     )
-    # 2) Tokenize to get a dict (input_ids, attention_mask)
-    text_inputs = tokenizer(
-        prompt_text,
-        return_tensors="pt"
-    ).to(DEVICE)
-    # 3) Vision tensors (dict with pixel_values)
     vision_inputs = processor(images=images, return_tensors="pt").to(DEVICE)
-    # 4) Merge dicts safely
-    model_inputs = {**text_inputs, **vision_inputs}
-    # 5) Stream with the same tokenizer
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
     gen_kwargs = dict(
         **model_inputs,
         streamer=streamer,
         max_new_tokens=512,
         do_sample=False,
-        temperature=0.0,
     )
     import threading
@@ -205,8 +201,6 @@ def generate_reply(images: List[Image.Image], user_text: str, chat_history: List
         partial += token
         yield partial
 # -----------------------------
 # Gradio UI Orchestration
 # -----------------------------
@@ -219,11 +213,9 @@ def start_chat(file_val, page_index):
             None,
             "No file loaded. Please upload a PDF/PNG/JPEG.",
         )
     choices = [f"Page {i+1}" for i in range(len(imgs))]
     safe_idx = 0 if page_index is None else max(0, min(len(imgs) - 1, int(page_index)))
     default_value = choices[safe_idx]
     return (
         gr.update(choices=choices, value=default_value),
         imgs,
@@ -231,19 +223,16 @@ def start_chat(file_val, page_index):
         "Document ready. Select a page and ask questions.",
     )
 def page_picker_changed(pages_dropdown, images_state):
     if not images_state:
         return None, gr.update()
     idx = parse_page_selection(pages_dropdown, len(images_state))
     selected = images_state[idx]
-    return selected, selected  # for preview and selected state
 def chat(user_text, history, images_state, selected_img):
     if not user_text or not user_text.strip():
         return gr.update(), history
     sel_img = selected_img if selected_img is not None else (images_state[0] if images_state else None)
     if sel_img is None:
         history = history + [(user_text, "Please upload a document first.")]
@@ -255,7 +244,6 @@ def chat(user_text, history, images_state, selected_img):
         acc = chunk
         yield history + [(user_text, acc)], history + [(user_text, acc)]
 # -----------------------------
 # App definition
 # -----------------------------
@@ -265,7 +253,6 @@ with gr.Blocks(title="Invoice Chat • SmolVLM-250M") as demo:
         "Upload a PDF/PNG/JPEG, pick a page, and interrogate the document. "
         "Optimized for CPU-friendly, low-latency insights."
     )
     with gr.Row():
         with gr.Column(scale=1):
             file = gr.File(label="Upload invoice (PDF / PNG / JPEG)")
@@ -273,14 +260,15 @@ with gr.Blocks(title="Invoice Chat • SmolVLM-250M") as demo:
                 label="Select page (for PDFs)",
                 choices=[],
                 value=None,
-                allow_custom_value=True,  # set False to lock to dropdown values
                 info="Type a page number (e.g., 2) or choose from the list."
             )
             load_btn = gr.Button("Prepare Document", variant="primary")
         with gr.Column(scale=2):
             image_view = gr.Image(label="Current page/image", interactive=False)
-    chatbot = gr.Chatbot(height=400)
     user_box = gr.Textbox(
         label="Your question",
         placeholder="e.g., What is the invoice number and total with tax?",
@@ -297,13 +285,11 @@ with gr.Blocks(title="Invoice Chat • SmolVLM-250M") as demo:
         inputs=[file, gr.State(0)],
         outputs=[pages, images_state, image_view, gr.Textbox(visible=False)]
     )
     pages.change(
         page_picker_changed,
         inputs=[pages, images_state],
         outputs=[image_view, selected_img_state]
     )
     ask_btn.click(
         chat,
         inputs=[user_box, chatbot, images_state, selected_img_state],

 # app.py
 # ------------------------------------------------------------
 # Invoice Chat • SmolVLM-Instruct-250M
+# Gradio Space with robust page picker + safe streaming chat
 # ------------------------------------------------------------
 import io
 from transformers import (
     AutoProcessor,
     AutoTokenizer,
+    AutoModelForImageTextToText,  # <= new, replaces AutoModelForVision2Seq
     TextIteratorStreamer,
 )
 MODEL_ID = "HuggingFaceTB/SmolVLM-Instruct-250M"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
+# Tokenizer has the chat template
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
+# Processor handles vision tensors
 processor = AutoProcessor.from_pretrained(MODEL_ID)
+# New class to avoid deprecation warnings
+model = AutoModelForImageTextToText.from_pretrained(MODEL_ID, dtype=DTYPE)
 model.to(DEVICE).eval()
 SYSTEM_PROMPT = (
 # Utilities
 # -----------------------------
 def pdf_to_images_from_bytes(pdf_bytes: bytes, max_pages: int = 8, dpi: int = 216) -> List[Image.Image]:
     doc = fitz.open(stream=pdf_bytes, filetype="pdf")
     images: List[Image.Image] = []
     for i, page in enumerate(doc):
         images.append(img)
     return images
 def pdf_to_images_from_path(path: str, max_pages: int = 8, dpi: int = 216) -> List[Image.Image]:
     doc = fitz.open(path)
     images: List[Image.Image] = []
     for i, page in enumerate(doc):
         images.append(img)
     return images
 def ensure_images(file_val: Optional[Union[str, dict, bytes, io.BytesIO]]) -> List[Image.Image]:
     """
+    Accept PDF/PNG/JPEG via Gradio File. Handles:
       - str path (tempfile path)
+      - dict with 'name'/'path' or 'data'
       - bytes / BytesIO
     Returns a list of PIL images. PDFs => multi-image; PNG/JPEG => single image.
     """
     elif isinstance(file_val, io.BytesIO):
         raw_bytes = file_val.getvalue()
     def is_pdf_name(name: str) -> bool:
         return name.lower().endswith(".pdf")
     return []
 def parse_page_selection(value, num_pages: int) -> int:
     """
     Map a page-picker value onto a valid 0-based page index.

     Integers are read as 1-based page numbers. Any other value is converted to
     text and the first run of digits in it is read as the 1-based page number
     (e.g. 'Page 3', '3', 'pg-2'). The result is clamped to [0, num_pages-1].
     Returns 0 when there are no pages, when value is None, or when the text
     contains no digits.
     """
     # Nothing to pick from, or nothing picked: first page.
     if num_pages <= 0 or value is None:
         return 0

     if isinstance(value, int):
         page_number = value
     else:
         digits = re.search(r"(\d+)", str(value).strip())
         # No digits found is treated as "page 1", i.e. index 0.
         page_number = int(digits.group(1)) if digits else 1

     zero_based = page_number - 1
     last_index = num_pages - 1

     # Explicit clamp to the valid index range.
     if zero_based < 0:
         return 0
     if zero_based > last_index:
         return last_index
     return zero_based
 def build_messages(history: List[Tuple[str, str]], user_text: str, images: List[Image.Image]):
     """
+    Construct chat-format messages for tokenizer.apply_chat_template.
     """
     messages = [{"role": "system", "content": SYSTEM_PROMPT}]
     trimmed = history[-4:] if history else []
     for u, a in trimmed:
         messages.append({"role": "user", "content": u})
         messages.append({"role": "assistant", "content": a})
         multimodal.append(im)
     if user_text.strip():
         multimodal.append(user_text.strip())
     messages.append({"role": "user", "content": multimodal})
     return messages
+# -----------------------------
+# Core generation (streaming)
+# -----------------------------
 def generate_reply(images: List[Image.Image], user_text: str, chat_history: List[Tuple[str, str]]):
     """
     Stream a model reply grounded on provided images + user question + compact chat history.
+    - Build prompt as TEXT (chat template) -> tokenize to dict (input_ids, attention_mask)
+    - Vision tensors via processor (pixel_values)
+    - Pass ONLY allowed kwargs to model.generate (avoid rows/cols etc.)
     """
     messages = build_messages(chat_history, user_text, images)
+    # 1) Build prompt text
     prompt_text = tokenizer.apply_chat_template(
         messages,
         add_generation_prompt=True,
+        tokenize=False,   # IMPORTANT: return a string
     )
+    # 2) Tokenize to get a dict
+    text_inputs = tokenizer(prompt_text, return_tensors="pt").to(DEVICE)
+    # 3) Vision tensors
     vision_inputs = processor(images=images, return_tensors="pt").to(DEVICE)
+    # 4) Allow-list only the keys generate() expects
+    model_inputs = {
+        "input_ids": text_inputs["input_ids"],
+        # attention_mask may or may not exist depending on tokenizer; include if present
+        **({"attention_mask": text_inputs["attention_mask"]} if "attention_mask" in text_inputs else {}),
+        # vision inputs
+        **({"pixel_values": vision_inputs["pixel_values"]} if "pixel_values" in vision_inputs else {}),
+    }
+    # 5) Streamer uses the same tokenizer
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
     gen_kwargs = dict(
         **model_inputs,
         streamer=streamer,
         max_new_tokens=512,
         do_sample=False,
+        # NOTE: some I2T models ignore temperature/top_p; avoid passing unsupported flags
     )
     import threading
         partial += token
         yield partial
 # -----------------------------
 # Gradio UI Orchestration
 # -----------------------------
             None,
             "No file loaded. Please upload a PDF/PNG/JPEG.",
         )
     choices = [f"Page {i+1}" for i in range(len(imgs))]
     safe_idx = 0 if page_index is None else max(0, min(len(imgs) - 1, int(page_index)))
     default_value = choices[safe_idx]
     return (
         gr.update(choices=choices, value=default_value),
         imgs,
         "Document ready. Select a page and ask questions.",
     )
 def page_picker_changed(pages_dropdown, images_state):
     if not images_state:
         return None, gr.update()
     idx = parse_page_selection(pages_dropdown, len(images_state))
     selected = images_state[idx]
+    return selected, selected  # preview + selected state
 def chat(user_text, history, images_state, selected_img):
     if not user_text or not user_text.strip():
         return gr.update(), history
     sel_img = selected_img if selected_img is not None else (images_state[0] if images_state else None)
     if sel_img is None:
         history = history + [(user_text, "Please upload a document first.")]
         acc = chunk
         yield history + [(user_text, acc)], history + [(user_text, acc)]
 # -----------------------------
 # App definition
 # -----------------------------
         "Upload a PDF/PNG/JPEG, pick a page, and interrogate the document. "
         "Optimized for CPU-friendly, low-latency insights."
     )
     with gr.Row():
         with gr.Column(scale=1):
             file = gr.File(label="Upload invoice (PDF / PNG / JPEG)")
                 label="Select page (for PDFs)",
                 choices=[],
                 value=None,
+                allow_custom_value=True,
                 info="Type a page number (e.g., 2) or choose from the list."
             )
             load_btn = gr.Button("Prepare Document", variant="primary")
         with gr.Column(scale=2):
             image_view = gr.Image(label="Current page/image", interactive=False)
+    # Lock Chatbot type to silence deprecation warning
+    chatbot = gr.Chatbot(height=400, type="tuples")
     user_box = gr.Textbox(
         label="Your question",
         placeholder="e.g., What is the invoice number and total with tax?",
         inputs=[file, gr.State(0)],
         outputs=[pages, images_state, image_view, gr.Textbox(visible=False)]
     )
     pages.change(
         page_picker_changed,
         inputs=[pages, images_state],
         outputs=[image_view, selected_img_state]
     )
     ask_btn.click(
         chat,
         inputs=[user_box, chatbot, images_state, selected_img_state],