Spaces:

arasuezofis
/

documentbasedresponse

Sleeping

App Files Files Community

arasuezofis committed on Oct 10, 2025

Commit

1fcca49

verified ·

1 Parent(s): 1e56cd8

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -28

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 # app.py
 # ------------------------------------------------------------
 # Invoice Chat • SmolVLM-Instruct-250M
-# Operationalized for Hugging Face Spaces (Gradio SDK)
 # ------------------------------------------------------------
 import io
@@ -15,17 +15,22 @@ from PIL import Image
 import fitz  # PyMuPDF
 from transformers import (
     AutoProcessor,
     AutoModelForVision2Seq,
     TextIteratorStreamer,
 )
 # -----------------------------
-# Model bootstrap (lean & mean)
 # -----------------------------
 MODEL_ID = "HuggingFaceTB/SmolVLM-Instruct-250M"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
 processor = AutoProcessor.from_pretrained(MODEL_ID)
 model = AutoModelForVision2Seq.from_pretrained(MODEL_ID, torch_dtype=DTYPE)
 model.to(DEVICE).eval()
@@ -68,26 +73,23 @@ def ensure_images(file_val: Optional[Union[str, dict, bytes, io.BytesIO]]) -> Li
     """
     Accept PDF/PNG/JPEG via Gradio File. Handles multiple shapes of input:
       - str path (tempfile path)
-      - dict with 'name' or 'path' (some Gradio versions)
       - bytes / BytesIO
     Returns a list of PIL images. PDFs => multi-image; PNG/JPEG => single image.
     """
     if not file_val:
         return []
-    # Normalize to path/bytes
     path: Optional[str] = None
     raw_bytes: Optional[bytes] = None
     if isinstance(file_val, str) and os.path.exists(file_val):
         path = file_val
     elif isinstance(file_val, dict):
-        # Gradio sometimes passes a dict with keys like {'name': '/tmp/..', 'orig_name': 'x.pdf', 'size': ...}
         maybe_path = file_val.get("name") or file_val.get("path")
         if isinstance(maybe_path, str) and os.path.exists(maybe_path):
             path = maybe_path
         else:
-            # if dict contains 'data' or similar
             data = file_val.get("data")
             if isinstance(data, (bytes, bytearray)):
                 raw_bytes = bytes(data)
@@ -96,27 +98,23 @@ def ensure_images(file_val: Optional[Union[str, dict, bytes, io.BytesIO]]) -> Li
     elif isinstance(file_val, io.BytesIO):
         raw_bytes = file_val.getvalue()
-    # Branch by PDF vs Image
-    def is_pdf_from_name(name: str) -> bool:
         return name.lower().endswith(".pdf")
     if path:
-        if is_pdf_from_name(path):
             return pdf_to_images_from_path(path)
-        # Image path
         with open(path, "rb") as f:
             img = Image.open(io.BytesIO(f.read())).convert("RGB")
         return [img]
     if raw_bytes:
-        # Try sniffing PDF header
         if raw_bytes[:5] == b"%PDF-":
             return pdf_to_images_from_bytes(raw_bytes)
-        # Else treat as image bytes
         img = Image.open(io.BytesIO(raw_bytes)).convert("RGB")
         return [img]
-    # Fallback: nothing usable
     return []
@@ -142,7 +140,7 @@ def parse_page_selection(value, num_pages: int) -> int:
 def build_messages(history: List[Tuple[str, str]], user_text: str, images: List[Image.Image]):
     """
-    Construct chat-format messages compatible with processor.apply_chat_template.
     We trim the history to avoid runaway context growth.
     """
     messages = [{"role": "system", "content": SYSTEM_PROMPT}]
@@ -165,24 +163,25 @@ def build_messages(history: List[Tuple[str, str]], user_text: str, images: List[
 def generate_reply(images: List[Image.Image], user_text: str, chat_history: List[Tuple[str, str]]):
     """
     Stream a model reply grounded on provided images + user question + compact chat history.
     """
     messages = build_messages(chat_history, user_text, images)
-    # Text context
-    text_inputs = processor.apply_chat_template(
         messages,
         add_generation_prompt=True,
         tokenize=True,
         return_tensors="pt"
     ).to(DEVICE)
-    # Vision tensors
     vision_inputs = processor(images=images, return_tensors="pt").to(DEVICE)
-    # Merge dicts
     model_inputs = {**text_inputs, **vision_inputs}
-    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
     gen_kwargs = dict(
         **model_inputs,
         streamer=streamer,
@@ -207,7 +206,6 @@ def generate_reply(images: List[Image.Image], user_text: str, chat_history: List
 def start_chat(file_val, page_index):
     imgs = ensure_images(file_val)
     if not imgs:
-        # Reset the dropdown & return empty
         return (
             gr.update(choices=[], value=None),
             [],
@@ -237,10 +235,8 @@ def page_picker_changed(pages_dropdown, images_state):
 def chat(user_text, history, images_state, selected_img):
     if not user_text or not user_text.strip():
-        # No update; just echo current state
         return gr.update(), history
-    # Choose selected image; fallback to first page if needed
     sel_img = selected_img if selected_img is not None else (images_state[0] if images_state else None)
     if sel_img is None:
         history = history + [(user_text, "Please upload a document first.")]
@@ -260,7 +256,7 @@ with gr.Blocks(title="Invoice Chat • SmolVLM-250M") as demo:
     gr.Markdown(
         "## Invoice Chat • SmolVLM-Instruct-250M\n"
         "Upload a PDF/PNG/JPEG, pick a page, and interrogate the document. "
-        "This is a CPU-friendly, low-latency experience designed for rapid insight capture."
     )
     with gr.Row():
@@ -270,7 +266,7 @@ with gr.Blocks(title="Invoice Chat • SmolVLM-250M") as demo:
                 label="Select page (for PDFs)",
                 choices=[],
                 value=None,
-                allow_custom_value=True,  # set False to hard-lock to dropdown values
                 info="Type a page number (e.g., 2) or choose from the list."
             )
             load_btn = gr.Button("Prepare Document", variant="primary")
@@ -284,25 +280,23 @@ with gr.Blocks(title="Invoice Chat • SmolVLM-250M") as demo:
     )
     ask_btn = gr.Button("Ask", variant="primary")
-    # Hidden session state
     images_state = gr.State([])
     selected_img_state = gr.State(None)
-    # Wire up events
     load_btn.click(
         start_chat,
         inputs=[file, gr.State(0)],
         outputs=[pages, images_state, image_view, gr.Textbox(visible=False)]
     )
-    # When the page dropdown changes, update both preview and the selected image state
     pages.change(
         page_picker_changed,
         inputs=[pages, images_state],
         outputs=[image_view, selected_img_state]
     )
-    # Ask flows (streaming)
     ask_btn.click(
         chat,
         inputs=[user_box, chatbot, images_state, selected_img_state],

 # app.py
 # ------------------------------------------------------------
 # Invoice Chat • SmolVLM-Instruct-250M
+# Gradio Space with resilient page picker + streaming chat
 # ------------------------------------------------------------
 import io
 import fitz  # PyMuPDF
 from transformers import (
     AutoProcessor,
+    AutoTokenizer,
     AutoModelForVision2Seq,
     TextIteratorStreamer,
 )
 # -----------------------------
+# Model bootstrap
 # -----------------------------
 MODEL_ID = "HuggingFaceTB/SmolVLM-Instruct-250M"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# float16 only if CUDA is available; on CPU use float32
 DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
+# Load tokenizer (has the chat template), processor (images), and model
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
 processor = AutoProcessor.from_pretrained(MODEL_ID)
 model = AutoModelForVision2Seq.from_pretrained(MODEL_ID, torch_dtype=DTYPE)
 model.to(DEVICE).eval()
     """
     Accept PDF/PNG/JPEG via Gradio File. Handles multiple shapes of input:
       - str path (tempfile path)
+      - dict with 'name' or 'path'
       - bytes / BytesIO
     Returns a list of PIL images. PDFs => multi-image; PNG/JPEG => single image.
     """
     if not file_val:
         return []
     path: Optional[str] = None
     raw_bytes: Optional[bytes] = None
     if isinstance(file_val, str) and os.path.exists(file_val):
         path = file_val
     elif isinstance(file_val, dict):
         maybe_path = file_val.get("name") or file_val.get("path")
         if isinstance(maybe_path, str) and os.path.exists(maybe_path):
             path = maybe_path
         else:
             data = file_val.get("data")
             if isinstance(data, (bytes, bytearray)):
                 raw_bytes = bytes(data)
     elif isinstance(file_val, io.BytesIO):
         raw_bytes = file_val.getvalue()
+    # PDF vs Image
+    def is_pdf_name(name: str) -> bool:
         return name.lower().endswith(".pdf")
     if path:
+        if is_pdf_name(path):
             return pdf_to_images_from_path(path)
         with open(path, "rb") as f:
             img = Image.open(io.BytesIO(f.read())).convert("RGB")
         return [img]
     if raw_bytes:
         if raw_bytes[:5] == b"%PDF-":
             return pdf_to_images_from_bytes(raw_bytes)
         img = Image.open(io.BytesIO(raw_bytes)).convert("RGB")
         return [img]
     return []
 def build_messages(history: List[Tuple[str, str]], user_text: str, images: List[Image.Image]):
     """
+    Construct chat-format messages compatible with tokenizer.apply_chat_template.
     We trim the history to avoid runaway context growth.
     """
     messages = [{"role": "system", "content": SYSTEM_PROMPT}]
 def generate_reply(images: List[Image.Image], user_text: str, chat_history: List[Tuple[str, str]]):
     """
     Stream a model reply grounded on provided images + user question + compact chat history.
+    Key fix: use tokenizer.apply_chat_template and a streamer built with the same tokenizer.
     """
     messages = build_messages(chat_history, user_text, images)
+    # Text inputs via tokenizer chat template
+    text_inputs = tokenizer.apply_chat_template(
         messages,
         add_generation_prompt=True,
         tokenize=True,
         return_tensors="pt"
     ).to(DEVICE)
+    # Vision tensors via processor
     vision_inputs = processor(images=images, return_tensors="pt").to(DEVICE)
+    # Merge dicts (input_ids, attention_mask, pixel_values)
     model_inputs = {**text_inputs, **vision_inputs}
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
     gen_kwargs = dict(
         **model_inputs,
         streamer=streamer,
 def start_chat(file_val, page_index):
     imgs = ensure_images(file_val)
     if not imgs:
         return (
             gr.update(choices=[], value=None),
             [],
 def chat(user_text, history, images_state, selected_img):
     if not user_text or not user_text.strip():
         return gr.update(), history
     sel_img = selected_img if selected_img is not None else (images_state[0] if images_state else None)
     if sel_img is None:
         history = history + [(user_text, "Please upload a document first.")]
     gr.Markdown(
         "## Invoice Chat • SmolVLM-Instruct-250M\n"
         "Upload a PDF/PNG/JPEG, pick a page, and interrogate the document. "
+        "Optimized for CPU-friendly, low-latency insights."
     )
     with gr.Row():
                 label="Select page (for PDFs)",
                 choices=[],
                 value=None,
+                allow_custom_value=True,  # set False to lock to dropdown values
                 info="Type a page number (e.g., 2) or choose from the list."
             )
             load_btn = gr.Button("Prepare Document", variant="primary")
     )
     ask_btn = gr.Button("Ask", variant="primary")
+    # Session state
     images_state = gr.State([])
     selected_img_state = gr.State(None)
+    # Events
     load_btn.click(
         start_chat,
         inputs=[file, gr.State(0)],
         outputs=[pages, images_state, image_view, gr.Textbox(visible=False)]
     )
     pages.change(
         page_picker_changed,
         inputs=[pages, images_state],
         outputs=[image_view, selected_img_state]
     )
     ask_btn.click(
         chat,
         inputs=[user_box, chatbot, images_state, selected_img_state],