Spaces:

arasuezofis
/

documentbasedresponse

Sleeping

arasuezofis committed on Oct 10, 2025

Commit

3a1ba6d

verified ·

1 Parent(s): 1fcca49

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -163,24 +163,30 @@ def build_messages(history: List[Tuple[str, str]], user_text: str, images: List[
 def generate_reply(images: List[Image.Image], user_text: str, chat_history: List[Tuple[str, str]]):
     """
     Stream a model reply grounded on provided images + user question + compact chat history.
-    Key fix: use tokenizer.apply_chat_template and a streamer built with the same tokenizer.
     """
     messages = build_messages(chat_history, user_text, images)
-    # Text inputs via tokenizer chat template
-    text_inputs = tokenizer.apply_chat_template(
         messages,
         add_generation_prompt=True,
-        tokenize=True,
         return_tensors="pt"
     ).to(DEVICE)
-    # Vision tensors via processor
     vision_inputs = processor(images=images, return_tensors="pt").to(DEVICE)
-    # Merge dicts (input_ids, attention_mask, pixel_values)
     model_inputs = {**text_inputs, **vision_inputs}
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
     gen_kwargs = dict(
         **model_inputs,
@@ -200,6 +206,7 @@ def generate_reply(images: List[Image.Image], user_text: str, chat_history: List
         yield partial
 # -----------------------------
 # Gradio UI Orchestration
 # -----------------------------

 def generate_reply(images: List[Image.Image], user_text: str, chat_history: List[Tuple[str, str]]):
     """
     Stream a model reply grounded on provided images + user question + compact chat history.
+    Key fix: build text with chat template (string), then tokenize to get a dict.
     """
     messages = build_messages(chat_history, user_text, images)
+    # 1) Get the chat prompt as TEXT (not tokens)
+    prompt_text = tokenizer.apply_chat_template(
         messages,
         add_generation_prompt=True,
+        tokenize=False,               # <-- IMPORTANT: return string
+    )
+    # 2) Tokenize to get a dict (input_ids, attention_mask)
+    text_inputs = tokenizer(
+        prompt_text,
         return_tensors="pt"
     ).to(DEVICE)
+    # 3) Vision tensors (dict with pixel_values)
     vision_inputs = processor(images=images, return_tensors="pt").to(DEVICE)
+    # 4) Merge dicts safely
     model_inputs = {**text_inputs, **vision_inputs}
+    # 5) Stream with the same tokenizer
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
     gen_kwargs = dict(
         **model_inputs,
         yield partial
 # -----------------------------
 # Gradio UI Orchestration
 # -----------------------------