Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -163,24 +163,30 @@ def build_messages(history: List[Tuple[str, str]], user_text: str, images: List[
|
|
| 163 |
def generate_reply(images: List[Image.Image], user_text: str, chat_history: List[Tuple[str, str]]):
|
| 164 |
"""
|
| 165 |
Stream a model reply grounded on provided images + user question + compact chat history.
|
| 166 |
-
Key fix:
|
| 167 |
"""
|
| 168 |
messages = build_messages(chat_history, user_text, images)
|
| 169 |
|
| 170 |
-
#
|
| 171 |
-
text_inputs = tokenizer.apply_chat_template(
|
| 172 |
messages,
|
| 173 |
add_generation_prompt=True,
|
| 174 |
-
tokenize=True,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
return_tensors="pt"
|
| 176 |
).to(DEVICE)
|
| 177 |
|
| 178 |
-
# Vision tensors
|
| 179 |
vision_inputs = processor(images=images, return_tensors="pt").to(DEVICE)
|
| 180 |
|
| 181 |
-
# Merge dicts
|
| 182 |
model_inputs = {**text_inputs, **vision_inputs}
|
| 183 |
|
|
|
|
| 184 |
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
|
| 185 |
gen_kwargs = dict(
|
| 186 |
**model_inputs,
|
|
@@ -200,6 +206,7 @@ def generate_reply(images: List[Image.Image], user_text: str, chat_history: List
|
|
| 200 |
yield partial
|
| 201 |
|
| 202 |
|
|
|
|
| 203 |
# -----------------------------
|
| 204 |
# Gradio UI Orchestration
|
| 205 |
# -----------------------------
|
|
|
|
| 163 |
def generate_reply(images: List[Image.Image], user_text: str, chat_history: List[Tuple[str, str]]):
|
| 164 |
"""
|
| 165 |
Stream a model reply grounded on provided images + user question + compact chat history.
|
| 166 |
+
Key fix: build text with chat template (string), then tokenize to get a dict.
|
| 167 |
"""
|
| 168 |
messages = build_messages(chat_history, user_text, images)
|
| 169 |
|
| 170 |
+
# 1) Get the chat prompt as TEXT (not tokens)
|
| 171 |
+
prompt_text = tokenizer.apply_chat_template(
|
| 172 |
messages,
|
| 173 |
add_generation_prompt=True,
|
| 174 |
+
tokenize=False, # <-- IMPORTANT: return string
|
| 175 |
+
)
|
| 176 |
+
|
| 177 |
+
# 2) Tokenize to get a dict (input_ids, attention_mask)
|
| 178 |
+
text_inputs = tokenizer(
|
| 179 |
+
prompt_text,
|
| 180 |
return_tensors="pt"
|
| 181 |
).to(DEVICE)
|
| 182 |
|
| 183 |
+
# 3) Vision tensors (dict with pixel_values)
|
| 184 |
vision_inputs = processor(images=images, return_tensors="pt").to(DEVICE)
|
| 185 |
|
| 186 |
+
# 4) Merge dicts safely
|
| 187 |
model_inputs = {**text_inputs, **vision_inputs}
|
| 188 |
|
| 189 |
+
# 5) Stream with the same tokenizer
|
| 190 |
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
|
| 191 |
gen_kwargs = dict(
|
| 192 |
**model_inputs,
|
|
|
|
| 206 |
yield partial
|
| 207 |
|
| 208 |
|
| 209 |
+
|
| 210 |
# -----------------------------
|
| 211 |
# Gradio UI Orchestration
|
| 212 |
# -----------------------------
|