Spaces:

mo35
/

gemma4-quantfin

Sleeping

App Files Files Community

mo35 committed on Apr 23

Commit

ee678d5

1 Parent(s): 1083e33

Fix apply_chat_template return type

Browse files

Files changed (1) hide show

app.py +16 -5

app.py CHANGED Viewed

@@ -39,17 +39,27 @@ print("Model ready.")
 # ── Inference ─────────────────────────────────────────────────────────────────
 def respond(message: str, history: list) -> str:
-    messages = [{"role": "user", "content": message}]
-    inputs = tokenizer.apply_chat_template(
         messages,
         add_generation_prompt = True,
         return_tensors        = "pt",
-    ).to(model.device)
     with torch.no_grad():
         outputs = model.generate(
-            inputs,
             max_new_tokens     = 1024,
             temperature        = 0.7,
             do_sample          = True,
@@ -57,13 +67,14 @@ def respond(message: str, history: list) -> str:
         )
     return tokenizer.decode(
-        outputs[0][inputs.shape[-1]:],
         skip_special_tokens = True,
     )
 # ── Gradio UI ─────────────────────────────────────────────────────────────────
 demo = gr.ChatInterface(
     fn          = respond,
     title       = "Gemma 4 — Quantitative Finance",
     description = (
         "A specialized AI assistant fine-tuned on quantitative finance: derivatives pricing, "

 # ── Inference ─────────────────────────────────────────────────────────────────
 def respond(message: str, history: list) -> str:
     """Generate the model's reply to ``message`` given the earlier chat turns.

     Args:
         message: The new user message to answer.
         history: Earlier turns; each entry is indexed by ``"role"`` and
             ``"content"``, and only those two keys are forwarded.

     Returns:
         The decoded text of the newly generated tokens only (the prompt
         tokens are sliced off), with special tokens skipped.
     """
     # Rebuild the conversation from role/content pairs, then add the new user turn.
     messages = [
         {"role": turn["role"], "content": turn["content"]} for turn in history
     ]
     messages.append({"role": "user", "content": message})

     # return_dict=True so the tensors are looked up by key rather than relying
     # on the call returning a bare tensor (newer transformers return a
     # BatchEncoding here, per the original author's note).
     encoded = tokenizer.apply_chat_template(
         messages,
         add_generation_prompt = True,
         return_tensors        = "pt",
         return_dict           = True,
     )
     prompt_ids = encoded["input_ids"].to(model.device)

     # The mask may be absent; only move it to the model's device when present.
     mask = encoded.get("attention_mask")
     mask = mask.to(model.device) if mask is not None else None

     with torch.no_grad():
         generated = model.generate(
             prompt_ids,
             attention_mask     = mask,
             max_new_tokens     = 1024,
             temperature        = 0.7,
             do_sample          = True,
         )

     # The generated sequence starts with the prompt; decode only what follows.
     prompt_len = prompt_ids.shape[-1]
     reply_ids  = generated[0][prompt_len:]
     return tokenizer.decode(reply_ids, skip_special_tokens = True)
 # ── Gradio UI ─────────────────────────────────────────────────────────────────
 demo = gr.ChatInterface(
     fn          = respond,
+    type        = "messages",
     title       = "Gemma 4 — Quantitative Finance",
     description = (
         "A specialized AI assistant fine-tuned on quantitative finance: derivatives pricing, "