Update app.py
app.py
CHANGED
@@ -10,33 +10,61 @@ _TOKENIZER = None
 def get_tokenizer():
     global _TOKENIZER
     if _TOKENIZER is None:
-        tok = AutoTokenizer.from_pretrained(MODEL_NAME)
+        tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
+        # Ensure pad/eos exist to avoid generation crashes
         if tok.pad_token_id is None:
+            # Prefer eos_token if present; otherwise use bos_token; otherwise add one
+            if tok.eos_token_id is not None:
+                tok.pad_token = tok.eos_token
+            elif tok.bos_token_id is not None:
+                tok.pad_token = tok.bos_token
+            else:
+                tok.add_special_tokens({"pad_token": "[PAD]"})
         _TOKENIZER = tok
     return _TOKENIZER
 
 # ------------ Prompt builder ------------
-def
+def build_instructions(personality, level, topic):
     return (
         f"You are a friendly Plutus AI tutor for a {personality} learner at {level} level.\n"
         f"Topic: {topic}\n\n"
         "Explain in a conversational, easy tone with concrete examples.\n"
-        "Keep it complete
+        "Keep it complete and around 120–160 words.\n"
-        "End with a one-line takeaway starting with 'Takeaway:'.
+        "End with a one-line takeaway starting with 'Takeaway:'."
     )
 
+def build_model_input(tokenizer, personality, level, topic):
+    user_msg = build_instructions(personality, level, topic)
+
+    # If the tokenizer supports chat templates, use them.
+    if hasattr(tokenizer, "apply_chat_template"):
+        messages = [
+            {"role": "system", "content": "You are a helpful Cardano Plutus tutor."},
+            {"role": "user", "content": user_msg},
+        ]
+        # add_generation_prompt=True puts the assistant tag where the model expects to start generating
+        prompt_str = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        return prompt_str
+    else:
+        # Fallback: plain prompt with a simple “Assistant:” cue
+        return (
+            "System: You are a helpful Cardano Plutus tutor.\n\n"
+            f"User: {user_msg}\n\nAssistant:"
+        )
+
 # ------------ GPU-only generation ------------
 @spaces.GPU
-def generate_on_gpu(personality, level, topic, max_new_tokens=160):
-    """
-    Loads model per-call, generates, decodes ONLY new tokens, frees VRAM.
-    """
+def generate_on_gpu(personality, level, topic,
+                    max_new_tokens=180,
+                    min_new_tokens=64):
     tokenizer = get_tokenizer()
-    prompt =
+    prompt = build_model_input(tokenizer, personality, level, topic)
 
-    # Try 4-bit
+    # Try 4-bit to reduce VRAM; fall back to fp16 if unavailable
     try:
         model = AutoModelForCausalLM.from_pretrained(
             MODEL_NAME,
@@ -51,7 +79,6 @@ def generate_on_gpu(personality, level, topic, max_new_tokens=160):
         )
     model.eval()
 
-    # Move inputs to model device
     device = next(model.parameters()).device
     inputs = tokenizer(prompt, return_tensors="pt")
     input_len = inputs["input_ids"].shape[1]
@@ -60,8 +87,9 @@ def generate_on_gpu(personality, level, topic, max_new_tokens=160):
     with torch.inference_mode():
         outputs = model.generate(
             **inputs,
-            max_new_tokens=max_new_tokens,
+            max_new_tokens=max_new_tokens,
+            min_new_tokens=min_new_tokens,  # ensure it doesn’t stop immediately
+            temperature=0.3,
             top_p=0.9,
             do_sample=True,
             repetition_penalty=1.05,
@@ -69,9 +97,14 @@ def generate_on_gpu(personality, level, topic, max_new_tokens=160):
             pad_token_id=tokenizer.pad_token_id,
        )
 
+    # Prefer decoding only new tokens (avoids prompt-echo). If empty, fall back to full decode.
     gen_ids = outputs[0][input_len:]
     text = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
+    if not text:
+        text = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
+        # If the full decode still contains the prompt, try to trim it once safely
+        if text.startswith(prompt):
+            text = text[len(prompt):].lstrip()
 
     # Cleanup VRAM
     try:
@@ -81,23 +114,23 @@ def generate_on_gpu(personality, level, topic, max_new_tokens=160):
     except Exception:
         pass
 
+    # Final guard so UI shows something useful
     if not text:
-        text = "
+        text = ("Generation returned no content. Please click **Regenerate** or pick a different topic. "
+                "If this persists, reduce max tokens or use a lighter checkpoint.")
     return text
 
-# ------------ Orchestrator (
+# ------------ Orchestrator (GPU-only) ------------
 def orchestrator(personality, level, topic):
     if not personality or not level or not topic:
         return "Select your personality, expertise, and topic to get a tailored explanation."
     try:
         return generate_on_gpu(personality, level, topic)
     except Exception as e:
-        # Don’t crash silently; show a friendly message
         print(f"[ZeroGPU error] {type(e).__name__}: {e}")
         return (
             "GPU was not available or the job was interrupted. "
+            "Click **Regenerate** or change a selection to try again."
         )
 
 # ------------ Gradio UI ------------
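The hunks above elide the unchanged context that actually loads the model (the keyword arguments passed to AutoModelForCausalLM.from_pretrained between new lines 70 and 79) and the body of the "# Cleanup VRAM" try block. For orientation, a minimal sketch of what a 4-bit load with an fp16 fallback plus a cleanup step typically looks like is shown below; the helper names load_model_with_fallback and free_vram are illustrative, not taken from app.py, and the real file may configure quantization differently.

import gc
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

def load_model_with_fallback(model_name):
    # Hypothetical stand-in for the elided context lines; app.py may differ.
    try:
        # 4-bit NF4 quantization keeps VRAM low on ZeroGPU hardware
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=bnb_config,
            device_map="auto",
        )
    except Exception:
        # Fall back to plain fp16 if bitsandbytes / 4-bit is unavailable
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
        )
    return model.eval()

def free_vram(model):
    # Release the per-call model so the next ZeroGPU job starts clean
    del model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()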
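The diff stops at the "# ------------ Gradio UI ------------" header, so the UI wiring is not part of this change. A minimal sketch of how orchestrator(personality, level, topic) is commonly hooked into a Gradio Blocks app follows; the component choices, labels, and button name are assumptions rather than the Space's actual layout.

import gradio as gr

# Hypothetical wiring for the Gradio section the diff cuts off;
# dropdown choices and labels here are placeholders, not from app.py.
with gr.Blocks(title="Plutus AI Tutor") as demo:
    personality = gr.Dropdown(["curious", "practical", "skeptical"], label="Personality")
    level = gr.Dropdown(["beginner", "intermediate", "advanced"], label="Expertise")
    topic = gr.Textbox(label="Topic", placeholder="e.g. Plutus validators")
    output = gr.Markdown()
    btn = gr.Button("Explain")
    # orchestrator returns markdown-style text, so render it in a Markdown component
    btn.click(orchestrator, inputs=[personality, level, topic], outputs=output)

demo.launch()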