Update app.py

app.py CHANGED
@@ -13,7 +13,6 @@ def get_tokenizer():
     tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
     # Ensure pad/eos exist to avoid generation crashes
     if tok.pad_token_id is None:
-        # Prefer eos_token if present; otherwise use bos_token; otherwise add one
         if tok.eos_token_id is not None:
             tok.pad_token = tok.eos_token
         elif tok.bos_token_id is not None:
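The deleted comment summarized a three-branch fallback, of which the hunk shows only the first two branches. For reference, a minimal self-contained sketch of the full chain (the checkpoint name is a placeholder; the Space defines its own MODEL_NAME):

from transformers import AutoTokenizer

MODEL_NAME = "gpt2"  # placeholder checkpoint for illustration

tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
if tok.pad_token_id is None:
    if tok.eos_token_id is not None:
        tok.pad_token = tok.eos_token   # reuse EOS as padding
    elif tok.bos_token_id is not None:
        tok.pad_token = tok.bos_token   # otherwise reuse BOS
    else:
        # Last resort: register a dedicated pad token
        tok.add_special_tokens({"pad_token": "[PAD]"})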
@@ -36,13 +35,11 @@ def build_instructions(personality, level, topic):
 def build_model_input(tokenizer, personality, level, topic):
     user_msg = build_instructions(personality, level, topic)
 
-    # If the tokenizer supports chat templates, use them.
     if hasattr(tokenizer, "apply_chat_template"):
         messages = [
             {"role": "system", "content": "You are a helpful Cardano Plutus tutor."},
             {"role": "user", "content": user_msg},
         ]
-        # add_generation_prompt=True puts the assistant tag where the model expects to start generating
         prompt_str = tokenizer.apply_chat_template(
             messages,
             tokenize=False,
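Both deleted comments documented behavior that still holds: apply_chat_template formats the conversation the way the checkpoint was trained, and add_generation_prompt=True appends the assistant tag so the model starts generating the reply rather than continuing the user turn. A standalone sketch, assuming the tokenizer returned by get_tokenizer() ships a chat template:

tokenizer = get_tokenizer()
messages = [
    {"role": "system", "content": "You are a helpful Cardano Plutus tutor."},
    {"role": "user", "content": "Explain what a datum is in Plutus."},
]
prompt_str = tokenizer.apply_chat_template(
    messages,
    tokenize=False,              # return a string, not token ids
    add_generation_prompt=True,  # place the assistant tag at the end
)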
@@ -50,36 +47,41 @@ def build_model_input(tokenizer, personality, level, topic):
         )
         return prompt_str
     else:
-        # Fallback: plain prompt with a simple "Assistant:" cue
         return (
             "System: You are a helpful Cardano Plutus tutor.\n\n"
             f"User: {user_msg}\n\nAssistant:"
         )
 
-# ------------ GPU
+# ------------ GPU/CPU generation ------------
 @spaces.GPU
-def generate_on_gpu(personality, level, topic,
+def generate_on_gpu(personality, level, topic, max_new_tokens=100, min_new_tokens=32):
+    # Log GPU availability for debugging
+    print(f"CUDA available: {torch.cuda.is_available()}")
+    if torch.cuda.is_available():
+        print(f"GPU device: {torch.cuda.get_device_name(0)}")
+
     tokenizer = get_tokenizer()
     prompt = build_model_input(tokenizer, personality, level, topic)
 
-    # Try 4-bit to reduce VRAM; fall back to fp16 if unavailable
     try:
+        # Try loading model on GPU with 4-bit quantization
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            load_in_4bit=True,
            device_map="auto",
        )
-    except Exception:
+        device = next(model.parameters()).device
+    except Exception as e:
+        print(f"GPU loading failed: {e}. Falling back to CPU.")
+        # Fallback to CPU with FP16
         model = AutoModelForCausalLM.from_pretrained(
             MODEL_NAME,
             torch_dtype=torch.float16,
-            device_map="
+            device_map="cpu",
         )
+        device = torch.device("cpu")
+
     model.eval()
-
-    device = next(model.parameters()).device
     inputs = tokenizer(prompt, return_tensors="pt")
     input_len = inputs["input_ids"].shape[1]
     inputs = {k: v.to(device) for k, v in inputs.items()}
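Two notes on the loading logic. Recent transformers releases deprecate passing load_in_4bit directly to from_pretrained in favor of an explicit BitsAndBytesConfig; a sketch of the equivalent spelling is below. And float16 on CPU is slow and only partially supported, so float32 is usually the safer dtype for the CPU fallback.

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Equivalent 4-bit load with the explicit config object
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # weights stay 4-bit, compute in fp16
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,                     # defined earlier in app.py
    quantization_config=bnb_config,
    device_map="auto",
)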
@@ -88,25 +90,24 @@ def generate_on_gpu(personality, level, topic,
     outputs = model.generate(
         **inputs,
         max_new_tokens=max_new_tokens,
-        min_new_tokens=min_new_tokens,
-        temperature=0.
-        top_p=0.
+        min_new_tokens=min_new_tokens,
+        temperature=0.5,
+        top_p=0.95,
         do_sample=True,
         repetition_penalty=1.05,
         eos_token_id=tokenizer.eos_token_id,
         pad_token_id=tokenizer.pad_token_id,
     )
 
-    #
+    # Decode and clean up
     gen_ids = outputs[0][input_len:]
     text = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
     if not text:
         text = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
-    # If the full decode still contains the prompt, try to trim it once safely
     if text.startswith(prompt):
         text = text[len(prompt):].lstrip()
 
-    # Cleanup
+    # Cleanup
     try:
         del model
         if torch.cuda.is_available():
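Loading and releasing the model inside the decorated function is the usual ZeroGPU pattern, since the GPU is attached only for that call. A self-contained sketch of the teardown idiom; the empty_cache call is an assumption about the line the diff truncates after the cuda check:

import gc
import torch

def release(model):
    del model      # drop the last reference so the weights can be reclaimed
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # hand cached VRAM back to the driver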
@@ -114,31 +115,29 @@ def generate_on_gpu(personality, level, topic,
     except Exception:
         pass
 
-
-    if not text:
-        text = ("Generation returned no content. Please click **Regenerate** or pick a different topic. "
-                "If this persists, reduce max tokens or use a lighter checkpoint.")
-    return text
+    return text if text else "Generation failed. Try regenerating or adjusting parameters."
 
-# ------------ Orchestrator
-def orchestrator(personality, level, topic):
+# ------------ Orchestrator with retry logic ------------
+def orchestrator(personality, level, topic, max_retries=3):
     if not personality or not level or not topic:
         return "Select your personality, expertise, and topic to get a tailored explanation."
+
+    for attempt in range(max_retries):
+        try:
+            return generate_on_gpu(personality, level, topic)
+        except Exception as e:
+            print(f"[Attempt {attempt + 1}/{max_retries}] ZeroGPU error: {type(e).__name__}: {e}")
+            if attempt == max_retries - 1:
+                return (
+                    "GPU was not available after multiple attempts. "
+                    "Click **Regenerate** or try again later."
+                )
 
 # ------------ Gradio UI ------------
 with gr.Blocks(theme="default") as iface:
     gr.Markdown(
         "## Cardano Plutus AI Assistant\n"
-        "Pick your **Learning Personality**, **Expertise Level**, and **Topic**.\n"
-        "The answer will generate automatically."
+        "Pick your **Learning Personality**, **Expertise Level**, and **Topic**, then click **Generate**."
     )
 
     with gr.Row():
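The new orchestrator retries back-to-back. On ZeroGPU a slot often frees up after a short wait, so spacing the attempts out can raise the success rate; a hypothetical variant with exponential backoff, not part of this commit:

import time

def orchestrator_with_backoff(personality, level, topic, max_retries=3, base_delay=2.0):
    for attempt in range(max_retries):
        try:
            return generate_on_gpu(personality, level, topic)
        except Exception as e:
            print(f"[Attempt {attempt + 1}/{max_retries}] {type(e).__name__}: {e}")
            if attempt == max_retries - 1:
                return "GPU was not available after multiple attempts. Try again later."
            time.sleep(base_delay * 2 ** attempt)  # wait 2s, 4s, 8s, ...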
@@ -178,6 +177,7 @@ with gr.Blocks(theme="default") as iface:
     )
 
     with gr.Row():
+        generate_btn = gr.Button("Generate")
         regen = gr.Button("🔁 Regenerate")
 
     output = gr.Textbox(
@@ -188,18 +188,11 @@ with gr.Blocks(theme="default") as iface:
         placeholder="Your tailored explanation will appear here…",
     )
 
-    def _maybe_generate(p, l, t):
-        if p and l and t:
-            return orchestrator(p, l, t)
-        return "Select your personality, expertise, and topic to get a tailored explanation."
-
-    personality.change(_maybe_generate, [personality, level, topic], output, queue=True)
-    level.change(_maybe_generate, [personality, level, topic], output, queue=True)
-    topic.change(_maybe_generate, [personality, level, topic], output, queue=True)
+    generate_btn.click(orchestrator, [personality, level, topic], output, queue=True)
     regen.click(orchestrator, [personality, level, topic], output, queue=True)
 
-# Enable queue
+# Enable queue
 iface.queue()
 
 if __name__ == "__main__":
-    iface.launch(server_name="0.0.0.0", server_port=7860)
+    iface.launch(server_name="0.0.0.0", server_port=7860)
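Net effect of the UI changes: generation now runs only on an explicit click instead of on every dropdown change, so the app no longer fires a model call while the user is still choosing options. A minimal standalone Blocks sketch of the resulting wiring, assuming the orchestrator defined above; the component definitions are placeholders, since the diff elides lines 144-177 where the real ones live:

import gradio as gr

with gr.Blocks() as demo:
    personality = gr.Dropdown(["Visual", "Hands-on"], label="Learning Personality")
    level = gr.Dropdown(["Beginner", "Advanced"], label="Expertise Level")
    topic = gr.Textbox(label="Topic")
    generate_btn = gr.Button("Generate")
    output = gr.Textbox(label="Answer")
    # One generation per click, routed through the queue
    generate_btn.click(orchestrator, [personality, level, topic], output, queue=True)

demo.queue()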