Remostart committed on
Commit
b1a7eca
·
verified ·
1 Parent(s): 14b43fc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -21
app.py CHANGED
@@ -10,33 +10,61 @@ _TOKENIZER = None
10
  def get_tokenizer():
11
  global _TOKENIZER
12
  if _TOKENIZER is None:
13
- tok = AutoTokenizer.from_pretrained(MODEL_NAME)
 
14
  if tok.pad_token_id is None:
15
- tok.pad_token = tok.eos_token
 
 
 
 
 
 
16
  _TOKENIZER = tok
17
  return _TOKENIZER
18
 
19
  # ------------ Prompt builder ------------
20
- def build_prompt(personality, level, topic):
21
  return (
22
  f"You are a friendly Plutus AI tutor for a {personality} learner at {level} level.\n"
23
  f"Topic: {topic}\n\n"
24
  "Explain in a conversational, easy tone with concrete examples.\n"
25
- "Keep it complete, focused, and around 120–160 words.\n"
26
- "End with a one-line takeaway starting with 'Takeaway:'.\n"
27
  )
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  # ------------ GPU-only generation ------------
30
  @spaces.GPU
31
- def generate_on_gpu(personality, level, topic, max_new_tokens=160):
32
- """
33
- Runs ONLY when ZeroGPU grants a GPU.
34
- Loads model per-call, generates, decodes ONLY new tokens, frees VRAM.
35
- """
36
  tokenizer = get_tokenizer()
37
- prompt = build_prompt(personality, level, topic)
38
 
39
- # Try 4-bit for VRAM; fall back to fp16 if not available
40
  try:
41
  model = AutoModelForCausalLM.from_pretrained(
42
  MODEL_NAME,
@@ -51,7 +79,6 @@ def generate_on_gpu(personality, level, topic, max_new_tokens=160):
51
  )
52
  model.eval()
53
 
54
- # Move inputs to model device
55
  device = next(model.parameters()).device
56
  inputs = tokenizer(prompt, return_tensors="pt")
57
  input_len = inputs["input_ids"].shape[1]
@@ -60,8 +87,9 @@ def generate_on_gpu(personality, level, topic, max_new_tokens=160):
60
  with torch.inference_mode():
61
  outputs = model.generate(
62
  **inputs,
63
- max_new_tokens=max_new_tokens, # keep small for ZeroGPU time/VRAM
64
- temperature=0.2,
 
65
  top_p=0.9,
66
  do_sample=True,
67
  repetition_penalty=1.05,
@@ -69,9 +97,14 @@ def generate_on_gpu(personality, level, topic, max_new_tokens=160):
69
  pad_token_id=tokenizer.pad_token_id,
70
  )
71
 
72
- # Decode ONLY the newly generated tokens (avoids prompt-echo trimming issues)
73
  gen_ids = outputs[0][input_len:]
74
  text = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
 
 
 
 
 
75
 
76
  # Cleanup VRAM
77
  try:
@@ -81,23 +114,23 @@ def generate_on_gpu(personality, level, topic, max_new_tokens=160):
81
  except Exception:
82
  pass
83
 
84
- # Fallback guard: ensure we return something readable
85
  if not text:
86
- text = "Takeaway: Generation finished but returned empty text. Try again or choose a different topic."
 
87
  return text
88
 
89
- # ------------ Orchestrator (no CPU fallback) ------------
90
  def orchestrator(personality, level, topic):
91
  if not personality or not level or not topic:
92
  return "Select your personality, expertise, and topic to get a tailored explanation."
93
  try:
94
  return generate_on_gpu(personality, level, topic)
95
  except Exception as e:
96
- # Don’t crash silently; show a friendly message
97
  print(f"[ZeroGPU error] {type(e).__name__}: {e}")
98
  return (
99
  "GPU was not available or the job was interrupted. "
100
- "Please click **Regenerate** or change a selection to try again."
101
  )
102
 
103
  # ------------ Gradio UI ------------
 
10
  def get_tokenizer():
11
  global _TOKENIZER
12
  if _TOKENIZER is None:
13
+ tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
14
+ # Ensure pad/eos exist to avoid generation crashes
15
  if tok.pad_token_id is None:
16
+ # Prefer eos_token if present; otherwise use bos_token; otherwise add one
17
+ if tok.eos_token_id is not None:
18
+ tok.pad_token = tok.eos_token
19
+ elif tok.bos_token_id is not None:
20
+ tok.pad_token = tok.bos_token
21
+ else:
22
+ tok.add_special_tokens({"pad_token": "[PAD]"})
23
  _TOKENIZER = tok
24
  return _TOKENIZER
25
 
26
  # ------------ Prompt builder ------------
27
+ def build_instructions(personality, level, topic):
28
  return (
29
  f"You are a friendly Plutus AI tutor for a {personality} learner at {level} level.\n"
30
  f"Topic: {topic}\n\n"
31
  "Explain in a conversational, easy tone with concrete examples.\n"
32
+ "Keep it complete and around 120–160 words.\n"
33
+ "End with a one-line takeaway starting with 'Takeaway:'."
34
  )
35
 
36
+ def build_model_input(tokenizer, personality, level, topic):
37
+ user_msg = build_instructions(personality, level, topic)
38
+
39
+ # If the tokenizer supports chat templates, use them.
40
+ if hasattr(tokenizer, "apply_chat_template"):
41
+ messages = [
42
+ {"role": "system", "content": "You are a helpful Cardano Plutus tutor."},
43
+ {"role": "user", "content": user_msg},
44
+ ]
45
+ # add_generation_prompt=True puts the assistant tag where the model expects to start generating
46
+ prompt_str = tokenizer.apply_chat_template(
47
+ messages,
48
+ tokenize=False,
49
+ add_generation_prompt=True
50
+ )
51
+ return prompt_str
52
+ else:
53
+ # Fallback: plain prompt with a simple “Assistant:” cue
54
+ return (
55
+ "System: You are a helpful Cardano Plutus tutor.\n\n"
56
+ f"User: {user_msg}\n\nAssistant:"
57
+ )
58
+
59
  # ------------ GPU-only generation ------------
60
  @spaces.GPU
61
+ def generate_on_gpu(personality, level, topic,
62
+ max_new_tokens=180,
63
+ min_new_tokens=64):
 
 
64
  tokenizer = get_tokenizer()
65
+ prompt = build_model_input(tokenizer, personality, level, topic)
66
 
67
+ # Try 4-bit to reduce VRAM; fall back to fp16 if unavailable
68
  try:
69
  model = AutoModelForCausalLM.from_pretrained(
70
  MODEL_NAME,
 
79
  )
80
  model.eval()
81
 
 
82
  device = next(model.parameters()).device
83
  inputs = tokenizer(prompt, return_tensors="pt")
84
  input_len = inputs["input_ids"].shape[1]
 
87
  with torch.inference_mode():
88
  outputs = model.generate(
89
  **inputs,
90
+ max_new_tokens=max_new_tokens,
91
+ min_new_tokens=min_new_tokens, # ensure it doesn’t stop immediately
92
+ temperature=0.3,
93
  top_p=0.9,
94
  do_sample=True,
95
  repetition_penalty=1.05,
 
97
  pad_token_id=tokenizer.pad_token_id,
98
  )
99
 
100
+ # Prefer decoding only new tokens (avoids prompt-echo). If empty, fall back to full decode.
101
  gen_ids = outputs[0][input_len:]
102
  text = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
103
+ if not text:
104
+ text = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
105
+ # If the full decode still contains the prompt, try to trim it once safely
106
+ if text.startswith(prompt):
107
+ text = text[len(prompt):].lstrip()
108
 
109
  # Cleanup VRAM
110
  try:
 
114
  except Exception:
115
  pass
116
 
117
+ # Final guard so UI shows something useful
118
  if not text:
119
+ text = ("Generation returned no content. Please click **Regenerate** or pick a different topic. "
120
+ "If this persists, reduce max tokens or use a lighter checkpoint.")
121
  return text
122
 
123
+ # ------------ Orchestrator (GPU-only) ------------
124
  def orchestrator(personality, level, topic):
125
  if not personality or not level or not topic:
126
  return "Select your personality, expertise, and topic to get a tailored explanation."
127
  try:
128
  return generate_on_gpu(personality, level, topic)
129
  except Exception as e:
 
130
  print(f"[ZeroGPU error] {type(e).__name__}: {e}")
131
  return (
132
  "GPU was not available or the job was interrupted. "
133
+ "Click **Regenerate** or change a selection to try again."
134
  )
135
 
136
  # ------------ Gradio UI ------------