Remostart committed on
Commit
d5aec37
·
verified ·
1 Parent(s): b1a7eca

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -46
app.py CHANGED
@@ -13,7 +13,6 @@ def get_tokenizer():
13
  tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
14
  # Ensure pad/eos exist to avoid generation crashes
15
  if tok.pad_token_id is None:
16
- # Prefer eos_token if present; otherwise use bos_token; otherwise add one
17
  if tok.eos_token_id is not None:
18
  tok.pad_token = tok.eos_token
19
  elif tok.bos_token_id is not None:
@@ -36,13 +35,11 @@ def build_instructions(personality, level, topic):
36
  def build_model_input(tokenizer, personality, level, topic):
37
  user_msg = build_instructions(personality, level, topic)
38
 
39
- # If the tokenizer supports chat templates, use them.
40
  if hasattr(tokenizer, "apply_chat_template"):
41
  messages = [
42
  {"role": "system", "content": "You are a helpful Cardano Plutus tutor."},
43
  {"role": "user", "content": user_msg},
44
  ]
45
- # add_generation_prompt=True puts the assistant tag where the model expects to start generating
46
  prompt_str = tokenizer.apply_chat_template(
47
  messages,
48
  tokenize=False,
@@ -50,36 +47,41 @@ def build_model_input(tokenizer, personality, level, topic):
50
  )
51
  return prompt_str
52
  else:
53
- # Fallback: plain prompt with a simple “Assistant:” cue
54
  return (
55
  "System: You are a helpful Cardano Plutus tutor.\n\n"
56
  f"User: {user_msg}\n\nAssistant:"
57
  )
58
 
59
- # ------------ GPU-only generation ------------
60
  @spaces.GPU
61
- def generate_on_gpu(personality, level, topic,
62
- max_new_tokens=180,
63
- min_new_tokens=64):
 
 
 
64
  tokenizer = get_tokenizer()
65
  prompt = build_model_input(tokenizer, personality, level, topic)
66
 
67
- # Try 4-bit to reduce VRAM; fall back to fp16 if unavailable
68
  try:
 
69
  model = AutoModelForCausalLM.from_pretrained(
70
  MODEL_NAME,
71
  load_in_4bit=True,
72
  device_map="auto",
73
  )
74
- except Exception:
 
 
 
75
  model = AutoModelForCausalLM.from_pretrained(
76
  MODEL_NAME,
77
  torch_dtype=torch.float16,
78
- device_map="auto",
79
  )
 
 
80
  model.eval()
81
-
82
- device = next(model.parameters()).device
83
  inputs = tokenizer(prompt, return_tensors="pt")
84
  input_len = inputs["input_ids"].shape[1]
85
  inputs = {k: v.to(device) for k, v in inputs.items()}
@@ -88,25 +90,24 @@ def generate_on_gpu(personality, level, topic,
88
  outputs = model.generate(
89
  **inputs,
90
  max_new_tokens=max_new_tokens,
91
- min_new_tokens=min_new_tokens, # ensure it doesn’t stop immediately
92
- temperature=0.3,
93
- top_p=0.9,
94
  do_sample=True,
95
  repetition_penalty=1.05,
96
  eos_token_id=tokenizer.eos_token_id,
97
  pad_token_id=tokenizer.pad_token_id,
98
  )
99
 
100
- # Prefer decoding only new tokens (avoids prompt-echo). If empty, fall back to full decode.
101
  gen_ids = outputs[0][input_len:]
102
  text = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
103
  if not text:
104
  text = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
105
- # If the full decode still contains the prompt, try to trim it once safely
106
  if text.startswith(prompt):
107
  text = text[len(prompt):].lstrip()
108
 
109
- # Cleanup VRAM
110
  try:
111
  del model
112
  if torch.cuda.is_available():
@@ -114,31 +115,29 @@ def generate_on_gpu(personality, level, topic,
114
  except Exception:
115
  pass
116
 
117
- # Final guard so UI shows something useful
118
- if not text:
119
- text = ("Generation returned no content. Please click **Regenerate** or pick a different topic. "
120
- "If this persists, reduce max tokens or use a lighter checkpoint.")
121
- return text
122
 
123
- # ------------ Orchestrator (GPU-only) ------------
124
- def orchestrator(personality, level, topic):
125
  if not personality or not level or not topic:
126
  return "Select your personality, expertise, and topic to get a tailored explanation."
127
- try:
128
- return generate_on_gpu(personality, level, topic)
129
- except Exception as e:
130
- print(f"[ZeroGPU error] {type(e).__name__}: {e}")
131
- return (
132
- "GPU was not available or the job was interrupted. "
133
- "Click **Regenerate** or change a selection to try again."
134
- )
 
 
 
135
 
136
  # ------------ Gradio UI ------------
137
  with gr.Blocks(theme="default") as iface:
138
  gr.Markdown(
139
  "## Cardano Plutus AI Assistant\n"
140
- "Pick your **Learning Personality**, **Expertise Level**, and **Topic**. "
141
- "The answer will generate automatically."
142
  )
143
 
144
  with gr.Row():
@@ -178,6 +177,7 @@ with gr.Blocks(theme="default") as iface:
178
  )
179
 
180
  with gr.Row():
 
181
  regen = gr.Button("🔁 Regenerate")
182
 
183
  output = gr.Textbox(
@@ -188,18 +188,11 @@ with gr.Blocks(theme="default") as iface:
188
  placeholder="Your tailored explanation will appear here…",
189
  )
190
 
191
- def _maybe_generate(p, l, t):
192
- if p and l and t:
193
- return orchestrator(p, l, t)
194
- return "Select your personality, expertise, and topic to get a tailored explanation."
195
-
196
- personality.change(_maybe_generate, [personality, level, topic], output, queue=True)
197
- level.change(_maybe_generate, [personality, level, topic], output, queue=True)
198
- topic.change(_maybe_generate, [personality, level, topic], output, queue=True)
199
  regen.click(orchestrator, [personality, level, topic], output, queue=True)
200
 
201
- # Enable queue with broad compatibility
202
  iface.queue()
203
 
204
  if __name__ == "__main__":
205
- iface.launch(server_name="0.0.0.0", server_port=7860)
 
13
  tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
14
  # Ensure pad/eos exist to avoid generation crashes
15
  if tok.pad_token_id is None:
 
16
  if tok.eos_token_id is not None:
17
  tok.pad_token = tok.eos_token
18
  elif tok.bos_token_id is not None:
 
35
  def build_model_input(tokenizer, personality, level, topic):
36
  user_msg = build_instructions(personality, level, topic)
37
 
 
38
  if hasattr(tokenizer, "apply_chat_template"):
39
  messages = [
40
  {"role": "system", "content": "You are a helpful Cardano Plutus tutor."},
41
  {"role": "user", "content": user_msg},
42
  ]
 
43
  prompt_str = tokenizer.apply_chat_template(
44
  messages,
45
  tokenize=False,
 
47
  )
48
  return prompt_str
49
  else:
 
50
  return (
51
  "System: You are a helpful Cardano Plutus tutor.\n\n"
52
  f"User: {user_msg}\n\nAssistant:"
53
  )
54
 
55
+ # ------------ GPU/CPU generation ------------
56
  @spaces.GPU
57
+ def generate_on_gpu(personality, level, topic, max_new_tokens=100, min_new_tokens=32):
58
+ # Log GPU availability for debugging
59
+ print(f"CUDA available: {torch.cuda.is_available()}")
60
+ if torch.cuda.is_available():
61
+ print(f"GPU device: {torch.cuda.get_device_name(0)}")
62
+
63
  tokenizer = get_tokenizer()
64
  prompt = build_model_input(tokenizer, personality, level, topic)
65
 
 
66
  try:
67
+ # Try loading model on GPU with 4-bit quantization
68
  model = AutoModelForCausalLM.from_pretrained(
69
  MODEL_NAME,
70
  load_in_4bit=True,
71
  device_map="auto",
72
  )
73
+ device = next(model.parameters()).device
74
+ except Exception as e:
75
+ print(f"GPU loading failed: {e}. Falling back to CPU.")
76
+ # Fallback to CPU with FP16
77
  model = AutoModelForCausalLM.from_pretrained(
78
  MODEL_NAME,
79
  torch_dtype=torch.float16,
80
+ device_map="cpu",
81
  )
82
+ device = torch.device("cpu")
83
+
84
  model.eval()
 
 
85
  inputs = tokenizer(prompt, return_tensors="pt")
86
  input_len = inputs["input_ids"].shape[1]
87
  inputs = {k: v.to(device) for k, v in inputs.items()}
 
90
  outputs = model.generate(
91
  **inputs,
92
  max_new_tokens=max_new_tokens,
93
+ min_new_tokens=min_new_tokens,
94
+ temperature=0.5,
95
+ top_p=0.95,
96
  do_sample=True,
97
  repetition_penalty=1.05,
98
  eos_token_id=tokenizer.eos_token_id,
99
  pad_token_id=tokenizer.pad_token_id,
100
  )
101
 
102
+ # Decode and clean up
103
  gen_ids = outputs[0][input_len:]
104
  text = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
105
  if not text:
106
  text = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
 
107
  if text.startswith(prompt):
108
  text = text[len(prompt):].lstrip()
109
 
110
+ # Cleanup
111
  try:
112
  del model
113
  if torch.cuda.is_available():
 
115
  except Exception:
116
  pass
117
 
118
+ return text if text else "Generation failed. Try regenerating or adjusting parameters."
 
 
 
 
119
 
120
+ # ------------ Orchestrator with retry logic ------------
121
+ def orchestrator(personality, level, topic, max_retries=3):
122
  if not personality or not level or not topic:
123
  return "Select your personality, expertise, and topic to get a tailored explanation."
124
+
125
+ for attempt in range(max_retries):
126
+ try:
127
+ return generate_on_gpu(personality, level, topic)
128
+ except Exception as e:
129
+ print(f"[Attempt {attempt + 1}/{max_retries}] ZeroGPU error: {type(e).__name__}: {e}")
130
+ if attempt == max_retries - 1:
131
+ return (
132
+ "GPU was not available after multiple attempts. "
133
+ "Click **Regenerate** or try again later."
134
+ )
135
 
136
  # ------------ Gradio UI ------------
137
  with gr.Blocks(theme="default") as iface:
138
  gr.Markdown(
139
  "## Cardano Plutus AI Assistant\n"
140
+ "Pick your **Learning Personality**, **Expertise Level**, and **Topic**, then click **Generate**."
 
141
  )
142
 
143
  with gr.Row():
 
177
  )
178
 
179
  with gr.Row():
180
+ generate_btn = gr.Button("Generate")
181
  regen = gr.Button("🔁 Regenerate")
182
 
183
  output = gr.Textbox(
 
188
  placeholder="Your tailored explanation will appear here…",
189
  )
190
 
191
+ generate_btn.click(orchestrator, [personality, level, topic], output, queue=True)
 
 
 
 
 
 
 
192
  regen.click(orchestrator, [personality, level, topic], output, queue=True)
193
 
194
+ # Enable queue
195
  iface.queue()
196
 
197
  if __name__ == "__main__":
198
+ iface.launch(server_name="0.0.0.0", server_port=7860)