Remostart committed
Commit 24082be · verified · Parent: 6b5e102

Update app.py

Files changed (1)
  1. app.py +110 -80
app.py CHANGED
@@ -1,57 +1,75 @@
+ import threading
  import gradio as gr
  import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
  import spaces

  MODEL_NAME = "ubiodee/Plutus_Tutor_new"

- # --------- Lightweight utilities ----------
+ # ---------------- Utilities ----------------
+
+ _TOKENIZER = None
+ def get_tokenizer():
+     global _TOKENIZER
+     if _TOKENIZER is None:
+         tok = AutoTokenizer.from_pretrained(MODEL_NAME)
+         if tok.pad_token_id is None:
+             tok.pad_token = tok.eos_token
+         _TOKENIZER = tok
+     return _TOKENIZER
+
  def build_prompt(personality, level, topic):
+     # Conversational, clear, and bounded length to reduce truncation
      return (
-         f"You are a Plutus AI Assistant tailored for a {personality} learner "
-         f"at {level} level, focusing on {topic}. Provide a clear, concise, "
-         f"and tailored explanation of {topic}, suitable for the specified personality and expertise level."
+         f"You are a friendly Plutus AI tutor for a {personality} learner at {level} level.\n"
+         f"Topic: {topic}\n\n"
+         f"Explain in a conversational tone, with simple language and concrete examples.\n"
+         f"Keep it focused and complete in about 120–180 words.\n"
+         f"End with a single-sentence takeaway starting with 'Takeaway:'.\n"
      )

- def _ensure_tokenizer():
-     tok = AutoTokenizer.from_pretrained(MODEL_NAME)
-     if tok.pad_token_id is None:
-         tok.pad_token = tok.eos_token
-     return tok
-
- # CPU fallback (slow, but prevents total failure)
- def generate_cpu(personality, level, topic, max_new_tokens=250):
-     tokenizer = _ensure_tokenizer()
-     prompt = build_prompt(personality, level, topic)
+ # ---------------- Streaming paths ----------------
+
+ def from_cpu_stream(prompt, max_new_tokens=200):
+     tokenizer = get_tokenizer()
+     # Load on CPU (fallback)
+     model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+     model.eval()
+
      inputs = tokenizer(prompt, return_tensors="pt")
-     # Small settings for CPU to avoid long stalls
-     with torch.inference_mode():
-         model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)  # CPU load
-         model.eval()
-         outputs = model.generate(
-             **inputs,
-             max_new_tokens=min(max_new_tokens, 128),
-             temperature=0.2,
-             top_p=0.9,
-             do_sample=True,
-             eos_token_id=tokenizer.eos_token_id,
-             pad_token_id=tokenizer.pad_token_id,
-         )
-     text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-     if text.startswith(prompt):
-         text = text[len(prompt):].strip()
-     return text
+
+     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
+     gen_kwargs = dict(
+         **inputs,
+         max_new_tokens=max_new_tokens,
+         temperature=0.2,
+         top_p=0.9,
+         do_sample=True,
+         eos_token_id=tokenizer.eos_token_id,
+         pad_token_id=tokenizer.pad_token_id,
+         streamer=streamer,
+     )
+
+     t = threading.Thread(target=model.generate, kwargs=gen_kwargs)
+     t.start()
+
+     buffer = ""
+     for piece in streamer:
+         buffer += piece
+         # Trim echoed prompt once
+         if buffer.startswith(prompt):
+             buffer = buffer[len(prompt):].lstrip()
+         yield buffer

  @spaces.GPU
- def generate_gpu(personality, level, topic, max_new_tokens=1000):
+ def _gpu_generate_stream(prompt, max_new_tokens):
      """
-     Runs ONLY under a granted GPU.
-     Loads the model in 4-bit to fit ZeroGPU VRAM, generates, then frees VRAM.
+     This function runs ONLY when ZeroGPU grants a GPU.
+     We create a nested generator so Gradio can stream tokens.
      """
-     tokenizer = _ensure_tokenizer()
-     prompt = build_prompt(personality, level, topic)
+     tokenizer = get_tokenizer()

-     # Prefer 4-bit to minimize VRAM on ZeroGPU
+     # Prefer 4-bit to reduce VRAM; if not available, fall back to fp16
      try:
          model = AutoModelForCausalLM.from_pretrained(
              MODEL_NAME,
@@ -59,33 +77,38 @@ def generate_gpu(personality, level, topic, max_new_tokens=1000):
              device_map="auto",
          )
      except Exception:
-         # If 4-bit isn’t available for this arch, fallback to fp16 on GPU
          model = AutoModelForCausalLM.from_pretrained(
              MODEL_NAME,
              torch_dtype=torch.float16,
              device_map="auto",
          )
-
      model.eval()

      device = next(model.parameters()).device
      inputs = tokenizer(prompt, return_tensors="pt")
      inputs = {k: v.to(device) for k, v in inputs.items()}

-     with torch.inference_mode():
-         outputs = model.generate(
-             **inputs,
-             max_new_tokens=max_new_tokens,
-             temperature=0.3,
-             top_p=0.3,
-             do_sample=True,
-             eos_token_id=tokenizer.eos_token_id,
-             pad_token_id=tokenizer.pad_token_id,
-         )
+     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
+     gen_kwargs = dict(
+         **inputs,
+         max_new_tokens=max_new_tokens,
+         temperature=0.15,
+         top_p=0.9,
+         do_sample=True,
+         eos_token_id=tokenizer.eos_token_id,
+         pad_token_id=tokenizer.pad_token_id,
+         streamer=streamer,
+     )

-     text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-     if text.startswith(prompt):
-         text = text[len(prompt):].strip()
+     t = threading.Thread(target=model.generate, kwargs=gen_kwargs)
+     t.start()
+
+     buffer = ""
+     for piece in streamer:
+         buffer += piece
+         if buffer.startswith(prompt):
+             buffer = buffer[len(prompt):].lstrip()
+         yield buffer

      # Free VRAM ASAP
      try:
@@ -95,24 +118,28 @@ def generate_gpu(personality, level, topic, max_new_tokens=1000):
      except Exception:
          pass

-     return text
-
- def orchestrator(personality, level, topic):
-     # Don’t run until all selections are made
+ def orchestrator_stream(personality, level, topic):
      if not personality or not level or not topic:
-         return "Select your personality, expertise, and topic to get a tailored explanation."
+         yield "Select your personality, expertise, and topic to get a tailored explanation."
+         return

-     # Try GPU path first; if ZeroGPU refuses/throws, fallback to CPU
+     prompt = build_prompt(personality, level, topic)
+
+     # Try GPU streaming first
      try:
-         return generate_gpu(personality, level, topic)
-     except RuntimeError as e:
-         # Typical ZeroGPU worker errors show here – fall back gracefully
-         return f"(GPU unavailable, using CPU fallback)\n\n{generate_cpu(personality, level, topic)}"
+         for chunk in _gpu_generate_stream(prompt, max_new_tokens=240):
+             yield chunk
+         return
      except Exception as e:
-         # Any other unexpected issue try CPU anyway
-         return f"(GPU error: {type(e).__name__})\n\n{generate_cpu(personality, level, topic)}"
+         # Log server-side; do NOT show to user
+         print(f"[GPU fallback] {type(e).__name__}: {e}")
+
+     # CPU streaming fallback
+     for chunk in from_cpu_stream(prompt, max_new_tokens=200):
+         yield chunk
+
+ # ---------------- Gradio UI ----------------

- # --------- Gradio UI ----------
  with gr.Blocks(theme="default") as iface:
      gr.Markdown(
          "## Cardano Plutus AI Assistant\n"
@@ -126,14 +153,14 @@ with gr.Blocks(theme="default") as iface:
              label="Learning Personality",
              value=None,
              allow_custom_value=False,
-             scale=1
+             scale=1,
          )
          level = gr.Dropdown(
              choices=["Beginner", "Intermediate", "Advanced"],
              label="Expertise Level",
              value=None,
              allow_custom_value=False,
-             scale=1
+             scale=1,
          )
          topic = gr.Dropdown(
              choices=[
@@ -153,26 +180,29 @@ with gr.Blocks(theme="default") as iface:
              label="Topic",
              value=None,
              allow_custom_value=False,
-             scale=2
+             scale=2,
          )

      with gr.Row():
-         regen = gr.Button("🔁 Regenerate")
-         output = gr.Textbox(label="Model Response", lines=12, interactive=False, show_copy_button=True)
+         regen = gr.Button("🔁 Generate")

-     # Auto-generate when any dropdown changes (only once all three have values)
-     def _maybe_generate(p, l, t):
-         if p and l and t:
-             return orchestrator(p, l, t)
-         return "Select your personality, expertise, and topic to get a tailored explanation."
+     output = gr.Textbox(
+         label="Model Response",
+         lines=12,
+         interactive=False,
+         show_copy_button=True
+     )

-     personality.change(_maybe_generate, [personality, level, topic], output, queue=True)
-     level.change(_maybe_generate, [personality, level, topic], output, queue=True)
-     topic.change(_maybe_generate, [personality, level, topic], output, queue=True)
-     regen.click(orchestrator, [personality, level, topic], output, queue=True)
+     # Auto-generate whenever any dropdown changes (after all 3 are set)
+     def _maybe_stream(p, l, t):
+         return orchestrator_stream(p, l, t)

+     personality.change(_maybe_stream, [personality, level, topic], output, queue=True)
+     level.change(_maybe_stream, [personality, level, topic], output, queue=True)
+     topic.change(_maybe_stream, [personality, level, topic], output, queue=True)
+     regen.click(_maybe_stream, [personality, level, topic], output, queue=True)

- # Enable request queueing (simpler call for older Gradio)
+ # Enable request queueing (use simple call for wider Gradio compatibility)
  iface.queue()

  if __name__ == "__main__":
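
Note: the 4-bit quantization arguments that the "Prefer 4-bit" comment in _gpu_generate_stream refers to sit on lines outside the hunks shown above, so the exact call is not visible in this diff. As a rough sketch only (not the Space's actual code), a 4-bit load with an fp16 fallback in transformers typically looks like the following; the BitsAndBytesConfig values are illustrative assumptions:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

MODEL_NAME = "ubiodee/Plutus_Tutor_new"

def load_model_4bit_or_fp16():
    """Sketch: try a 4-bit (bitsandbytes) load, fall back to fp16 if it fails."""
    try:
        quant_cfg = BitsAndBytesConfig(
            load_in_4bit=True,                     # quantize weights to 4-bit
            bnb_4bit_compute_dtype=torch.float16,  # run compute in fp16
        )
        return AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            quantization_config=quant_cfg,
            device_map="auto",
        )
    except Exception:
        # bitsandbytes missing or unsupported: plain fp16 load on the GPU
        return AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float16,
            device_map="auto",
        )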
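
Note: the body of the "# Free VRAM ASAP" try/except also falls in lines elided between hunks, so what the commit actually does there is not shown. A common cleanup pattern in this situation (an assumption, not a quote from this file) is to drop the model reference and then release PyTorch's cached CUDA memory:

import gc
import torch

def release_gpu_memory():
    """Sketch: reclaim cached CUDA memory once the model reference is gone."""
    gc.collect()                      # collect tensors that no longer have references
    if torch.cuda.is_available():
        torch.cuda.empty_cache()      # hand cached CUDA blocks back to the allocator

# Typical use at the end of the @spaces.GPU function (assumed, not shown in the diff):
#     del model
#     release_gpu_memory()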
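
One design note on the two streaming loops: both trim the echoed prompt by hand with buffer.startswith(prompt). TextIteratorStreamer can skip the prompt itself via its skip_prompt flag, which would make that branch unnecessary; a minimal sketch of the alternative construction:

from transformers import AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("ubiodee/Plutus_Tutor_new")

# With skip_prompt=True the streamer only emits newly generated tokens,
# so the consumer never sees the prompt and no startswith() trim is needed.
streamer = TextIteratorStreamer(
    tokenizer,
    skip_prompt=True,
    skip_special_tokens=True,
)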