shabul committed on
Commit
4c1ec71
·
verified ·
1 Parent(s): d3b3ea3

Deploy Feynman Explainer Gradio app (app.py)

Browse files
Files changed (1) hide show
  1. app.py +155 -0
app.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Feynman Explainer — Gradio Chat App
3
+ Runs on Hugging Face Spaces (CPU free tier).
4
+
5
+ Loads qwen2.5-3b-feynman-explainer on CPU with a CPU-safe dtype.
6
+ Streams tokens for a responsive ChatGPT-like experience.
7
+ """
8
+
9
+ import threading
10
+
11
+ try:
12
+ import spaces # HF Spaces ZeroGPU shim — no-op on CPU tier
13
+ except ImportError:
14
+ pass
15
+
16
+ import gradio as gr
17
+ import torch
18
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
19
+
20
# Hugging Face Hub repository the tokenizer and model weights are loaded from.
MODEL_ID = "shabul/qwen2.5-3b-feynman-explainer"

# System message placed first in every conversation sent to the model.
SYSTEM_PROMPT = (
    "You are a Feynman-style explainer. For every question, build intuition "
    "from the ground up using concrete analogies and everyday language. "
    "No jargon until it's earned. No bullet points. Pure flowing prose. "
    "Be conversational and enthusiastic — like Feynman genuinely loved this topic."
)

# Page title and Markdown intro rendered at the top of the app.
TITLE = "🔬 Feynman Explainer"
DESCRIPTION = """
**Ask anything.** This model explains concepts the way Richard Feynman did —
starting with a concrete analogy, building intuition from scratch, never hiding
behind jargon.

*Built by [Shabul Abdul](https://huggingface.co/shabul), Sr. Data Scientist.
Fine-tuned on Apple M5 MacBook Pro using [MLX](https://github.com/ml-explore/mlx).*

> *"You don't understand something unless you can explain it simply."* — Feynman

---
⏱️ **CPU only** — responses take 20–40 seconds. Worth the wait.
"""

# Example prompts offered in the chat UI; one single-element row per example.
EXAMPLES = [
    ["How does gradient descent actually work?"],
    ["What is entropy and why does it always increase?"],
    ["What is a p-value and why do people misuse it?"],
    ["Why does ice float on water?"],
    ["What is the bias-variance tradeoff?"],
    ["How does attention work in language models?"],
    ["What is a derivative?"],
    ["Why does compounding interest feel like magic?"],
]
54
+
55
# Load the tokenizer and weights once, at import time, so every chat request
# reuses the same in-memory model.
print(f"Loading model: {MODEL_ID}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = (
    AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float32,  # full precision, chosen as the CPU-safe dtype
    )
    .to("cpu")
    .eval()  # inference mode: disables dropout and similar training behavior
)
print("Model loaded.")
65
+
66
+
67
def respond(message: str, history: list[dict], max_new_tokens: int, temperature: float):
    """Stream a reply to ``message`` as a progressively growing string.

    Args:
        message: The user's newest chat message.
        history: Earlier turns as dicts carrying ``"role"`` and ``"content"``
            keys; any other keys on the dicts are ignored.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature; sampling is enabled only when it
            is greater than zero.

    Yields:
        The reply accumulated so far, once per chunk emitted by the streamer.

    Raises:
        gr.Error: If generation fails in the background thread.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    messages.extend(
        {"role": turn["role"], "content": turn["content"]} for turn in history
    )
    messages.append({"role": "user", "content": message})

    # return_dict=True also returns the attention mask, so generate() receives
    # it explicitly instead of having to infer it.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    do_sample = temperature > 0
    gen_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=int(max_new_tokens),  # UI sliders may deliver a float
        do_sample=do_sample,
        repetition_penalty=1.1,
    )
    if do_sample:
        # Temperature only matters when sampling; omit it for greedy decoding.
        gen_kwargs["temperature"] = temperature

    failures: list[Exception] = []

    def _generate() -> None:
        """Run generation; on failure, record the error and end the stream."""
        try:
            model.generate(**gen_kwargs)
        except Exception as exc:  # thread boundary: re-raised by the caller below
            failures.append(exc)
            # Without this the consumer loop would wait forever for a stop
            # signal that a crashed generate() never sends.
            streamer.end()

    worker = threading.Thread(target=_generate)
    worker.start()

    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial

    worker.join()

    if failures:
        raise gr.Error(f"Generation failed: {failures[0]}") from failures[0]
103
+
104
+
105
# Assemble the page: intro text, generation controls, the chat itself, and a
# footer with model credits.
with gr.Blocks(
    title=TITLE,
    theme=gr.themes.Soft(primary_hue="blue", neutral_hue="slate"),
    css=".gradio-container { max-width: 820px !important; margin: auto; }",
) as demo:

    gr.Markdown(f"# {TITLE}\n{DESCRIPTION}")

    # Generation controls; their values are passed to respond() as the
    # additional inputs of the chat interface below.
    with gr.Row():
        with gr.Column(scale=4):
            max_tokens = gr.Slider(
                100, 600, value=350, step=50,
                label="Max response length (tokens)",
            )
        with gr.Column(scale=4):
            temperature = gr.Slider(
                0.1, 1.2, value=0.75, step=0.05,
                label="Creativity (temperature)",
            )

    chat = gr.ChatInterface(
        fn=respond,
        additional_inputs=[max_tokens, temperature],
        examples=EXAMPLES,
        cache_examples=False,
        type="messages",
        chatbot=gr.Chatbot(
            height=480,
            placeholder="<br><br><center>Ask me to explain anything — I'll make it simple.</center>",
            show_label=False,
        ),
        textbox=gr.Textbox(
            placeholder="e.g. How does a neural network learn?",
            container=False,
            scale=7,
        ),
        submit_btn="Explain →",
        retry_btn="Try again",
        undo_btn="Undo",
        clear_btn="Clear chat",
    )

    gr.Markdown(
        "---\n"
        "🧠 Model: [`shabul/qwen2.5-3b-feynman-explainer`](https://huggingface.co/shabul/qwen2.5-3b-feynman-explainer) · "
        "📦 Base: `Qwen/Qwen2.5-3B-Instruct` · "
        "🍎 Trained on Apple Silicon with [mlx-lm](https://github.com/ml-explore/mlx-lm)"
    )

if __name__ == "__main__":
    demo.launch()