File size: 2,755 Bytes
959887d
4c1ec71
 
 
 
 
 
d062dd2
4c1ec71
 
 
 
 
 
 
 
 
 
 
 
 
 
6628314
4c1ec71
 
 
 
 
 
e10b677
775c27a
 
 
e10b677
 
 
 
775c27a
 
959887d
775c27a
 
 
 
959887d
d062dd2
959887d
d062dd2
775c27a
 
959887d
e10b677
 
 
775c27a
 
 
 
959887d
775c27a
 
e10b677
 
775c27a
 
 
 
8abb226
e10b677
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c1ec71
e10b677
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
"""
Feynman Explainer — Gradio Chat App
Runs on Hugging Face Spaces (CPU free tier).
"""

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub repository id of the checkpoint this app loads.
MODEL_ID = "shabul/qwen2.5-3b-feynman-explainer"

# Instruction used as the system message of every conversation. Kept as one
# sentence per entry; the entries are joined with single spaces.
SYSTEM_PROMPT = " ".join(
    (
        "You are a Feynman-style explainer.",
        "For every question, build intuition from the ground up using concrete analogies and everyday language.",
        "No jargon until it's earned.",
        "No bullet points.",
        "Pure flowing prose.",
        "Be conversational and enthusiastic — like Feynman genuinely loved this topic.",
    )
)

# Load the tokenizer and the weights once, at import time, so every chat
# request reuses the same objects.
print(f"Loading model: {MODEL_ID}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Full-precision weights, loaded with the low-memory loading path enabled.
_load_options = {
    "torch_dtype": torch.float32,
    "low_cpu_mem_usage": True,
}
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **_load_options)

# The model is only used for generation, so put it in evaluation mode.
model.eval()
print("Model loaded.")


def respond(message: str, history: list):
    """Generate one assistant reply for the chat UI.

    The conversation handed to the model is the system prompt, followed by
    every entry of ``history`` that has both a role and content, followed by
    the new user ``message``.

    Args:
        message: Text of the latest user turn.
        history: Earlier turns. Each entry is read either as a dict with
            ``"role"`` / ``"content"`` keys or as an object exposing ``role``
            and ``content`` attributes.

    Returns:
        The decoded text of the tokens produced after the prompt. If anything
        raises, the traceback is printed and returned as a string prefixed
        with ``"⚠️ TRACEBACK:"`` instead of propagating.
    """

    def _field(turn, key):
        # History entries may be plain dicts or attribute-style objects.
        if isinstance(turn, dict):
            return turn.get(key)
        return getattr(turn, key, None)

    try:
        conversation = [{"role": "system", "content": SYSTEM_PROMPT}]
        for turn in history:
            speaker, text = _field(turn, "role"), _field(turn, "content")
            if not (speaker and text):
                continue  # skip entries lacking a usable role or content
            conversation.append({"role": speaker, "content": str(text)})
        conversation.append({"role": "user", "content": message})

        model_inputs = tokenizer.apply_chat_template(
            conversation,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        )
        # The slice further down drops this many leading tokens so that only
        # text after the prompt is decoded.
        n_prompt_tokens = model_inputs["input_ids"].shape[1]

        sampling = {
            "max_new_tokens": 100,
            "do_sample": True,
            "temperature": 0.75,
            "repetition_penalty": 1.1,
        }
        with torch.no_grad():
            generated = model.generate(**model_inputs, **sampling)

        return tokenizer.decode(
            generated[0][n_prompt_tokens:],
            skip_special_tokens=True,
        )

    except Exception:
        # Show the failure in the chat window (and the server log) rather than
        # letting the exception propagate to the UI.
        import traceback

        trace_text = traceback.format_exc()
        print(trace_text)
        return f"⚠️ TRACEBACK:\n{trace_text}"


# Chat UI wired to `respond`. The description has to match how `respond`
# behaves: it returns one finished string (it is not a generator), so nothing
# is shown in the chat until generation has completed.
demo = gr.ChatInterface(
    fn=respond,
    type="messages",  # `respond` reads role/content from each history entry
    title="🔬 Feynman Explainer",
    description=(
        "Ask anything. Feynman-style explanations — analogy first, no jargon until it's earned.\n\n"
        "⏱️ **CPU only** — responses take 2–4 minutes and appear all at once "
        "when generation finishes (no streaming)."
    ),
    examples=[
        "How does gradient descent actually work?",
        "What is entropy and why does it always increase?",
        "What is a p-value?",
        "Why does ice float on water?",
        "How does attention work in language models?",
    ],
    # Example prompts are not cached.
    cache_examples=False,
)

demo.launch()