shabul's picture
Fix: use return_dict=True + **encoded for transformers >=4.47 compatibility
959887d verified
"""
Feynman Explainer — Gradio Chat App
Runs on Hugging Face Spaces (CPU free tier).
"""
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Identifier handed to `from_pretrained` for both the tokenizer and the model.
MODEL_ID = "shabul/qwen2.5-3b-feynman-explainer"

# System message placed at the start of every conversation sent to the model.
# The dash below is a real em dash (U+2014); it had previously been corrupted
# into mis-decoded UTF-8 bytes, which would be fed to the model verbatim.
SYSTEM_PROMPT = (
    "You are a Feynman-style explainer. For every question, build intuition "
    "from the ground up using concrete analogies and everyday language. "
    "No jargon until it's earned. No bullet points. Pure flowing prose. "
    "Be conversational and enthusiastic — like Feynman genuinely loved this topic."
)
# Load the tokenizer and the weights once, at import time, so that every chat
# request reuses the same in-memory objects.
print(f"Loading model: {MODEL_ID}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# `.eval()` returns the module itself, so it can be chained onto the load call;
# it switches the network to inference mode (e.g. dropout layers are disabled).
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
).eval()
print("Model loaded.")
def _build_messages(message: str, history: list) -> list:
    """Assemble the chat-template input: system prompt, prior turns, new user turn."""
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for turn in history or []:
        # History entries may be plain dicts or objects exposing the same
        # fields as attributes; read both shapes the same way.
        if isinstance(turn, dict):
            role, content = turn.get("role"), turn.get("content")
        else:
            role, content = getattr(turn, "role", None), getattr(turn, "content", None)
        # Entries lacking a role or with empty content are skipped.
        if role and content:
            messages.append({"role": role, "content": str(content)})
    messages.append({"role": "user", "content": message})
    return messages


def respond(message: str, history: list) -> str:
    """Produce one assistant reply for the chat UI.

    Args:
        message: The text the user just submitted.
        history: Earlier turns of the conversation; each entry is expected to
            carry a ``role`` and a ``content`` (as dict keys or attributes).
            ``None`` is treated as an empty history.

    Returns:
        The decoded completion; a short hint if ``message`` is blank; or a
        brief warning string if anything fails while generating. The function
        never raises for generation errors, so the UI always gets a string back.
    """
    # Generation is slow on CPU, so do not spend it on a blank prompt.
    if not message or (isinstance(message, str) and not message.strip()):
        return "Please type a question first."

    try:
        encoded = tokenizer.apply_chat_template(
            _build_messages(message, history),
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        )
        prompt_len = encoded["input_ids"].shape[1]
        with torch.no_grad():
            output_ids = model.generate(
                **encoded,
                max_new_tokens=100,
                do_sample=True,
                temperature=0.75,
                repetition_penalty=1.1,
            )
        # Drop the first `prompt_len` positions so only the tokens that follow
        # the prompt are decoded and shown to the user.
        return tokenizer.decode(
            output_ids[0][prompt_len:],
            skip_special_tokens=True,
        )
    except Exception as exc:  # top-level UI boundary: report, never crash the chat
        import traceback

        # Keep the full traceback in the process output for debugging, but do
        # not echo it into the public chat window (it exposes internal paths).
        print(traceback.format_exc())
        return (
            "⚠️ Sorry, something went wrong while generating a response "
            f"({type(exc).__name__}). Details were printed to the application log."
        )
# Chat UI wired to `respond`. With type="messages" the history handed to
# `respond` is a list of role/content entries, which is the shape it reads.
# The title and description use real Unicode characters; they had previously
# been corrupted into mis-decoded UTF-8 bytes that rendered as gibberish.
# NOTE(review): the description promises a first token after ~30s, but
# `respond` returns the whole reply at once rather than streaming — confirm
# whether the text or the handler should change.
demo = gr.ChatInterface(
    fn=respond,
    type="messages",
    title="🔬 Feynman Explainer",
    description=(
        "Ask anything. Feynman-style explanations — analogy first, no jargon until it's earned.\n\n"
        "⏱️ **CPU only** — responses take 2–4 minutes. First token appears after ~30s."
    ),
    examples=[
        "How does gradient descent actually work?",
        "What is entropy and why does it always increase?",
        "What is a p-value?",
        "Why does ice float on water?",
        "How does attention work in language models?",
    ],
    # Example outputs are not cached (cache_examples=False).
    cache_examples=False,
)
demo.launch()