""" Feynman Explainer — Gradio Chat App Runs on Hugging Face Spaces (CPU free tier). """ import gradio as gr import torch from transformers import AutoModelForCausalLM, AutoTokenizer MODEL_ID = "shabul/qwen2.5-3b-feynman-explainer" SYSTEM_PROMPT = ( "You are a Feynman-style explainer. For every question, build intuition " "from the ground up using concrete analogies and everyday language. " "No jargon until it's earned. No bullet points. Pure flowing prose. " "Be conversational and enthusiastic — like Feynman genuinely loved this topic." ) print(f"Loading model: {MODEL_ID}") tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) model = AutoModelForCausalLM.from_pretrained( MODEL_ID, torch_dtype=torch.float32, low_cpu_mem_usage=True, ) model.eval() print("Model loaded.") def respond(message: str, history: list): try: messages = [{"role": "system", "content": SYSTEM_PROMPT}] for h in history: role = h.get("role") if isinstance(h, dict) else getattr(h, "role", None) content = h.get("content") if isinstance(h, dict) else getattr(h, "content", None) if role and content: messages.append({"role": role, "content": str(content)}) messages.append({"role": "user", "content": message}) encoded = tokenizer.apply_chat_template( messages, tokenize=True, add_generation_prompt=True, return_tensors="pt", return_dict=True, ) prompt_len = encoded["input_ids"].shape[1] with torch.no_grad(): output_ids = model.generate( **encoded, max_new_tokens=100, do_sample=True, temperature=0.75, repetition_penalty=1.1, ) response = tokenizer.decode( output_ids[0][prompt_len:], skip_special_tokens=True, ) return response except Exception as e: import traceback err = traceback.format_exc() print(err) return f"⚠️ TRACEBACK:\n{err}" demo = gr.ChatInterface( fn=respond, type="messages", title="🔬 Feynman Explainer", description=( "Ask anything. Feynman-style explanations — analogy first, no jargon until it's earned.\n\n" "⏱️ **CPU only** — responses take 2–4 minutes. First token appears after ~30s." ), examples=[ "How does gradient descent actually work?", "What is entropy and why does it always increase?", "What is a p-value?", "Why does ice float on water?", "How does attention work in language models?", ], cache_examples=False, ) demo.launch()