"""Minimal Phi-3 chat app for Hugging Face Spaces (CPU, llama-cpp-python)."""

import gradio as gr
from llama_cpp import Llama

# Download the quantized GGUF from the Hub on first run and load it.
# NOTE: Llama(model_path=...) expects a *local* file path; a bare
# "repo/file.gguf" string does not download anything, so use the
# built-in Hugging Face downloader instead.
llm = Llama.from_pretrained(
    repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
    filename="Phi-3-mini-4k-instruct-q4.gguf",
    n_ctx=2048,     # context window
    n_threads=2,    # match free-tier CPU count
    verbose=False,
)


def chat(message, history):
    """Generate one assistant reply for ``gr.ChatInterface``.

    Parameters
    ----------
    message : str
        The latest user message.
    history : list
        Prior turns supplied by ChatInterface — either ``(user, assistant)``
        tuples or openai-style ``{"role", "content"}`` dicts depending on
        the installed Gradio version; both are handled below.

    Returns
    -------
    str
        The assistant's reply. ChatInterface expects the bare response
        string — not a ``(history, "")`` tuple — and manages the chat
        history itself.
    """
    # Rebuild the Phi-3 chat template from the conversation so the model
    # actually sees prior turns (previously only the latest message was
    # included in the prompt).
    parts = []
    for turn in history:
        if isinstance(turn, dict):  # messages format
            role = "user" if turn.get("role") == "user" else "assistant"
            parts.append(f"<|{role}|>\n{turn.get('content', '')}<|end|>\n")
        else:  # (user, assistant) tuple format
            user_turn, assistant_turn = turn
            parts.append(f"<|user|>\n{user_turn}<|end|>\n")
            if assistant_turn:
                parts.append(f"<|assistant|>\n{assistant_turn}<|end|>\n")
    parts.append(f"<|user|>\n{message}<|end|>\n<|assistant|>\n")
    prompt = "".join(parts)

    output = llm(
        prompt,
        max_tokens=256,
        temperature=0.7,
        stop=["<|end|>"],
    )
    return output["choices"][0]["text"].strip()


# Gradio UI — ChatInterface wires the chatbot, textbox, and state together.
demo = gr.ChatInterface(
    fn=chat,
    title="Fast Phi-3 Chat",
    description="Quick responses on free HF Spaces!",
)

if __name__ == "__main__":
    demo.launch()