"""Minimal Gradio chat UI backed by a local TinyLlama GGUF model (ctransformers)."""

import gradio as gr
from ctransformers import AutoModelForCausalLM

MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
MODEL_FILE = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"

# The model is loaded once at import time and shared by every chat request.
# NOTE(review): context_length=4096 may exceed the window this model was
# trained with -- confirm against the model card before relying on long chats.
llm = AutoModelForCausalLM.from_pretrained(
    MODEL_REPO,
    model_file=MODEL_FILE,
    model_type="llama",
    gpu_layers=0,
    context_length=4096,
)


def _as_text(content) -> str:
    """Flatten one chat payload (str, None, dict, or list of parts) to text.

    Non-text parts (dicts without a "text" key) contribute nothing.
    """
    if content is None:
        # An unanswered turn must not leak the literal "None" into the prompt.
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        return _as_text(content.get("text"))
    if isinstance(content, (list, tuple)):
        return "\n".join(part for part in map(_as_text, content) if part)
    return str(content)


def _history_to_pairs(history):
    """Normalize chat history into a list of (user_text, assistant_text) pairs.

    Accepts both history layouts:
      * pair style:     [[user, assistant], ...]
      * messages style: [{"role": "user" | "assistant", "content": ...}, ...]
    Entries matching neither layout are skipped.
    """
    pairs = []
    pending_user = None  # user message still waiting for its assistant reply
    for turn in history or []:
        if isinstance(turn, dict):
            role = turn.get("role")
            text = _as_text(turn.get("content"))
            if role == "user":
                if pending_user is not None:
                    # Two user messages in a row: close the first one unanswered.
                    pairs.append((pending_user, ""))
                pending_user = text
            elif role == "assistant":
                pairs.append((pending_user or "", text))
                pending_user = None
            # Any other role is ignored; the prompt template has no slot for it.
        elif isinstance(turn, (list, tuple)) and len(turn) >= 2:
            pairs.append((_as_text(turn[0]), _as_text(turn[1])))
    if pending_user is not None:
        pairs.append((pending_user, ""))
    return pairs


def respond(message: str, history) -> str:
    """Generate the assistant reply for ``message`` given prior ``history``.

    Args:
        message: The new user message.
        history: Prior conversation as supplied by ``gr.ChatInterface``, in
            either pair style or messages style (see ``_history_to_pairs``).

    Returns:
        The generated text. If the model call returns a dict with a "text"
        key, that value is returned; otherwise ``str()`` of the result.
    """
    # NOTE(review): the [INST] ... [/INST] markers look like the Llama-2 chat
    # convention; confirm this matches the template documented for this model.
    segments = [
        f"[INST]\n{user_msg}\n[/INST]\n{bot_msg}\n"
        for user_msg, bot_msg in _history_to_pairs(history)
    ]
    segments.append(f"[INST]\n{message}\n[/INST]")
    prompt = "".join(segments)

    out = llm(
        prompt,
        max_new_tokens=64,
        temperature=0.7,
        top_p=0.9,
    )
    if isinstance(out, dict) and "text" in out:
        return out["text"]
    return str(out)


demo = gr.ChatInterface(respond)

if __name__ == "__main__":
    demo.launch()