import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread

# ------------------------------------------------------------------
# 1. Model setup
# ------------------------------------------------------------------
MODEL_ID = "michsethowusu/opani-coder_1b-merged-16bit"

print("Loading tokenizer…")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

print("Loading model…")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,   # fp16 weights: halves memory vs fp32
    device_map="auto",           # put layers on GPU automatically when one exists
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
print("Model ready!")


# ------------------------------------------------------------------
# 2. Generation helper
# ------------------------------------------------------------------
def generate_response(
    message: str,
    history: list[dict],
    temperature: float,
    top_p: float,
    top_k: int,
    max_tokens: int,
):
    """Stream an assistant reply for *message* given the prior *history*.

    Parameters
    ----------
    message : str
        The newest user message.
    history : list[dict]
        Previous turns in ``{"role": "user"|"assistant", "content": "…"}``
        format (the chat-template "messages" convention).
    temperature, top_p, top_k, max_tokens
        Sampling parameters forwarded to ``model.generate``.

    Yields
    ------
    str
        The accumulated partial assistant reply after each streamed chunk.
    """
    messages = history + [{"role": "user", "content": message}]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Stream decoded text as it is produced; skip the echoed prompt and
    # special tokens so only the assistant's reply comes through.
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    gen_kwargs = dict(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        do_sample=True,
        streamer=streamer,
    )

    # model.generate() blocks until done, so run it in a worker thread and
    # consume the streamer from this generator as tokens arrive.
    thread = Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial
    thread.join()
# ------------------------------------------------------------------
# 3. Gradio event helpers
# ------------------------------------------------------------------
def user_submit(user_message, history):
    """Append the user's message to *history* (list of role/content dicts)
    and clear the input textbox."""
    return "", history + [{"role": "user", "content": user_message}]


def bot_respond(history, temperature, top_p, top_k, max_tokens):
    """Stream the assistant's reply into the chat history.

    *history* arrives with the newest user turn as its last element
    (appended by ``user_submit``).  Yields the updated history after each
    streamed chunk so the Chatbot re-renders incrementally.
    """
    user_turn = history[-1]["content"]
    history_before = history[:-1]

    # Fix: the original assigned history[-1] = {assistant turn}, which
    # overwrote the user's just-submitted message.  Append a placeholder
    # assistant turn instead and update its content in place.
    history.append({"role": "assistant", "content": ""})
    for assistant_text in generate_response(
        user_turn, history_before, temperature, top_p, top_k, max_tokens
    ):
        history[-1]["content"] = assistant_text
        yield history


# ------------------------------------------------------------------
# 4. Gradio UI
# ------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🇬🇭 Opani Coder 1B
        A fine-tuned Llama 3.2 1B model (16-bit) for coding assistance in Twi.
        Ask me anything about programming, and I'll help you out!
        """
    )

    chatbot = gr.Chatbot(
        height=500,
        label="Chat History",
        type="messages",  # history is a list of {"role", "content"} dicts
        avatar_images=(
            None,
            "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
        ),
    )

    with gr.Row():
        msg = gr.Textbox(
            label="Your Message",
            placeholder="Ask me a coding question…",
            scale=4,
            lines=2,
        )
        submit = gr.Button("Send 🚀", scale=1, variant="primary")

    with gr.Accordion("⚙️ Generation Parameters", open=False):
        gr.Markdown("*Adjust these settings to control the response style*")
        temperature = gr.Slider(0.1, 2.0, 0.7, step=0.1, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, 0.9, step=0.05, label="Top P")
        top_k = gr.Slider(1, 100, 20, step=1, label="Top K")
        max_tokens = gr.Slider(64, 2048, 512, step=64, label="Max Tokens")

    clear = gr.Button("🗑️ Clear Chat")

    # ------------------------------------------------------------------
    # 5. Examples
    # ------------------------------------------------------------------
    gr.Examples(
        examples=[
            ["Meyɛ dɛn na mekyerɛw Python function?"],
            ["Kyerɛkyerɛ nea for loop yɛ"],
            ["Kyerɛw calculator program a ɛnyɛ den"],
            ["Nsonoe bɛn na ɛda list ne tuple ntam?"],
            ["Boa me ma mensiesie saa code yi mu mfomso"],
        ],
        inputs=msg,
        label="Example Questions",
    )

    # ------------------------------------------------------------------
    # 6. Event wiring
    # ------------------------------------------------------------------
    # Submit (Enter key or Send button): first append the user turn
    # synchronously, then stream the bot reply into the chatbot.
    msg.submit(
        user_submit, [msg, chatbot], [msg, chatbot], queue=False
    ).then(
        bot_respond,
        [chatbot, temperature, top_p, top_k, max_tokens],
        chatbot,
    )
    submit.click(
        user_submit, [msg, chatbot], [msg, chatbot], queue=False
    ).then(
        bot_respond,
        [chatbot, temperature, top_p, top_k, max_tokens],
        chatbot,
    )
    clear.click(lambda: None, None, chatbot, queue=False)

    gr.Markdown(
        """
        ---
        ### 💡 Tips for Best Results:
        - **Factual/Technical questions**: temperature 0.3-0.5
        - **Creative coding solutions**: temperature 0.7-1.0
        - **Code generation**: temperature 0.5-0.7

        ### 📝 About This Model
        Fine-tuned Llama 3.2 1B (16-bit full model) for coding assistance in Twi.

        **Model**: [michsethowusu/opani-coder_1b-merged-16bit](https://huggingface.co/michsethowusu/opani-coder_1b-merged-16bit)
        """
    )

# ------------------------------------------------------------------
# 7. Launch
# ------------------------------------------------------------------
if __name__ == "__main__":
    demo.queue().launch()