Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from ctransformers import AutoModelForCausalLM | |
| import time | |
# Load the quantized GGUF model (optimized for CPU).
#
# Generation settings are passed as keyword arguments rather than as a dict in
# `config=`: ctransformers expects `config` to be an AutoConfig instance and
# accepts individual config values as extra keyword arguments instead.
# NOTE(review): confirm the repo id and model file name below exist on the Hub.
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/WizardCoder-Python-13B-GGUF",  # You can change to CodeLlama, Phind, etc.
    model_file="wizardcoder-python-13b.Q4_K_M.gguf",  # Use Q4_K_M for 16GB RAM
    model_type="llama",
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.9,
    stream=True,
)
def generate_response(message, history):
    """Stream the assistant's reply to ``message``.

    Args:
        message: The new user message.
        history: Earlier turns as ``(user, assistant)`` pairs. May be empty
            or ``None``. It is copied, never mutated.

    Yields:
        A list of ``[user, assistant]`` pairs. The last pair holds the new
        message and the reply text generated so far.
    """
    # Work on a copy so the caller's history object is left untouched.
    turns = [list(turn) for turn in (history or [])]

    # A missing assistant reply is rendered as empty text, not the word "None".
    past = "".join(
        f"<user>: {user}\n<assistant>: {bot or ''}\n" for user, bot in turns
    )
    prompt = f"{past}<user>: {message}\n<assistant>:"

    turns.append([message, ""])
    # Show the user's message right away; the first token can take a while.
    yield turns

    reply = ""
    # Ask for streaming explicitly instead of relying on the loader's config,
    # and stop at the next user marker so the model does not invent new turns.
    for chunk in llm(prompt, stream=True, stop=["<user>:"]):
        reply += chunk
        turns[-1][1] = reply
        yield turns
# Gradio UI
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Ask coding questions...", label="Your Message")
    clear = gr.Button("Clear")

    # The submit handler is a generator: every yielded history redraws the chat.
    msg.submit(generate_response, [msg, chatbot], chatbot)
    # Clearing just resets the chatbot to an empty history.
    clear.click(lambda: [], None, chatbot)

# Generator handlers need the queue enabled on Gradio 3.x; on newer versions
# the queue is already on by default and this call is harmless.
demo.queue()
demo.launch()