"""Gradio chat UI for the WithinUsAI IBM-Grok4 Ultra Fast Coder 1B GGUF model.

Loads a quantized GGUF model via llama-cpp-python and serves it through a
Gradio ChatInterface using the ChatML prompt format.
"""

import gradio as gr
from llama_cpp import Llama

# Load the model once at startup.
# NOTE: the wildcard matches any Q4_K_M quantization in the repo; change it
# to the exact "*.gguf" filename if the repo holds several matching files.
llm = Llama.from_pretrained(
    repo_id="WithinUsAI/IBM-Grok4-Ultra.Fast.Coder-1B-GGUF",
    filename="*Q4_K_M*",
    n_ctx=8192,
    n_threads=4,
    verbose=False,
)

SYSTEM_PROMPT = (
    "You are an ultra-fast, expert coding assistant. "
    "You write clean, efficient, and well-documented code."
)


def _history_to_pairs(history):
    """Normalize Gradio chat history to a list of (user, assistant) pairs.

    Gradio < 5 passes history as [(user, assistant), ...] tuples, while
    Gradio >= 5 ChatInterface defaults to the "messages" format:
    [{"role": "user"|"assistant", "content": ...}, ...].  The original code
    unpacked each entry as a 2-tuple, which on the messages format silently
    yields the key strings "role"/"content" and corrupts the prompt.

    Args:
        history: Chat history in either Gradio format (may be empty/None).

    Returns:
        List of (user_message, assistant_message) string pairs.
    """
    if not history:
        return []
    if isinstance(history[0], dict):
        pairs = []
        pending_user = None
        for msg in history:
            if msg.get("role") == "user":
                pending_user = msg.get("content", "")
            elif msg.get("role") == "assistant" and pending_user is not None:
                pairs.append((pending_user, msg.get("content", "")))
                pending_user = None
        return pairs
    # Legacy tuple/list format.
    return [(user_msg, assistant_msg) for user_msg, assistant_msg in history]


def _build_prompt(message, history):
    """Assemble a ChatML prompt from the system prompt, history, and message.

    Args:
        message: The latest user message.
        history: Prior turns in either Gradio history format.

    Returns:
        A ChatML-formatted prompt string ending with an open assistant turn.
    """
    parts = [f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"]
    for user_msg, assistant_msg in _history_to_pairs(history):
        parts.append(f"<|im_start|>user\n{user_msg}<|im_end|>\n")
        parts.append(f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n")
    parts.append(f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n")
    return "".join(parts)


def chat(message, history):
    """Generate an assistant reply for the Gradio ChatInterface.

    Args:
        message: The latest user message.
        history: Prior conversation turns (either Gradio history format).

    Returns:
        The model's reply text with surrounding whitespace stripped.
    """
    output = llm(
        _build_prompt(message, history),
        max_tokens=1024,
        stop=["<|im_end|>", "<|im_start|>"],
        temperature=0.6,  # slightly low temperature for more precise code generation
        top_p=0.95,
        repeat_penalty=1.1,
        echo=False,
    )
    return output["choices"][0]["text"].strip()


# Wrap the ChatInterface in gr.Blocks to safely apply the blue theme.
with gr.Blocks(theme=gr.themes.Default(primary_hue="blue")) as demo:
    gr.ChatInterface(
        fn=chat,
        title="💻 IBM-Grok4 Ultra Fast Coder — 1B",
        description=(
            "Lightning-fast 1B coding assistant by **WithIn Us AI**. "
            "Built for speed, efficiency, and accurate code generation."
        ),
        examples=[
            "Write a Python web scraper using BeautifulSoup.",
            "Create a simple React component for a login form.",
            "Explain the difference between TCP and UDP.",
            "Debug this code: def add(a,b) return a+b",
        ],
    )

# Only launch when run as a script, not when imported (e.g. by tests).
if __name__ == "__main__":
    demo.launch()