"""Gradio chat front end for a Qwen2.5-Coder GGUF model run through llama.cpp.

At import time the script downloads the GGUF weights from the Hugging Face
Hub and loads them into a CPU-backed ``Llama`` instance; running the file as
a script then launches the Gradio UI.
"""

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# ==========================================
# 1. MODEL CONFIGURATION
# ==========================================
# The GGUF weights are fetched from this Hugging Face repo at startup.
REPO_ID = "SALEETAI/Qwen-Coding-Model-GGUF"
FILENAME = "qwen2.5-coder-7b-instruct.Q4_K_M.gguf"

# Context window handed to llama.cpp (adjustable).
N_CTX = 2048
# Upper bound on generated tokens per reply; the same amount of room is
# reserved in the context window when old turns are trimmed.
MAX_NEW_TOKENS = 1024
SYSTEM_PROMPT = (
    "You are an expert software architect specializing in Rust and C++."
)

print(f"📦 Fetching model from {REPO_ID}...")
model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

# ==========================================
# 2. INITIALIZE LLM (Optimized for CPU)
# ==========================================
llm = Llama(
    model_path=model_path,
    n_ctx=N_CTX,
    # Intended to match the CPU cores of the hosting tier.
    # NOTE(review): confirm against the actual hardware.
    n_threads=4,
    verbose=False,
)


# ==========================================
# 3. INFERENCE LOGIC
# ==========================================
def _content_to_text(content):
    """Return the plain text of a messages-style ``content`` value, or None.

    Strings are returned unchanged. A list of typed parts (presumably how some
    Gradio versions deliver content -- TODO confirm) is reduced to its
    ``{"type": "text"}`` parts. Anything else (files, components) yields None
    because it cannot be placed in a text prompt.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        texts = [
            part.get("text", "")
            for part in content
            if isinstance(part, dict) and part.get("type") == "text"
        ]
        return "\n".join(texts) if texts else None
    return None


def _history_turns(history):
    """Normalize a Gradio chat history into a list of ``(role, text)`` pairs.

    Both history layouts are accepted: ``[user, assistant]`` pairs and
    ``{"role": ..., "content": ...}`` message dicts. Missing halves of a pair
    and messages with unsupported roles or non-text content are skipped.
    """
    turns = []
    for entry in history or []:
        if isinstance(entry, dict):
            role = entry.get("role")
            text = _content_to_text(entry.get("content"))
            if role in ("user", "assistant") and text is not None:
                turns.append((role, text))
        else:
            user_msg, assistant_msg = entry
            if user_msg is not None:
                turns.append(("user", user_msg))
            if assistant_msg is not None:
                turns.append(("assistant", assistant_msg))
    return turns


def _render_prompt(message, turns):
    """Render the system prompt, prior turns and new message as ChatML text."""
    pieces = [f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"]
    pieces.extend(
        f"<|im_start|>{role}\n{text}<|im_end|>\n" for role, text in turns
    )
    # Leave the assistant turn open so the model continues from here.
    pieces.append(
        f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
    )
    return "".join(pieces)


def _build_prompt(message, history):
    """Build a prompt that leaves ``MAX_NEW_TOKENS`` of context for the reply.

    The oldest turns are dropped until the tokenized prompt fits within
    ``N_CTX - MAX_NEW_TOKENS``. The system prompt and the new message are
    always kept, even if they alone exceed the budget.
    """
    turns = _history_turns(history)
    budget = N_CTX - MAX_NEW_TOKENS
    while True:
        prompt = _render_prompt(message, turns)
        if not turns:
            return prompt
        # special=True so the ChatML markers count as single control tokens.
        n_tokens = len(llm.tokenize(prompt.encode("utf-8"), special=True))
        if n_tokens <= budget:
            return prompt
        # Drop the oldest turn, plus any assistant reply it leaves orphaned
        # at the front of the history.
        del turns[0]
        while turns and turns[0][0] == "assistant":
            del turns[0]


def chat_engine(message, history):
    """Stream the assistant's reply to ``message`` for ``gr.ChatInterface``.

    Args:
        message: The user's new message text.
        history: Earlier conversation turns as supplied by Gradio, either as
            ``[user, assistant]`` pairs or as ``role``/``content`` dicts.

    Yields:
        The reply accumulated so far, once per generated chunk, so the UI can
        re-render the growing answer.
    """
    stream = llm(
        _build_prompt(message, history),
        max_tokens=MAX_NEW_TOKENS,
        stop=["<|im_end|>", "<|endoftext|>"],
        stream=True,
        temperature=0.4,  # tuned temperature
        repeat_penalty=1.2,  # tuned penalty
    )

    response = ""
    for output in stream:
        response += output["choices"][0]["text"]
        yield response


# ==========================================
# 4. GRADIO PRODUCTION UI
# ==========================================
# The soft theme is applied at the Blocks level (Gradio 5.x style).
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🚀 SALEETAI Coding Agent (Qwen-7B)")
    gr.Markdown(
        "### Professional-grade code logic for Rust, C++, and complex architectural patterns."
    )

    gr.ChatInterface(
        fn=chat_engine,
        examples=[
            "Implement a thread-safe Lock-Free Stack in C++.",
            "Write a Doubly Linked List in safe Rust.",
            "Optimize a Python script for high-density data processing.",
        ],
        cache_examples=False,
    )

if __name__ == "__main__":
    demo.launch()