| import gradio as gr |
| from llama_cpp import Llama |
| from huggingface_hub import hf_hub_download |
|
|
| |
| |
| |
| |
# --- Model source -----------------------------------------------------------
# Hub repository and the GGUF file inside it that this app serves.
REPO_ID = "SALEETAI/Qwen-Coding-Model-GGUF"
FILENAME = "qwen2.5-coder-7b-instruct.Q4_K_M.gguf"

# --- Model loading (runs once, at import time) ------------------------------
# hf_hub_download returns the local path of the requested file; that path is
# handed straight to Llama below.
print(f"📦 Fetching model from {REPO_ID}...")
model_path = hf_hub_download(filename=FILENAME, repo_id=REPO_ID)

# Single shared model instance used by the chat handler.
llm = Llama(model_path=model_path, verbose=False, n_threads=4, n_ctx=2048)
|
|
| |
| |
| |
# ChatML system turn that opens every prompt sent to the model.
_SYSTEM_TURN = (
    "<|im_start|>system\n"
    "You are an expert software architect specializing in Rust and C++."
    "<|im_end|>\n"
)

# Cap on generated tokens per reply; also the room reserved for the reply
# when old history has to be dropped to fit the context window.
_MAX_NEW_TOKENS = 1024


def _text_of(content) -> str:
    """Flatten one message payload into plain text.

    Accepts a plain string, ``None`` (treated as empty), a dict carrying a
    ``"text"`` field, or a list of such parts. Anything else is converted
    with ``str()``, which is what interpolating it into an f-string would do.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, list):
        return "".join(_text_of(part) for part in content)
    return str(content)


def _history_turns(history):
    """Yield ``(role, text)`` for every non-empty user/assistant turn.

    Two history layouts are understood:

    * message dicts: ``{"role": ..., "content": ...}``; roles other than
      ``"user"`` and ``"assistant"`` are skipped;
    * legacy pairs: ``(user_msg, assistant_msg)``, where either side may be
      ``None``.
    """
    for entry in history or []:
        if isinstance(entry, dict):
            candidates = [(entry.get("role"), entry.get("content"))]
        else:
            user_msg, assistant_msg = entry
            candidates = [("user", user_msg), ("assistant", assistant_msg)]
        for role, raw in candidates:
            text = _text_of(raw)
            if role in ("user", "assistant") and text:
                yield role, text


def _render_prompt(turns, message: str) -> str:
    """Build the ChatML prompt: system turn, past turns, new user message.

    The prompt ends with an open assistant turn for the model to complete.
    Text is inserted verbatim; ChatML markers inside it are not escaped.
    """
    parts = [_SYSTEM_TURN]
    parts.extend(
        f"<|im_start|>{role}\n{text}<|im_end|>\n" for role, text in turns
    )
    parts.append(f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n")
    return "".join(parts)


def _fit_to_context(turns, message: str) -> str:
    """Return a prompt that leaves room for the reply, dropping old turns.

    The token budget is the model's context size minus the space reserved for
    the reply (never more than half the window). While the prompt is over
    budget the oldest turn is removed, together with any assistant turn that
    would otherwise start the history. If the new message alone is over
    budget, the prompt is returned with no history and left to the model call
    to accept or reject.
    """
    window = llm.n_ctx()
    budget = window - min(_MAX_NEW_TOKENS, window // 2)

    kept = list(turns)
    prompt = _render_prompt(kept, message)
    # special=True so the ChatML markers are counted as control tokens
    # rather than as plain text.
    while kept and len(llm.tokenize(prompt.encode("utf-8"), special=True)) > budget:
        del kept[0]
        while kept and kept[0][0] != "user":
            del kept[0]
        prompt = _render_prompt(kept, message)
    return prompt


def chat_engine(message, history):
    """Stream the model's reply to ``message`` given the prior ``history``.

    Args:
        message: The new user message (a string, or a dict with a ``"text"``
            field).
        history: Earlier conversation, either as role/content message dicts
            or as legacy ``(user, assistant)`` pairs. May be empty or
            ``None``.

    Yields:
        The reply accumulated so far, once per generated chunk, so the last
        yielded value is the complete reply.
    """
    turns = list(_history_turns(history))
    prompt = _fit_to_context(turns, _text_of(message))

    stream = llm(
        prompt,
        max_tokens=_MAX_NEW_TOKENS,
        stop=["<|im_end|>", "<|endoftext|>"],
        stream=True,
        temperature=0.4,
        repeat_penalty=1.2,
    )

    response = ""
    for chunk in stream:
        response += chunk["choices"][0]["text"]
        yield response
|
|
| |
| |
| |
| |
# Starter prompts offered in the chat UI.
_EXAMPLE_PROMPTS = [
    "Implement a thread-safe Lock-Free Stack in C++.",
    "Write a Doubly Linked List in safe Rust.",
    "Optimize a Python script for high-density data processing.",
]

# Page layout: a title, a tagline, and the chat widget wired to chat_engine.
demo = gr.Blocks(theme=gr.themes.Soft())
with demo:
    gr.Markdown("# 🚀 SALEETAI Coding Agent (Qwen-7B)")
    gr.Markdown("### Professional-grade code logic for Rust, C++, and complex architectural patterns.")

    gr.ChatInterface(
        chat_engine,
        cache_examples=False,
        examples=_EXAMPLE_PROMPTS,
    )


# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|
|