File size: 2,575 Bytes
d35d842 e9b33cc d35d842 dda762b | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 | import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
# ==========================================
# 1. MODEL CONFIGURATION
# ==========================================
# Hugging Face Hub coordinates of the quantized GGUF weights file.
REPO_ID = "SALEETAI/Qwen-Coding-Model-GGUF"
FILENAME = "qwen2.5-coder-7b-instruct.Q4_K_M.gguf"

# The download runs at import time; `model_path` is whatever
# hf_hub_download returns for the requested file and is consumed by the
# Llama constructor further down.
print(f"📦 Fetching model from {REPO_ID}...")
model_path = hf_hub_download(
    filename=FILENAME,
    repo_id=REPO_ID,
)
# ==========================================
# 2. INITIALIZE LLM (Optimized for CPU)
# ==========================================
# A single module-level model instance, created once at import time and
# reused by every chat request.
llm = Llama(
    model_path=model_path,
    # Worker thread count. The original author notes this matches the
    # Hugging Face free-tier CPU core count -- TODO confirm for the target
    # hardware.
    n_threads=4,
    # Context window size in tokens (adjustable).
    n_ctx=2048,
    verbose=False,
)
# ==========================================
# 3. PROFESSIONAL INFERENCE LOGIC
# ==========================================
_SYSTEM_PROMPT = (
    "You are an expert software architect specializing in Rust and C++."
)
_CHAT_ROLES = ("user", "assistant")


def _content_text(content):
    """Return the plain-text part of one history message, or "" if none."""
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        # NOTE(review): some Gradio versions appear to deliver message
        # content as a list of typed blocks ({"type": "text", "text": ...});
        # only blocks carrying a string "text" field are kept -- confirm
        # against the installed Gradio version.
        return "\n".join(
            block["text"]
            for block in content
            if isinstance(block, dict) and isinstance(block.get("text"), str)
        )
    # None (missing reply) or any other non-text payload contributes nothing.
    return ""


def _history_turns(history):
    """Yield (role, text) for every usable turn in a chat history.

    Accepts both history layouts: a list of ``[user_msg, assistant_msg]``
    pairs and a list of ``{"role": ..., "content": ...}`` dicts. Entries of
    any other shape, unknown roles, and turns without text are skipped.
    """
    for entry in history or ():
        if isinstance(entry, dict):
            candidates = [(entry.get("role"), entry.get("content"))]
        elif isinstance(entry, (list, tuple)) and len(entry) == 2:
            candidates = list(zip(_CHAT_ROLES, entry))
        else:
            continue
        for role, content in candidates:
            text = _content_text(content)
            if role in _CHAT_ROLES and text:
                yield role, text


def _build_prompt(message, history):
    """Render the system prompt, prior turns and new message as ChatML."""
    turns = [
        ("system", _SYSTEM_PROMPT),
        *_history_turns(history),
        ("user", message),
    ]
    parts = [f"<|im_start|>{role}\n{text}<|im_end|>\n" for role, text in turns]
    # Leave the assistant turn open so the model writes the reply.
    parts.append("<|im_start|>assistant\n")
    return "".join(parts)


def chat_engine(message, history):
    """Stream the model's reply to ``message`` given the chat ``history``.

    Args:
        message: The new user message.
        history: Previous turns, either as ``[user_msg, assistant_msg]``
            pairs or as ``{"role": ..., "content": ...}`` dicts. May be
            empty or ``None``.

    Yields:
        The full reply text accumulated so far, once per streamed chunk
        (each value extends the previous one).
    """
    prompt = _build_prompt(message, history)

    # NOTE(review): history is never trimmed, so a long conversation can
    # outgrow the context window configured on `llm` -- consider dropping
    # the oldest turns when the prompt gets large.
    stream = llm(
        prompt,
        max_tokens=1024,
        stop=["<|im_end|>", "<|endoftext|>"],
        stream=True,
        temperature=0.4,  # tuned sampling temperature
        repeat_penalty=1.2,  # tuned repetition penalty
    )

    response = ""
    for output in stream:
        response += output["choices"][0]["text"]
        yield response
# ==========================================
# 4. GRADIO PRODUCTION UI
# ==========================================
# Starter prompts offered in the chat UI.
_EXAMPLE_PROMPTS = [
    "Implement a thread-safe Lock-Free Stack in C++.",
    "Write a Doubly Linked List in safe Rust.",
    "Optimize a Python script for high-density data processing.",
]

# The Soft theme is passed to the top-level Blocks container (the original
# author notes this is done for Gradio 5.x).
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page title followed by the subtitle, rendered in that order.
    for heading in (
        "# 🚀 SALEETAI Coding Agent (Qwen-7B)",
        "### Professional-grade code logic for Rust, C++, and complex architectural patterns.",
    ):
        gr.Markdown(heading)

    # Chat widget driven by the streaming generator `chat_engine`.
    gr.ChatInterface(
        fn=chat_engine,
        examples=_EXAMPLE_PROMPTS,
        cache_examples=False,
    )

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|