# Coder / app.py
# Author: SALEETAI
# Commit e9b33cc (verified): "Update app.py"
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
# ==========================================
# 1. MODEL CONFIGURATION
# ==========================================
# Hugging Face Hub coordinates of the GGUF weights this app serves:
# the file name inside the repository, and the repository that hosts it.
FILENAME = "qwen2.5-coder-7b-instruct.Q4_K_M.gguf"
REPO_ID = "SALEETAI/Qwen-Coding-Model-GGUF"

# Announce the fetch, then ask the Hub client for the weights file. The
# value it returns is kept as `model_path` and handed to the Llama loader.
print(f"📦 Fetching model from {REPO_ID}...")
model_path = hf_hub_download(
    filename=FILENAME,
    repo_id=REPO_ID,
)
# ==========================================
# 2. INITIALIZE LLM (Optimized for CPU)
# ==========================================
# Load the downloaded weights once at import time. Settings: verbose output
# off, four worker threads, and a 2048-token context window (raise n_ctx for
# longer conversations at the cost of memory).
llm = Llama(verbose=False, n_threads=4, n_ctx=2048, model_path=model_path)
# ==========================================
# 3. PROFESSIONAL INFERENCE LOGIC
# ==========================================
# System instruction placed at the top of every prompt.
_SYSTEM_PROMPT = (
    "You are an expert software architect specializing in Rust and C++."
)
# Upper bound on generated tokens per reply (passed to the model call).
_MAX_NEW_TOKENS = 1024
# Context tokens kept free for the reply when old turns have to be dropped.
_RESERVED_REPLY_TOKENS = 512


def _content_text(content):
    """Flatten one chat message's content into plain text.

    Handles plain strings, ``None``, a dict carrying a ``"text"`` entry, and
    a list of such parts; anything else is rendered with ``str()`` the way an
    f-string would.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, list):
        return "".join(_content_text(part) for part in content)
    return str(content)


def _chatml_turn(role, text):
    """Render one ChatML turn for the given role."""
    return f"<|im_start|>{role}\n{text}<|im_end|>\n"


def _history_turns(history):
    """Normalize chat history into a list of ``(role, text)`` tuples.

    Accepts both history shapes: ``(user, assistant)`` pairs and
    ``{"role": ..., "content": ...}`` dicts. A ``None`` half of a pair and
    dict entries with any other role are skipped.
    """
    turns = []
    for entry in history or ():
        if isinstance(entry, dict):
            role = entry.get("role")
            if role in ("user", "assistant"):
                turns.append((role, _content_text(entry.get("content"))))
            continue
        user_msg, assistant_msg = entry
        if user_msg is not None:
            turns.append(("user", _content_text(user_msg)))
        if assistant_msg is not None:
            turns.append(("assistant", _content_text(assistant_msg)))
    return turns


def _fit_history(turns, fixed_text):
    """Keep the newest turns that fit in the context window.

    ``fixed_text`` is the part of the prompt that is always sent (system
    turn plus the new user turn). Turns are kept newest-first until the
    token budget is spent; the budget leaves ``_RESERVED_REPLY_TOKENS`` free
    so the model always has room to answer.
    """
    def token_count(text):
        return len(
            llm.tokenize(text.encode("utf-8"), add_bos=False, special=True)
        )

    # One extra token is set aside for the BOS token.
    budget = (
        llm.n_ctx() - _RESERVED_REPLY_TOKENS - 1 - token_count(fixed_text)
    )
    kept = []
    for role, text in reversed(turns):
        budget -= token_count(_chatml_turn(role, text))
        if budget < 0:
            break
        kept.append((role, text))
    kept.reverse()
    # If trimming cut a pair in half, drop the orphaned assistant reply so
    # the retained history does not open with an answer to a missing question.
    if kept and len(kept) < len(turns) and kept[0][0] == "assistant":
        del kept[0]
    return kept


def chat_engine(message, history):
    """Stream the assistant's reply to ``message`` as a growing string.

    Builds a ChatML prompt (system turn, prior turns, new user turn, open
    assistant turn) and streams a completion from the module-level ``llm``.

    Args:
        message: The new user message.
        history: Prior conversation, either as ``(user, assistant)`` pairs
            or as ``{"role", "content"}`` dicts. The oldest turns are
            dropped when the prompt would not fit the context window.

    Yields:
        The reply accumulated so far, once per streamed chunk.
    """
    head = _chatml_turn("system", _SYSTEM_PROMPT)
    tail = (
        _chatml_turn("user", _content_text(message))
        + "<|im_start|>assistant\n"
    )
    turns = _fit_history(_history_turns(history), head + tail)
    prompt = (
        head
        + "".join(_chatml_turn(role, text) for role, text in turns)
        + tail
    )

    stream = llm(
        prompt,
        max_tokens=_MAX_NEW_TOKENS,
        stop=["<|im_end|>", "<|endoftext|>"],
        stream=True,
        temperature=0.4,  # tuned sampling temperature
        repeat_penalty=1.2,  # tuned repetition penalty
    )

    response = ""
    for output in stream:
        response += output["choices"][0]["text"]
        yield response
# ==========================================
# 4. GRADIO PRODUCTION UI
# ==========================================
# Starter prompts offered to the user in the chat interface.
_EXAMPLE_PROMPTS = [
    "Implement a thread-safe Lock-Free Stack in C++.",
    "Write a Doubly Linked List in safe Rust.",
    "Optimize a Python script for high-density data processing.",
]

# The Soft theme is set on the enclosing Blocks container (the original
# author's note says this is what Gradio 5.x expects). Components are
# created in display order: title, subtitle, then the chat widget.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🚀 SALEETAI Coding Agent (Qwen-7B)")
    gr.Markdown(
        "### Professional-grade code logic for Rust, C++, and complex architectural patterns."
    )
    # Example caching is off, so the examples are not pre-computed.
    gr.ChatInterface(
        cache_examples=False,
        examples=_EXAMPLE_PROMPTS,
        fn=chat_engine,
    )

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()