import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# ==========================================
# 1. MODEL CONFIGURATION
# ==========================================
# Hugging Face Hub coordinates of the quantized GGUF weights.
# hf_hub_download() is called at import time and its return value is used
# below as the local path of the model file.
REPO_ID = "SALEETAI/Qwen-Coding-Model-GGUF"
FILENAME = "qwen2.5-coder-7b-instruct.Q4_K_M.gguf"

print(f"📦 Fetching model from {REPO_ID}...")
model_path = hf_hub_download(
    filename=FILENAME,
    repo_id=REPO_ID,
)

# ==========================================
# 2. INITIALIZE LLM (Optimized for CPU)
# ==========================================
# Runtime settings for the single, module-wide model instance.
LLAMA_OPTIONS = {
    # Context window in tokens; prompt and reply share it (adjustable).
    "n_ctx": 2048,
    # Worker threads. NOTE(review): chosen by the original author to match the
    # Hugging Face free-tier CPU; confirm against the actual core count.
    "n_threads": 4,
    "verbose": False,
}
llm = Llama(model_path=model_path, **LLAMA_OPTIONS)

# ==========================================
# 3. PROFESSIONAL INFERENCE LOGIC
# ==========================================
_SYSTEM_PROMPT = "You are an expert software architect specializing in Rust and C++."


def _message_text(content):
    """Return the plain text carried by one history item, or None if it has none."""
    if content is None or isinstance(content, str):
        return content
    if isinstance(content, list):
        # NOTE(review): some Gradio releases appear to deliver message content
        # as a list of {"type": "text", "text": ...} blocks; confirm against
        # the installed version.
        texts = [
            block["text"]
            for block in content
            if isinstance(block, dict) and isinstance(block.get("text"), str)
        ]
        return "\n".join(texts) if texts else None
    return str(content)


def _history_turns(history):
    """Normalize chat history into a list of (role, text) pairs, oldest first."""
    turns = []
    for entry in history or []:
        if isinstance(entry, dict):
            # Messages-style history: one {"role": ..., "content": ...} per turn.
            candidates = [(entry.get("role"), entry.get("content"))]
        else:
            # Pair-style history: [user_message, assistant_message].
            user_msg, assistant_msg = entry
            candidates = [("user", user_msg), ("assistant", assistant_msg)]
        for role, content in candidates:
            text = _message_text(content)
            # Skip empty slots so a missing reply is not rendered as "None".
            if role in ("user", "assistant") and text is not None:
                turns.append((role, text))
    return turns


def _render_prompt(turns, message):
    """Build the ChatML prompt for *turns* followed by the new user *message*."""
    parts = [f"<|im_start|>system\n{_SYSTEM_PROMPT}<|im_end|>\n"]
    parts.extend(f"<|im_start|>{role}\n{text}<|im_end|>\n" for role, text in turns)
    # Leave the assistant turn open so the model continues from here.
    parts.append(f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n")
    return "".join(parts)


def chat_engine(message, history):
    """Stream the model's reply to *message* given the prior conversation.

    Args:
        message: The latest user message.
        history: Prior conversation as passed by the chat UI. Accepts either
            ``[user, assistant]`` pairs or ``{"role", "content"}`` dicts;
            may be empty or ``None``.

    Yields:
        The reply text accumulated so far, once per streamed chunk.

    The oldest exchanges are dropped when the prompt would not leave room for
    the reply inside the model's context window. If the latest message alone
    is too long, nothing more can be trimmed and the completion call is left
    to fail as it did before.
    """
    max_reply_tokens = 1024

    turns = _history_turns(history)
    prompt = _render_prompt(turns, message)

    # Reserve space for the reply, but never more than half the window so a
    # small context still keeps some room for the conversation itself.
    context_size = llm.n_ctx()
    prompt_budget = context_size - min(max_reply_tokens, context_size // 2)
    while turns and len(llm.tokenize(prompt.encode("utf-8"), special=True)) > prompt_budget:
        # Drop the oldest turn, together with its reply when one follows it.
        drop = 2 if len(turns) > 1 and turns[1][0] == "assistant" else 1
        del turns[:drop]
        prompt = _render_prompt(turns, message)

    stream = llm(
        prompt,
        max_tokens=max_reply_tokens,
        stop=["<|im_end|>", "<|endoftext|>"],
        stream=True,
        temperature=0.4,
        repeat_penalty=1.2,
    )

    response = ""
    for output in stream:
        response += output["choices"][0]["text"]
        # The UI expects the full text so far, not just the newest chunk.
        yield response

# ==========================================
# 4. GRADIO PRODUCTION UI
# ==========================================
# Apply the soft theme at the global Blocks level to comply with Gradio 5.x architecture
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header: title followed by a one-line tagline.
    gr.Markdown("# 🚀 SALEETAI Coding Agent (Qwen-7B)")
    gr.Markdown("### Professional-grade code logic for Rust, C++, and complex architectural patterns.")

    # Starter prompts offered as examples in the chat panel.
    starter_prompts = [
        "Implement a thread-safe Lock-Free Stack in C++.",
        "Write a Doubly Linked List in safe Rust.",
        "Optimize a Python script for high-density data processing.",
    ]

    # Chat panel wired to the streaming generator; example caching is off.
    gr.ChatInterface(
        cache_examples=False,
        examples=starter_prompts,
        fn=chat_engine,
    )

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()