"""Gradio Space that serves a local GGUF coder model as a Python assistant.

At import time the script downloads (or reuses from the Hugging Face cache)
the GGUF file named by ``MODEL_FILE``, loads it once with llama.cpp, and
builds a Gradio ``Interface`` whose handler reuses that single ``llm``
instance for every prompt.
"""

import os
import re
import time

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# 🔧 CONFIGURATION
MODEL_REPO = "bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF"
MODEL_FILE = "Qwen2.5-Coder-1.5B-Instruct-Q4_K_M.gguf"

# Instruction sent as the system message of every chat completion.
_SYSTEM_PROMPT = (
    "You are an expert Python developer. Write clean, PEP-8 compliant code "
    "with type hints. Output only code unless asked otherwise."
)

# End-of-turn marker of the ChatML template used by Qwen2.5 instruct models.
# Never put an empty string in this list: an empty stop sequence is a
# substring of any output, so generation would end before the first token.
# A bare "```" is not used either, because the model usually opens its reply
# with a fence; fences are stripped after generation by `_extract_code`.
_STOP_SEQUENCES = ["<|im_end|>"]

# First Markdown fenced block: optional info string (e.g. "python"), then the
# body up to the closing fence, or to end of text if the reply was truncated.
_FENCED_BLOCK = re.compile(r"```[^\n]*\n(.*?)(?:```|\Z)", re.DOTALL)

print("⏳ Starting Python Dev Assistant Space...")
START_TIME = time.time()

# 1️⃣ Download (only happens on first boot or cache miss)
print(f"📦 Checking cache for {MODEL_FILE}...")
model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
print(f"✅ Model cached at: {model_path}")

# 2️⃣ Load into RAM (runs ONCE per Space startup)
print("🧠 Loading model into memory...")
llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_threads=2,  # NOTE(review): presumably sized for the Space's CPU quota; confirm
    n_batch=512,
    verbose=False,
    use_mlock=True,
)
LOAD_TIME = round(time.time() - START_TIME, 1)
print(f"🚀 Model loaded in {LOAD_TIME}s. Ready for prompts!")


def _extract_code(text: str) -> str:
    """Return the body of the first fenced code block in *text*.

    When *text* contains no fenced block, it is returned with surrounding
    whitespace stripped. An unterminated fence (reply cut off by the token
    limit) yields everything after the opening fence line.
    """
    match = _FENCED_BLOCK.search(text)
    if match is None:
        return text.strip()
    return match.group(1).strip()


# 3️⃣ Generation function (reuses `llm` every time)
def generate_python_code(user_prompt: str) -> str:
    """Generate Python code for *user_prompt* with the module-level ``llm``.

    Args:
        user_prompt: Natural-language description of the task, as typed into
            the Gradio textbox.

    Returns:
        The generated code with Markdown fences removed, or an empty string
        when the prompt is blank or the model returns no content.
    """
    # Skip the inference pass entirely for blank input.
    if not user_prompt or not user_prompt.strip():
        return ""

    inference_start = time.time()
    print(f"🔹 Processing prompt at {time.strftime('%H:%M:%S')}")

    messages = [
        {"role": "system", "content": _SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ]
    output = llm.create_chat_completion(
        messages=messages,
        max_tokens=1024,
        temperature=0.2,
        top_p=0.9,
        repeat_penalty=1.1,
        stop=_STOP_SEQUENCES,
    )

    inference_time = round(time.time() - inference_start, 2)
    print(f"✅ Done in {inference_time}s")

    # `content` can be None in a chat-completion message; normalise to "".
    content = output["choices"][0]["message"]["content"] or ""
    return _extract_code(content)


# 4️⃣ Gradio UI
demo = gr.Interface(
    fn=generate_python_code,
    inputs=gr.Textbox(lines=4, placeholder="Describe your Python task..."),
    outputs=gr.Code(language="python"),
    title="🐍 Python Dev Assistant",
    description=(
        f"Loaded `{MODEL_FILE}` in {LOAD_TIME}s. \n"
        "Model stays in RAM between prompts."
    ),
    examples=[
        ["Write a Pydantic v2 model for a User with email validation"],
        ["Create an async retry wrapper for HTTP requests using aiohttp"],
    ],
)

if __name__ == "__main__":
    demo.launch()