Paul1966-2's picture
Update app.py
44d74c1 verified
Raw
History Blame Contribute Delete
2.24 kB
import os
import re
import time

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# 🔧 CONFIGURATION
# Hugging Face Hub repository and the quantized GGUF file served by this app.
MODEL_REPO = "bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF"
MODEL_FILE = "Qwen2.5-Coder-1.5B-Instruct-Q4_K_M.gguf"

print("⏳ Starting Python Dev Assistant Space...")
# The clock starts before the download, so LOAD_TIME below covers both the
# cache check / download and the in-memory load.
START_TIME = time.time()

# 1️⃣ Download (only happens on first boot or cache miss)
print(f"📦 Checking cache for {MODEL_FILE}...")
model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
print(f"✅ Model cached at: {model_path}")

# 2️⃣ Load into RAM (runs ONCE per Space startup)
# `llm` is a module-level object so every request handled by the UI reuses the
# same loaded model instead of reloading it.
print("🧠 Loading model into memory...")
llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_threads=2,
    n_batch=512,
    verbose=False,
    use_mlock=True,
)

# Seconds elapsed since START_TIME, rounded to one decimal; also shown in the
# UI description.
LOAD_TIME = round(time.time() - START_TIME, 1)
print(f"🚀 Model loaded in {LOAD_TIME}s. Ready for prompts!")
# 3️⃣ Generation function (reuses `llm` every time)
# Matches one Markdown code fence: an optional language tag on the opening
# line, then the body up to the closing fence -- or up to the end of the text
# when the reply was cut off (e.g. by max_tokens) before the fence was closed.
_FENCED_BLOCK = re.compile(r"```[^\n`]*\n(.*?)(?:```|\Z)", re.DOTALL)

# Returned instead of calling the model when the prompt is empty; phrased as a
# Python comment because the result is rendered in a code viewer.
_EMPTY_PROMPT_HINT = "# Please describe a Python task first."


def _extract_code(reply):
    """Return only the code from a model reply, dropping Markdown fences.

    If the reply contains one or more fenced blocks, their bodies are joined
    with a blank line and any prose around them is discarded. A reply without
    fences is returned as-is, stripped of surrounding whitespace.
    """
    bodies = (body.strip("\n") for body in _FENCED_BLOCK.findall(reply))
    snippets = [body for body in bodies if body.strip()]
    if snippets:
        return "\n\n".join(snippets)
    return reply.strip()


def generate_python_code(user_prompt):
    """Generate Python code for a natural-language task description.

    Args:
        user_prompt: The task description typed by the user.

    Returns:
        The generated code as plain text without Markdown fences, or a short
        hint comment when the prompt is empty or whitespace-only.
    """
    # Nothing to ask the model: skip the (slow) inference call entirely.
    if not user_prompt or not user_prompt.strip():
        return _EMPTY_PROMPT_HINT

    inference_start = time.time()
    print(f"🔹 Processing prompt at {time.strftime('%H:%M:%S')}")

    messages = [
        {"role": "system", "content": "You are an expert Python developer. Write clean, PEP-8 compliant code with type hints. Output only code unless asked otherwise."},
        {"role": "user", "content": user_prompt},
    ]
    # NOTE: "```" must not be a stop sequence. Instruct models usually open
    # their answer with a code fence, so stopping on it ends generation before
    # any code is produced. Fences are removed afterwards instead.
    output = llm.create_chat_completion(
        messages=messages,
        max_tokens=1024,
        temperature=0.2,
        top_p=0.9,
        repeat_penalty=1.1,
        stop=["</s>"],
    )

    inference_time = round(time.time() - inference_start, 2)
    print(f"✅ Done in {inference_time}s")

    # Guard against a missing/None content field before post-processing.
    content = output["choices"][0]["message"]["content"] or ""
    return _extract_code(content)
# 4️⃣ Gradio UI
# Input: a four-line free-text box for the task description.
_prompt_box = gr.Textbox(lines=4, placeholder="Describe your Python task...")

# Output: a code viewer with Python syntax highlighting.
_code_view = gr.Code(language="python")

# Sample prompts offered below the input box (one single-input row each).
_sample_prompts = [
    ["Write a Pydantic v2 model for a User with email validation"],
    ["Create an async retry wrapper for HTTP requests using aiohttp"],
]

# Single-function web UI: prompt in, generated code out.
demo = gr.Interface(
    fn=generate_python_code,
    inputs=_prompt_box,
    outputs=_code_view,
    title="🐍 Python Dev Assistant",
    description=f"Loaded `{MODEL_FILE}` in {LOAD_TIME}s. Model stays in RAM between prompts.",
    examples=_sample_prompts,
)

# Start the web server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()