File size: 2,242 Bytes
9f9e362
44d74c1
9f9e362
 
 
 
44d74c1
9f9e362
 
 
44d74c1
 
 
 
 
9f9e362
44d74c1
9f9e362
44d74c1
 
9f9e362
 
44d74c1
 
9f9e362
 
44d74c1
9f9e362
44d74c1
 
9f9e362
44d74c1
9f9e362
44d74c1
 
9f9e362
 
44d74c1
9f9e362
 
 
 
 
 
44d74c1
9f9e362
 
44d74c1
9f9e362
 
44d74c1
 
9f9e362
 
 
 
 
44d74c1
 
9f9e362
44d74c1
9f9e362
44d74c1
 
9f9e362
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import os
import time
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# 🔧 CONFIGURATION
# Hugging Face Hub repository and the GGUF file inside it that this app serves.
MODEL_REPO = "bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF"
MODEL_FILE = "Qwen2.5-Coder-1.5B-Instruct-Q4_K_M.gguf"

print("⏳ Starting Python Dev Assistant Space...")
# Startup clock. LOAD_TIME below is measured from here, so it covers both the
# cache check/download step and the in-memory load step.
START_TIME = time.time()

# 1️⃣ Download (only happens on first boot or cache miss)
print(f"📦 Checking cache for {MODEL_FILE}...")
model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
print(f"✅ Model cached at: {model_path}")

# 2️⃣ Load into RAM (runs ONCE per Space startup)
# `llm` is a module-level object that generate_python_code() reuses on every
# request, so this constructor runs once at import time.
print("🧠 Loading model into memory...")
llm = Llama(
    model_path=model_path,
    n_ctx=4096,       # context window size, in tokens
    n_threads=2,      # NOTE(review): presumably sized for a 2-vCPU host - confirm
    n_batch=512,      # prompt-processing batch size
    verbose=False,
    use_mlock=True,   # ask the OS to keep the model resident in RAM
)
LOAD_TIME = round(time.time() - START_TIME, 1)
print(f"🚀 Model loaded in {LOAD_TIME}s. Ready for prompts!")

# 3️⃣ Generation function (reuses `llm` every time)
def _extract_code(text):
    """Return the body of the first Markdown code fence in *text*.

    A fence is a line whose first non-blank characters are three backticks
    (optionally followed by a language tag). If *text* contains no fence it
    is returned unchanged. If the opening fence is never closed (e.g. the
    reply was cut off by the token limit) everything after it is returned.
    """
    fence = "```"
    lines = text.splitlines()
    opening = next(
        (i for i, line in enumerate(lines) if line.lstrip().startswith(fence)),
        None,
    )
    if opening is None:
        return text

    body = []
    for line in lines[opening + 1:]:
        if line.lstrip().startswith(fence):
            break
        body.append(line)
    return "\n".join(body).strip("\n")


def generate_python_code(user_prompt: str) -> str:
    """Generate Python code for *user_prompt* with the module-level ``llm``.

    Args:
        user_prompt: Free-text description of the task, as typed in the UI.

    Returns:
        The generated code with any surrounding Markdown fence removed, or
        an empty string when the prompt is empty or whitespace-only.
    """
    # Nothing to ask the model: skip the (slow) inference call entirely.
    if not user_prompt or not user_prompt.strip():
        return ""

    inference_start = time.time()
    print(f"🔹 Processing prompt at {time.strftime('%H:%M:%S')}")

    messages = [
        {"role": "system", "content": "You are an expert Python developer. Write clean, PEP-8 compliant code with type hints. Output only code unless asked otherwise."},
        {"role": "user", "content": user_prompt}
    ]

    # "```" is deliberately NOT a stop sequence: a reply that opens with a
    # Markdown fence would be cut off before any code is emitted, leaving an
    # empty result. Fences are removed after generation instead.
    output = llm.create_chat_completion(
        messages=messages,
        max_tokens=1024,
        temperature=0.2,
        top_p=0.9,
        repeat_penalty=1.1,
        stop=["</s>"],
    )

    inference_time = round(time.time() - inference_start, 2)
    print(f"✅ Done in {inference_time}s")

    # Guard against a missing/None content field before post-processing.
    content = output["choices"][0]["message"]["content"] or ""
    return _extract_code(content)

# 4️⃣ Gradio UI
# Widgets and sample prompts are built up front so the Interface call below
# reads as a plain wiring of function -> input -> output.
_prompt_box = gr.Textbox(lines=4, placeholder="Describe your Python task...")
_code_view = gr.Code(language="python")
_sample_prompts = [
    ["Write a Pydantic v2 model for a User with email validation"],
    ["Create an async retry wrapper for HTTP requests using aiohttp"],
]

# Single-function web UI: one text prompt in, one Python code panel out.
demo = gr.Interface(
    fn=generate_python_code,
    inputs=_prompt_box,
    outputs=_code_view,
    title="🐍 Python Dev Assistant",
    description=f"Loaded `{MODEL_FILE}` in {LOAD_TIME}s. Model stays in RAM between prompts.",
    examples=_sample_prompts,
)

# Start the web server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()