Paul1966-2 committed on
Commit
44d74c1
·
verified ·
1 Parent(s): 9f9e362

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -28
app.py CHANGED
@@ -1,67 +1,67 @@
1
  import os
 
2
  import gradio as gr
3
  from huggingface_hub import hf_hub_download
4
  from llama_cpp import Llama
5
 
6
- # 🔧 CONFIGURATION: Change these to swap models
7
  MODEL_REPO = "bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF"
8
  MODEL_FILE = "Qwen2.5-Coder-1.5B-Instruct-Q4_K_M.gguf"
9
- # For 7B: MODEL_REPO = "bartowski/Qwen2.5-Coder-7B-Instruct-GGUF"
10
- # MODEL_FILE = "Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf"
11
 
12
- # 1️⃣ Download model on first boot (cached automatically)
13
- print(f"⬇️ Downloading {MODEL_FILE} from {MODEL_REPO}...")
 
 
 
14
  model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
 
15
 
16
- # 2️⃣ Initialize CPU-optimized LLM
 
17
  llm = Llama(
18
  model_path=model_path,
19
- n_ctx=4096, # Max context window
20
- n_threads=2, # Matches HF free tier vCPU count
21
  n_batch=512,
22
  verbose=False,
23
- use_mlock=True # Keep model in RAM (prevents swapping)
24
  )
 
 
25
 
26
- # 3️⃣ Generation function
27
  def generate_python_code(user_prompt):
28
- system_prompt = (
29
- "You are an expert Python developer. Write clean, efficient, PEP-8 compliant code. "
30
- "Include type hints, docstrings, and error handling where appropriate. "
31
- "Output only the code block unless explicitly asked for explanations."
32
- )
33
 
34
  messages = [
35
- {"role": "system", "content": system_prompt},
36
  {"role": "user", "content": user_prompt}
37
  ]
38
 
39
  output = llm.create_chat_completion(
40
  messages=messages,
41
  max_tokens=1024,
42
- temperature=0.2, # Low for deterministic code
43
  top_p=0.9,
44
  repeat_penalty=1.1,
45
- stop=["</s>", "```"] # Prevent runaway generation
46
  )
47
 
 
 
48
  return output["choices"][0]["message"]["content"]
49
 
50
  # 4️⃣ Gradio UI
51
  demo = gr.Interface(
52
  fn=generate_python_code,
53
- inputs=gr.Textbox(
54
- lines=4,
55
- placeholder="e.g., Write an async function to fetch JSON from a URL, retry 3 times on failure, and parse specific fields...",
56
- label="Python Task"
57
- ),
58
- outputs=gr.Code(language="python", label="Generated Code"),
59
  title="🐍 Python Dev Assistant",
60
- description=f"Running `{MODEL_FILE}` on HF Free CPU Tier. First load takes ~60s.",
61
  examples=[
62
- ["Write a FastAPI route that accepts a CSV file and returns summary statistics"],
63
- ["Refactor this list comprehension into a more readable loop with logging: `results = [x**2 for x in data if x > 0]`"],
64
- ["Create a Pydantic model for a user profile with email validation and a custom validator for age > 18"]
65
  ]
66
  )
67
 
 
1
  import os
2
+ import time
3
  import gradio as gr
4
  from huggingface_hub import hf_hub_download
5
  from llama_cpp import Llama
6
 
7
+ # 🔧 CONFIGURATION
8
  MODEL_REPO = "bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF"
9
  MODEL_FILE = "Qwen2.5-Coder-1.5B-Instruct-Q4_K_M.gguf"
 
 
10
 
11
+ print("⏳ Starting Python Dev Assistant Space...")
12
+ START_TIME = time.time()
13
+
14
+ # 1️⃣ Download (only happens on first boot or cache miss)
15
+ print(f"📦 Checking cache for {MODEL_FILE}...")
16
  model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
17
+ print(f"✅ Model cached at: {model_path}")
18
 
19
+ # 2️⃣ Load into RAM (runs ONCE per Space startup)
20
+ print("🧠 Loading model into memory...")
21
  llm = Llama(
22
  model_path=model_path,
23
+ n_ctx=4096,
24
+ n_threads=2,
25
  n_batch=512,
26
  verbose=False,
27
+ use_mlock=True
28
  )
29
+ LOAD_TIME = round(time.time() - START_TIME, 1)
30
+ print(f"🚀 Model loaded in {LOAD_TIME}s. Ready for prompts!")
31
 
32
+ # 3️⃣ Generation function (reuses `llm` every time)
33
  def generate_python_code(user_prompt):
34
+ inference_start = time.time()
35
+ print(f"🔹 Processing prompt at {time.strftime('%H:%M:%S')}")
 
 
 
36
 
37
  messages = [
38
+ {"role": "system", "content": "You are an expert Python developer. Write clean, PEP-8 compliant code with type hints. Output only code unless asked otherwise."},
39
  {"role": "user", "content": user_prompt}
40
  ]
41
 
42
  output = llm.create_chat_completion(
43
  messages=messages,
44
  max_tokens=1024,
45
+ temperature=0.2,
46
  top_p=0.9,
47
  repeat_penalty=1.1,
48
+ stop=["</s>", "```"]
49
  )
50
 
51
+ inference_time = round(time.time() - inference_start, 2)
52
+ print(f"✅ Done in {inference_time}s")
53
  return output["choices"][0]["message"]["content"]
54
 
55
  # 4️⃣ Gradio UI
56
  demo = gr.Interface(
57
  fn=generate_python_code,
58
+ inputs=gr.Textbox(lines=4, placeholder="Describe your Python task..."),
59
+ outputs=gr.Code(language="python"),
 
 
 
 
60
  title="🐍 Python Dev Assistant",
61
+ description=f"Loaded `{MODEL_FILE}` in {LOAD_TIME}s. Model stays in RAM between prompts.",
62
  examples=[
63
+ ["Write a Pydantic v2 model for a User with email validation"],
64
+ ["Create an async retry wrapper for HTTP requests using aiohttp"]
 
65
  ]
66
  )
67