Sonai12aa committed
Commit d989d1f · verified · 1 Parent(s): c0e3bda

Update main.py

Files changed (1)
  1. main.py +25 -16
main.py CHANGED
@@ -17,17 +17,25 @@ app.add_middleware(
 
 MODEL_ID = "Sonai12aa/qwen2.5-1.5b-godot"
 
+SYSTEM_PROMPT = """You are GameFroze AI, a focused Godot Engine specialist.
+
+Rules:
+- Answer ONLY Godot Engine, GDScript, C#, game development, shaders, scenes, nodes, and debugging questions.
+- Be concise and practical. Prefer step-by-step help and short code examples.
+- Do NOT ask personal questions.
+- Do NOT talk about being an AI model or say you lack personal experience.
+- If the user asks something unrelated, briefly redirect them back to Godot topics.
+"""
+
 print("--- Loading tokenizer ---")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 
 use_cuda = torch.cuda.is_available()
 print(f"--- CUDA available: {use_cuda} ---")
 
-model_kwargs = {
-    "low_cpu_mem_usage": True,
-}
+model_kwargs = {"low_cpu_mem_usage": True}
 
-# Only use 4-bit quantization if CUDA is available
+# Only use 4-bit quantization if CUDA is available
 if use_cuda:
     from transformers import BitsAndBytesConfig
 
@@ -35,29 +43,31 @@ if use_cuda:
         load_in_4bit=True,
         bnb_4bit_use_double_quant=True,
         bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_dtype=torch.float16,  # safer than bf16
+        bnb_4bit_compute_dtype=torch.float16,
     )
     model_kwargs["quantization_config"] = bnb_config
     model_kwargs["device_map"] = "auto"
 else:
-    # CPU fallback (may be slow; but it should run)
+    # CPU fallback (slow but works)
    model_kwargs["device_map"] = {"": "cpu"}
     model_kwargs["torch_dtype"] = torch.float32
 
 print("--- Loading model ---")
 model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_kwargs)
+model.eval()
 print("--- Model Loaded Successfully ---")
 
 
 class ChatRequest(BaseModel):
     prompt: str
-    max_tokens: int = 150
+    max_tokens: int = 96  # smaller = faster on CPU
 
 
 @app.get("/")
 def health_check():
     return {"status": "online", "model": MODEL_ID, "cuda": use_cuda}
 
+
 @app.post("/chat")
 async def chat(request: ChatRequest):
     user_text = request.prompt.strip()
@@ -67,11 +77,11 @@ async def chat(request: ChatRequest):
         {"role": "user", "content": user_text},
     ]
 
-    # Qwen expects chat-formatted inputs
+    # Qwen expects chat-formatted inputs
     chat_text = tokenizer.apply_chat_template(
         messages,
         tokenize=False,
-        add_generation_prompt=True
+        add_generation_prompt=True,
    )
 
     inputs = tokenizer(chat_text, return_tensors="pt")
@@ -81,8 +91,8 @@ async def chat(request: ChatRequest):
     with torch.inference_mode():
         outputs = model.generate(
             **inputs,
-            max_new_tokens=min(request.max_tokens, 96),  # keep it tighter
-            do_sample=False,  # less ramble
+            max_new_tokens=min(request.max_tokens, 96),
+            do_sample=False,  # deterministic (less ramble + faster)
             use_cache=True,
             eos_token_id=tokenizer.eos_token_id,
             pad_token_id=tokenizer.eos_token_id,
@@ -90,15 +100,14 @@ async def chat(request: ChatRequest):
 
     decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-    # Remove the prompt portion if it got echoed
-    # (works fine even if it doesn't echo)
-    response_text = decoded.split(user_text)[-1].strip()
+    # Try to remove echoed prompt; fallback to full decoded if split fails
+    response_text = decoded.split(user_text)[-1].strip() if user_text else decoded.strip()
 
     return {"response": response_text}
 
 
-
 if __name__ == "__main__":
     import uvicorn
-    port = int(os.environ.get("PORT", "7860"))  # ✅ HF port
+
+    port = int(os.environ.get("PORT", "7860"))
     uvicorn.run(app, host="0.0.0.0", port=port)
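
The new SYSTEM_PROMPT constant is added in the first hunk, but the line that feeds it into the chat messages sits outside the visible diff context. A minimal sketch, assuming the prompt is passed as the system-role message inside the /chat handler (the system entry below is an assumption, not shown in the hunks):

    # Sketch only: the system-role entry is assumed, not visible in the diff.
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_text},
    ]
    chat_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )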
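
For quick testing, a small client call against the updated endpoint; the URL is a placeholder (the server binds to 0.0.0.0:7860 by default), and any max_tokens above 96 is clamped by the min(request.max_tokens, 96) call in generate():

    import requests

    # Placeholder URL: substitute the actual Space or server address.
    resp = requests.post(
        "http://localhost:7860/chat",
        json={"prompt": "How do I connect a signal in GDScript?", "max_tokens": 96},
    )
    print(resp.json()["response"])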