MuhammadNoman7600 committed on
Commit 48e2856 · verified · 1 Parent(s): e493837

Update app.py

Files changed (1): app.py  +369 -106
app.py CHANGED
@@ -1,7 +1,9 @@
 """
 =============================================================================
-Transformers + FastAPI — OpenAI-Compatible Server for Qwen/Qwen3.5-0.8B
-CPU-ONLY • TOOL CALLING • STREAMING • Port 7860 (HF Spaces)
+Transformers + FastAPI — OpenAI-Compatible Server
+Base   : unsloth/qwen2.5-0.5b-unsloth-bnb-4bit
+Adapter: MuhammadNoman7600/mermaid  (LoRA r=16 α=16)
+CPU-ONLY fallback • TOOL CALLING • STREAMING • Port 7860
 =============================================================================
 """
 
@@ -18,28 +20,29 @@ import uvicorn
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, StreamingResponse
+from peft import PeftModel
 from pydantic import BaseModel
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
+    BitsAndBytesConfig,
     TextIteratorStreamer,
 )
 
 # ━━━━━━━━━━━━━━━━━━━━━━━━━━ CONFIG ━━━━━━━━━━━━━━━━━━━━━━━━━━━━
-
-MODEL_NAME = "Qwen/Qwen3-0.6B"
-HOST = "0.0.0.0"
-PORT = 7860
-MAX_NEW_TOKENS = 1024
-
+BASE_MODEL_NAME = "unsloth/qwen2.5-0.5b-unsloth-bnb-4bit"
+ADAPTER_NAME = "MuhammadNoman7600/mermaid"
+DISPLAY_MODEL_NAME = "MuhammadNoman7600/mermaid"
+HOST = "0.0.0.0"
+PORT = 7860
+MAX_NEW_TOKENS = 32768
 # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 
 app = FastAPI(
-    title="Qwen3.5-0.8B OpenAI-Compatible API (CPU)",
-    description="Transformers-powered inference with tool calling — runs on CPU",
+    title="Mermaid Fine-Tuned Qwen2.5-0.5B — OpenAI-Compatible API",
+    description="LoRA adapter MuhammadNoman7600/mermaid on Qwen2.5-0.5B with tool calling",
     version="2.0.0",
 )
-
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -48,8 +51,8 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
-# ━━━━━━━━━━━━━━━━━━━━━━ Pydantic Models ━━━━━━━━━━━━━━━━━━━━━━━
 
+# ━━━━━━━━━━━━━━━━━━━━━━━ Pydantic Models ━━━━━━━━━━━━━━━━━━━━━━
 
 class FunctionDef(BaseModel):
     name: str
@@ -82,7 +85,7 @@ class ChatMessage(BaseModel):
 
 
 class ChatCompletionRequest(BaseModel):
-    model: str = MODEL_NAME
+    model: str = DISPLAY_MODEL_NAME
     messages: list[ChatMessage]
     temperature: Optional[float] = 0.7
     top_p: Optional[float] = 0.9
@@ -98,7 +101,7 @@ class ChatCompletionRequest(BaseModel):
 
 
 class CompletionRequest(BaseModel):
-    model: str = MODEL_NAME
+    model: str = DISPLAY_MODEL_NAME
     prompt: Union[str, list[str]] = ""
     temperature: Optional[float] = 0.7
     top_p: Optional[float] = 0.9
@@ -111,58 +114,109 @@ class CompletionRequest(BaseModel):
     n: Optional[int] = 1
 
 
-# ━━━━━━━━━━━━━━━━━━━ Model Loading (CPU) ━━━━━━━━━━━━━━━━━━━━━━
+# ━━━━━━━━━━━━━━━━━━━ Model Loading ━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 
-tokenizer = None
-model = None
-generate_lock = Lock()
+tokenizer: Any = None
+model: Any = None
+generate_lock = Lock()
+stop_token_ids: list[int] = []
 
 
 def load_model():
-    global tokenizer, model
-
+    global tokenizer, model, stop_token_ids
     if model is not None:
         return
 
-    print(f"\n🚀 Loading model: {MODEL_NAME} on CPU ...")
-    print(f"   HF_HOME = {os.environ.get('HF_HOME', 'default')}\n")
-
-    tokenizer = AutoTokenizer.from_pretrained(
-        MODEL_NAME,
-        use_fast=True,
-    )
+    print(f"\n🚀 Base model  : {BASE_MODEL_NAME}")
+    print(f"🔌 LoRA adapter: {ADAPTER_NAME}")
+    print(f"   HF_HOME = {os.environ.get('HF_HOME', 'default')}\n")
 
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_NAME,
-        torch_dtype=torch.float32,
-        device_map="cpu",
-        trust_remote_code=True,
-    )
+    # ── Tokenizer ───────────────────────────────────────────────
+    # Adapter repos rarely ship a tokenizer; fall back to base.
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(
+            ADAPTER_NAME, use_fast=True, trust_remote_code=True
+        )
+        print("   Tokenizer loaded from adapter repo.")
+    except Exception:
+        tokenizer = AutoTokenizer.from_pretrained(
+            BASE_MODEL_NAME, use_fast=True, trust_remote_code=True
+        )
+        print("   Tokenizer loaded from base model repo.")
+
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    # ── Base model ──────────────────────────────────────────────
+    # Load in 4-bit if CUDA is available (matches training setup),
+    # otherwise fall back to float32 on CPU.
+    use_4bit = torch.cuda.is_available()
+
+    if use_4bit:
+        print("   CUDA detected — loading in 4-bit (bitsandbytes nf4).")
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_compute_dtype=torch.float16,
+        )
+        base = AutoModelForCausalLM.from_pretrained(
+            BASE_MODEL_NAME,
+            quantization_config=bnb_config,
+            device_map="auto",
+            trust_remote_code=True,
+        )
+    else:
+        print("   No CUDA — loading base model in float32 on CPU.")
+        base = AutoModelForCausalLM.from_pretrained(
+            BASE_MODEL_NAME,
+            torch_dtype=torch.float32,
+            device_map="cpu",
+            trust_remote_code=True,
+        )
 
+    # ── Attach LoRA adapter ─────────────────────────────────────
+    print(f"   Attaching LoRA adapter …")
+    model = PeftModel.from_pretrained(
+        base,
+        ADAPTER_NAME,
+        is_trainable=False,  # inference only
+    )
     model.eval()
-    print("✅ Model loaded on CPU!\n")
 
+    # ── Stop-token IDs ──────────────────────────────────────────
+    _stop_ids: set[int] = set()
+    if tokenizer.eos_token_id is not None:
+        _stop_ids.add(tokenizer.eos_token_id)
+    for tok_str in ["<|im_end|>", "<|endoftext|>"]:
+        tid = tokenizer.convert_tokens_to_ids(tok_str)
+        if tid is not None and tid != tokenizer.unk_token_id:
+            _stop_ids.add(tid)
+    stop_token_ids = list(_stop_ids)
 
-# ━━━━━━━━━━━━━━━━━━━━ Tool-Prompt Builder (Hermes) ━━━━━━━━━━━━
+    print(f"   eos_token      = {tokenizer.eos_token!r}")
+    print(f"   stop_token_ids = {stop_token_ids}")
+    print("✅ Fine-tuned model ready!\n")
+
+
+# ━━━━━━━━━━━━━━━━━━━━ Chat-Prompt Builder (ChatML) ━━━━━━━━━━━━
 
 TOOL_SYSTEM_PROMPT_TEMPLATE = """\
 You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
 
 # Tools
+
 You may call one or more functions to assist with the user query.
 
 You are provided with function signatures within <tools></tools> XML tags:
-
 <tools>
 {tool_definitions}
 </tools>
 
 For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
-
 <tool_call>
 {{"name": "<function-name>", "arguments": <args-json-object>}}
-</tool_call>
-"""
+</tool_call>"""
 
 NO_TOOL_SYSTEM_PROMPT = (
     "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."
@@ -171,7 +225,6 @@ NO_TOOL_SYSTEM_PROMPT = (
 
 def _serialize_tool_definitions(tools: list[ToolDef]) -> str:
     lines = []
-
    for t in tools:
         obj: dict[str, Any] = {
             "type": "function",
@@ -180,12 +233,9 @@ def _serialize_tool_definitions(tools: list[ToolDef]) -> str:
                 "description": t.function.description or "",
             },
         }
-
         if t.function.parameters:
             obj["function"]["parameters"] = t.function.parameters
-
         lines.append(json.dumps(obj))
-
     return "\n".join(lines)
 
 
@@ -195,15 +245,15 @@ def build_chat_prompt(
     tool_choice: Optional[Union[str, dict]] = None,
 ) -> str:
     parts: list[str] = []
-
     has_system = any(m.role == "system" for m in messages)
 
-    if tools:
-        default_sys = TOOL_SYSTEM_PROMPT_TEMPLATE.format(
-            tool_definitions=_serialize_tool_definitions(tools),
+    default_sys = (
+        TOOL_SYSTEM_PROMPT_TEMPLATE.format(
+            tool_definitions=_serialize_tool_definitions(tools)
         )
-    else:
-        default_sys = NO_TOOL_SYSTEM_PROMPT
+        if tools
+        else NO_TOOL_SYSTEM_PROMPT
+    )
 
     if not has_system:
         parts.append(f"<|im_start|>system\n{default_sys}<|im_end|>\n")
@@ -212,45 +262,34 @@ def build_chat_prompt(
         role = msg.role
 
         if role == "system":
-            base = msg.content or ""
-
+            base_content = msg.content or ""
             if tools:
                 tool_block = TOOL_SYSTEM_PROMPT_TEMPLATE.format(
-                    tool_definitions=_serialize_tool_definitions(tools),
-                )
-                merged = f"{base}\n\n{tool_block}" if base else tool_block
-
-                parts.append(
-                    f"<|im_start|>system\n{merged}<|im_end|>\n"
+                    tool_definitions=_serialize_tool_definitions(tools)
                 )
+                merged = f"{base_content}\n\n{tool_block}" if base_content else tool_block
+                parts.append(f"<|im_start|>system\n{merged}<|im_end|>\n")
             else:
                 parts.append(
-                    f"<|im_start|>system\n{base or NO_TOOL_SYSTEM_PROMPT}<|im_end|>\n"
+                    f"<|im_start|>system\n{base_content or NO_TOOL_SYSTEM_PROMPT}<|im_end|>\n"
                 )
 
         elif role == "user":
-            parts.append(
-                f"<|im_start|>user\n{msg.content or ''}<|im_end|>\n"
-            )
+            parts.append(f"<|im_start|>user\n{msg.content or ''}<|im_end|>\n")
 
         elif role == "assistant":
             if msg.tool_calls:
                 tc_text = ""
-
                 for tc in msg.tool_calls:
                     args = tc.function.arguments
                     if isinstance(args, dict):
                         args = json.dumps(args)
-
                     tc_text += (
                         f"\n<tool_call>\n"
                         f'{{"name": "{tc.function.name}", "arguments": {args}}}\n'
                         f"</tool_call>"
                     )
-
-                parts.append(
-                    f"<|im_start|>assistant{tc_text}<|im_end|>\n"
-                )
+                parts.append(f"<|im_start|>assistant{tc_text}<|im_end|>\n")
             else:
                 parts.append(
                     f"<|im_start|>assistant\n{msg.content or ''}<|im_end|>\n"
@@ -269,79 +308,303 @@ def build_chat_prompt(
 
 # ━━━━━━━━━━━━━━━━━━ Tool-Call Parser ━━━━━━━━━━━━━━━━━━━━━━━━━━
 
-_TOOL_CALL_RE = re.compile(
-    r"<tool_call>\s*(\{.*?\})\s*</tool_call>",
-    re.DOTALL,
-)
+_TOOL_CALL_RE = re.compile(r"<tool_call>\s*(\{.*?\})\s*</tool_call>", re.DOTALL)
 
 
 def parse_tool_calls(text: str) -> tuple[Optional[str], list[dict]]:
     tool_calls: list[dict] = []
-
     for raw_json in _TOOL_CALL_RE.findall(text):
         try:
             parsed = json.loads(raw_json)
         except json.JSONDecodeError:
             continue
-
         name = parsed.get("name", "")
         arguments = parsed.get("arguments", {})
-
         if isinstance(arguments, dict):
             arguments = json.dumps(arguments)
         elif not isinstance(arguments, str):
             arguments = json.dumps(arguments)
-
-        tool_calls.append(
-            {
-                "id": f"call_{uuid.uuid4().hex[:24]}",
-                "type": "function",
-                "function": {
-                    "name": name,
-                    "arguments": arguments,
-                },
-            }
-        )
-
+        tool_calls.append({
+            "id": f"call_{uuid.uuid4().hex[:24]}",
+            "type": "function",
+            "function": {"name": name, "arguments": arguments},
+        })
     content = _TOOL_CALL_RE.sub("", text).strip() or None
     return content, tool_calls
 
 
-# ━━━━━━━━━━━━━━━━━━ Generation ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+# ━━━━━━━━━━━━━━━━━━ Generation Helpers ━━━━━━━━━━━━━━━━━━━━━━━━
 
-def generate_text(prompt: str, req) -> tuple[str, int, int]:
-    """Generate text on CPU. Returns (text, prompt_tokens, completion_tokens)."""
-
-    inputs = tokenizer(prompt, return_tensors="pt")
-    input_ids = inputs["input_ids"]
+def _clean_output(text: str) -> str:
+    for tok in ["<|im_end|>", "<|im_start|>", "<|endoftext|>"]:
+        text = text.replace(tok, "")
+    return text.strip()
 
-    prompt_tokens = input_ids.shape[1]
-    max_new = req.max_tokens or MAX_NEW_TOKENS
 
-    gen_kwargs = {
-        "input_ids": input_ids,
+def _build_gen_kwargs(inputs: dict, req: Any, streamer=None) -> dict:
+    kwargs: dict[str, Any] = {
+        "input_ids": inputs["input_ids"],
         "attention_mask": inputs.get("attention_mask"),
-        "max_new_tokens": max_new,
+        "max_new_tokens": req.max_tokens or MAX_NEW_TOKENS,
         "do_sample": True,
         "temperature": max(req.temperature, 0.01),
         "top_p": req.top_p,
-        "eos_token_id": tokenizer.convert_tokens_to_ids("<|im_end|>"),
-        "pad_token_id": tokenizer.eos_token_id,
+        "eos_token_id": stop_token_ids,
+        "pad_token_id": tokenizer.pad_token_id,
     }
-
     rep_penalty = getattr(req, "repetition_penalty", 1.0)
     if rep_penalty and rep_penalty > 1.0:
-        gen_kwargs["repetition_penalty"] = rep_penalty
+        kwargs["repetition_penalty"] = rep_penalty
+    if streamer is not None:
+        kwargs["streamer"] = streamer
+    return kwargs
+
+
+def generate_text(prompt: str, req) -> tuple[str, int, int]:
+    inputs = tokenizer(prompt, return_tensors="pt")
+    prompt_tokens = inputs["input_ids"].shape[1]
+    gen_kwargs = _build_gen_kwargs(inputs, req)
 
     with generate_lock:
         with torch.no_grad():
             output_ids = model.generate(**gen_kwargs)
 
     new_ids = output_ids[0][prompt_tokens:]
-    text = tokenizer.decode(new_ids, skip_special_tokens=False)
-
-    for tok in ["<|im_end|>", "<|endoftext|>"]:
-        text = text.replace(tok, "")
-
-    completion_tokens = len(new_ids)
-    return text.strip(), prompt_tokens, completion_tokens
+    text = _clean_output(tokenizer.decode(new_ids, skip_special_tokens=False))
+    return text, prompt_tokens, len(new_ids)
+
+
+def generate_text_stream(prompt: str, req):
+    inputs = tokenizer(prompt, return_tensors="pt")
+    streamer = TextIteratorStreamer(
+        tokenizer, skip_prompt=True, skip_special_tokens=False
+    )
+    gen_kwargs = _build_gen_kwargs(inputs, req, streamer=streamer)
+
+    thread = Thread(target=_generate_in_thread, args=(gen_kwargs,))
+    thread.start()
+
+    for token_text in streamer:
+        if any(s in token_text for s in ["<|im_end|>", "<|endoftext|>"]):
+            cleaned = _clean_output(token_text)
+            if cleaned:
+                yield cleaned
+            break
+        yield token_text
+
+    thread.join()
+
+
+def _generate_in_thread(gen_kwargs: dict):
+    with generate_lock:
+        with torch.no_grad():
+            model.generate(**gen_kwargs)
+
+
+# ━━━━━━━━━━━━━━━━━━ Response Builders ━━━━━━━━━━━━━━━━━━━━━━━━━
+
+def _uid(prefix: str = "chatcmpl") -> str:
+    return f"{prefix}-{uuid.uuid4().hex[:12]}"
+
+
+def make_chat_response(
+    content: Optional[str],
+    tool_calls: list[dict],
+    model_name: str,
+    prompt_tokens: int,
+    completion_tokens: int,
+) -> dict:
+    message: dict[str, Any] = {"role": "assistant"}
+    if tool_calls:
+        message["content"] = content
+        message["tool_calls"] = tool_calls
+        finish_reason = "tool_calls"
+    else:
+        message["content"] = (content or "").strip()
+        finish_reason = "stop"
+    return {
+        "id": _uid(),
+        "object": "chat.completion",
+        "created": int(time.time()),
+        "model": model_name,
+        "choices": [{"index": 0, "message": message, "finish_reason": finish_reason}],
+        "usage": {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": prompt_tokens + completion_tokens,
+        },
+    }
+
+
+def make_completion_response(
+    text: str, model_name: str, prompt_tokens: int, completion_tokens: int
+) -> dict:
+    return {
+        "id": _uid("cmpl"),
+        "object": "text_completion",
+        "created": int(time.time()),
+        "model": model_name,
+        "choices": [{"index": 0, "text": text.strip(), "finish_reason": "stop"}],
+        "usage": {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": prompt_tokens + completion_tokens,
+        },
+    }
+
+
+# ━━━━━━━━━━━━━━━━━━ Streaming Helpers ━━━━━━━━━━━━━━━━━━━━━━━━
+
+def stream_chat_response(prompt: str, req):
+    cid, created = _uid(), int(time.time())
+
+    def _chunk(delta: dict, finish: Optional[str] = None) -> str:
+        return "data: " + json.dumps({
+            "id": cid, "object": "chat.completion.chunk",
+            "created": created, "model": req.model,
+            "choices": [{"index": 0, "delta": delta, "finish_reason": finish}],
+        }) + "\n\n"
+
+    yield _chunk({"role": "assistant"})
+    for token_text in generate_text_stream(prompt, req):
+        if token_text:
+            yield _chunk({"content": token_text})
+    yield _chunk({}, finish="stop")
+    yield "data: [DONE]\n\n"
+
+
+def stream_tool_call_chunks(
+    content: Optional[str], tool_calls: list[dict], model_name: str
+):
+    cid, created = _uid(), int(time.time())
+
+    def _chunk(delta: dict, finish: Optional[str] = None) -> str:
+        return "data: " + json.dumps({
+            "id": cid, "object": "chat.completion.chunk",
+            "created": created, "model": model_name,
+            "choices": [{"index": 0, "delta": delta, "finish_reason": finish}],
+        }) + "\n\n"
+
+    yield _chunk({"role": "assistant"})
+    for idx, tc in enumerate(tool_calls):
+        yield _chunk({"tool_calls": [{
+            "index": idx, "id": tc["id"], "type": "function",
+            "function": {"name": tc["function"]["name"], "arguments": ""},
+        }]})
+        yield _chunk({"tool_calls": [{
+            "index": idx,
+            "function": {"arguments": tc["function"]["arguments"]},
+        }]})
+    if content:
+        yield _chunk({"content": content})
+    yield _chunk({}, finish="tool_calls" if tool_calls else "stop")
+    yield "data: [DONE]\n\n"
+
+
+# ━━━━━━━━━━━━━━━━━━━━━━ ROUTES ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+
+@app.get("/")
+async def root():
+    return {
+        "message": "Mermaid Fine-Tuned Qwen2.5-0.5B OpenAI-Compatible API",
+        "base_model": BASE_MODEL_NAME,
+        "adapter": ADAPTER_NAME,
+        "docs": "/docs",
+        "endpoints": {
+            "models": "/v1/models",
+            "chat": "/v1/chat/completions",
+            "completions": "/v1/completions",
+            "health": "/health",
+        },
+    }
+
+
+@app.get("/v1/models")
+async def list_models():
+    return {
+        "object": "list",
+        "data": [{
+            "id": DISPLAY_MODEL_NAME,
+            "object": "model",
+            "created": int(time.time()),
+            "owned_by": "MuhammadNoman7600",
+        }],
+    }
+
+
+@app.post("/v1/chat/completions")
+async def chat_completions(req: ChatCompletionRequest):
+    try:
+        prompt = build_chat_prompt(req.messages, req.tools, req.tool_choice)
+
+        # Tool-calling: generate fully first, then parse
+        if req.tools:
+            text, prompt_tokens, completion_tokens = generate_text(prompt, req)
+            content, tool_calls = parse_tool_calls(text)
+            if req.stream:
+                return StreamingResponse(
+                    stream_tool_call_chunks(content, tool_calls, req.model),
+                    media_type="text/event-stream",
+                )
+            return JSONResponse(
+                make_chat_response(
+                    content, tool_calls, req.model, prompt_tokens, completion_tokens
+                )
+            )
+
+        # Normal chat with optional streaming
+        if req.stream:
+            return StreamingResponse(
+                stream_chat_response(prompt, req),
+                media_type="text/event-stream",
+            )
+
+        text, prompt_tokens, completion_tokens = generate_text(prompt, req)
+        return JSONResponse(
+            make_chat_response(text, [], req.model, prompt_tokens, completion_tokens)
+        )
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.post("/v1/completions")
+async def completions(req: CompletionRequest):
+    try:
+        prompts = [req.prompt] if isinstance(req.prompt, str) else req.prompt
+        text, prompt_tokens, completion_tokens = generate_text(prompts[0], req)
+        return JSONResponse(
+            make_completion_response(
+                text, req.model, prompt_tokens, completion_tokens
+            )
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.get("/health")
+async def health():
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    return {
+        "status": "ok",
+        "base_model": BASE_MODEL_NAME,
+        "adapter": ADAPTER_NAME,
+        "device": device,
+    }
+
+
+# ━━━━━━━━━━━━━━━━━━━━━━ MAIN ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+
+if __name__ == "__main__":
+    load_model()
+
+    print(f"\n{'='*60}")
+    print(f"  OpenAI-compatible API — Fine-Tuned Mermaid Model")
+    print(f"  Base   : {BASE_MODEL_NAME}")
+    print(f"  Adapter: {ADAPTER_NAME}")
+    device_label = "CUDA (4-bit bitsandbytes)" if torch.cuda.is_available() else "CPU (float32)"
+    print(f"  Device : {device_label}")
+    print(f"  URL    : http://{HOST}:{PORT}/v1")
+    print(f"{'='*60}\n")
+
+    uvicorn.run(app, host=HOST, port=PORT, log_level="info")
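
For reference, a minimal client-side sketch against the endpoints added above (not part of the commit; it assumes the server is reachable at http://localhost:7860, that the openai Python package v1.x is installed, and uses a dummy api_key since app.py does no auth):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:7860/v1", api_key="not-needed")

# Plain chat completion against /v1/chat/completions
resp = client.chat.completions.create(
    model="MuhammadNoman7600/mermaid",
    messages=[{"role": "user", "content": "Draw a mermaid flowchart for a login flow."}],
)
print(resp.choices[0].message.content)

# Streaming: consumes the chat.completion.chunk SSE events emitted by
# stream_chat_response(); the final empty delta and [DONE] are handled
# by the client library.
stream = client.chat.completions.create(
    model="MuhammadNoman7600/mermaid",
    messages=[{"role": "user", "content": "Hello!"}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.content:
        print(delta.content, end="", flush=True)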
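
And a sketch of the tool-calling path, where the server prompts the model with the Hermes-style <tool_call> format and parse_tool_calls() converts any matches into OpenAI-style tool_calls. The get_weather tool is hypothetical, purely to exercise that path; same local-server assumptions as above:

import json
from openai import OpenAI

client = OpenAI(base_url="http://localhost:7860/v1", api_key="not-needed")

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",  # hypothetical tool, not defined in app.py
        "description": "Get the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

resp = client.chat.completions.create(
    model="MuhammadNoman7600/mermaid",
    messages=[{"role": "user", "content": "What's the weather in Lahore?"}],
    tools=tools,
)

msg = resp.choices[0].message
if msg.tool_calls:  # server sets finish_reason="tool_calls" when a call parses
    call = msg.tool_calls[0]
    print(call.function.name, json.loads(call.function.arguments))
else:
    print(msg.content)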