MuhammadNoman7600 committed
Commit e832067 · verified
1 Parent(s): 35e71e1

Update app.py

Files changed (1):
  1. app.py +487 -105

app.py CHANGED
@@ -1,14 +1,17 @@
  """
  =============================================================================
- SmolLM2-360M — Ultra FAST OpenAI-Compatible API (CPU)
- NO TOOL CALLING • LOW LATENCY • STREAMING ENABLED
  =============================================================================
  """

  import time
  import uuid
- from threading import Lock, Thread
- from typing import Optional, Union

  import torch
  import uvicorn
@@ -17,16 +20,20 @@ from fastapi.middleware.cors import CORSMiddleware
  from fastapi.responses import JSONResponse, StreamingResponse
  from pydantic import BaseModel
  from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

- # ━━━━━━━━━━━━━━━━━━━ CONFIG ━━━━━━━━━━━━━━━━━━━
- MODEL_NAME = "HuggingFaceTB/SmolLM2-360M"
  HOST = "0.0.0.0"
  PORT = 7860
- MAX_NEW_TOKENS = 128 # 🔥 FAST
- # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
-
- app = FastAPI(title="SmolLM2-360M Fast API", version="1.0")

  app.add_middleware(
      CORSMiddleware,
      allow_origins=["*"],
@@ -35,180 +42,555 @@ app.add_middleware(
      allow_headers=["*"],
  )

- # ━━━━━━━━━━━━━━━━━━━ MODELS ━━━━━━━━━━━━━━━━━━━

  class ChatMessage(BaseModel):
      role: str
      content: Optional[str] = None


  class ChatCompletionRequest(BaseModel):
      model: str = MODEL_NAME
      messages: list[ChatMessage]
-     temperature: Optional[float] = 0.3
-     max_tokens: Optional[int] = 128
      stream: Optional[bool] = False


  class CompletionRequest(BaseModel):
      model: str = MODEL_NAME
-     prompt: Union[str, list[str]]
-     max_tokens: Optional[int] = 128


- # ━━━━━━━━━━━━━━━━━━━ LOAD MODEL ━━━━━━━━━━━━━━━━━━━

  tokenizer = None
  model = None
- lock = Lock()


  def load_model():
      global tokenizer, model

-     print(f"🚀 Loading {MODEL_NAME} ...")

      tokenizer = AutoTokenizer.from_pretrained(
          MODEL_NAME,
-         use_fast=True,
      )

      model = AutoModelForCausalLM.from_pretrained(
          MODEL_NAME,
          torch_dtype=torch.float32,
          device_map="cpu",
-         low_cpu_mem_usage=True,
      )
-
      model.eval()
-     print("✅ Model loaded!")
-

- # ━━━━━━━━━━━━━━━━━━━ PROMPT BUILDER ━━━━━━━━━━━━━━━━━━━

- def build_prompt(messages):
-     prompt = ""

-     for m in messages:
-         if m.role == "user":
-             prompt += f"User: {m.content}\n"
-         elif m.role == "assistant":
-             prompt += f"Assistant: {m.content}\n"

-     prompt += "Assistant: "
-     return prompt


- # ━━━━━━━━━━━━━━━━━━━ GENERATION ━━━━━━━━━━━━━━━━━━━

- def generate(prompt, req):
-     inputs = tokenizer(prompt, return_tensors="pt")

-     with lock:
-         with torch.no_grad():
-             output = model.generate(
-                 **inputs,
-                 max_new_tokens=req.max_tokens or MAX_NEW_TOKENS,
-                 do_sample=False, # 🔥 stable + fast
-                 pad_token_id=tokenizer.eos_token_id,
-             )

-     text = tokenizer.decode(output[0], skip_special_tokens=True)
-     return text[len(prompt):].strip()


- def generate_stream(prompt, req):
-     inputs = tokenizer(prompt, return_tensors="pt")

-     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
-
-     def run():
-         with lock:
-             with torch.no_grad():
-                 model.generate(
-                     **inputs,
-                     max_new_tokens=req.max_tokens or MAX_NEW_TOKENS,
-                     do_sample=False,
-                     streamer=streamer,
-                     pad_token_id=tokenizer.eos_token_id,
                  )

-     Thread(target=run).start()

-     for token in streamer:
-         yield token


- # ━━━━━━━━━━━━━━━━━━━ HELPERS ━━━━━━━━━━━━━━━━━━━

- def uid():
-     return f"chatcmpl-{uuid.uuid4().hex[:10]}"


- # ━━━━━━━━━━━━━━━━━━━ ROUTES ━━━━━━━━━━━━━━━━━━━

- @app.get("/")
- def root():
-     return {"message": "SmolLM2 Fast API running 🚀"}


- @app.get("/health")
- def health():
-     return {"status": "ok", "model": MODEL_NAME}


- @app.post("/v1/chat/completions")
- async def chat(req: ChatCompletionRequest):
-     try:
-         prompt = build_prompt(req.messages)

-         if req.stream:
-             def stream():
-                 yield f'data: {{"id":"{uid()}","choices":[{{"delta":{{"role":"assistant"}}}}]}}\n\n'

-                 for token in generate_stream(prompt, req):
-                     yield f'data: {{"choices":[{{"delta":{{"content":"{token}"}}}}]}}\n\n'

-                 yield "data: [DONE]\n\n"

-             return StreamingResponse(stream(), media_type="text/event-stream")

-         text = generate(prompt, req)

-         return JSONResponse({
-             "id": uid(),
-             "object": "chat.completion",
-             "choices": [{
-                 "index": 0,
-                 "message": {"role": "assistant", "content": text},
-                 "finish_reason": "stop"
              }]
          })

      except Exception as e:
-         raise HTTPException(500, str(e))


  @app.post("/v1/completions")
- async def completion(req: CompletionRequest):
      try:
-         prompt = req.prompt if isinstance(req.prompt, str) else req.prompt[0]
-         text = generate(prompt, req)
-
-         return {
-             "id": uid(),
-             "object": "text_completion",
-             "choices": [{"text": text}]
-         }

      except Exception as e:
-         raise HTTPException(500, str(e))


- # ━━━━━━━━━━━━━━━━━━━ MAIN ━━━━━━━━━━━━━━━━━━━

  if __name__ == "__main__":
      load_model()

-     print(f"🔥 Running on http://{HOST}:{PORT}")
-     uvicorn.run(app, host=HOST, port=PORT)
  """
  =============================================================================
+ Transformers + FastAPI — OpenAI-Compatible Server for Qwen/Qwen3.5-0.8B
+ CPU-ONLY • TOOL CALLING • STREAMING • Port 7860 (HF Spaces)
  =============================================================================
  """

+ import json
+ import os
+ import re
  import time
  import uuid
+ from threading import Lock
+ from typing import Any, Optional, Union

  import torch
  import uvicorn
  from fastapi.responses import JSONResponse, StreamingResponse
  from pydantic import BaseModel
  from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+ from threading import Thread

+ # ━━━━━━━━━━━━━━━━━━━━━━━━━━ CONFIG ━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+ MODEL_NAME = "Qwen/Qwen3.5-0.8B"
  HOST = "0.0.0.0"
  PORT = 7860
+ MAX_NEW_TOKENS = 1024
+ # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

+ app = FastAPI(
+     title="Qwen3.5-0.8B OpenAI-Compatible API (CPU)",
+     description="Transformers-powered inference with tool calling — runs on CPU",
+     version="2.0.0",
+ )
  app.add_middleware(
      CORSMiddleware,
      allow_origins=["*"],
      allow_headers=["*"],
  )

+ # ━━━━━━━━━━━━━━━━━━━━━━ Pydantic Models ━━━━━━━━━━━━━━━━━━━━━━━


+ class FunctionDef(BaseModel):
+     name: str
+     description: Optional[str] = ""
+     parameters: Optional[dict] = None


+ class ToolDef(BaseModel):
+     type: str = "function"
+     function: FunctionDef


+ class FunctionCallModel(BaseModel):
+     name: str
+     arguments: str


+ class ToolCallObj(BaseModel):
+     id: str
+     type: str = "function"
+     function: FunctionCallModel


  class ChatMessage(BaseModel):
      role: str
      content: Optional[str] = None
+     tool_calls: Optional[list[ToolCallObj]] = None
+     tool_call_id: Optional[str] = None
+     name: Optional[str] = None


  class ChatCompletionRequest(BaseModel):
      model: str = MODEL_NAME
      messages: list[ChatMessage]
+     temperature: Optional[float] = 0.7
+     top_p: Optional[float] = 0.9
+     max_tokens: Optional[int] = 1024
      stream: Optional[bool] = False
+     stop: Optional[Union[str, list[str]]] = None
+     frequency_penalty: Optional[float] = 0.0
+     presence_penalty: Optional[float] = 0.0
+     repetition_penalty: Optional[float] = 1.0
+     n: Optional[int] = 1
+     tools: Optional[list[ToolDef]] = None
+     tool_choice: Optional[Union[str, dict]] = None


  class CompletionRequest(BaseModel):
      model: str = MODEL_NAME
+     prompt: Union[str, list[str]] = ""
+     temperature: Optional[float] = 0.7
+     top_p: Optional[float] = 0.9
+     max_tokens: Optional[int] = 512
+     stream: Optional[bool] = False
+     stop: Optional[Union[str, list[str]]] = None
+     frequency_penalty: Optional[float] = 0.0
+     presence_penalty: Optional[float] = 0.0
+     repetition_penalty: Optional[float] = 1.0
+     n: Optional[int] = 1


+ # ━━━━━━━━━━━━━━━━━━━ Model Loading (CPU) ━━━━━━━━━━━━━━━━━━━━━━

  tokenizer = None
  model = None
+ generate_lock = Lock()


  def load_model():
      global tokenizer, model
+     if model is not None:
+         return

+     print(f"\n🚀 Loading model: {MODEL_NAME} on CPU ...")
+     print(f" HF_HOME = {os.environ.get('HF_HOME', 'default')}\n")

      tokenizer = AutoTokenizer.from_pretrained(
          MODEL_NAME,
+         trust_remote_code=True,
      )

      model = AutoModelForCausalLM.from_pretrained(
          MODEL_NAME,
          torch_dtype=torch.float32,
          device_map="cpu",
+         trust_remote_code=True,
      )
      model.eval()

+     print("✅ Model loaded on CPU!\n")


+ # ━━━━━━━━━━━━━━━━━━━━ Tool-Prompt Builder (Hermes) ━━━━━━━━━━━━

+ TOOL_SYSTEM_PROMPT_TEMPLATE = """\
+ You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
+
+ # Tools
+
+ You may call one or more functions to assist with the user query.
+
+ You are provided with function signatures within <tools></tools> XML tags:
+ <tools>
+ {tool_definitions}
+ </tools>
+
+ For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
+ <tool_call>
+ {{"name": "<function-name>", "arguments": <args-json-object>}}
+ </tool_call>"""

+ NO_TOOL_SYSTEM_PROMPT = (
+     "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."
+ )


+ def _serialize_tool_definitions(tools: list[ToolDef]) -> str:
+     lines = []
+     for t in tools:
+         obj: dict[str, Any] = {
+             "type": "function",
+             "function": {
+                 "name": t.function.name,
+                 "description": t.function.description or "",
+             },
+         }
+         if t.function.parameters:
+             obj["function"]["parameters"] = t.function.parameters
+         lines.append(json.dumps(obj))
+     return "\n".join(lines)


+ def build_chat_prompt(
+     messages: list[ChatMessage],
+     tools: Optional[list[ToolDef]] = None,
+     tool_choice: Optional[Union[str, dict]] = None,
+ ) -> str:
+     parts: list[str] = []
+     has_system = any(m.role == "system" for m in messages)

+     if tools:
+         default_sys = TOOL_SYSTEM_PROMPT_TEMPLATE.format(
+             tool_definitions=_serialize_tool_definitions(tools),
+         )
+     else:
+         default_sys = NO_TOOL_SYSTEM_PROMPT

+     if not has_system:
+         parts.append(f"<|im_start|>system\n{default_sys}<|im_end|>\n")

+     for msg in messages:
+         role = msg.role

+         if role == "system":
+             base = msg.content or ""
+             if tools:
+                 tool_block = TOOL_SYSTEM_PROMPT_TEMPLATE.format(
+                     tool_definitions=_serialize_tool_definitions(tools),
+                 )
+                 merged = f"{base}\n\n{tool_block}" if base else tool_block
+                 parts.append(f"<|im_start|>system\n{merged}<|im_end|>\n")
+             else:
+                 parts.append(
+                     f"<|im_start|>system\n{base or NO_TOOL_SYSTEM_PROMPT}<|im_end|>\n"
+                 )

+         elif role == "user":
+             parts.append(f"<|im_start|>user\n{msg.content or ''}<|im_end|>\n")

+         elif role == "assistant":
+             if msg.tool_calls:
+                 tc_text = ""
+                 for tc in msg.tool_calls:
+                     args = tc.function.arguments
+                     if isinstance(args, dict):
+                         args = json.dumps(args)
+                     tc_text += (
+                         f"\n<tool_call>\n"
+                         f'{{"name": "{tc.function.name}", "arguments": {args}}}\n'
+                         f"</tool_call>"
+                     )
+                 parts.append(f"<|im_start|>assistant{tc_text}<|im_end|>\n")
+             else:
+                 parts.append(
+                     f"<|im_start|>assistant\n{msg.content or ''}<|im_end|>\n"
                  )

+         elif role == "tool":
+             parts.append(
+                 f"<|im_start|>user\n"
+                 f"<tool_response>\n{msg.content or ''}\n</tool_response>"
+                 f"<|im_end|>\n"
+             )

+     parts.append("<|im_start|>assistant\n")
+     return "".join(parts)


+ # ━━━━━━━━━━━━━━━━━━ Tool-Call Parser ━━━━━━━━━━━━━━━━━━━━━━━━━━

+ _TOOL_CALL_RE = re.compile(
+     r"<tool_call>\s*(\{.*?\})\s*</tool_call>",
+     re.DOTALL,
+ )


+ def parse_tool_calls(text: str) -> tuple[Optional[str], list[dict]]:
+     tool_calls: list[dict] = []

+     for raw_json in _TOOL_CALL_RE.findall(text):
+         try:
+             parsed = json.loads(raw_json)
+         except json.JSONDecodeError:
+             continue

+         name = parsed.get("name", "")
+         arguments = parsed.get("arguments", {})
+         if isinstance(arguments, dict):
+             arguments = json.dumps(arguments)
+         elif not isinstance(arguments, str):
+             arguments = json.dumps(arguments)

+         tool_calls.append({
+             "id": f"call_{uuid.uuid4().hex[:24]}",
+             "type": "function",
+             "function": {
+                 "name": name,
+                 "arguments": arguments,
+             },
+         })

+     content = _TOOL_CALL_RE.sub("", text).strip() or None
+     return content, tool_calls


+ # ━━━━━━━━━━━━━━━━━━ Generation ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

+ def generate_text(prompt: str, req) -> tuple[str, int, int]:
+     """Generate text on CPU. Returns (text, prompt_tokens, completion_tokens)."""
+     inputs = tokenizer(prompt, return_tensors="pt")
+     input_ids = inputs["input_ids"]
+     prompt_tokens = input_ids.shape[1]

+     max_new = req.max_tokens or MAX_NEW_TOKENS

+     # Build generation kwargs
+     gen_kwargs = {
+         "input_ids": input_ids,
+         "attention_mask": inputs.get("attention_mask"),
+         "max_new_tokens": max_new,
+         "do_sample": True,
+         "temperature": max(req.temperature, 0.01),
+         "top_p": req.top_p,
+         "eos_token_id": tokenizer.convert_tokens_to_ids("<|im_end|>"),
+         "pad_token_id": tokenizer.eos_token_id,
+     }

+     rep_penalty = getattr(req, "repetition_penalty", 1.0)
+     if rep_penalty and rep_penalty > 1.0:
+         gen_kwargs["repetition_penalty"] = rep_penalty

+     with generate_lock:
+         with torch.no_grad():
+             output_ids = model.generate(**gen_kwargs)

+     # Slice off the prompt tokens
+     new_ids = output_ids[0][prompt_tokens:]
+     text = tokenizer.decode(new_ids, skip_special_tokens=False)

+     # Clean trailing special tokens
+     for tok in ["<|im_end|>", "<|endoftext|>"]:
+         text = text.replace(tok, "")

+     completion_tokens = len(new_ids)
+     return text.strip(), prompt_tokens, completion_tokens


+ def generate_text_stream(prompt: str, req):
+     """Generator that yields tokens one-by-one for streaming."""
+     inputs = tokenizer(prompt, return_tensors="pt")
+     input_ids = inputs["input_ids"]

+     max_new = req.max_tokens or MAX_NEW_TOKENS

+     streamer = TextIteratorStreamer(
+         tokenizer, skip_prompt=True, skip_special_tokens=False
+     )

+     gen_kwargs = {
+         "input_ids": input_ids,
+         "attention_mask": inputs.get("attention_mask"),
+         "max_new_tokens": max_new,
+         "do_sample": True,
+         "temperature": max(req.temperature, 0.01),
+         "top_p": req.top_p,
+         "eos_token_id": tokenizer.convert_tokens_to_ids("<|im_end|>"),
+         "pad_token_id": tokenizer.eos_token_id,
+         "streamer": streamer,
+     }

+     rep_penalty = getattr(req, "repetition_penalty", 1.0)
+     if rep_penalty and rep_penalty > 1.0:
+         gen_kwargs["repetition_penalty"] = rep_penalty

+     thread = Thread(target=_generate_in_thread, args=(gen_kwargs,))
+     thread.start()

+     for token_text in streamer:
+         # Stop on special tokens
+         if "<|im_end|>" in token_text or "<|endoftext|>" in token_text:
+             cleaned = token_text.replace("<|im_end|>", "").replace("<|endoftext|>", "")
+             if cleaned:
+                 yield cleaned
+             break
+         yield token_text

+     thread.join()


+ def _generate_in_thread(gen_kwargs):
+     with generate_lock:
+         with torch.no_grad():
+             model.generate(**gen_kwargs)


+ # ━━━━━━━━━━━━━━━━━━ Response Builders ━━━━━━━━━━━━━━━━━━━━━━━━━

+ def _uid(prefix: str = "chatcmpl") -> str:
+     return f"{prefix}-{uuid.uuid4().hex[:12]}"


+ def make_chat_response(
+     content: Optional[str],
+     tool_calls: list[dict],
+     model_name: str,
+     prompt_tokens: int,
+     completion_tokens: int,
+ ) -> dict:
+     message: dict[str, Any] = {"role": "assistant"}

+     if tool_calls:
+         message["content"] = content
+         message["tool_calls"] = tool_calls
+         finish_reason = "tool_calls"
+     else:
+         message["content"] = (content or "").strip()
+         finish_reason = "stop"

+     return {
+         "id": _uid(),
+         "object": "chat.completion",
+         "created": int(time.time()),
+         "model": model_name,
+         "choices": [{
+             "index": 0,
+             "message": message,
+             "finish_reason": finish_reason,
+         }],
+         "usage": {
+             "prompt_tokens": prompt_tokens,
+             "completion_tokens": completion_tokens,
+             "total_tokens": prompt_tokens + completion_tokens,
+         },
+     }


+ def make_completion_response(
+     text: str, model_name: str, prompt_tokens: int, completion_tokens: int
+ ) -> dict:
+     return {
+         "id": _uid("cmpl"),
+         "object": "text_completion",
+         "created": int(time.time()),
+         "model": model_name,
+         "choices": [{"index": 0, "text": text.strip(), "finish_reason": "stop"}],
+         "usage": {
+             "prompt_tokens": prompt_tokens,
+             "completion_tokens": completion_tokens,
+             "total_tokens": prompt_tokens + completion_tokens,
+         },
+     }


+ # ━━━━━━━━━━━━━━━━━━ Streaming Helpers ━━━━━━━━━━━━━━━━━━━━━━━━

+ def stream_chat_response(prompt: str, req):
+     """SSE streaming for non-tool-call chat completions."""
+     cid = _uid()
+     created = int(time.time())

+     def _chunk(delta: dict, finish: Optional[str] = None) -> str:
+         return "data: " + json.dumps({
+             "id": cid,
+             "object": "chat.completion.chunk",
+             "created": created,
+             "model": req.model,
+             "choices": [{"index": 0, "delta": delta, "finish_reason": finish}],
+         }) + "\n\n"

+     yield _chunk({"role": "assistant"})

+     for token_text in generate_text_stream(prompt, req):
+         if token_text:
+             yield _chunk({"content": token_text})

+     yield _chunk({}, finish="stop")
+     yield "data: [DONE]\n\n"


+ def stream_tool_call_chunks(
+     content: Optional[str],
+     tool_calls: list[dict],
+     model_name: str,
+ ):
+     """SSE streaming for tool-call responses (post-generation)."""
+     cid = _uid()
+     created = int(time.time())

+     def _chunk(delta: dict, finish: Optional[str] = None) -> str:
+         return "data: " + json.dumps({
+             "id": cid,
+             "object": "chat.completion.chunk",
+             "created": created,
+             "model": model_name,
+             "choices": [{"index": 0, "delta": delta, "finish_reason": finish}],
+         }) + "\n\n"

+     yield _chunk({"role": "assistant"})

+     for idx, tc in enumerate(tool_calls):
+         yield _chunk({
+             "tool_calls": [{
+                 "index": idx,
+                 "id": tc["id"],
+                 "type": "function",
+                 "function": {"name": tc["function"]["name"], "arguments": ""},
              }]
          })
+         yield _chunk({
+             "tool_calls": [{
+                 "index": idx,
+                 "function": {"arguments": tc["function"]["arguments"]},
+             }]
+         })

+     if content:
+         yield _chunk({"content": content})

+     yield _chunk({}, finish="tool_calls" if tool_calls else "stop")
+     yield "data: [DONE]\n\n"


+ # ━━━━━━━━━━━━━━━━━━━━━━ ROUTES ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

+ @app.get("/")
+ async def root():
+     return {
+         "message": "Qwen3.5-0.8B OpenAI-Compatible API (CPU) with Tool Calling",
+         "docs": "/docs",
+         "endpoints": {
+             "models": "/v1/models",
+             "chat": "/v1/chat/completions",
+             "completions": "/v1/completions",
+             "health": "/health",
+         },
+     }


+ @app.get("/v1/models")
+ async def list_models():
+     return {
+         "object": "list",
+         "data": [{
+             "id": MODEL_NAME,
+             "object": "model",
+             "created": int(time.time()),
+             "owned_by": "local",
+         }],
+     }


+ @app.post("/v1/chat/completions")
+ async def chat_completions(req: ChatCompletionRequest):
+     try:
+         prompt = build_chat_prompt(req.messages, req.tools, req.tool_choice)

+         # ── Tool-calling path (generate fully, then parse) ──
+         if req.tools:
+             text, prompt_tokens, completion_tokens = generate_text(prompt, req)
+             content, tool_calls = parse_tool_calls(text)

+             if req.stream:
+                 return StreamingResponse(
+                     stream_tool_call_chunks(content, tool_calls, req.model),
+                     media_type="text/event-stream",
+                 )
+             return JSONResponse(
+                 make_chat_response(
+                     content, tool_calls, req.model, prompt_tokens, completion_tokens
+                 )
+             )

+         # ── Normal chat (supports true token-by-token streaming) ──
+         if req.stream:
+             return StreamingResponse(
+                 stream_chat_response(prompt, req),
+                 media_type="text/event-stream",
+             )

+         text, prompt_tokens, completion_tokens = generate_text(prompt, req)
+         return JSONResponse(
+             make_chat_response(text, [], req.model, prompt_tokens, completion_tokens)
+         )

      except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))


  @app.post("/v1/completions")
+ async def completions(req: CompletionRequest):
      try:
+         prompts = [req.prompt] if isinstance(req.prompt, str) else req.prompt
+         prompt = prompts[0]
+         text, prompt_tokens, completion_tokens = generate_text(prompt, req)

+         return JSONResponse(
+             make_completion_response(text, req.model, prompt_tokens, completion_tokens)
+         )
      except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))


+ @app.get("/health")
+ async def health():
+     return {"status": "ok", "model": MODEL_NAME, "device": "cpu"}


+ # ━━━━━━━━━━━━━━━━━━━━━━ MAIN ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

  if __name__ == "__main__":
      load_model()

+     print(f"\n{'='*60}")
+     print(f" OpenAI-compatible API with TOOL CALLING (CPU)")
+     print(f" Model: {MODEL_NAME}")
+     print(f" Device: CPU")
+     print(f" URL: http://{HOST}:{PORT}/v1")
+     print(f"{'='*60}\n")

+     uvicorn.run(app, host=HOST, port=PORT, log_level="info")
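Example request (not part of the commit): a minimal sketch of how a client might exercise the new /v1/chat/completions tool-calling path added above. It assumes the server is reachable at http://localhost:7860, that the requests package is installed on the client side, and it uses a hypothetical get_weather tool purely for illustration; the payload shape follows the OpenAI-style schema defined by ChatCompletionRequest.

import json
import requests  # assumed client-side dependency, not part of the Space

payload = {
    "model": "Qwen/Qwen3.5-0.8B",
    "messages": [{"role": "user", "content": "What is the weather in Paris?"}],
    "tools": [{
        "type": "function",
        "function": {
            "name": "get_weather",  # hypothetical tool, for illustration only
            "description": "Look up current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }],
}

# Non-streaming call; the server generates fully, then parses <tool_call> blocks.
resp = requests.post("http://localhost:7860/v1/chat/completions", json=payload, timeout=300)
choice = resp.json()["choices"][0]
print(choice["finish_reason"])          # "tool_calls" if the model invoked the tool
print(json.dumps(choice["message"], indent=2))

With "stream": true in the payload, the server instead returns Server-Sent Events chunks ("chat.completion.chunk" objects) terminated by a "data: [DONE]" line, as implemented in stream_chat_response and stream_tool_call_chunks.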