MuhammadNoman7600 commited on
Commit
898bb4e
·
verified ·
1 Parent(s): 304a174

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +596 -37
app.py CHANGED
@@ -1,37 +1,596 @@
1
- # ============================================================
2
- # Dockerfile β€” Qwen3.5-0.8B CPU-Only API for HF Spaces
3
- # No GPU required. Port 7860.
4
- # ============================================================
5
-
6
- FROM python:3.11-slim
7
-
8
- # ── System deps ──
9
- RUN apt-get update && \
10
- apt-get install -y --no-install-recommends git && \
11
- rm -rf /var/lib/apt/lists/*
12
-
13
- # ── Python deps (CPU-only torch β€” no CUDA bloat) ──
14
- RUN pip install --no-cache-dir \
15
- torch --index-url https://download.pytorch.org/whl/cpu
16
-
17
- RUN pip install --no-cache-dir \
18
- transformers \
19
- accelerate \
20
- fastapi \
21
- uvicorn \
22
- pydantic \
23
- huggingface_hub
24
-
25
- # ── Pre-download model at build time (~1.8 GB baked into image) ──
26
- ENV HF_HOME=/tmp/hf_cache
27
- RUN python3 -c "\
28
- from huggingface_hub import snapshot_download; \
29
- snapshot_download('Qwen/Qwen3.5-0.8B', cache_dir='/tmp/hf_cache')"
30
-
31
- # ── Copy app ──
32
- WORKDIR /app
33
- COPY app.py .
34
-
35
- EXPOSE 7860
36
-
37
- CMD ["python3", "app.py"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
=============================================================================
Transformers + FastAPI — OpenAI-Compatible Server for Qwen2.5-0.5B-Instruct
CPU-ONLY | TOOL CALLING | STREAMING | Port 7860 (HF Spaces)
=============================================================================
"""

import json
import os
import re
import time
import uuid
from threading import Lock, Thread
from typing import Any, Optional, Union

import torch
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# ━━━━━━━━━━━━━━━━━━━━━━━━━━ CONFIG ━━━━━━━━━━━━━━━━━━━━━━━━━━━━
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"  # the model actually served
HOST = "0.0.0.0"
PORT = 7860  # HF Spaces routes external traffic to port 7860
MAX_NEW_TOKENS = 1024  # fallback budget when a request omits max_tokens
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

app = FastAPI(
    # Derive the title from MODEL_NAME so the docs cannot drift from the
    # served model (it was previously hard-coded as "Qwen3.5-0.8B").
    title=f"{MODEL_NAME} OpenAI-Compatible API (CPU)",
    description="Transformers-powered inference with tool calling — runs on CPU",
    version="2.0.0",
)
# Open CORS: this is a public demo API; tighten origins for production use.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
+
45
+ # ━━━━━━━━━━━━━━━━━━━━━━ Pydantic Models ━━━━━━━━━━━━━━━━━━━━━━━
46
+
47
+
48
class FunctionDef(BaseModel):
    """Description of one callable tool, as advertised to the model."""

    name: str
    description: Optional[str] = ""
    parameters: Optional[dict] = None  # JSON schema of the arguments, if any
52
+
53
+
54
class ToolDef(BaseModel):
    """OpenAI-style tool wrapper; only type="function" is meaningful here."""

    type: str = "function"
    function: FunctionDef
57
+
58
+
59
class FunctionCallModel(BaseModel):
    """A concrete function invocation emitted by the model."""

    name: str
    arguments: str  # JSON-encoded argument object, per the OpenAI schema
62
+
63
+
64
class ToolCallObj(BaseModel):
    """One tool call attached to an assistant message."""

    id: str
    type: str = "function"
    function: FunctionCallModel
68
+
69
+
70
class ChatMessage(BaseModel):
    """One chat turn (role: system / user / assistant / tool)."""

    role: str
    content: Optional[str] = None  # may be None for pure tool-call turns
    tool_calls: Optional[list[ToolCallObj]] = None  # assistant-issued calls
    tool_call_id: Optional[str] = None  # accepted; not read when building the prompt
    name: Optional[str] = None  # accepted; not read when building the prompt
76
+
77
+
78
class ChatCompletionRequest(BaseModel):
    """Request body for POST /v1/chat/completions (OpenAI-compatible)."""

    model: str = MODEL_NAME
    messages: list[ChatMessage]
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.9
    max_tokens: Optional[int] = 1024
    stream: Optional[bool] = False
    # stop / frequency_penalty / presence_penalty / n are accepted for client
    # compatibility but are not applied by the generation code.
    stop: Optional[Union[str, list[str]]] = None
    frequency_penalty: Optional[float] = 0.0
    presence_penalty: Optional[float] = 0.0
    repetition_penalty: Optional[float] = 1.0  # applied only when > 1.0
    n: Optional[int] = 1
    tools: Optional[list[ToolDef]] = None  # OpenAI "tools" (function defs)
    tool_choice: Optional[Union[str, dict]] = None  # accepted; currently unused
92
+
93
+
94
class CompletionRequest(BaseModel):
    """Request body for POST /v1/completions (legacy text completion)."""

    model: str = MODEL_NAME
    prompt: Union[str, list[str]] = ""
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.9
    max_tokens: Optional[int] = 512
    stream: Optional[bool] = False  # accepted; this endpoint never streams
    # Accepted for client compatibility; not applied (except repetition_penalty).
    stop: Optional[Union[str, list[str]]] = None
    frequency_penalty: Optional[float] = 0.0
    presence_penalty: Optional[float] = 0.0
    repetition_penalty: Optional[float] = 1.0  # applied only when > 1.0
    n: Optional[int] = 1
106
+
107
+
108
+ # ━━━━━━━━━━━━━━━━━━━ Model Loading (CPU) ━━━━━━━━━━━━━━━━━━━━━━
109
+
110
+ tokenizer = None
111
+ model = None
112
+ generate_lock = Lock()
113
+
114
+
115
+ def load_model():
116
+ global tokenizer, model
117
+ if model is not None:
118
+ return
119
+
120
+ print(f"\nπŸš€ Loading model: {MODEL_NAME} on CPU ...")
121
+ print(f" HF_HOME = {os.environ.get('HF_HOME', 'default')}\n")
122
+
123
+ tokenizer = AutoTokenizer.from_pretrained(
124
+ MODEL_NAME,
125
+ use_fast=True,
126
+ )
127
+
128
+ model = AutoModelForCausalLM.from_pretrained(
129
+ MODEL_NAME,
130
+ torch_dtype=torch.float32,
131
+ device_map="cpu",
132
+ trust_remote_code=True,
133
+ )
134
+ model.eval()
135
+
136
+ print("βœ… Model loaded on CPU!\n")
137
+
138
+
139
+ # ━━━━━━━━━━━━━━━━━━━━ Tool-Prompt Builder (Hermes) ━━━━━━━━━━━━
140
+
141
+ TOOL_SYSTEM_PROMPT_TEMPLATE = """\
142
+ You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
143
+
144
+ # Tools
145
+
146
+ You may call one or more functions to assist with the user query.
147
+
148
+ You are provided with function signatures within <tools></tools> XML tags:
149
+ <tools>
150
+ {tool_definitions}
151
+ </tools>
152
+
153
+ For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
154
+ <tool_call>
155
+ {{"name": "<function-name>", "arguments": <args-json-object>}}
156
+ </tool_call>"""
157
+
158
+ NO_TOOL_SYSTEM_PROMPT = (
159
+ "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."
160
+ )
161
+
162
+
163
def _serialize_tool_definitions(tools: list[ToolDef]) -> str:
    """Render tool definitions as newline-separated JSON objects for the prompt."""
    rendered: list[str] = []
    for tool in tools:
        fn = tool.function
        spec: dict[str, Any] = {
            "type": "function",
            "function": {
                "name": fn.name,
                "description": fn.description or "",
            },
        }
        # Only include a parameters schema when one was actually provided.
        if fn.parameters:
            spec["function"]["parameters"] = fn.parameters
        rendered.append(json.dumps(spec))
    return "\n".join(rendered)
177
+
178
+
179
def build_chat_prompt(
    messages: list[ChatMessage],
    tools: Optional[list[ToolDef]] = None,
    tool_choice: Optional[Union[str, dict]] = None,
) -> str:
    """Render messages into a ChatML prompt string for Qwen.

    Tool definitions, when present, are merged into the system prompt.
    NOTE(review): tool_choice is accepted but never read — forcing a
    specific tool is not implemented.
    """
    parts: list[str] = []
    has_system = any(m.role == "system" for m in messages)

    # Default system prompt, with or without the tool block.
    if tools:
        default_sys = TOOL_SYSTEM_PROMPT_TEMPLATE.format(
            tool_definitions=_serialize_tool_definitions(tools),
        )
    else:
        default_sys = NO_TOOL_SYSTEM_PROMPT

    if not has_system:
        parts.append(f"<|im_start|>system\n{default_sys}<|im_end|>\n")

    for msg in messages:
        role = msg.role

        if role == "system":
            # Merge the caller's system text with the tool block so tool
            # definitions survive a user-supplied system prompt.
            base = msg.content or ""
            if tools:
                tool_block = TOOL_SYSTEM_PROMPT_TEMPLATE.format(
                    tool_definitions=_serialize_tool_definitions(tools),
                )
                merged = f"{base}\n\n{tool_block}" if base else tool_block
                parts.append(f"<|im_start|>system\n{merged}<|im_end|>\n")
            else:
                parts.append(
                    f"<|im_start|>system\n{base or NO_TOOL_SYSTEM_PROMPT}<|im_end|>\n"
                )

        elif role == "user":
            parts.append(f"<|im_start|>user\n{msg.content or ''}<|im_end|>\n")

        elif role == "assistant":
            if msg.tool_calls:
                # Re-emit prior tool calls in the same <tool_call> format
                # the model originally produced them in.
                tc_text = ""
                for tc in msg.tool_calls:
                    args = tc.function.arguments
                    if isinstance(args, dict):
                        args = json.dumps(args)
                    tc_text += (
                        f"\n<tool_call>\n"
                        f'{{"name": "{tc.function.name}", "arguments": {args}}}\n'
                        f"</tool_call>"
                    )
                parts.append(f"<|im_start|>assistant{tc_text}<|im_end|>\n")
            else:
                parts.append(
                    f"<|im_start|>assistant\n{msg.content or ''}<|im_end|>\n"
                )

        elif role == "tool":
            # Tool results are fed back as a user turn wrapped in
            # <tool_response> tags.
            parts.append(
                f"<|im_start|>user\n"
                f"<tool_response>\n{msg.content or ''}\n</tool_response>"
                f"<|im_end|>\n"
            )

    # Open the assistant turn for generation.
    parts.append("<|im_start|>assistant\n")
    return "".join(parts)
243
+
244
+
245
+ # ━━━━━━━━━━━━━━━━━━ Tool-Call Parser ━━━━━━━━━━━━━━━━━━━━━━━━━━
246
+
247
+ _TOOL_CALL_RE = re.compile(
248
+ r"<tool_call>\s*(\{.*?\})\s*</tool_call>",
249
+ re.DOTALL,
250
+ )
251
+
252
+
253
+ def parse_tool_calls(text: str) -> tuple[Optional[str], list[dict]]:
254
+ tool_calls: list[dict] = []
255
+
256
+ for raw_json in _TOOL_CALL_RE.findall(text):
257
+ try:
258
+ parsed = json.loads(raw_json)
259
+ except json.JSONDecodeError:
260
+ continue
261
+
262
+ name = parsed.get("name", "")
263
+ arguments = parsed.get("arguments", {})
264
+ if isinstance(arguments, dict):
265
+ arguments = json.dumps(arguments)
266
+ elif not isinstance(arguments, str):
267
+ arguments = json.dumps(arguments)
268
+
269
+ tool_calls.append({
270
+ "id": f"call_{uuid.uuid4().hex[:24]}",
271
+ "type": "function",
272
+ "function": {
273
+ "name": name,
274
+ "arguments": arguments,
275
+ },
276
+ })
277
+
278
+ content = _TOOL_CALL_RE.sub("", text).strip() or None
279
+ return content, tool_calls
280
+
281
+
282
+ # ━━━━━━━━━━━━━━━━━━ Generation ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
283
+
284
def generate_text(prompt: str, req) -> tuple[str, int, int]:
    """Generate text on CPU. Returns (text, prompt_tokens, completion_tokens)."""
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"]
    prompt_tokens = input_ids.shape[1]

    max_new = req.max_tokens or MAX_NEW_TOKENS

    # Build generation kwargs
    gen_kwargs = {
        "input_ids": input_ids,
        "attention_mask": inputs.get("attention_mask"),
        "max_new_tokens": max_new,
        "do_sample": True,
        # Clamped away from zero since sampling is always on.
        # NOTE(review): req.temperature may be None per the request models —
        # an explicit null from a client would make max() raise TypeError.
        "temperature": max(req.temperature, 0.01),
        "top_p": req.top_p,
        # Stop at the ChatML end-of-turn token rather than the default EOS.
        "eos_token_id": tokenizer.convert_tokens_to_ids("<|im_end|>"),
        "pad_token_id": tokenizer.eos_token_id,
    }

    # getattr: CompletionRequest and ChatCompletionRequest both carry this,
    # but defend against other request shapes.
    rep_penalty = getattr(req, "repetition_penalty", 1.0)
    if rep_penalty and rep_penalty > 1.0:
        gen_kwargs["repetition_penalty"] = rep_penalty

    # One generation at a time: model is a shared module-level singleton.
    with generate_lock:
        with torch.no_grad():
            output_ids = model.generate(**gen_kwargs)

    # Slice off the prompt tokens
    new_ids = output_ids[0][prompt_tokens:]
    text = tokenizer.decode(new_ids, skip_special_tokens=False)

    # Clean trailing special tokens
    for tok in ["<|im_end|>", "<|endoftext|>"]:
        text = text.replace(tok, "")

    completion_tokens = len(new_ids)
    return text.strip(), prompt_tokens, completion_tokens
322
+
323
+
324
def generate_text_stream(prompt: str, req):
    """Generator yielding decoded text pieces as the model produces them."""
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"]

    max_new = req.max_tokens or MAX_NEW_TOKENS

    # skip_special_tokens=False so we can detect <|im_end|> ourselves below.
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=False
    )

    gen_kwargs = {
        "input_ids": input_ids,
        "attention_mask": inputs.get("attention_mask"),
        "max_new_tokens": max_new,
        "do_sample": True,
        # Clamped away from zero; same handling as generate_text().
        "temperature": max(req.temperature, 0.01),
        "top_p": req.top_p,
        "eos_token_id": tokenizer.convert_tokens_to_ids("<|im_end|>"),
        "pad_token_id": tokenizer.eos_token_id,
        "streamer": streamer,
    }

    rep_penalty = getattr(req, "repetition_penalty", 1.0)
    if rep_penalty and rep_penalty > 1.0:
        gen_kwargs["repetition_penalty"] = rep_penalty

    # generate() blocks, so it runs in a worker thread while this generator
    # consumes the streamer.
    thread = Thread(target=_generate_in_thread, args=(gen_kwargs,))
    thread.start()

    for token_text in streamer:
        # Stop on special tokens
        if "<|im_end|>" in token_text or "<|endoftext|>" in token_text:
            cleaned = token_text.replace("<|im_end|>", "").replace("<|endoftext|>", "")
            if cleaned:
                yield cleaned
            break
        yield token_text

    thread.join()
364
+
365
+
366
def _generate_in_thread(gen_kwargs):
    """Run one blocking generate() call; target for the streaming worker thread."""
    with generate_lock, torch.no_grad():
        model.generate(**gen_kwargs)
370
+
371
+
372
+ # ━━━━━━━━━━━━━━━━━━ Response Builders ━━━━━━━━━━━━━━━━━━━━━━━━━
373
+
374
def _uid(prefix: str = "chatcmpl") -> str:
    """Return a short random identifier such as ``chatcmpl-3f2a1b9c0d4e``."""
    suffix = uuid.uuid4().hex[:12]
    return f"{prefix}-{suffix}"
376
+
377
+
378
def make_chat_response(
    content: Optional[str],
    tool_calls: list[dict],
    model_name: str,
    prompt_tokens: int,
    completion_tokens: int,
) -> dict:
    """Build a non-streaming OpenAI ``chat.completion`` response body."""
    if tool_calls:
        # Tool-call turns keep content as-is (possibly None), per the schema.
        message: dict[str, Any] = {
            "role": "assistant",
            "content": content,
            "tool_calls": tool_calls,
        }
        finish_reason = "tool_calls"
    else:
        message = {"role": "assistant", "content": (content or "").strip()}
        finish_reason = "stop"

    usage = {
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "total_tokens": prompt_tokens + completion_tokens,
    }
    return {
        "id": f"chatcmpl-{uuid.uuid4().hex[:12]}",  # inlined _uid()
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model_name,
        "choices": [
            {"index": 0, "message": message, "finish_reason": finish_reason}
        ],
        "usage": usage,
    }
411
+
412
+
413
def make_completion_response(
    text: str, model_name: str, prompt_tokens: int, completion_tokens: int
) -> dict:
    """Build a non-streaming OpenAI ``text_completion`` response body."""
    choice = {"index": 0, "text": text.strip(), "finish_reason": "stop"}
    total = prompt_tokens + completion_tokens
    return {
        "id": f"cmpl-{uuid.uuid4().hex[:12]}",  # inlined _uid("cmpl")
        "object": "text_completion",
        "created": int(time.time()),
        "model": model_name,
        "choices": [choice],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": total,
        },
    }
428
+
429
+
430
+ # ━━━━━━━━━━━━━━━━━━ Streaming Helpers ━━━━━━━━━━━━━━━━━━━━━━━━
431
+
432
def stream_chat_response(prompt: str, req):
    """SSE streaming for non-tool-call chat completions."""
    cid = _uid()
    created = int(time.time())

    def _chunk(delta: dict, finish: Optional[str] = None) -> str:
        # One SSE event in OpenAI chat.completion.chunk format.
        return "data: " + json.dumps({
            "id": cid,
            "object": "chat.completion.chunk",
            "created": created,
            "model": req.model,
            "choices": [{"index": 0, "delta": delta, "finish_reason": finish}],
        }) + "\n\n"

    # Opening chunk carries only the role, as OpenAI clients expect.
    yield _chunk({"role": "assistant"})

    for token_text in generate_text_stream(prompt, req):
        if token_text:
            yield _chunk({"content": token_text})

    yield _chunk({}, finish="stop")
    yield "data: [DONE]\n\n"
454
+
455
+
456
def stream_tool_call_chunks(
    content: Optional[str],
    tool_calls: list[dict],
    model_name: str,
):
    """SSE stream for a response whose tool calls were parsed after generation.

    The model output was already generated in full; this just replays it in
    OpenAI chat.completion.chunk format.
    """
    chunk_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"  # inlined _uid()
    started = int(time.time())

    def sse(delta: dict, finish: Optional[str] = None) -> str:
        payload = {
            "id": chunk_id,
            "object": "chat.completion.chunk",
            "created": started,
            "model": model_name,
            "choices": [{"index": 0, "delta": delta, "finish_reason": finish}],
        }
        return f"data: {json.dumps(payload)}\n\n"

    # Role-only opening chunk.
    yield sse({"role": "assistant"})

    for idx, call in enumerate(tool_calls):
        # First chunk announces the call (id + name, empty arguments) ...
        header = {
            "index": idx,
            "id": call["id"],
            "type": "function",
            "function": {"name": call["function"]["name"], "arguments": ""},
        }
        yield sse({"tool_calls": [header]})
        # ... then a second chunk delivers the full argument string.
        yield sse({
            "tool_calls": [
                {"index": idx, "function": {"arguments": call["function"]["arguments"]}}
            ]
        })

    if content:
        yield sse({"content": content})

    yield sse({}, finish="tool_calls" if tool_calls else "stop")
    yield "data: [DONE]\n\n"
497
+
498
+
499
+ # ━━━━━━━━━━━━━━━━━━━━━━ ROUTES ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
500
+
501
@app.get("/")
async def root():
    """Service banner with pointers to the available endpoints."""
    # NOTE(review): the banner still says "Qwen3.5-0.8B" while MODEL_NAME is
    # Qwen2.5-0.5B-Instruct — the string is informational only.
    return {
        "message": "Qwen3.5-0.8B OpenAI-Compatible API (CPU) with Tool Calling",
        "docs": "/docs",
        "endpoints": {
            "models": "/v1/models",
            "chat": "/v1/chat/completions",
            "completions": "/v1/completions",
            "health": "/health",
        },
    }
513
+
514
+
515
@app.get("/v1/models")
async def list_models():
    """OpenAI-compatible model listing (exactly one local model)."""
    return {
        "object": "list",
        "data": [{
            "id": MODEL_NAME,
            "object": "model",
            "created": int(time.time()),
            "owned_by": "local",
        }],
    }
526
+
527
+
528
@app.post("/v1/chat/completions")
async def chat_completions(req: ChatCompletionRequest):
    """OpenAI-compatible chat endpoint: tool calling, streaming, plain chat."""
    try:
        prompt = build_chat_prompt(req.messages, req.tools, req.tool_choice)

        # ── Tool-calling path (generate fully, then parse) ──
        if req.tools:
            text, prompt_tokens, completion_tokens = generate_text(prompt, req)
            content, tool_calls = parse_tool_calls(text)

            if req.stream:
                # With tools, generation already completed above; the stream
                # just replays the parsed result as chunks.
                return StreamingResponse(
                    stream_tool_call_chunks(content, tool_calls, req.model),
                    media_type="text/event-stream",
                )
            return JSONResponse(
                make_chat_response(
                    content, tool_calls, req.model, prompt_tokens, completion_tokens
                )
            )

        # ── Normal chat (supports true token-by-token streaming) ──
        if req.stream:
            return StreamingResponse(
                stream_chat_response(prompt, req),
                media_type="text/event-stream",
            )

        text, prompt_tokens, completion_tokens = generate_text(prompt, req)
        return JSONResponse(
            make_chat_response(text, [], req.model, prompt_tokens, completion_tokens)
        )

    except Exception as e:
        # Surface any failure as a 500 with the error text in `detail`.
        raise HTTPException(status_code=500, detail=str(e))
563
+
564
+
565
@app.post("/v1/completions")
async def completions(req: CompletionRequest):
    """Legacy text-completion endpoint (non-streaming only)."""
    try:
        # NOTE(review): only the FIRST prompt of a list is generated; extra
        # prompts are silently ignored, and an empty list raises IndexError
        # (returned as a 500 below).
        prompts = [req.prompt] if isinstance(req.prompt, str) else req.prompt
        prompt = prompts[0]
        text, prompt_tokens, completion_tokens = generate_text(prompt, req)

        return JSONResponse(
            make_completion_response(text, req.model, prompt_tokens, completion_tokens)
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
577
+
578
+
579
@app.get("/health")
async def health():
    """Liveness probe for the Space."""
    return {"status": "ok", "model": MODEL_NAME, "device": "cpu"}
582
+
583
+
584
+ # ━━━━━━━━━━━━━━━━━━━━━━ MAIN ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
585
+
586
+ if __name__ == "__main__":
587
+ load_model()
588
+
589
+ print(f"\n{'='*60}")
590
+ print(f" OpenAI-compatible API with TOOL CALLING (CPU)")
591
+ print(f" Model: {MODEL_NAME}")
592
+ print(f" Device: CPU")
593
+ print(f" URL: http://{HOST}:{PORT}/v1")
594
+ print(f"{'='*60}\n")
595
+
596
+ uvicorn.run(app, host=HOST, port=PORT, log_level="info")