MuhammadNoman7600 commited on
Commit
304a174
·
verified ·
1 Parent(s): e832067

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -596
app.py CHANGED
@@ -1,596 +1,37 @@
1
- """
2
- =============================================================================
3
- Transformers + FastAPI β€” OpenAI-Compatible Server for Qwen/Qwen3.5-0.8B
4
- CPU-ONLY β€’ TOOL CALLING β€’ STREAMING β€’ Port 7860 (HF Spaces)
5
- =============================================================================
6
- """
7
-
8
- import json
9
- import os
10
- import re
11
- import time
12
- import uuid
13
- from threading import Lock
14
- from typing import Any, Optional, Union
15
-
16
- import torch
17
- import uvicorn
18
- from fastapi import FastAPI, HTTPException
19
- from fastapi.middleware.cors import CORSMiddleware
20
- from fastapi.responses import JSONResponse, StreamingResponse
21
- from pydantic import BaseModel
22
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
23
- from threading import Thread
24
-
25
# ━━━━━━━━━━━━━━━━━━━━━━━━━━ CONFIG ━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# NOTE(review): confirm "Qwen/Qwen3.5-0.8B" is a published Hub repo id.
MODEL_NAME = "Qwen/Qwen3.5-0.8B"
HOST = "0.0.0.0"  # bind all interfaces (needed inside the Space container)
PORT = 7860       # the port HF Spaces expects the app to listen on
MAX_NEW_TOKENS = 1024  # fallback generation budget when a request omits max_tokens
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
31
-
32
# FastAPI application with fully permissive CORS so browser clients on any
# origin can call the API directly.
app = FastAPI(
    title="Qwen3.5-0.8B OpenAI-Compatible API (CPU)",
    description="Transformers-powered inference with tool calling β€” runs on CPU",
    version="2.0.0",
)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # NOTE(review): wide-open CORS — acceptable for a demo Space
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
44
-
45
- # ━━━━━━━━━━━━━━━━━━━━━━ Pydantic Models ━━━━━━━━━━━━━━━━━━━━━━━
46
-
47
-
48
class FunctionDef(BaseModel):
    """OpenAI-style function definition carried inside a tool entry."""
    name: str
    description: Optional[str] = ""
    parameters: Optional[dict] = None  # JSON-Schema parameters object, when provided
52
-
53
-
54
class ToolDef(BaseModel):
    """One entry of the request's `tools` array (only "function" is used)."""
    type: str = "function"
    function: FunctionDef
57
-
58
-
59
class FunctionCallModel(BaseModel):
    """Name/arguments pair inside an assistant tool call."""
    name: str
    arguments: str  # JSON-encoded argument string, per the OpenAI wire format
62
-
63
-
64
class ToolCallObj(BaseModel):
    """A single assistant tool call, as echoed back in the chat history."""
    id: str
    type: str = "function"
    function: FunctionCallModel
68
-
69
-
70
class ChatMessage(BaseModel):
    """One conversation turn; `role` is system/user/assistant/tool."""
    role: str
    content: Optional[str] = None
    tool_calls: Optional[list[ToolCallObj]] = None  # present on assistant tool-call turns
    tool_call_id: Optional[str] = None  # present on tool-result turns
    name: Optional[str] = None
76
-
77
-
78
class ChatCompletionRequest(BaseModel):
    """Request body for /v1/chat/completions (OpenAI-compatible subset)."""
    model: str = MODEL_NAME
    messages: list[ChatMessage]
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.9
    max_tokens: Optional[int] = 1024
    stream: Optional[bool] = False
    stop: Optional[Union[str, list[str]]] = None  # accepted but not applied by generate_text
    frequency_penalty: Optional[float] = 0.0      # accepted but unused
    presence_penalty: Optional[float] = 0.0       # accepted but unused
    repetition_penalty: Optional[float] = 1.0     # forwarded only when > 1.0
    n: Optional[int] = 1                          # accepted; only one choice is ever produced
    tools: Optional[list[ToolDef]] = None
    tool_choice: Optional[Union[str, dict]] = None  # accepted but not enforced in build_chat_prompt
92
-
93
-
94
class CompletionRequest(BaseModel):
    """Request body for the legacy /v1/completions endpoint."""
    model: str = MODEL_NAME
    prompt: Union[str, list[str]] = ""  # only the first prompt of a list is served
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.9
    max_tokens: Optional[int] = 512
    stream: Optional[bool] = False      # accepted but ignored by the /v1/completions route
    stop: Optional[Union[str, list[str]]] = None  # accepted but unused
    frequency_penalty: Optional[float] = 0.0      # accepted but unused
    presence_penalty: Optional[float] = 0.0       # accepted but unused
    repetition_penalty: Optional[float] = 1.0     # forwarded only when > 1.0
    n: Optional[int] = 1                          # accepted; single choice only
106
-
107
-
108
# ━━━━━━━━━━━━━━━━━━━ Model Loading (CPU) ━━━━━━━━━━━━━━━━━━━━━━

# Populated lazily by load_model(). generate_lock serializes access to the
# single model instance so concurrent requests cannot interleave generation.
tokenizer = None
model = None
generate_lock = Lock()
113
-
114
-
115
def load_model():
    """Lazily load tokenizer and model into module globals (idempotent)."""
    global tokenizer, model
    if model is not None:
        return  # already loaded — nothing to do

    print(f"\nπŸš€ Loading model: {MODEL_NAME} on CPU ...")
    print(f" HF_HOME = {os.environ.get('HF_HOME', 'default')}\n")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float32,  # full precision; fp16 buys nothing on CPU
        device_map="cpu",
        trust_remote_code=True,
    )
    model.eval()

    print("βœ… Model loaded on CPU!\n")
137
-
138
-
139
# ━━━━━━━━━━━━━━━━━━━━ Tool-Prompt Builder (Hermes) ━━━━━━━━━━━━

# Hermes-style system prompt: tool signatures are listed inside <tools> tags
# and the model is told to emit calls inside <tool_call> tags.
# {tool_definitions} is filled via str.format() (see _serialize_tool_definitions);
# the doubled braces in the example escape format()'s placeholder syntax.
TOOL_SYSTEM_PROMPT_TEMPLATE = """\
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.

# Tools

You may call one or more functions to assist with the user query.

You are provided with function signatures within <tools></tools> XML tags:
<tools>
{tool_definitions}
</tools>

For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
<tool_call>
{{"name": "<function-name>", "arguments": <args-json-object>}}
</tool_call>"""

# System prompt used when the request declares no tools.
NO_TOOL_SYSTEM_PROMPT = (
    "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."
)
161
-
162
-
163
def _serialize_tool_definitions(tools: list[ToolDef]) -> str:
    """Render each tool as a one-line JSON object; join the lines with newlines."""
    serialized: list[str] = []
    for tool in tools:
        fn = tool.function
        entry: dict[str, Any] = {
            "type": "function",
            "function": {
                "name": fn.name,
                "description": fn.description or "",
            },
        }
        # Only emit "parameters" when the tool actually declares a schema.
        if fn.parameters:
            entry["function"]["parameters"] = fn.parameters
        serialized.append(json.dumps(entry))
    return "\n".join(serialized)
177
-
178
-
179
def build_chat_prompt(
    messages: list[ChatMessage],
    tools: Optional[list[ToolDef]] = None,
    tool_choice: Optional[Union[str, dict]] = None,
) -> str:
    """Render the conversation into a Qwen ChatML prompt string.

    Tool definitions, when present, are injected into the system prompt in
    Hermes format.  `tool_choice` is accepted for API compatibility but is
    not currently acted upon.
    """
    parts: list[str] = []
    has_system = any(m.role == "system" for m in messages)

    # System prompt to use if the conversation supplies none of its own.
    if tools:
        default_sys = TOOL_SYSTEM_PROMPT_TEMPLATE.format(
            tool_definitions=_serialize_tool_definitions(tools),
        )
    else:
        default_sys = NO_TOOL_SYSTEM_PROMPT

    if not has_system:
        parts.append(f"<|im_start|>system\n{default_sys}<|im_end|>\n")

    for msg in messages:
        role = msg.role

        if role == "system":
            # Merge the caller's system text with the tool block, if any.
            base = msg.content or ""
            if tools:
                tool_block = TOOL_SYSTEM_PROMPT_TEMPLATE.format(
                    tool_definitions=_serialize_tool_definitions(tools),
                )
                merged = f"{base}\n\n{tool_block}" if base else tool_block
                parts.append(f"<|im_start|>system\n{merged}<|im_end|>\n")
            else:
                parts.append(
                    f"<|im_start|>system\n{base or NO_TOOL_SYSTEM_PROMPT}<|im_end|>\n"
                )

        elif role == "user":
            parts.append(f"<|im_start|>user\n{msg.content or ''}<|im_end|>\n")

        elif role == "assistant":
            if msg.tool_calls:
                # Replay prior tool calls inside <tool_call> tags so the model
                # sees its own call history in the format it emits.
                # NOTE(review): any msg.content on a tool-call turn is dropped.
                tc_text = ""
                for tc in msg.tool_calls:
                    args = tc.function.arguments
                    if isinstance(args, dict):
                        args = json.dumps(args)
                    tc_text += (
                        f"\n<tool_call>\n"
                        f'{{"name": "{tc.function.name}", "arguments": {args}}}\n'
                        f"</tool_call>"
                    )
                parts.append(f"<|im_start|>assistant{tc_text}<|im_end|>\n")
            else:
                parts.append(
                    f"<|im_start|>assistant\n{msg.content or ''}<|im_end|>\n"
                )

        elif role == "tool":
            # Tool results go back as a user turn wrapped in <tool_response>
            # tags (Hermes convention).
            parts.append(
                f"<|im_start|>user\n"
                f"<tool_response>\n{msg.content or ''}\n</tool_response>"
                f"<|im_end|>\n"
            )

    # Open the assistant turn for the model to complete.
    parts.append("<|im_start|>assistant\n")
    return "".join(parts)
243
-
244
-
245
# ━━━━━━━━━━━━━━━━━━ Tool-Call Parser ━━━━━━━━━━━━━━━━━━━━━━━━━━

# Matches one Hermes-style call: a JSON object wrapped in <tool_call> tags.
# Non-greedy + DOTALL so multiple calls in one completion are each captured.
_TOOL_CALL_RE = re.compile(
    r"<tool_call>\s*(\{.*?\})\s*</tool_call>",
    re.DOTALL,
)


def parse_tool_calls(text: str) -> tuple[Optional[str], list[dict]]:
    """Split a raw completion into (plain content, OpenAI-style tool calls).

    Returns:
        content: the text with all <tool_call> blocks removed, or None if
            nothing but whitespace remains.
        tool_calls: one dict per parseable call, in OpenAI wire format with
            a fresh "call_..." id.  Blocks with malformed JSON are skipped.
    """
    tool_calls: list[dict] = []

    for raw_json in _TOOL_CALL_RE.findall(text):
        try:
            parsed = json.loads(raw_json)
        except json.JSONDecodeError:
            continue  # model emitted a broken call — skip it rather than fail

        name = parsed.get("name", "")
        arguments = parsed.get("arguments", {})
        # OpenAI expects `arguments` as a JSON-encoded *string*; re-encode
        # anything that is not one already (dict, list, number, null, ...).
        # (The original had two duplicate branches both calling json.dumps.)
        if not isinstance(arguments, str):
            arguments = json.dumps(arguments)

        tool_calls.append({
            "id": f"call_{uuid.uuid4().hex[:24]}",
            "type": "function",
            "function": {
                "name": name,
                "arguments": arguments,
            },
        })

    content = _TOOL_CALL_RE.sub("", text).strip() or None
    return content, tool_calls
280
-
281
-
282
# ━━━━━━━━━━━━━━━━━━ Generation ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

def generate_text(prompt: str, req) -> tuple[str, int, int]:
    """Generate text on CPU. Returns (text, prompt_tokens, completion_tokens).

    `req` is a ChatCompletionRequest or CompletionRequest; only max_tokens,
    temperature, top_p and repetition_penalty are consulted.
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"]
    prompt_tokens = input_ids.shape[1]

    max_new = req.max_tokens or MAX_NEW_TOKENS

    # Clients may send explicit JSON nulls for the Optional sampling fields;
    # fall back to the schema defaults instead of crashing on max(None, 0.01).
    temperature = req.temperature if req.temperature is not None else 0.7
    top_p = req.top_p if req.top_p is not None else 0.9

    gen_kwargs = {
        "input_ids": input_ids,
        "attention_mask": inputs.get("attention_mask"),
        "max_new_tokens": max_new,
        "do_sample": True,
        "temperature": max(temperature, 0.01),  # guard against temperature=0
        "top_p": top_p,
        "eos_token_id": tokenizer.convert_tokens_to_ids("<|im_end|>"),
        "pad_token_id": tokenizer.eos_token_id,
    }

    rep_penalty = getattr(req, "repetition_penalty", 1.0)
    if rep_penalty and rep_penalty > 1.0:
        gen_kwargs["repetition_penalty"] = rep_penalty

    # Single shared model: serialize generations across request threads.
    with generate_lock:
        with torch.no_grad():
            output_ids = model.generate(**gen_kwargs)

    # Keep only the newly generated tokens (drop the echoed prompt).
    new_ids = output_ids[0][prompt_tokens:]
    text = tokenizer.decode(new_ids, skip_special_tokens=False)

    # Strip chat-template terminators the decode left in.
    for tok in ["<|im_end|>", "<|endoftext|>"]:
        text = text.replace(tok, "")

    completion_tokens = len(new_ids)
    return text.strip(), prompt_tokens, completion_tokens
322
-
323
-
324
def generate_text_stream(prompt: str, req):
    """Yield decoded text pieces as generation progresses.

    Runs model.generate on a worker thread (via _generate_in_thread) and
    consumes a TextIteratorStreamer on this one; stops at the first
    chat-template terminator.
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"]

    max_new = req.max_tokens or MAX_NEW_TOKENS

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=False
    )

    # Same null-tolerant defaults as generate_text: clients may send explicit
    # JSON nulls for the Optional sampling fields.
    temperature = req.temperature if req.temperature is not None else 0.7
    top_p = req.top_p if req.top_p is not None else 0.9

    gen_kwargs = {
        "input_ids": input_ids,
        "attention_mask": inputs.get("attention_mask"),
        "max_new_tokens": max_new,
        "do_sample": True,
        "temperature": max(temperature, 0.01),  # guard against temperature=0
        "top_p": top_p,
        "eos_token_id": tokenizer.convert_tokens_to_ids("<|im_end|>"),
        "pad_token_id": tokenizer.eos_token_id,
        "streamer": streamer,
    }

    rep_penalty = getattr(req, "repetition_penalty", 1.0)
    if rep_penalty and rep_penalty > 1.0:
        gen_kwargs["repetition_penalty"] = rep_penalty

    thread = Thread(target=_generate_in_thread, args=(gen_kwargs,))
    thread.start()

    for token_text in streamer:
        # Stop on special tokens, flushing any text that precedes them.
        if "<|im_end|>" in token_text or "<|endoftext|>" in token_text:
            cleaned = token_text.replace("<|im_end|>", "").replace("<|endoftext|>", "")
            if cleaned:
                yield cleaned
            break
        yield token_text

    thread.join()
364
-
365
-
366
def _generate_in_thread(gen_kwargs):
    """Worker-thread entry point: run one generation under the shared lock."""
    with generate_lock, torch.no_grad():
        model.generate(**gen_kwargs)
370
-
371
-
372
# ━━━━━━━━━━━━━━━━━━ Response Builders ━━━━━━━━━━━━━━━━━━━━━━━━━

def _uid(prefix: str = "chatcmpl") -> str:
    """Return an OpenAI-style object id such as 'chatcmpl-a1b2c3d4e5f6'."""
    return f"{prefix}-{uuid.uuid4().hex[:12]}"


def make_chat_response(
    content: Optional[str],
    tool_calls: list[dict],
    model_name: str,
    prompt_tokens: int,
    completion_tokens: int,
) -> dict:
    """Assemble a non-streaming /v1/chat/completions response body."""
    if tool_calls:
        # Tool-call turn: content stays as-is (possibly None), per OpenAI format.
        message: dict[str, Any] = {
            "role": "assistant",
            "content": content,
            "tool_calls": tool_calls,
        }
        finish_reason = "tool_calls"
    else:
        message = {"role": "assistant", "content": (content or "").strip()}
        finish_reason = "stop"

    usage = {
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "total_tokens": prompt_tokens + completion_tokens,
    }
    return {
        "id": _uid(),
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model_name,
        "choices": [{"index": 0, "message": message, "finish_reason": finish_reason}],
        "usage": usage,
    }


def make_completion_response(
    text: str, model_name: str, prompt_tokens: int, completion_tokens: int
) -> dict:
    """Assemble a non-streaming /v1/completions response body."""
    usage = {
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "total_tokens": prompt_tokens + completion_tokens,
    }
    return {
        "id": _uid("cmpl"),
        "object": "text_completion",
        "created": int(time.time()),
        "model": model_name,
        "choices": [{"index": 0, "text": text.strip(), "finish_reason": "stop"}],
        "usage": usage,
    }
428
-
429
-
430
# ━━━━━━━━━━━━━━━━━━ Streaming Helpers ━━━━━━━━━━━━━━━━━━━━━━━━

def stream_chat_response(prompt: str, req):
    """SSE streaming for non-tool-call chat completions."""
    cid = _uid()
    created = int(time.time())

    def _chunk(delta: dict, finish: Optional[str] = None) -> str:
        payload = {
            "id": cid,
            "object": "chat.completion.chunk",
            "created": created,
            "model": req.model,
            "choices": [{"index": 0, "delta": delta, "finish_reason": finish}],
        }
        return "data: " + json.dumps(payload) + "\n\n"

    # OpenAI stream shape: role-only delta first, then content deltas, then
    # an empty delta carrying finish_reason, then the [DONE] sentinel.
    yield _chunk({"role": "assistant"})

    for piece in generate_text_stream(prompt, req):
        if piece:
            yield _chunk({"content": piece})

    yield _chunk({}, finish="stop")
    yield "data: [DONE]\n\n"
454
-
455
-
456
def stream_tool_call_chunks(
    content: Optional[str],
    tool_calls: list[dict],
    model_name: str,
):
    """SSE streaming for tool-call responses (post-generation)."""
    cid = _uid()
    created = int(time.time())

    def _chunk(delta: dict, finish: Optional[str] = None) -> str:
        payload = {
            "id": cid,
            "object": "chat.completion.chunk",
            "created": created,
            "model": model_name,
            "choices": [{"index": 0, "delta": delta, "finish_reason": finish}],
        }
        return "data: " + json.dumps(payload) + "\n\n"

    yield _chunk({"role": "assistant"})

    # Two chunks per call, mirroring OpenAI: first the id/name with empty
    # arguments, then the full arguments string.
    for idx, call in enumerate(tool_calls):
        header = {
            "index": idx,
            "id": call["id"],
            "type": "function",
            "function": {"name": call["function"]["name"], "arguments": ""},
        }
        yield _chunk({"tool_calls": [header]})
        args_delta = {
            "index": idx,
            "function": {"arguments": call["function"]["arguments"]},
        }
        yield _chunk({"tool_calls": [args_delta]})

    if content:
        yield _chunk({"content": content})

    yield _chunk({}, finish="tool_calls" if tool_calls else "stop")
    yield "data: [DONE]\n\n"
497
-
498
-
499
# ━━━━━━━━━━━━━━━━━━━━━━ ROUTES ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

@app.get("/")
async def root():
    """Landing payload: service banner plus a map of available endpoints."""
    endpoints = {
        "models": "/v1/models",
        "chat": "/v1/chat/completions",
        "completions": "/v1/completions",
        "health": "/health",
    }
    return {
        "message": "Qwen3.5-0.8B OpenAI-Compatible API (CPU) with Tool Calling",
        "docs": "/docs",
        "endpoints": endpoints,
    }
513
-
514
-
515
- @app.get("/v1/models")
516
- async def list_models():
517
- return {
518
- "object": "list",
519
- "data": [{
520
- "id": MODEL_NAME,
521
- "object": "model",
522
- "created": int(time.time()),
523
- "owned_by": "local",
524
- }],
525
- }
526
-
527
-
528
- @app.post("/v1/chat/completions")
529
- async def chat_completions(req: ChatCompletionRequest):
530
- try:
531
- prompt = build_chat_prompt(req.messages, req.tools, req.tool_choice)
532
-
533
- # ── Tool-calling path (generate fully, then parse) ──
534
- if req.tools:
535
- text, prompt_tokens, completion_tokens = generate_text(prompt, req)
536
- content, tool_calls = parse_tool_calls(text)
537
-
538
- if req.stream:
539
- return StreamingResponse(
540
- stream_tool_call_chunks(content, tool_calls, req.model),
541
- media_type="text/event-stream",
542
- )
543
- return JSONResponse(
544
- make_chat_response(
545
- content, tool_calls, req.model, prompt_tokens, completion_tokens
546
- )
547
- )
548
-
549
- # ── Normal chat (supports true token-by-token streaming) ──
550
- if req.stream:
551
- return StreamingResponse(
552
- stream_chat_response(prompt, req),
553
- media_type="text/event-stream",
554
- )
555
-
556
- text, prompt_tokens, completion_tokens = generate_text(prompt, req)
557
- return JSONResponse(
558
- make_chat_response(text, [], req.model, prompt_tokens, completion_tokens)
559
- )
560
-
561
- except Exception as e:
562
- raise HTTPException(status_code=500, detail=str(e))
563
-
564
-
565
- @app.post("/v1/completions")
566
- async def completions(req: CompletionRequest):
567
- try:
568
- prompts = [req.prompt] if isinstance(req.prompt, str) else req.prompt
569
- prompt = prompts[0]
570
- text, prompt_tokens, completion_tokens = generate_text(prompt, req)
571
-
572
- return JSONResponse(
573
- make_completion_response(text, req.model, prompt_tokens, completion_tokens)
574
- )
575
- except Exception as e:
576
- raise HTTPException(status_code=500, detail=str(e))
577
-
578
-
579
- @app.get("/health")
580
- async def health():
581
- return {"status": "ok", "model": MODEL_NAME, "device": "cpu"}
582
-
583
-
584
# ━━━━━━━━━━━━━━━━━━━━━━ MAIN ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

if __name__ == "__main__":
    # Load the model up front so the first request isn't hit with the cost.
    load_model()

    banner = "=" * 60
    print(f"\n{banner}")
    print(f" OpenAI-compatible API with TOOL CALLING (CPU)")
    print(f" Model: {MODEL_NAME}")
    print(f" Device: CPU")
    print(f" URL: http://{HOST}:{PORT}/v1")
    print(f"{banner}\n")

    uvicorn.run(app, host=HOST, port=PORT, log_level="info")
 
1
# ============================================================
# Dockerfile β€” Qwen3.5-0.8B CPU-Only API for HF Spaces
# No GPU required. Port 7860.
# ============================================================

FROM python:3.11-slim

# ── System deps ──
# git is needed by huggingface_hub / transformers for some repo operations.
RUN apt-get update && \
    apt-get install -y --no-install-recommends git && \
    rm -rf /var/lib/apt/lists/*

# ── Python deps (CPU-only torch β€” no CUDA bloat) ──
# Installed in a separate layer so the large torch wheel is cached
# independently of the lighter packages below.
RUN pip install --no-cache-dir \
    torch --index-url https://download.pytorch.org/whl/cpu

RUN pip install --no-cache-dir \
    transformers \
    accelerate \
    fastapi \
    uvicorn \
    pydantic \
    huggingface_hub

# ── Pre-download model at build time (~1.8 GB baked into image) ──
# HF_HOME lives under /tmp — presumably so the runtime user can write to the
# cache directory as well; verify against the Space's runtime user.
ENV HF_HOME=/tmp/hf_cache
# NOTE(review): confirm "Qwen/Qwen3.5-0.8B" is a real Hub repo id — the
# build fails at this step if the snapshot cannot be resolved.
RUN python3 -c "\
    from huggingface_hub import snapshot_download; \
    snapshot_download('Qwen/Qwen3.5-0.8B', cache_dir='/tmp/hf_cache')"

# ── Copy app ──
WORKDIR /app
COPY app.py .

EXPOSE 7860

CMD ["python3", "app.py"]