MuhammadNoman7600 committed on
Commit
9b8c94c
·
verified ·
1 Parent(s): 39d2798

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -36
app.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
  =============================================================================
3
- Transformers + FastAPI β€” OpenAI-Compatible Server for Qwen/Qwen3.5-0.8B
4
  CPU-ONLY β€’ TOOL CALLING β€’ STREAMING β€’ Port 7860 (HF Spaces)
5
  =============================================================================
6
  """
@@ -10,7 +10,7 @@ import os
10
  import re
11
  import time
12
  import uuid
13
- from threading import Lock
14
  from typing import Any, Optional, Union
15
 
16
  import torch
@@ -20,17 +20,16 @@ from fastapi.middleware.cors import CORSMiddleware
20
  from fastapi.responses import JSONResponse, StreamingResponse
21
  from pydantic import BaseModel
22
  from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
23
- from threading import Thread
24
 
25
  # ━━━━━━━━━━━━━━━━━━━━━━━━━━ CONFIG ━━━━━━━━━━━━━━━━━━━━━━━━━━━━
26
- MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
27
  HOST = "0.0.0.0"
28
  PORT = 7860
29
  MAX_NEW_TOKENS = 1024
30
  # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
31
 
32
  app = FastAPI(
33
- title="Qwen3.5-0.8B OpenAI-Compatible API (CPU)",
34
  description="Transformers-powered inference with tool calling β€” runs on CPU",
35
  version="2.0.0",
36
  )
@@ -42,8 +41,8 @@ app.add_middleware(
42
  allow_headers=["*"],
43
  )
44
 
45
- # ━━━━━━━━━━━━━━━━━━━━━━ Pydantic Models ━━━━━━━━━━━━━━━━━━━━━━━
46
 
 
47
 
48
  class FunctionDef(BaseModel):
49
  name: str
@@ -85,7 +84,7 @@ class ChatCompletionRequest(BaseModel):
85
  stop: Optional[Union[str, list[str]]] = None
86
  frequency_penalty: Optional[float] = 0.0
87
  presence_penalty: Optional[float] = 0.0
88
- repetition_penalty: Optional[float] = 1.0
89
  n: Optional[int] = 1
90
  tools: Optional[list[ToolDef]] = None
91
  tool_choice: Optional[Union[str, dict]] = None
@@ -101,7 +100,7 @@ class CompletionRequest(BaseModel):
101
  stop: Optional[Union[str, list[str]]] = None
102
  frequency_penalty: Optional[float] = 0.0
103
  presence_penalty: Optional[float] = 0.0
104
- repetition_penalty: Optional[float] = 1.0
105
  n: Optional[int] = 1
106
 
107
 
@@ -110,10 +109,12 @@ class CompletionRequest(BaseModel):
110
  tokenizer = None
111
  model = None
112
  generate_lock = Lock()
 
 
113
 
114
 
115
  def load_model():
116
- global tokenizer, model
117
  if model is not None:
118
  return
119
 
@@ -122,8 +123,11 @@ def load_model():
122
 
123
  tokenizer = AutoTokenizer.from_pretrained(
124
  MODEL_NAME,
125
- use_fast=True,
126
  )
 
 
 
127
 
128
  model = AutoModelForCausalLM.from_pretrained(
129
  MODEL_NAME,
@@ -133,13 +137,33 @@ def load_model():
133
  )
134
  model.eval()
135
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  print("βœ… Model loaded on CPU!\n")
137
 
138
 
139
- # ━━━━━━━━━━━━━━━━━━━━ Tool-Prompt Builder (Hermes) ━━━━━━━━━━━━
 
 
 
 
 
 
 
 
140
 
141
  TOOL_SYSTEM_PROMPT_TEMPLATE = """\
142
- You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
143
 
144
  # Tools
145
 
@@ -155,9 +179,7 @@ For each function call, return a json object with function name and arguments wi
155
  {{"name": "<function-name>", "arguments": <args-json-object>}}
156
  </tool_call>"""
157
 
158
- NO_TOOL_SYSTEM_PROMPT = (
159
- "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."
160
- )
161
 
162
 
163
  def _serialize_tool_definitions(tools: list[ToolDef]) -> str:
@@ -281,24 +303,30 @@ def parse_tool_calls(text: str) -> tuple[Optional[str], list[dict]]:
281
 
282
  # ━━━━━━━━━━━━━━━━━━ Generation ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
283
 
 
 
 
 
 
 
 
284
  def generate_text(prompt: str, req) -> tuple[str, int, int]:
285
- """Generate text on CPU. Returns (text, prompt_tokens, completion_tokens)."""
286
  inputs = tokenizer(prompt, return_tensors="pt")
287
  input_ids = inputs["input_ids"]
288
  prompt_tokens = input_ids.shape[1]
289
 
290
  max_new = req.max_tokens or MAX_NEW_TOKENS
291
 
292
- # Build generation kwargs
293
- gen_kwargs = {
294
  "input_ids": input_ids,
295
  "attention_mask": inputs.get("attention_mask"),
296
  "max_new_tokens": max_new,
297
  "do_sample": True,
298
  "temperature": max(req.temperature, 0.01),
299
  "top_p": req.top_p,
300
- "eos_token_id": tokenizer.convert_tokens_to_ids("<|im_end|>"),
301
- "pad_token_id": tokenizer.eos_token_id,
302
  }
303
 
304
  rep_penalty = getattr(req, "repetition_penalty", 1.0)
@@ -309,16 +337,12 @@ def generate_text(prompt: str, req) -> tuple[str, int, int]:
309
  with torch.no_grad():
310
  output_ids = model.generate(**gen_kwargs)
311
 
312
- # Slice off the prompt tokens
313
  new_ids = output_ids[0][prompt_tokens:]
314
  text = tokenizer.decode(new_ids, skip_special_tokens=False)
315
-
316
- # Clean trailing special tokens
317
- for tok in ["<|im_end|>", "<|endoftext|>"]:
318
- text = text.replace(tok, "")
319
 
320
  completion_tokens = len(new_ids)
321
- return text.strip(), prompt_tokens, completion_tokens
322
 
323
 
324
  def generate_text_stream(prompt: str, req):
@@ -332,15 +356,15 @@ def generate_text_stream(prompt: str, req):
332
  tokenizer, skip_prompt=True, skip_special_tokens=False
333
  )
334
 
335
- gen_kwargs = {
336
  "input_ids": input_ids,
337
  "attention_mask": inputs.get("attention_mask"),
338
  "max_new_tokens": max_new,
339
  "do_sample": True,
340
  "temperature": max(req.temperature, 0.01),
341
  "top_p": req.top_p,
342
- "eos_token_id": tokenizer.convert_tokens_to_ids("<|im_end|>"),
343
- "pad_token_id": tokenizer.eos_token_id,
344
  "streamer": streamer,
345
  }
346
 
@@ -352,9 +376,8 @@ def generate_text_stream(prompt: str, req):
352
  thread.start()
353
 
354
  for token_text in streamer:
355
- # Stop on special tokens
356
- if "<|im_end|>" in token_text or "<|endoftext|>" in token_text:
357
- cleaned = token_text.replace("<|im_end|>", "").replace("<|endoftext|>", "")
358
  if cleaned:
359
  yield cleaned
360
  break
@@ -430,7 +453,6 @@ def make_completion_response(
430
  # ━━━━━━━━━━━━━━━━━━ Streaming Helpers ━━━━━━━━━━━━━━━━━━━━━━━━
431
 
432
  def stream_chat_response(prompt: str, req):
433
- """SSE streaming for non-tool-call chat completions."""
434
  cid = _uid()
435
  created = int(time.time())
436
 
@@ -458,7 +480,6 @@ def stream_tool_call_chunks(
458
  tool_calls: list[dict],
459
  model_name: str,
460
  ):
461
- """SSE streaming for tool-call responses (post-generation)."""
462
  cid = _uid()
463
  created = int(time.time())
464
 
@@ -501,7 +522,7 @@ def stream_tool_call_chunks(
501
  @app.get("/")
502
  async def root():
503
  return {
504
- "message": "Qwen3.5-0.8B OpenAI-Compatible API (CPU) with Tool Calling",
505
  "docs": "/docs",
506
  "endpoints": {
507
  "models": "/v1/models",
@@ -520,7 +541,7 @@ async def list_models():
520
  "id": MODEL_NAME,
521
  "object": "model",
522
  "created": int(time.time()),
523
- "owned_by": "local",
524
  }],
525
  }
526
 
 
1
  """
2
  =============================================================================
3
+ Transformers + FastAPI β€” OpenAI-Compatible Server for SmolLM2-360M
4
  CPU-ONLY β€’ TOOL CALLING β€’ STREAMING β€’ Port 7860 (HF Spaces)
5
  =============================================================================
6
  """
 
10
  import re
11
  import time
12
  import uuid
13
+ from threading import Lock, Thread
14
  from typing import Any, Optional, Union
15
 
16
  import torch
 
20
  from fastapi.responses import JSONResponse, StreamingResponse
21
  from pydantic import BaseModel
22
  from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 
23
 
24
  # ━━━━━━━━━━━━━━━━━━━━━━━━━━ CONFIG ━━━━━━━━━━━━━━━━━━━━━━━━━━━━
25
+ MODEL_NAME = "HuggingFaceTB/SmolLM2-360M"
26
  HOST = "0.0.0.0"
27
  PORT = 7860
28
  MAX_NEW_TOKENS = 1024
29
  # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
30
 
31
  app = FastAPI(
32
+ title="SmolLM2-360M OpenAI-Compatible API (CPU)",
33
  description="Transformers-powered inference with tool calling β€” runs on CPU",
34
  version="2.0.0",
35
  )
 
41
  allow_headers=["*"],
42
  )
43
 
 
44
 
45
+ # ━━━━━━━━━━━━━━━━━━━━━━ Pydantic Models ━━━━━━━━━━━━━━━━━━━━━━━
46
 
47
  class FunctionDef(BaseModel):
48
  name: str
 
84
  stop: Optional[Union[str, list[str]]] = None
85
  frequency_penalty: Optional[float] = 0.0
86
  presence_penalty: Optional[float] = 0.0
87
+ repetition_penalty: Optional[float] = 1.1
88
  n: Optional[int] = 1
89
  tools: Optional[list[ToolDef]] = None
90
  tool_choice: Optional[Union[str, dict]] = None
 
100
  stop: Optional[Union[str, list[str]]] = None
101
  frequency_penalty: Optional[float] = 0.0
102
  presence_penalty: Optional[float] = 0.0
103
+ repetition_penalty: Optional[float] = 1.1
104
  n: Optional[int] = 1
105
 
106
 
 
109
  tokenizer = None
110
  model = None
111
  generate_lock = Lock()
112
+ # Will hold all token IDs the model should stop on
113
+ stop_token_ids: list[int] = []
114
 
115
 
116
  def load_model():
117
+ global tokenizer, model, stop_token_ids
118
  if model is not None:
119
  return
120
 
 
123
 
124
  tokenizer = AutoTokenizer.from_pretrained(
125
  MODEL_NAME,
126
+ trust_remote_code=True,
127
  )
128
+ # Ensure pad token exists
129
+ if tokenizer.pad_token is None:
130
+ tokenizer.pad_token = tokenizer.eos_token
131
 
132
  model = AutoModelForCausalLM.from_pretrained(
133
  MODEL_NAME,
 
137
  )
138
  model.eval()
139
 
140
+ # Build stop-token list: eos + any ChatML special tokens the vocab has
141
+ _stop_ids = set()
142
+ _stop_ids.add(tokenizer.eos_token_id)
143
+ for tok_str in ["<|im_end|>", "<|endoftext|>"]:
144
+ tid = tokenizer.convert_tokens_to_ids(tok_str)
145
+ # convert_tokens_to_ids returns unk_id when token is missing
146
+ if tid != tokenizer.unk_token_id and tid is not None:
147
+ _stop_ids.add(tid)
148
+ stop_token_ids = list(_stop_ids)
149
+
150
+ print(f" eos_token = {tokenizer.eos_token!r}")
151
+ print(f" stop_token_ids = {stop_token_ids}")
152
  print("βœ… Model loaded on CPU!\n")
153
 
154
 
155
+ # ━━━━━━━━━━━━━━━━━━━━ Chat-Prompt Builder (ChatML) ━━━━━━━━━━━━
156
+ #
157
+ # SmolLM2 uses the ChatML template:
158
+ # <|im_start|>system\n...<|im_end|>\n
159
+ # <|im_start|>user\n...<|im_end|>\n
160
+ # <|im_start|>assistant\n...<|im_end|>\n
161
+ #
162
+ # For tool calling we inject Hermes-style tool defs into the system prompt.
163
+ #
164
 
165
  TOOL_SYSTEM_PROMPT_TEMPLATE = """\
166
+ You are a helpful assistant.
167
 
168
  # Tools
169
 
 
179
  {{"name": "<function-name>", "arguments": <args-json-object>}}
180
  </tool_call>"""
181
 
182
+ NO_TOOL_SYSTEM_PROMPT = "You are a helpful assistant."
 
 
183
 
184
 
185
  def _serialize_tool_definitions(tools: list[ToolDef]) -> str:
 
303
 
304
  # ━━━━━━━━━━━━━━━━━━ Generation ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
305
 
306
+ def _clean_output(text: str) -> str:
307
+ """Strip all known special / stop tokens from generated text."""
308
+ for tok in ["<|im_end|>", "<|im_start|>", "<|endoftext|>"]:
309
+ text = text.replace(tok, "")
310
+ return text.strip()
311
+
312
+
313
  def generate_text(prompt: str, req) -> tuple[str, int, int]:
314
+ """Generate on CPU. Returns (text, prompt_tokens, completion_tokens)."""
315
  inputs = tokenizer(prompt, return_tensors="pt")
316
  input_ids = inputs["input_ids"]
317
  prompt_tokens = input_ids.shape[1]
318
 
319
  max_new = req.max_tokens or MAX_NEW_TOKENS
320
 
321
+ gen_kwargs: dict[str, Any] = {
 
322
  "input_ids": input_ids,
323
  "attention_mask": inputs.get("attention_mask"),
324
  "max_new_tokens": max_new,
325
  "do_sample": True,
326
  "temperature": max(req.temperature, 0.01),
327
  "top_p": req.top_p,
328
+ "eos_token_id": stop_token_ids,
329
+ "pad_token_id": tokenizer.pad_token_id,
330
  }
331
 
332
  rep_penalty = getattr(req, "repetition_penalty", 1.0)
 
337
  with torch.no_grad():
338
  output_ids = model.generate(**gen_kwargs)
339
 
 
340
  new_ids = output_ids[0][prompt_tokens:]
341
  text = tokenizer.decode(new_ids, skip_special_tokens=False)
342
+ text = _clean_output(text)
 
 
 
343
 
344
  completion_tokens = len(new_ids)
345
+ return text, prompt_tokens, completion_tokens
346
 
347
 
348
  def generate_text_stream(prompt: str, req):
 
356
  tokenizer, skip_prompt=True, skip_special_tokens=False
357
  )
358
 
359
+ gen_kwargs: dict[str, Any] = {
360
  "input_ids": input_ids,
361
  "attention_mask": inputs.get("attention_mask"),
362
  "max_new_tokens": max_new,
363
  "do_sample": True,
364
  "temperature": max(req.temperature, 0.01),
365
  "top_p": req.top_p,
366
+ "eos_token_id": stop_token_ids,
367
+ "pad_token_id": tokenizer.pad_token_id,
368
  "streamer": streamer,
369
  }
370
 
 
376
  thread.start()
377
 
378
  for token_text in streamer:
379
+ if any(s in token_text for s in ["<|im_end|>", "<|endoftext|>"]):
380
+ cleaned = _clean_output(token_text)
 
381
  if cleaned:
382
  yield cleaned
383
  break
 
453
  # ━━━━━━━━━━━━━━━━━━ Streaming Helpers ━━━━━━━━━━━━━━━━━━━━━━━━
454
 
455
  def stream_chat_response(prompt: str, req):
 
456
  cid = _uid()
457
  created = int(time.time())
458
 
 
480
  tool_calls: list[dict],
481
  model_name: str,
482
  ):
 
483
  cid = _uid()
484
  created = int(time.time())
485
 
 
522
  @app.get("/")
523
  async def root():
524
  return {
525
+ "message": "SmolLM2-360M OpenAI-Compatible API (CPU) with Tool Calling",
526
  "docs": "/docs",
527
  "endpoints": {
528
  "models": "/v1/models",
 
541
  "id": MODEL_NAME,
542
  "object": "model",
543
  "created": int(time.time()),
544
+ "owned_by": "huggingface",
545
  }],
546
  }
547