likhonsheikh commited on
Commit
c880d13
·
verified ·
1 Parent(s): 49560dc

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +218 -47
app.py CHANGED
@@ -1,13 +1,14 @@
1
  """
2
  Anthropic-Compatible API Endpoint
3
  Lightweight CPU-based implementation for Hugging Face Spaces
4
- Full Anthropic API parameter compatibility
5
  """
6
 
7
  import os
8
  import time
9
  import uuid
10
  import logging
 
11
  from datetime import datetime
12
  from logging.handlers import RotatingFileHandler
13
  from typing import List, Optional, Union, Dict, Any, Literal
@@ -83,7 +84,7 @@ async def lifespan(app: FastAPI):
83
 
84
  app = FastAPI(
85
  title="Anthropic-Compatible API",
86
- description="Lightweight CPU-based API with full Anthropic Messages API compatibility",
87
  version="1.0.0",
88
  lifespan=lifespan
89
  )
@@ -184,6 +185,16 @@ class SystemContent(BaseModel):
184
  text: str
185
  cache_control: Optional[Dict[str, str]] = None
186
 
 
 
 
 
 
 
 
 
 
 
187
  # Main request model (matching Anthropic exactly)
188
  class MessageRequest(BaseModel):
189
  # Required parameters
@@ -202,25 +213,33 @@ class MessageRequest(BaseModel):
202
  top_k: Optional[int] = Field(default=None, ge=0)
203
  top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0)
204
 
205
- # Usage model (matching Anthropic exactly)
 
 
 
206
  class Usage(BaseModel):
207
  input_tokens: int
208
  output_tokens: int
209
  cache_creation_input_tokens: Optional[int] = None
210
  cache_read_input_tokens: Optional[int] = None
211
 
212
- # Response content block
213
  class ResponseTextBlock(BaseModel):
214
  type: Literal["text"] = "text"
215
  text: str
216
 
 
 
 
 
 
217
  class ResponseToolUseBlock(BaseModel):
218
  type: Literal["tool_use"] = "tool_use"
219
  id: str
220
  name: str
221
  input: Dict[str, Any]
222
 
223
- ResponseContentBlock = Union[ResponseTextBlock, ResponseToolUseBlock]
224
 
225
  # Main response model (matching Anthropic exactly)
226
  class MessageResponse(BaseModel):
@@ -248,6 +267,7 @@ class TokenCountRequest(BaseModel):
248
  messages: List[Message]
249
  system: Optional[Union[str, List[SystemContent]]] = None
250
  tools: Optional[List[Tool]] = None
 
251
 
252
  class TokenCountResponse(BaseModel):
253
  input_tokens: int
@@ -281,11 +301,36 @@ def extract_system_content(system: Optional[Union[str, List[SystemContent]]]) ->
281
  texts.append(block.text)
282
  return " ".join(texts)
283
 
284
- def format_messages(messages: List[Message], system: Optional[Union[str, List[SystemContent]]] = None) -> str:
285
- """Format messages into a prompt string"""
 
 
 
 
 
286
  formatted_messages = []
287
 
288
  system_text = extract_system_content(system)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
  if system_text:
290
  formatted_messages.append({"role": "system", "content": system_text})
291
 
@@ -305,6 +350,22 @@ def format_messages(messages: List[Message], system: Optional[Union[str, List[Sy
305
  prompt += "Assistant: "
306
  return prompt
307
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
  def generate_id() -> str:
309
  return f"msg_{uuid.uuid4().hex[:24]}"
310
 
@@ -318,6 +379,7 @@ async def root():
318
  "model": MODEL_ID,
319
  "api_version": "2023-06-01",
320
  "compatibility": "anthropic-messages-api",
 
321
  "log_file": LOG_FILE
322
  }
323
 
@@ -331,7 +393,8 @@ async def list_models():
331
  "object": "model",
332
  "created": int(time.time()),
333
  "owned_by": "huggingface",
334
- "display_name": "SmolLM2 135M Instruct"
 
335
  }]
336
  }
337
 
@@ -357,13 +420,27 @@ async def create_message(
357
  anthropic_version: Optional[str] = Header(None, alias="anthropic-version"),
358
  anthropic_beta: Optional[str] = Header(None, alias="anthropic-beta")
359
  ):
360
- """Create a message (Anthropic Messages API compatible)"""
361
  message_id = generate_id()
362
- logger.info(f"[{message_id}] Creating message - model: {request.model}, max_tokens: {request.max_tokens}, stream: {request.stream}")
363
- logger.debug(f"[{message_id}] Request params - temp: {request.temperature}, top_p: {request.top_p}, top_k: {request.top_k}")
 
 
 
 
 
 
 
 
364
 
365
  try:
366
- prompt = format_messages(request.messages, request.system)
 
 
 
 
 
 
367
  logger.debug(f"[{message_id}] Prompt length: {len(prompt)} chars")
368
 
369
  inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
@@ -372,29 +449,28 @@ async def create_message(
372
 
373
  if request.stream:
374
  logger.info(f"[{message_id}] Starting streaming response")
375
- return await stream_response(request, inputs, input_token_count, message_id)
376
 
377
- # Build generation kwargs matching Anthropic params
 
 
 
 
 
378
  gen_kwargs = {
379
- "max_new_tokens": request.max_tokens,
380
  "do_sample": request.temperature > 0 if request.temperature else False,
381
  "pad_token_id": tokenizer.eos_token_id,
382
  "eos_token_id": tokenizer.eos_token_id,
383
  }
384
 
385
- # Temperature (Anthropic default: 1.0)
386
  if request.temperature is not None and request.temperature > 0:
387
  gen_kwargs["temperature"] = request.temperature
388
-
389
- # Top-p (nucleus sampling)
390
  if request.top_p is not None:
391
  gen_kwargs["top_p"] = request.top_p
392
-
393
- # Top-k sampling
394
  if request.top_k is not None:
395
  gen_kwargs["top_k"] = request.top_k
396
 
397
- # Stop sequences
398
  if request.stop_sequences:
399
  stop_token_ids = []
400
  for seq in request.stop_sequences:
@@ -413,17 +489,27 @@ async def create_message(
413
  generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
414
  output_token_count = len(generated_tokens)
415
 
 
 
 
 
 
 
 
 
 
 
 
416
  # Determine stop reason
417
  stop_reason = "end_turn"
418
  stop_sequence = None
419
- if output_token_count >= request.max_tokens:
420
  stop_reason = "max_tokens"
421
  elif request.stop_sequences:
422
  for seq in request.stop_sequences:
423
  if seq in generated_text:
424
  stop_reason = "stop_sequence"
425
  stop_sequence = seq
426
- generated_text = generated_text.split(seq)[0]
427
  break
428
 
429
  tokens_per_sec = output_token_count / gen_time if gen_time > 0 else 0
@@ -431,7 +517,7 @@ async def create_message(
431
 
432
  response = MessageResponse(
433
  id=message_id,
434
- content=[ResponseTextBlock(type="text", text=generated_text.strip())],
435
  model=request.model,
436
  stop_reason=stop_reason,
437
  stop_sequence=stop_sequence,
@@ -446,8 +532,15 @@ async def create_message(
446
  logger.error(f"[{message_id}] Error creating message: {e}", exc_info=True)
447
  raise HTTPException(status_code=500, detail=str(e))
448
 
449
- async def stream_response(request: MessageRequest, inputs, input_token_count: int, message_id: str):
450
- """Stream response using SSE (Server-Sent Events) - Anthropic format"""
 
 
 
 
 
 
 
451
 
452
  async def generate():
453
  # message_start event
@@ -466,22 +559,24 @@ async def stream_response(request: MessageRequest, inputs, input_token_count: in
466
  }
467
  yield f"event: message_start\ndata: {json.dumps(start_event)}\n\n"
468
 
469
- # content_block_start event
470
- block_start = {
471
- "type": "content_block_start",
472
- "index": 0,
473
- "content_block": {"type": "text", "text": ""}
474
- }
475
- yield f"event: content_block_start\ndata: {json.dumps(block_start)}\n\n"
476
 
477
- # ping event (Anthropic sends these)
478
  yield f"event: ping\ndata: {json.dumps({'type': 'ping'})}\n\n"
479
 
480
  streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
481
 
 
 
 
 
482
  gen_kwargs = {
483
  **inputs,
484
- "max_new_tokens": request.max_tokens,
485
  "do_sample": request.temperature > 0 if request.temperature else False,
486
  "pad_token_id": tokenizer.eos_token_id,
487
  "eos_token_id": tokenizer.eos_token_id,
@@ -500,26 +595,89 @@ async def stream_response(request: MessageRequest, inputs, input_token_count: in
500
  thread.start()
501
 
502
  output_tokens = 0
 
 
503
  for text in streamer:
504
  if text:
505
  output_tokens += len(tokenizer.encode(text, add_special_tokens=False))
506
- delta_event = {
507
- "type": "content_block_delta",
508
- "index": 0,
509
- "delta": {"type": "text_delta", "text": text}
510
- }
511
- yield f"event: content_block_delta\ndata: {json.dumps(delta_event)}\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
512
 
513
  thread.join()
514
  gen_time = time.time() - gen_start
515
  tokens_per_sec = output_tokens / gen_time if gen_time > 0 else 0
516
  logger.info(f"[{message_id}] Stream completed: {output_tokens} tokens in {gen_time:.2f}s ({tokens_per_sec:.1f} tok/s)")
517
 
518
- # content_block_stop event
519
- yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': 0})}\n\n"
520
 
521
  # message_delta event
522
- stop_reason = "max_tokens" if output_tokens >= request.max_tokens else "end_turn"
523
  delta = {
524
  "type": "message_delta",
525
  "delta": {"stop_reason": stop_reason, "stop_sequence": None},
@@ -543,14 +701,27 @@ async def stream_response(request: MessageRequest, inputs, input_token_count: in
543
  @app.post("/v1/messages/count_tokens", response_model=TokenCountResponse)
544
  async def count_tokens(request: TokenCountRequest):
545
  """Count tokens for a message request (Anthropic compatible)"""
546
- prompt = format_messages(request.messages, request.system)
 
 
 
 
 
 
 
 
547
  tokens = tokenizer.encode(prompt)
548
- logger.debug(f"Token count request: {len(tokens)} tokens")
549
  return TokenCountResponse(input_tokens=len(tokens))
550
 
551
  @app.get("/health")
552
  async def health():
553
- return {"status": "ok", "model_loaded": model is not None, "log_file": LOG_FILE}
 
 
 
 
 
554
 
555
  if __name__ == "__main__":
556
  import uvicorn
 
1
  """
2
  Anthropic-Compatible API Endpoint
3
  Lightweight CPU-based implementation for Hugging Face Spaces
4
+ Full Anthropic API parameter compatibility with Extended Thinking support
5
  """
6
 
7
  import os
8
  import time
9
  import uuid
10
  import logging
11
+ import re
12
  from datetime import datetime
13
  from logging.handlers import RotatingFileHandler
14
  from typing import List, Optional, Union, Dict, Any, Literal
 
84
 
85
  app = FastAPI(
86
  title="Anthropic-Compatible API",
87
+ description="Lightweight CPU-based API with full Anthropic Messages API compatibility including Extended Thinking",
88
  version="1.0.0",
89
  lifespan=lifespan
90
  )
 
185
  text: str
186
  cache_control: Optional[Dict[str, str]] = None
187
 
188
+ # ============== Extended Thinking (ThinkingConfig) ==============
189
+ class ThinkingConfig(BaseModel):
190
+ """
191
+ Extended thinking configuration (matching Anthropic's ThinkingConfig)
192
+ Enables Claude to think through complex problems before responding
193
+ """
194
+ type: Literal["enabled", "disabled"] = "enabled"
195
+ # Budget tokens for thinking (Anthropic uses budget_tokens)
196
+ budget_tokens: Optional[int] = Field(default=1024, ge=1, le=128000)
197
+
198
  # Main request model (matching Anthropic exactly)
199
  class MessageRequest(BaseModel):
200
  # Required parameters
 
213
  top_k: Optional[int] = Field(default=None, ge=0)
214
  top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0)
215
 
216
+ # Extended Thinking (ThinkingConfig)
217
+ thinking: Optional[ThinkingConfig] = None
218
+
219
+ # Usage model (matching Anthropic exactly with thinking tokens)
220
  class Usage(BaseModel):
221
  input_tokens: int
222
  output_tokens: int
223
  cache_creation_input_tokens: Optional[int] = None
224
  cache_read_input_tokens: Optional[int] = None
225
 
226
+ # Response content blocks
227
  class ResponseTextBlock(BaseModel):
228
  type: Literal["text"] = "text"
229
  text: str
230
 
231
+ class ResponseThinkingBlock(BaseModel):
232
+ """Thinking block in response (matching Anthropic's thinking content block)"""
233
+ type: Literal["thinking"] = "thinking"
234
+ thinking: str
235
+
236
  class ResponseToolUseBlock(BaseModel):
237
  type: Literal["tool_use"] = "tool_use"
238
  id: str
239
  name: str
240
  input: Dict[str, Any]
241
 
242
+ ResponseContentBlock = Union[ResponseTextBlock, ResponseThinkingBlock, ResponseToolUseBlock]
243
 
244
  # Main response model (matching Anthropic exactly)
245
  class MessageResponse(BaseModel):
 
267
  messages: List[Message]
268
  system: Optional[Union[str, List[SystemContent]]] = None
269
  tools: Optional[List[Tool]] = None
270
+ thinking: Optional[ThinkingConfig] = None
271
 
272
  class TokenCountResponse(BaseModel):
273
  input_tokens: int
 
301
  texts.append(block.text)
302
  return " ".join(texts)
303
 
304
+ def format_messages_with_thinking(
305
+ messages: List[Message],
306
+ system: Optional[Union[str, List[SystemContent]]] = None,
307
+ thinking_enabled: bool = False,
308
+ budget_tokens: int = 1024
309
+ ) -> str:
310
+ """Format messages with optional thinking prompt"""
311
  formatted_messages = []
312
 
313
  system_text = extract_system_content(system)
314
+
315
+ # Add thinking instructions to system prompt if enabled
316
+ if thinking_enabled:
317
+ thinking_instruction = f"""You are a helpful AI assistant with extended thinking capabilities.
318
+
319
+ When responding to complex problems:
320
+ 1. First, think through the problem step by step inside <thinking>...</thinking> tags
321
+ 2. Consider multiple approaches and evaluate them
322
+ 3. Show your reasoning process clearly
323
+ 4. After thinking, provide your final answer outside the thinking tags
324
+
325
+ Budget for thinking: up to {budget_tokens} tokens for reasoning.
326
+
327
+ Think deeply and thoroughly before responding."""
328
+
329
+ if system_text:
330
+ system_text = f"{thinking_instruction}\n\n{system_text}"
331
+ else:
332
+ system_text = thinking_instruction
333
+
334
  if system_text:
335
  formatted_messages.append({"role": "system", "content": system_text})
336
 
 
350
  prompt += "Assistant: "
351
  return prompt
352
 
353
+ def parse_thinking_response(text: str) -> tuple:
354
+ """
355
+ Parse response to extract thinking and final answer
356
+ Returns: (thinking_text, answer_text)
357
+ """
358
+ thinking_pattern = r'<thinking>(.*?)</thinking>'
359
+ thinking_matches = re.findall(thinking_pattern, text, re.DOTALL)
360
+
361
+ if thinking_matches:
362
+ thinking_text = "\n".join(thinking_matches).strip()
363
+ # Remove thinking blocks from response
364
+ answer_text = re.sub(thinking_pattern, '', text, flags=re.DOTALL).strip()
365
+ return thinking_text, answer_text
366
+ else:
367
+ return None, text.strip()
368
+
369
  def generate_id() -> str:
370
  return f"msg_{uuid.uuid4().hex[:24]}"
371
 
 
379
  "model": MODEL_ID,
380
  "api_version": "2023-06-01",
381
  "compatibility": "anthropic-messages-api",
382
+ "features": ["extended-thinking", "streaming", "tool-use"],
383
  "log_file": LOG_FILE
384
  }
385
 
 
393
  "object": "model",
394
  "created": int(time.time()),
395
  "owned_by": "huggingface",
396
+ "display_name": "SmolLM2 135M Instruct",
397
+ "supports_thinking": True
398
  }]
399
  }
400
 
 
420
  anthropic_version: Optional[str] = Header(None, alias="anthropic-version"),
421
  anthropic_beta: Optional[str] = Header(None, alias="anthropic-beta")
422
  ):
423
+ """Create a message (Anthropic Messages API compatible with Extended Thinking)"""
424
  message_id = generate_id()
425
+
426
+ # Check if thinking is enabled
427
+ thinking_enabled = False
428
+ budget_tokens = 1024
429
+ if request.thinking:
430
+ thinking_enabled = request.thinking.type == "enabled"
431
+ budget_tokens = request.thinking.budget_tokens or 1024
432
+
433
+ logger.info(f"[{message_id}] Creating message - model: {request.model}, max_tokens: {request.max_tokens}, stream: {request.stream}, thinking: {thinking_enabled}")
434
+ logger.debug(f"[{message_id}] Request params - temp: {request.temperature}, top_p: {request.top_p}, top_k: {request.top_k}, thinking_budget: {budget_tokens}")
435
 
436
  try:
437
+ # Format prompt with thinking if enabled
438
+ prompt = format_messages_with_thinking(
439
+ request.messages,
440
+ request.system,
441
+ thinking_enabled=thinking_enabled,
442
+ budget_tokens=budget_tokens
443
+ )
444
  logger.debug(f"[{message_id}] Prompt length: {len(prompt)} chars")
445
 
446
  inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
 
449
 
450
  if request.stream:
451
  logger.info(f"[{message_id}] Starting streaming response")
452
+ return await stream_response(request, inputs, input_token_count, message_id, thinking_enabled, budget_tokens)
453
 
454
+ # Calculate max tokens (include thinking budget if enabled)
455
+ total_max_tokens = request.max_tokens
456
+ if thinking_enabled:
457
+ total_max_tokens += budget_tokens
458
+
459
+ # Build generation kwargs
460
  gen_kwargs = {
461
+ "max_new_tokens": total_max_tokens,
462
  "do_sample": request.temperature > 0 if request.temperature else False,
463
  "pad_token_id": tokenizer.eos_token_id,
464
  "eos_token_id": tokenizer.eos_token_id,
465
  }
466
 
 
467
  if request.temperature is not None and request.temperature > 0:
468
  gen_kwargs["temperature"] = request.temperature
 
 
469
  if request.top_p is not None:
470
  gen_kwargs["top_p"] = request.top_p
 
 
471
  if request.top_k is not None:
472
  gen_kwargs["top_k"] = request.top_k
473
 
 
474
  if request.stop_sequences:
475
  stop_token_ids = []
476
  for seq in request.stop_sequences:
 
489
  generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
490
  output_token_count = len(generated_tokens)
491
 
492
+ # Parse thinking from response if enabled
493
+ content_blocks = []
494
+ if thinking_enabled:
495
+ thinking_text, answer_text = parse_thinking_response(generated_text)
496
+ if thinking_text:
497
+ logger.info(f"[{message_id}] Thinking extracted: {len(thinking_text)} chars")
498
+ content_blocks.append(ResponseThinkingBlock(type="thinking", thinking=thinking_text))
499
+ content_blocks.append(ResponseTextBlock(type="text", text=answer_text))
500
+ else:
501
+ content_blocks.append(ResponseTextBlock(type="text", text=generated_text.strip()))
502
+
503
  # Determine stop reason
504
  stop_reason = "end_turn"
505
  stop_sequence = None
506
+ if output_token_count >= total_max_tokens:
507
  stop_reason = "max_tokens"
508
  elif request.stop_sequences:
509
  for seq in request.stop_sequences:
510
  if seq in generated_text:
511
  stop_reason = "stop_sequence"
512
  stop_sequence = seq
 
513
  break
514
 
515
  tokens_per_sec = output_token_count / gen_time if gen_time > 0 else 0
 
517
 
518
  response = MessageResponse(
519
  id=message_id,
520
+ content=content_blocks,
521
  model=request.model,
522
  stop_reason=stop_reason,
523
  stop_sequence=stop_sequence,
 
532
  logger.error(f"[{message_id}] Error creating message: {e}", exc_info=True)
533
  raise HTTPException(status_code=500, detail=str(e))
534
 
535
+ async def stream_response(
536
+ request: MessageRequest,
537
+ inputs,
538
+ input_token_count: int,
539
+ message_id: str,
540
+ thinking_enabled: bool = False,
541
+ budget_tokens: int = 1024
542
+ ):
543
+ """Stream response using SSE (Server-Sent Events) - Anthropic format with thinking support"""
544
 
545
  async def generate():
546
  # message_start event
 
559
  }
560
  yield f"event: message_start\ndata: {json.dumps(start_event)}\n\n"
561
 
562
+ # If thinking is enabled, we'll track thinking vs text blocks
563
+ block_index = 0
564
+ in_thinking = False
565
+ thinking_started = False
566
+ text_block_started = False
 
 
567
 
568
+ # ping event
569
  yield f"event: ping\ndata: {json.dumps({'type': 'ping'})}\n\n"
570
 
571
  streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
572
 
573
+ total_max_tokens = request.max_tokens
574
+ if thinking_enabled:
575
+ total_max_tokens += budget_tokens
576
+
577
  gen_kwargs = {
578
  **inputs,
579
+ "max_new_tokens": total_max_tokens,
580
  "do_sample": request.temperature > 0 if request.temperature else False,
581
  "pad_token_id": tokenizer.eos_token_id,
582
  "eos_token_id": tokenizer.eos_token_id,
 
595
  thread.start()
596
 
597
  output_tokens = 0
598
+ accumulated_text = ""
599
+
600
  for text in streamer:
601
  if text:
602
  output_tokens += len(tokenizer.encode(text, add_special_tokens=False))
603
+ accumulated_text += text
604
+
605
+ if thinking_enabled:
606
+ # Check for thinking tags
607
+ if "<thinking>" in accumulated_text and not thinking_started:
608
+ # Start thinking block
609
+ thinking_started = True
610
+ in_thinking = True
611
+ block_start = {
612
+ "type": "content_block_start",
613
+ "index": block_index,
614
+ "content_block": {"type": "thinking", "thinking": ""}
615
+ }
616
+ yield f"event: content_block_start\ndata: {json.dumps(block_start)}\n\n"
617
+
618
+ if in_thinking:
619
+ # Stream thinking content
620
+ clean_text = text.replace("<thinking>", "").replace("</thinking>", "")
621
+ if clean_text:
622
+ delta_event = {
623
+ "type": "content_block_delta",
624
+ "index": block_index,
625
+ "delta": {"type": "thinking_delta", "thinking": clean_text}
626
+ }
627
+ yield f"event: content_block_delta\ndata: {json.dumps(delta_event)}\n\n"
628
+
629
+ if "</thinking>" in accumulated_text:
630
+ # End thinking block
631
+ in_thinking = False
632
+ yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': block_index})}\n\n"
633
+ block_index += 1
634
+
635
+ # Start text block
636
+ text_block_started = True
637
+ block_start = {
638
+ "type": "content_block_start",
639
+ "index": block_index,
640
+ "content_block": {"type": "text", "text": ""}
641
+ }
642
+ yield f"event: content_block_start\ndata: {json.dumps(block_start)}\n\n"
643
+
644
+ elif text_block_started:
645
+ # Stream text content
646
+ delta_event = {
647
+ "type": "content_block_delta",
648
+ "index": block_index,
649
+ "delta": {"type": "text_delta", "text": text}
650
+ }
651
+ yield f"event: content_block_delta\ndata: {json.dumps(delta_event)}\n\n"
652
+
653
+ else:
654
+ # No thinking - just stream text
655
+ if not text_block_started:
656
+ text_block_started = True
657
+ block_start = {
658
+ "type": "content_block_start",
659
+ "index": 0,
660
+ "content_block": {"type": "text", "text": ""}
661
+ }
662
+ yield f"event: content_block_start\ndata: {json.dumps(block_start)}\n\n"
663
+
664
+ delta_event = {
665
+ "type": "content_block_delta",
666
+ "index": 0,
667
+ "delta": {"type": "text_delta", "text": text}
668
+ }
669
+ yield f"event: content_block_delta\ndata: {json.dumps(delta_event)}\n\n"
670
 
671
  thread.join()
672
  gen_time = time.time() - gen_start
673
  tokens_per_sec = output_tokens / gen_time if gen_time > 0 else 0
674
  logger.info(f"[{message_id}] Stream completed: {output_tokens} tokens in {gen_time:.2f}s ({tokens_per_sec:.1f} tok/s)")
675
 
676
+ # content_block_stop for final block
677
+ yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': block_index})}\n\n"
678
 
679
  # message_delta event
680
+ stop_reason = "max_tokens" if output_tokens >= total_max_tokens else "end_turn"
681
  delta = {
682
  "type": "message_delta",
683
  "delta": {"stop_reason": stop_reason, "stop_sequence": None},
 
701
  @app.post("/v1/messages/count_tokens", response_model=TokenCountResponse)
702
  async def count_tokens(request: TokenCountRequest):
703
  """Count tokens for a message request (Anthropic compatible)"""
704
+ thinking_enabled = request.thinking and request.thinking.type == "enabled"
705
+ budget_tokens = request.thinking.budget_tokens if request.thinking else 1024
706
+
707
+ prompt = format_messages_with_thinking(
708
+ request.messages,
709
+ request.system,
710
+ thinking_enabled=thinking_enabled,
711
+ budget_tokens=budget_tokens
712
+ )
713
  tokens = tokenizer.encode(prompt)
714
+ logger.debug(f"Token count request: {len(tokens)} tokens (thinking: {thinking_enabled})")
715
  return TokenCountResponse(input_tokens=len(tokens))
716
 
717
  @app.get("/health")
718
  async def health():
719
+ return {
720
+ "status": "ok",
721
+ "model_loaded": model is not None,
722
+ "log_file": LOG_FILE,
723
+ "features": ["extended-thinking", "streaming"]
724
+ }
725
 
726
  if __name__ == "__main__":
727
  import uvicorn