likhonsheikh committed on
Commit
2cd298a
·
verified ·
1 Parent(s): c880d13

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +387 -306
app.py CHANGED
@@ -1,7 +1,8 @@
1
  """
2
- Anthropic-Compatible API Endpoint
3
  Lightweight CPU-based implementation for Hugging Face Spaces
4
- Full Anthropic API parameter compatibility with Extended Thinking support
 
5
  """
6
 
7
  import os
@@ -44,14 +45,14 @@ console_handler.setFormatter(log_format)
44
  console_handler.setLevel(logging.INFO)
45
 
46
  logging.basicConfig(level=logging.DEBUG, handlers=[file_handler, console_handler])
47
- logger = logging.getLogger("anthropic-api")
48
 
49
  for uvicorn_logger in ["uvicorn", "uvicorn.error", "uvicorn.access"]:
50
  uv_log = logging.getLogger(uvicorn_logger)
51
  uv_log.handlers = [file_handler, console_handler]
52
 
53
  logger.info("=" * 60)
54
- logger.info(f"Application Startup at {datetime.now().isoformat()}")
55
  logger.info(f"Log file: {LOG_FILE}")
56
  logger.info("=" * 60)
57
 
@@ -83,8 +84,12 @@ async def lifespan(app: FastAPI):
83
  del model, tokenizer
84
 
85
  app = FastAPI(
86
- title="Anthropic-Compatible API",
87
- description="Lightweight CPU-based API with full Anthropic Messages API compatibility including Extended Thinking",
 
 
 
 
88
  version="1.0.0",
89
  lifespan=lifespan
90
  )
@@ -112,170 +117,216 @@ async def log_requests(request: Request, call_next):
112
  logger.error(f"[{request_id}] {request.method} {request.url.path} - Error: {e} ({duration:.2f}ms)")
113
  raise
114
 
115
- # ============== Anthropic-Compatible Pydantic Models ==============
 
 
116
 
117
- # Content block types (matching Anthropic exactly)
118
- class TextBlock(BaseModel):
119
  type: Literal["text"] = "text"
120
  text: str
121
 
122
- class ImageSource(BaseModel):
123
  type: Literal["base64", "url"] = "base64"
124
  media_type: Optional[str] = None
125
  data: Optional[str] = None
126
  url: Optional[str] = None
127
 
128
- class ImageBlock(BaseModel):
129
  type: Literal["image"] = "image"
130
- source: ImageSource
131
 
132
- class ToolUseBlock(BaseModel):
133
  type: Literal["tool_use"] = "tool_use"
134
  id: str
135
  name: str
136
  input: Dict[str, Any]
137
 
138
- class ToolResultBlock(BaseModel):
139
  type: Literal["tool_result"] = "tool_result"
140
  tool_use_id: str
141
- content: Optional[Union[str, List[TextBlock]]] = None
142
  is_error: Optional[bool] = False
143
 
144
- ContentBlock = Union[TextBlock, ImageBlock, ToolUseBlock, ToolResultBlock]
145
 
146
- # Message structure (matching Anthropic exactly)
147
- class Message(BaseModel):
148
  role: Literal["user", "assistant"]
149
- content: Union[str, List[ContentBlock]]
150
 
151
- # Tool definition (matching Anthropic exactly)
152
- class ToolInputSchema(BaseModel):
153
  type: Literal["object"] = "object"
154
  properties: Optional[Dict[str, Any]] = None
155
  required: Optional[List[str]] = None
156
 
157
- class Tool(BaseModel):
158
  name: str
159
  description: Optional[str] = None
160
- input_schema: ToolInputSchema
161
 
162
- # Tool choice (matching Anthropic exactly)
163
- class ToolChoiceAuto(BaseModel):
164
  type: Literal["auto"] = "auto"
165
  disable_parallel_tool_use: Optional[bool] = None
166
 
167
- class ToolChoiceAny(BaseModel):
168
  type: Literal["any"] = "any"
169
  disable_parallel_tool_use: Optional[bool] = None
170
 
171
- class ToolChoiceTool(BaseModel):
172
  type: Literal["tool"] = "tool"
173
  name: str
174
  disable_parallel_tool_use: Optional[bool] = None
175
 
176
- ToolChoice = Union[ToolChoiceAuto, ToolChoiceAny, ToolChoiceTool]
177
 
178
- # Metadata (matching Anthropic exactly)
179
- class Metadata(BaseModel):
180
  user_id: Optional[str] = None
181
 
182
- # System content (matching Anthropic exactly)
183
- class SystemContent(BaseModel):
184
  type: Literal["text"] = "text"
185
  text: str
186
  cache_control: Optional[Dict[str, str]] = None
187
 
188
- # ============== Extended Thinking (ThinkingConfig) ==============
189
- class ThinkingConfig(BaseModel):
190
- """
191
- Extended thinking configuration (matching Anthropic's ThinkingConfig)
192
- Enables Claude to think through complex problems before responding
193
- """
194
  type: Literal["enabled", "disabled"] = "enabled"
195
- # Budget tokens for thinking (Anthropic uses budget_tokens)
196
  budget_tokens: Optional[int] = Field(default=1024, ge=1, le=128000)
197
 
198
- # Main request model (matching Anthropic exactly)
199
- class MessageRequest(BaseModel):
200
- # Required parameters
201
  model: str
202
  max_tokens: int
203
- messages: List[Message]
204
-
205
- # Optional parameters (matching Anthropic exactly)
206
- metadata: Optional[Metadata] = None
207
  stop_sequences: Optional[List[str]] = None
208
  stream: Optional[bool] = False
209
- system: Optional[Union[str, List[SystemContent]]] = None
210
  temperature: Optional[float] = Field(default=1.0, ge=0.0, le=1.0)
211
- tool_choice: Optional[ToolChoice] = None
212
- tools: Optional[List[Tool]] = None
213
  top_k: Optional[int] = Field(default=None, ge=0)
214
  top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0)
 
215
 
216
- # Extended Thinking (ThinkingConfig)
217
- thinking: Optional[ThinkingConfig] = None
218
-
219
- # Usage model (matching Anthropic exactly with thinking tokens)
220
- class Usage(BaseModel):
221
  input_tokens: int
222
  output_tokens: int
223
  cache_creation_input_tokens: Optional[int] = None
224
  cache_read_input_tokens: Optional[int] = None
225
 
226
- # Response content blocks
227
- class ResponseTextBlock(BaseModel):
228
  type: Literal["text"] = "text"
229
  text: str
230
 
231
- class ResponseThinkingBlock(BaseModel):
232
- """Thinking block in response (matching Anthropic's thinking content block)"""
233
  type: Literal["thinking"] = "thinking"
234
  thinking: str
235
 
236
- class ResponseToolUseBlock(BaseModel):
237
  type: Literal["tool_use"] = "tool_use"
238
  id: str
239
  name: str
240
  input: Dict[str, Any]
241
 
242
- ResponseContentBlock = Union[ResponseTextBlock, ResponseThinkingBlock, ResponseToolUseBlock]
243
 
244
- # Main response model (matching Anthropic exactly)
245
- class MessageResponse(BaseModel):
246
  id: str
247
  type: Literal["message"] = "message"
248
  role: Literal["assistant"] = "assistant"
249
- content: List[ResponseContentBlock]
250
  model: str
251
  stop_reason: Optional[Literal["end_turn", "max_tokens", "stop_sequence", "tool_use"]] = None
252
  stop_sequence: Optional[str] = None
253
- usage: Usage
 
 
 
 
 
 
 
 
 
 
254
 
255
- # Error response (matching Anthropic exactly)
256
- class ErrorDetail(BaseModel):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
  type: str
258
- message: str
259
 
260
- class ErrorResponse(BaseModel):
261
- type: Literal["error"] = "error"
262
- error: ErrorDetail
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
 
264
- # Token count request/response (matching Anthropic exactly)
265
- class TokenCountRequest(BaseModel):
 
 
266
  model: str
267
- messages: List[Message]
268
- system: Optional[Union[str, List[SystemContent]]] = None
269
- tools: Optional[List[Tool]] = None
270
- thinking: Optional[ThinkingConfig] = None
271
 
272
- class TokenCountResponse(BaseModel):
273
- input_tokens: int
 
 
 
 
 
 
 
274
 
275
  # ============== Helper Functions ==============
276
 
277
- def extract_text_content(content: Union[str, List[ContentBlock]]) -> str:
278
- """Extract text from content (string or list of blocks)"""
279
  if isinstance(content, str):
280
  return content
281
  texts = []
@@ -287,8 +338,7 @@ def extract_text_content(content: Union[str, List[ContentBlock]]) -> str:
287
  texts.append(block.text)
288
  return " ".join(texts)
289
 
290
- def extract_system_content(system: Optional[Union[str, List[SystemContent]]]) -> Optional[str]:
291
- """Extract system prompt from string or list of system content blocks"""
292
  if system is None:
293
  return None
294
  if isinstance(system, str):
@@ -301,18 +351,26 @@ def extract_system_content(system: Optional[Union[str, List[SystemContent]]]) ->
301
  texts.append(block.text)
302
  return " ".join(texts)
303
 
304
- def format_messages_with_thinking(
305
- messages: List[Message],
306
- system: Optional[Union[str, List[SystemContent]]] = None,
 
 
 
 
 
 
 
 
 
 
 
307
  thinking_enabled: bool = False,
308
  budget_tokens: int = 1024
309
  ) -> str:
310
- """Format messages with optional thinking prompt"""
311
  formatted_messages = []
 
312
 
313
- system_text = extract_system_content(system)
314
-
315
- # Add thinking instructions to system prompt if enabled
316
  if thinking_enabled:
317
  thinking_instruction = f"""You are a helpful AI assistant with extended thinking capabilities.
318
 
@@ -325,7 +383,6 @@ When responding to complex problems:
325
  Budget for thinking: up to {budget_tokens} tokens for reasoning.
326
 
327
  Think deeply and thoroughly before responding."""
328
-
329
  if system_text:
330
  system_text = f"{thinking_instruction}\n\n{system_text}"
331
  else:
@@ -335,13 +392,27 @@ Think deeply and thoroughly before responding."""
335
  formatted_messages.append({"role": "system", "content": system_text})
336
 
337
  for msg in messages:
338
- content = extract_text_content(msg.content)
339
  formatted_messages.append({"role": msg.role, "content": content})
340
 
341
  if tokenizer.chat_template:
342
- return tokenizer.apply_chat_template(
343
- formatted_messages, tokenize=False, add_generation_prompt=True
344
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
 
346
  prompt = ""
347
  for msg in formatted_messages:
@@ -351,41 +422,206 @@ Think deeply and thoroughly before responding."""
351
  return prompt
352
 
353
def parse_thinking_response(text: str) -> tuple:
    """
    Split generated text into its reasoning and its final answer.

    Every complete ``<thinking>...</thinking>`` block is removed from the
    answer and its contents are collected as reasoning. If an opening
    ``<thinking>`` tag is left without a closing tag (for example when the
    output is cut off mid-thought), everything after that tag is treated as
    reasoning as well, so the raw tag never leaks into the answer.

    Args:
        text: Raw decoded model output.

    Returns:
        (thinking_text, answer_text). ``thinking_text`` is ``None`` when the
        text contains no thinking block at all; otherwise it is the collected
        blocks joined with newlines and stripped. ``answer_text`` is the
        remaining text, stripped.
    """
    closed_block = r'<thinking>(.*?)</thinking>'
    thoughts = re.findall(closed_block, text, re.DOTALL)
    remainder = re.sub(closed_block, '', text, flags=re.DOTALL)

    # Anything still carrying an opening tag at this point is an unterminated
    # block: the closed-pair pattern above cannot match it.
    before, open_tag, dangling = remainder.partition('<thinking>')
    if open_tag:
        thoughts.append(dangling)
        remainder = before

    if not thoughts:
        return None, text.strip()
    return "\n".join(thoughts).strip(), remainder.strip()
368
 
369
def generate_id() -> str:
    """Return a fresh message identifier: ``msg_`` plus 24 random hex characters."""
    suffix = uuid.uuid4().hex[:24]
    return f"msg_{suffix}"
371
 
372
- # ============== API Endpoints ==============
373
 
374
  @app.get("/")
375
  async def root():
376
- logger.debug("Root endpoint accessed")
377
  return {
378
  "status": "healthy",
379
  "model": MODEL_ID,
380
- "api_version": "2023-06-01",
381
- "compatibility": "anthropic-messages-api",
382
- "features": ["extended-thinking", "streaming", "tool-use"],
 
 
 
 
 
 
383
  "log_file": LOG_FILE
384
  }
385
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
386
  @app.get("/v1/models")
387
- async def list_models():
388
- logger.debug("Models list requested")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
  return {
390
  "object": "list",
391
  "data": [{
@@ -398,65 +634,34 @@ async def list_models():
398
  }]
399
  }
400
 
401
@app.get("/logs")
async def get_logs(lines: int = 100):
    """
    Return the tail of the application log file.

    Args:
        lines: Maximum number of trailing log lines to return. Zero or a
            negative value yields an empty tail.

    Returns:
        A dict with the log file path, the total number of lines in the file,
        the number of lines returned, and the returned lines joined into one
        string. If the log file does not exist, a dict with an ``error``
        message and the log file path is returned instead.
    """
    try:
        with open(LOG_FILE, 'r') as f:
            all_lines = f.readlines()
    except FileNotFoundError:
        return {"error": "Log file not found", "log_file": LOG_FILE}

    # A bare all_lines[-lines:] returns the whole file for lines == 0 and
    # drops the *first* lines for a negative value, so guard the slice.
    recent_lines = all_lines[-lines:] if lines > 0 else []
    return {
        "log_file": LOG_FILE,
        "total_lines": len(all_lines),
        "returned_lines": len(recent_lines),
        "logs": "".join(recent_lines)
    }
415
-
416
- @app.post("/v1/messages", response_model=MessageResponse)
417
- async def create_message(
418
- request: MessageRequest,
419
  x_api_key: Optional[str] = Header(None, alias="x-api-key"),
420
  anthropic_version: Optional[str] = Header(None, alias="anthropic-version"),
421
  anthropic_beta: Optional[str] = Header(None, alias="anthropic-beta")
422
  ):
423
- """Create a message (Anthropic Messages API compatible with Extended Thinking)"""
424
- message_id = generate_id()
425
 
426
- # Check if thinking is enabled
427
  thinking_enabled = False
428
  budget_tokens = 1024
429
  if request.thinking:
430
  thinking_enabled = request.thinking.type == "enabled"
431
  budget_tokens = request.thinking.budget_tokens or 1024
432
 
433
- logger.info(f"[{message_id}] Creating message - model: {request.model}, max_tokens: {request.max_tokens}, stream: {request.stream}, thinking: {thinking_enabled}")
434
- logger.debug(f"[{message_id}] Request params - temp: {request.temperature}, top_p: {request.top_p}, top_k: {request.top_k}, thinking_budget: {budget_tokens}")
435
 
436
  try:
437
- # Format prompt with thinking if enabled
438
- prompt = format_messages_with_thinking(
439
- request.messages,
440
- request.system,
441
- thinking_enabled=thinking_enabled,
442
- budget_tokens=budget_tokens
443
- )
444
- logger.debug(f"[{message_id}] Prompt length: {len(prompt)} chars")
445
-
446
  inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
447
  input_token_count = inputs.input_ids.shape[1]
448
- logger.info(f"[{message_id}] Input tokens: {input_token_count}")
449
 
450
  if request.stream:
451
- logger.info(f"[{message_id}] Starting streaming response")
452
- return await stream_response(request, inputs, input_token_count, message_id, thinking_enabled, budget_tokens)
453
 
454
- # Calculate max tokens (include thinking budget if enabled)
455
- total_max_tokens = request.max_tokens
456
- if thinking_enabled:
457
- total_max_tokens += budget_tokens
458
 
459
- # Build generation kwargs
460
  gen_kwargs = {
461
  "max_new_tokens": total_max_tokens,
462
  "do_sample": request.temperature > 0 if request.temperature else False,
@@ -464,22 +669,13 @@ async def create_message(
464
  "eos_token_id": tokenizer.eos_token_id,
465
  }
466
 
467
- if request.temperature is not None and request.temperature > 0:
468
  gen_kwargs["temperature"] = request.temperature
469
- if request.top_p is not None:
470
  gen_kwargs["top_p"] = request.top_p
471
- if request.top_k is not None:
472
  gen_kwargs["top_k"] = request.top_k
473
 
474
- if request.stop_sequences:
475
- stop_token_ids = []
476
- for seq in request.stop_sequences:
477
- tokens = tokenizer.encode(seq, add_special_tokens=False)
478
- if tokens:
479
- stop_token_ids.extend(tokens)
480
- if stop_token_ids:
481
- gen_kwargs["eos_token_id"] = list(set([tokenizer.eos_token_id] + stop_token_ids))
482
-
483
  gen_start = time.time()
484
  with torch.no_grad():
485
  outputs = model.generate(**inputs, **gen_kwargs)
@@ -489,90 +685,55 @@ async def create_message(
489
  generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
490
  output_token_count = len(generated_tokens)
491
 
492
- # Parse thinking from response if enabled
493
  content_blocks = []
494
  if thinking_enabled:
495
  thinking_text, answer_text = parse_thinking_response(generated_text)
496
  if thinking_text:
497
- logger.info(f"[{message_id}] Thinking extracted: {len(thinking_text)} chars")
498
- content_blocks.append(ResponseThinkingBlock(type="thinking", thinking=thinking_text))
499
- content_blocks.append(ResponseTextBlock(type="text", text=answer_text))
500
  else:
501
- content_blocks.append(ResponseTextBlock(type="text", text=generated_text.strip()))
502
 
503
- # Determine stop reason
504
  stop_reason = "end_turn"
505
- stop_sequence = None
506
  if output_token_count >= total_max_tokens:
507
  stop_reason = "max_tokens"
508
- elif request.stop_sequences:
509
- for seq in request.stop_sequences:
510
- if seq in generated_text:
511
- stop_reason = "stop_sequence"
512
- stop_sequence = seq
513
- break
514
 
515
- tokens_per_sec = output_token_count / gen_time if gen_time > 0 else 0
516
- logger.info(f"[{message_id}] Generated {output_token_count} tokens in {gen_time:.2f}s ({tokens_per_sec:.1f} tok/s)")
517
 
518
- response = MessageResponse(
519
  id=message_id,
520
  content=content_blocks,
521
  model=request.model,
522
  stop_reason=stop_reason,
523
- stop_sequence=stop_sequence,
524
- usage=Usage(
525
- input_tokens=input_token_count,
526
- output_tokens=output_token_count
527
- )
528
  )
529
- return response
530
 
531
  except Exception as e:
532
- logger.error(f"[{message_id}] Error creating message: {e}", exc_info=True)
533
  raise HTTPException(status_code=500, detail=str(e))
534
 
535
- async def stream_response(
536
- request: MessageRequest,
537
- inputs,
538
- input_token_count: int,
539
- message_id: str,
540
- thinking_enabled: bool = False,
541
- budget_tokens: int = 1024
542
- ):
543
- """Stream response using SSE (Server-Sent Events) - Anthropic format with thinking support"""
544
 
545
  async def generate():
546
- # message_start event
547
  start_event = {
548
  "type": "message_start",
549
  "message": {
550
- "id": message_id,
551
- "type": "message",
552
- "role": "assistant",
553
- "content": [],
554
- "model": request.model,
555
- "stop_reason": None,
556
- "stop_sequence": None,
557
  "usage": {"input_tokens": input_token_count, "output_tokens": 0}
558
  }
559
  }
560
  yield f"event: message_start\ndata: {json.dumps(start_event)}\n\n"
 
561
 
562
- # If thinking is enabled, we'll track thinking vs text blocks
563
  block_index = 0
564
  in_thinking = False
565
  thinking_started = False
566
  text_block_started = False
567
 
568
- # ping event
569
- yield f"event: ping\ndata: {json.dumps({'type': 'ping'})}\n\n"
570
-
571
  streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
572
-
573
- total_max_tokens = request.max_tokens
574
- if thinking_enabled:
575
- total_max_tokens += budget_tokens
576
 
577
  gen_kwargs = {
578
  **inputs,
@@ -583,14 +744,13 @@ async def stream_response(
583
  "streamer": streamer,
584
  }
585
 
586
- if request.temperature is not None and request.temperature > 0:
587
  gen_kwargs["temperature"] = request.temperature
588
- if request.top_p is not None:
589
  gen_kwargs["top_p"] = request.top_p
590
- if request.top_k is not None:
591
  gen_kwargs["top_k"] = request.top_k
592
 
593
- gen_start = time.time()
594
  thread = Thread(target=model.generate, kwargs=gen_kwargs)
595
  thread.start()
596
 
@@ -603,125 +763,46 @@ async def stream_response(
603
  accumulated_text += text
604
 
605
  if thinking_enabled:
606
- # Check for thinking tags
607
  if "<thinking>" in accumulated_text and not thinking_started:
608
- # Start thinking block
609
  thinking_started = True
610
  in_thinking = True
611
- block_start = {
612
- "type": "content_block_start",
613
- "index": block_index,
614
- "content_block": {"type": "thinking", "thinking": ""}
615
- }
616
- yield f"event: content_block_start\ndata: {json.dumps(block_start)}\n\n"
617
 
618
  if in_thinking:
619
- # Stream thinking content
620
  clean_text = text.replace("<thinking>", "").replace("</thinking>", "")
621
  if clean_text:
622
- delta_event = {
623
- "type": "content_block_delta",
624
- "index": block_index,
625
- "delta": {"type": "thinking_delta", "thinking": clean_text}
626
- }
627
- yield f"event: content_block_delta\ndata: {json.dumps(delta_event)}\n\n"
628
-
629
  if "</thinking>" in accumulated_text:
630
- # End thinking block
631
  in_thinking = False
632
  yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': block_index})}\n\n"
633
  block_index += 1
634
-
635
- # Start text block
636
  text_block_started = True
637
- block_start = {
638
- "type": "content_block_start",
639
- "index": block_index,
640
- "content_block": {"type": "text", "text": ""}
641
- }
642
- yield f"event: content_block_start\ndata: {json.dumps(block_start)}\n\n"
643
-
644
  elif text_block_started:
645
- # Stream text content
646
- delta_event = {
647
- "type": "content_block_delta",
648
- "index": block_index,
649
- "delta": {"type": "text_delta", "text": text}
650
- }
651
- yield f"event: content_block_delta\ndata: {json.dumps(delta_event)}\n\n"
652
-
653
  else:
654
- # No thinking - just stream text
655
  if not text_block_started:
656
  text_block_started = True
657
- block_start = {
658
- "type": "content_block_start",
659
- "index": 0,
660
- "content_block": {"type": "text", "text": ""}
661
- }
662
- yield f"event: content_block_start\ndata: {json.dumps(block_start)}\n\n"
663
-
664
- delta_event = {
665
- "type": "content_block_delta",
666
- "index": 0,
667
- "delta": {"type": "text_delta", "text": text}
668
- }
669
- yield f"event: content_block_delta\ndata: {json.dumps(delta_event)}\n\n"
670
 
671
  thread.join()
672
- gen_time = time.time() - gen_start
673
- tokens_per_sec = output_tokens / gen_time if gen_time > 0 else 0
674
- logger.info(f"[{message_id}] Stream completed: {output_tokens} tokens in {gen_time:.2f}s ({tokens_per_sec:.1f} tok/s)")
675
 
676
- # content_block_stop for final block
677
  yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': block_index})}\n\n"
678
 
679
- # message_delta event
680
  stop_reason = "max_tokens" if output_tokens >= total_max_tokens else "end_turn"
681
- delta = {
682
- "type": "message_delta",
683
- "delta": {"stop_reason": stop_reason, "stop_sequence": None},
684
- "usage": {"output_tokens": output_tokens}
685
- }
686
- yield f"event: message_delta\ndata: {json.dumps(delta)}\n\n"
687
-
688
- # message_stop event
689
  yield f"event: message_stop\ndata: {json.dumps({'type': 'message_stop'})}\n\n"
690
 
691
- return StreamingResponse(
692
- generate(),
693
- media_type="text/event-stream",
694
- headers={
695
- "Cache-Control": "no-cache",
696
- "Connection": "keep-alive",
697
- "X-Accel-Buffering": "no"
698
- }
699
- )
700
 
701
- @app.post("/v1/messages/count_tokens", response_model=TokenCountResponse)
702
- async def count_tokens(request: TokenCountRequest):
703
- """Count tokens for a message request (Anthropic compatible)"""
704
  thinking_enabled = request.thinking and request.thinking.type == "enabled"
705
  budget_tokens = request.thinking.budget_tokens if request.thinking else 1024
706
-
707
- prompt = format_messages_with_thinking(
708
- request.messages,
709
- request.system,
710
- thinking_enabled=thinking_enabled,
711
- budget_tokens=budget_tokens
712
- )
713
  tokens = tokenizer.encode(prompt)
714
- logger.debug(f"Token count request: {len(tokens)} tokens (thinking: {thinking_enabled})")
715
- return TokenCountResponse(input_tokens=len(tokens))
716
-
717
- @app.get("/health")
718
- async def health():
719
- return {
720
- "status": "ok",
721
- "model_loaded": model is not None,
722
- "log_file": LOG_FILE,
723
- "features": ["extended-thinking", "streaming"]
724
- }
725
 
726
  if __name__ == "__main__":
727
  import uvicorn
 
1
  """
2
+ Dual-Compatible API Endpoint (OpenAI + Anthropic)
3
  Lightweight CPU-based implementation for Hugging Face Spaces
4
+ - OpenAI format: /v1/chat/completions
5
+ - Anthropic format: /anthropic/v1/messages
6
  """
7
 
8
  import os
 
45
  console_handler.setLevel(logging.INFO)
46
 
47
  logging.basicConfig(level=logging.DEBUG, handlers=[file_handler, console_handler])
48
+ logger = logging.getLogger("dual-api")
49
 
50
  for uvicorn_logger in ["uvicorn", "uvicorn.error", "uvicorn.access"]:
51
  uv_log = logging.getLogger(uvicorn_logger)
52
  uv_log.handlers = [file_handler, console_handler]
53
 
54
  logger.info("=" * 60)
55
+ logger.info(f"Dual API (OpenAI + Anthropic) Startup at {datetime.now().isoformat()}")
56
  logger.info(f"Log file: {LOG_FILE}")
57
  logger.info("=" * 60)
58
 
 
84
  del model, tokenizer
85
 
86
  app = FastAPI(
87
+ title="Dual-Compatible API (OpenAI + Anthropic)",
88
+ description="""
89
+ Lightweight CPU-based API with dual compatibility:
90
+ - OpenAI format: /v1/chat/completions
91
+ - Anthropic format: /anthropic/v1/messages
92
+ """,
93
  version="1.0.0",
94
  lifespan=lifespan
95
  )
 
117
  logger.error(f"[{request_id}] {request.method} {request.url.path} - Error: {e} ({duration:.2f}ms)")
118
  raise
119
 
120
+ # ============================================================
121
+ # ANTHROPIC-COMPATIBLE MODELS (under /anthropic)
122
+ # ============================================================
123
 
124
+ class AnthropicTextBlock(BaseModel):
 
125
  type: Literal["text"] = "text"
126
  text: str
127
 
128
+ class AnthropicImageSource(BaseModel):
129
  type: Literal["base64", "url"] = "base64"
130
  media_type: Optional[str] = None
131
  data: Optional[str] = None
132
  url: Optional[str] = None
133
 
134
+ class AnthropicImageBlock(BaseModel):
135
  type: Literal["image"] = "image"
136
+ source: AnthropicImageSource
137
 
138
+ class AnthropicToolUseBlock(BaseModel):
139
  type: Literal["tool_use"] = "tool_use"
140
  id: str
141
  name: str
142
  input: Dict[str, Any]
143
 
144
+ class AnthropicToolResultBlock(BaseModel):
145
  type: Literal["tool_result"] = "tool_result"
146
  tool_use_id: str
147
+ content: Optional[Union[str, List[AnthropicTextBlock]]] = None
148
  is_error: Optional[bool] = False
149
 
150
+ AnthropicContentBlock = Union[AnthropicTextBlock, AnthropicImageBlock, AnthropicToolUseBlock, AnthropicToolResultBlock]
151
 
152
+ class AnthropicMessage(BaseModel):
 
153
  role: Literal["user", "assistant"]
154
+ content: Union[str, List[AnthropicContentBlock]]
155
 
156
+ class AnthropicToolInputSchema(BaseModel):
 
157
  type: Literal["object"] = "object"
158
  properties: Optional[Dict[str, Any]] = None
159
  required: Optional[List[str]] = None
160
 
161
+ class AnthropicTool(BaseModel):
162
  name: str
163
  description: Optional[str] = None
164
+ input_schema: AnthropicToolInputSchema
165
 
166
+ class AnthropicToolChoiceAuto(BaseModel):
 
167
  type: Literal["auto"] = "auto"
168
  disable_parallel_tool_use: Optional[bool] = None
169
 
170
+ class AnthropicToolChoiceAny(BaseModel):
171
  type: Literal["any"] = "any"
172
  disable_parallel_tool_use: Optional[bool] = None
173
 
174
+ class AnthropicToolChoiceTool(BaseModel):
175
  type: Literal["tool"] = "tool"
176
  name: str
177
  disable_parallel_tool_use: Optional[bool] = None
178
 
179
+ AnthropicToolChoice = Union[AnthropicToolChoiceAuto, AnthropicToolChoiceAny, AnthropicToolChoiceTool]
180
 
181
+ class AnthropicMetadata(BaseModel):
 
182
  user_id: Optional[str] = None
183
 
184
+ class AnthropicSystemContent(BaseModel):
 
185
  type: Literal["text"] = "text"
186
  text: str
187
  cache_control: Optional[Dict[str, str]] = None
188
 
189
+ class AnthropicThinkingConfig(BaseModel):
 
 
 
 
 
190
  type: Literal["enabled", "disabled"] = "enabled"
 
191
  budget_tokens: Optional[int] = Field(default=1024, ge=1, le=128000)
192
 
193
+ class AnthropicMessageRequest(BaseModel):
 
 
194
  model: str
195
  max_tokens: int
196
+ messages: List[AnthropicMessage]
197
+ metadata: Optional[AnthropicMetadata] = None
 
 
198
  stop_sequences: Optional[List[str]] = None
199
  stream: Optional[bool] = False
200
+ system: Optional[Union[str, List[AnthropicSystemContent]]] = None
201
  temperature: Optional[float] = Field(default=1.0, ge=0.0, le=1.0)
202
+ tool_choice: Optional[AnthropicToolChoice] = None
203
+ tools: Optional[List[AnthropicTool]] = None
204
  top_k: Optional[int] = Field(default=None, ge=0)
205
  top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0)
206
+ thinking: Optional[AnthropicThinkingConfig] = None
207
 
208
+ class AnthropicUsage(BaseModel):
 
 
 
 
209
  input_tokens: int
210
  output_tokens: int
211
  cache_creation_input_tokens: Optional[int] = None
212
  cache_read_input_tokens: Optional[int] = None
213
 
214
+ class AnthropicResponseTextBlock(BaseModel):
 
215
  type: Literal["text"] = "text"
216
  text: str
217
 
218
+ class AnthropicResponseThinkingBlock(BaseModel):
 
219
  type: Literal["thinking"] = "thinking"
220
  thinking: str
221
 
222
+ class AnthropicResponseToolUseBlock(BaseModel):
223
  type: Literal["tool_use"] = "tool_use"
224
  id: str
225
  name: str
226
  input: Dict[str, Any]
227
 
228
+ AnthropicResponseContentBlock = Union[AnthropicResponseTextBlock, AnthropicResponseThinkingBlock, AnthropicResponseToolUseBlock]
229
 
230
+ class AnthropicMessageResponse(BaseModel):
 
231
  id: str
232
  type: Literal["message"] = "message"
233
  role: Literal["assistant"] = "assistant"
234
+ content: List[AnthropicResponseContentBlock]
235
  model: str
236
  stop_reason: Optional[Literal["end_turn", "max_tokens", "stop_sequence", "tool_use"]] = None
237
  stop_sequence: Optional[str] = None
238
+ usage: AnthropicUsage
239
+
240
+ class AnthropicTokenCountRequest(BaseModel):
241
+ model: str
242
+ messages: List[AnthropicMessage]
243
+ system: Optional[Union[str, List[AnthropicSystemContent]]] = None
244
+ tools: Optional[List[AnthropicTool]] = None
245
+ thinking: Optional[AnthropicThinkingConfig] = None
246
+
247
+ class AnthropicTokenCountResponse(BaseModel):
248
+ input_tokens: int
249
 
250
+ # ============================================================
251
+ # OPENAI-COMPATIBLE MODELS (under /v1)
252
+ # ============================================================
253
+
254
+ class OpenAIMessage(BaseModel):
255
+ role: Literal["system", "user", "assistant", "tool"]
256
+ content: Optional[Union[str, List[Dict[str, Any]]]] = None
257
+ name: Optional[str] = None
258
+ tool_calls: Optional[List[Dict[str, Any]]] = None
259
+ tool_call_id: Optional[str] = None
260
+
261
+ class OpenAITool(BaseModel):
262
+ type: Literal["function"] = "function"
263
+ function: Dict[str, Any]
264
+
265
+ class OpenAIToolChoice(BaseModel):
266
  type: str
267
+ function: Optional[Dict[str, str]] = None
268
 
269
+ class OpenAIChatRequest(BaseModel):
270
+ model: str
271
+ messages: List[OpenAIMessage]
272
+ max_tokens: Optional[int] = 1024
273
+ temperature: Optional[float] = Field(default=1.0, ge=0.0, le=2.0)
274
+ top_p: Optional[float] = Field(default=1.0, ge=0.0, le=1.0)
275
+ n: Optional[int] = 1
276
+ stream: Optional[bool] = False
277
+ stop: Optional[Union[str, List[str]]] = None
278
+ presence_penalty: Optional[float] = 0.0
279
+ frequency_penalty: Optional[float] = 0.0
280
+ logit_bias: Optional[Dict[str, float]] = None
281
+ user: Optional[str] = None
282
+ tools: Optional[List[OpenAITool]] = None
283
+ tool_choice: Optional[Union[str, OpenAIToolChoice]] = None
284
+ seed: Optional[int] = None
285
+
286
+ class OpenAIUsage(BaseModel):
287
+ prompt_tokens: int
288
+ completion_tokens: int
289
+ total_tokens: int
290
+
291
+ class OpenAIChoice(BaseModel):
292
+ index: int
293
+ message: Dict[str, Any]
294
+ finish_reason: Optional[str] = None
295
+
296
+ class OpenAIChatResponse(BaseModel):
297
+ id: str
298
+ object: Literal["chat.completion"] = "chat.completion"
299
+ created: int
300
+ model: str
301
+ choices: List[OpenAIChoice]
302
+ usage: OpenAIUsage
303
+ system_fingerprint: Optional[str] = None
304
+
305
+ class OpenAIStreamChoice(BaseModel):
306
+ index: int
307
+ delta: Dict[str, Any]
308
+ finish_reason: Optional[str] = None
309
 
310
+ class OpenAIStreamResponse(BaseModel):
311
+ id: str
312
+ object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
313
+ created: int
314
  model: str
315
+ choices: List[OpenAIStreamChoice]
 
 
 
316
 
317
+ class OpenAIModel(BaseModel):
318
+ id: str
319
+ object: Literal["model"] = "model"
320
+ created: int
321
+ owned_by: str
322
+
323
+ class OpenAIModelList(BaseModel):
324
+ object: Literal["list"] = "list"
325
+ data: List[OpenAIModel]
326
 
327
  # ============== Helper Functions ==============
328
 
329
+ def extract_anthropic_text(content: Union[str, List[AnthropicContentBlock]]) -> str:
 
330
  if isinstance(content, str):
331
  return content
332
  texts = []
 
338
  texts.append(block.text)
339
  return " ".join(texts)
340
 
341
+ def extract_anthropic_system(system: Optional[Union[str, List[AnthropicSystemContent]]]) -> Optional[str]:
 
342
  if system is None:
343
  return None
344
  if isinstance(system, str):
 
351
  texts.append(block.text)
352
  return " ".join(texts)
353
 
354
def extract_openai_content(content: Optional[Union[str, List[Dict[str, Any]]]]) -> str:
    """Flatten an OpenAI message ``content`` value into plain text.

    Args:
        content: ``None``, a plain string, or a list of content-part dicts.

    Returns:
        ``""`` for ``None``, the string itself for a string, otherwise the
        ``text`` values of all parts whose ``type`` is ``"text"`` joined with
        single spaces. Parts of any other type are ignored.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content

    texts = []
    for item in content:
        if not isinstance(item, dict) or item.get("type") != "text":
            continue
        text = item.get("text", "")
        # A part such as {"type": "text", "text": null} would make str.join
        # raise TypeError; skip anything that is not actually a string.
        if isinstance(text, str):
            texts.append(text)
    return " ".join(texts)
364
+
365
+ def format_anthropic_messages(
366
+ messages: List[AnthropicMessage],
367
+ system: Optional[Union[str, List[AnthropicSystemContent]]] = None,
368
  thinking_enabled: bool = False,
369
  budget_tokens: int = 1024
370
  ) -> str:
 
371
  formatted_messages = []
372
+ system_text = extract_anthropic_system(system)
373
 
 
 
 
374
  if thinking_enabled:
375
  thinking_instruction = f"""You are a helpful AI assistant with extended thinking capabilities.
376
 
 
383
  Budget for thinking: up to {budget_tokens} tokens for reasoning.
384
 
385
  Think deeply and thoroughly before responding."""
 
386
  if system_text:
387
  system_text = f"{thinking_instruction}\n\n{system_text}"
388
  else:
 
392
  formatted_messages.append({"role": "system", "content": system_text})
393
 
394
  for msg in messages:
395
+ content = extract_anthropic_text(msg.content)
396
  formatted_messages.append({"role": msg.role, "content": content})
397
 
398
  if tokenizer.chat_template:
399
+ return tokenizer.apply_chat_template(formatted_messages, tokenize=False, add_generation_prompt=True)
400
+
401
+ prompt = ""
402
+ for msg in formatted_messages:
403
+ role = msg["role"].capitalize()
404
+ prompt += f"{role}: {msg['content']}\n"
405
+ prompt += "Assistant: "
406
+ return prompt
407
+
408
+ def format_openai_messages(messages: List[OpenAIMessage]) -> str:
409
+ formatted_messages = []
410
+ for msg in messages:
411
+ content = extract_openai_content(msg.content)
412
+ formatted_messages.append({"role": msg.role, "content": content})
413
+
414
+ if tokenizer.chat_template:
415
+ return tokenizer.apply_chat_template(formatted_messages, tokenize=False, add_generation_prompt=True)
416
 
417
  prompt = ""
418
  for msg in formatted_messages:
 
422
  return prompt
423
 
424
def parse_thinking_response(text: str) -> tuple:
    """Split generated text into its ``<thinking>`` part and the final answer.

    Args:
        text: Raw decoded model output.

    Returns:
        A ``(thinking_text, answer_text)`` pair. ``thinking_text`` is the
        newline-joined content of every ``<thinking>`` block, or ``None`` when
        the text contains no such block. ``answer_text`` is the remaining text
        with the blocks removed. Both are stripped of surrounding whitespace.
    """
    thinking_pattern = r'<thinking>(.*?)</thinking>'
    thinking_parts = re.findall(thinking_pattern, text, re.DOTALL)
    answer_text = re.sub(thinking_pattern, '', text, flags=re.DOTALL)

    # Generation can be cut off before the closing tag is produced. Treat
    # everything after a dangling opening tag as thinking so the raw tag and
    # the unfinished reasoning do not leak into the answer.
    before, open_tag, dangling = answer_text.partition("<thinking>")
    if open_tag:
        thinking_parts.append(dangling)
        answer_text = before

    if thinking_parts:
        thinking_text = "\n".join(thinking_parts).strip()
        return thinking_text, answer_text.strip()
    return None, text.strip()
 
432
 
433
def generate_id(prefix: str = "msg") -> str:
    """Return ``prefix``, an underscore, and 24 random hex characters."""
    random_part = uuid.uuid4().hex[:24]
    return "_".join((prefix, random_part))
435
 
436
+ # ============== ROOT ENDPOINTS ==============
437
 
438
@app.get("/")
async def root():
    # Service landing payload: status, the configured model id, where the two
    # API flavours live, the SDK base URLs and the log file path.
    endpoints = {
        "openai": "/v1/chat/completions",
        "anthropic": "/anthropic/v1/messages"
    }
    base_urls = {
        "openai_sdk": "https://likhonsheikh-anthropic-compatible-api.hf.space/v1",
        "anthropic_sdk": "https://likhonsheikh-anthropic-compatible-api.hf.space/anthropic"
    }
    features = ["extended-thinking", "streaming", "dual-compatibility"]

    payload = {"status": "healthy", "model": MODEL_ID}
    payload["endpoints"] = endpoints
    payload["base_urls"] = base_urls
    payload["features"] = features
    payload["log_file"] = LOG_FILE
    return payload
454
 
455
@app.get("/logs")
async def get_logs(lines: int = 100):
    """Return the last ``lines`` lines of the application log file.

    Args:
        lines: Maximum number of trailing log lines to return. Zero or a
            negative value returns no lines.

    Returns:
        A dict with the log path, the total line count, the number of lines
        returned and the joined text, or an error dict when the log file does
        not exist.
    """
    # NOTE(review): this endpoint exposes server logs without any
    # authentication - confirm that is intended for this deployment.
    from collections import deque

    # Clamp instead of slicing with the raw value: all_lines[-0:] is the whole
    # file and a negative count would drop leading lines rather than limit.
    limit = max(lines, 0)
    recent_lines = deque(maxlen=limit)
    total_lines = 0
    try:
        # errors="replace" keeps one undecodable byte from failing the request.
        with open(LOG_FILE, 'r', errors="replace") as f:
            # Stream the file so only the requested tail is held in memory.
            for line in f:
                total_lines += 1
                recent_lines.append(line)
    except FileNotFoundError:
        return {"error": "Log file not found", "log_file": LOG_FILE}

    return {
        "log_file": LOG_FILE,
        "total_lines": total_lines,
        "returned_lines": len(recent_lines),
        "logs": "".join(recent_lines),
    }
464
+
465
@app.get("/health")
async def health():
    # Liveness probe: reports whether the global model object has been set,
    # plus the log file path and the supported API flavours.
    supported = ["openai-compatible", "anthropic-compatible", "extended-thinking"]
    is_loaded = model is not None
    return {
        "status": "ok",
        "model_loaded": is_loaded,
        "log_file": LOG_FILE,
        "features": supported,
    }
468
+
469
+ # ============================================================
470
+ # OPENAI-COMPATIBLE ENDPOINTS (/v1)
471
+ # ============================================================
472
+
473
@app.get("/v1/models")
async def openai_list_models():
    """List models (OpenAI format)"""
    # The service advertises a single model entry, stamped with the current time.
    now = int(time.time())
    entry = OpenAIModel(id="smollm2-135m", created=now, owned_by="huggingface")
    return OpenAIModelList(data=[entry])
479
+
480
def _truncate_at_stop(text: str, stop_seqs: List[str]) -> tuple:
    """Cut ``text`` at the earliest stop sequence; return ``(text, was_cut)``."""
    hits = [pos for pos in (text.find(seq) for seq in stop_seqs if seq) if pos != -1]
    if not hits:
        return text, False
    return text[:min(hits)], True


@app.post("/v1/chat/completions")
async def openai_chat_completions(
    request: OpenAIChatRequest,
    authorization: Optional[str] = Header(None)
):
    """Chat completions (OpenAI format)"""
    # Flow: build the prompt, tokenize it, then either delegate to
    # openai_stream_response (stream=True) or generate synchronously and
    # return an OpenAIChatResponse. Any failure becomes an HTTP 500.
    chat_id = generate_id("chatcmpl")
    logger.info(f"[{chat_id}] OpenAI chat - model: {request.model}, max_tokens: {request.max_tokens}, stream: {request.stream}")

    max_new_tokens = request.max_tokens or 1024
    # Sampling is only enabled for a strictly positive temperature.
    sampling = bool(request.temperature and request.temperature > 0)

    try:
        prompt = format_openai_messages(request.messages)
        inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
        input_token_count = inputs.input_ids.shape[1]

        if request.stream:
            return await openai_stream_response(request, inputs, input_token_count, chat_id)

        gen_kwargs = {
            "max_new_tokens": max_new_tokens,
            "do_sample": sampling,
            "pad_token_id": tokenizer.eos_token_id,
            "eos_token_id": tokenizer.eos_token_id,
        }

        if sampling:
            # Temperature is capped at 1.0 for generation, as before.
            gen_kwargs["temperature"] = min(request.temperature, 1.0)
        if request.top_p:
            gen_kwargs["top_p"] = request.top_p

        stop_seqs = []
        if request.stop:
            stop_seqs = [request.stop] if isinstance(request.stop, str) else list(request.stop)

        # Only a stop sequence that encodes to exactly ONE token may be used as
        # an extra EOS id. Registering every sub-token of a multi-token
        # sequence would end generation as soon as any one of them appeared.
        single_token_stops = set()
        for seq in stop_seqs:
            tokens = tokenizer.encode(seq, add_special_tokens=False)
            if len(tokens) == 1:
                single_token_stops.add(tokens[0])
        if single_token_stops:
            gen_kwargs["eos_token_id"] = list({tokenizer.eos_token_id} | single_token_stops)

        # NOTE(review): model.generate runs synchronously inside this async
        # handler, so other requests wait until generation finishes.
        gen_start = time.time()
        with torch.no_grad():
            outputs = model.generate(**inputs, **gen_kwargs)
        gen_time = time.time() - gen_start

        generated_tokens = outputs[0][input_token_count:]
        generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
        output_token_count = len(generated_tokens)

        # Apply all stop sequences (including multi-token ones) on the decoded
        # text; the stop string itself is not part of the returned content.
        generated_text, hit_stop = _truncate_at_stop(generated_text, stop_seqs)

        finish_reason = "stop"
        if not hit_stop and output_token_count >= max_new_tokens:
            finish_reason = "length"

        logger.info(f"[{chat_id}] Generated {output_token_count} tokens in {gen_time:.2f}s")

        return OpenAIChatResponse(
            id=chat_id,
            created=int(time.time()),
            model=request.model,
            choices=[OpenAIChoice(
                index=0,
                message={"role": "assistant", "content": generated_text.strip()},
                finish_reason=finish_reason
            )],
            usage=OpenAIUsage(
                prompt_tokens=input_token_count,
                completion_tokens=output_token_count,
                total_tokens=input_token_count + output_token_count
            )
        )

    except HTTPException:
        # Preserve deliberate HTTP errors instead of rewrapping them as 500.
        raise
    except Exception as e:
        logger.error(f"[{chat_id}] Error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
553
+
554
async def openai_stream_response(request: OpenAIChatRequest, inputs, input_token_count: int, chat_id: str):
    """Stream response in OpenAI format

    Args:
        request: The parsed chat request (model, max_tokens, temperature, top_p).
        inputs: Tokenized prompt, unpacked into the ``model.generate`` kwargs.
        input_token_count: Prompt length in tokens (not used by the stream itself).
        chat_id: Identifier placed in every emitted chunk.

    Returns:
        A ``StreamingResponse`` emitting ``chat.completion.chunk`` server-sent
        events followed by ``data: [DONE]``.
    """
    max_new_tokens = request.max_tokens or 1024
    sampling = bool(request.temperature and request.temperature > 0)

    # A plain (non-async) generator on purpose: iterating the streamer blocks
    # on a queue, and a sync iterator handed to StreamingResponse is driven
    # from a worker thread instead of stalling the event loop.
    def generate():
        created = int(time.time())

        def sse_chunk(delta, finish_reason=None):
            payload = {
                "id": chat_id,
                "object": "chat.completion.chunk",
                "created": created,
                "model": request.model,
                "choices": [{"index": 0, "delta": delta, "finish_reason": finish_reason}]
            }
            return f"data: {json.dumps(payload)}\n\n"

        # Initial chunk with role
        yield sse_chunk({"role": "assistant", "content": ""})

        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

        gen_kwargs = {
            **inputs,
            "max_new_tokens": max_new_tokens,
            "do_sample": sampling,
            "pad_token_id": tokenizer.eos_token_id,
            "eos_token_id": tokenizer.eos_token_id,
            "streamer": streamer,
        }

        if sampling:
            gen_kwargs["temperature"] = min(request.temperature, 1.0)
        if request.top_p:
            gen_kwargs["top_p"] = request.top_p
        # NOTE(review): request.stop is not applied on the streaming path.

        failures = []

        def run_generation():
            try:
                with torch.no_grad():
                    model.generate(**gen_kwargs)
            except Exception as exc:
                logger.error(f"[{chat_id}] Streaming generation failed: {exc}", exc_info=True)
                failures.append(exc)
                # Without an explicit end signal the consumer loop below would
                # wait on the streamer queue forever.
                streamer.end()

        thread = Thread(target=run_generation)
        thread.start()

        output_tokens = 0
        for text in streamer:
            if text:
                # Approximate count: re-encodes each emitted text piece.
                output_tokens += len(tokenizer.encode(text, add_special_tokens=False))
                yield sse_chunk({"content": text})

        thread.join()

        if failures:
            # Tell the client the stream ended because of a server-side error.
            error_payload = {"error": {"message": str(failures[0]), "type": "server_error"}}
            yield f"data: {json.dumps(error_payload)}\n\n"
            yield "data: [DONE]\n\n"
            return

        # Final chunk
        finish_reason = "length" if output_tokens >= max_new_tokens else "stop"
        yield sse_chunk({}, finish_reason)
        yield "data: [DONE]\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream", headers={"Cache-Control": "no-cache", "Connection": "keep-alive"})
617
+
618
+ # ============================================================
619
+ # ANTHROPIC-COMPATIBLE ENDPOINTS (/anthropic)
620
+ # ============================================================
621
+
622
+ @app.get("/anthropic/v1/models")
623
+ async def anthropic_list_models():
624
+ """List models (Anthropic format)"""
625
  return {
626
  "object": "list",
627
  "data": [{
 
634
  }]
635
  }
636
 
637
+ @app.post("/anthropic/v1/messages", response_model=AnthropicMessageResponse)
638
+ async def anthropic_create_message(
639
+ request: AnthropicMessageRequest,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
640
  x_api_key: Optional[str] = Header(None, alias="x-api-key"),
641
  anthropic_version: Optional[str] = Header(None, alias="anthropic-version"),
642
  anthropic_beta: Optional[str] = Header(None, alias="anthropic-beta")
643
  ):
644
+ """Create message (Anthropic format with Extended Thinking)"""
645
+ message_id = generate_id("msg")
646
 
 
647
  thinking_enabled = False
648
  budget_tokens = 1024
649
  if request.thinking:
650
  thinking_enabled = request.thinking.type == "enabled"
651
  budget_tokens = request.thinking.budget_tokens or 1024
652
 
653
+ logger.info(f"[{message_id}] Anthropic msg - model: {request.model}, max_tokens: {request.max_tokens}, thinking: {thinking_enabled}")
 
654
 
655
  try:
656
+ prompt = format_anthropic_messages(request.messages, request.system, thinking_enabled, budget_tokens)
 
 
 
 
 
 
 
 
657
  inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
658
  input_token_count = inputs.input_ids.shape[1]
 
659
 
660
  if request.stream:
661
+ return await anthropic_stream_response(request, inputs, input_token_count, message_id, thinking_enabled, budget_tokens)
 
662
 
663
+ total_max_tokens = request.max_tokens + (budget_tokens if thinking_enabled else 0)
 
 
 
664
 
 
665
  gen_kwargs = {
666
  "max_new_tokens": total_max_tokens,
667
  "do_sample": request.temperature > 0 if request.temperature else False,
 
669
  "eos_token_id": tokenizer.eos_token_id,
670
  }
671
 
672
+ if request.temperature and request.temperature > 0:
673
  gen_kwargs["temperature"] = request.temperature
674
+ if request.top_p:
675
  gen_kwargs["top_p"] = request.top_p
676
+ if request.top_k:
677
  gen_kwargs["top_k"] = request.top_k
678
 
 
 
 
 
 
 
 
 
 
679
  gen_start = time.time()
680
  with torch.no_grad():
681
  outputs = model.generate(**inputs, **gen_kwargs)
 
685
  generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
686
  output_token_count = len(generated_tokens)
687
 
 
688
  content_blocks = []
689
  if thinking_enabled:
690
  thinking_text, answer_text = parse_thinking_response(generated_text)
691
  if thinking_text:
692
+ content_blocks.append(AnthropicResponseThinkingBlock(type="thinking", thinking=thinking_text))
693
+ content_blocks.append(AnthropicResponseTextBlock(type="text", text=answer_text))
 
694
  else:
695
+ content_blocks.append(AnthropicResponseTextBlock(type="text", text=generated_text.strip()))
696
 
 
697
  stop_reason = "end_turn"
 
698
  if output_token_count >= total_max_tokens:
699
  stop_reason = "max_tokens"
 
 
 
 
 
 
700
 
701
+ logger.info(f"[{message_id}] Generated {output_token_count} tokens in {gen_time:.2f}s")
 
702
 
703
+ return AnthropicMessageResponse(
704
  id=message_id,
705
  content=content_blocks,
706
  model=request.model,
707
  stop_reason=stop_reason,
708
+ usage=AnthropicUsage(input_tokens=input_token_count, output_tokens=output_token_count)
 
 
 
 
709
  )
 
710
 
711
  except Exception as e:
712
+ logger.error(f"[{message_id}] Error: {e}", exc_info=True)
713
  raise HTTPException(status_code=500, detail=str(e))
714
 
715
+ async def anthropic_stream_response(request: AnthropicMessageRequest, inputs, input_token_count: int, message_id: str, thinking_enabled: bool, budget_tokens: int):
716
+ """Stream response in Anthropic format"""
 
 
 
 
 
 
 
717
 
718
  async def generate():
 
719
  start_event = {
720
  "type": "message_start",
721
  "message": {
722
+ "id": message_id, "type": "message", "role": "assistant", "content": [],
723
+ "model": request.model, "stop_reason": None, "stop_sequence": None,
 
 
 
 
 
724
  "usage": {"input_tokens": input_token_count, "output_tokens": 0}
725
  }
726
  }
727
  yield f"event: message_start\ndata: {json.dumps(start_event)}\n\n"
728
+ yield f"event: ping\ndata: {json.dumps({'type': 'ping'})}\n\n"
729
 
 
730
  block_index = 0
731
  in_thinking = False
732
  thinking_started = False
733
  text_block_started = False
734
 
 
 
 
735
  streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
736
+ total_max_tokens = request.max_tokens + (budget_tokens if thinking_enabled else 0)
 
 
 
737
 
738
  gen_kwargs = {
739
  **inputs,
 
744
  "streamer": streamer,
745
  }
746
 
747
+ if request.temperature and request.temperature > 0:
748
  gen_kwargs["temperature"] = request.temperature
749
+ if request.top_p:
750
  gen_kwargs["top_p"] = request.top_p
751
+ if request.top_k:
752
  gen_kwargs["top_k"] = request.top_k
753
 
 
754
  thread = Thread(target=model.generate, kwargs=gen_kwargs)
755
  thread.start()
756
 
 
763
  accumulated_text += text
764
 
765
  if thinking_enabled:
 
766
  if "<thinking>" in accumulated_text and not thinking_started:
 
767
  thinking_started = True
768
  in_thinking = True
769
+ yield f"event: content_block_start\ndata: {json.dumps({'type': 'content_block_start', 'index': block_index, 'content_block': {'type': 'thinking', 'thinking': ''}})}\n\n"
 
 
 
 
 
770
 
771
  if in_thinking:
 
772
  clean_text = text.replace("<thinking>", "").replace("</thinking>", "")
773
  if clean_text:
774
+ yield f"event: content_block_delta\ndata: {json.dumps({'type': 'content_block_delta', 'index': block_index, 'delta': {'type': 'thinking_delta', 'thinking': clean_text}})}\n\n"
 
 
 
 
 
 
775
  if "</thinking>" in accumulated_text:
 
776
  in_thinking = False
777
  yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': block_index})}\n\n"
778
  block_index += 1
 
 
779
  text_block_started = True
780
+ yield f"event: content_block_start\ndata: {json.dumps({'type': 'content_block_start', 'index': block_index, 'content_block': {'type': 'text', 'text': ''}})}\n\n"
 
 
 
 
 
 
781
  elif text_block_started:
782
+ yield f"event: content_block_delta\ndata: {json.dumps({'type': 'content_block_delta', 'index': block_index, 'delta': {'type': 'text_delta', 'text': text}})}\n\n"
 
 
 
 
 
 
 
783
  else:
 
784
  if not text_block_started:
785
  text_block_started = True
786
+ yield f"event: content_block_start\ndata: {json.dumps({'type': 'content_block_start', 'index': 0, 'content_block': {'type': 'text', 'text': ''}})}\n\n"
787
+ yield f"event: content_block_delta\ndata: {json.dumps({'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': text}})}\n\n"
 
 
 
 
 
 
 
 
 
 
 
788
 
789
  thread.join()
 
 
 
790
 
 
791
  yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': block_index})}\n\n"
792
 
 
793
  stop_reason = "max_tokens" if output_tokens >= total_max_tokens else "end_turn"
794
+ yield f"event: message_delta\ndata: {json.dumps({'type': 'message_delta', 'delta': {'stop_reason': stop_reason}, 'usage': {'output_tokens': output_tokens}})}\n\n"
 
 
 
 
 
 
 
795
  yield f"event: message_stop\ndata: {json.dumps({'type': 'message_stop'})}\n\n"
796
 
797
+ return StreamingResponse(generate(), media_type="text/event-stream", headers={"Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no"})
 
 
 
 
 
 
 
 
798
 
799
@app.post("/anthropic/v1/messages/count_tokens", response_model=AnthropicTokenCountResponse)
async def anthropic_count_tokens(request: AnthropicTokenCountRequest):
    """Count the input tokens a message request would consume.

    Builds the prompt with ``format_anthropic_messages`` and returns the
    length of its tokenization as ``input_tokens``.
    """
    # Resolve the thinking settings the same way anthropic_create_message does,
    # so the counted prompt matches the one used for generation: a missing
    # budget falls back to 1024 instead of being formatted as "None", and
    # thinking_enabled is always a real bool.
    thinking_enabled = False
    budget_tokens = 1024
    if request.thinking:
        thinking_enabled = request.thinking.type == "enabled"
        budget_tokens = request.thinking.budget_tokens or 1024

    prompt = format_anthropic_messages(request.messages, request.system, thinking_enabled, budget_tokens)
    tokens = tokenizer.encode(prompt)
    return AnthropicTokenCountResponse(input_tokens=len(tokens))
 
 
 
 
 
 
 
 
 
 
806
 
807
  if __name__ == "__main__":
808
  import uvicorn