likhonsheikh committed on
Commit
7ef800a
·
verified ·
1 Parent(s): 9b2c0ff

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. Dockerfile +12 -10
  2. README.md +9 -8
  3. app.py +219 -272
  4. requirements.txt +1 -4
Dockerfile CHANGED
@@ -2,27 +2,29 @@ FROM python:3.10-slim
2
 
3
  WORKDIR /app
4
 
5
- # Install system dependencies
6
  RUN apt-get update && apt-get install -y \
7
  build-essential \
 
 
8
  && rm -rf /var/lib/apt/lists/*
9
 
10
- # Copy requirements first for caching
11
  COPY requirements.txt .
12
 
13
- # Install Python dependencies with CPU-only PyTorch
14
- RUN pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r requirements.txt
 
 
 
 
 
15
 
16
  # Copy application code
17
  COPY app.py .
18
 
19
- # Create cache directory for model
20
- RUN mkdir -p /app/.cache
21
- ENV HF_HOME=/app/.cache
22
- ENV TRANSFORMERS_CACHE=/app/.cache
23
-
24
  # Expose port
25
  EXPOSE 7860
26
 
27
  # Run the application
28
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
2
 
3
  WORKDIR /app
4
 
5
+ # Install build dependencies for llama-cpp-python
6
  RUN apt-get update && apt-get install -y \
7
  build-essential \
8
+ cmake \
9
+ curl \
10
  && rm -rf /var/lib/apt/lists/*
11
 
12
+ # Copy requirements
13
  COPY requirements.txt .
14
 
15
+ # Install Python dependencies (llama-cpp-python compiles from source)
16
+ RUN pip install --no-cache-dir -r requirements.txt
17
+
18
+ # Download Qwen2.5-Coder-7B-Instruct Q4_K_M GGUF
19
+ RUN mkdir -p /app/models && \
20
+ curl -L -o /app/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf \
21
+ "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/qwen2.5-coder-7b-instruct-q4_k_m.gguf"
22
 
23
  # Copy application code
24
  COPY app.py .
25
 
 
 
 
 
 
26
  # Expose port
27
  EXPOSE 7860
28
 
29
  # Run the application
30
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -35,7 +35,7 @@ curl -X POST "https://YOUR_SPACE.hf.space/anthropic/v1/messages" \
35
  -H "x-api-key: your-api-key" \
36
  -H "anthropic-version: 2023-06-01" \
37
  -d '{
38
- "model": "qwen2.5-coder-3b",
39
  "max_tokens": 256,
40
  "messages": [
41
  {"role": "user", "content": "Hello, how are you?"}
@@ -48,7 +48,7 @@ curl -X POST "https://YOUR_SPACE.hf.space/anthropic/v1/messages" \
48
  curl -X POST "https://YOUR_SPACE.hf.space/anthropic/v1/messages" \
49
  -H "Content-Type: application/json" \
50
  -d '{
51
- "model": "qwen2.5-coder-3b",
52
  "max_tokens": 256,
53
  "stream": true,
54
  "messages": [
@@ -69,7 +69,7 @@ client = anthropic.Anthropic(
69
  )
70
 
71
  message = client.messages.create(
72
- model="qwen2.5-coder-3b",
73
  max_tokens=256,
74
  messages=[{"role": "user", "content": "Hello!"}]
75
  )
@@ -86,7 +86,7 @@ const client = new Anthropic({
86
  });
87
 
88
  const message = await client.messages.create({
89
- model: 'qwen2.5-coder-3b',
90
  max_tokens: 256,
91
  messages: [{ role: 'user', content: 'Hello!' }]
92
  });
@@ -95,10 +95,11 @@ console.log(message.content[0].text);
95
 
96
  ## Model Info
97
 
98
- - **Model**: Qwen/Qwen2.5-Coder-3B-Instruct
99
- - **Parameters**: 3 Billion
100
- - **Optimized for**: Code & Tool reasoning
101
- - **Context Length**: 32K tokens
 
102
 
103
  ## Rate Limits
104
 
 
35
  -H "x-api-key: your-api-key" \
36
  -H "anthropic-version: 2023-06-01" \
37
  -d '{
38
+ "model": "qwen2.5-coder-7b",
39
  "max_tokens": 256,
40
  "messages": [
41
  {"role": "user", "content": "Hello, how are you?"}
 
48
  curl -X POST "https://YOUR_SPACE.hf.space/anthropic/v1/messages" \
49
  -H "Content-Type: application/json" \
50
  -d '{
51
+ "model": "qwen2.5-coder-7b",
52
  "max_tokens": 256,
53
  "stream": true,
54
  "messages": [
 
69
  )
70
 
71
  message = client.messages.create(
72
+ model="qwen2.5-coder-7b",
73
  max_tokens=256,
74
  messages=[{"role": "user", "content": "Hello!"}]
75
  )
 
86
  });
87
 
88
  const message = await client.messages.create({
89
+ model: 'qwen2.5-coder-7b',
90
  max_tokens: 256,
91
  messages: [{ role: 'user', content: 'Hello!' }]
92
  });
 
95
 
96
  ## Model Info
97
 
98
+ - **Model**: Qwen2.5-Coder-7B-Instruct (Q4_K_M GGUF)
99
+ - **Parameters**: 7 Billion (quantized)
100
+ - **Backend**: llama.cpp
101
+ - **Optimized for**: Code, Tool reasoning, Agent workflows
102
+ - **Context Length**: 8K tokens
103
 
104
  ## Rate Limits
105
 
app.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
  Dual-Compatible API Endpoint (OpenAI + Anthropic)
3
- Lightweight CPU-based implementation for Hugging Face Spaces
4
  - OpenAI format: /v1/chat/completions
5
  - Anthropic format: /anthropic/v1/messages
6
  """
@@ -10,19 +10,18 @@ import time
10
  import uuid
11
  import logging
12
  import re
 
13
  from datetime import datetime
14
  from logging.handlers import RotatingFileHandler
15
  from typing import List, Optional, Union, Dict, Any, Literal
16
  from contextlib import asynccontextmanager
 
17
 
18
  from fastapi import FastAPI, HTTPException, Header, Request
19
  from fastapi.responses import StreamingResponse, JSONResponse
20
  from fastapi.middleware.cors import CORSMiddleware
21
  from pydantic import BaseModel, Field
22
- import torch
23
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
24
- from threading import Thread
25
- import json
26
 
27
  # ============== Logging Configuration ==============
28
  LOG_DIR = "/tmp/logs"
@@ -45,52 +44,49 @@ console_handler.setFormatter(log_format)
45
  console_handler.setLevel(logging.INFO)
46
 
47
  logging.basicConfig(level=logging.DEBUG, handlers=[file_handler, console_handler])
48
- logger = logging.getLogger("dual-api")
49
 
50
  for uvicorn_logger in ["uvicorn", "uvicorn.error", "uvicorn.access"]:
51
  uv_log = logging.getLogger(uvicorn_logger)
52
  uv_log.handlers = [file_handler, console_handler]
53
 
54
  logger.info("=" * 60)
55
- logger.info(f"Dual API (OpenAI + Anthropic) Startup at {datetime.now().isoformat()}")
56
  logger.info(f"Log file: {LOG_FILE}")
57
  logger.info("=" * 60)
58
 
59
  # ============== Configuration ==============
60
- MODEL_ID = "Qwen/Qwen2.5-Coder-3B-Instruct"
61
- DEVICE = "cpu"
 
 
62
 
63
- model = None
64
- tokenizer = None
65
 
66
  @asynccontextmanager
67
  async def lifespan(app: FastAPI):
68
- global model, tokenizer
69
- logger.info(f"Loading model: {MODEL_ID}")
70
  try:
71
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
72
- logger.info("Tokenizer loaded successfully")
73
- model = AutoModelForCausalLM.from_pretrained(
74
- MODEL_ID, torch_dtype=torch.float32, device_map=DEVICE, low_cpu_mem_usage=True
 
 
75
  )
76
- model.eval()
77
  logger.info("Model loaded successfully!")
78
- logger.info(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
79
  except Exception as e:
80
  logger.error(f"Failed to load model: {e}", exc_info=True)
81
  raise
82
  yield
83
- logger.info("Shutting down, cleaning up model...")
84
- del model, tokenizer
85
 
86
  app = FastAPI(
87
  title="Dual-Compatible API (OpenAI + Anthropic)",
88
- description="""
89
- Lightweight CPU-based API with dual compatibility:
90
- - OpenAI format: /v1/chat/completions
91
- - Anthropic format: /anthropic/v1/messages
92
- """,
93
- version="1.0.0",
94
  lifespan=lifespan
95
  )
96
 
@@ -118,7 +114,7 @@ async def log_requests(request: Request, call_next):
118
  raise
119
 
120
  # ============================================================
121
- # ANTHROPIC-COMPATIBLE MODELS (under /anthropic)
122
  # ============================================================
123
 
124
  class AnthropicTextBlock(BaseModel):
@@ -198,7 +194,7 @@ class AnthropicMessageRequest(BaseModel):
198
  stop_sequences: Optional[List[str]] = None
199
  stream: Optional[bool] = False
200
  system: Optional[Union[str, List[AnthropicSystemContent]]] = None
201
- temperature: Optional[float] = Field(default=1.0, ge=0.0, le=1.0)
202
  tool_choice: Optional[AnthropicToolChoice] = None
203
  tools: Optional[List[AnthropicTool]] = None
204
  top_k: Optional[int] = Field(default=None, ge=0)
@@ -248,7 +244,7 @@ class AnthropicTokenCountResponse(BaseModel):
248
  input_tokens: int
249
 
250
  # ============================================================
251
- # OPENAI-COMPATIBLE MODELS (under /v1)
252
  # ============================================================
253
 
254
  class OpenAIMessage(BaseModel):
@@ -270,8 +266,8 @@ class OpenAIChatRequest(BaseModel):
270
  model: str
271
  messages: List[OpenAIMessage]
272
  max_tokens: Optional[int] = 1024
273
- temperature: Optional[float] = Field(default=1.0, ge=0.0, le=2.0)
274
- top_p: Optional[float] = Field(default=1.0, ge=0.0, le=1.0)
275
  n: Optional[int] = 1
276
  stream: Optional[bool] = False
277
  stop: Optional[Union[str, List[str]]] = None
@@ -302,18 +298,6 @@ class OpenAIChatResponse(BaseModel):
302
  usage: OpenAIUsage
303
  system_fingerprint: Optional[str] = None
304
 
305
- class OpenAIStreamChoice(BaseModel):
306
- index: int
307
- delta: Dict[str, Any]
308
- finish_reason: Optional[str] = None
309
-
310
- class OpenAIStreamResponse(BaseModel):
311
- id: str
312
- object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
313
- created: int
314
- model: str
315
- choices: List[OpenAIStreamChoice]
316
-
317
  class OpenAIModel(BaseModel):
318
  id: str
319
  object: Literal["model"] = "model"
@@ -362,64 +346,76 @@ def extract_openai_content(content: Optional[Union[str, List[Dict[str, Any]]]])
362
  texts.append(item.get("text", ""))
363
  return " ".join(texts)
364
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
365
  def format_anthropic_messages(
366
  messages: List[AnthropicMessage],
367
  system: Optional[Union[str, List[AnthropicSystemContent]]] = None,
 
368
  thinking_enabled: bool = False,
369
  budget_tokens: int = 1024
370
  ) -> str:
371
  formatted_messages = []
372
- system_text = extract_anthropic_system(system)
 
 
 
 
 
 
 
 
 
 
 
373
 
374
- if thinking_enabled:
375
- thinking_instruction = f"""You are a helpful AI assistant with extended thinking capabilities.
376
 
377
- When responding to complex problems:
378
- 1. First, think through the problem step by step inside <thinking>...</thinking> tags
379
- 2. Consider multiple approaches and evaluate them
380
- 3. Show your reasoning process clearly
381
- 4. After thinking, provide your final answer outside the thinking tags
382
 
383
- Budget for thinking: up to {budget_tokens} tokens for reasoning.
 
384
 
385
- Think deeply and thoroughly before responding."""
386
- if system_text:
387
- system_text = f"{thinking_instruction}\n\n{system_text}"
388
- else:
389
- system_text = thinking_instruction
390
 
391
- if system_text:
392
- formatted_messages.append({"role": "system", "content": system_text})
 
 
 
 
393
 
394
  for msg in messages:
395
  content = extract_anthropic_text(msg.content)
396
  formatted_messages.append({"role": msg.role, "content": content})
397
 
398
- if tokenizer.chat_template:
399
- return tokenizer.apply_chat_template(formatted_messages, tokenize=False, add_generation_prompt=True)
400
-
401
- prompt = ""
402
- for msg in formatted_messages:
403
- role = msg["role"].capitalize()
404
- prompt += f"{role}: {msg['content']}\n"
405
- prompt += "Assistant: "
406
- return prompt
407
 
408
  def format_openai_messages(messages: List[OpenAIMessage]) -> str:
 
409
  formatted_messages = []
410
- for msg in messages:
411
- content = extract_openai_content(msg.content)
412
- formatted_messages.append({"role": msg.role, "content": content})
413
 
414
- if tokenizer.chat_template:
415
- return tokenizer.apply_chat_template(formatted_messages, tokenize=False, add_generation_prompt=True)
 
 
 
 
416
 
417
- prompt = ""
418
- for msg in formatted_messages:
419
- role = msg["role"].capitalize()
420
- prompt += f"{role}: {msg['content']}\n"
421
- prompt += "Assistant: "
422
- return prompt
423
 
424
  def parse_thinking_response(text: str) -> tuple:
425
  thinking_pattern = r'<thinking>(.*?)</thinking>'
@@ -430,6 +426,21 @@ def parse_thinking_response(text: str) -> tuple:
430
  return thinking_text, answer_text
431
  return None, text.strip()
432
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
433
def generate_id(prefix: str = "msg") -> str:
    """Build a unique identifier of the form '<prefix>_<24 hex chars>'.

    The suffix is the first 24 characters of a random UUID4 hex string,
    matching the id style used by the OpenAI/Anthropic APIs (e.g. 'msg_...',
    'chatcmpl_...').
    """
    token = uuid.uuid4().hex[:24]
    return "_".join([prefix, token])
435
 
@@ -439,17 +450,14 @@ def generate_id(prefix: str = "msg") -> str:
439
  async def root():
440
  return {
441
  "status": "healthy",
442
- "model": MODEL_ID,
 
443
  "endpoints": {
444
  "openai": "/v1/chat/completions",
445
  "anthropic": "/anthropic/v1/messages"
446
  },
447
- "base_urls": {
448
- "openai_sdk": "https://likhonsheikh-anthropic-compatible-api.hf.space/v1",
449
- "anthropic_sdk": "https://likhonsheikh-anthropic-compatible-api.hf.space/anthropic"
450
- },
451
- "features": ["extended-thinking", "streaming", "dual-compatibility"],
452
- "log_file": LOG_FILE
453
  }
454
 
455
  @app.get("/logs")
@@ -458,13 +466,13 @@ async def get_logs(lines: int = 100):
458
  with open(LOG_FILE, 'r') as f:
459
  all_lines = f.readlines()
460
  recent_lines = all_lines[-lines:] if len(all_lines) > lines else all_lines
461
- return {"log_file": LOG_FILE, "total_lines": len(all_lines), "returned_lines": len(recent_lines), "logs": "".join(recent_lines)}
462
  except FileNotFoundError:
463
- return {"error": "Log file not found", "log_file": LOG_FILE}
464
 
465
  @app.get("/health")
466
  async def health():
467
- return {"status": "ok", "model_loaded": model is not None, "log_file": LOG_FILE, "features": ["openai-compatible", "anthropic-compatible", "extended-thinking"]}
468
 
469
  # ============================================================
470
  # OPENAI-COMPATIBLE ENDPOINTS (/v1)
@@ -472,9 +480,8 @@ async def health():
472
 
473
  @app.get("/v1/models")
474
  async def openai_list_models():
475
- """List models (OpenAI format)"""
476
  return OpenAIModelList(
477
- data=[OpenAIModel(id="qwen2.5-coder-3b", created=int(time.time()), owned_by="qwen")]
478
  )
479
 
480
  @app.post("/v1/chat/completions")
@@ -482,54 +489,37 @@ async def openai_chat_completions(
482
  request: OpenAIChatRequest,
483
  authorization: Optional[str] = Header(None)
484
  ):
485
- """Chat completions (OpenAI format)"""
486
  chat_id = generate_id("chatcmpl")
487
- logger.info(f"[{chat_id}] OpenAI chat - model: {request.model}, max_tokens: {request.max_tokens}, stream: {request.stream}")
488
 
489
  try:
490
  prompt = format_openai_messages(request.messages)
491
- inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
492
- input_token_count = inputs.input_ids.shape[1]
493
 
494
  if request.stream:
495
- return await openai_stream_response(request, inputs, input_token_count, chat_id)
496
-
497
- gen_kwargs = {
498
- "max_new_tokens": request.max_tokens or 1024,
499
- "do_sample": request.temperature > 0 if request.temperature else False,
500
- "pad_token_id": tokenizer.eos_token_id,
501
- "eos_token_id": tokenizer.eos_token_id,
502
- }
503
-
504
- if request.temperature and request.temperature > 0:
505
- gen_kwargs["temperature"] = min(request.temperature, 1.0)
506
- if request.top_p:
507
- gen_kwargs["top_p"] = request.top_p
508
 
 
509
  if request.stop:
510
- stop_seqs = [request.stop] if isinstance(request.stop, str) else request.stop
511
- stop_ids = []
512
- for seq in stop_seqs:
513
- tokens = tokenizer.encode(seq, add_special_tokens=False)
514
- if tokens:
515
- stop_ids.extend(tokens)
516
- if stop_ids:
517
- gen_kwargs["eos_token_id"] = list(set([tokenizer.eos_token_id] + stop_ids))
518
 
519
  gen_start = time.time()
520
- with torch.no_grad():
521
- outputs = model.generate(**inputs, **gen_kwargs)
 
 
 
 
 
 
522
  gen_time = time.time() - gen_start
523
 
524
- generated_tokens = outputs[0][input_token_count:]
525
- generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
526
- output_token_count = len(generated_tokens)
527
-
528
- finish_reason = "stop"
529
- if output_token_count >= (request.max_tokens or 1024):
530
- finish_reason = "length"
531
 
532
- logger.info(f"[{chat_id}] Generated {output_token_count} tokens in {gen_time:.2f}s")
533
 
534
  return OpenAIChatResponse(
535
  id=chat_id,
@@ -537,13 +527,13 @@ async def openai_chat_completions(
537
  model=request.model,
538
  choices=[OpenAIChoice(
539
  index=0,
540
- message={"role": "assistant", "content": generated_text.strip()},
541
- finish_reason=finish_reason
542
  )],
543
  usage=OpenAIUsage(
544
- prompt_tokens=input_token_count,
545
- completion_tokens=output_token_count,
546
- total_tokens=input_token_count + output_token_count
547
  )
548
  )
549
 
@@ -551,13 +541,10 @@ async def openai_chat_completions(
551
  logger.error(f"[{chat_id}] Error: {e}", exc_info=True)
552
  raise HTTPException(status_code=500, detail=str(e))
553
 
554
- async def openai_stream_response(request: OpenAIChatRequest, inputs, input_token_count: int, chat_id: str):
555
- """Stream response in OpenAI format"""
556
-
557
  async def generate():
558
  created = int(time.time())
559
 
560
- # Initial chunk with role
561
  initial_chunk = {
562
  "id": chat_id,
563
  "object": "chat.completion.chunk",
@@ -567,29 +554,24 @@ async def openai_stream_response(request: OpenAIChatRequest, inputs, input_token
567
  }
568
  yield f"data: {json.dumps(initial_chunk)}\n\n"
569
 
570
- streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
571
-
572
- gen_kwargs = {
573
- **inputs,
574
- "max_new_tokens": request.max_tokens or 1024,
575
- "do_sample": request.temperature > 0 if request.temperature else False,
576
- "pad_token_id": tokenizer.eos_token_id,
577
- "eos_token_id": tokenizer.eos_token_id,
578
- "streamer": streamer,
579
- }
580
-
581
- if request.temperature and request.temperature > 0:
582
- gen_kwargs["temperature"] = min(request.temperature, 1.0)
583
- if request.top_p:
584
- gen_kwargs["top_p"] = request.top_p
585
-
586
- thread = Thread(target=model.generate, kwargs=gen_kwargs)
587
- thread.start()
588
-
589
- output_tokens = 0
590
- for text in streamer:
591
  if text:
592
- output_tokens += len(tokenizer.encode(text, add_special_tokens=False))
593
  chunk = {
594
  "id": chat_id,
595
  "object": "chat.completion.chunk",
@@ -599,21 +581,17 @@ async def openai_stream_response(request: OpenAIChatRequest, inputs, input_token
599
  }
600
  yield f"data: {json.dumps(chunk)}\n\n"
601
 
602
- thread.join()
603
-
604
- # Final chunk
605
- finish_reason = "length" if output_tokens >= (request.max_tokens or 1024) else "stop"
606
  final_chunk = {
607
  "id": chat_id,
608
  "object": "chat.completion.chunk",
609
  "created": created,
610
  "model": request.model,
611
- "choices": [{"index": 0, "delta": {}, "finish_reason": finish_reason}]
612
  }
613
  yield f"data: {json.dumps(final_chunk)}\n\n"
614
  yield "data: [DONE]\n\n"
615
 
616
- return StreamingResponse(generate(), media_type="text/event-stream", headers={"Cache-Control": "no-cache", "Connection": "keep-alive"})
617
 
618
  # ============================================================
619
  # ANTHROPIC-COMPATIBLE ENDPOINTS (/anthropic)
@@ -621,16 +599,16 @@ async def openai_stream_response(request: OpenAIChatRequest, inputs, input_token
621
 
622
  @app.get("/anthropic/v1/models")
623
  async def anthropic_list_models():
624
- """List models (Anthropic format)"""
625
  return {
626
  "object": "list",
627
  "data": [{
628
- "id": "qwen2.5-coder-3b",
629
  "object": "model",
630
  "created": int(time.time()),
631
  "owned_by": "qwen",
632
- "display_name": "Qwen2.5 Coder 3B Instruct",
633
- "supports_thinking": True
 
634
  }]
635
  }
636
 
@@ -641,7 +619,6 @@ async def anthropic_create_message(
641
  anthropic_version: Optional[str] = Header(None, alias="anthropic-version"),
642
  anthropic_beta: Optional[str] = Header(None, alias="anthropic-beta")
643
  ):
644
- """Create message (Anthropic format with Extended Thinking)"""
645
  message_id = generate_id("msg")
646
 
647
  thinking_enabled = False
@@ -650,158 +627,128 @@ async def anthropic_create_message(
650
  thinking_enabled = request.thinking.type == "enabled"
651
  budget_tokens = request.thinking.budget_tokens or 1024
652
 
653
- logger.info(f"[{message_id}] Anthropic msg - model: {request.model}, max_tokens: {request.max_tokens}, thinking: {thinking_enabled}")
654
 
655
  try:
656
- prompt = format_anthropic_messages(request.messages, request.system, thinking_enabled, budget_tokens)
657
- inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
658
- input_token_count = inputs.input_ids.shape[1]
 
 
 
 
659
 
660
  if request.stream:
661
- return await anthropic_stream_response(request, inputs, input_token_count, message_id, thinking_enabled, budget_tokens)
662
 
663
  total_max_tokens = request.max_tokens + (budget_tokens if thinking_enabled else 0)
664
 
665
- gen_kwargs = {
666
- "max_new_tokens": total_max_tokens,
667
- "do_sample": request.temperature > 0 if request.temperature else False,
668
- "pad_token_id": tokenizer.eos_token_id,
669
- "eos_token_id": tokenizer.eos_token_id,
670
- }
671
-
672
- if request.temperature and request.temperature > 0:
673
- gen_kwargs["temperature"] = request.temperature
674
- if request.top_p:
675
- gen_kwargs["top_p"] = request.top_p
676
- if request.top_k:
677
- gen_kwargs["top_k"] = request.top_k
678
 
679
  gen_start = time.time()
680
- with torch.no_grad():
681
- outputs = model.generate(**inputs, **gen_kwargs)
 
 
 
 
 
 
 
682
  gen_time = time.time() - gen_start
683
 
684
- generated_tokens = outputs[0][input_token_count:]
685
- generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
686
- output_token_count = len(generated_tokens)
687
 
 
688
  content_blocks = []
689
- if thinking_enabled:
 
 
 
 
 
 
 
 
 
 
 
 
 
690
  thinking_text, answer_text = parse_thinking_response(generated_text)
691
  if thinking_text:
692
  content_blocks.append(AnthropicResponseThinkingBlock(type="thinking", thinking=thinking_text))
693
  content_blocks.append(AnthropicResponseTextBlock(type="text", text=answer_text))
694
  else:
695
- content_blocks.append(AnthropicResponseTextBlock(type="text", text=generated_text.strip()))
696
 
697
- stop_reason = "end_turn"
698
- if output_token_count >= total_max_tokens:
699
  stop_reason = "max_tokens"
700
 
701
- logger.info(f"[{message_id}] Generated {output_token_count} tokens in {gen_time:.2f}s")
702
 
703
  return AnthropicMessageResponse(
704
  id=message_id,
705
  content=content_blocks,
706
  model=request.model,
707
  stop_reason=stop_reason,
708
- usage=AnthropicUsage(input_tokens=input_token_count, output_tokens=output_token_count)
 
 
 
709
  )
710
 
711
  except Exception as e:
712
  logger.error(f"[{message_id}] Error: {e}", exc_info=True)
713
  raise HTTPException(status_code=500, detail=str(e))
714
 
715
- async def anthropic_stream_response(request: AnthropicMessageRequest, inputs, input_token_count: int, message_id: str, thinking_enabled: bool, budget_tokens: int):
716
- """Stream response in Anthropic format"""
717
-
718
  async def generate():
719
  start_event = {
720
  "type": "message_start",
721
  "message": {
722
  "id": message_id, "type": "message", "role": "assistant", "content": [],
723
  "model": request.model, "stop_reason": None, "stop_sequence": None,
724
- "usage": {"input_tokens": input_token_count, "output_tokens": 0}
725
  }
726
  }
727
  yield f"event: message_start\ndata: {json.dumps(start_event)}\n\n"
728
- yield f"event: ping\ndata: {json.dumps({'type': 'ping'})}\n\n"
729
-
730
- block_index = 0
731
- in_thinking = False
732
- thinking_started = False
733
- text_block_started = False
734
 
735
- streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
736
- total_max_tokens = request.max_tokens + (budget_tokens if thinking_enabled else 0)
737
-
738
- gen_kwargs = {
739
- **inputs,
740
- "max_new_tokens": total_max_tokens,
741
- "do_sample": request.temperature > 0 if request.temperature else False,
742
- "pad_token_id": tokenizer.eos_token_id,
743
- "eos_token_id": tokenizer.eos_token_id,
744
- "streamer": streamer,
745
- }
746
-
747
- if request.temperature and request.temperature > 0:
748
- gen_kwargs["temperature"] = request.temperature
749
- if request.top_p:
750
- gen_kwargs["top_p"] = request.top_p
751
- if request.top_k:
752
- gen_kwargs["top_k"] = request.top_k
753
-
754
- thread = Thread(target=model.generate, kwargs=gen_kwargs)
755
- thread.start()
756
-
757
- output_tokens = 0
758
- accumulated_text = ""
759
-
760
- for text in streamer:
761
  if text:
762
- output_tokens += len(tokenizer.encode(text, add_special_tokens=False))
763
- accumulated_text += text
764
-
765
- if thinking_enabled:
766
- if "<thinking>" in accumulated_text and not thinking_started:
767
- thinking_started = True
768
- in_thinking = True
769
- yield f"event: content_block_start\ndata: {json.dumps({'type': 'content_block_start', 'index': block_index, 'content_block': {'type': 'thinking', 'thinking': ''}})}\n\n"
770
-
771
- if in_thinking:
772
- clean_text = text.replace("<thinking>", "").replace("</thinking>", "")
773
- if clean_text:
774
- yield f"event: content_block_delta\ndata: {json.dumps({'type': 'content_block_delta', 'index': block_index, 'delta': {'type': 'thinking_delta', 'thinking': clean_text}})}\n\n"
775
- if "</thinking>" in accumulated_text:
776
- in_thinking = False
777
- yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': block_index})}\n\n"
778
- block_index += 1
779
- text_block_started = True
780
- yield f"event: content_block_start\ndata: {json.dumps({'type': 'content_block_start', 'index': block_index, 'content_block': {'type': 'text', 'text': ''}})}\n\n"
781
- elif text_block_started:
782
- yield f"event: content_block_delta\ndata: {json.dumps({'type': 'content_block_delta', 'index': block_index, 'delta': {'type': 'text_delta', 'text': text}})}\n\n"
783
- else:
784
- if not text_block_started:
785
- text_block_started = True
786
- yield f"event: content_block_start\ndata: {json.dumps({'type': 'content_block_start', 'index': 0, 'content_block': {'type': 'text', 'text': ''}})}\n\n"
787
- yield f"event: content_block_delta\ndata: {json.dumps({'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': text}})}\n\n"
788
-
789
- thread.join()
790
-
791
- yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': block_index})}\n\n"
792
-
793
- stop_reason = "max_tokens" if output_tokens >= total_max_tokens else "end_turn"
794
- yield f"event: message_delta\ndata: {json.dumps({'type': 'message_delta', 'delta': {'stop_reason': stop_reason}, 'usage': {'output_tokens': output_tokens}})}\n\n"
795
  yield f"event: message_stop\ndata: {json.dumps({'type': 'message_stop'})}\n\n"
796
 
797
- return StreamingResponse(generate(), media_type="text/event-stream", headers={"Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no"})
798
 
799
  @app.post("/anthropic/v1/messages/count_tokens", response_model=AnthropicTokenCountResponse)
800
  async def anthropic_count_tokens(request: AnthropicTokenCountRequest):
801
- thinking_enabled = request.thinking and request.thinking.type == "enabled"
802
- budget_tokens = request.thinking.budget_tokens if request.thinking else 1024
803
- prompt = format_anthropic_messages(request.messages, request.system, thinking_enabled, budget_tokens)
804
- tokens = tokenizer.encode(prompt)
805
  return AnthropicTokenCountResponse(input_tokens=len(tokens))
806
 
807
  if __name__ == "__main__":
 
1
  """
2
  Dual-Compatible API Endpoint (OpenAI + Anthropic)
3
+ llama.cpp powered - Qwen2.5-Coder-7B-Instruct Q4_K_M
4
  - OpenAI format: /v1/chat/completions
5
  - Anthropic format: /anthropic/v1/messages
6
  """
 
10
  import uuid
11
  import logging
12
  import re
13
+ import json
14
  from datetime import datetime
15
  from logging.handlers import RotatingFileHandler
16
  from typing import List, Optional, Union, Dict, Any, Literal
17
  from contextlib import asynccontextmanager
18
+ from threading import Thread
19
 
20
  from fastapi import FastAPI, HTTPException, Header, Request
21
  from fastapi.responses import StreamingResponse, JSONResponse
22
  from fastapi.middleware.cors import CORSMiddleware
23
  from pydantic import BaseModel, Field
24
+ from llama_cpp import Llama
 
 
 
25
 
26
  # ============== Logging Configuration ==============
27
  LOG_DIR = "/tmp/logs"
 
44
  console_handler.setLevel(logging.INFO)
45
 
46
  logging.basicConfig(level=logging.DEBUG, handlers=[file_handler, console_handler])
47
+ logger = logging.getLogger("llama-api")
48
 
49
  for uvicorn_logger in ["uvicorn", "uvicorn.error", "uvicorn.access"]:
50
  uv_log = logging.getLogger(uvicorn_logger)
51
  uv_log.handlers = [file_handler, console_handler]
52
 
53
  logger.info("=" * 60)
54
+ logger.info(f"llama.cpp API (OpenAI + Anthropic) Startup at {datetime.now().isoformat()}")
55
  logger.info(f"Log file: {LOG_FILE}")
56
  logger.info("=" * 60)
57
 
58
  # ============== Configuration ==============
59
+ MODEL_PATH = "/app/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf"
60
+ N_CTX = 8192 # Context window
61
+ N_THREADS = 2 # CPU threads
62
+ N_BATCH = 128 # Batch size
63
 
64
+ llm = None
 
65
 
66
  @asynccontextmanager
67
  async def lifespan(app: FastAPI):
68
+ global llm
69
+ logger.info(f"Loading model: {MODEL_PATH}")
70
  try:
71
+ llm = Llama(
72
+ model_path=MODEL_PATH,
73
+ n_ctx=N_CTX,
74
+ n_threads=N_THREADS,
75
+ n_batch=N_BATCH,
76
+ verbose=True
77
  )
 
78
  logger.info("Model loaded successfully!")
 
79
  except Exception as e:
80
  logger.error(f"Failed to load model: {e}", exc_info=True)
81
  raise
82
  yield
83
+ logger.info("Shutting down...")
84
+ del llm
85
 
86
  app = FastAPI(
87
  title="Dual-Compatible API (OpenAI + Anthropic)",
88
+ description="llama.cpp powered API with dual SDK compatibility",
89
+ version="2.0.0",
 
 
 
 
90
  lifespan=lifespan
91
  )
92
 
 
114
  raise
115
 
116
  # ============================================================
117
+ # ANTHROPIC-COMPATIBLE MODELS
118
  # ============================================================
119
 
120
  class AnthropicTextBlock(BaseModel):
 
194
  stop_sequences: Optional[List[str]] = None
195
  stream: Optional[bool] = False
196
  system: Optional[Union[str, List[AnthropicSystemContent]]] = None
197
+ temperature: Optional[float] = Field(default=0.7, ge=0.0, le=1.0)
198
  tool_choice: Optional[AnthropicToolChoice] = None
199
  tools: Optional[List[AnthropicTool]] = None
200
  top_k: Optional[int] = Field(default=None, ge=0)
 
244
  input_tokens: int
245
 
246
  # ============================================================
247
+ # OPENAI-COMPATIBLE MODELS
248
  # ============================================================
249
 
250
  class OpenAIMessage(BaseModel):
 
266
  model: str
267
  messages: List[OpenAIMessage]
268
  max_tokens: Optional[int] = 1024
269
+ temperature: Optional[float] = Field(default=0.7, ge=0.0, le=2.0)
270
+ top_p: Optional[float] = Field(default=0.95, ge=0.0, le=1.0)
271
  n: Optional[int] = 1
272
  stream: Optional[bool] = False
273
  stop: Optional[Union[str, List[str]]] = None
 
298
  usage: OpenAIUsage
299
  system_fingerprint: Optional[str] = None
300
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  class OpenAIModel(BaseModel):
302
  id: str
303
  object: Literal["model"] = "model"
 
346
  texts.append(item.get("text", ""))
347
  return " ".join(texts)
348
 
349
def format_chat_prompt(messages: List[Dict[str, str]], system: Optional[str] = None) -> str:
    """Render a conversation into the Qwen2.5 ChatML prompt format.

    Every turn becomes an ``<|im_start|>{role}\\n{content}<|im_end|>\\n``
    segment.  An optional system message is emitted first, and the prompt
    ends with an open ``<|im_start|>assistant\\n`` turn so the model
    generates the assistant reply from there.
    """
    segments = []
    if system:
        segments.append(f"<|im_start|>system\n{system}<|im_end|>\n")
    segments.extend(
        f"<|im_start|>{turn['role']}\n{turn['content']}<|im_end|>\n"
        for turn in messages
    )
    segments.append("<|im_start|>assistant\n")
    return "".join(segments)
362
+
def format_anthropic_messages(
    messages: List[AnthropicMessage],
    system: Optional[Union[str, List[AnthropicSystemContent]]] = None,
    tools: Optional[List[AnthropicTool]] = None,
    thinking_enabled: bool = False,
    budget_tokens: int = 1024
) -> str:
    """Translate an Anthropic-style request into a ChatML prompt string.

    Tool definitions and extended-thinking guidance are folded into the
    system prompt (tool instructions first, then the thinking preamble is
    prepended on top of the result); each message is then flattened to
    plain text and rendered via format_chat_prompt.
    """
    system_text = extract_anthropic_system(system) or ""

    if tools:
        # Advertise the tools and the exact JSON calling convention that
        # parse_tool_use expects to find in the model's reply.
        tool_defs = [
            {
                "name": tool.name,
                "description": tool.description,
                "parameters": tool.input_schema.model_dump(),
            }
            for tool in tools
        ]
        tool_instruction = f"""You have access to the following tools:

{json.dumps(tool_defs, indent=2)}

To use a tool, respond with a JSON object in this exact format:
{{"tool": "tool_name", "arguments": {{"arg1": "value1"}}}}

Only use tools when necessary. If you don't need a tool, respond normally."""
        system_text = f"{tool_instruction}\n\n{system_text}" if system_text else tool_instruction

    if thinking_enabled:
        thinking_instruction = f"""When solving complex problems:
1. Think through the problem step by step inside <thinking>...</thinking> tags
2. After thinking, provide your final answer outside the thinking tags
Budget for thinking: up to {budget_tokens} tokens."""
        system_text = f"{thinking_instruction}\n\n{system_text}" if system_text else thinking_instruction

    turns = [
        {"role": msg.role, "content": extract_anthropic_text(msg.content)}
        for msg in messages
    ]
    return format_chat_prompt(turns, system_text or None)
def format_openai_messages(messages: List[OpenAIMessage]) -> str:
    """Collapse an OpenAI chat message list into a ChatML prompt.

    Any "system" message is lifted out and passed as the system prompt
    (when several appear, the last one wins); the remaining turns keep
    their order and roles.
    """
    system_text = None
    chat_turns = []
    for message in messages:
        text = extract_openai_content(message.content)
        if message.role == "system":
            system_text = text
        else:
            chat_turns.append({"role": message.role, "content": text})
    return format_chat_prompt(chat_turns, system_text)
420
  def parse_thinking_response(text: str) -> tuple:
421
  thinking_pattern = r'<thinking>(.*?)</thinking>'
 
426
  return thinking_text, answer_text
427
  return None, text.strip()
428
 
def parse_tool_use(text: str) -> Optional[Dict[str, Any]]:
    """Extract the first tool-call JSON object from a model response.

    The model is instructed (see format_anthropic_messages) to emit
    {"tool": "...", "arguments": {...}}. The previous regex
    ``\\{[^{}]*"tool"[^{}]*\\}`` could never match a call whose
    "arguments" value is itself an object (nested braces), and a single
    unparsable match aborted the whole scan because the try wrapped the
    loop. Instead, scan for balanced-brace spans and attempt to parse
    each one, skipping candidates that are not valid JSON.

    Returns the parsed dict for the first object containing a "tool"
    key, or None when no valid tool call is present.
    """
    depth = 0
    start = -1
    for i, ch in enumerate(text):
        if ch == '{':
            if depth == 0:
                start = i
            depth += 1
        elif ch == '}' and depth > 0:
            depth -= 1
            if depth == 0:
                candidate = text[start:i + 1]
                try:
                    parsed = json.loads(candidate)
                except json.JSONDecodeError:
                    # Not JSON (e.g. prose with braces) — keep scanning.
                    continue
                if isinstance(parsed, dict) and "tool" in parsed:
                    return parsed
    return None
def generate_id(prefix: str = "msg") -> str:
    """Create a unique request id such as ``msg_<24 hex chars>``."""
    suffix = uuid.uuid4().hex[:24]
    return "_".join((prefix, suffix))
 
 
async def root():
    """Service banner: health, served model, backend and available routes."""
    info = {
        "status": "healthy",
        "model": "qwen2.5-coder-7b-instruct-q4_k_m",
        "backend": "llama.cpp",
        "endpoints": {
            "openai": "/v1/chat/completions",
            "anthropic": "/anthropic/v1/messages",
        },
        "features": ["extended-thinking", "streaming", "tool-use", "dual-compatibility"],
        "context_length": N_CTX,
    }
    return info
 
463
  @app.get("/logs")
 
466
  with open(LOG_FILE, 'r') as f:
467
  all_lines = f.readlines()
468
  recent_lines = all_lines[-lines:] if len(all_lines) > lines else all_lines
469
+ return {"log_file": LOG_FILE, "total_lines": len(all_lines), "logs": "".join(recent_lines)}
470
  except FileNotFoundError:
471
+ return {"error": "Log file not found"}
472
 
@app.get("/health")
async def health():
    """Liveness probe; reports whether the GGUF model has been loaded."""
    probe = {
        "status": "ok",
        "model_loaded": llm is not None,
        "backend": "llama.cpp",
    }
    return probe
 
477
  # ============================================================
478
  # OPENAI-COMPATIBLE ENDPOINTS (/v1)
 
480
 
481
@app.get("/v1/models")
async def openai_list_models():
    """OpenAI-compatible model listing; exposes the single served model."""
    served = OpenAIModel(id="qwen2.5-coder-7b", created=int(time.time()), owned_by="qwen")
    return OpenAIModelList(data=[served])
486
 
487
  @app.post("/v1/chat/completions")
 
489
  request: OpenAIChatRequest,
490
  authorization: Optional[str] = Header(None)
491
  ):
 
492
  chat_id = generate_id("chatcmpl")
493
+ logger.info(f"[{chat_id}] OpenAI chat - model: {request.model}, max_tokens: {request.max_tokens}")
494
 
495
  try:
496
  prompt = format_openai_messages(request.messages)
 
 
497
 
498
  if request.stream:
499
+ return await openai_stream_response(request, prompt, chat_id)
 
 
 
 
 
 
 
 
 
 
 
 
500
 
501
+ stop_tokens = ["<|im_end|>", "<|endoftext|>"]
502
  if request.stop:
503
+ if isinstance(request.stop, str):
504
+ stop_tokens.append(request.stop)
505
+ else:
506
+ stop_tokens.extend(request.stop)
 
 
 
 
507
 
508
  gen_start = time.time()
509
+ output = llm(
510
+ prompt,
511
+ max_tokens=request.max_tokens or 1024,
512
+ temperature=request.temperature or 0.7,
513
+ top_p=request.top_p or 0.95,
514
+ stop=stop_tokens,
515
+ echo=False
516
+ )
517
  gen_time = time.time() - gen_start
518
 
519
+ generated_text = output["choices"][0]["text"].strip()
520
+ usage = output["usage"]
 
 
 
 
 
521
 
522
+ logger.info(f"[{chat_id}] Generated in {gen_time:.2f}s - tokens: {usage['completion_tokens']}")
523
 
524
  return OpenAIChatResponse(
525
  id=chat_id,
 
527
  model=request.model,
528
  choices=[OpenAIChoice(
529
  index=0,
530
+ message={"role": "assistant", "content": generated_text},
531
+ finish_reason="stop"
532
  )],
533
  usage=OpenAIUsage(
534
+ prompt_tokens=usage["prompt_tokens"],
535
+ completion_tokens=usage["completion_tokens"],
536
+ total_tokens=usage["total_tokens"]
537
  )
538
  )
539
 
 
541
  logger.error(f"[{chat_id}] Error: {e}", exc_info=True)
542
  raise HTTPException(status_code=500, detail=str(e))
543
 
544
+ async def openai_stream_response(request: OpenAIChatRequest, prompt: str, chat_id: str):
 
 
545
  async def generate():
546
  created = int(time.time())
547
 
 
548
  initial_chunk = {
549
  "id": chat_id,
550
  "object": "chat.completion.chunk",
 
554
  }
555
  yield f"data: {json.dumps(initial_chunk)}\n\n"
556
 
557
+ stop_tokens = ["<|im_end|>", "<|endoftext|>"]
558
+ if request.stop:
559
+ if isinstance(request.stop, str):
560
+ stop_tokens.append(request.stop)
561
+ else:
562
+ stop_tokens.extend(request.stop)
563
+
564
+ for output in llm(
565
+ prompt,
566
+ max_tokens=request.max_tokens or 1024,
567
+ temperature=request.temperature or 0.7,
568
+ top_p=request.top_p or 0.95,
569
+ stop=stop_tokens,
570
+ stream=True,
571
+ echo=False
572
+ ):
573
+ text = output["choices"][0]["text"]
 
 
 
 
574
  if text:
 
575
  chunk = {
576
  "id": chat_id,
577
  "object": "chat.completion.chunk",
 
581
  }
582
  yield f"data: {json.dumps(chunk)}\n\n"
583
 
 
 
 
 
584
  final_chunk = {
585
  "id": chat_id,
586
  "object": "chat.completion.chunk",
587
  "created": created,
588
  "model": request.model,
589
+ "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]
590
  }
591
  yield f"data: {json.dumps(final_chunk)}\n\n"
592
  yield "data: [DONE]\n\n"
593
 
594
+ return StreamingResponse(generate(), media_type="text/event-stream", headers={"Cache-Control": "no-cache"})
595
 
596
  # ============================================================
597
  # ANTHROPIC-COMPATIBLE ENDPOINTS (/anthropic)
 
599
 
600
@app.get("/anthropic/v1/models")
async def anthropic_list_models():
    """Anthropic-compatible model listing for the single served model."""
    model_card = {
        "id": "qwen2.5-coder-7b",
        "object": "model",
        "created": int(time.time()),
        "owned_by": "qwen",
        "display_name": "Qwen2.5 Coder 7B Instruct (Q4_K_M)",
        "supports_thinking": True,
        "supports_tools": True,
    }
    return {"object": "list", "data": [model_card]}
614
 
 
619
  anthropic_version: Optional[str] = Header(None, alias="anthropic-version"),
620
  anthropic_beta: Optional[str] = Header(None, alias="anthropic-beta")
621
  ):
 
622
  message_id = generate_id("msg")
623
 
624
  thinking_enabled = False
 
627
  thinking_enabled = request.thinking.type == "enabled"
628
  budget_tokens = request.thinking.budget_tokens or 1024
629
 
630
+ logger.info(f"[{message_id}] Anthropic msg - model: {request.model}, max_tokens: {request.max_tokens}, thinking: {thinking_enabled}, tools: {len(request.tools) if request.tools else 0}")
631
 
632
  try:
633
+ prompt = format_anthropic_messages(
634
+ request.messages,
635
+ request.system,
636
+ request.tools,
637
+ thinking_enabled,
638
+ budget_tokens
639
+ )
640
 
641
  if request.stream:
642
+ return await anthropic_stream_response(request, prompt, message_id, thinking_enabled)
643
 
644
  total_max_tokens = request.max_tokens + (budget_tokens if thinking_enabled else 0)
645
 
646
+ stop_tokens = ["<|im_end|>", "<|endoftext|>"]
647
+ if request.stop_sequences:
648
+ stop_tokens.extend(request.stop_sequences)
 
 
 
 
 
 
 
 
 
 
649
 
650
  gen_start = time.time()
651
+ output = llm(
652
+ prompt,
653
+ max_tokens=total_max_tokens,
654
+ temperature=request.temperature or 0.7,
655
+ top_p=request.top_p or 0.95,
656
+ top_k=request.top_k or 40,
657
+ stop=stop_tokens,
658
+ echo=False
659
+ )
660
  gen_time = time.time() - gen_start
661
 
662
+ generated_text = output["choices"][0]["text"].strip()
663
+ usage = output["usage"]
 
664
 
665
+ # Parse response for tool use, thinking, etc.
666
  content_blocks = []
667
+ stop_reason = "end_turn"
668
+
669
+ # Check for tool use
670
+ tool_call = parse_tool_use(generated_text)
671
+ if tool_call and request.tools:
672
+ tool_id = f"toolu_{uuid.uuid4().hex[:24]}"
673
+ content_blocks.append(AnthropicResponseToolUseBlock(
674
+ type="tool_use",
675
+ id=tool_id,
676
+ name=tool_call["tool"],
677
+ input=tool_call.get("arguments", {})
678
+ ))
679
+ stop_reason = "tool_use"
680
+ elif thinking_enabled:
681
  thinking_text, answer_text = parse_thinking_response(generated_text)
682
  if thinking_text:
683
  content_blocks.append(AnthropicResponseThinkingBlock(type="thinking", thinking=thinking_text))
684
  content_blocks.append(AnthropicResponseTextBlock(type="text", text=answer_text))
685
  else:
686
+ content_blocks.append(AnthropicResponseTextBlock(type="text", text=generated_text))
687
 
688
+ if usage["completion_tokens"] >= total_max_tokens:
 
689
  stop_reason = "max_tokens"
690
 
691
+ logger.info(f"[{message_id}] Generated in {gen_time:.2f}s - tokens: {usage['completion_tokens']}")
692
 
693
  return AnthropicMessageResponse(
694
  id=message_id,
695
  content=content_blocks,
696
  model=request.model,
697
  stop_reason=stop_reason,
698
+ usage=AnthropicUsage(
699
+ input_tokens=usage["prompt_tokens"],
700
+ output_tokens=usage["completion_tokens"]
701
+ )
702
  )
703
 
704
  except Exception as e:
705
  logger.error(f"[{message_id}] Error: {e}", exc_info=True)
706
  raise HTTPException(status_code=500, detail=str(e))
707
 
async def anthropic_stream_response(request: AnthropicMessageRequest, prompt: str, message_id: str, thinking_enabled: bool):
    """Stream an Anthropic Messages API response as server-sent events.

    Emits the standard event sequence: message_start, content_block_start,
    repeated content_block_delta, content_block_stop, message_delta (with
    stop_reason and output-token usage), then message_stop.

    Fix: stop_reason was hardcoded to "end_turn" even when generation was
    cut off at max_tokens. It is now derived from llama.cpp's streamed
    finish_reason ("length" -> "max_tokens"), so clients can detect
    truncated replies.
    """
    async def generate():
        start_event = {
            "type": "message_start",
            "message": {
                "id": message_id, "type": "message", "role": "assistant", "content": [],
                "model": request.model, "stop_reason": None, "stop_sequence": None,
                "usage": {"input_tokens": 0, "output_tokens": 0}
            }
        }
        yield f"event: message_start\ndata: {json.dumps(start_event)}\n\n"

        # Open the single text content block at index 0.
        yield f"event: content_block_start\ndata: {json.dumps({'type': 'content_block_start', 'index': 0, 'content_block': {'type': 'text', 'text': ''}})}\n\n"

        stop_tokens = ["<|im_end|>", "<|endoftext|>"]
        if request.stop_sequences:
            stop_tokens.extend(request.stop_sequences)

        total_tokens = 0  # approximate: one streamed chunk per token
        finish_reason = None
        # NOTE(review): the llama.cpp call is synchronous and blocks the
        # event loop while streaming — consider run_in_executor if this
        # endpoint must serve concurrent clients.
        for output in llm(
            prompt,
            max_tokens=request.max_tokens,
            temperature=request.temperature or 0.7,
            top_p=request.top_p or 0.95,
            stop=stop_tokens,
            stream=True,
            echo=False
        ):
            choice = output["choices"][0]
            if choice.get("finish_reason"):
                finish_reason = choice["finish_reason"]
            text = choice["text"]
            if text:
                total_tokens += 1
                yield f"event: content_block_delta\ndata: {json.dumps({'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': text}})}\n\n"

        # llama.cpp reports "length" when the max_tokens budget ran out
        # before a natural stop; map that to Anthropic's "max_tokens".
        stop_reason = "max_tokens" if finish_reason == "length" else "end_turn"
        yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': 0})}\n\n"
        yield f"event: message_delta\ndata: {json.dumps({'type': 'message_delta', 'delta': {'stop_reason': stop_reason}, 'usage': {'output_tokens': total_tokens}})}\n\n"
        yield f"event: message_stop\ndata: {json.dumps({'type': 'message_stop'})}\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream", headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"})
 
@app.post("/anthropic/v1/messages/count_tokens", response_model=AnthropicTokenCountResponse)
async def anthropic_count_tokens(request: AnthropicTokenCountRequest):
    """Tokenize the would-be prompt with llama.cpp and report its length.

    NOTE(review): tools and thinking preambles are not passed through here,
    so counts may be slightly lower than what /anthropic/v1/messages
    actually consumes for tool-enabled requests — confirm acceptable.
    """
    prompt = format_anthropic_messages(request.messages, request.system)
    token_ids = llm.tokenize(prompt.encode())
    return AnthropicTokenCountResponse(input_tokens=len(token_ids))
753
 
754
  if __name__ == "__main__":
requirements.txt CHANGED
@@ -1,8 +1,5 @@
1
  fastapi==0.115.5
2
  uvicorn[standard]==0.32.0
3
- transformers==4.46.2
4
- torch==2.1.2+cpu
5
- accelerate==1.1.1
6
  pydantic==2.10.1
7
  python-multipart==0.0.12
8
- numpy<2
 
1
  fastapi==0.115.5
2
  uvicorn[standard]==0.32.0
3
+ llama-cpp-python==0.3.2
 
 
4
  pydantic==2.10.1
5
  python-multipart==0.0.12