Upload folder using huggingface_hub
Browse files- Dockerfile +12 -10
- README.md +9 -8
- app.py +219 -272
- requirements.txt +1 -4
Dockerfile
CHANGED
|
@@ -2,27 +2,29 @@ FROM python:3.10-slim
|
|
| 2 |
|
| 3 |
WORKDIR /app
|
| 4 |
|
| 5 |
-
# Install
|
| 6 |
RUN apt-get update && apt-get install -y \
|
| 7 |
build-essential \
|
|
|
|
|
|
|
| 8 |
&& rm -rf /var/lib/apt/lists/*
|
| 9 |
|
| 10 |
-
# Copy requirements
|
| 11 |
COPY requirements.txt .
|
| 12 |
|
| 13 |
-
# Install Python dependencies
|
| 14 |
-
RUN pip install --no-cache-dir
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
# Copy application code
|
| 17 |
COPY app.py .
|
| 18 |
|
| 19 |
-
# Create cache directory for model
|
| 20 |
-
RUN mkdir -p /app/.cache
|
| 21 |
-
ENV HF_HOME=/app/.cache
|
| 22 |
-
ENV TRANSFORMERS_CACHE=/app/.cache
|
| 23 |
-
|
| 24 |
# Expose port
|
| 25 |
EXPOSE 7860
|
| 26 |
|
| 27 |
# Run the application
|
| 28 |
-
CMD ["
|
|
|
|
| 2 |
|
| 3 |
WORKDIR /app
|
| 4 |
|
| 5 |
+
# Install build dependencies for llama-cpp-python
|
| 6 |
RUN apt-get update && apt-get install -y \
|
| 7 |
build-essential \
|
| 8 |
+
cmake \
|
| 9 |
+
curl \
|
| 10 |
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
|
| 12 |
+
# Copy requirements
|
| 13 |
COPY requirements.txt .
|
| 14 |
|
| 15 |
+
# Install Python dependencies (llama-cpp-python compiles from source)
|
| 16 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 17 |
+
|
| 18 |
+
# Download Qwen2.5-Coder-7B-Instruct Q4_K_M GGUF
|
| 19 |
+
RUN mkdir -p /app/models && \
|
| 20 |
+
curl -L -o /app/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf \
|
| 21 |
+
"https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/qwen2.5-coder-7b-instruct-q4_k_m.gguf"
|
| 22 |
|
| 23 |
# Copy application code
|
| 24 |
COPY app.py .
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
# Expose port
|
| 27 |
EXPOSE 7860
|
| 28 |
|
| 29 |
# Run the application
|
| 30 |
+
CMD ["python", "app.py"]
|
README.md
CHANGED
|
@@ -35,7 +35,7 @@ curl -X POST "https://YOUR_SPACE.hf.space/anthropic/v1/messages" \
|
|
| 35 |
-H "x-api-key: your-api-key" \
|
| 36 |
-H "anthropic-version: 2023-06-01" \
|
| 37 |
-d '{
|
| 38 |
-
"model": "qwen2.5-coder-
|
| 39 |
"max_tokens": 256,
|
| 40 |
"messages": [
|
| 41 |
{"role": "user", "content": "Hello, how are you?"}
|
|
@@ -48,7 +48,7 @@ curl -X POST "https://YOUR_SPACE.hf.space/anthropic/v1/messages" \
|
|
| 48 |
curl -X POST "https://YOUR_SPACE.hf.space/anthropic/v1/messages" \
|
| 49 |
-H "Content-Type: application/json" \
|
| 50 |
-d '{
|
| 51 |
-
"model": "qwen2.5-coder-
|
| 52 |
"max_tokens": 256,
|
| 53 |
"stream": true,
|
| 54 |
"messages": [
|
|
@@ -69,7 +69,7 @@ client = anthropic.Anthropic(
|
|
| 69 |
)
|
| 70 |
|
| 71 |
message = client.messages.create(
|
| 72 |
-
model="qwen2.5-coder-
|
| 73 |
max_tokens=256,
|
| 74 |
messages=[{"role": "user", "content": "Hello!"}]
|
| 75 |
)
|
|
@@ -86,7 +86,7 @@ const client = new Anthropic({
|
|
| 86 |
});
|
| 87 |
|
| 88 |
const message = await client.messages.create({
|
| 89 |
-
model: 'qwen2.5-coder-
|
| 90 |
max_tokens: 256,
|
| 91 |
messages: [{ role: 'user', content: 'Hello!' }]
|
| 92 |
});
|
|
@@ -95,10 +95,11 @@ console.log(message.content[0].text);
|
|
| 95 |
|
| 96 |
## Model Info
|
| 97 |
|
| 98 |
-
- **Model**:
|
| 99 |
-
- **Parameters**:
|
| 100 |
-
- **
|
| 101 |
-
- **
|
|
|
|
| 102 |
|
| 103 |
## Rate Limits
|
| 104 |
|
|
|
|
| 35 |
-H "x-api-key: your-api-key" \
|
| 36 |
-H "anthropic-version: 2023-06-01" \
|
| 37 |
-d '{
|
| 38 |
+
"model": "qwen2.5-coder-7b",
|
| 39 |
"max_tokens": 256,
|
| 40 |
"messages": [
|
| 41 |
{"role": "user", "content": "Hello, how are you?"}
|
|
|
|
| 48 |
curl -X POST "https://YOUR_SPACE.hf.space/anthropic/v1/messages" \
|
| 49 |
-H "Content-Type: application/json" \
|
| 50 |
-d '{
|
| 51 |
+
"model": "qwen2.5-coder-7b",
|
| 52 |
"max_tokens": 256,
|
| 53 |
"stream": true,
|
| 54 |
"messages": [
|
|
|
|
| 69 |
)
|
| 70 |
|
| 71 |
message = client.messages.create(
|
| 72 |
+
model="qwen2.5-coder-7b",
|
| 73 |
max_tokens=256,
|
| 74 |
messages=[{"role": "user", "content": "Hello!"}]
|
| 75 |
)
|
|
|
|
| 86 |
});
|
| 87 |
|
| 88 |
const message = await client.messages.create({
|
| 89 |
+
model: 'qwen2.5-coder-7b',
|
| 90 |
max_tokens: 256,
|
| 91 |
messages: [{ role: 'user', content: 'Hello!' }]
|
| 92 |
});
|
|
|
|
| 95 |
|
| 96 |
## Model Info
|
| 97 |
|
| 98 |
+
- **Model**: Qwen2.5-Coder-7B-Instruct (Q4_K_M GGUF)
|
| 99 |
+
- **Parameters**: 7 Billion (quantized)
|
| 100 |
+
- **Backend**: llama.cpp
|
| 101 |
+
- **Optimized for**: Code, Tool reasoning, Agent workflows
|
| 102 |
+
- **Context Length**: 8K tokens
|
| 103 |
|
| 104 |
## Rate Limits
|
| 105 |
|
app.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
"""
|
| 2 |
Dual-Compatible API Endpoint (OpenAI + Anthropic)
|
| 3 |
-
|
| 4 |
- OpenAI format: /v1/chat/completions
|
| 5 |
- Anthropic format: /anthropic/v1/messages
|
| 6 |
"""
|
|
@@ -10,19 +10,18 @@ import time
|
|
| 10 |
import uuid
|
| 11 |
import logging
|
| 12 |
import re
|
|
|
|
| 13 |
from datetime import datetime
|
| 14 |
from logging.handlers import RotatingFileHandler
|
| 15 |
from typing import List, Optional, Union, Dict, Any, Literal
|
| 16 |
from contextlib import asynccontextmanager
|
|
|
|
| 17 |
|
| 18 |
from fastapi import FastAPI, HTTPException, Header, Request
|
| 19 |
from fastapi.responses import StreamingResponse, JSONResponse
|
| 20 |
from fastapi.middleware.cors import CORSMiddleware
|
| 21 |
from pydantic import BaseModel, Field
|
| 22 |
-
import
|
| 23 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
|
| 24 |
-
from threading import Thread
|
| 25 |
-
import json
|
| 26 |
|
| 27 |
# ============== Logging Configuration ==============
|
| 28 |
LOG_DIR = "/tmp/logs"
|
|
@@ -45,52 +44,49 @@ console_handler.setFormatter(log_format)
|
|
| 45 |
console_handler.setLevel(logging.INFO)
|
| 46 |
|
| 47 |
logging.basicConfig(level=logging.DEBUG, handlers=[file_handler, console_handler])
|
| 48 |
-
logger = logging.getLogger("
|
| 49 |
|
| 50 |
for uvicorn_logger in ["uvicorn", "uvicorn.error", "uvicorn.access"]:
|
| 51 |
uv_log = logging.getLogger(uvicorn_logger)
|
| 52 |
uv_log.handlers = [file_handler, console_handler]
|
| 53 |
|
| 54 |
logger.info("=" * 60)
|
| 55 |
-
logger.info(f"
|
| 56 |
logger.info(f"Log file: {LOG_FILE}")
|
| 57 |
logger.info("=" * 60)
|
| 58 |
|
| 59 |
# ============== Configuration ==============
|
| 60 |
-
|
| 61 |
-
|
|
|
|
|
|
|
| 62 |
|
| 63 |
-
|
| 64 |
-
tokenizer = None
|
| 65 |
|
| 66 |
@asynccontextmanager
|
| 67 |
async def lifespan(app: FastAPI):
|
| 68 |
-
global
|
| 69 |
-
logger.info(f"Loading model: {
|
| 70 |
try:
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
|
|
|
|
|
|
| 75 |
)
|
| 76 |
-
model.eval()
|
| 77 |
logger.info("Model loaded successfully!")
|
| 78 |
-
logger.info(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
|
| 79 |
except Exception as e:
|
| 80 |
logger.error(f"Failed to load model: {e}", exc_info=True)
|
| 81 |
raise
|
| 82 |
yield
|
| 83 |
-
logger.info("Shutting down
|
| 84 |
-
del
|
| 85 |
|
| 86 |
app = FastAPI(
|
| 87 |
title="Dual-Compatible API (OpenAI + Anthropic)",
|
| 88 |
-
description=""
|
| 89 |
-
|
| 90 |
-
- OpenAI format: /v1/chat/completions
|
| 91 |
-
- Anthropic format: /anthropic/v1/messages
|
| 92 |
-
""",
|
| 93 |
-
version="1.0.0",
|
| 94 |
lifespan=lifespan
|
| 95 |
)
|
| 96 |
|
|
@@ -118,7 +114,7 @@ async def log_requests(request: Request, call_next):
|
|
| 118 |
raise
|
| 119 |
|
| 120 |
# ============================================================
|
| 121 |
-
# ANTHROPIC-COMPATIBLE MODELS
|
| 122 |
# ============================================================
|
| 123 |
|
| 124 |
class AnthropicTextBlock(BaseModel):
|
|
@@ -198,7 +194,7 @@ class AnthropicMessageRequest(BaseModel):
|
|
| 198 |
stop_sequences: Optional[List[str]] = None
|
| 199 |
stream: Optional[bool] = False
|
| 200 |
system: Optional[Union[str, List[AnthropicSystemContent]]] = None
|
| 201 |
-
temperature: Optional[float] = Field(default=
|
| 202 |
tool_choice: Optional[AnthropicToolChoice] = None
|
| 203 |
tools: Optional[List[AnthropicTool]] = None
|
| 204 |
top_k: Optional[int] = Field(default=None, ge=0)
|
|
@@ -248,7 +244,7 @@ class AnthropicTokenCountResponse(BaseModel):
|
|
| 248 |
input_tokens: int
|
| 249 |
|
| 250 |
# ============================================================
|
| 251 |
-
# OPENAI-COMPATIBLE MODELS
|
| 252 |
# ============================================================
|
| 253 |
|
| 254 |
class OpenAIMessage(BaseModel):
|
|
@@ -270,8 +266,8 @@ class OpenAIChatRequest(BaseModel):
|
|
| 270 |
model: str
|
| 271 |
messages: List[OpenAIMessage]
|
| 272 |
max_tokens: Optional[int] = 1024
|
| 273 |
-
temperature: Optional[float] = Field(default=
|
| 274 |
-
top_p: Optional[float] = Field(default=
|
| 275 |
n: Optional[int] = 1
|
| 276 |
stream: Optional[bool] = False
|
| 277 |
stop: Optional[Union[str, List[str]]] = None
|
|
@@ -302,18 +298,6 @@ class OpenAIChatResponse(BaseModel):
|
|
| 302 |
usage: OpenAIUsage
|
| 303 |
system_fingerprint: Optional[str] = None
|
| 304 |
|
| 305 |
-
class OpenAIStreamChoice(BaseModel):
|
| 306 |
-
index: int
|
| 307 |
-
delta: Dict[str, Any]
|
| 308 |
-
finish_reason: Optional[str] = None
|
| 309 |
-
|
| 310 |
-
class OpenAIStreamResponse(BaseModel):
|
| 311 |
-
id: str
|
| 312 |
-
object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
|
| 313 |
-
created: int
|
| 314 |
-
model: str
|
| 315 |
-
choices: List[OpenAIStreamChoice]
|
| 316 |
-
|
| 317 |
class OpenAIModel(BaseModel):
|
| 318 |
id: str
|
| 319 |
object: Literal["model"] = "model"
|
|
@@ -362,64 +346,76 @@ def extract_openai_content(content: Optional[Union[str, List[Dict[str, Any]]]])
|
|
| 362 |
texts.append(item.get("text", ""))
|
| 363 |
return " ".join(texts)
|
| 364 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
def format_anthropic_messages(
|
| 366 |
messages: List[AnthropicMessage],
|
| 367 |
system: Optional[Union[str, List[AnthropicSystemContent]]] = None,
|
|
|
|
| 368 |
thinking_enabled: bool = False,
|
| 369 |
budget_tokens: int = 1024
|
| 370 |
) -> str:
|
| 371 |
formatted_messages = []
|
| 372 |
-
system_text = extract_anthropic_system(system)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 373 |
|
| 374 |
-
|
| 375 |
-
thinking_instruction = f"""You are a helpful AI assistant with extended thinking capabilities.
|
| 376 |
|
| 377 |
-
|
| 378 |
-
1. First, think through the problem step by step inside <thinking>...</thinking> tags
|
| 379 |
-
2. Consider multiple approaches and evaluate them
|
| 380 |
-
3. Show your reasoning process clearly
|
| 381 |
-
4. After thinking, provide your final answer outside the thinking tags
|
| 382 |
|
| 383 |
-
|
|
|
|
| 384 |
|
| 385 |
-
|
| 386 |
-
if system_text
|
| 387 |
-
system_text = f"{thinking_instruction}\n\n{system_text}"
|
| 388 |
-
else:
|
| 389 |
-
system_text = thinking_instruction
|
| 390 |
|
| 391 |
-
if
|
| 392 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
|
| 394 |
for msg in messages:
|
| 395 |
content = extract_anthropic_text(msg.content)
|
| 396 |
formatted_messages.append({"role": msg.role, "content": content})
|
| 397 |
|
| 398 |
-
if
|
| 399 |
-
return tokenizer.apply_chat_template(formatted_messages, tokenize=False, add_generation_prompt=True)
|
| 400 |
-
|
| 401 |
-
prompt = ""
|
| 402 |
-
for msg in formatted_messages:
|
| 403 |
-
role = msg["role"].capitalize()
|
| 404 |
-
prompt += f"{role}: {msg['content']}\n"
|
| 405 |
-
prompt += "Assistant: "
|
| 406 |
-
return prompt
|
| 407 |
|
| 408 |
def format_openai_messages(messages: List[OpenAIMessage]) -> str:
|
|
|
|
| 409 |
formatted_messages = []
|
| 410 |
-
for msg in messages:
|
| 411 |
-
content = extract_openai_content(msg.content)
|
| 412 |
-
formatted_messages.append({"role": msg.role, "content": content})
|
| 413 |
|
| 414 |
-
|
| 415 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 416 |
|
| 417 |
-
|
| 418 |
-
for msg in formatted_messages:
|
| 419 |
-
role = msg["role"].capitalize()
|
| 420 |
-
prompt += f"{role}: {msg['content']}\n"
|
| 421 |
-
prompt += "Assistant: "
|
| 422 |
-
return prompt
|
| 423 |
|
| 424 |
def parse_thinking_response(text: str) -> tuple:
|
| 425 |
thinking_pattern = r'<thinking>(.*?)</thinking>'
|
|
@@ -430,6 +426,21 @@ def parse_thinking_response(text: str) -> tuple:
|
|
| 430 |
return thinking_text, answer_text
|
| 431 |
return None, text.strip()
|
| 432 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 433 |
def generate_id(prefix: str = "msg") -> str:
|
| 434 |
return f"{prefix}_{uuid.uuid4().hex[:24]}"
|
| 435 |
|
|
@@ -439,17 +450,14 @@ def generate_id(prefix: str = "msg") -> str:
|
|
| 439 |
async def root():
|
| 440 |
return {
|
| 441 |
"status": "healthy",
|
| 442 |
-
"model":
|
|
|
|
| 443 |
"endpoints": {
|
| 444 |
"openai": "/v1/chat/completions",
|
| 445 |
"anthropic": "/anthropic/v1/messages"
|
| 446 |
},
|
| 447 |
-
"
|
| 448 |
-
|
| 449 |
-
"anthropic_sdk": "https://likhonsheikh-anthropic-compatible-api.hf.space/anthropic"
|
| 450 |
-
},
|
| 451 |
-
"features": ["extended-thinking", "streaming", "dual-compatibility"],
|
| 452 |
-
"log_file": LOG_FILE
|
| 453 |
}
|
| 454 |
|
| 455 |
@app.get("/logs")
|
|
@@ -458,13 +466,13 @@ async def get_logs(lines: int = 100):
|
|
| 458 |
with open(LOG_FILE, 'r') as f:
|
| 459 |
all_lines = f.readlines()
|
| 460 |
recent_lines = all_lines[-lines:] if len(all_lines) > lines else all_lines
|
| 461 |
-
return {"log_file": LOG_FILE, "total_lines": len(all_lines), "
|
| 462 |
except FileNotFoundError:
|
| 463 |
-
return {"error": "Log file not found"
|
| 464 |
|
| 465 |
@app.get("/health")
|
| 466 |
async def health():
|
| 467 |
-
return {"status": "ok", "model_loaded":
|
| 468 |
|
| 469 |
# ============================================================
|
| 470 |
# OPENAI-COMPATIBLE ENDPOINTS (/v1)
|
|
@@ -472,9 +480,8 @@ async def health():
|
|
| 472 |
|
| 473 |
@app.get("/v1/models")
|
| 474 |
async def openai_list_models():
|
| 475 |
-
"""List models (OpenAI format)"""
|
| 476 |
return OpenAIModelList(
|
| 477 |
-
data=[OpenAIModel(id="qwen2.5-coder-
|
| 478 |
)
|
| 479 |
|
| 480 |
@app.post("/v1/chat/completions")
|
|
@@ -482,54 +489,37 @@ async def openai_chat_completions(
|
|
| 482 |
request: OpenAIChatRequest,
|
| 483 |
authorization: Optional[str] = Header(None)
|
| 484 |
):
|
| 485 |
-
"""Chat completions (OpenAI format)"""
|
| 486 |
chat_id = generate_id("chatcmpl")
|
| 487 |
-
logger.info(f"[{chat_id}] OpenAI chat - model: {request.model}, max_tokens: {request.max_tokens}
|
| 488 |
|
| 489 |
try:
|
| 490 |
prompt = format_openai_messages(request.messages)
|
| 491 |
-
inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
|
| 492 |
-
input_token_count = inputs.input_ids.shape[1]
|
| 493 |
|
| 494 |
if request.stream:
|
| 495 |
-
return await openai_stream_response(request,
|
| 496 |
-
|
| 497 |
-
gen_kwargs = {
|
| 498 |
-
"max_new_tokens": request.max_tokens or 1024,
|
| 499 |
-
"do_sample": request.temperature > 0 if request.temperature else False,
|
| 500 |
-
"pad_token_id": tokenizer.eos_token_id,
|
| 501 |
-
"eos_token_id": tokenizer.eos_token_id,
|
| 502 |
-
}
|
| 503 |
-
|
| 504 |
-
if request.temperature and request.temperature > 0:
|
| 505 |
-
gen_kwargs["temperature"] = min(request.temperature, 1.0)
|
| 506 |
-
if request.top_p:
|
| 507 |
-
gen_kwargs["top_p"] = request.top_p
|
| 508 |
|
|
|
|
| 509 |
if request.stop:
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
if tokens:
|
| 515 |
-
stop_ids.extend(tokens)
|
| 516 |
-
if stop_ids:
|
| 517 |
-
gen_kwargs["eos_token_id"] = list(set([tokenizer.eos_token_id] + stop_ids))
|
| 518 |
|
| 519 |
gen_start = time.time()
|
| 520 |
-
|
| 521 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 522 |
gen_time = time.time() - gen_start
|
| 523 |
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
output_token_count = len(generated_tokens)
|
| 527 |
-
|
| 528 |
-
finish_reason = "stop"
|
| 529 |
-
if output_token_count >= (request.max_tokens or 1024):
|
| 530 |
-
finish_reason = "length"
|
| 531 |
|
| 532 |
-
logger.info(f"[{chat_id}] Generated
|
| 533 |
|
| 534 |
return OpenAIChatResponse(
|
| 535 |
id=chat_id,
|
|
@@ -537,13 +527,13 @@ async def openai_chat_completions(
|
|
| 537 |
model=request.model,
|
| 538 |
choices=[OpenAIChoice(
|
| 539 |
index=0,
|
| 540 |
-
message={"role": "assistant", "content": generated_text
|
| 541 |
-
finish_reason=
|
| 542 |
)],
|
| 543 |
usage=OpenAIUsage(
|
| 544 |
-
prompt_tokens=
|
| 545 |
-
completion_tokens=
|
| 546 |
-
total_tokens=
|
| 547 |
)
|
| 548 |
)
|
| 549 |
|
|
@@ -551,13 +541,10 @@ async def openai_chat_completions(
|
|
| 551 |
logger.error(f"[{chat_id}] Error: {e}", exc_info=True)
|
| 552 |
raise HTTPException(status_code=500, detail=str(e))
|
| 553 |
|
| 554 |
-
async def openai_stream_response(request: OpenAIChatRequest,
|
| 555 |
-
"""Stream response in OpenAI format"""
|
| 556 |
-
|
| 557 |
async def generate():
|
| 558 |
created = int(time.time())
|
| 559 |
|
| 560 |
-
# Initial chunk with role
|
| 561 |
initial_chunk = {
|
| 562 |
"id": chat_id,
|
| 563 |
"object": "chat.completion.chunk",
|
|
@@ -567,29 +554,24 @@ async def openai_stream_response(request: OpenAIChatRequest, inputs, input_token
|
|
| 567 |
}
|
| 568 |
yield f"data: {json.dumps(initial_chunk)}\n\n"
|
| 569 |
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
thread.start()
|
| 588 |
-
|
| 589 |
-
output_tokens = 0
|
| 590 |
-
for text in streamer:
|
| 591 |
if text:
|
| 592 |
-
output_tokens += len(tokenizer.encode(text, add_special_tokens=False))
|
| 593 |
chunk = {
|
| 594 |
"id": chat_id,
|
| 595 |
"object": "chat.completion.chunk",
|
|
@@ -599,21 +581,17 @@ async def openai_stream_response(request: OpenAIChatRequest, inputs, input_token
|
|
| 599 |
}
|
| 600 |
yield f"data: {json.dumps(chunk)}\n\n"
|
| 601 |
|
| 602 |
-
thread.join()
|
| 603 |
-
|
| 604 |
-
# Final chunk
|
| 605 |
-
finish_reason = "length" if output_tokens >= (request.max_tokens or 1024) else "stop"
|
| 606 |
final_chunk = {
|
| 607 |
"id": chat_id,
|
| 608 |
"object": "chat.completion.chunk",
|
| 609 |
"created": created,
|
| 610 |
"model": request.model,
|
| 611 |
-
"choices": [{"index": 0, "delta": {}, "finish_reason":
|
| 612 |
}
|
| 613 |
yield f"data: {json.dumps(final_chunk)}\n\n"
|
| 614 |
yield "data: [DONE]\n\n"
|
| 615 |
|
| 616 |
-
return StreamingResponse(generate(), media_type="text/event-stream", headers={"Cache-Control": "no-cache"
|
| 617 |
|
| 618 |
# ============================================================
|
| 619 |
# ANTHROPIC-COMPATIBLE ENDPOINTS (/anthropic)
|
|
@@ -621,16 +599,16 @@ async def openai_stream_response(request: OpenAIChatRequest, inputs, input_token
|
|
| 621 |
|
| 622 |
@app.get("/anthropic/v1/models")
|
| 623 |
async def anthropic_list_models():
|
| 624 |
-
"""List models (Anthropic format)"""
|
| 625 |
return {
|
| 626 |
"object": "list",
|
| 627 |
"data": [{
|
| 628 |
-
"id": "qwen2.5-coder-
|
| 629 |
"object": "model",
|
| 630 |
"created": int(time.time()),
|
| 631 |
"owned_by": "qwen",
|
| 632 |
-
"display_name": "Qwen2.5 Coder
|
| 633 |
-
"supports_thinking": True
|
|
|
|
| 634 |
}]
|
| 635 |
}
|
| 636 |
|
|
@@ -641,7 +619,6 @@ async def anthropic_create_message(
|
|
| 641 |
anthropic_version: Optional[str] = Header(None, alias="anthropic-version"),
|
| 642 |
anthropic_beta: Optional[str] = Header(None, alias="anthropic-beta")
|
| 643 |
):
|
| 644 |
-
"""Create message (Anthropic format with Extended Thinking)"""
|
| 645 |
message_id = generate_id("msg")
|
| 646 |
|
| 647 |
thinking_enabled = False
|
|
@@ -650,158 +627,128 @@ async def anthropic_create_message(
|
|
| 650 |
thinking_enabled = request.thinking.type == "enabled"
|
| 651 |
budget_tokens = request.thinking.budget_tokens or 1024
|
| 652 |
|
| 653 |
-
logger.info(f"[{message_id}] Anthropic msg - model: {request.model}, max_tokens: {request.max_tokens}, thinking: {thinking_enabled}")
|
| 654 |
|
| 655 |
try:
|
| 656 |
-
prompt = format_anthropic_messages(
|
| 657 |
-
|
| 658 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 659 |
|
| 660 |
if request.stream:
|
| 661 |
-
return await anthropic_stream_response(request,
|
| 662 |
|
| 663 |
total_max_tokens = request.max_tokens + (budget_tokens if thinking_enabled else 0)
|
| 664 |
|
| 665 |
-
|
| 666 |
-
|
| 667 |
-
|
| 668 |
-
"pad_token_id": tokenizer.eos_token_id,
|
| 669 |
-
"eos_token_id": tokenizer.eos_token_id,
|
| 670 |
-
}
|
| 671 |
-
|
| 672 |
-
if request.temperature and request.temperature > 0:
|
| 673 |
-
gen_kwargs["temperature"] = request.temperature
|
| 674 |
-
if request.top_p:
|
| 675 |
-
gen_kwargs["top_p"] = request.top_p
|
| 676 |
-
if request.top_k:
|
| 677 |
-
gen_kwargs["top_k"] = request.top_k
|
| 678 |
|
| 679 |
gen_start = time.time()
|
| 680 |
-
|
| 681 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 682 |
gen_time = time.time() - gen_start
|
| 683 |
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
output_token_count = len(generated_tokens)
|
| 687 |
|
|
|
|
| 688 |
content_blocks = []
|
| 689 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 690 |
thinking_text, answer_text = parse_thinking_response(generated_text)
|
| 691 |
if thinking_text:
|
| 692 |
content_blocks.append(AnthropicResponseThinkingBlock(type="thinking", thinking=thinking_text))
|
| 693 |
content_blocks.append(AnthropicResponseTextBlock(type="text", text=answer_text))
|
| 694 |
else:
|
| 695 |
-
content_blocks.append(AnthropicResponseTextBlock(type="text", text=generated_text
|
| 696 |
|
| 697 |
-
|
| 698 |
-
if output_token_count >= total_max_tokens:
|
| 699 |
stop_reason = "max_tokens"
|
| 700 |
|
| 701 |
-
logger.info(f"[{message_id}] Generated
|
| 702 |
|
| 703 |
return AnthropicMessageResponse(
|
| 704 |
id=message_id,
|
| 705 |
content=content_blocks,
|
| 706 |
model=request.model,
|
| 707 |
stop_reason=stop_reason,
|
| 708 |
-
usage=AnthropicUsage(
|
|
|
|
|
|
|
|
|
|
| 709 |
)
|
| 710 |
|
| 711 |
except Exception as e:
|
| 712 |
logger.error(f"[{message_id}] Error: {e}", exc_info=True)
|
| 713 |
raise HTTPException(status_code=500, detail=str(e))
|
| 714 |
|
| 715 |
-
async def anthropic_stream_response(request: AnthropicMessageRequest,
|
| 716 |
-
"""Stream response in Anthropic format"""
|
| 717 |
-
|
| 718 |
async def generate():
|
| 719 |
start_event = {
|
| 720 |
"type": "message_start",
|
| 721 |
"message": {
|
| 722 |
"id": message_id, "type": "message", "role": "assistant", "content": [],
|
| 723 |
"model": request.model, "stop_reason": None, "stop_sequence": None,
|
| 724 |
-
"usage": {"input_tokens":
|
| 725 |
}
|
| 726 |
}
|
| 727 |
yield f"event: message_start\ndata: {json.dumps(start_event)}\n\n"
|
| 728 |
-
yield f"event: ping\ndata: {json.dumps({'type': 'ping'})}\n\n"
|
| 729 |
-
|
| 730 |
-
block_index = 0
|
| 731 |
-
in_thinking = False
|
| 732 |
-
thinking_started = False
|
| 733 |
-
text_block_started = False
|
| 734 |
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
| 738 |
-
|
| 739 |
-
|
| 740 |
-
|
| 741 |
-
|
| 742 |
-
|
| 743 |
-
|
| 744 |
-
|
| 745 |
-
|
| 746 |
-
|
| 747 |
-
|
| 748 |
-
|
| 749 |
-
|
| 750 |
-
|
| 751 |
-
|
| 752 |
-
|
| 753 |
-
|
| 754 |
-
thread = Thread(target=model.generate, kwargs=gen_kwargs)
|
| 755 |
-
thread.start()
|
| 756 |
-
|
| 757 |
-
output_tokens = 0
|
| 758 |
-
accumulated_text = ""
|
| 759 |
-
|
| 760 |
-
for text in streamer:
|
| 761 |
if text:
|
| 762 |
-
|
| 763 |
-
|
| 764 |
-
|
| 765 |
-
|
| 766 |
-
|
| 767 |
-
thinking_started = True
|
| 768 |
-
in_thinking = True
|
| 769 |
-
yield f"event: content_block_start\ndata: {json.dumps({'type': 'content_block_start', 'index': block_index, 'content_block': {'type': 'thinking', 'thinking': ''}})}\n\n"
|
| 770 |
-
|
| 771 |
-
if in_thinking:
|
| 772 |
-
clean_text = text.replace("<thinking>", "").replace("</thinking>", "")
|
| 773 |
-
if clean_text:
|
| 774 |
-
yield f"event: content_block_delta\ndata: {json.dumps({'type': 'content_block_delta', 'index': block_index, 'delta': {'type': 'thinking_delta', 'thinking': clean_text}})}\n\n"
|
| 775 |
-
if "</thinking>" in accumulated_text:
|
| 776 |
-
in_thinking = False
|
| 777 |
-
yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': block_index})}\n\n"
|
| 778 |
-
block_index += 1
|
| 779 |
-
text_block_started = True
|
| 780 |
-
yield f"event: content_block_start\ndata: {json.dumps({'type': 'content_block_start', 'index': block_index, 'content_block': {'type': 'text', 'text': ''}})}\n\n"
|
| 781 |
-
elif text_block_started:
|
| 782 |
-
yield f"event: content_block_delta\ndata: {json.dumps({'type': 'content_block_delta', 'index': block_index, 'delta': {'type': 'text_delta', 'text': text}})}\n\n"
|
| 783 |
-
else:
|
| 784 |
-
if not text_block_started:
|
| 785 |
-
text_block_started = True
|
| 786 |
-
yield f"event: content_block_start\ndata: {json.dumps({'type': 'content_block_start', 'index': 0, 'content_block': {'type': 'text', 'text': ''}})}\n\n"
|
| 787 |
-
yield f"event: content_block_delta\ndata: {json.dumps({'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': text}})}\n\n"
|
| 788 |
-
|
| 789 |
-
thread.join()
|
| 790 |
-
|
| 791 |
-
yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': block_index})}\n\n"
|
| 792 |
-
|
| 793 |
-
stop_reason = "max_tokens" if output_tokens >= total_max_tokens else "end_turn"
|
| 794 |
-
yield f"event: message_delta\ndata: {json.dumps({'type': 'message_delta', 'delta': {'stop_reason': stop_reason}, 'usage': {'output_tokens': output_tokens}})}\n\n"
|
| 795 |
yield f"event: message_stop\ndata: {json.dumps({'type': 'message_stop'})}\n\n"
|
| 796 |
|
| 797 |
-
return StreamingResponse(generate(), media_type="text/event-stream", headers={"Cache-Control": "no-cache", "
|
| 798 |
|
| 799 |
@app.post("/anthropic/v1/messages/count_tokens", response_model=AnthropicTokenCountResponse)
|
| 800 |
async def anthropic_count_tokens(request: AnthropicTokenCountRequest):
|
| 801 |
-
|
| 802 |
-
|
| 803 |
-
prompt = format_anthropic_messages(request.messages, request.system, thinking_enabled, budget_tokens)
|
| 804 |
-
tokens = tokenizer.encode(prompt)
|
| 805 |
return AnthropicTokenCountResponse(input_tokens=len(tokens))
|
| 806 |
|
| 807 |
if __name__ == "__main__":
|
|
|
|
| 1 |
"""
|
| 2 |
Dual-Compatible API Endpoint (OpenAI + Anthropic)
|
| 3 |
+
llama.cpp powered - Qwen2.5-Coder-7B-Instruct Q4_K_M
|
| 4 |
- OpenAI format: /v1/chat/completions
|
| 5 |
- Anthropic format: /anthropic/v1/messages
|
| 6 |
"""
|
|
|
|
| 10 |
import uuid
|
| 11 |
import logging
|
| 12 |
import re
|
| 13 |
+
import json
|
| 14 |
from datetime import datetime
|
| 15 |
from logging.handlers import RotatingFileHandler
|
| 16 |
from typing import List, Optional, Union, Dict, Any, Literal
|
| 17 |
from contextlib import asynccontextmanager
|
| 18 |
+
from threading import Thread
|
| 19 |
|
| 20 |
from fastapi import FastAPI, HTTPException, Header, Request
|
| 21 |
from fastapi.responses import StreamingResponse, JSONResponse
|
| 22 |
from fastapi.middleware.cors import CORSMiddleware
|
| 23 |
from pydantic import BaseModel, Field
|
| 24 |
+
from llama_cpp import Llama
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
# ============== Logging Configuration ==============
|
| 27 |
LOG_DIR = "/tmp/logs"
|
|
|
|
| 44 |
console_handler.setLevel(logging.INFO)
|
| 45 |
|
| 46 |
logging.basicConfig(level=logging.DEBUG, handlers=[file_handler, console_handler])
|
| 47 |
+
logger = logging.getLogger("llama-api")
|
| 48 |
|
| 49 |
for uvicorn_logger in ["uvicorn", "uvicorn.error", "uvicorn.access"]:
|
| 50 |
uv_log = logging.getLogger(uvicorn_logger)
|
| 51 |
uv_log.handlers = [file_handler, console_handler]
|
| 52 |
|
| 53 |
logger.info("=" * 60)
|
| 54 |
+
logger.info(f"llama.cpp API (OpenAI + Anthropic) Startup at {datetime.now().isoformat()}")
|
| 55 |
logger.info(f"Log file: {LOG_FILE}")
|
| 56 |
logger.info("=" * 60)
|
| 57 |
|
| 58 |
# ============== Configuration ==============
|
| 59 |
+
MODEL_PATH = "/app/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf"
|
| 60 |
+
N_CTX = 8192 # Context window
|
| 61 |
+
N_THREADS = 2 # CPU threads
|
| 62 |
+
N_BATCH = 128 # Batch size
|
| 63 |
|
| 64 |
+
llm = None
|
|
|
|
| 65 |
|
| 66 |
@asynccontextmanager
|
| 67 |
async def lifespan(app: FastAPI):
|
| 68 |
+
global llm
|
| 69 |
+
logger.info(f"Loading model: {MODEL_PATH}")
|
| 70 |
try:
|
| 71 |
+
llm = Llama(
|
| 72 |
+
model_path=MODEL_PATH,
|
| 73 |
+
n_ctx=N_CTX,
|
| 74 |
+
n_threads=N_THREADS,
|
| 75 |
+
n_batch=N_BATCH,
|
| 76 |
+
verbose=True
|
| 77 |
)
|
|
|
|
| 78 |
logger.info("Model loaded successfully!")
|
|
|
|
| 79 |
except Exception as e:
|
| 80 |
logger.error(f"Failed to load model: {e}", exc_info=True)
|
| 81 |
raise
|
| 82 |
yield
|
| 83 |
+
logger.info("Shutting down...")
|
| 84 |
+
del llm
|
| 85 |
|
| 86 |
app = FastAPI(
|
| 87 |
title="Dual-Compatible API (OpenAI + Anthropic)",
|
| 88 |
+
description="llama.cpp powered API with dual SDK compatibility",
|
| 89 |
+
version="2.0.0",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
lifespan=lifespan
|
| 91 |
)
|
| 92 |
|
|
|
|
| 114 |
raise
|
| 115 |
|
| 116 |
# ============================================================
|
| 117 |
+
# ANTHROPIC-COMPATIBLE MODELS
|
| 118 |
# ============================================================
|
| 119 |
|
| 120 |
class AnthropicTextBlock(BaseModel):
|
|
|
|
| 194 |
stop_sequences: Optional[List[str]] = None
|
| 195 |
stream: Optional[bool] = False
|
| 196 |
system: Optional[Union[str, List[AnthropicSystemContent]]] = None
|
| 197 |
+
temperature: Optional[float] = Field(default=0.7, ge=0.0, le=1.0)
|
| 198 |
tool_choice: Optional[AnthropicToolChoice] = None
|
| 199 |
tools: Optional[List[AnthropicTool]] = None
|
| 200 |
top_k: Optional[int] = Field(default=None, ge=0)
|
|
|
|
| 244 |
input_tokens: int
|
| 245 |
|
| 246 |
# ============================================================
|
| 247 |
+
# OPENAI-COMPATIBLE MODELS
|
| 248 |
# ============================================================
|
| 249 |
|
| 250 |
class OpenAIMessage(BaseModel):
|
|
|
|
| 266 |
model: str
|
| 267 |
messages: List[OpenAIMessage]
|
| 268 |
max_tokens: Optional[int] = 1024
|
| 269 |
+
temperature: Optional[float] = Field(default=0.7, ge=0.0, le=2.0)
|
| 270 |
+
top_p: Optional[float] = Field(default=0.95, ge=0.0, le=1.0)
|
| 271 |
n: Optional[int] = 1
|
| 272 |
stream: Optional[bool] = False
|
| 273 |
stop: Optional[Union[str, List[str]]] = None
|
|
|
|
| 298 |
usage: OpenAIUsage
|
| 299 |
system_fingerprint: Optional[str] = None
|
| 300 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
class OpenAIModel(BaseModel):
|
| 302 |
id: str
|
| 303 |
object: Literal["model"] = "model"
|
|
|
|
| 346 |
texts.append(item.get("text", ""))
|
| 347 |
return " ".join(texts)
|
| 348 |
|
| 349 |
+
def format_chat_prompt(messages: List[Dict[str, str]], system: Optional[str] = None) -> str:
|
| 350 |
+
"""Format messages for Qwen2.5 chat template"""
|
| 351 |
+
prompt = ""
|
| 352 |
+
if system:
|
| 353 |
+
prompt += f"<|im_start|>system\n{system}<|im_end|>\n"
|
| 354 |
+
|
| 355 |
+
for msg in messages:
|
| 356 |
+
role = msg["role"]
|
| 357 |
+
content = msg["content"]
|
| 358 |
+
prompt += f"<|im_start|>{role}\n{content}<|im_end|>\n"
|
| 359 |
+
|
| 360 |
+
prompt += "<|im_start|>assistant\n"
|
| 361 |
+
return prompt
|
| 362 |
+
|
| 363 |
def format_anthropic_messages(
    messages: List[AnthropicMessage],
    system: Optional[Union[str, List[AnthropicSystemContent]]] = None,
    tools: Optional[List[AnthropicTool]] = None,
    thinking_enabled: bool = False,
    budget_tokens: int = 1024
) -> str:
    """Build a ChatML prompt from Anthropic-style messages.

    Tool definitions and extended-thinking instructions, when present, are
    prepended (in that order) to any caller-supplied system prompt before the
    conversation is rendered via format_chat_prompt.
    """
    system_text = extract_anthropic_system(system) or ""

    def _prepend(instruction: str, existing: str) -> str:
        # Instructions come before the caller's system prompt, separated by a
        # blank line; with no existing prompt the instruction stands alone.
        return f"{instruction}\n\n{existing}" if existing else instruction

    if tools:
        tool_defs = [
            {
                "name": tool.name,
                "description": tool.description,
                "parameters": tool.input_schema.model_dump(),
            }
            for tool in tools
        ]
        tool_instruction = f"""You have access to the following tools:

{json.dumps(tool_defs, indent=2)}

To use a tool, respond with a JSON object in this exact format:
{{"tool": "tool_name", "arguments": {{"arg1": "value1"}}}}

Only use tools when necessary. If you don't need a tool, respond normally."""
        system_text = _prepend(tool_instruction, system_text)

    if thinking_enabled:
        thinking_instruction = f"""When solving complex problems:
1. Think through the problem step by step inside <thinking>...</thinking> tags
2. After thinking, provide your final answer outside the thinking tags
Budget for thinking: up to {budget_tokens} tokens."""
        system_text = _prepend(thinking_instruction, system_text)

    chat_turns = [
        {"role": msg.role, "content": extract_anthropic_text(msg.content)}
        for msg in messages
    ]
    return format_chat_prompt(chat_turns, system_text if system_text else None)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
|
| 407 |
def format_openai_messages(messages: List[OpenAIMessage]) -> str:
    """Convert OpenAI-style chat messages into a ChatML prompt.

    System messages are lifted out and passed as the system prompt; if
    several appear, the last one wins. All other roles become chat turns.
    """
    system_text = None
    chat_turns = []

    for message in messages:
        text = extract_openai_content(message.content)
        if message.role == "system":
            system_text = text
        else:
            chat_turns.append({"role": message.role, "content": text})

    return format_chat_prompt(chat_turns, system_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 419 |
|
| 420 |
def parse_thinking_response(text: str) -> tuple:
|
| 421 |
thinking_pattern = r'<thinking>(.*?)</thinking>'
|
|
|
|
| 426 |
return thinking_text, answer_text
|
| 427 |
return None, text.strip()
|
| 428 |
|
| 429 |
+
def parse_tool_use(text: str) -> Optional[Dict[str, Any]]:
    """Parse a tool call from a model response.

    The system prompt instructs the model to emit
    ``{"tool": "tool_name", "arguments": {...}}``, so the ``arguments``
    value is itself a brace-delimited object. The regex therefore allows
    one level of nested braces (the previous flat pattern
    ``\\{[^{}]*"tool"[^{}]*\\}`` could never match that format).

    Returns the first parsed dict containing a ``"tool"`` key, or None
    when no valid tool call is found.
    """
    # Outer object whose members are either plain characters or one nested
    # {...} object; ``[^{}]`` keeps separate top-level objects from merging.
    json_pattern = r'\{(?:[^{}]|\{[^{}]*\})*"tool"(?:[^{}]|\{[^{}]*\})*\}'
    for candidate in re.findall(json_pattern, text, re.DOTALL):
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            # A malformed candidate should not abort the scan (the old bare
            # ``except`` returned None immediately); try the next match.
            continue
        if isinstance(parsed, dict) and "tool" in parsed:
            return parsed
    return None
|
| 443 |
+
|
| 444 |
def generate_id(prefix: str = "msg") -> str:
    """Build a short unique identifier like ``msg_<24 hex chars>``."""
    token = uuid.uuid4().hex
    return "{}_{}".format(prefix, token[:24])
|
| 446 |
|
|
|
|
| 450 |
async def root():
    """Service banner: model info, available endpoints, and feature flags."""
    info = {
        "status": "healthy",
        "model": "qwen2.5-coder-7b-instruct-q4_k_m",
        "backend": "llama.cpp",
    }
    info["endpoints"] = {
        "openai": "/v1/chat/completions",
        "anthropic": "/anthropic/v1/messages",
    }
    info["features"] = ["extended-thinking", "streaming", "tool-use", "dual-compatibility"]
    # N_CTX is the module-level llama.cpp context window size.
    info["context_length"] = N_CTX
    return info
|
| 462 |
|
| 463 |
@app.get("/logs")
|
|
|
|
| 466 |
with open(LOG_FILE, 'r') as f:
|
| 467 |
all_lines = f.readlines()
|
| 468 |
recent_lines = all_lines[-lines:] if len(all_lines) > lines else all_lines
|
| 469 |
+
return {"log_file": LOG_FILE, "total_lines": len(all_lines), "logs": "".join(recent_lines)}
|
| 470 |
except FileNotFoundError:
|
| 471 |
+
return {"error": "Log file not found"}
|
| 472 |
|
| 473 |
@app.get("/health")
async def health():
    """Liveness probe; also reports whether the GGUF model finished loading."""
    model_ready = llm is not None
    return {"status": "ok", "model_loaded": model_ready, "backend": "llama.cpp"}
|
| 476 |
|
| 477 |
# ============================================================
|
| 478 |
# OPENAI-COMPATIBLE ENDPOINTS (/v1)
|
|
|
|
| 480 |
|
| 481 |
@app.get("/v1/models")
async def openai_list_models():
    """OpenAI-compatible model listing (this server hosts a single model)."""
    model = OpenAIModel(id="qwen2.5-coder-7b", created=int(time.time()), owned_by="qwen")
    return OpenAIModelList(data=[model])
|
| 486 |
|
| 487 |
@app.post("/v1/chat/completions")
|
|
|
|
| 489 |
request: OpenAIChatRequest,
|
| 490 |
authorization: Optional[str] = Header(None)
|
| 491 |
):
|
|
|
|
| 492 |
chat_id = generate_id("chatcmpl")
|
| 493 |
+
logger.info(f"[{chat_id}] OpenAI chat - model: {request.model}, max_tokens: {request.max_tokens}")
|
| 494 |
|
| 495 |
try:
|
| 496 |
prompt = format_openai_messages(request.messages)
|
|
|
|
|
|
|
| 497 |
|
| 498 |
if request.stream:
|
| 499 |
+
return await openai_stream_response(request, prompt, chat_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 500 |
|
| 501 |
+
stop_tokens = ["<|im_end|>", "<|endoftext|>"]
|
| 502 |
if request.stop:
|
| 503 |
+
if isinstance(request.stop, str):
|
| 504 |
+
stop_tokens.append(request.stop)
|
| 505 |
+
else:
|
| 506 |
+
stop_tokens.extend(request.stop)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 507 |
|
| 508 |
gen_start = time.time()
|
| 509 |
+
output = llm(
|
| 510 |
+
prompt,
|
| 511 |
+
max_tokens=request.max_tokens or 1024,
|
| 512 |
+
temperature=request.temperature or 0.7,
|
| 513 |
+
top_p=request.top_p or 0.95,
|
| 514 |
+
stop=stop_tokens,
|
| 515 |
+
echo=False
|
| 516 |
+
)
|
| 517 |
gen_time = time.time() - gen_start
|
| 518 |
|
| 519 |
+
generated_text = output["choices"][0]["text"].strip()
|
| 520 |
+
usage = output["usage"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 521 |
|
| 522 |
+
logger.info(f"[{chat_id}] Generated in {gen_time:.2f}s - tokens: {usage['completion_tokens']}")
|
| 523 |
|
| 524 |
return OpenAIChatResponse(
|
| 525 |
id=chat_id,
|
|
|
|
| 527 |
model=request.model,
|
| 528 |
choices=[OpenAIChoice(
|
| 529 |
index=0,
|
| 530 |
+
message={"role": "assistant", "content": generated_text},
|
| 531 |
+
finish_reason="stop"
|
| 532 |
)],
|
| 533 |
usage=OpenAIUsage(
|
| 534 |
+
prompt_tokens=usage["prompt_tokens"],
|
| 535 |
+
completion_tokens=usage["completion_tokens"],
|
| 536 |
+
total_tokens=usage["total_tokens"]
|
| 537 |
)
|
| 538 |
)
|
| 539 |
|
|
|
|
| 541 |
logger.error(f"[{chat_id}] Error: {e}", exc_info=True)
|
| 542 |
raise HTTPException(status_code=500, detail=str(e))
|
| 543 |
|
| 544 |
+
async def openai_stream_response(request: OpenAIChatRequest, prompt: str, chat_id: str):
|
|
|
|
|
|
|
| 545 |
async def generate():
|
| 546 |
created = int(time.time())
|
| 547 |
|
|
|
|
| 548 |
initial_chunk = {
|
| 549 |
"id": chat_id,
|
| 550 |
"object": "chat.completion.chunk",
|
|
|
|
| 554 |
}
|
| 555 |
yield f"data: {json.dumps(initial_chunk)}\n\n"
|
| 556 |
|
| 557 |
+
stop_tokens = ["<|im_end|>", "<|endoftext|>"]
|
| 558 |
+
if request.stop:
|
| 559 |
+
if isinstance(request.stop, str):
|
| 560 |
+
stop_tokens.append(request.stop)
|
| 561 |
+
else:
|
| 562 |
+
stop_tokens.extend(request.stop)
|
| 563 |
+
|
| 564 |
+
for output in llm(
|
| 565 |
+
prompt,
|
| 566 |
+
max_tokens=request.max_tokens or 1024,
|
| 567 |
+
temperature=request.temperature or 0.7,
|
| 568 |
+
top_p=request.top_p or 0.95,
|
| 569 |
+
stop=stop_tokens,
|
| 570 |
+
stream=True,
|
| 571 |
+
echo=False
|
| 572 |
+
):
|
| 573 |
+
text = output["choices"][0]["text"]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 574 |
if text:
|
|
|
|
| 575 |
chunk = {
|
| 576 |
"id": chat_id,
|
| 577 |
"object": "chat.completion.chunk",
|
|
|
|
| 581 |
}
|
| 582 |
yield f"data: {json.dumps(chunk)}\n\n"
|
| 583 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 584 |
final_chunk = {
|
| 585 |
"id": chat_id,
|
| 586 |
"object": "chat.completion.chunk",
|
| 587 |
"created": created,
|
| 588 |
"model": request.model,
|
| 589 |
+
"choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]
|
| 590 |
}
|
| 591 |
yield f"data: {json.dumps(final_chunk)}\n\n"
|
| 592 |
yield "data: [DONE]\n\n"
|
| 593 |
|
| 594 |
+
return StreamingResponse(generate(), media_type="text/event-stream", headers={"Cache-Control": "no-cache"})
|
| 595 |
|
| 596 |
# ============================================================
|
| 597 |
# ANTHROPIC-COMPATIBLE ENDPOINTS (/anthropic)
|
|
|
|
| 599 |
|
| 600 |
@app.get("/anthropic/v1/models")
async def anthropic_list_models():
    """Anthropic-compatible model listing with capability flags."""
    model_entry = {
        "id": "qwen2.5-coder-7b",
        "object": "model",
        "created": int(time.time()),
        "owned_by": "qwen",
        "display_name": "Qwen2.5 Coder 7B Instruct (Q4_K_M)",
        "supports_thinking": True,
        "supports_tools": True,
    }
    return {"object": "list", "data": [model_entry]}
|
| 614 |
|
|
|
|
| 619 |
anthropic_version: Optional[str] = Header(None, alias="anthropic-version"),
|
| 620 |
anthropic_beta: Optional[str] = Header(None, alias="anthropic-beta")
|
| 621 |
):
|
|
|
|
| 622 |
message_id = generate_id("msg")
|
| 623 |
|
| 624 |
thinking_enabled = False
|
|
|
|
| 627 |
thinking_enabled = request.thinking.type == "enabled"
|
| 628 |
budget_tokens = request.thinking.budget_tokens or 1024
|
| 629 |
|
| 630 |
+
logger.info(f"[{message_id}] Anthropic msg - model: {request.model}, max_tokens: {request.max_tokens}, thinking: {thinking_enabled}, tools: {len(request.tools) if request.tools else 0}")
|
| 631 |
|
| 632 |
try:
|
| 633 |
+
prompt = format_anthropic_messages(
|
| 634 |
+
request.messages,
|
| 635 |
+
request.system,
|
| 636 |
+
request.tools,
|
| 637 |
+
thinking_enabled,
|
| 638 |
+
budget_tokens
|
| 639 |
+
)
|
| 640 |
|
| 641 |
if request.stream:
|
| 642 |
+
return await anthropic_stream_response(request, prompt, message_id, thinking_enabled)
|
| 643 |
|
| 644 |
total_max_tokens = request.max_tokens + (budget_tokens if thinking_enabled else 0)
|
| 645 |
|
| 646 |
+
stop_tokens = ["<|im_end|>", "<|endoftext|>"]
|
| 647 |
+
if request.stop_sequences:
|
| 648 |
+
stop_tokens.extend(request.stop_sequences)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 649 |
|
| 650 |
gen_start = time.time()
|
| 651 |
+
output = llm(
|
| 652 |
+
prompt,
|
| 653 |
+
max_tokens=total_max_tokens,
|
| 654 |
+
temperature=request.temperature or 0.7,
|
| 655 |
+
top_p=request.top_p or 0.95,
|
| 656 |
+
top_k=request.top_k or 40,
|
| 657 |
+
stop=stop_tokens,
|
| 658 |
+
echo=False
|
| 659 |
+
)
|
| 660 |
gen_time = time.time() - gen_start
|
| 661 |
|
| 662 |
+
generated_text = output["choices"][0]["text"].strip()
|
| 663 |
+
usage = output["usage"]
|
|
|
|
| 664 |
|
| 665 |
+
# Parse response for tool use, thinking, etc.
|
| 666 |
content_blocks = []
|
| 667 |
+
stop_reason = "end_turn"
|
| 668 |
+
|
| 669 |
+
# Check for tool use
|
| 670 |
+
tool_call = parse_tool_use(generated_text)
|
| 671 |
+
if tool_call and request.tools:
|
| 672 |
+
tool_id = f"toolu_{uuid.uuid4().hex[:24]}"
|
| 673 |
+
content_blocks.append(AnthropicResponseToolUseBlock(
|
| 674 |
+
type="tool_use",
|
| 675 |
+
id=tool_id,
|
| 676 |
+
name=tool_call["tool"],
|
| 677 |
+
input=tool_call.get("arguments", {})
|
| 678 |
+
))
|
| 679 |
+
stop_reason = "tool_use"
|
| 680 |
+
elif thinking_enabled:
|
| 681 |
thinking_text, answer_text = parse_thinking_response(generated_text)
|
| 682 |
if thinking_text:
|
| 683 |
content_blocks.append(AnthropicResponseThinkingBlock(type="thinking", thinking=thinking_text))
|
| 684 |
content_blocks.append(AnthropicResponseTextBlock(type="text", text=answer_text))
|
| 685 |
else:
|
| 686 |
+
content_blocks.append(AnthropicResponseTextBlock(type="text", text=generated_text))
|
| 687 |
|
| 688 |
+
if usage["completion_tokens"] >= total_max_tokens:
|
|
|
|
| 689 |
stop_reason = "max_tokens"
|
| 690 |
|
| 691 |
+
logger.info(f"[{message_id}] Generated in {gen_time:.2f}s - tokens: {usage['completion_tokens']}")
|
| 692 |
|
| 693 |
return AnthropicMessageResponse(
|
| 694 |
id=message_id,
|
| 695 |
content=content_blocks,
|
| 696 |
model=request.model,
|
| 697 |
stop_reason=stop_reason,
|
| 698 |
+
usage=AnthropicUsage(
|
| 699 |
+
input_tokens=usage["prompt_tokens"],
|
| 700 |
+
output_tokens=usage["completion_tokens"]
|
| 701 |
+
)
|
| 702 |
)
|
| 703 |
|
| 704 |
except Exception as e:
|
| 705 |
logger.error(f"[{message_id}] Error: {e}", exc_info=True)
|
| 706 |
raise HTTPException(status_code=500, detail=str(e))
|
| 707 |
|
| 708 |
+
async def anthropic_stream_response(request: AnthropicMessageRequest, prompt: str, message_id: str, thinking_enabled: bool):
    """Stream a completion as Anthropic Messages-API server-sent events.

    Emits the SSE sequence message_start -> content_block_start ->
    content_block_delta* -> content_block_stop -> message_delta ->
    message_stop, with a single text content block at index 0.

    NOTE(review): ``thinking_enabled`` is accepted but unused here — streaming
    does not split <thinking> tags into a separate block; confirm intended.
    """
    async def generate():
        # Opening envelope; input_tokens is reported as 0 because the prompt
        # is not tokenized separately in the streaming path.
        start_event = {
            "type": "message_start",
            "message": {
                "id": message_id, "type": "message", "role": "assistant", "content": [],
                "model": request.model, "stop_reason": None, "stop_sequence": None,
                "usage": {"input_tokens": 0, "output_tokens": 0}
            }
        }
        yield f"event: message_start\ndata: {json.dumps(start_event)}\n\n"

        # Start text block (index 0 — the only block this stream produces)
        yield f"event: content_block_start\ndata: {json.dumps({'type': 'content_block_start', 'index': 0, 'content_block': {'type': 'text', 'text': ''}})}\n\n"

        # ChatML end markers plus any caller-supplied stop sequences.
        stop_tokens = ["<|im_end|>", "<|endoftext|>"]
        if request.stop_sequences:
            stop_tokens.extend(request.stop_sequences)

        total_tokens = 0
        # NOTE(review): this llama.cpp call is synchronous inside an async
        # generator, so it blocks the event loop between chunks — confirm
        # acceptable for this deployment.
        for output in llm(
            prompt,
            max_tokens=request.max_tokens,
            temperature=request.temperature or 0.7,
            top_p=request.top_p or 0.95,
            stop=stop_tokens,
            stream=True,
            echo=False
        ):
            text = output["choices"][0]["text"]
            if text:
                # Counts stream chunks; presumably one token per chunk from
                # llama.cpp, so this approximates output_tokens.
                total_tokens += 1
                yield f"event: content_block_delta\ndata: {json.dumps({'type': 'content_block_delta', 'index': 0, 'delta': {'type': 'text_delta', 'text': text}})}\n\n"

        yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': 0})}\n\n"
        # stop_reason is always 'end_turn' in streaming, even when the stop
        # was actually max_tokens or a stop sequence.
        yield f"event: message_delta\ndata: {json.dumps({'type': 'message_delta', 'delta': {'stop_reason': 'end_turn'}, 'usage': {'output_tokens': total_tokens}})}\n\n"
        yield f"event: message_stop\ndata: {json.dumps({'type': 'message_stop'})}\n\n"

    # X-Accel-Buffering: no keeps reverse proxies from buffering the SSE stream.
    return StreamingResponse(generate(), media_type="text/event-stream", headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"})
|
| 747 |
|
| 748 |
@app.post("/anthropic/v1/messages/count_tokens", response_model=AnthropicTokenCountResponse)
async def anthropic_count_tokens(request: AnthropicTokenCountRequest):
    """Render the request into a prompt and count tokens with llama.cpp."""
    rendered = format_anthropic_messages(request.messages, request.system)
    token_ids = llm.tokenize(rendered.encode())
    return AnthropicTokenCountResponse(input_tokens=len(token_ids))
|
| 753 |
|
| 754 |
if __name__ == "__main__":
|
requirements.txt
CHANGED
|
@@ -1,8 +1,5 @@
|
|
| 1 |
fastapi==0.115.5
|
| 2 |
uvicorn[standard]==0.32.0
|
| 3 |
-
|
| 4 |
-
torch==2.1.2+cpu
|
| 5 |
-
accelerate==1.1.1
|
| 6 |
pydantic==2.10.1
|
| 7 |
python-multipart==0.0.12
|
| 8 |
-
numpy<2
|
|
|
|
| 1 |
fastapi==0.115.5
|
| 2 |
uvicorn[standard]==0.32.0
|
| 3 |
+
llama-cpp-python==0.3.2
|
|
|
|
|
|
|
| 4 |
pydantic==2.10.1
|
| 5 |
python-multipart==0.0.12
|
|
|