Upload app.py with huggingface_hub
app.py
CHANGED
@@ -6,12 +6,16 @@ Author: Matrix Agent
 Features:
 - Full OpenAI API compatibility (/v1/chat/completions)
 - Full Anthropic API compatibility (/v1/messages)
+- Prefill Response Support (assistant message prefix for output control)
+- Thinking/Reasoning Content Block Support
 - Optimized for coding tasks
 - Runs on free HF Spaces (2 vCPU, 16GB RAM)
 
 API Specifications verified against:
 - OpenAI: https://platform.openai.com/docs/api-reference/chat/create
 - Anthropic: https://docs.anthropic.com/en/api/messages
+- Prefill: https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/prefill-claudes-response
+- MiniMax Anthropic: https://platform.minimax.io/docs/api-reference/text-anthropic-api
 """
 
 import os
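
The prefill mechanic added in this commit follows the Anthropic convention: if the final entry in messages has role "assistant", its content is stripped of trailing whitespace and treated as the forced beginning of the model's reply. A minimal client sketch, assuming a placeholder Space URL and the "gpt-4" alias defined further down:

    import requests  # hypothetical client; any HTTP library works

    BASE_URL = "https://your-space.hf.space"  # placeholder for your Space URL

    resp = requests.post(
        f"{BASE_URL}/v1/messages",
        json={
            "model": "gpt-4",  # alias resolved to MODEL_ID by the server
            "max_tokens": 256,
            "messages": [
                {"role": "user", "content": "List three Python web frameworks as a JSON array."},
                # Trailing assistant message = prefill: the reply must start with "["
                {"role": "assistant", "content": "["},
            ],
        },
    )
    # The text block is the last content block and begins with the prefilled "["
    print(resp.json()["content"][-1]["text"])
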
@@ -36,10 +40,10 @@ from pydantic import BaseModel, Field
 # ============================================================================
 
 MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen2.5-Coder-1.5B-Instruct")
-ANTHROPIC_VERSION = "2023-06-01"
+ANTHROPIC_VERSION = "2023-06-01"
 
 MODEL_ALIASES = {
-    # OpenAI-style model names
+    # OpenAI-style model names
     "gpt-4": MODEL_ID,
     "gpt-4-turbo": MODEL_ID,
     "gpt-4o": MODEL_ID,
@@ -89,7 +93,6 @@ def load_model():
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
 
-    # Load with CPU optimizations for 16GB RAM
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         torch_dtype=torch.float32,
@@ -103,52 +106,45 @@ def load_model():
     return model, tokenizer
 
 # ============================================================================
-# Pydantic Models - OpenAI Compatible
+# Pydantic Models - OpenAI Compatible
 # ============================================================================
 
 class OpenAIContentPart(BaseModel):
-
-    type: str  # "text", "image_url"
+    type: str
     text: Optional[str] = None
     image_url: Optional[Dict[str, str]] = None
 
 class OpenAIMessage(BaseModel):
-
-    role: str  # "system", "user", "assistant", "tool"
+    role: str
     content: Optional[Union[str, List[OpenAIContentPart]]] = None
     name: Optional[str] = None
     tool_calls: Optional[List[Dict]] = None
     tool_call_id: Optional[str] = None
 
 class OpenAIResponseFormat(BaseModel):
-
-    type: str = "text"  # "text", "json_object", "json_schema"
+    type: str = "text"
     json_schema: Optional[Dict] = None
 
 class OpenAIChatRequest(BaseModel):
-    """Full OpenAI Chat Completions request spec"""
     model: str
     messages: List[OpenAIMessage]
-    # Generation parameters
     temperature: Optional[float] = Field(default=1.0, ge=0, le=2)
    top_p: Optional[float] = Field(default=1.0, ge=0, le=1)
     n: Optional[int] = Field(default=1, ge=1, le=10)
     stream: Optional[bool] = False
     stop: Optional[Union[str, List[str]]] = None
     max_tokens: Optional[int] = None
-    max_completion_tokens: Optional[int] = None
+    max_completion_tokens: Optional[int] = None
     presence_penalty: Optional[float] = Field(default=0, ge=-2, le=2)
     frequency_penalty: Optional[float] = Field(default=0, ge=-2, le=2)
     logit_bias: Optional[Dict[str, float]] = None
     logprobs: Optional[bool] = False
     top_logprobs: Optional[int] = None
-    # Additional parameters
     user: Optional[str] = None
     seed: Optional[int] = None
     tools: Optional[List[Dict]] = None
     tool_choice: Optional[Union[str, Dict]] = None
     response_format: Optional[OpenAIResponseFormat] = None
-    # Stream options
     stream_options: Optional[Dict] = None
 
 class OpenAIChoiceMessage(BaseModel):
@@ -159,7 +155,7 @@ class OpenAIChoiceMessage(BaseModel):
 class OpenAIChoice(BaseModel):
     index: int
     message: OpenAIChoiceMessage
-    finish_reason: Optional[str] = None
+    finish_reason: Optional[str] = None
     logprobs: Optional[Dict] = None
 
 class OpenAIStreamChoice(BaseModel):
@@ -176,7 +172,6 @@ class OpenAIUsage(BaseModel):
     completion_tokens_details: Optional[Dict] = None
 
 class OpenAIChatResponse(BaseModel):
-    """Full OpenAI Chat Completions response spec"""
     id: str
     object: str = "chat.completion"
     created: int
@@ -186,14 +181,6 @@ class OpenAIChatResponse(BaseModel):
     system_fingerprint: Optional[str] = None
     service_tier: Optional[str] = None
 
-class OpenAIStreamResponse(BaseModel):
-    id: str
-    object: str = "chat.completion.chunk"
-    created: int
-    model: str
-    choices: List[OpenAIStreamChoice]
-    system_fingerprint: Optional[str] = None
-
 class OpenAIModelInfo(BaseModel):
     id: str
     object: str = "model"
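
For orientation, a request body that exercises the OpenAIChatRequest fields above — a sketch with illustrative values; the endpoint accepts standard OpenAI Chat Completions JSON:

    payload = {
        "model": "gpt-4o",  # resolved via MODEL_ALIASES
        "messages": [
            {"role": "system", "content": "You are a concise coding assistant."},
            {"role": "user", "content": "Write a one-line palindrome check in Python."},
        ],
        "temperature": 0.2,  # validated: 0 <= temperature <= 2
        "max_tokens": 128,
        "stream": False,
    }
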
@@ -205,62 +192,52 @@ class OpenAIModelsResponse(BaseModel):
     data: List[OpenAIModelInfo]
 
 # ============================================================================
-# Pydantic Models - Anthropic Compatible (
+# Pydantic Models - Anthropic Compatible (with Thinking & Prefill support)
 # ============================================================================
 
 class AnthropicTextBlock(BaseModel):
-    """Text content block"""
     type: str = "text"
     text: str
 
 class AnthropicImageSource(BaseModel):
-    """Image source for vision"""
     type: str = "base64"
-    media_type: str
+    media_type: str
     data: str
 
 class AnthropicImageBlock(BaseModel):
-    """Image content block"""
     type: str = "image"
     source: AnthropicImageSource
 
-class AnthropicToolUseBlock(BaseModel):
-    """Tool use content block"""
-    type: str = "tool_use"
-    id: str
-    name: str
-    input: Dict
-
-class AnthropicToolResultBlock(BaseModel):
-    """Tool result content block"""
-    type: str = "tool_result"
-    tool_use_id: str
-    content: Union[str, List[Dict]]
+class AnthropicThinkingBlock(BaseModel):
+    """Thinking/reasoning content block"""
+    type: str = "thinking"
+    thinking: str
 
-
-AnthropicContentBlock = Union[AnthropicTextBlock, AnthropicImageBlock, Dict]
+AnthropicContentBlock = Union[AnthropicTextBlock, AnthropicImageBlock, AnthropicThinkingBlock, Dict]
 
 class AnthropicMessage(BaseModel):
-    """Anthropic message format"""
     role: str  # "user", "assistant"
     content: Union[str, List[AnthropicContentBlock]]
 
 class AnthropicTool(BaseModel):
-    """Tool definition"""
     name: str
     description: Optional[str] = None
     input_schema: Dict
 
 class AnthropicToolChoice(BaseModel):
-
-    type: str  # "auto", "any", "tool"
+    type: str
     name: Optional[str] = None
 
+class AnthropicThinkingConfig(BaseModel):
+    """Configuration for thinking/reasoning mode"""
+    type: str = "enabled"  # "enabled" or "disabled"
+    budget_tokens: Optional[int] = None  # Token budget for thinking
+
 class AnthropicRequest(BaseModel):
-    """Full Anthropic Messages API request spec"""
+    """Full Anthropic Messages API request with thinking & prefill support"""
     model: str
     messages: List[AnthropicMessage]
-    max_tokens: int
+    max_tokens: int
     # Optional parameters
     system: Optional[Union[str, List[Dict]]] = None
     temperature: Optional[float] = Field(default=1.0, ge=0, le=1)
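
The new thinking field mirrors Anthropic's extended-thinking request shape. A sketch of a thinking-enabled /v1/messages body (budget_tokens is optional; the handler below defaults it to 512):

    payload = {
        "model": "gpt-4",
        "max_tokens": 512,
        "thinking": {"type": "enabled", "budget_tokens": 256},
        "messages": [
            {"role": "user", "content": "Why is 0.1 + 0.2 != 0.3 in floating point?"},
        ],
    }
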
@@ -271,12 +248,16 @@ class AnthropicRequest(BaseModel):
     # Tool use
     tools: Optional[List[AnthropicTool]] = None
     tool_choice: Optional[AnthropicToolChoice] = None
+    # Thinking/reasoning support
+    thinking: Optional[AnthropicThinkingConfig] = None
     # Metadata
     metadata: Optional[Dict] = None
 
 class AnthropicResponseContent(BaseModel):
     type: str = "text"
     text: Optional[str] = None
+    # For thinking blocks
+    thinking: Optional[str] = None
     # For tool_use
     id: Optional[str] = None
     name: Optional[str] = None
@@ -287,13 +268,12 @@ class AnthropicUsage(BaseModel):
     output_tokens: int
 
 class AnthropicResponse(BaseModel):
-    """Full Anthropic Messages API response spec"""
     id: str
     type: str = "message"
     role: str = "assistant"
     model: str
     content: List[AnthropicResponseContent]
-    stop_reason: Optional[str] = None
+    stop_reason: Optional[str] = None
     stop_sequence: Optional[str] = None
     usage: AnthropicUsage
 
@@ -302,7 +282,6 @@ class AnthropicResponse(BaseModel):
 # ============================================================================
 
 def extract_text_from_openai_content(content: Union[str, List, None]) -> str:
-    """Extract text from OpenAI message content (string or array)"""
     if content is None:
         return ""
     if isinstance(content, str):
@@ -319,7 +298,6 @@ def extract_text_from_openai_content(content: Union[str, List, None]) -> str:
     return str(content)
 
 def extract_text_from_anthropic_content(content: Union[str, List]) -> str:
-    """Extract text from Anthropic message content (string or array)"""
     if isinstance(content, str):
         return content
     if isinstance(content, list):
@@ -328,19 +306,20 @@ def extract_text_from_anthropic_content(content: Union[str, List]) -> str:
             if isinstance(block, dict):
                 if block.get("type") == "text":
                     text_parts.append(block.get("text", ""))
-
-
+                elif block.get("type") == "thinking":
+                    pass  # Skip thinking blocks in extraction
+            elif hasattr(block, "type"):
+                if block.type == "text":
+                    text_parts.append(block.text or "")
         return "\n".join(text_parts)
     return str(content)
 
 def extract_system_prompt_anthropic(system: Union[str, List[Dict], None]) -> str:
-    """Extract system prompt from Anthropic format"""
     if system is None:
         return ""
     if isinstance(system, str):
         return system
     if isinstance(system, list):
-        # System can be array of text blocks
         text_parts = []
         for block in system:
             if isinstance(block, dict) and block.get("type") == "text":
@@ -348,15 +327,40 @@ def extract_system_prompt_anthropic(system: Union[str, List[Dict], None]) -> str
         return "\n".join(text_parts)
     return ""
 
+def extract_prefill_from_messages(messages: List[Dict]) -> tuple[List[Dict], str]:
+    """
+    Extract prefill content if the last message is from assistant.
+    Returns (messages_without_prefill, prefill_text)
+
+    Prefill allows controlling output by providing initial assistant response.
+    See: https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/prefill-claudes-response
+    """
+    if not messages:
+        return messages, ""
+
+    last_msg = messages[-1]
+    if last_msg.get("role") == "assistant":
+        prefill = last_msg.get("content", "")
+        # Prefill cannot end with trailing whitespace
+        if isinstance(prefill, str):
+            prefill = prefill.rstrip()
+        return messages[:-1], prefill
+
+    return messages, ""
+
 # ============================================================================
-# Message Formatting
+# Message Formatting with Prefill Support
 # ============================================================================
 
 def format_messages_for_model(
     messages: List[Dict],
-    system_prompt: Optional[str] = None
+    system_prompt: Optional[str] = None,
+    prefill: str = ""
 ) -> str:
-    """Format messages for the model using chat template"""
+    """
+    Format messages for the model using chat template.
+    Supports prefill for controlling output format.
+    """
     formatted_messages = []
 
     if system_prompt:
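
Behavior of the new helper in isolation — logic copied from the diff, restated for a quick standalone check of the string-content case:

    def extract_prefill_from_messages(messages):
        # Same logic as above.
        if not messages:
            return messages, ""
        last_msg = messages[-1]
        if last_msg.get("role") == "assistant":
            prefill = last_msg.get("content", "")
            if isinstance(prefill, str):
                prefill = prefill.rstrip()  # prefill must not end in whitespace
            return messages[:-1], prefill
        return messages, ""

    msgs = [
        {"role": "user", "content": "Name a prime number."},
        {"role": "assistant", "content": "The answer is "},
    ]
    rest, prefill = extract_prefill_from_messages(msgs)
    print(rest)     # [{'role': 'user', 'content': 'Name a prime number.'}]
    print(prefill)  # 'The answer is'  (trailing space stripped)
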
@@ -366,7 +370,6 @@ def format_messages_for_model(
         role = msg.get("role", "user")
         content = msg.get("content", "")
 
-        # Map tool role to assistant for compatibility
         if role == "tool":
             role = "user"
 
@@ -375,15 +378,19 @@ def format_messages_for_model(
     # Use tokenizer's chat template if available
     if hasattr(tokenizer, 'apply_chat_template') and tokenizer.chat_template:
         try:
-            return tokenizer.apply_chat_template(
+            prompt = tokenizer.apply_chat_template(
                 formatted_messages,
                 tokenize=False,
                 add_generation_prompt=True
             )
+            # Append prefill if provided
+            if prefill:
+                prompt = prompt + prefill
+            return prompt
         except Exception:
             pass
 
-    # Fallback
+    # Fallback format
     prompt = ""
     for msg in formatted_messages:
         role = msg["role"]
@@ -395,10 +402,15 @@ def format_messages_for_model(
         elif role == "assistant":
             prompt += f"<|assistant|>\n{content}\n"
     prompt += "<|assistant|>\n"
+
+    # Append prefill
+    if prefill:
+        prompt = prompt + prefill
+
     return prompt
 
 # ============================================================================
-# Generation Logic
+# Generation Logic with Thinking Support
 # ============================================================================
 
 def generate_response(
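
How the prefill lands in the prompt: the chat template is rendered with add_generation_prompt=True and the prefill is appended verbatim, so generation continues mid-sentence. The same idea shown with the fallback format above — a sketch assuming user turns follow the same <|role|> pattern as the assistant turns:

    messages = [{"role": "user", "content": "Return YAML only."}]
    prefill = "config:"

    prompt = ""
    for msg in messages:
        if msg["role"] == "user":
            prompt += f"<|user|>\n{msg['content']}\n"
    prompt += "<|assistant|>\n"
    prompt += prefill  # the model continues right after "config:"
    print(prompt)
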
@@ -408,15 +420,16 @@ def generate_response(
     top_p: float = 0.95,
     top_k: Optional[int] = None,
     stop: Optional[List[str]] = None,
-) -> tuple[str, int, int, str]:
+    enable_thinking: bool = False,
+    thinking_budget: int = 512,
+) -> tuple[str, str, int, int, str]:
     """
-    Generate response from the model
-    Returns: (response_text, input_tokens, output_tokens, stop_reason)
+    Generate response from the model.
+    Returns: (response_text, thinking_text, input_tokens, output_tokens, stop_reason)
     """
     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096)
     input_length = inputs.input_ids.shape[1]
 
-    # Generation config
     gen_kwargs = {
         "max_new_tokens": max_tokens,
         "temperature": max(temperature, 0.01),
@@ -432,12 +445,20 @@ def generate_response(
     with torch.no_grad():
         outputs = model.generate(inputs.input_ids, **gen_kwargs)
 
-    # Decode only the new tokens
     generated_tokens = outputs[0][input_length:]
     response_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
 
     output_length = len(generated_tokens)
-    stop_reason = "stop"
+    stop_reason = "stop"
+    thinking_text = ""
+
+    # Simulate thinking by extracting <think>...</think> blocks if present
+    if enable_thinking and "<think>" in response_text:
+        import re
+        think_match = re.search(r"<think>(.*?)</think>", response_text, re.DOTALL)
+        if think_match:
+            thinking_text = think_match.group(1).strip()
+            response_text = re.sub(r"<think>.*?</think>", "", response_text, flags=re.DOTALL).strip()
 
     # Handle stop sequences
     if stop:
@@ -447,11 +468,10 @@ def generate_response(
             stop_reason = "stop"
             break
 
-    # Check if max tokens reached
     if output_length >= max_tokens:
         stop_reason = "length"
 
-    return response_text.strip(), input_length, output_length, stop_reason
+    return response_text.strip(), thinking_text, input_length, output_length, stop_reason
 
 async def generate_stream(
     prompt: str,
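
The "thinking" support in generate_response is simulated after the fact: if the decoded text contains a <think>...</think> span, that span is split out as the thinking content. Reduced to its core:

    import re

    raw = "<think>User wants a prime; 7 works.</think>7 is prime."
    thinking_text = ""
    match = re.search(r"<think>(.*?)</think>", raw, re.DOTALL)
    if match:
        thinking_text = match.group(1).strip()
        raw = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
    print(thinking_text)  # User wants a prime; 7 works.
    print(raw)            # 7 is prime.
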
@@ -492,14 +512,13 @@ async def generate_stream(
 
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    """Load model on startup"""
     load_model()
     yield
 
 app = FastAPI(
     title="Free Coding API",
-    description="OpenAI & Anthropic compatible API",
-    version="1.0.0",
+    description="OpenAI & Anthropic compatible API with Prefill & Thinking support",
+    version="1.1.0",
     lifespan=lifespan
 )
 
@@ -516,7 +535,6 @@ app.add_middleware(
 # ============================================================================
 
 def verify_api_key(authorization: Optional[str] = None) -> bool:
-    """Simple API key verification"""
     if not API_KEY or API_KEY == "":
         return True
 
@@ -536,7 +554,6 @@ def verify_api_key(authorization: Optional[str] = None) -> bool:
 
 @app.get("/v1/models")
 async def list_models():
-    """List available models (OpenAI compatible)"""
     models = [
         OpenAIModelInfo(id=alias, created=int(time.time()))
         for alias in MODEL_ALIASES.keys()
@@ -545,7 +562,6 @@ async def list_models():
 
 @app.get("/v1/models/{model_id}")
 async def get_model(model_id: str):
-    """Get model info"""
     if model_id in MODEL_ALIASES or model_id == MODEL_ID:
         return OpenAIModelInfo(id=model_id, created=int(time.time()))
     raise HTTPException(status_code=404, detail="Model not found")
@@ -555,7 +571,7 @@ async def openai_chat_completions(
     request: OpenAIChatRequest,
     authorization: Optional[str] = Header(None),
 ):
-    """OpenAI-compatible chat completions endpoint"""
+    """OpenAI-compatible chat completions with prefill support"""
 
     if not verify_api_key(authorization):
         raise HTTPException(status_code=401, detail="Invalid API key")
@@ -566,7 +582,10 @@ async def openai_chat_completions(
         content = extract_text_from_openai_content(m.content)
         messages.append({"role": m.role, "content": content})
 
-    # Extract system message
+    # Check for prefill (last assistant message)
+    messages, prefill = extract_prefill_from_messages(messages)
+
+    # Extract system message
     system_prompt = None
     filtered_messages = []
     for msg in messages:
@@ -575,12 +594,10 @@ async def openai_chat_completions(
         else:
             filtered_messages.append(msg)
 
-    prompt = format_messages_for_model(filtered_messages, system_prompt=system_prompt)
+    prompt = format_messages_for_model(filtered_messages, system_prompt=system_prompt, prefill=prefill)
 
-    # Determine max tokens
     max_tokens = request.max_completion_tokens or request.max_tokens or MAX_TOKENS_DEFAULT
 
-    # Handle stop sequences
     stop_sequences = None
     if request.stop:
         stop_sequences = [request.stop] if isinstance(request.stop, str) else request.stop
@@ -590,9 +607,7 @@ async def openai_chat_completions(
     created_time = int(time.time())
 
     if request.stream:
-        # OpenAI Streaming format
         async def stream_generator():
-            # First chunk with role
             first_chunk = {
                 "id": request_id,
                 "object": "chat.completion.chunk",
@@ -601,14 +616,13 @@ async def openai_chat_completions(
                 "system_fingerprint": system_fingerprint,
                 "choices": [{
                     "index": 0,
-                    "delta": {"role": "assistant", "content": ""},
+                    "delta": {"role": "assistant", "content": prefill},  # Include prefill in first chunk
                     "logprobs": None,
                     "finish_reason": None
                 }]
             }
             yield f"data: {json.dumps(first_chunk)}\n\n"
 
-            # Stream content
             async for token in generate_stream(
                 prompt,
                 max_tokens=max_tokens,
@@ -630,7 +644,6 @@ async def openai_chat_completions(
                 }
                 yield f"data: {json.dumps(chunk)}\n\n"
 
-            # Final chunk with finish_reason
             final_chunk = {
                 "id": request_id,
                 "object": "chat.completion.chunk",
@@ -646,7 +659,6 @@ async def openai_chat_completions(
             }
             yield f"data: {json.dumps(final_chunk)}\n\n"
 
-            # Usage chunk if requested
             if request.stream_options and request.stream_options.get("include_usage"):
                 usage_chunk = {
                     "id": request_id,
@@ -654,11 +666,7 @@ async def openai_chat_completions(
                     "created": created_time,
                     "model": request.model,
                     "choices": [],
-                    "usage": {
-                        "prompt_tokens": 0,
-                        "completion_tokens": 0,
-                        "total_tokens": 0
-                    }
+                    "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
                 }
                 yield f"data: {json.dumps(usage_chunk)}\n\n"
 
@@ -667,15 +675,11 @@ async def openai_chat_completions(
         return StreamingResponse(
             stream_generator(),
             media_type="text/event-stream",
-            headers={
-                "Cache-Control": "no-cache",
-                "Connection": "keep-alive",
-                "X-Accel-Buffering": "no"
-            }
+            headers={"Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no"}
         )
 
-    # Non-streaming
-    response_text, input_tokens, output_tokens, stop_reason = generate_response(
+    # Non-streaming
+    response_text, thinking_text, input_tokens, output_tokens, stop_reason = generate_response(
         prompt,
         max_tokens=max_tokens,
         temperature=request.temperature or 1.0,
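
Consuming the OpenAI-style stream is a matter of reading "data:" lines from the event stream. A minimal client sketch (placeholder URL; OpenAI-compatible servers conventionally end the stream with a "data: [DONE]" sentinel, so the guard below is defensive):

    import json
    import requests  # hypothetical client

    body = {
        "model": "gpt-4",
        "messages": [{"role": "user", "content": "hi"}],
        "stream": True,
    }
    with requests.post("https://your-space.hf.space/v1/chat/completions",
                       json=body, stream=True) as r:
        for line in r.iter_lines():
            if not line or not line.startswith(b"data: "):
                continue
            data = line[len(b"data: "):]
            if data == b"[DONE]":
                break
            chunk = json.loads(data)
            choices = chunk.get("choices") or []
            if choices:
                print(choices[0].get("delta", {}).get("content") or "", end="")
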
@@ -683,7 +687,9 @@ async def openai_chat_completions(
         stop=stop_sequences,
     )
 
-    # Map stop reason to OpenAI finish_reason
+    # Prepend prefill to response
+    full_response = prefill + response_text if prefill else response_text
+
     openai_finish_reason = "stop" if stop_reason == "stop" else "length"
 
     return OpenAIChatResponse(
@@ -694,7 +700,7 @@ async def openai_chat_completions(
         choices=[
             OpenAIChoice(
                 index=0,
-                message=OpenAIChoiceMessage(role="assistant", content=response_text),
+                message=OpenAIChoiceMessage(role="assistant", content=full_response),
                 finish_reason=openai_finish_reason,
                 logprobs=None
             )
@@ -707,7 +713,7 @@ async def openai_chat_completions(
 )
 
 # ============================================================================
-# Anthropic Compatible Endpoints
+# Anthropic Compatible Endpoints with Prefill & Thinking
 # ============================================================================
 
 @app.post("/v1/messages")
@@ -717,9 +723,8 @@ async def anthropic_messages(
     x_api_key: Optional[str] = Header(None, alias="x-api-key"),
     anthropic_version: Optional[str] = Header(None, alias="anthropic-version"),
 ):
-    """Anthropic-compatible messages endpoint"""
+    """Anthropic-compatible messages endpoint with prefill & thinking support"""
 
-    # Anthropic uses x-api-key header
     auth_key = x_api_key or authorization
     if not verify_api_key(auth_key):
         raise HTTPException(status_code=401, detail="Invalid API key")
@@ -730,19 +735,30 @@ async def anthropic_messages(
         content = extract_text_from_anthropic_content(m.content)
         messages.append({"role": m.role, "content": content})
 
+    # Check for prefill (last assistant message)
+    messages, prefill = extract_prefill_from_messages(messages)
+
     # Extract system prompt
     system_prompt = extract_system_prompt_anthropic(request.system)
 
-    prompt = format_messages_for_model(messages, system_prompt=system_prompt)
+    prompt = format_messages_for_model(messages, system_prompt=system_prompt, prefill=prefill)
+
+    # Check thinking configuration
+    enable_thinking = False
+    thinking_budget = 512
+    if request.thinking:
+        if request.thinking.type == "enabled":
+            enable_thinking = True
+            if request.thinking.budget_tokens:
+                thinking_budget = request.thinking.budget_tokens
 
     request_id = f"msg_{uuid.uuid4().hex[:24]}"
 
     if request.stream:
-        # Anthropic streaming format (Server-Sent Events)
         async def stream_generator():
-            input_tokens = 0
+            input_tokens = 0
 
-            # Send message_start event
+            # message_start
             message_start = {
                 "type": "message_start",
                 "message": {
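
As the endpoint above shows, /v1/messages accepts Anthropic's x-api-key header (falling back to Authorization). A sketch of a request with Anthropic-style headers, placeholder values throughout:

    import requests  # hypothetical client

    headers = {
        "x-api-key": "sk-placeholder",      # checked by verify_api_key
        "anthropic-version": "2023-06-01",  # accepted, matching ANTHROPIC_VERSION
    }
    body = {
        "model": "gpt-4",
        "max_tokens": 64,
        "messages": [{"role": "user", "content": "ping"}],
    }
    r = requests.post("https://your-space.hf.space/v1/messages",
                      json=body, headers=headers)
    print(r.status_code, r.json()["stop_reason"])
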
@@ -753,26 +769,55 @@ async def anthropic_messages(
                     "content": [],
                     "stop_reason": None,
                     "stop_sequence": None,
-                    "usage": {
-                        "input_tokens": input_tokens,
-                        "output_tokens": 0
-                    }
+                    "usage": {"input_tokens": input_tokens, "output_tokens": 0}
                 }
             }
             yield f"event: message_start\ndata: {json.dumps(message_start)}\n\n"
 
-            # Content block start
+            content_index = 0
+
+            # If thinking is enabled, add thinking block first (simulated)
+            if enable_thinking:
+                # thinking block start
+                thinking_block_start = {
+                    "type": "content_block_start",
+                    "index": content_index,
+                    "content_block": {"type": "thinking", "thinking": ""}
+                }
+                yield f"event: content_block_start\ndata: {json.dumps(thinking_block_start)}\n\n"
+
+                # Simulate thinking content
+                thinking_text = "Analyzing the request and formulating a response..."
+                thinking_delta = {
+                    "type": "content_block_delta",
+                    "index": content_index,
+                    "delta": {"type": "thinking_delta", "thinking": thinking_text}
+                }
+                yield f"event: content_block_delta\ndata: {json.dumps(thinking_delta)}\n\n"
+
+                thinking_block_stop = {"type": "content_block_stop", "index": content_index}
+                yield f"event: content_block_stop\ndata: {json.dumps(thinking_block_stop)}\n\n"
+
+                content_index += 1
+
+            # text content block start
             content_block_start = {
                 "type": "content_block_start",
-                "index": 0,
-                "content_block": {
-                    "type": "text",
-                    "text": ""
-                }
+                "index": content_index,
+                "content_block": {"type": "text", "text": ""}
             }
             yield f"event: content_block_start\ndata: {json.dumps(content_block_start)}\n\n"
 
-            # Stream content
+            # Include prefill in first delta if present
+            if prefill:
+                prefill_delta = {
+                    "type": "content_block_delta",
+                    "index": content_index,
+                    "delta": {"type": "text_delta", "text": prefill}
+                }
+                yield f"event: content_block_delta\ndata: {json.dumps(prefill_delta)}\n\n"
+
+            # Stream content
             output_tokens = 0
             async for token in generate_stream(
                 prompt,
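
With thinking enabled, the stream therefore emits: message_start; a content_block_start/delta/stop triple for the thinking block at index 0; the same triple for the text block at the next index (its first delta carrying the prefill, if any); then message_delta and message_stop. A tiny event-pair parser sketch for this format:

    import json

    def iter_sse(lines):
        # Pair each "event: ..." line with the following "data: ..." line.
        event = None
        for line in lines:
            if line.startswith("event: "):
                event = line[len("event: "):]
            elif line.startswith("data: "):
                yield event, json.loads(line[len("data: "):])

    sample = [
        "event: content_block_delta",
        'data: {"type": "content_block_delta", "index": 1,'
        ' "delta": {"type": "text_delta", "text": "hi"}}',
    ]
    for event, payload in iter_sse(sample):
        print(event, payload["delta"]["text"])  # content_block_delta hi
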
@@ -784,59 +829,61 @@ async def anthropic_messages(
                 output_tokens += 1
                 delta = {
                     "type": "content_block_delta",
-                    "index": 0,
-                    "delta": {
-                        "type": "text_delta",
-                        "text": token
-                    }
+                    "index": content_index,
+                    "delta": {"type": "text_delta", "text": token}
                 }
                 yield f"event: content_block_delta\ndata: {json.dumps(delta)}\n\n"
 
-            # Content block stop
-            content_block_stop = {
-                "type": "content_block_stop",
-                "index": 0
-            }
+            # content_block_stop
+            content_block_stop = {"type": "content_block_stop", "index": content_index}
             yield f"event: content_block_stop\ndata: {json.dumps(content_block_stop)}\n\n"
 
-            # Message delta
+            # message_delta
             message_delta = {
                 "type": "message_delta",
-                "delta": {
-                    "stop_reason": "end_turn",
-                    "stop_sequence": None
-                },
-                "usage": {
-                    "output_tokens": output_tokens
-                }
+                "delta": {"stop_reason": "end_turn", "stop_sequence": None},
+                "usage": {"output_tokens": output_tokens}
             }
             yield f"event: message_delta\ndata: {json.dumps(message_delta)}\n\n"
 
-            # Message stop
+            # message_stop
             message_stop = {"type": "message_stop"}
             yield f"event: message_stop\ndata: {json.dumps(message_stop)}\n\n"
 
         return StreamingResponse(
             stream_generator(),
             media_type="text/event-stream",
-            headers={
-                "Cache-Control": "no-cache",
-                "Connection": "keep-alive",
-                "X-Accel-Buffering": "no"
-            }
+            headers={"Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no"}
         )
 
     # Non-streaming response
-    response_text, input_tokens, output_tokens, stop_reason = generate_response(
+    response_text, thinking_text, input_tokens, output_tokens, stop_reason = generate_response(
         prompt,
         max_tokens=request.max_tokens,
        temperature=request.temperature or 1.0,
         top_p=request.top_p or 0.999,
         top_k=request.top_k,
         stop=request.stop_sequences,
+        enable_thinking=enable_thinking,
+        thinking_budget=thinking_budget,
     )
 
-    # Determine stop reason
+    # Prepend prefill to response
+    full_response = prefill + response_text if prefill else response_text
+
+    # Build content blocks
+    content_blocks = []
+
+    # Add thinking block if enabled and we have thinking content
+    if enable_thinking:
+        if not thinking_text:
+            thinking_text = "Analyzing the request and formulating a response."
+        content_blocks.append(AnthropicResponseContent(type="thinking", thinking=thinking_text))
+
+    # Add text block
+    content_blocks.append(AnthropicResponseContent(type="text", text=full_response))
+
+    # Determine stop reason
    anthropic_stop_reason = "end_turn"
     stop_sequence_used = None
     if stop_reason == "length":
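
A thinking-enabled, non-streaming response therefore carries two content blocks, roughly (values illustrative):

    response = {
        "id": "msg_0123456789abcdef01234567",
        "type": "message",
        "role": "assistant",
        "model": "gpt-4",
        "content": [
            {"type": "thinking", "thinking": "Analyzing the request and formulating a response."},
            {"type": "text", "text": "7 is prime."},
        ],
        "stop_reason": "end_turn",
        "stop_sequence": None,
        "usage": {"input_tokens": 42, "output_tokens": 17},
    }
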
@@ -851,7 +898,7 @@ async def anthropic_messages(
     return AnthropicResponse(
         id=request_id,
         model=request.model,
-        content=[AnthropicResponseContent(type="text", text=response_text)],
+        content=content_blocks,
         stop_reason=anthropic_stop_reason,
         stop_sequence=stop_sequence_used,
         usage=AnthropicUsage(
@@ -868,8 +915,13 @@ async def anthropic_messages(
 async def root():
     return {
         "name": "Free Coding API",
-        "version": "1.0.0",
+        "version": "1.1.0",
         "model": MODEL_ID,
+        "features": {
+            "prefill_response": "Supported - Include assistant message at end for output control",
+            "thinking": "Supported - Enable with thinking: {type: 'enabled'}",
+            "streaming": "Supported - Both OpenAI and Anthropic formats"
+        },
         "compatibility": {
             "openai": "v1 Chat Completions API",
             "anthropic": "Messages API (2023-06-01)"
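
A quick smoke test of the upgraded Space (placeholder URL):

    import requests  # hypothetical client

    info = requests.get("https://your-space.hf.space/").json()
    print(info["version"])           # 1.1.0
    print(sorted(info["features"]))  # ['prefill_response', 'streaming', 'thinking']
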