| """ | |
| HuggingFace Spaces - OpenAI & Anthropic Compatible Coding API | |
| A free, skills-only API endpoint for coding tasks (like Codex/Claude Code) | |
| Author: Matrix Agent | |
| Features: | |
| - Full OpenAI API compatibility (/v1/chat/completions) | |
| - Full Anthropic API compatibility (/v1/messages) | |
| - Optimized for coding tasks | |
| - Runs on free HF Spaces (2 vCPU, 16GB RAM) | |
| API Specifications verified against: | |
| - OpenAI: https://platform.openai.com/docs/api-reference/chat/create | |
| - Anthropic: https://docs.anthropic.com/en/api/messages | |
| """ | |
import os
import time
import uuid
import json
import asyncio
from typing import List, Optional, Union, Dict, Any, AsyncGenerator
from contextlib import asynccontextmanager

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread

from fastapi import FastAPI, HTTPException, Header, Request, Response
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse, JSONResponse
from pydantic import BaseModel, Field
# ============================================================================
# Configuration
# ============================================================================

MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen2.5-Coder-1.5B-Instruct")
ANTHROPIC_VERSION = "2023-06-01"  # Standard Anthropic API version

MODEL_ALIASES = {
    # OpenAI-style model names -> actual model
    "gpt-4": MODEL_ID,
    "gpt-4-turbo": MODEL_ID,
    "gpt-4o": MODEL_ID,
    "gpt-4o-mini": MODEL_ID,
    "gpt-3.5-turbo": MODEL_ID,
    "codex": MODEL_ID,
    "code-davinci-002": MODEL_ID,
    "o1": MODEL_ID,
    "o1-mini": MODEL_ID,
    # Anthropic-style model names
    "claude-3-opus-20240229": MODEL_ID,
    "claude-3-sonnet-20240229": MODEL_ID,
    "claude-3-haiku-20240307": MODEL_ID,
    "claude-3-5-sonnet-20241022": MODEL_ID,
    "claude-3-5-haiku-20241022": MODEL_ID,
    "claude-3-opus": MODEL_ID,
    "claude-3-sonnet": MODEL_ID,
    "claude-3-haiku": MODEL_ID,
    "claude-3-5-sonnet": MODEL_ID,
    "claude-code": MODEL_ID,
}

API_KEY = os.getenv("API_KEY", "sk-free-coding-api")
MAX_TOKENS_DEFAULT = 2048
TEMPERATURE_DEFAULT = 0.7
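# Configuration is entirely environment-driven; for example (a sketch with
# hypothetical values):
#
#     MODEL_ID="Qwen/Qwen2.5-Coder-1.5B-Instruct" API_KEY="sk-my-secret" python app.py
#
# Setting API_KEY to an empty string disables authentication entirely
# (see verify_api_key below).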
# ============================================================================
# Global Model Instance
# ============================================================================

model = None
tokenizer = None


def load_model():
    """Load model with CPU optimization"""
    global model, tokenizer
    print(f"Loading model: {MODEL_ID}")
    print("Device: CPU (free HF Spaces)")

    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        padding_side="left",
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load with CPU optimizations for 16GB RAM
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float32,
        device_map="cpu",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )
    model.eval()
    print("Model loaded successfully!")
    return model, tokenizer
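# Optional CPU tuning (a sketch, not enabled by default): pinning torch to all
# available cores can speed up generation on the 2-vCPU free tier.
#
#     torch.set_num_threads(os.cpu_count() or 1)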
# ============================================================================
# Pydantic Models - OpenAI Compatible (Full Spec)
# ============================================================================

class OpenAIContentPart(BaseModel):
    """Content part for multimodal messages"""
    type: str  # "text", "image_url"
    text: Optional[str] = None
    image_url: Optional[Dict[str, str]] = None


class OpenAIMessage(BaseModel):
    """OpenAI message format - supports both string and array content"""
    role: str  # "system", "user", "assistant", "tool"
    content: Optional[Union[str, List[OpenAIContentPart]]] = None
    name: Optional[str] = None
    tool_calls: Optional[List[Dict]] = None
    tool_call_id: Optional[str] = None


class OpenAIResponseFormat(BaseModel):
    """Response format specification"""
    type: str = "text"  # "text", "json_object", "json_schema"
    json_schema: Optional[Dict] = None


class OpenAIChatRequest(BaseModel):
    """Full OpenAI Chat Completions request spec"""
    model: str
    messages: List[OpenAIMessage]
    # Generation parameters
    temperature: Optional[float] = Field(default=1.0, ge=0, le=2)
    top_p: Optional[float] = Field(default=1.0, ge=0, le=1)
    n: Optional[int] = Field(default=1, ge=1, le=10)
    stream: Optional[bool] = False
    stop: Optional[Union[str, List[str]]] = None
    max_tokens: Optional[int] = None
    max_completion_tokens: Optional[int] = None  # Newer parameter
    presence_penalty: Optional[float] = Field(default=0, ge=-2, le=2)
    frequency_penalty: Optional[float] = Field(default=0, ge=-2, le=2)
    logit_bias: Optional[Dict[str, float]] = None
    logprobs: Optional[bool] = False
    top_logprobs: Optional[int] = None
    # Additional parameters
    user: Optional[str] = None
    seed: Optional[int] = None
    tools: Optional[List[Dict]] = None
    tool_choice: Optional[Union[str, Dict]] = None
    response_format: Optional[OpenAIResponseFormat] = None
    # Stream options
    stream_options: Optional[Dict] = None


class OpenAIChoiceMessage(BaseModel):
    role: str = "assistant"
    content: Optional[str] = None
    tool_calls: Optional[List[Dict]] = None


class OpenAIChoice(BaseModel):
    index: int
    message: OpenAIChoiceMessage
    finish_reason: Optional[str] = None  # "stop", "length", "tool_calls", "content_filter"
    logprobs: Optional[Dict] = None


class OpenAIStreamChoice(BaseModel):
    index: int
    delta: Dict
    finish_reason: Optional[str] = None
    logprobs: Optional[Dict] = None


class OpenAIUsage(BaseModel):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
    prompt_tokens_details: Optional[Dict] = None
    completion_tokens_details: Optional[Dict] = None


class OpenAIChatResponse(BaseModel):
    """Full OpenAI Chat Completions response spec"""
    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[OpenAIChoice]
    usage: Optional[OpenAIUsage] = None
    system_fingerprint: Optional[str] = None
    service_tier: Optional[str] = None


class OpenAIStreamResponse(BaseModel):
    id: str
    object: str = "chat.completion.chunk"
    created: int
    model: str
    choices: List[OpenAIStreamChoice]
    system_fingerprint: Optional[str] = None


class OpenAIModelInfo(BaseModel):
    id: str
    object: str = "model"
    created: int
    owned_by: str = "hf-spaces"


class OpenAIModelsResponse(BaseModel):
    object: str = "list"
    data: List[OpenAIModelInfo]
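# A minimal /v1/chat/completions request body accepted by the models above
# (per the OpenAI spec; any alias from MODEL_ALIASES works as "model"):
#
#     {
#         "model": "gpt-4o-mini",
#         "messages": [{"role": "user", "content": "Hello"}],
#         "max_tokens": 256,
#         "temperature": 0.2,
#         "stream": false
#     }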
# ============================================================================
# Pydantic Models - Anthropic Compatible (Full Spec)
# ============================================================================

class AnthropicTextBlock(BaseModel):
    """Text content block"""
    type: str = "text"
    text: str


class AnthropicImageSource(BaseModel):
    """Image source for vision"""
    type: str = "base64"
    media_type: str  # "image/jpeg", "image/png", "image/webp", "image/gif"
    data: str


class AnthropicImageBlock(BaseModel):
    """Image content block"""
    type: str = "image"
    source: AnthropicImageSource


class AnthropicToolUseBlock(BaseModel):
    """Tool use content block"""
    type: str = "tool_use"
    id: str
    name: str
    input: Dict


class AnthropicToolResultBlock(BaseModel):
    """Tool result content block"""
    type: str = "tool_result"
    tool_use_id: str
    content: Union[str, List[Dict]]


# Union type for all content blocks
AnthropicContentBlock = Union[AnthropicTextBlock, AnthropicImageBlock, Dict]


class AnthropicMessage(BaseModel):
    """Anthropic message format"""
    role: str  # "user", "assistant"
    content: Union[str, List[AnthropicContentBlock]]


class AnthropicTool(BaseModel):
    """Tool definition"""
    name: str
    description: Optional[str] = None
    input_schema: Dict


class AnthropicToolChoice(BaseModel):
    """Tool choice specification"""
    type: str  # "auto", "any", "tool"
    name: Optional[str] = None


class AnthropicRequest(BaseModel):
    """Full Anthropic Messages API request spec"""
    model: str
    messages: List[AnthropicMessage]
    max_tokens: int  # Required in the Anthropic API
    # Optional parameters
    system: Optional[Union[str, List[Dict]]] = None
    temperature: Optional[float] = Field(default=1.0, ge=0, le=1)
    top_p: Optional[float] = Field(default=0.999, ge=0, le=1)
    top_k: Optional[int] = None
    stream: Optional[bool] = False
    stop_sequences: Optional[List[str]] = None
    # Tool use
    tools: Optional[List[AnthropicTool]] = None
    tool_choice: Optional[AnthropicToolChoice] = None
    # Metadata
    metadata: Optional[Dict] = None


class AnthropicResponseContent(BaseModel):
    type: str = "text"
    text: Optional[str] = None
    # For tool_use
    id: Optional[str] = None
    name: Optional[str] = None
    input: Optional[Dict] = None


class AnthropicUsage(BaseModel):
    input_tokens: int
    output_tokens: int


class AnthropicResponse(BaseModel):
    """Full Anthropic Messages API response spec"""
    id: str
    type: str = "message"
    role: str = "assistant"
    model: str
    content: List[AnthropicResponseContent]
    stop_reason: Optional[str] = None  # "end_turn", "max_tokens", "stop_sequence", "tool_use"
    stop_sequence: Optional[str] = None
    usage: AnthropicUsage
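# A minimal /v1/messages request body (per the Anthropic spec, max_tokens is
# required and system is a top-level field rather than a message role):
#
#     {
#         "model": "claude-3-5-sonnet",
#         "max_tokens": 256,
#         "system": "You are a coding assistant.",
#         "messages": [{"role": "user", "content": "Hello"}]
#     }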
# ============================================================================
# Content Parsing Utilities
# ============================================================================

def extract_text_from_openai_content(content: Union[str, List, None]) -> str:
    """Extract text from OpenAI message content (string or array)"""
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        text_parts = []
        for part in content:
            if isinstance(part, dict):
                if part.get("type") == "text":
                    text_parts.append(part.get("text", ""))
            elif hasattr(part, "type") and part.type == "text":
                text_parts.append(part.text or "")
        return "\n".join(text_parts)
    return str(content)


def extract_text_from_anthropic_content(content: Union[str, List]) -> str:
    """Extract text from Anthropic message content (string or array)"""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        text_parts = []
        for block in content:
            if isinstance(block, dict):
                if block.get("type") == "text":
                    text_parts.append(block.get("text", ""))
            elif hasattr(block, "type") and block.type == "text":
                text_parts.append(block.text or "")
        return "\n".join(text_parts)
    return str(content)


def extract_system_prompt_anthropic(system: Union[str, List[Dict], None]) -> str:
    """Extract system prompt from Anthropic format"""
    if system is None:
        return ""
    if isinstance(system, str):
        return system
    if isinstance(system, list):
        # System can be an array of text blocks
        text_parts = []
        for block in system:
            if isinstance(block, dict) and block.get("type") == "text":
                text_parts.append(block.get("text", ""))
        return "\n".join(text_parts)
    return ""
# ============================================================================
# Message Formatting
# ============================================================================

def format_messages_for_model(
    messages: List[Dict],
    system_prompt: Optional[str] = None,
) -> str:
    """Format messages for the model using its chat template"""
    formatted_messages = []
    if system_prompt:
        formatted_messages.append({"role": "system", "content": system_prompt})
    for msg in messages:
        role = msg.get("role", "user")
        content = msg.get("content", "")
        # Map the tool role to user for compatibility with simple chat templates
        if role == "tool":
            role = "user"
        formatted_messages.append({"role": role, "content": content})

    # Use the tokenizer's chat template if available
    if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template:
        try:
            return tokenizer.apply_chat_template(
                formatted_messages,
                tokenize=False,
                add_generation_prompt=True,
            )
        except Exception:
            pass

    # Fallback: simple tagged format
    prompt = ""
    for msg in formatted_messages:
        role = msg["role"]
        content = msg["content"]
        if role == "system":
            prompt += f"<|system|>\n{content}\n"
        elif role == "user":
            prompt += f"<|user|>\n{content}\n"
        elif role == "assistant":
            prompt += f"<|assistant|>\n{content}\n"
    prompt += "<|assistant|>\n"
    return prompt
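# When no chat template is available, the fallback produces a simple tagged
# transcript; e.g. for a [system, user] conversation:
#
#     <|system|>
#     You are a coding assistant.
#     <|user|>
#     Hi
#     <|assistant|>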
# ============================================================================
# Generation Logic
# ============================================================================

def generate_response(
    prompt: str,
    max_tokens: int = MAX_TOKENS_DEFAULT,
    temperature: float = TEMPERATURE_DEFAULT,
    top_p: float = 0.95,
    top_k: Optional[int] = None,
    stop: Optional[List[str]] = None,
) -> tuple[str, int, int, str, Optional[str]]:
    """
    Generate a response from the model.

    Returns: (response_text, input_tokens, output_tokens, stop_reason, matched_stop)
    where matched_stop is the stop sequence that ended generation, if any.
    """
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096)
    input_length = inputs.input_ids.shape[1]

    # Generation config; sampling parameters are only passed when sampling
    do_sample = temperature > 0
    gen_kwargs = {
        "max_new_tokens": max_tokens,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        gen_kwargs["temperature"] = max(temperature, 0.01)
        gen_kwargs["top_p"] = top_p
        if top_k is not None and top_k > 0:
            gen_kwargs["top_k"] = top_k

    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            **gen_kwargs,
        )

    # Decode only the new tokens
    generated_tokens = outputs[0][input_length:]
    response_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    output_length = len(generated_tokens)

    # Truncate at the first stop sequence, if any, and remember which one fired
    stop_reason = "stop"
    matched_stop = None
    if stop:
        for stop_seq in stop:
            if stop_seq in response_text:
                response_text = response_text.split(stop_seq)[0]
                matched_stop = stop_seq
                break

    # Only report "length" when no stop sequence ended the generation
    if matched_stop is None and output_length >= max_tokens:
        stop_reason = "length"

    return response_text.strip(), input_length, output_length, stop_reason, matched_stop
async def generate_stream(
    prompt: str,
    max_tokens: int = MAX_TOKENS_DEFAULT,
    temperature: float = TEMPERATURE_DEFAULT,
    top_p: float = 0.95,
    top_k: Optional[int] = None,
) -> AsyncGenerator[str, None]:
    """Stream generation for real-time responses"""
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096)
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)

    do_sample = temperature > 0
    gen_kwargs = {
        "max_new_tokens": max_tokens,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
        "streamer": streamer,
    }
    if do_sample:
        gen_kwargs["temperature"] = max(temperature, 0.01)
        gen_kwargs["top_p"] = top_p
        if top_k is not None and top_k > 0:
            gen_kwargs["top_k"] = top_k

    thread = Thread(
        target=lambda: model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            **gen_kwargs,
        )
    )
    thread.start()

    # TextIteratorStreamer blocks while waiting for the next token, so pull
    # from it in a worker thread to avoid stalling the event loop.
    while True:
        text = await asyncio.to_thread(next, streamer, None)
        if text is None:
            break
        yield text
    thread.join()
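# generate_stream is an async generator, so the SSE endpoints below can simply
# iterate it, e.g.:
#
#     async for piece in generate_stream("def add(a, b):", max_tokens=64):
#         print(piece, end="", flush=True)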
# ============================================================================
# FastAPI Application
# ============================================================================

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load model on startup"""
    load_model()
    yield


app = FastAPI(
    title="Free Coding API",
    description="OpenAI & Anthropic compatible API for coding tasks",
    version="1.0.0",
    lifespan=lifespan,
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# ============================================================================
# Authentication
# ============================================================================

def verify_api_key(authorization: Optional[str] = None) -> bool:
    """Simple API key verification; an empty API_KEY disables auth"""
    if not API_KEY:
        return True
    if not authorization:
        return False
    # Accept both "Bearer <key>" and a bare key (Anthropic's x-api-key style)
    if authorization.startswith("Bearer "):
        token = authorization[7:]
    else:
        token = authorization
    return token == API_KEY
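# Example authenticated call (the default key is shown; the host is
# hypothetical):
#
#     curl https://your-space.hf.space/v1/models \
#       -H "Authorization: Bearer sk-free-coding-api"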
# ============================================================================
# OpenAI Compatible Endpoints
# ============================================================================

@app.get("/v1/models")
async def list_models():
    """List available models (OpenAI compatible)"""
    models = [
        OpenAIModelInfo(id=alias, created=int(time.time()))
        for alias in MODEL_ALIASES.keys()
    ]
    return OpenAIModelsResponse(data=models)


@app.get("/v1/models/{model_id}")
async def get_model(model_id: str):
    """Get model info"""
    if model_id in MODEL_ALIASES or model_id == MODEL_ID:
        return OpenAIModelInfo(id=model_id, created=int(time.time()))
    raise HTTPException(status_code=404, detail="Model not found")
@app.post("/v1/chat/completions")
async def openai_chat_completions(
    request: OpenAIChatRequest,
    authorization: Optional[str] = Header(None),
):
    """OpenAI-compatible chat completions endpoint - full spec compliance"""
    if not verify_api_key(authorization):
        raise HTTPException(status_code=401, detail="Invalid API key")

    # Extract messages
    messages = []
    for m in request.messages:
        content = extract_text_from_openai_content(m.content)
        messages.append({"role": m.role, "content": content})

    # Extract the system message if present
    system_prompt = None
    filtered_messages = []
    for msg in messages:
        if msg["role"] == "system":
            system_prompt = msg["content"]
        else:
            filtered_messages.append(msg)

    prompt = format_messages_for_model(filtered_messages, system_prompt=system_prompt)

    # Determine max tokens (max_completion_tokens is the newer parameter)
    max_tokens = request.max_completion_tokens or request.max_tokens or MAX_TOKENS_DEFAULT

    # Explicit None checks so temperature=0 (greedy decoding) is not replaced
    temperature = request.temperature if request.temperature is not None else 1.0
    top_p = request.top_p if request.top_p is not None else 1.0

    # Handle stop sequences
    stop_sequences = None
    if request.stop:
        stop_sequences = [request.stop] if isinstance(request.stop, str) else request.stop

    request_id = f"chatcmpl-{uuid.uuid4().hex[:29]}"
    system_fingerprint = f"fp_{uuid.uuid4().hex[:10]}"
    created_time = int(time.time())

    if request.stream:
        # OpenAI streaming format (note: stop sequences are not applied here)
        prompt_tokens = len(tokenizer(prompt).input_ids)

        async def stream_generator():
            # Counted per streamed text piece, so this is an approximation
            completion_tokens = 0

            # First chunk with the role
            first_chunk = {
                "id": request_id,
                "object": "chat.completion.chunk",
                "created": created_time,
                "model": request.model,
                "system_fingerprint": system_fingerprint,
                "choices": [{
                    "index": 0,
                    "delta": {"role": "assistant", "content": ""},
                    "logprobs": None,
                    "finish_reason": None
                }]
            }
            yield f"data: {json.dumps(first_chunk)}\n\n"

            # Stream content
            async for token in generate_stream(
                prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
            ):
                completion_tokens += 1
                chunk = {
                    "id": request_id,
                    "object": "chat.completion.chunk",
                    "created": created_time,
                    "model": request.model,
                    "system_fingerprint": system_fingerprint,
                    "choices": [{
                        "index": 0,
                        "delta": {"content": token},
                        "logprobs": None,
                        "finish_reason": None
                    }]
                }
                yield f"data: {json.dumps(chunk)}\n\n"

            # Final chunk with finish_reason
            final_chunk = {
                "id": request_id,
                "object": "chat.completion.chunk",
                "created": created_time,
                "model": request.model,
                "system_fingerprint": system_fingerprint,
                "choices": [{
                    "index": 0,
                    "delta": {},
                    "logprobs": None,
                    "finish_reason": "stop"
                }]
            }
            yield f"data: {json.dumps(final_chunk)}\n\n"

            # Usage chunk if requested (completion count is approximate)
            if request.stream_options and request.stream_options.get("include_usage"):
                usage_chunk = {
                    "id": request_id,
                    "object": "chat.completion.chunk",
                    "created": created_time,
                    "model": request.model,
                    "choices": [],
                    "usage": {
                        "prompt_tokens": prompt_tokens,
                        "completion_tokens": completion_tokens,
                        "total_tokens": prompt_tokens + completion_tokens
                    }
                }
                yield f"data: {json.dumps(usage_chunk)}\n\n"

            yield "data: [DONE]\n\n"

        return StreamingResponse(
            stream_generator(),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no"
            }
        )

    # Non-streaming response
    response_text, input_tokens, output_tokens, stop_reason, _ = generate_response(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stop=stop_sequences,
    )

    # Map the stop reason to OpenAI's finish_reason values
    openai_finish_reason = "length" if stop_reason == "length" else "stop"

    return OpenAIChatResponse(
        id=request_id,
        created=created_time,
        model=request.model,
        system_fingerprint=system_fingerprint,
        choices=[
            OpenAIChoice(
                index=0,
                message=OpenAIChoiceMessage(role="assistant", content=response_text),
                finish_reason=openai_finish_reason,
                logprobs=None
            )
        ],
        usage=OpenAIUsage(
            prompt_tokens=input_tokens,
            completion_tokens=output_tokens,
            total_tokens=input_tokens + output_tokens
        )
    )
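# Example streaming call (hypothetical host); the response arrives as SSE
# "data: {...}" chunks terminated by "data: [DONE]":
#
#     curl -N https://your-space.hf.space/v1/chat/completions \
#       -H "Authorization: Bearer sk-free-coding-api" \
#       -H "Content-Type: application/json" \
#       -d '{"model": "gpt-4o", "stream": true,
#            "messages": [{"role": "user", "content": "Hi"}]}'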
# ============================================================================
# Anthropic Compatible Endpoints
# ============================================================================

@app.post("/v1/messages")
async def anthropic_messages(
    request: AnthropicRequest,
    authorization: Optional[str] = Header(None),
    x_api_key: Optional[str] = Header(None, alias="x-api-key"),
    anthropic_version: Optional[str] = Header(None, alias="anthropic-version"),
):
    """Anthropic-compatible messages endpoint - full spec compliance"""
    # Anthropic clients send the key in the x-api-key header
    auth_key = x_api_key or authorization
    if not verify_api_key(auth_key):
        raise HTTPException(status_code=401, detail="Invalid API key")

    # Extract messages
    messages = []
    for m in request.messages:
        content = extract_text_from_anthropic_content(m.content)
        messages.append({"role": m.role, "content": content})

    # Extract the system prompt
    system_prompt = extract_system_prompt_anthropic(request.system)
    prompt = format_messages_for_model(messages, system_prompt=system_prompt)

    # Explicit None checks so temperature=0 (greedy decoding) is not replaced
    temperature = request.temperature if request.temperature is not None else 1.0
    top_p = request.top_p if request.top_p is not None else 0.999

    request_id = f"msg_{uuid.uuid4().hex[:24]}"

    if request.stream:
        # Anthropic streaming format (Server-Sent Events)
        input_tokens = len(tokenizer(prompt).input_ids)

        async def stream_generator():
            # 1. message_start event
            message_start = {
                "type": "message_start",
                "message": {
                    "id": request_id,
                    "type": "message",
                    "role": "assistant",
                    "model": request.model,
                    "content": [],
                    "stop_reason": None,
                    "stop_sequence": None,
                    "usage": {
                        "input_tokens": input_tokens,
                        "output_tokens": 0
                    }
                }
            }
            yield f"event: message_start\ndata: {json.dumps(message_start)}\n\n"

            # 2. content_block_start event
            content_block_start = {
                "type": "content_block_start",
                "index": 0,
                "content_block": {
                    "type": "text",
                    "text": ""
                }
            }
            yield f"event: content_block_start\ndata: {json.dumps(content_block_start)}\n\n"

            # 3. Stream content_block_delta events
            output_tokens = 0  # counted per streamed piece, so approximate
            async for token in generate_stream(
                prompt,
                max_tokens=request.max_tokens,
                temperature=temperature,
                top_p=top_p,
                top_k=request.top_k,
            ):
                output_tokens += 1
                delta = {
                    "type": "content_block_delta",
                    "index": 0,
                    "delta": {
                        "type": "text_delta",
                        "text": token
                    }
                }
                yield f"event: content_block_delta\ndata: {json.dumps(delta)}\n\n"

            # 4. content_block_stop event
            content_block_stop = {
                "type": "content_block_stop",
                "index": 0
            }
            yield f"event: content_block_stop\ndata: {json.dumps(content_block_stop)}\n\n"

            # 5. message_delta event
            message_delta = {
                "type": "message_delta",
                "delta": {
                    "stop_reason": "end_turn",
                    "stop_sequence": None
                },
                "usage": {
                    "output_tokens": output_tokens
                }
            }
            yield f"event: message_delta\ndata: {json.dumps(message_delta)}\n\n"

            # 6. message_stop event
            message_stop = {"type": "message_stop"}
            yield f"event: message_stop\ndata: {json.dumps(message_stop)}\n\n"

        return StreamingResponse(
            stream_generator(),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no"
            }
        )

    # Non-streaming response
    response_text, input_tokens, output_tokens, stop_reason, matched_stop = generate_response(
        prompt,
        max_tokens=request.max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=request.top_k,
        stop=request.stop_sequences,
    )

    # Map the stop reason to Anthropic's values; generate_response reports which
    # stop sequence fired (the sequence itself is stripped from the text)
    if stop_reason == "length":
        anthropic_stop_reason = "max_tokens"
    elif matched_stop is not None:
        anthropic_stop_reason = "stop_sequence"
    else:
        anthropic_stop_reason = "end_turn"

    return AnthropicResponse(
        id=request_id,
        model=request.model,
        content=[AnthropicResponseContent(type="text", text=response_text)],
        stop_reason=anthropic_stop_reason,
        stop_sequence=matched_stop,
        usage=AnthropicUsage(
            input_tokens=input_tokens,
            output_tokens=output_tokens
        )
    )
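# Example Anthropic-style call (hypothetical host; note the x-api-key and
# anthropic-version headers instead of a Bearer token):
#
#     curl https://your-space.hf.space/v1/messages \
#       -H "x-api-key: sk-free-coding-api" \
#       -H "anthropic-version: 2023-06-01" \
#       -H "Content-Type: application/json" \
#       -d '{"model": "claude-3-5-sonnet", "max_tokens": 256,
#            "messages": [{"role": "user", "content": "Hi"}]}'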
# ============================================================================
# Health & Info Endpoints
# ============================================================================

@app.get("/")
async def root():
    return {
        "name": "Free Coding API",
        "version": "1.0.0",
        "model": MODEL_ID,
        "compatibility": {
            "openai": "v1 Chat Completions API",
            "anthropic": "Messages API (2023-06-01)"
        },
        "endpoints": {
            "openai_chat": "/v1/chat/completions",
            "anthropic_messages": "/v1/messages",
            "models": "/v1/models"
        },
        "docs": "/docs"
    }


@app.get("/health")
async def health():
    return {
        "status": "healthy",
        "model_loaded": model is not None,
        "model_id": MODEL_ID
    }
# ============================================================================
# Main Entry Point
# ============================================================================

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)