""" HuggingFace Spaces - OpenAI & Anthropic Compatible Coding API A free, skills-only API endpoint for coding tasks (like Codex/Claude Code) Author: Matrix Agent Features: - Full OpenAI API compatibility (/v1/chat/completions) - Full Anthropic API compatibility (/v1/messages) - Optimized for coding tasks - Runs on free HF Spaces (2 vCPU, 16GB RAM) API Specifications verified against: - OpenAI: https://platform.openai.com/docs/api-reference/chat/create - Anthropic: https://docs.anthropic.com/en/api/messages """ import os import time import uuid import json import asyncio from typing import List, Optional, Union, Dict, Any, AsyncGenerator from contextlib import asynccontextmanager import torch from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer from threading import Thread from fastapi import FastAPI, HTTPException, Header, Request, Response from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import StreamingResponse, JSONResponse from pydantic import BaseModel, Field # ============================================================================ # Configuration # ============================================================================ MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen2.5-Coder-1.5B-Instruct") ANTHROPIC_VERSION = "2023-06-01" # Standard Anthropic API version MODEL_ALIASES = { # OpenAI-style model names -> actual model "gpt-4": MODEL_ID, "gpt-4-turbo": MODEL_ID, "gpt-4o": MODEL_ID, "gpt-4o-mini": MODEL_ID, "gpt-3.5-turbo": MODEL_ID, "codex": MODEL_ID, "code-davinci-002": MODEL_ID, "o1": MODEL_ID, "o1-mini": MODEL_ID, # Anthropic-style model names "claude-3-opus-20240229": MODEL_ID, "claude-3-sonnet-20240229": MODEL_ID, "claude-3-haiku-20240307": MODEL_ID, "claude-3-5-sonnet-20241022": MODEL_ID, "claude-3-5-haiku-20241022": MODEL_ID, "claude-3-opus": MODEL_ID, "claude-3-sonnet": MODEL_ID, "claude-3-haiku": MODEL_ID, "claude-3-5-sonnet": MODEL_ID, "claude-code": MODEL_ID, } API_KEY = os.getenv("API_KEY", "sk-free-coding-api") MAX_TOKENS_DEFAULT = 2048 TEMPERATURE_DEFAULT = 0.7 # ============================================================================ # Global Model Instance # ============================================================================ model = None tokenizer = None def load_model(): """Load model with CPU optimization""" global model, tokenizer print(f"🚀 Loading model: {MODEL_ID}") print(f"📊 Device: CPU (Free HF Spaces)") tokenizer = AutoTokenizer.from_pretrained( MODEL_ID, trust_remote_code=True, padding_side="left" ) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token # Load with CPU optimizations for 16GB RAM model = AutoModelForCausalLM.from_pretrained( MODEL_ID, torch_dtype=torch.float32, device_map="cpu", trust_remote_code=True, low_cpu_mem_usage=True, ) model.eval() print("✅ Model loaded successfully!") return model, tokenizer # ============================================================================ # Pydantic Models - OpenAI Compatible (Full Spec) # ============================================================================ class OpenAIContentPart(BaseModel): """Content part for multimodal messages""" type: str # "text", "image_url" text: Optional[str] = None image_url: Optional[Dict[str, str]] = None class OpenAIMessage(BaseModel): """OpenAI message format - supports both string and array content""" role: str # "system", "user", "assistant", "tool" content: Optional[Union[str, List[OpenAIContentPart]]] = None name: Optional[str] = None tool_calls: Optional[List[Dict]] = None tool_call_id: Optional[str] = None class OpenAIResponseFormat(BaseModel): """Response format specification""" type: str = "text" # "text", "json_object", "json_schema" json_schema: Optional[Dict] = None class OpenAIChatRequest(BaseModel): """Full OpenAI Chat Completions request spec""" model: str messages: List[OpenAIMessage] # Generation parameters temperature: Optional[float] = Field(default=1.0, ge=0, le=2) top_p: Optional[float] = Field(default=1.0, ge=0, le=1) n: Optional[int] = Field(default=1, ge=1, le=10) stream: Optional[bool] = False stop: Optional[Union[str, List[str]]] = None max_tokens: Optional[int] = None max_completion_tokens: Optional[int] = None # Newer parameter presence_penalty: Optional[float] = Field(default=0, ge=-2, le=2) frequency_penalty: Optional[float] = Field(default=0, ge=-2, le=2) logit_bias: Optional[Dict[str, float]] = None logprobs: Optional[bool] = False top_logprobs: Optional[int] = None # Additional parameters user: Optional[str] = None seed: Optional[int] = None tools: Optional[List[Dict]] = None tool_choice: Optional[Union[str, Dict]] = None response_format: Optional[OpenAIResponseFormat] = None # Stream options stream_options: Optional[Dict] = None class OpenAIChoiceMessage(BaseModel): role: str = "assistant" content: Optional[str] = None tool_calls: Optional[List[Dict]] = None class OpenAIChoice(BaseModel): index: int message: OpenAIChoiceMessage finish_reason: Optional[str] = None # "stop", "length", "tool_calls", "content_filter" logprobs: Optional[Dict] = None class OpenAIStreamChoice(BaseModel): index: int delta: Dict finish_reason: Optional[str] = None logprobs: Optional[Dict] = None class OpenAIUsage(BaseModel): prompt_tokens: int completion_tokens: int total_tokens: int prompt_tokens_details: Optional[Dict] = None completion_tokens_details: Optional[Dict] = None class OpenAIChatResponse(BaseModel): """Full OpenAI Chat Completions response spec""" id: str object: str = "chat.completion" created: int model: str choices: List[OpenAIChoice] usage: Optional[OpenAIUsage] = None system_fingerprint: Optional[str] = None service_tier: Optional[str] = None class OpenAIStreamResponse(BaseModel): id: str object: str = "chat.completion.chunk" created: int model: str choices: List[OpenAIStreamChoice] system_fingerprint: Optional[str] = None class OpenAIModelInfo(BaseModel): id: str object: str = "model" created: int owned_by: str = "hf-spaces" class OpenAIModelsResponse(BaseModel): object: str = "list" data: List[OpenAIModelInfo] # ============================================================================ # Pydantic Models - Anthropic Compatible (Full Spec) # ============================================================================ class AnthropicTextBlock(BaseModel): """Text content block""" type: str = "text" text: str class AnthropicImageSource(BaseModel): """Image source for vision""" type: str = "base64" media_type: str # "image/jpeg", "image/png", "image/webp", "image/gif" data: str class AnthropicImageBlock(BaseModel): """Image content block""" type: str = "image" source: AnthropicImageSource class AnthropicToolUseBlock(BaseModel): """Tool use content block""" type: str = "tool_use" id: str name: str input: Dict class AnthropicToolResultBlock(BaseModel): """Tool result content block""" type: str = "tool_result" tool_use_id: str content: Union[str, List[Dict]] # Union type for all content blocks AnthropicContentBlock = Union[AnthropicTextBlock, AnthropicImageBlock, Dict] class AnthropicMessage(BaseModel): """Anthropic message format""" role: str # "user", "assistant" content: Union[str, List[AnthropicContentBlock]] class AnthropicTool(BaseModel): """Tool definition""" name: str description: Optional[str] = None input_schema: Dict class AnthropicToolChoice(BaseModel): """Tool choice specification""" type: str # "auto", "any", "tool" name: Optional[str] = None class AnthropicRequest(BaseModel): """Full Anthropic Messages API request spec""" model: str messages: List[AnthropicMessage] max_tokens: int # Required in Anthropic API # Optional parameters system: Optional[Union[str, List[Dict]]] = None temperature: Optional[float] = Field(default=1.0, ge=0, le=1) top_p: Optional[float] = Field(default=0.999, ge=0, le=1) top_k: Optional[int] = None stream: Optional[bool] = False stop_sequences: Optional[List[str]] = None # Tool use tools: Optional[List[AnthropicTool]] = None tool_choice: Optional[AnthropicToolChoice] = None # Metadata metadata: Optional[Dict] = None class AnthropicResponseContent(BaseModel): type: str = "text" text: Optional[str] = None # For tool_use id: Optional[str] = None name: Optional[str] = None input: Optional[Dict] = None class AnthropicUsage(BaseModel): input_tokens: int output_tokens: int class AnthropicResponse(BaseModel): """Full Anthropic Messages API response spec""" id: str type: str = "message" role: str = "assistant" model: str content: List[AnthropicResponseContent] stop_reason: Optional[str] = None # "end_turn", "max_tokens", "stop_sequence", "tool_use" stop_sequence: Optional[str] = None usage: AnthropicUsage # ============================================================================ # Content Parsing Utilities # ============================================================================ def extract_text_from_openai_content(content: Union[str, List, None]) -> str: """Extract text from OpenAI message content (string or array)""" if content is None: return "" if isinstance(content, str): return content if isinstance(content, list): text_parts = [] for part in content: if isinstance(part, dict): if part.get("type") == "text": text_parts.append(part.get("text", "")) elif hasattr(part, "type") and part.type == "text": text_parts.append(part.text or "") return "\n".join(text_parts) return str(content) def extract_text_from_anthropic_content(content: Union[str, List]) -> str: """Extract text from Anthropic message content (string or array)""" if isinstance(content, str): return content if isinstance(content, list): text_parts = [] for block in content: if isinstance(block, dict): if block.get("type") == "text": text_parts.append(block.get("text", "")) elif hasattr(block, "type") and block.type == "text": text_parts.append(block.text or "") return "\n".join(text_parts) return str(content) def extract_system_prompt_anthropic(system: Union[str, List[Dict], None]) -> str: """Extract system prompt from Anthropic format""" if system is None: return "" if isinstance(system, str): return system if isinstance(system, list): # System can be array of text blocks text_parts = [] for block in system: if isinstance(block, dict) and block.get("type") == "text": text_parts.append(block.get("text", "")) return "\n".join(text_parts) return "" # ============================================================================ # Message Formatting # ============================================================================ def format_messages_for_model( messages: List[Dict], system_prompt: Optional[str] = None ) -> str: """Format messages for the model using chat template""" formatted_messages = [] if system_prompt: formatted_messages.append({"role": "system", "content": system_prompt}) for msg in messages: role = msg.get("role", "user") content = msg.get("content", "") # Map tool role to assistant for compatibility if role == "tool": role = "user" formatted_messages.append({"role": role, "content": content}) # Use tokenizer's chat template if available if hasattr(tokenizer, 'apply_chat_template') and tokenizer.chat_template: try: return tokenizer.apply_chat_template( formatted_messages, tokenize=False, add_generation_prompt=True ) except Exception: pass # Fallback: Simple format prompt = "" for msg in formatted_messages: role = msg["role"] content = msg["content"] if role == "system": prompt += f"<|system|>\n{content}\n" elif role == "user": prompt += f"<|user|>\n{content}\n" elif role == "assistant": prompt += f"<|assistant|>\n{content}\n" prompt += "<|assistant|>\n" return prompt # ============================================================================ # Generation Logic # ============================================================================ def generate_response( prompt: str, max_tokens: int = MAX_TOKENS_DEFAULT, temperature: float = TEMPERATURE_DEFAULT, top_p: float = 0.95, top_k: Optional[int] = None, stop: Optional[List[str]] = None, ) -> tuple[str, int, int, str]: """ Generate response from the model Returns: (response_text, input_tokens, output_tokens, stop_reason) """ inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096) input_length = inputs.input_ids.shape[1] # Generation config gen_kwargs = { "max_new_tokens": max_tokens, "temperature": max(temperature, 0.01), "top_p": top_p, "do_sample": temperature > 0, "pad_token_id": tokenizer.pad_token_id, "eos_token_id": tokenizer.eos_token_id, } if top_k is not None and top_k > 0: gen_kwargs["top_k"] = top_k with torch.no_grad(): outputs = model.generate(inputs.input_ids, **gen_kwargs) # Decode only the new tokens generated_tokens = outputs[0][input_length:] response_text = tokenizer.decode(generated_tokens, skip_special_tokens=True) output_length = len(generated_tokens) stop_reason = "stop" # Default # Handle stop sequences if stop: for stop_seq in stop: if stop_seq in response_text: response_text = response_text.split(stop_seq)[0] stop_reason = "stop" break # Check if max tokens reached if output_length >= max_tokens: stop_reason = "length" return response_text.strip(), input_length, output_length, stop_reason async def generate_stream( prompt: str, max_tokens: int = MAX_TOKENS_DEFAULT, temperature: float = TEMPERATURE_DEFAULT, top_p: float = 0.95, top_k: Optional[int] = None, ) -> AsyncGenerator[str, None]: """Stream generation for real-time responses""" inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096) streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True) gen_kwargs = { "max_new_tokens": max_tokens, "temperature": max(temperature, 0.01), "top_p": top_p, "do_sample": temperature > 0, "pad_token_id": tokenizer.pad_token_id, "eos_token_id": tokenizer.eos_token_id, "streamer": streamer, } if top_k is not None and top_k > 0: gen_kwargs["top_k"] = top_k thread = Thread(target=lambda: model.generate(inputs.input_ids, **gen_kwargs)) thread.start() for text in streamer: yield text thread.join() # ============================================================================ # FastAPI Application # ============================================================================ @asynccontextmanager async def lifespan(app: FastAPI): """Load model on startup""" load_model() yield app = FastAPI( title="Free Coding API", description="OpenAI & Anthropic compatible API for coding tasks", version="1.0.0", lifespan=lifespan ) app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"], ) # ============================================================================ # Authentication # ============================================================================ def verify_api_key(authorization: Optional[str] = None) -> bool: """Simple API key verification""" if not API_KEY or API_KEY == "": return True if not authorization: return False if authorization.startswith("Bearer "): token = authorization[7:] else: token = authorization return token == API_KEY # ============================================================================ # OpenAI Compatible Endpoints # ============================================================================ @app.get("/v1/models") async def list_models(): """List available models (OpenAI compatible)""" models = [ OpenAIModelInfo(id=alias, created=int(time.time())) for alias in MODEL_ALIASES.keys() ] return OpenAIModelsResponse(data=models) @app.get("/v1/models/{model_id}") async def get_model(model_id: str): """Get model info""" if model_id in MODEL_ALIASES or model_id == MODEL_ID: return OpenAIModelInfo(id=model_id, created=int(time.time())) raise HTTPException(status_code=404, detail="Model not found") @app.post("/v1/chat/completions") async def openai_chat_completions( request: OpenAIChatRequest, authorization: Optional[str] = Header(None), ): """OpenAI-compatible chat completions endpoint - Full spec compliance""" if not verify_api_key(authorization): raise HTTPException(status_code=401, detail="Invalid API key") # Extract messages messages = [] for m in request.messages: content = extract_text_from_openai_content(m.content) messages.append({"role": m.role, "content": content}) # Extract system message if present system_prompt = None filtered_messages = [] for msg in messages: if msg["role"] == "system": system_prompt = msg["content"] else: filtered_messages.append(msg) prompt = format_messages_for_model(filtered_messages, system_prompt=system_prompt) # Determine max tokens max_tokens = request.max_completion_tokens or request.max_tokens or MAX_TOKENS_DEFAULT # Handle stop sequences stop_sequences = None if request.stop: stop_sequences = [request.stop] if isinstance(request.stop, str) else request.stop request_id = f"chatcmpl-{uuid.uuid4().hex[:29]}" system_fingerprint = f"fp_{uuid.uuid4().hex[:10]}" created_time = int(time.time()) if request.stream: # OpenAI Streaming format async def stream_generator(): # First chunk with role first_chunk = { "id": request_id, "object": "chat.completion.chunk", "created": created_time, "model": request.model, "system_fingerprint": system_fingerprint, "choices": [{ "index": 0, "delta": {"role": "assistant", "content": ""}, "logprobs": None, "finish_reason": None }] } yield f"data: {json.dumps(first_chunk)}\n\n" # Stream content async for token in generate_stream( prompt, max_tokens=max_tokens, temperature=request.temperature or 1.0, top_p=request.top_p or 1.0, ): chunk = { "id": request_id, "object": "chat.completion.chunk", "created": created_time, "model": request.model, "system_fingerprint": system_fingerprint, "choices": [{ "index": 0, "delta": {"content": token}, "logprobs": None, "finish_reason": None }] } yield f"data: {json.dumps(chunk)}\n\n" # Final chunk with finish_reason final_chunk = { "id": request_id, "object": "chat.completion.chunk", "created": created_time, "model": request.model, "system_fingerprint": system_fingerprint, "choices": [{ "index": 0, "delta": {}, "logprobs": None, "finish_reason": "stop" }] } yield f"data: {json.dumps(final_chunk)}\n\n" # Usage chunk if requested if request.stream_options and request.stream_options.get("include_usage"): usage_chunk = { "id": request_id, "object": "chat.completion.chunk", "created": created_time, "model": request.model, "choices": [], "usage": { "prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0 } } yield f"data: {json.dumps(usage_chunk)}\n\n" yield "data: [DONE]\n\n" return StreamingResponse( stream_generator(), media_type="text/event-stream", headers={ "Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no" } ) # Non-streaming response response_text, input_tokens, output_tokens, stop_reason = generate_response( prompt, max_tokens=max_tokens, temperature=request.temperature or 1.0, top_p=request.top_p or 1.0, stop=stop_sequences, ) # Map stop reason to OpenAI format openai_finish_reason = "stop" if stop_reason == "stop" else "length" return OpenAIChatResponse( id=request_id, created=created_time, model=request.model, system_fingerprint=system_fingerprint, choices=[ OpenAIChoice( index=0, message=OpenAIChoiceMessage(role="assistant", content=response_text), finish_reason=openai_finish_reason, logprobs=None ) ], usage=OpenAIUsage( prompt_tokens=input_tokens, completion_tokens=output_tokens, total_tokens=input_tokens + output_tokens ) ) # ============================================================================ # Anthropic Compatible Endpoints # ============================================================================ @app.post("/v1/messages") async def anthropic_messages( request: AnthropicRequest, authorization: Optional[str] = Header(None), x_api_key: Optional[str] = Header(None, alias="x-api-key"), anthropic_version: Optional[str] = Header(None, alias="anthropic-version"), ): """Anthropic-compatible messages endpoint - Full spec compliance""" # Anthropic uses x-api-key header auth_key = x_api_key or authorization if not verify_api_key(auth_key): raise HTTPException(status_code=401, detail="Invalid API key") # Extract messages messages = [] for m in request.messages: content = extract_text_from_anthropic_content(m.content) messages.append({"role": m.role, "content": content}) # Extract system prompt system_prompt = extract_system_prompt_anthropic(request.system) prompt = format_messages_for_model(messages, system_prompt=system_prompt) request_id = f"msg_{uuid.uuid4().hex[:24]}" if request.stream: # Anthropic streaming format (Server-Sent Events) async def stream_generator(): input_tokens = 0 # Would be calculated from prompt # 1. message_start event message_start = { "type": "message_start", "message": { "id": request_id, "type": "message", "role": "assistant", "model": request.model, "content": [], "stop_reason": None, "stop_sequence": None, "usage": { "input_tokens": input_tokens, "output_tokens": 0 } } } yield f"event: message_start\ndata: {json.dumps(message_start)}\n\n" # 2. content_block_start event content_block_start = { "type": "content_block_start", "index": 0, "content_block": { "type": "text", "text": "" } } yield f"event: content_block_start\ndata: {json.dumps(content_block_start)}\n\n" # 3. Stream content_block_delta events output_tokens = 0 async for token in generate_stream( prompt, max_tokens=request.max_tokens, temperature=request.temperature or 1.0, top_p=request.top_p or 0.999, top_k=request.top_k, ): output_tokens += 1 delta = { "type": "content_block_delta", "index": 0, "delta": { "type": "text_delta", "text": token } } yield f"event: content_block_delta\ndata: {json.dumps(delta)}\n\n" # 4. content_block_stop event content_block_stop = { "type": "content_block_stop", "index": 0 } yield f"event: content_block_stop\ndata: {json.dumps(content_block_stop)}\n\n" # 5. message_delta event message_delta = { "type": "message_delta", "delta": { "stop_reason": "end_turn", "stop_sequence": None }, "usage": { "output_tokens": output_tokens } } yield f"event: message_delta\ndata: {json.dumps(message_delta)}\n\n" # 6. message_stop event message_stop = {"type": "message_stop"} yield f"event: message_stop\ndata: {json.dumps(message_stop)}\n\n" return StreamingResponse( stream_generator(), media_type="text/event-stream", headers={ "Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no" } ) # Non-streaming response response_text, input_tokens, output_tokens, stop_reason = generate_response( prompt, max_tokens=request.max_tokens, temperature=request.temperature or 1.0, top_p=request.top_p or 0.999, top_k=request.top_k, stop=request.stop_sequences, ) # Map stop reason to Anthropic format anthropic_stop_reason = "end_turn" stop_sequence_used = None if stop_reason == "length": anthropic_stop_reason = "max_tokens" elif stop_reason == "stop" and request.stop_sequences: for seq in request.stop_sequences: if seq in response_text: anthropic_stop_reason = "stop_sequence" stop_sequence_used = seq break return AnthropicResponse( id=request_id, model=request.model, content=[AnthropicResponseContent(type="text", text=response_text)], stop_reason=anthropic_stop_reason, stop_sequence=stop_sequence_used, usage=AnthropicUsage( input_tokens=input_tokens, output_tokens=output_tokens ) ) # ============================================================================ # Health & Info Endpoints # ============================================================================ @app.get("/") async def root(): return { "name": "Free Coding API", "version": "1.0.0", "model": MODEL_ID, "compatibility": { "openai": "v1 Chat Completions API", "anthropic": "Messages API (2023-06-01)" }, "endpoints": { "openai_chat": "/v1/chat/completions", "anthropic_messages": "/v1/messages", "models": "/v1/models" }, "docs": "/docs" } @app.get("/health") async def health(): return { "status": "healthy", "model_loaded": model is not None, "model_id": MODEL_ID } # ============================================================================ # Main Entry Point # ============================================================================ if __name__ == "__main__": import uvicorn uvicorn.run(app, host="0.0.0.0", port=7860)