""" HuggingFace Spaces - OpenAI & Anthropic Compatible Coding API A free, skills-only API endpoint for coding tasks (like Codex/Claude Code) Author: Matrix Agent Features: - Full OpenAI API compatibility (/v1/chat/completions) - Full Anthropic API compatibility (/v1/messages) - Prefill Response Support (assistant message prefix for output control) - Thinking/Reasoning Content Block Support - Optimized for coding tasks - Runs on free HF Spaces (2 vCPU, 16GB RAM) API Specifications verified against: - OpenAI: https://platform.openai.com/docs/api-reference/chat/create - Anthropic: https://docs.anthropic.com/en/api/messages - Prefill: https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/prefill-claudes-response - MiniMax Anthropic: https://platform.minimax.io/docs/api-reference/text-anthropic-api """ import os import time import uuid import json import asyncio from typing import List, Optional, Union, Dict, Any, AsyncGenerator from contextlib import asynccontextmanager import torch from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer from threading import Thread from fastapi import FastAPI, HTTPException, Header, Request, Response from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import StreamingResponse, JSONResponse from pydantic import BaseModel, Field # ============================================================================ # Configuration # ============================================================================ MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen2.5-Coder-1.5B-Instruct") ANTHROPIC_VERSION = "2023-06-01" MODEL_ALIASES = { # OpenAI-style model names "gpt-4": MODEL_ID, "gpt-4-turbo": MODEL_ID, "gpt-4o": MODEL_ID, "gpt-4o-mini": MODEL_ID, "gpt-3.5-turbo": MODEL_ID, "codex": MODEL_ID, "code-davinci-002": MODEL_ID, "o1": MODEL_ID, "o1-mini": MODEL_ID, # Anthropic-style model names "claude-3-opus-20240229": MODEL_ID, "claude-3-sonnet-20240229": MODEL_ID, "claude-3-haiku-20240307": MODEL_ID, "claude-3-5-sonnet-20241022": MODEL_ID, "claude-3-5-haiku-20241022": MODEL_ID, "claude-3-opus": MODEL_ID, "claude-3-sonnet": MODEL_ID, "claude-3-haiku": MODEL_ID, "claude-3-5-sonnet": MODEL_ID, "claude-code": MODEL_ID, } API_KEY = os.getenv("API_KEY", "sk-free-coding-api") MAX_TOKENS_DEFAULT = 2048 TEMPERATURE_DEFAULT = 0.7 # ============================================================================ # Global Model Instance # ============================================================================ model = None tokenizer = None def load_model(): """Load model with CPU optimization""" global model, tokenizer print(f"🚀 Loading model: {MODEL_ID}") print(f"📊 Device: CPU (Free HF Spaces)") tokenizer = AutoTokenizer.from_pretrained( MODEL_ID, trust_remote_code=True, padding_side="left" ) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token model = AutoModelForCausalLM.from_pretrained( MODEL_ID, torch_dtype=torch.float32, device_map="cpu", trust_remote_code=True, low_cpu_mem_usage=True, ) model.eval() print("✅ Model loaded successfully!") return model, tokenizer # ============================================================================ # Pydantic Models - OpenAI Compatible # ============================================================================ class OpenAIContentPart(BaseModel): type: str text: Optional[str] = None image_url: Optional[Dict[str, str]] = None class OpenAIMessage(BaseModel): role: str content: Optional[Union[str, List[OpenAIContentPart]]] = None name: Optional[str] = None tool_calls: Optional[List[Dict]] = None tool_call_id: Optional[str] = None class OpenAIResponseFormat(BaseModel): type: str = "text" json_schema: Optional[Dict] = None class OpenAIChatRequest(BaseModel): model: str messages: List[OpenAIMessage] temperature: Optional[float] = Field(default=1.0, ge=0, le=2) top_p: Optional[float] = Field(default=1.0, ge=0, le=1) n: Optional[int] = Field(default=1, ge=1, le=10) stream: Optional[bool] = False stop: Optional[Union[str, List[str]]] = None max_tokens: Optional[int] = None max_completion_tokens: Optional[int] = None presence_penalty: Optional[float] = Field(default=0, ge=-2, le=2) frequency_penalty: Optional[float] = Field(default=0, ge=-2, le=2) logit_bias: Optional[Dict[str, float]] = None logprobs: Optional[bool] = False top_logprobs: Optional[int] = None user: Optional[str] = None seed: Optional[int] = None tools: Optional[List[Dict]] = None tool_choice: Optional[Union[str, Dict]] = None response_format: Optional[OpenAIResponseFormat] = None stream_options: Optional[Dict] = None class OpenAIChoiceMessage(BaseModel): role: str = "assistant" content: Optional[str] = None tool_calls: Optional[List[Dict]] = None class OpenAIChoice(BaseModel): index: int message: OpenAIChoiceMessage finish_reason: Optional[str] = None logprobs: Optional[Dict] = None class OpenAIStreamChoice(BaseModel): index: int delta: Dict finish_reason: Optional[str] = None logprobs: Optional[Dict] = None class OpenAIUsage(BaseModel): prompt_tokens: int completion_tokens: int total_tokens: int prompt_tokens_details: Optional[Dict] = None completion_tokens_details: Optional[Dict] = None class OpenAIChatResponse(BaseModel): id: str object: str = "chat.completion" created: int model: str choices: List[OpenAIChoice] usage: Optional[OpenAIUsage] = None system_fingerprint: Optional[str] = None service_tier: Optional[str] = None class OpenAIModelInfo(BaseModel): id: str object: str = "model" created: int owned_by: str = "hf-spaces" class OpenAIModelsResponse(BaseModel): object: str = "list" data: List[OpenAIModelInfo] # ============================================================================ # Pydantic Models - Anthropic Compatible (with Thinking & Prefill support) # ============================================================================ class AnthropicTextBlock(BaseModel): type: str = "text" text: str class AnthropicImageSource(BaseModel): type: str = "base64" media_type: str data: str class AnthropicImageBlock(BaseModel): type: str = "image" source: AnthropicImageSource class AnthropicThinkingBlock(BaseModel): """Thinking/reasoning content block""" type: str = "thinking" thinking: str AnthropicContentBlock = Union[AnthropicTextBlock, AnthropicImageBlock, AnthropicThinkingBlock, Dict] class AnthropicMessage(BaseModel): role: str # "user", "assistant" content: Union[str, List[AnthropicContentBlock]] class AnthropicTool(BaseModel): name: str description: Optional[str] = None input_schema: Dict class AnthropicToolChoice(BaseModel): type: str name: Optional[str] = None class AnthropicThinkingConfig(BaseModel): """Configuration for thinking/reasoning mode""" type: str = "enabled" # "enabled" or "disabled" budget_tokens: Optional[int] = None # Token budget for thinking class AnthropicRequest(BaseModel): """Full Anthropic Messages API request with thinking & prefill support""" model: str messages: List[AnthropicMessage] max_tokens: int # Optional parameters system: Optional[Union[str, List[Dict]]] = None temperature: Optional[float] = Field(default=1.0, ge=0, le=1) top_p: Optional[float] = Field(default=0.999, ge=0, le=1) top_k: Optional[int] = None stream: Optional[bool] = False stop_sequences: Optional[List[str]] = None # Tool use tools: Optional[List[AnthropicTool]] = None tool_choice: Optional[AnthropicToolChoice] = None # Thinking/reasoning support thinking: Optional[AnthropicThinkingConfig] = None # Metadata metadata: Optional[Dict] = None class AnthropicResponseContent(BaseModel): type: str = "text" text: Optional[str] = None # For thinking blocks thinking: Optional[str] = None # For tool_use id: Optional[str] = None name: Optional[str] = None input: Optional[Dict] = None class AnthropicUsage(BaseModel): input_tokens: int output_tokens: int class AnthropicResponse(BaseModel): id: str type: str = "message" role: str = "assistant" model: str content: List[AnthropicResponseContent] stop_reason: Optional[str] = None stop_sequence: Optional[str] = None usage: AnthropicUsage # ============================================================================ # Content Parsing Utilities # ============================================================================ def extract_text_from_openai_content(content: Union[str, List, None]) -> str: if content is None: return "" if isinstance(content, str): return content if isinstance(content, list): text_parts = [] for part in content: if isinstance(part, dict): if part.get("type") == "text": text_parts.append(part.get("text", "")) elif hasattr(part, "type") and part.type == "text": text_parts.append(part.text or "") return "\n".join(text_parts) return str(content) def extract_text_from_anthropic_content(content: Union[str, List]) -> str: if isinstance(content, str): return content if isinstance(content, list): text_parts = [] for block in content: if isinstance(block, dict): if block.get("type") == "text": text_parts.append(block.get("text", "")) elif block.get("type") == "thinking": pass # Skip thinking blocks in extraction elif hasattr(block, "type"): if block.type == "text": text_parts.append(block.text or "") return "\n".join(text_parts) return str(content) def extract_system_prompt_anthropic(system: Union[str, List[Dict], None]) -> str: if system is None: return "" if isinstance(system, str): return system if isinstance(system, list): text_parts = [] for block in system: if isinstance(block, dict) and block.get("type") == "text": text_parts.append(block.get("text", "")) return "\n".join(text_parts) return "" def extract_prefill_from_messages(messages: List[Dict]) -> tuple[List[Dict], str]: """ Extract prefill content if the last message is from assistant. Returns (messages_without_prefill, prefill_text) Prefill allows controlling output by providing initial assistant response. See: https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/prefill-claudes-response """ if not messages: return messages, "" last_msg = messages[-1] if last_msg.get("role") == "assistant": prefill = last_msg.get("content", "") # Prefill cannot end with trailing whitespace if isinstance(prefill, str): prefill = prefill.rstrip() return messages[:-1], prefill return messages, "" # ============================================================================ # Message Formatting with Prefill Support # ============================================================================ def format_messages_for_model( messages: List[Dict], system_prompt: Optional[str] = None, prefill: str = "" ) -> str: """ Format messages for the model using chat template. Supports prefill for controlling output format. """ formatted_messages = [] if system_prompt: formatted_messages.append({"role": "system", "content": system_prompt}) for msg in messages: role = msg.get("role", "user") content = msg.get("content", "") if role == "tool": role = "user" formatted_messages.append({"role": role, "content": content}) # Use tokenizer's chat template if available if hasattr(tokenizer, 'apply_chat_template') and tokenizer.chat_template: try: prompt = tokenizer.apply_chat_template( formatted_messages, tokenize=False, add_generation_prompt=True ) # Append prefill if provided if prefill: prompt = prompt + prefill return prompt except Exception: pass # Fallback format prompt = "" for msg in formatted_messages: role = msg["role"] content = msg["content"] if role == "system": prompt += f"<|system|>\n{content}\n" elif role == "user": prompt += f"<|user|>\n{content}\n" elif role == "assistant": prompt += f"<|assistant|>\n{content}\n" prompt += "<|assistant|>\n" # Append prefill if prefill: prompt = prompt + prefill return prompt # ============================================================================ # Generation Logic with Thinking Support # ============================================================================ def generate_response( prompt: str, max_tokens: int = MAX_TOKENS_DEFAULT, temperature: float = TEMPERATURE_DEFAULT, top_p: float = 0.95, top_k: Optional[int] = None, stop: Optional[List[str]] = None, enable_thinking: bool = False, thinking_budget: int = 512, ) -> tuple[str, str, int, int, str]: """ Generate response from the model. Returns: (response_text, thinking_text, input_tokens, output_tokens, stop_reason) """ inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096) input_length = inputs.input_ids.shape[1] gen_kwargs = { "max_new_tokens": max_tokens, "temperature": max(temperature, 0.01), "top_p": top_p, "do_sample": temperature > 0, "pad_token_id": tokenizer.pad_token_id, "eos_token_id": tokenizer.eos_token_id, } if top_k is not None and top_k > 0: gen_kwargs["top_k"] = top_k with torch.no_grad(): outputs = model.generate(inputs.input_ids, **gen_kwargs) generated_tokens = outputs[0][input_length:] response_text = tokenizer.decode(generated_tokens, skip_special_tokens=True) output_length = len(generated_tokens) stop_reason = "stop" thinking_text = "" # Simulate thinking by extracting ... blocks if present if enable_thinking and "" in response_text: import re think_match = re.search(r"(.*?)", response_text, re.DOTALL) if think_match: thinking_text = think_match.group(1).strip() response_text = re.sub(r".*?", "", response_text, flags=re.DOTALL).strip() # Handle stop sequences if stop: for stop_seq in stop: if stop_seq in response_text: response_text = response_text.split(stop_seq)[0] stop_reason = "stop" break if output_length >= max_tokens: stop_reason = "length" return response_text.strip(), thinking_text, input_length, output_length, stop_reason async def generate_stream( prompt: str, max_tokens: int = MAX_TOKENS_DEFAULT, temperature: float = TEMPERATURE_DEFAULT, top_p: float = 0.95, top_k: Optional[int] = None, ) -> AsyncGenerator[str, None]: """Stream generation for real-time responses""" inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096) streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True) gen_kwargs = { "max_new_tokens": max_tokens, "temperature": max(temperature, 0.01), "top_p": top_p, "do_sample": temperature > 0, "pad_token_id": tokenizer.pad_token_id, "eos_token_id": tokenizer.eos_token_id, "streamer": streamer, } if top_k is not None and top_k > 0: gen_kwargs["top_k"] = top_k thread = Thread(target=lambda: model.generate(inputs.input_ids, **gen_kwargs)) thread.start() for text in streamer: yield text thread.join() # ============================================================================ # FastAPI Application # ============================================================================ @asynccontextmanager async def lifespan(app: FastAPI): load_model() yield app = FastAPI( title="Free Coding API", description="OpenAI & Anthropic compatible API with Prefill & Thinking support", version="1.1.0", lifespan=lifespan ) app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"], ) # ============================================================================ # Authentication # ============================================================================ def verify_api_key(authorization: Optional[str] = None) -> bool: if not API_KEY or API_KEY == "": return True if not authorization: return False if authorization.startswith("Bearer "): token = authorization[7:] else: token = authorization return token == API_KEY # ============================================================================ # OpenAI Compatible Endpoints # ============================================================================ @app.get("/v1/models") async def list_models(): models = [ OpenAIModelInfo(id=alias, created=int(time.time())) for alias in MODEL_ALIASES.keys() ] return OpenAIModelsResponse(data=models) @app.get("/v1/models/{model_id}") async def get_model(model_id: str): if model_id in MODEL_ALIASES or model_id == MODEL_ID: return OpenAIModelInfo(id=model_id, created=int(time.time())) raise HTTPException(status_code=404, detail="Model not found") @app.post("/v1/chat/completions") async def openai_chat_completions( request: OpenAIChatRequest, authorization: Optional[str] = Header(None), ): """OpenAI-compatible chat completions with prefill support""" if not verify_api_key(authorization): raise HTTPException(status_code=401, detail="Invalid API key") # Extract messages messages = [] for m in request.messages: content = extract_text_from_openai_content(m.content) messages.append({"role": m.role, "content": content}) # Check for prefill (last assistant message) messages, prefill = extract_prefill_from_messages(messages) # Extract system message system_prompt = None filtered_messages = [] for msg in messages: if msg["role"] == "system": system_prompt = msg["content"] else: filtered_messages.append(msg) prompt = format_messages_for_model(filtered_messages, system_prompt=system_prompt, prefill=prefill) max_tokens = request.max_completion_tokens or request.max_tokens or MAX_TOKENS_DEFAULT stop_sequences = None if request.stop: stop_sequences = [request.stop] if isinstance(request.stop, str) else request.stop request_id = f"chatcmpl-{uuid.uuid4().hex[:29]}" system_fingerprint = f"fp_{uuid.uuid4().hex[:10]}" created_time = int(time.time()) if request.stream: async def stream_generator(): first_chunk = { "id": request_id, "object": "chat.completion.chunk", "created": created_time, "model": request.model, "system_fingerprint": system_fingerprint, "choices": [{ "index": 0, "delta": {"role": "assistant", "content": prefill}, # Include prefill in first chunk "logprobs": None, "finish_reason": None }] } yield f"data: {json.dumps(first_chunk)}\n\n" async for token in generate_stream( prompt, max_tokens=max_tokens, temperature=request.temperature or 1.0, top_p=request.top_p or 1.0, ): chunk = { "id": request_id, "object": "chat.completion.chunk", "created": created_time, "model": request.model, "system_fingerprint": system_fingerprint, "choices": [{ "index": 0, "delta": {"content": token}, "logprobs": None, "finish_reason": None }] } yield f"data: {json.dumps(chunk)}\n\n" final_chunk = { "id": request_id, "object": "chat.completion.chunk", "created": created_time, "model": request.model, "system_fingerprint": system_fingerprint, "choices": [{ "index": 0, "delta": {}, "logprobs": None, "finish_reason": "stop" }] } yield f"data: {json.dumps(final_chunk)}\n\n" if request.stream_options and request.stream_options.get("include_usage"): usage_chunk = { "id": request_id, "object": "chat.completion.chunk", "created": created_time, "model": request.model, "choices": [], "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0} } yield f"data: {json.dumps(usage_chunk)}\n\n" yield "data: [DONE]\n\n" return StreamingResponse( stream_generator(), media_type="text/event-stream", headers={"Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no"} ) # Non-streaming response_text, thinking_text, input_tokens, output_tokens, stop_reason = generate_response( prompt, max_tokens=max_tokens, temperature=request.temperature or 1.0, top_p=request.top_p or 1.0, stop=stop_sequences, ) # Prepend prefill to response full_response = prefill + response_text if prefill else response_text openai_finish_reason = "stop" if stop_reason == "stop" else "length" return OpenAIChatResponse( id=request_id, created=created_time, model=request.model, system_fingerprint=system_fingerprint, choices=[ OpenAIChoice( index=0, message=OpenAIChoiceMessage(role="assistant", content=full_response), finish_reason=openai_finish_reason, logprobs=None ) ], usage=OpenAIUsage( prompt_tokens=input_tokens, completion_tokens=output_tokens, total_tokens=input_tokens + output_tokens ) ) # ============================================================================ # Anthropic Compatible Endpoints with Prefill & Thinking # ============================================================================ @app.post("/v1/messages") async def anthropic_messages( request: AnthropicRequest, authorization: Optional[str] = Header(None), x_api_key: Optional[str] = Header(None, alias="x-api-key"), anthropic_version: Optional[str] = Header(None, alias="anthropic-version"), ): """Anthropic-compatible messages endpoint with prefill & thinking support""" auth_key = x_api_key or authorization if not verify_api_key(auth_key): raise HTTPException(status_code=401, detail="Invalid API key") # Extract messages messages = [] for m in request.messages: content = extract_text_from_anthropic_content(m.content) messages.append({"role": m.role, "content": content}) # Check for prefill (last assistant message) messages, prefill = extract_prefill_from_messages(messages) # Extract system prompt system_prompt = extract_system_prompt_anthropic(request.system) prompt = format_messages_for_model(messages, system_prompt=system_prompt, prefill=prefill) # Check thinking configuration enable_thinking = False thinking_budget = 512 if request.thinking: if request.thinking.type == "enabled": enable_thinking = True if request.thinking.budget_tokens: thinking_budget = request.thinking.budget_tokens request_id = f"msg_{uuid.uuid4().hex[:24]}" if request.stream: async def stream_generator(): input_tokens = 0 # message_start message_start = { "type": "message_start", "message": { "id": request_id, "type": "message", "role": "assistant", "model": request.model, "content": [], "stop_reason": None, "stop_sequence": None, "usage": {"input_tokens": input_tokens, "output_tokens": 0} } } yield f"event: message_start\ndata: {json.dumps(message_start)}\n\n" content_index = 0 # If thinking is enabled, add thinking block first (simulated) if enable_thinking: # thinking block start thinking_block_start = { "type": "content_block_start", "index": content_index, "content_block": {"type": "thinking", "thinking": ""} } yield f"event: content_block_start\ndata: {json.dumps(thinking_block_start)}\n\n" # Simulate thinking content thinking_text = "Analyzing the request and formulating a response..." thinking_delta = { "type": "content_block_delta", "index": content_index, "delta": {"type": "thinking_delta", "thinking": thinking_text} } yield f"event: content_block_delta\ndata: {json.dumps(thinking_delta)}\n\n" thinking_block_stop = {"type": "content_block_stop", "index": content_index} yield f"event: content_block_stop\ndata: {json.dumps(thinking_block_stop)}\n\n" content_index += 1 # text content block start content_block_start = { "type": "content_block_start", "index": content_index, "content_block": {"type": "text", "text": ""} } yield f"event: content_block_start\ndata: {json.dumps(content_block_start)}\n\n" # Include prefill in first delta if present if prefill: prefill_delta = { "type": "content_block_delta", "index": content_index, "delta": {"type": "text_delta", "text": prefill} } yield f"event: content_block_delta\ndata: {json.dumps(prefill_delta)}\n\n" # Stream content output_tokens = 0 async for token in generate_stream( prompt, max_tokens=request.max_tokens, temperature=request.temperature or 1.0, top_p=request.top_p or 0.999, top_k=request.top_k, ): output_tokens += 1 delta = { "type": "content_block_delta", "index": content_index, "delta": {"type": "text_delta", "text": token} } yield f"event: content_block_delta\ndata: {json.dumps(delta)}\n\n" # content_block_stop content_block_stop = {"type": "content_block_stop", "index": content_index} yield f"event: content_block_stop\ndata: {json.dumps(content_block_stop)}\n\n" # message_delta message_delta = { "type": "message_delta", "delta": {"stop_reason": "end_turn", "stop_sequence": None}, "usage": {"output_tokens": output_tokens} } yield f"event: message_delta\ndata: {json.dumps(message_delta)}\n\n" # message_stop message_stop = {"type": "message_stop"} yield f"event: message_stop\ndata: {json.dumps(message_stop)}\n\n" return StreamingResponse( stream_generator(), media_type="text/event-stream", headers={"Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no"} ) # Non-streaming response response_text, thinking_text, input_tokens, output_tokens, stop_reason = generate_response( prompt, max_tokens=request.max_tokens, temperature=request.temperature or 1.0, top_p=request.top_p or 0.999, top_k=request.top_k, stop=request.stop_sequences, enable_thinking=enable_thinking, thinking_budget=thinking_budget, ) # Prepend prefill to response full_response = prefill + response_text if prefill else response_text # Build content blocks content_blocks = [] # Add thinking block if enabled and we have thinking content if enable_thinking: if not thinking_text: thinking_text = "Analyzing the request and formulating a response." content_blocks.append(AnthropicResponseContent(type="thinking", thinking=thinking_text)) # Add text block content_blocks.append(AnthropicResponseContent(type="text", text=full_response)) # Determine stop reason anthropic_stop_reason = "end_turn" stop_sequence_used = None if stop_reason == "length": anthropic_stop_reason = "max_tokens" elif stop_reason == "stop" and request.stop_sequences: for seq in request.stop_sequences: if seq in response_text: anthropic_stop_reason = "stop_sequence" stop_sequence_used = seq break return AnthropicResponse( id=request_id, model=request.model, content=content_blocks, stop_reason=anthropic_stop_reason, stop_sequence=stop_sequence_used, usage=AnthropicUsage( input_tokens=input_tokens, output_tokens=output_tokens ) ) # ============================================================================ # Health & Info Endpoints # ============================================================================ @app.get("/") async def root(): return { "name": "Free Coding API", "version": "1.1.0", "model": MODEL_ID, "features": { "prefill_response": "Supported - Include assistant message at end for output control", "thinking": "Supported - Enable with thinking: {type: 'enabled'}", "streaming": "Supported - Both OpenAI and Anthropic formats" }, "compatibility": { "openai": "v1 Chat Completions API", "anthropic": "Messages API (2023-06-01)" }, "endpoints": { "openai_chat": "/v1/chat/completions", "anthropic_messages": "/v1/messages", "models": "/v1/models" }, "docs": "/docs" } @app.get("/health") async def health(): return { "status": "healthy", "model_loaded": model is not None, "model_id": MODEL_ID } # ============================================================================ # Main Entry Point # ============================================================================ if __name__ == "__main__": import uvicorn uvicorn.run(app, host="0.0.0.0", port=7860)