"""
HuggingFace Spaces - OpenAI & Anthropic Compatible Coding API
A free, skills-only API endpoint for coding tasks (like Codex/Claude Code)
Author: Matrix Agent
Features:
- Full OpenAI API compatibility (/v1/chat/completions)
- Full Anthropic API compatibility (/v1/messages)
- Computer Use Agent (CUA) endpoint (/v1/cua)
- Prefill Response Support (assistant message prefix for output control)
- Thinking/Reasoning Content Block Support
- Optimized for coding tasks
- Runs on free HF Spaces (2 vCPU, 16GB RAM)
API Specifications verified against:
- OpenAI: https://platform.openai.com/docs/api-reference/chat/create
- Anthropic: https://docs.anthropic.com/en/api/messages
- Anthropic Computer Use: https://docs.anthropic.com/en/docs/agents-and-tools/computer-use
- Prefill: https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/prefill-claudes-response
- MiniMax Anthropic: https://platform.minimax.io/docs/api-reference/text-anthropic-api
"""
import os
import time
import uuid
import json
import asyncio
from typing import List, Optional, Union, Dict, Any, AsyncGenerator
from contextlib import asynccontextmanager
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread
from fastapi import FastAPI, HTTPException, Header, Request, Response
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse, JSONResponse
from pydantic import BaseModel, Field
# ============================================================================
# Configuration
# ============================================================================
# Hugging Face model repository to load; override via the MODEL_ID environment variable.
MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen2.5-Coder-1.5B-Instruct")
# Anthropic API version string.
# NOTE(review): not referenced in the visible portion of this file — confirm it is used.
ANTHROPIC_VERSION = "2023-06-01"
# Model names advertised by /v1/models and accepted by /v1/models/{model_id}.
# Every alias maps to the single locally loaded MODEL_ID.
MODEL_ALIASES = {
    # OpenAI-style model names
    "gpt-4": MODEL_ID,
    "gpt-4-turbo": MODEL_ID,
    "gpt-4o": MODEL_ID,
    "gpt-4o-mini": MODEL_ID,
    "gpt-3.5-turbo": MODEL_ID,
    "codex": MODEL_ID,
    "code-davinci-002": MODEL_ID,
    "o1": MODEL_ID,
    "o1-mini": MODEL_ID,
    # Anthropic-style model names
    "claude-3-opus-20240229": MODEL_ID,
    "claude-3-sonnet-20240229": MODEL_ID,
    "claude-3-haiku-20240307": MODEL_ID,
    "claude-3-5-sonnet-20241022": MODEL_ID,
    "claude-3-5-haiku-20241022": MODEL_ID,
    "claude-3-opus": MODEL_ID,
    "claude-3-sonnet": MODEL_ID,
    "claude-3-haiku": MODEL_ID,
    "claude-3-5-sonnet": MODEL_ID,
    "claude-code": MODEL_ID,
    # Computer Use Agent (CUA) model
    "sheikh-computer-use-preview": MODEL_ID,
    "computer-use-preview": MODEL_ID,
}
# Key that clients must present; override via the API_KEY environment variable.
# An empty value disables the check entirely (see verify_api_key).
API_KEY = os.getenv("API_KEY", "sk-free-coding-api")
# Generation budget used when a request does not specify one.
MAX_TOKENS_DEFAULT = 2048
# Default sampling temperature of generate_response / generate_stream.
TEMPERATURE_DEFAULT = 0.7
# ============================================================================
# Global Model Instance
# ============================================================================
# Process-wide model and tokenizer; both stay None until load_model() assigns them.
model = None
tokenizer = None
def load_model():
    """Load the tokenizer and model named by MODEL_ID onto the CPU.

    Stores both objects in the module-level ``tokenizer`` and ``model``
    globals and also returns them as ``(model, tokenizer)``.
    """
    global model, tokenizer

    print(f"🚀 Loading model: {MODEL_ID}")
    print(f"📊 Device: CPU (Free HF Spaces)")

    tok = AutoTokenizer.from_pretrained(
        MODEL_ID, trust_remote_code=True, padding_side="left"
    )
    # Generation needs a pad token; reuse EOS when the tokenizer defines none.
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    tokenizer = tok

    load_options = dict(
        torch_dtype=torch.float32,
        device_map="cpu",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )
    net = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_options)
    net.eval()  # inference mode
    model = net

    print("✅ Model loaded successfully!")
    return model, tokenizer
# ============================================================================
# Pydantic Models - OpenAI Compatible
# ============================================================================
class OpenAIContentPart(BaseModel):
    """One element of a list-form OpenAI message ``content``."""
    type: str  # part kind; only "text" parts are read by extract_text_from_openai_content
    text: Optional[str] = None
    image_url: Optional[Dict[str, str]] = None
class OpenAIMessage(BaseModel):
    """A single chat message in an OpenAI-style request."""
    role: str
    # Either a plain string or a list of content parts.
    content: Optional[Union[str, List[OpenAIContentPart]]] = None
    name: Optional[str] = None
    tool_calls: Optional[List[Dict]] = None
    tool_call_id: Optional[str] = None
class OpenAIResponseFormat(BaseModel):
    """The ``response_format`` object of a chat request (defaults to plain text)."""
    type: str = "text"
    json_schema: Optional[Dict] = None
class OpenAIChatRequest(BaseModel):
    """Request body for POST /v1/chat/completions.

    The visible handler reads ``model``, ``messages``, ``temperature``,
    ``top_p``, ``stream``, ``stop``, ``max_tokens``, ``max_completion_tokens``
    and ``stream_options``. The remaining fields are accepted and validated
    but not read by that handler.
    """
    model: str
    messages: List[OpenAIMessage]
    temperature: Optional[float] = Field(default=1.0, ge=0, le=2)
    top_p: Optional[float] = Field(default=1.0, ge=0, le=1)
    n: Optional[int] = Field(default=1, ge=1, le=10)
    stream: Optional[bool] = False
    stop: Optional[Union[str, List[str]]] = None
    max_tokens: Optional[int] = None
    # Takes precedence over max_tokens when both are set.
    max_completion_tokens: Optional[int] = None
    presence_penalty: Optional[float] = Field(default=0, ge=-2, le=2)
    frequency_penalty: Optional[float] = Field(default=0, ge=-2, le=2)
    logit_bias: Optional[Dict[str, float]] = None
    logprobs: Optional[bool] = False
    top_logprobs: Optional[int] = None
    user: Optional[str] = None
    seed: Optional[int] = None
    tools: Optional[List[Dict]] = None
    tool_choice: Optional[Union[str, Dict]] = None
    response_format: Optional[OpenAIResponseFormat] = None
    # The handler looks for the "include_usage" key here when streaming.
    stream_options: Optional[Dict] = None
class OpenAIChoiceMessage(BaseModel):
    """The assistant message carried by a non-streaming completion choice."""
    role: str = "assistant"
    content: Optional[str] = None
    tool_calls: Optional[List[Dict]] = None
class OpenAIChoice(BaseModel):
    """One entry of ``choices`` in a non-streaming chat completion response."""
    index: int
    message: OpenAIChoiceMessage
    finish_reason: Optional[str] = None  # the handler emits "stop" or "length"
    logprobs: Optional[Dict] = None
class OpenAIStreamChoice(BaseModel):
    """Shape of one ``choices`` entry in a streaming chunk.

    NOTE(review): the visible streaming handler builds its chunks as plain
    dicts and does not instantiate this model — confirm whether it is used
    elsewhere.
    """
    index: int
    delta: Dict
    finish_reason: Optional[str] = None
    logprobs: Optional[Dict] = None
class OpenAIUsage(BaseModel):
    """Token accounting attached to a chat completion response."""
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int  # the handler sets this to prompt_tokens + completion_tokens
    prompt_tokens_details: Optional[Dict] = None
    completion_tokens_details: Optional[Dict] = None
class OpenAIChatResponse(BaseModel):
    """Response body of a non-streaming POST /v1/chat/completions call."""
    id: str
    object: str = "chat.completion"
    created: int  # Unix timestamp in seconds
    model: str  # echoes the model name sent by the client
    choices: List[OpenAIChoice]
    usage: Optional[OpenAIUsage] = None
    system_fingerprint: Optional[str] = None
    service_tier: Optional[str] = None
class OpenAIModelInfo(BaseModel):
    """Description of one model as returned by the /v1/models endpoints."""
    id: str
    object: str = "model"
    created: int  # Unix timestamp in seconds
    owned_by: str = "hf-spaces"
class OpenAIModelsResponse(BaseModel):
    """List wrapper returned by GET /v1/models."""
    object: str = "list"
    data: List[OpenAIModelInfo]
# ============================================================================
# Pydantic Models - Anthropic Compatible (with Thinking & Prefill support)
# ============================================================================
class AnthropicTextBlock(BaseModel):
    """A text content block in an Anthropic-style message."""
    type: str = "text"
    text: str
class AnthropicImageSource(BaseModel):
    """The ``source`` object of an image block: encoding kind, media type and payload."""
    type: str = "base64"
    media_type: str
    data: str
class AnthropicImageBlock(BaseModel):
    """An image content block; the text extraction helper skips blocks of this type."""
    type: str = "image"
    source: AnthropicImageSource
class AnthropicThinkingBlock(BaseModel):
    """Thinking/reasoning content block; skipped when message text is extracted."""
    type: str = "thinking"
    thinking: str
# Any content block an incoming Anthropic message may carry. The trailing Dict
# member presumably lets block kinds without a typed model (e.g. tool blocks)
# pass validation as raw dicts — TODO confirm.
AnthropicContentBlock = Union[AnthropicTextBlock, AnthropicImageBlock, AnthropicThinkingBlock, Dict]
class AnthropicMessage(BaseModel):
    """A single conversation turn in an Anthropic-style request."""
    role: str # "user", "assistant"
    # Either a plain string or a list of content blocks.
    content: Union[str, List[AnthropicContentBlock]]
class AnthropicTool(BaseModel):
    """A tool definition supplied with an Anthropic-style request."""
    name: str
    description: Optional[str] = None
    input_schema: Dict
class AnthropicToolChoice(BaseModel):
    """The ``tool_choice`` object of an Anthropic-style request."""
    type: str
    name: Optional[str] = None
class AnthropicThinkingConfig(BaseModel):
    """Configuration for thinking/reasoning mode.

    The /v1/messages handler turns thinking on only when ``type`` equals
    "enabled".
    """
    type: str = "enabled" # "enabled" or "disabled"
    budget_tokens: Optional[int] = None # Token budget for thinking
class AnthropicRequest(BaseModel):
    """Full Anthropic Messages API request with thinking & prefill support.

    ``tools``, ``tool_choice`` and ``metadata`` are accepted and validated
    but are not read by the visible /v1/messages handler.
    """
    model: str
    messages: List[AnthropicMessage]
    max_tokens: int  # required; passed to generation as the new-token budget
    # Optional parameters
    # Either a plain string or a list of {"type": "text", ...} blocks.
    system: Optional[Union[str, List[Dict]]] = None
    temperature: Optional[float] = Field(default=1.0, ge=0, le=1)
    top_p: Optional[float] = Field(default=0.999, ge=0, le=1)
    top_k: Optional[int] = None
    stream: Optional[bool] = False
    stop_sequences: Optional[List[str]] = None
    # Tool use
    tools: Optional[List[AnthropicTool]] = None
    tool_choice: Optional[AnthropicToolChoice] = None
    # Thinking/reasoning support
    thinking: Optional[AnthropicThinkingConfig] = None
    # Metadata
    metadata: Optional[Dict] = None
class AnthropicResponseContent(BaseModel):
    """One content block of an Anthropic-style response.

    A single model covers every block kind; ``type`` indicates which of the
    optional fields is meaningful.
    """
    type: str = "text"
    text: Optional[str] = None
    # For thinking blocks
    thinking: Optional[str] = None
    # For tool_use
    id: Optional[str] = None
    name: Optional[str] = None
    input: Optional[Dict] = None
class AnthropicUsage(BaseModel):
    """Token counts reported with an Anthropic-style response."""
    input_tokens: int
    output_tokens: int
class AnthropicResponse(BaseModel):
    """Response body of a non-streaming POST /v1/messages call."""
    id: str
    type: str = "message"
    role: str = "assistant"
    model: str  # echoes the model name sent by the client
    content: List[AnthropicResponseContent]
    # The handler emits "end_turn", "max_tokens" or "stop_sequence".
    stop_reason: Optional[str] = None
    stop_sequence: Optional[str] = None
    usage: AnthropicUsage
# ============================================================================
# Content Parsing Utilities
# ============================================================================
def extract_text_from_openai_content(content: Union[str, List, None]) -> str:
    """Flatten OpenAI message content into a single string.

    Args:
        content: ``None``, a plain string, or a list of content parts given
            either as dicts or as objects with ``type``/``text`` attributes.

    Returns:
        ``""`` for ``None``; the string itself for a string; for a list, the
        text of every ``"text"`` part joined with newlines (other part kinds
        are ignored). Anything else is converted with ``str()``.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return str(content)

    text_parts = []
    for part in content:
        if isinstance(part, dict):
            part_type, part_text = part.get("type"), part.get("text")
        else:
            part_type = getattr(part, "type", None)
            part_text = getattr(part, "text", None)
        if part_type == "text":
            # A present-but-null text must not reach str.join as None.
            text_parts.append(part_text or "")
    return "\n".join(text_parts)
def extract_text_from_anthropic_content(content: Union[str, List]) -> str:
    """Flatten Anthropic message content into a single string.

    Args:
        content: A plain string, or a list of content blocks given either as
            dicts or as objects with ``type``/``text`` attributes.

    Returns:
        The string itself for a string; for a list, the text of every
        ``"text"`` block joined with newlines. Thinking, image and any other
        block kinds are skipped. Anything else is converted with ``str()``.
    """
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return str(content)

    text_parts = []
    for block in content:
        if isinstance(block, dict):
            block_type, block_text = block.get("type"), block.get("text")
        else:
            block_type = getattr(block, "type", None)
            block_text = getattr(block, "text", None)
        if block_type == "text":
            # A present-but-null text must not reach str.join as None.
            text_parts.append(block_text or "")
    return "\n".join(text_parts)
def extract_system_prompt_anthropic(system: Union[str, List[Dict], None]) -> str:
    """Flatten an Anthropic ``system`` parameter into a single string.

    Args:
        system: ``None``, a plain string, or a list of block dicts.

    Returns:
        The string itself for a string; for a list, the text of every dict
        block whose ``type`` is ``"text"`` joined with newlines; ``""`` for
        ``None`` or any other input.
    """
    if isinstance(system, str):
        return system
    if not isinstance(system, list):
        return ""
    # `or ""` guards against a block whose "text" key is present but null.
    return "\n".join(
        block.get("text") or ""
        for block in system
        if isinstance(block, dict) and block.get("type") == "text"
    )
def extract_prefill_from_messages(messages: List[Dict]) -> tuple[List[Dict], str]:
    """
    Split a trailing assistant message off the conversation as a prefill.

    When the final message has role "assistant", its content becomes the
    prefill (the start of the reply the model should continue) and is removed
    from the returned message list. String prefills have trailing whitespace
    stripped. Otherwise the messages are returned untouched with an empty
    prefill.

    Returns (messages_without_prefill, prefill_text)
    See: https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/prefill-claudes-response
    """
    has_prefill = bool(messages) and messages[-1].get("role") == "assistant"
    if not has_prefill:
        return messages, ""

    seed = messages[-1].get("content", "")
    # Prefill cannot end with trailing whitespace
    seed = seed.rstrip() if isinstance(seed, str) else seed
    return messages[:-1], seed
# ============================================================================
# Message Formatting with Prefill Support
# ============================================================================
def format_messages_for_model(
    messages: List[Dict],
    system_prompt: Optional[str] = None,
    prefill: str = ""
) -> str:
    """
    Render a conversation into the prompt string fed to the model.

    A non-empty system prompt is placed first, "tool" messages are relabelled
    as "user", and the tokenizer's chat template is used when one is
    available. If templating is unavailable or raises, a simple
    <|role|>-tagged layout is used instead. In both cases the prompt ends
    with an open assistant turn followed by the optional prefill text.
    """
    chat = [{"role": "system", "content": system_prompt}] if system_prompt else []
    for entry in messages:
        speaker = entry.get("role", "user")
        chat.append({
            "role": "user" if speaker == "tool" else speaker,
            "content": entry.get("content", ""),
        })

    # Preferred path: the tokenizer's own chat template.
    if hasattr(tokenizer, 'apply_chat_template') and tokenizer.chat_template:
        try:
            rendered = tokenizer.apply_chat_template(
                chat,
                tokenize=False,
                add_generation_prompt=True
            )
            return rendered + prefill if prefill else rendered
        except Exception:
            # Best effort: any templating failure falls through to the manual layout.
            pass

    # Fallback layout; roles other than these three are omitted.
    tags = {"system": "<|system|>", "user": "<|user|>", "assistant": "<|assistant|>"}
    pieces = [
        f"{tags[turn['role']]}\n{turn['content']}\n"
        for turn in chat
        if turn["role"] in tags
    ]
    pieces.append("<|assistant|>\n")
    if prefill:
        pieces.append(prefill)
    return "".join(pieces)
# ============================================================================
# Generation Logic with Thinking Support
# ============================================================================
def generate_response(
    prompt: str,
    max_tokens: int = MAX_TOKENS_DEFAULT,
    temperature: float = TEMPERATURE_DEFAULT,
    top_p: float = 0.95,
    top_k: Optional[int] = None,
    stop: Optional[List[str]] = None,
    enable_thinking: bool = False,
    thinking_budget: int = 512,
) -> tuple[str, str, int, int, str]:
    """
    Generate a complete (non-streaming) response from the model.

    Args:
        prompt: Fully formatted prompt text; tokenization truncates it to
            4096 tokens.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature. Values <= 0 select greedy decoding.
        top_p: Nucleus sampling threshold.
        top_k: Optional top-k cutoff; ignored when None or <= 0.
        stop: Optional stop sequences; the text is cut at the first listed
            sequence that occurs in it. Empty strings are ignored.
        enable_thinking: When True, a <think>...</think> span in the output is
            split off and returned separately.
        thinking_budget: Accepted for interface compatibility; not used by
            this function.

    Returns: (response_text, thinking_text, input_tokens, output_tokens, stop_reason)
        where stop_reason is "length" if the number of generated tokens
        reached max_tokens and "stop" otherwise.
    """
    import re  # local import: `re` is not among the file-level imports

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096)
    input_length = inputs.input_ids.shape[1]
    gen_kwargs = {
        "max_new_tokens": max_tokens,
        # Clamp to a small positive value so a temperature of 0 is never passed on.
        "temperature": max(temperature, 0.01),
        "top_p": top_p,
        "do_sample": temperature > 0,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if top_k is not None and top_k > 0:
        gen_kwargs["top_k"] = top_k
    with torch.no_grad():
        outputs = model.generate(inputs.input_ids, **gen_kwargs)
    # Drop the echoed prompt tokens; keep only what was newly generated.
    generated_tokens = outputs[0][input_length:]
    response_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    output_length = len(generated_tokens)
    stop_reason = "stop"
    thinking_text = ""

    # Split a <think>...</think> span off the visible answer.
    # NOTE(review): the previous patterns contained no tag text at all (they
    # matched the empty string, so this step never extracted anything).
    # <think> is assumed to be the intended tag — confirm against the model.
    if enable_thinking and "<think>" in response_text:
        think_match = re.search(r"<think>(.*?)</think>", response_text, re.DOTALL)
        if think_match:
            thinking_text = think_match.group(1).strip()
            response_text = re.sub(
                r"<think>.*?</think>", "", response_text, flags=re.DOTALL
            ).strip()

    # Handle stop sequences
    if stop:
        for stop_seq in stop:
            # Skip "": it is "in" every string and str.split("") raises ValueError.
            if stop_seq and stop_seq in response_text:
                response_text = response_text.split(stop_seq)[0]
                stop_reason = "stop"
                break
    if output_length >= max_tokens:
        stop_reason = "length"
    return response_text.strip(), thinking_text, input_length, output_length, stop_reason
async def generate_stream(
    prompt: str,
    max_tokens: int = MAX_TOKENS_DEFAULT,
    temperature: float = TEMPERATURE_DEFAULT,
    top_p: float = 0.95,
    top_k: Optional[int] = None,
) -> AsyncGenerator[str, None]:
    """Stream generated text pieces as the model produces them.

    Generation runs in a background thread that feeds a
    ``TextIteratorStreamer``. Each blocking read from the streamer is handed
    to a worker thread so the event loop stays free while waiting for the
    next piece of text.

    Args:
        prompt: Fully formatted prompt text; tokenization truncates it to
            4096 tokens.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature. Values <= 0 select greedy decoding.
        top_p: Nucleus sampling threshold.
        top_k: Optional top-k cutoff; ignored when None or <= 0.

    Yields:
        Decoded text pieces in generation order (prompt and special tokens
        are skipped).
    """
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096)
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)
    gen_kwargs = {
        "max_new_tokens": max_tokens,
        # Clamp to a small positive value so a temperature of 0 is never passed on.
        "temperature": max(temperature, 0.01),
        "top_p": top_p,
        "do_sample": temperature > 0,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
        "streamer": streamer,
    }
    if top_k is not None and top_k > 0:
        gen_kwargs["top_k"] = top_k

    thread = Thread(target=model.generate, args=(inputs.input_ids,), kwargs=gen_kwargs)
    thread.start()

    # next() on the streamer blocks until text is available, so run it in a
    # worker thread. The sentinel default keeps StopIteration from being
    # raised across the thread boundary.
    exhausted = object()
    pieces = iter(streamer)
    while True:
        text = await asyncio.to_thread(next, pieces, exhausted)
        if text is exhausted:
            break
        yield text
    thread.join()
# ============================================================================
# FastAPI Application
# ============================================================================
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: load the model before the app starts serving.

    Nothing runs after the ``yield``, so there is no shutdown cleanup.
    """
    load_model()
    yield
# The ASGI application; `lifespan` loads the model at startup.
app = FastAPI(
    title="Free Coding API",
    description="OpenAI & Anthropic compatible API with Files, Skills, Batches, CUA, Prefill & Thinking support",
    version="1.3.0",
    lifespan=lifespan
)
# Fully permissive CORS: any origin, method and header.
# NOTE(review): a wildcard origin combined with allow_credentials=True is a
# very open policy — confirm this is intended for the deployment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# ============================================================================
# Authentication
# ============================================================================
def verify_api_key(authorization: Optional[str] = None) -> bool:
    """Check a client-supplied credential against the configured API key.

    Args:
        authorization: Raw header value, either ``"Bearer <token>"`` or the
            bare token (as sent in ``x-api-key``).

    Returns:
        True when authentication is disabled (API_KEY is empty) or the token
        matches API_KEY; False when no credential was supplied or it does not
        match.
    """
    import hmac  # local import: `hmac` is not among the file-level imports

    if not API_KEY:
        return True
    if not authorization:
        return False
    if authorization.startswith("Bearer "):
        token = authorization[7:]
    else:
        token = authorization
    # Constant-time comparison so timing does not leak how much of the key
    # matched. Compare bytes: compare_digest rejects non-ASCII str arguments.
    return hmac.compare_digest(token.encode("utf-8"), API_KEY.encode("utf-8"))
# ============================================================================
# OpenAI Compatible Endpoints
# ============================================================================
@app.get("/v1/models")
async def list_models():
    """Return every configured model alias in OpenAI list format."""
    entries = []
    for alias in MODEL_ALIASES:
        entries.append(OpenAIModelInfo(id=alias, created=int(time.time())))
    return OpenAIModelsResponse(data=entries)
@app.get("/v1/models/{model_id}")
async def get_model(model_id: str):
    """Describe one model; responds 404 unless it is an alias or MODEL_ID itself."""
    is_known = model_id in MODEL_ALIASES or model_id == MODEL_ID
    if not is_known:
        raise HTTPException(status_code=404, detail="Model not found")
    return OpenAIModelInfo(id=model_id, created=int(time.time()))
@app.post("/v1/chat/completions")
async def openai_chat_completions(
    request: OpenAIChatRequest,
    authorization: Optional[str] = Header(None),
):
    """OpenAI-compatible chat completions with prefill support.

    A trailing assistant message is treated as a prefill: the model continues
    from it and the prefill text is included at the start of the returned
    content. With ``stream=true`` the reply is sent as server-sent
    ``chat.completion.chunk`` events terminated by ``data: [DONE]``.

    Streaming limitations: ``stop`` is not applied, ``finish_reason`` is
    always "stop", and the optional usage chunk reports zeros.

    Raises:
        HTTPException: 401 when the API key check fails.
    """
    if not verify_api_key(authorization):
        raise HTTPException(status_code=401, detail="Invalid API key")

    # Flatten every message to plain text.
    messages = [
        {"role": m.role, "content": extract_text_from_openai_content(m.content)}
        for m in request.messages
    ]
    # Check for prefill (last assistant message)
    messages, prefill = extract_prefill_from_messages(messages)

    # Split out the system message; if several are present the last one wins.
    system_prompt = None
    filtered_messages = []
    for msg in messages:
        if msg["role"] == "system":
            system_prompt = msg["content"]
        else:
            filtered_messages.append(msg)

    prompt = format_messages_for_model(filtered_messages, system_prompt=system_prompt, prefill=prefill)
    max_tokens = request.max_completion_tokens or request.max_tokens or MAX_TOKENS_DEFAULT
    stop_sequences = None
    if request.stop:
        stop_sequences = [request.stop] if isinstance(request.stop, str) else request.stop

    # Test for None explicitly: `temperature or 1.0` turned an explicit 0
    # (a request for greedy decoding) into 1.0.
    temperature = 1.0 if request.temperature is None else request.temperature
    # Unchanged: a top_p of None or 0 falls back to 1.0.
    top_p = request.top_p or 1.0

    request_id = f"chatcmpl-{uuid.uuid4().hex[:29]}"
    system_fingerprint = f"fp_{uuid.uuid4().hex[:10]}"
    created_time = int(time.time())

    if request.stream:
        def sse_chunk(delta: Dict, finish_reason: Optional[str] = None) -> str:
            """Serialize one chat.completion.chunk as a server-sent event."""
            chunk = {
                "id": request_id,
                "object": "chat.completion.chunk",
                "created": created_time,
                "model": request.model,
                "system_fingerprint": system_fingerprint,
                "choices": [{
                    "index": 0,
                    "delta": delta,
                    "logprobs": None,
                    "finish_reason": finish_reason
                }]
            }
            return f"data: {json.dumps(chunk)}\n\n"

        async def stream_generator():
            # The first chunk announces the role and carries the prefill text.
            yield sse_chunk({"role": "assistant", "content": prefill})
            async for token in generate_stream(
                prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
            ):
                yield sse_chunk({"content": token})
            yield sse_chunk({}, finish_reason="stop")
            if request.stream_options and request.stream_options.get("include_usage"):
                # Token counts are not tracked while streaming; zeros are reported.
                usage_chunk = {
                    "id": request_id,
                    "object": "chat.completion.chunk",
                    "created": created_time,
                    "model": request.model,
                    "choices": [],
                    "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
                }
                yield f"data: {json.dumps(usage_chunk)}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(
            stream_generator(),
            media_type="text/event-stream",
            headers={"Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no"}
        )

    # Non-streaming
    response_text, thinking_text, input_tokens, output_tokens, stop_reason = generate_response(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stop=stop_sequences,
    )
    # Prepend prefill to response
    full_response = prefill + response_text if prefill else response_text
    openai_finish_reason = "stop" if stop_reason == "stop" else "length"
    return OpenAIChatResponse(
        id=request_id,
        created=created_time,
        model=request.model,
        system_fingerprint=system_fingerprint,
        choices=[
            OpenAIChoice(
                index=0,
                message=OpenAIChoiceMessage(role="assistant", content=full_response),
                finish_reason=openai_finish_reason,
                logprobs=None
            )
        ],
        usage=OpenAIUsage(
            prompt_tokens=input_tokens,
            completion_tokens=output_tokens,
            total_tokens=input_tokens + output_tokens
        )
    )
# ============================================================================
# Anthropic Compatible Endpoints with Prefill & Thinking
# ============================================================================
@app.post("/v1/messages")
async def anthropic_messages(
    request: AnthropicRequest,
    authorization: Optional[str] = Header(None),
    x_api_key: Optional[str] = Header(None, alias="x-api-key"),
    anthropic_version: Optional[str] = Header(None, alias="anthropic-version"),
):
    """Anthropic-compatible messages endpoint with prefill & thinking support.

    The credential is taken from ``x-api-key``, falling back to
    ``Authorization``. A trailing assistant message is treated as a prefill
    that the model continues; the prefill text is included at the start of
    the returned text block. When thinking is enabled a thinking block
    precedes the text block; a placeholder sentence is used when the model
    produced no thinking text (always, in streaming mode).

    Streaming limitations: ``stop_sequences`` are not applied, the reported
    input token count is 0, the output count is the number of streamed text
    pieces, and ``stop_reason`` is always "end_turn".

    Raises:
        HTTPException: 401 when the API key check fails.
    """
    auth_key = x_api_key or authorization
    if not verify_api_key(auth_key):
        raise HTTPException(status_code=401, detail="Invalid API key")

    # Flatten every message to plain text.
    messages = [
        {"role": m.role, "content": extract_text_from_anthropic_content(m.content)}
        for m in request.messages
    ]
    # Check for prefill (last assistant message)
    messages, prefill = extract_prefill_from_messages(messages)
    system_prompt = extract_system_prompt_anthropic(request.system)
    prompt = format_messages_for_model(messages, system_prompt=system_prompt, prefill=prefill)

    # Check thinking configuration
    enable_thinking = False
    thinking_budget = 512
    if request.thinking:
        if request.thinking.type == "enabled":
            enable_thinking = True
        if request.thinking.budget_tokens:
            thinking_budget = request.thinking.budget_tokens

    # Test for None explicitly: `temperature or 1.0` turned an explicit 0
    # (a request for greedy decoding) into 1.0.
    temperature = 1.0 if request.temperature is None else request.temperature
    # Unchanged: a top_p of None or 0 falls back to 0.999.
    top_p = request.top_p or 0.999

    request_id = f"msg_{uuid.uuid4().hex[:24]}"

    if request.stream:
        def sse(payload: Dict) -> str:
            """Serialize one event; the SSE event name equals the payload's type."""
            return f"event: {payload['type']}\ndata: {json.dumps(payload)}\n\n"

        async def stream_generator():
            input_tokens = 0
            yield sse({
                "type": "message_start",
                "message": {
                    "id": request_id,
                    "type": "message",
                    "role": "assistant",
                    "model": request.model,
                    "content": [],
                    "stop_reason": None,
                    "stop_sequence": None,
                    "usage": {"input_tokens": input_tokens, "output_tokens": 0}
                }
            })
            content_index = 0

            # If thinking is enabled, emit a simulated thinking block first.
            if enable_thinking:
                yield sse({
                    "type": "content_block_start",
                    "index": content_index,
                    "content_block": {"type": "thinking", "thinking": ""}
                })
                thinking_text = "Analyzing the request and formulating a response..."
                yield sse({
                    "type": "content_block_delta",
                    "index": content_index,
                    "delta": {"type": "thinking_delta", "thinking": thinking_text}
                })
                yield sse({"type": "content_block_stop", "index": content_index})
                content_index += 1

            # Text content block
            yield sse({
                "type": "content_block_start",
                "index": content_index,
                "content_block": {"type": "text", "text": ""}
            })
            # Include prefill in first delta if present
            if prefill:
                yield sse({
                    "type": "content_block_delta",
                    "index": content_index,
                    "delta": {"type": "text_delta", "text": prefill}
                })

            output_tokens = 0
            async for token in generate_stream(
                prompt,
                max_tokens=request.max_tokens,
                temperature=temperature,
                top_p=top_p,
                top_k=request.top_k,
            ):
                output_tokens += 1
                yield sse({
                    "type": "content_block_delta",
                    "index": content_index,
                    "delta": {"type": "text_delta", "text": token}
                })

            yield sse({"type": "content_block_stop", "index": content_index})
            yield sse({
                "type": "message_delta",
                "delta": {"stop_reason": "end_turn", "stop_sequence": None},
                "usage": {"output_tokens": output_tokens}
            })
            yield sse({"type": "message_stop"})

        return StreamingResponse(
            stream_generator(),
            media_type="text/event-stream",
            headers={"Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no"}
        )

    # Non-streaming response. Stop sequences are applied here rather than in
    # generate_response: that function returns text already cut at the stop
    # sequence, so the sequence could never be found afterwards and
    # "stop_sequence" was never reported.
    response_text, thinking_text, input_tokens, output_tokens, stop_reason = generate_response(
        prompt,
        max_tokens=request.max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=request.top_k,
        stop=None,
        enable_thinking=enable_thinking,
        thinking_budget=thinking_budget,
    )

    # Cut at the earliest occurrence of any stop sequence (empty ones are ignored).
    stop_sequence_used = None
    cut_at = len(response_text)
    for seq in request.stop_sequences or []:
        pos = response_text.find(seq) if seq else -1
        if 0 <= pos < cut_at:
            cut_at, stop_sequence_used = pos, seq
    if stop_sequence_used is not None:
        response_text = response_text[:cut_at].strip()

    # Prepend prefill to response
    full_response = prefill + response_text if prefill else response_text

    # Build content blocks
    content_blocks = []
    if enable_thinking:
        if not thinking_text:
            thinking_text = "Analyzing the request and formulating a response."
        content_blocks.append(AnthropicResponseContent(type="thinking", thinking=thinking_text))
    content_blocks.append(AnthropicResponseContent(type="text", text=full_response))

    # Determine stop reason: a matched stop sequence wins over the token limit.
    if stop_sequence_used is not None:
        anthropic_stop_reason = "stop_sequence"
    elif stop_reason == "length":
        anthropic_stop_reason = "max_tokens"
    else:
        anthropic_stop_reason = "end_turn"

    return AnthropicResponse(
        id=request_id,
        model=request.model,
        content=content_blocks,
        stop_reason=anthropic_stop_reason,
        stop_sequence=stop_sequence_used,
        usage=AnthropicUsage(
            input_tokens=input_tokens,
            output_tokens=output_tokens
        )
    )
# ============================================================================
# Files API (Beta)
# ============================================================================
# In-memory file storage keyed by file id (for demo - in production use persistent storage).
# Each value holds the file's metadata plus its raw content; everything is lost on restart.
files_storage: Dict[str, Dict] = {}
class FileUploadResponse(BaseModel):
    """Metadata returned after a successful POST /v1/files upload."""
    id: str
    object: str = "file"
    bytes: int  # size of the uploaded content
    created_at: int  # Unix timestamp in seconds
    filename: str
    purpose: str
@app.post("/v1/files")
async def upload_file(
    request: Request,
    authorization: Optional[str] = Header(None),
):
    """Upload a file for use across multiple API calls.

    Reads the ``file`` and optional ``purpose`` (default "assistants") fields
    of the submitted form, stores the content in memory and returns the
    file's metadata.

    Raises:
        HTTPException: 401 when the API key check fails; 400 when the form
            has no ``file`` field or the field is not an uploaded file.
    """
    if not verify_api_key(authorization):
        raise HTTPException(status_code=401, detail="Invalid API key")
    form = await request.form()
    file = form.get("file")
    purpose = form.get("purpose", "assistants")
    # A "file" field submitted as ordinary text has no read()/filename; reject
    # it with a 400 instead of failing later with an AttributeError (500).
    if not file or not hasattr(file, "read") or not hasattr(file, "filename"):
        raise HTTPException(status_code=400, detail="No file provided")
    file_id = f"file-{uuid.uuid4().hex[:24]}"
    content = await file.read()
    file_data = {
        "id": file_id,
        "object": "file",
        "bytes": len(content),
        "created_at": int(time.time()),
        "filename": file.filename,
        "purpose": purpose,
        "content": content  # Store content in memory
    }
    files_storage[file_id] = file_data
    return FileUploadResponse(
        id=file_id,
        bytes=len(content),
        created_at=file_data["created_at"],
        filename=file.filename,
        purpose=purpose
    )
@app.get("/v1/files")
async def list_files(
    authorization: Optional[str] = Header(None),
    purpose: Optional[str] = None,
):
    """List metadata for stored files, optionally restricted to one purpose."""
    if not verify_api_key(authorization):
        raise HTTPException(status_code=401, detail="Invalid API key")

    # Metadata only: the stored content is never included in the listing.
    visible = [
        {
            "id": record["id"],
            "object": "file",
            "bytes": record["bytes"],
            "created_at": record["created_at"],
            "filename": record["filename"],
            "purpose": record["purpose"]
        }
        for record in files_storage.values()
        if not purpose or record.get("purpose") == purpose
    ]
    return {"object": "list", "data": visible}
@app.get("/v1/files/{file_id}")
async def get_file(
    file_id: str,
    authorization: Optional[str] = Header(None),
):
    """Return the metadata of one stored file, or 404 if the id is unknown."""
    if not verify_api_key(authorization):
        raise HTTPException(status_code=401, detail="Invalid API key")

    record = files_storage.get(file_id)
    if record is None:
        raise HTTPException(status_code=404, detail="File not found")

    # Metadata only: the stored content is not returned.
    metadata = {"id": record["id"], "object": "file"}
    for key in ("bytes", "created_at", "filename", "purpose"):
        metadata[key] = record[key]
    return metadata
@app.delete("/v1/files/{file_id}")
async def delete_file(
    file_id: str,
    authorization: Optional[str] = Header(None),
):
    """Remove one stored file, or respond 404 if the id is unknown."""
    if not verify_api_key(authorization):
        raise HTTPException(status_code=401, detail="Invalid API key")

    # Stored records are always dicts, so None reliably means "not present".
    removed = files_storage.pop(file_id, None)
    if removed is None:
        raise HTTPException(status_code=404, detail="File not found")
    return {"id": file_id, "object": "file", "deleted": True}
# ============================================================================
# Skills API (Beta)
# ============================================================================
# In-memory skill records keyed by skill id; contents are lost on restart.
skills_storage: Dict[str, Dict] = {}
class SkillCreate(BaseModel):
    """Request body for POST /v1/skills."""
    name: str
    description: Optional[str] = None
    instructions: str
    tools: Optional[List[Dict]] = None  # stored as [] when omitted
class SkillResponse(BaseModel):
    """A stored skill as returned by POST /v1/skills."""
    id: str
    object: str = "skill"
    name: str
    description: Optional[str] = None
    instructions: str
    tools: Optional[List[Dict]] = None
    created_at: int  # Unix timestamp in seconds
@app.post("/v1/skills")
async def create_skill(
    request: SkillCreate,
    authorization: Optional[str] = Header(None),
):
    """Store a new custom agent skill and return the stored record."""
    if not verify_api_key(authorization):
        raise HTTPException(status_code=401, detail="Invalid API key")

    skill_id = f"skill-{uuid.uuid4().hex[:24]}"
    record = dict(
        id=skill_id,
        object="skill",
        name=request.name,
        description=request.description,
        instructions=request.instructions,
        tools=request.tools or [],
        created_at=int(time.time()),
    )
    skills_storage[skill_id] = record
    return SkillResponse(**record)
@app.get("/v1/skills")
async def list_skills(
    authorization: Optional[str] = Header(None),
):
    """Return every stored skill in list format."""
    if not verify_api_key(authorization):
        raise HTTPException(status_code=401, detail="Invalid API key")

    # Hand out shallow copies rather than the stored dicts themselves.
    snapshot = [dict(skill) for skill in skills_storage.values()]
    return {"object": "list", "data": snapshot}
@app.get("/v1/skills/{skill_id}")
async def get_skill(
    skill_id: str,
    authorization: Optional[str] = Header(None),
):
    """Return one stored skill record, or 404 if the id is unknown."""
    if not verify_api_key(authorization):
        raise HTTPException(status_code=401, detail="Invalid API key")

    try:
        return skills_storage[skill_id]
    except KeyError:
        raise HTTPException(status_code=404, detail="Skill not found") from None
@app.delete("/v1/skills/{skill_id}")
async def delete_skill(
    skill_id: str,
    authorization: Optional[str] = Header(None),
):
    """Remove one stored skill, or respond 404 if the id is unknown."""
    if not verify_api_key(authorization):
        raise HTTPException(status_code=401, detail="Invalid API key")

    # Stored records are always dicts, so None reliably means "not present".
    removed = skills_storage.pop(skill_id, None)
    if removed is None:
        raise HTTPException(status_code=404, detail="Skill not found")
    return {"id": skill_id, "object": "skill", "deleted": True}
# ============================================================================
# Message Batches API (Anthropic-style interface; this implementation runs each request inline, so no real async or cost benefit applies)
# ============================================================================
# In-memory batch records; contents are lost on restart.
# NOTE(review): presumably keyed by batch id — the code that writes to it is
# outside the visible portion of the file; confirm.
batches_storage: Dict[str, Dict] = {}
class BatchRequest(BaseModel):
    """One entry of a message batch: a caller-chosen id plus the request parameters."""
    custom_id: str  # echoed back alongside this entry's result
    params: Dict # Contains the message request parameters
class CreateBatchRequest(BaseModel):
    """Request body for POST /v1/messages/batches."""
    requests: List[BatchRequest]
class BatchResponse(BaseModel):
    """Status record describing a message batch."""
    id: str
    type: str = "message_batch"
    processing_status: str # "in_progress", "ended"
    request_counts: Dict  # per-outcome counters (e.g. succeeded, errored)
    ended_at: Optional[int] = None
    created_at: int
    expires_at: int
    results_url: Optional[str] = None
def _batch_text_from_content(content: Any) -> Any:
    """Reduce Anthropic-style content to plain text.

    A list of content blocks becomes the space-joined ``text`` of its
    ``"text"`` blocks; any other value (normally a string) passes through
    unchanged.
    """
    if isinstance(content, list):
        return " ".join(
            block.get("text", "") for block in content if block.get("type") == "text"
        )
    return content


@app.post("/v1/messages/batches")
async def create_message_batch(
    request: CreateBatchRequest,
    authorization: Optional[str] = Header(None),
    x_api_key: Optional[str] = Header(None, alias="x-api-key"),
):
    """Create a message batch and process every entry before returning.

    Processing is simulated: each entry is generated inline, so the batch is
    already in the ``"ended"`` state when the response is sent. A failure in
    one entry is recorded as an ``"errored"`` result and does not abort the
    rest of the batch.

    Each entry's ``params`` may carry ``messages``, ``max_tokens``
    (default 1024), ``model``, ``system`` and ``temperature``.

    Args:
        request: The batch submission.
        authorization: Raw ``Authorization`` header value.
        x_api_key: Raw ``x-api-key`` header value; takes precedence over
            ``authorization`` when both are present.

    Returns:
        BatchResponse with the final counters and the results URL.

    Raises:
        HTTPException: 401 when the API key is rejected.
    """
    auth_key = x_api_key or authorization
    if not verify_api_key(auth_key):
        raise HTTPException(status_code=401, detail="Invalid API key")

    batch_id = f"batch_{uuid.uuid4().hex[:24]}"
    created_at = int(time.time())
    results_url = f"/v1/messages/batches/{batch_id}/results"

    results = []
    succeeded = 0
    failed = 0
    for req in request.requests:
        try:
            params = req.params
            formatted_msgs = [
                {
                    "role": m.get("role"),
                    "content": _batch_text_from_content(m.get("content", "")),
                }
                for m in params.get("messages", [])
            ]

            # Forward the system prompt only when one was supplied, so that
            # requests without it keep the formatter's own default.
            format_kwargs = {}
            system = params.get("system")
            if system:
                format_kwargs["system_prompt"] = _batch_text_from_content(system)
            prompt = format_messages_for_model(formatted_msgs, **format_kwargs)

            # Same rule for temperature: a missing (or zero) value falls back
            # to the generator's default, matching the `or` convention used by
            # the other endpoints in this file.
            generation_kwargs = {"max_tokens": params.get("max_tokens", 1024)}
            if params.get("temperature"):
                generation_kwargs["temperature"] = params["temperature"]
            response_text, _, input_tokens, output_tokens, _ = generate_response(
                prompt, **generation_kwargs
            )

            results.append({
                "custom_id": req.custom_id,
                "result": {
                    "type": "succeeded",
                    "message": {
                        "id": f"msg_{uuid.uuid4().hex[:24]}",
                        "type": "message",
                        "role": "assistant",
                        "content": [{"type": "text", "text": response_text}],
                        "model": params.get("model", "claude-3-sonnet"),
                        "stop_reason": "end_turn",
                        "usage": {"input_tokens": input_tokens, "output_tokens": output_tokens}
                    }
                }
            })
            succeeded += 1
        except Exception as e:
            # Deliberate per-entry boundary: report the failure in the batch
            # results instead of failing the whole submission.
            results.append({
                "custom_id": req.custom_id,
                "result": {
                    "type": "errored",
                    "error": {"type": "server_error", "message": str(e)}
                }
            })
            failed += 1

    batch_data = {
        "id": batch_id,
        "type": "message_batch",
        "processing_status": "ended",
        "request_counts": {
            "processing": 0,
            "succeeded": succeeded,
            "errored": failed,
            "canceled": 0,
            "expired": 0
        },
        "ended_at": int(time.time()),
        "created_at": created_at,
        "expires_at": created_at + 86400,  # 24 hours
        # Stored so the get/list endpoints report the same URL as this response.
        "results_url": results_url,
        "results": results
    }
    batches_storage[batch_id] = batch_data

    return BatchResponse(
        id=batch_id,
        processing_status="ended",
        request_counts=batch_data["request_counts"],
        ended_at=batch_data["ended_at"],
        created_at=created_at,
        expires_at=batch_data["expires_at"],
        results_url=results_url
    )
@app.get("/v1/messages/batches")
async def list_batches(
    authorization: Optional[str] = Header(None),
    x_api_key: Optional[str] = Header(None, alias="x-api-key"),
):
    """List the metadata of every stored message batch.

    Each entry is the stored batch without its ``results`` payload.

    Raises:
        HTTPException: 401 when the API key (``x-api-key`` first, then
            ``Authorization``) is rejected.
    """
    if not verify_api_key(x_api_key or authorization):
        raise HTTPException(status_code=401, detail="Invalid API key")

    summaries = []
    for stored in batches_storage.values():
        # Shallow copy so that dropping "results" leaves the store untouched.
        summary = dict(stored)
        summary.pop("results", None)
        summaries.append(summary)
    return {"object": "list", "data": summaries}
@app.get("/v1/messages/batches/{batch_id}")
async def get_batch(
    batch_id: str,
    authorization: Optional[str] = Header(None),
    x_api_key: Optional[str] = Header(None, alias="x-api-key"),
):
    """Return the metadata of one batch, without its ``results`` payload.

    Raises:
        HTTPException: 401 for a rejected API key, 404 when ``batch_id`` is
            unknown.
    """
    if not verify_api_key(x_api_key or authorization):
        raise HTTPException(status_code=401, detail="Invalid API key")

    # Stored batches are dicts, so None can only mean "not present".
    stored = batches_storage.get(batch_id)
    if stored is None:
        raise HTTPException(status_code=404, detail="Batch not found")
    return {field: stored[field] for field in stored if field != "results"}
@app.get("/v1/messages/batches/{batch_id}/results")
async def get_batch_results(
    batch_id: str,
    authorization: Optional[str] = Header(None),
    x_api_key: Optional[str] = Header(None, alias="x-api-key"),
):
    """Return the per-request results of a finished batch.

    The results are delivered as one JSON object
    (``{"results": [...]}``), not as a JSONL stream.

    Raises:
        HTTPException: 401 for a rejected API key, 404 when ``batch_id`` is
            unknown, 400 while the batch has not ended yet.
    """
    if not verify_api_key(x_api_key or authorization):
        raise HTTPException(status_code=401, detail="Invalid API key")

    stored = batches_storage.get(batch_id)
    if stored is None:
        raise HTTPException(status_code=404, detail="Batch not found")

    still_running = stored["processing_status"] != "ended"
    if still_running:
        raise HTTPException(status_code=400, detail="Batch still processing")

    return {"results": stored.get("results", [])}
@app.post("/v1/messages/batches/{batch_id}/cancel")
async def cancel_batch(
    batch_id: str,
    authorization: Optional[str] = Header(None),
    x_api_key: Optional[str] = Header(None, alias="x-api-key"),
):
    """Cancel a batch that has not ended yet.

    Requests still counted as ``processing`` are moved to ``canceled``, the
    batch is marked ``"ended"`` and its ``ended_at`` timestamp is recorded.

    Returns:
        The updated batch metadata, without the ``results`` payload.

    Raises:
        HTTPException: 401 for a rejected API key, 404 when ``batch_id`` is
            unknown, 400 when the batch has already ended.
    """
    auth_key = x_api_key or authorization
    if not verify_api_key(auth_key):
        raise HTTPException(status_code=401, detail="Invalid API key")
    if batch_id not in batches_storage:
        raise HTTPException(status_code=404, detail="Batch not found")

    batch = batches_storage[batch_id]
    if batch["processing_status"] == "ended":
        raise HTTPException(status_code=400, detail="Batch already ended")

    counts = batch["request_counts"]
    counts["canceled"] = counts.get("processing", 0)
    counts["processing"] = 0
    batch["processing_status"] = "ended"
    # An ended batch must carry its end time, as batches do at creation.
    batch["ended_at"] = int(time.time())

    return {k: v for k, v in batch.items() if k != "results"}
# ============================================================================
# Anthropic Separate Base Path: /anthropic/v1/
# ============================================================================
@app.post("/anthropic/v1/messages")
async def anthropic_messages_separate(
    request: AnthropicRequest,
    authorization: Optional[str] = Header(None),
    x_api_key: Optional[str] = Header(None, alias="x-api-key"),
    anthropic_version: Optional[str] = Header(None, alias="anthropic-version"),
):
    """Serve the Anthropic Messages API under the ``/anthropic/v1`` base path.

    All arguments are forwarded unchanged to ``anthropic_messages``, whose
    result is returned as-is.
    """
    forwarded = anthropic_messages(request, authorization, x_api_key, anthropic_version)
    return await forwarded
@app.get("/anthropic/v1/models")
async def anthropic_list_models():
    """List the Anthropic model identifiers advertised by this server.

    Every entry is stamped with the current time as its ``created`` value and
    ``"anthropic"`` as its owner.
    """
    advertised_ids = (
        "claude-3-opus-20240229",
        "claude-3-sonnet-20240229",
        "claude-3-haiku-20240307",
        "claude-3-5-sonnet-20241022",
        "claude-3-5-haiku-20241022",
        "claude-3-opus",
        "claude-3-sonnet",
        "claude-3-haiku",
        "claude-3-5-sonnet",
        "claude-code",
    )
    entries = [
        {
            "id": model_id,
            "object": "model",
            "created": int(time.time()),
            "owned_by": "anthropic",
        }
        for model_id in advertised_ids
    ]
    return {"object": "list", "data": entries}
@app.get("/anthropic")
async def anthropic_info():
    """Describe the Anthropic-compatible surface: version, base URL, endpoints and features."""
    endpoint_map = {
        "messages": "/anthropic/v1/messages",
        "models": "/anthropic/v1/models"
    }
    feature_flags = ["prefill_response", "thinking", "streaming"]
    return {
        "name": "Anthropic Compatible API",
        "version": ANTHROPIC_VERSION,
        "base_url": "/anthropic/v1",
        "endpoints": endpoint_map,
        "features": feature_flags
    }
# ============================================================================
# Computer Use Agent (CUA) - Pydantic Models
# ============================================================================
class CUAToolAction(BaseModel):
    """One computer-control action; which optional fields apply depends on ``type``."""

    # Action kind: "click", "type", "scroll", "screenshot", "key", "move",
    # "drag" or "wait".
    type: str

    # --- click / move / drag: pointer position and mouse button ---
    x: Optional[int] = None
    y: Optional[int] = None
    button: Optional[str] = "left"  # "left", "right" or "middle"

    # --- type: text to enter ---
    text: Optional[str] = None

    # --- key: key name plus optional modifier keys ---
    key: Optional[str] = None  # e.g. "enter", "tab", "escape", "backspace"
    modifiers: Optional[List[str]] = None  # e.g. ["ctrl", "shift", "alt", "meta"]

    # --- scroll: direction and distance ---
    direction: Optional[str] = None  # "up", "down", "left" or "right"
    amount: Optional[int] = None  # pixels or lines

    # --- drag: start and end points ---
    start_x: Optional[int] = None
    start_y: Optional[int] = None
    end_x: Optional[int] = None
    end_y: Optional[int] = None

    # --- wait: pause length in seconds ---
    duration: Optional[float] = None
class CUAToolResult(BaseModel):
    """Outcome of a computer-use tool action, reported back by the client."""

    type: str = "tool_result"
    # Id of the tool_use block this result answers.
    tool_use_id: str
    # Plain text or a list of content blocks.
    content: Optional[Union[str, List[Dict]]] = None
    is_error: Optional[bool] = False
class CUAScreenInfo(BaseModel):
    """Screen geometry supplied with a CUA request (defaults to 1920x1080)."""

    width: int = 1920
    height: int = 1080
    display_number: Optional[int] = 0
class CUAComputerTool(BaseModel):
    """Definition of the ``computer`` tool, including the display it controls."""

    type: str = "computer_20241022"
    name: str = "computer"
    # Display size in pixels.
    display_width_px: int = 1920
    display_height_px: int = 1080
    display_number: Optional[int] = 0
class CUAMessage(BaseModel):
    """A conversation turn in a CUA request."""

    role: str
    # Plain text, or a list of content blocks (the CUA endpoint reads the
    # "text", "image" and "tool_result" block types).
    content: Union[str, List[Dict]]
class CUARequest(BaseModel):
    """Request body accepted by the Computer Use Agent endpoints."""

    model: str = "sheikh-computer-use-preview"
    messages: List[CUAMessage]
    max_tokens: int = 4096

    # Computer-use specific options.
    tools: Optional[List[Dict]] = None
    tool_choice: Optional[Dict] = None

    # Screen geometry; the endpoint falls back to 1920x1080 when omitted.
    screen: Optional[CUAScreenInfo] = None

    # Standard generation parameters.
    system: Optional[str] = None
    temperature: Optional[float] = 0.7
    stream: Optional[bool] = False

    # Thinking-mode configuration.
    thinking: Optional[AnthropicThinkingConfig] = None
class CUAToolUseBlock(BaseModel):
    """A ``tool_use`` content block: a tool invocation requested by the assistant."""

    type: str = "tool_use"
    id: str
    name: str
    # Arguments of the invocation.
    input: Dict
class CUAResponse(BaseModel):
    """Non-streaming response of the Computer Use Agent endpoints."""

    id: str
    type: str = "message"
    role: str = "assistant"
    model: str
    # Content blocks: a text block, optionally followed by a tool_use block.
    content: List[Dict]
    stop_reason: Optional[str] = None
    # Token counters ("input_tokens" / "output_tokens").
    usage: Dict
# ============================================================================
# CUA - Computer Action Parser
# ============================================================================
def parse_computer_action_from_text(text: str, screen_width: int = 1920, screen_height: int = 1080) -> Optional[Dict]:
    """Extract a single computer action from the model's free-text reply.

    Pattern families are tried in a fixed priority order and the first family
    that matches wins:

    1. ``click`` / ``double click`` / ``right click`` at coordinates
    2. ``type "text"`` (original casing of the text is preserved)
    3. ``press [the] <key> key``
    4. any mention of a screenshot / screen capture / "take a picture"
    5. ``scroll <direction> [amount]`` (amount defaults to 3)
    6. ``move [mouse] [to] x, y``
    7. ``drag from x1, y1 to x2, y2``

    Coordinates may be written with or without parentheses and separated by
    a comma or whitespace.

    Args:
        text: The model's response text.
        screen_width: Screen width in pixels; scrolls are anchored at the
            screen centre.
        screen_height: Screen height in pixels.

    Returns:
        A ``tool_use`` content block (``type``, ``id``, ``name``, ``input``)
        for the ``computer`` tool, or ``None`` when no action is recognised.
    """
    # Kept local: `re` is not among the module-level imports of this file.
    import re

    def tool_use(**action_input) -> Dict:
        """Wrap the action arguments in a tool_use block with a fresh id."""
        return {
            "type": "tool_use",
            "id": f"toolu_{uuid.uuid4().hex[:24]}",
            "name": "computer",
            "input": action_input,
        }

    # "(x, y)", "x, y" or "x y".
    point = r'(?:\()?(\d+)\s*[,\s]\s*(\d+)(?:\))?'
    text_lower = text.lower()

    # The optional "double"/"right" prefix must be part of the click pattern
    # itself: a bare `click` pattern also matches inside "double click at ..."
    # and would report every such instruction as a plain click.
    click_match = re.search(r'(?:(double|right)[- ])?click\s+(?:at\s+)?' + point, text_lower)
    if click_match:
        modifier, x, y = click_match.groups()
        action = f"{modifier}_click" if modifier else "click"
        return tool_use(action=action, coordinate=[int(x), int(y)])

    # Matched against the original text so the typed string keeps its casing.
    type_match = re.search(r'type\s+["\']([^"\']+)["\']', text, re.IGNORECASE)
    if type_match:
        return tool_use(action="type", text=type_match.group(1))

    key_match = re.search(r'press\s+(?:the\s+)?(\w+)\s+key', text_lower)
    if key_match:
        return tool_use(action="key", key=key_match.group(1))

    if any(phrase in text_lower for phrase in ('screenshot', 'screen capture', 'take a picture')):
        return tool_use(action="screenshot")

    scroll_match = re.search(r'scroll\s+(up|down|left|right)(?:\s+(\d+))?', text_lower)
    if scroll_match:
        direction, amount = scroll_match.groups()
        return tool_use(
            action="scroll",
            coordinate=[screen_width // 2, screen_height // 2],
            direction=direction,
            amount=int(amount) if amount else 3,
        )

    move_match = re.search(r'move\s+(?:mouse\s+)?(?:to\s+)?' + point, text_lower)
    if move_match:
        x, y = move_match.groups()
        return tool_use(action="mouse_move", coordinate=[int(x), int(y)])

    drag_match = re.search(r'drag\s+from\s+' + point + r'\s+to\s+' + point, text_lower)
    if drag_match:
        x1, y1, x2, y2 = (int(value) for value in drag_match.groups())
        return tool_use(
            action="left_click_drag",
            start_coordinate=[x1, y1],
            coordinate=[x2, y2],
        )

    return None
# ============================================================================
# Computer Use Agent (CUA) Endpoint
# ============================================================================
@app.post("/v1/cua")
async def computer_use_agent(
    request: CUARequest,
    authorization: Optional[str] = Header(None),
    x_api_key: Optional[str] = Header(None, alias="x-api-key"),
):
    """Computer Use Agent endpoint (sheikh-computer-use-preview).

    The conversation is flattened to text, prefixed with a system prompt that
    lists the available computer actions, and sent to the model. The reply is
    returned as a text block; when ``parse_computer_action_from_text``
    recognises an action in it, a ``tool_use`` block is appended and the stop
    reason becomes ``"tool_use"`` (otherwise ``"end_turn"``).

    With ``stream=True`` the reply is delivered as server-sent events
    (message_start, content_block_*, message_delta, message_stop); otherwise a
    ``CUAResponse`` is returned.

    Raises:
        HTTPException: 401 when the API key (``x-api-key`` first, then
            ``Authorization``) is rejected.
    """
    if not verify_api_key(x_api_key or authorization):
        raise HTTPException(status_code=401, detail="Invalid API key")

    # Screen geometry: request value if present, 1920x1080 otherwise.
    screen_width, screen_height = 1920, 1080
    if request.screen:
        screen_width, screen_height = request.screen.width, request.screen.height

    # System prompt describing the action vocabulary the parser understands.
    cua_system_prompt = f"""You are a Computer Use Agent (CUA) that helps users interact with computers.
You can control the computer by describing actions you want to take.
Available actions:
- click at (x, y) - Click at screen coordinates
- double click at (x, y) - Double click at coordinates
- right click at (x, y) - Right click at coordinates
- type "text" - Type the specified text
- press [key] key - Press a key (enter, tab, escape, backspace, etc.)
- scroll [up/down/left/right] [amount] - Scroll the screen
- move mouse to (x, y) - Move cursor to coordinates
- drag from (x1, y1) to (x2, y2) - Drag from one point to another
- screenshot - Request a screenshot of the current screen
Screen resolution: {screen_width}x{screen_height}
When analyzing a screenshot or user request, describe the actions needed step by step.
Always specify exact coordinates when performing click or move actions.
Be precise and methodical in your approach."""
    if request.system:
        # A caller-supplied system prompt goes first, followed by the CUA instructions.
        cua_system_prompt = request.system + "\n\n" + cua_system_prompt

    # Flatten every message to plain text; images and tool results are
    # replaced by textual placeholders.
    messages = []
    for msg in request.messages:
        body = msg.content
        if isinstance(body, list):
            pieces = []
            for block in body:
                if not isinstance(block, dict):
                    continue
                kind = block.get("type")
                if kind == "text":
                    pieces.append(block.get("text", ""))
                elif kind == "image":
                    pieces.append("[Screenshot provided - analyzing...]")
                elif kind == "tool_result":
                    pieces.append(f"[Tool result: {block.get('content', '')}]")
            body = "\n".join(pieces)
        elif not isinstance(body, str):
            continue
        messages.append({"role": msg.role, "content": body})

    # Split off a prefill, if any, before building the prompt.
    messages, prefill = extract_prefill_from_messages(messages)
    prompt = format_messages_for_model(messages, system_prompt=cua_system_prompt, prefill=prefill)
    request_id = f"msg_{uuid.uuid4().hex[:24]}"

    if request.stream:
        def sse_frame(payload):
            # The SSE event name is always the payload's own "type".
            return f"event: {payload['type']}\ndata: {json.dumps(payload)}\n\n"

        async def stream_generator():
            yield sse_frame({
                "type": "message_start",
                "message": {
                    "id": request_id,
                    "type": "message",
                    "role": "assistant",
                    "model": request.model,
                    "content": [],
                    "stop_reason": None,
                    "usage": {"input_tokens": 0, "output_tokens": 0}
                }
            })

            # Block 0: the streamed text.
            yield sse_frame({
                "type": "content_block_start",
                "index": 0,
                "content_block": {"type": "text", "text": ""}
            })
            full_text = ""
            output_tokens = 0  # counts streamed chunks
            async for token in generate_stream(
                prompt,
                max_tokens=request.max_tokens,
                temperature=request.temperature or 0.7,
            ):
                full_text += token
                output_tokens += 1
                yield sse_frame({
                    "type": "content_block_delta",
                    "index": 0,
                    "delta": {"type": "text_delta", "text": token}
                })
            yield sse_frame({'type': 'content_block_stop', 'index': 0})

            # Block 1 (optional): the action parsed from the complete text.
            tool_action = parse_computer_action_from_text(full_text, screen_width, screen_height)
            if tool_action:
                yield sse_frame({
                    "type": "content_block_start",
                    "index": 1,
                    "content_block": {
                        "type": "tool_use",
                        "id": tool_action["id"],
                        "name": tool_action["name"],
                        "input": {}
                    }
                })
                # The whole tool input is sent as a single JSON delta.
                yield sse_frame({
                    "type": "content_block_delta",
                    "index": 1,
                    "delta": {"type": "input_json_delta", "partial_json": json.dumps(tool_action["input"])}
                })
                yield sse_frame({'type': 'content_block_stop', 'index': 1})

            yield sse_frame({
                "type": "message_delta",
                "delta": {"stop_reason": "tool_use" if tool_action else "end_turn"},
                "usage": {"output_tokens": output_tokens}
            })
            yield sse_frame({'type': 'message_stop'})

        return StreamingResponse(
            stream_generator(),
            media_type="text/event-stream",
            headers={"Cache-Control": "no-cache", "Connection": "keep-alive"}
        )

    # Non-streaming: generate the full reply in one call. The thinking text
    # and the generator's own stop reason are not used by this endpoint.
    response_text, _thinking, input_tokens, output_tokens, _ = generate_response(
        prompt,
        max_tokens=request.max_tokens,
        temperature=request.temperature or 0.7,
    )
    if prefill:
        full_response = prefill + response_text
    else:
        full_response = response_text

    content_blocks = [{"type": "text", "text": full_response}]
    tool_action = parse_computer_action_from_text(full_response, screen_width, screen_height)
    if tool_action:
        content_blocks.append(tool_action)

    return CUAResponse(
        id=request_id,
        model=request.model,
        content=content_blocks,
        stop_reason="tool_use" if tool_action else "end_turn",
        usage={
            "input_tokens": input_tokens,
            "output_tokens": output_tokens
        }
    )
# Alternative endpoint paths for compatibility
@app.post("/v1/computer-use")
async def computer_use_alt(
    request: CUARequest,
    authorization: Optional[str] = Header(None),
    x_api_key: Optional[str] = Header(None, alias="x-api-key"),
):
    """Compatibility alias for the Computer Use Agent endpoint.

    Forwards all arguments to ``computer_use_agent`` and returns its result.
    """
    agent_call = computer_use_agent(request, authorization, x_api_key)
    return await agent_call
# ============================================================================
# CUA Separate Base Path: /cua/v1/
# ============================================================================
@app.post("/cua/v1/messages")
async def cua_messages(
    request: CUARequest,
    authorization: Optional[str] = Header(None),
    x_api_key: Optional[str] = Header(None, alias="x-api-key"),
):
    """Serve the Computer Use Agent under the ``/cua/v1`` base path.

    Forwards all arguments to ``computer_use_agent`` and returns its result.
    """
    agent_call = computer_use_agent(request, authorization, x_api_key)
    return await agent_call
@app.get("/cua/v1/models")
async def cua_list_models():
    """List the CUA model identifiers and their advertised capabilities."""
    entries = [
        {
            "id": model_id,
            "object": "model",
            "created": int(time.time()),
            "owned_by": "sheikh-ai",
            # Both identifiers advertise the same capability set.
            "capabilities": {
                "computer_use": True,
                "vision": True,
                "tool_use": True
            }
        }
        for model_id in ("sheikh-computer-use-preview", "computer-use-preview")
    ]
    return {"object": "list", "data": entries}
@app.get("/cua")
async def cua_info():
    """Describe the CUA surface: model, base URL, endpoints, actions and default screen."""
    endpoint_map = {
        "messages": "/cua/v1/messages",
        "models": "/cua/v1/models"
    }
    action_names = [
        "click", "double_click", "right_click",
        "type", "key", "scroll",
        "mouse_move", "left_click_drag",
        "screenshot"
    ]
    return {
        "name": "Sheikh Computer Use Agent (CUA)",
        "version": "1.0.0",
        "model": "sheikh-computer-use-preview",
        "base_url": "/cua/v1",
        "endpoints": endpoint_map,
        "supported_actions": action_names,
        "screen_default": {"width": 1920, "height": 1080}
    }
# ============================================================================
# Health & Info Endpoints
# ============================================================================
@app.get("/")
async def root():
    """Return the service index: name, version, model, feature flags and per-API endpoint maps."""
    feature_status = {
        "prefill_response": "Supported",
        "thinking": "Supported",
        "streaming": "Supported",
        "computer_use": "Supported",
        "files_api": "Beta",
        "skills_api": "Beta",
        "message_batches": "Supported (50% cost reduction)"
    }
    openai_routes = {
        "base_url": "/v1",
        "chat": "/v1/chat/completions",
        "models": "/v1/models",
        "files": "/v1/files",
        "skills": "/v1/skills"
    }
    anthropic_routes = {
        "base_url": "/anthropic/v1",
        "messages": "/anthropic/v1/messages",
        "batches": "/v1/messages/batches",
        "models": "/anthropic/v1/models"
    }
    cua_routes = {
        "base_url": "/cua/v1",
        "messages": "/cua/v1/messages",
        "models": "/cua/v1/models",
        "model": "sheikh-computer-use-preview"
    }
    return {
        "name": "Free Coding API",
        "version": "1.3.0",
        "model": MODEL_ID,
        "features": feature_status,
        "openai": openai_routes,
        "anthropic": anthropic_routes,
        "cua": cua_routes,
        "docs": "/docs"
    }
@app.get("/health")
async def health():
    """Report service health; ``model_loaded`` is true once the global ``model`` is set."""
    is_loaded = model is not None
    return {
        "status": "healthy",
        "model_loaded": is_loaded,
        "model_id": MODEL_ID
    }
# ============================================================================
# Main Entry Point
# ============================================================================
if __name__ == "__main__":
    import uvicorn

    # Listen on all interfaces, port 7860, when run directly as a script.
    server_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **server_options)