Spaces:

megharudushi
/

free-coding-api

Sleeping

App Files Files Community

megharudushi committed 8 days ago

Commit

2d169db

verified ·

1 Parent(s): 1e22395

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +452 -5

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ Author: Matrix Agent
 Features:
 - Full OpenAI API compatibility (/v1/chat/completions)
 - Full Anthropic API compatibility (/v1/messages)
 - Prefill Response Support (assistant message prefix for output control)
 - Thinking/Reasoning Content Block Support
 - Optimized for coding tasks
@@ -14,6 +15,7 @@ Features:
 API Specifications verified against:
 - OpenAI: https://platform.openai.com/docs/api-reference/chat/create
 - Anthropic: https://docs.anthropic.com/en/api/messages
 - Prefill: https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/prefill-claudes-response
 - MiniMax Anthropic: https://platform.minimax.io/docs/api-reference/text-anthropic-api
 """
@@ -64,6 +66,9 @@ MODEL_ALIASES = {
     "claude-3-haiku": MODEL_ID,
     "claude-3-5-sonnet": MODEL_ID,
     "claude-code": MODEL_ID,
 }
 API_KEY = os.getenv("API_KEY", "sk-free-coding-api")
@@ -517,8 +522,8 @@ async def lifespan(app: FastAPI):
 app = FastAPI(
     title="Free Coding API",
-    description="OpenAI & Anthropic compatible API with Prefill & Thinking support",
-    version="1.1.0",
     lifespan=lifespan
 )
@@ -907,6 +912,443 @@ async def anthropic_messages(
         )
     )
 # ============================================================================
 # Health & Info Endpoints
 # ============================================================================
@@ -915,22 +1357,27 @@ async def anthropic_messages(
 async def root():
     return {
         "name": "Free Coding API",
-        "version": "1.1.0",
         "model": MODEL_ID,
         "features": {
             "prefill_response": "Supported - Include assistant message at end for output control",
             "thinking": "Supported - Enable with thinking: {type: 'enabled'}",
-            "streaming": "Supported - Both OpenAI and Anthropic formats"
         },
         "compatibility": {
             "openai": "v1 Chat Completions API",
-            "anthropic": "Messages API (2023-06-01)"
         },
         "endpoints": {
             "openai_chat": "/v1/chat/completions",
             "anthropic_messages": "/v1/messages",
             "models": "/v1/models"
         },
         "docs": "/docs"
     }

 Features:
 - Full OpenAI API compatibility (/v1/chat/completions)
 - Full Anthropic API compatibility (/v1/messages)
+- Computer Use Agent (CUA) endpoint (/v1/cua)
 - Prefill Response Support (assistant message prefix for output control)
 - Thinking/Reasoning Content Block Support
 - Optimized for coding tasks
 API Specifications verified against:
 - OpenAI: https://platform.openai.com/docs/api-reference/chat/create
 - Anthropic: https://docs.anthropic.com/en/api/messages
+- Anthropic Computer Use: https://docs.anthropic.com/en/docs/agents-and-tools/computer-use
 - Prefill: https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/prefill-claudes-response
 - MiniMax Anthropic: https://platform.minimax.io/docs/api-reference/text-anthropic-api
 """
     "claude-3-haiku": MODEL_ID,
     "claude-3-5-sonnet": MODEL_ID,
     "claude-code": MODEL_ID,
+    # Computer Use Agent (CUA) model
+    "sheikh-computer-use-preview": MODEL_ID,
+    "computer-use-preview": MODEL_ID,
 }
 API_KEY = os.getenv("API_KEY", "sk-free-coding-api")
 app = FastAPI(
     title="Free Coding API",
+    description="OpenAI & Anthropic compatible API with Prefill, Thinking & Computer Use Agent (CUA) support",
+    version="1.2.0",
     lifespan=lifespan
 )
         )
     )
+# ============================================================================
+# Computer Use Agent (CUA) - Pydantic Models
+# ============================================================================
class CUAToolAction(BaseModel):
    """Computer use tool action.

    Fields are grouped by the action kinds they apply to; every field other
    than ``type`` is optional.
    """

    # Action kind: "click", "type", "scroll", "screenshot", "key", "move",
    # "drag" or "wait".
    type: str

    # Pointer target and mouse button -- click / move / drag.
    x: Optional[int] = None
    y: Optional[int] = None
    button: Optional[str] = "left"  # one of "left", "right", "middle"

    # Text to enter -- type.
    text: Optional[str] = None

    # Key name ("enter", "tab", "escape", "backspace", ...) and held
    # modifiers (["ctrl", "shift", "alt", "meta"]) -- key.
    key: Optional[str] = None
    modifiers: Optional[List[str]] = None

    # Direction ("up", "down", "left", "right") and distance in pixels or
    # lines -- scroll.
    direction: Optional[str] = None
    amount: Optional[int] = None

    # Start and end points -- drag.
    start_x: Optional[int] = None
    start_y: Optional[int] = None
    end_x: Optional[int] = None
    end_y: Optional[int] = None

    # Pause length in seconds -- wait.
    duration: Optional[float] = None
class CUAToolResult(BaseModel):
    """Result of a computer use tool action, reported back by the client."""

    # Block discriminator; defaults to "tool_result".
    type: str = "tool_result"
    # Identifier of the tool_use block this result belongs to.
    tool_use_id: str
    # Either plain text or a list of content-block dicts.
    content: Optional[Union[str, List[Dict]]] = None
    # Marks the result as a failure when true.
    is_error: Optional[bool] = False
class CUAScreenInfo(BaseModel):
    """Screen configuration for CUA (defaults describe a 1920x1080 display)."""

    # Screen dimensions.
    width: int = 1920
    height: int = 1080
    # Index of the display.
    display_number: Optional[int] = 0
class CUAComputerTool(BaseModel):
    """Computer use tool definition as it appears in a request's tool list."""

    # Tool version tag and tool name.
    type: str = "computer_20241022"
    name: str = "computer"
    # Display geometry in pixels.
    display_width_px: int = 1920
    display_height_px: int = 1080
    # Index of the display.
    display_number: Optional[int] = 0
class CUAMessage(BaseModel):
    """CUA message format: one conversation turn."""

    # Speaker of the turn.
    role: str
    # Plain text, or a list of content-block dicts (the endpoint handles
    # "text", "image" and "tool_result" blocks).
    content: Union[str, List[Dict]]
class CUARequest(BaseModel):
    """Computer Use Agent request body."""

    # Model name, conversation, and generation budget.
    model: str = "sheikh-computer-use-preview"
    messages: List[CUAMessage]
    max_tokens: int = 4096

    # Computer-use specific tool declarations and tool selection.
    tools: Optional[List[Dict]] = None
    tool_choice: Optional[Dict] = None

    # Screen configuration; optional.
    screen: Optional[CUAScreenInfo] = None

    # Standard generation parameters.
    system: Optional[str] = None
    temperature: Optional[float] = 0.7
    stream: Optional[bool] = False

    # Thinking mode configuration.
    thinking: Optional[AnthropicThinkingConfig] = None
class CUAToolUseBlock(BaseModel):
    """Tool use content block: a tool call made by the assistant."""

    # Block discriminator; defaults to "tool_use".
    type: str = "tool_use"
    # Identifier of this call.
    id: str
    # Name of the tool being called.
    name: str
    # Arguments for the call.
    input: Dict
class CUAResponse(BaseModel):
    """CUA response format: an assistant message with its content blocks."""

    # Message identifier.
    id: str
    # Envelope discriminators.
    type: str = "message"
    role: str = "assistant"
    # Model name reported to the client.
    model: str
    # Content blocks as plain dicts (text and, when detected, tool_use).
    content: List[Dict]
    # Reason generation ended, e.g. "tool_use" or "end_turn".
    stop_reason: Optional[str] = None
    # Token accounting, e.g. {"input_tokens": ..., "output_tokens": ...}.
    usage: Dict
+# ============================================================================
+# CUA - Computer Action Parser
+# ============================================================================
def _computer_tool_use(action_input: Dict) -> Dict:
    """Wrap *action_input* in a ``tool_use`` block for the ``computer`` tool with a fresh id."""
    return {
        "type": "tool_use",
        "id": f"toolu_{uuid.uuid4().hex[:24]}",
        "name": "computer",
        "input": action_input,
    }


def parse_computer_action_from_text(text: str, screen_width: int = 1920, screen_height: int = 1080) -> Optional[Dict]:
    """
    Parse a computer action from the model's text response.

    The model describes the action it wants in natural language; the first
    pattern family that matches (in the priority order below) is converted
    into a single ``tool_use`` block:

    1. click / double click / right click at (x, y)
    2. type "text"
    3. press [the] <key> key
    4. screenshot / screen capture / take a picture
    5. scroll <direction> [amount]
    6. move [mouse] [to] (x, y)
    7. drag from (x1, y1) to (x2, y2)

    Args:
        text: Raw model output.
        screen_width: Screen width in pixels; used to centre scroll actions.
        screen_height: Screen height in pixels; used to centre scroll actions.

    Returns:
        A ``tool_use`` dict (``type``, ``id``, ``name``, ``input``), or
        ``None`` when no action pattern is found.
    """
    import re

    # A coordinate pair such as "(120, 45)", "120,45" or "120 45".
    point = r'(?:\()?(\d+)\s*[,\s]\s*(\d+)(?:\))?'
    text_lower = text.lower()

    # Click family. The optional modifier is captured in the same pattern so
    # that "double click at ..." and "right-click at ..." are not swallowed by
    # the plain-click match (which previously made those actions unreachable).
    click_match = re.search(r'(?:\b(double|right)[- ])?click\s+(?:at\s+)?' + point, text_lower)
    if click_match:
        modifier, x, y = click_match.groups()
        action = f"{modifier}_click" if modifier else "click"
        return _computer_tool_use({"action": action, "coordinate": [int(x), int(y)]})

    # Type: matched against the original text so the typed string keeps its case.
    type_match = re.search(r'type\s+["\']([^"\']+)["\']', text, re.IGNORECASE)
    if type_match:
        return _computer_tool_use({"action": "type", "text": type_match.group(1)})

    # Key press.
    key_match = re.search(r'press\s+(?:the\s+)?(\w+)\s+key', text_lower)
    if key_match:
        return _computer_tool_use({"action": "key", "key": key_match.group(1)})

    # Screenshot request.
    if 'screenshot' in text_lower or 'screen capture' in text_lower or 'take a picture' in text_lower:
        return _computer_tool_use({"action": "screenshot"})

    # Scroll: anchored at the screen centre, default amount 3.
    scroll_match = re.search(r'scroll\s+(up|down|left|right)(?:\s+(\d+))?', text_lower)
    if scroll_match:
        direction, amount = scroll_match.groups()
        return _computer_tool_use({
            "action": "scroll",
            "coordinate": [screen_width // 2, screen_height // 2],
            "direction": direction,
            "amount": int(amount) if amount else 3,
        })

    # Move mouse.
    move_match = re.search(r'move\s+(?:mouse\s+)?(?:to\s+)?' + point, text_lower)
    if move_match:
        x, y = move_match.groups()
        return _computer_tool_use({"action": "mouse_move", "coordinate": [int(x), int(y)]})

    # Drag.
    drag_match = re.search(r'drag\s+from\s+' + point + r'\s+to\s+' + point, text_lower)
    if drag_match:
        x1, y1, x2, y2 = (int(g) for g in drag_match.groups())
        return _computer_tool_use({
            "action": "left_click_drag",
            "start_coordinate": [x1, y1],
            "coordinate": [x2, y2],
        })

    return None
+# ============================================================================
+# Computer Use Agent (CUA) Endpoint
+# ============================================================================
def _cua_block_text(block: Dict) -> Optional[str]:
    """Return the prompt text for one content block, or None for unhandled block types."""
    block_type = block.get("type")
    if block_type == "text":
        return block.get("text") or ""
    if block_type == "image":
        return "[Screenshot provided - analyzing...]"
    if block_type == "tool_result":
        result = block.get("content", "")
        if isinstance(result, list):
            # A tool result may itself be a list of blocks (typically the
            # screenshot image). Flatten it the same way instead of dumping
            # the dict repr -- and any base64 image data -- into the prompt.
            nested = [_cua_block_text(b) for b in result if isinstance(b, dict)]
            result = "\n".join(part for part in nested if part is not None)
        return f"[Tool result: {result}]"
    return None


def _flatten_cua_content(blocks: List[Dict]) -> str:
    """Join the text of all handled content blocks, one per line."""
    parts = [_cua_block_text(b) for b in blocks if isinstance(b, dict)]
    return "\n".join(part for part in parts if part is not None)


def _sse(event: str, payload: Dict) -> str:
    """Format one server-sent event frame with a JSON payload."""
    return f"event: {event}\ndata: {json.dumps(payload)}\n\n"


@app.post("/v1/cua")
async def computer_use_agent(
    request: CUARequest,
    authorization: Optional[str] = Header(None),
    x_api_key: Optional[str] = Header(None, alias="x-api-key"),
):
    """
    Computer Use Agent endpoint - sheikh-computer-use-preview
    This endpoint provides a computer control interface compatible with
    Anthropic's Computer Use API. It processes user requests and generates
    computer control actions (click, type, scroll, screenshot, etc.)
    The model answers in text; that text is then scanned for one action
    description, which is returned as a structured tool_use block.

    Returns a CUAResponse, or a text/event-stream response when
    ``request.stream`` is set. Raises HTTPException(401) when the API key is
    rejected.
    """
    auth_key = x_api_key or authorization
    if not verify_api_key(auth_key):
        raise HTTPException(status_code=401, detail="Invalid API key")

    # Screen configuration (falls back to 1920x1080 when not supplied).
    screen_width = request.screen.width if request.screen else 1920
    screen_height = request.screen.height if request.screen else 1080

    # Only a missing temperature falls back to the default; an explicit 0.0
    # is a valid setting and must not be replaced by 0.7.
    temperature = 0.7 if request.temperature is None else request.temperature

    # Build system prompt for computer use
    cua_system_prompt = f"""You are a Computer Use Agent (CUA) that helps users interact with computers.
You can control the computer by describing actions you want to take.
Available actions:
- click at (x, y) - Click at screen coordinates
- double click at (x, y) - Double click at coordinates
- right click at (x, y) - Right click at coordinates
- type "text" - Type the specified text
- press [key] key - Press a key (enter, tab, escape, backspace, etc.)
- scroll [up/down/left/right] [amount] - Scroll the screen
- move mouse to (x, y) - Move cursor to coordinates
- drag from (x1, y1) to (x2, y2) - Drag from one point to another
- screenshot - Request a screenshot of the current screen
Screen resolution: {screen_width}x{screen_height}
When analyzing a screenshot or user request, describe the actions needed step by step.
Always specify exact coordinates when performing click or move actions.
Be precise and methodical in your approach."""
    if request.system:
        cua_system_prompt = request.system + "\n\n" + cua_system_prompt

    # Reduce every message to plain text for the model.
    messages = []
    for m in request.messages:
        if isinstance(m.content, str):
            messages.append({"role": m.role, "content": m.content})
        elif isinstance(m.content, list):
            messages.append({"role": m.role, "content": _flatten_cua_content(m.content)})

    # Check for prefill
    messages, prefill = extract_prefill_from_messages(messages)
    prompt = format_messages_for_model(messages, system_prompt=cua_system_prompt, prefill=prefill)
    request_id = f"msg_{uuid.uuid4().hex[:24]}"

    if request.stream:
        async def stream_generator():
            yield _sse("message_start", {
                "type": "message_start",
                "message": {
                    "id": request_id,
                    "type": "message",
                    "role": "assistant",
                    "model": request.model,
                    "content": [],
                    "stop_reason": None,
                    "usage": {"input_tokens": 0, "output_tokens": 0}
                }
            })

            # Text block (index 0): forward each generated chunk as a delta.
            yield _sse("content_block_start", {
                "type": "content_block_start",
                "index": 0,
                "content_block": {"type": "text", "text": ""}
            })
            chunks = []
            async for token in generate_stream(
                prompt,
                max_tokens=request.max_tokens,
                temperature=temperature,
            ):
                chunks.append(token)
                yield _sse("content_block_delta", {
                    "type": "content_block_delta",
                    "index": 0,
                    "delta": {"type": "text_delta", "text": token}
                })
            yield _sse("content_block_stop", {"type": "content_block_stop", "index": 0})

            # Parse the same text the non-streaming path parses (prefill +
            # generated text) so an action that begins in the prefill is
            # detected in both modes.
            # NOTE(review): the prefill itself is still not streamed to the
            # client, whereas the non-streaming text block includes it.
            full_text = (prefill or "") + "".join(chunks)
            tool_action = parse_computer_action_from_text(full_text, screen_width, screen_height)
            if tool_action:
                # Tool block (index 1): empty input at start, then the whole
                # input delivered as a single JSON delta.
                yield _sse("content_block_start", {
                    "type": "content_block_start",
                    "index": 1,
                    "content_block": {
                        "type": "tool_use",
                        "id": tool_action["id"],
                        "name": tool_action["name"],
                        "input": {}
                    }
                })
                yield _sse("content_block_delta", {
                    "type": "content_block_delta",
                    "index": 1,
                    "delta": {"type": "input_json_delta", "partial_json": json.dumps(tool_action["input"])}
                })
                yield _sse("content_block_stop", {"type": "content_block_stop", "index": 1})

            yield _sse("message_delta", {
                "type": "message_delta",
                "delta": {"stop_reason": "tool_use" if tool_action else "end_turn"},
                # One count per streamed chunk.
                "usage": {"output_tokens": len(chunks)}
            })
            yield _sse("message_stop", {"type": "message_stop"})

        return StreamingResponse(
            stream_generator(),
            media_type="text/event-stream",
            headers={"Cache-Control": "no-cache", "Connection": "keep-alive"}
        )

    # Non-streaming response. The generator's own thinking text and stop
    # reason are not used: the stop reason is derived from whether an action
    # was detected.
    response_text, _thinking_text, input_tokens, output_tokens, _gen_stop_reason = generate_response(
        prompt,
        max_tokens=request.max_tokens,
        temperature=temperature,
    )
    full_response = prefill + response_text if prefill else response_text

    # Text block first, then the tool_use block if an action was detected.
    content_blocks = [{"type": "text", "text": full_response}]
    tool_action = parse_computer_action_from_text(full_response, screen_width, screen_height)
    if tool_action:
        content_blocks.append(tool_action)

    return CUAResponse(
        id=request_id,
        model=request.model,
        content=content_blocks,
        stop_reason="tool_use" if tool_action else "end_turn",
        usage={
            "input_tokens": input_tokens,
            "output_tokens": output_tokens
        }
    )
# Compatibility alias: same request model and auth headers as /v1/cua, and
# the request is handed straight to that handler.
@app.post("/v1/computer-use")
async def computer_use_alt(
    request: CUARequest,
    authorization: Optional[str] = Header(None),
    x_api_key: Optional[str] = Header(None, alias="x-api-key"),
):
    """Alternative endpoint path for computer use"""
    response = await computer_use_agent(
        request=request,
        authorization=authorization,
        x_api_key=x_api_key,
    )
    return response
 # ============================================================================
 # Health & Info Endpoints
 # ============================================================================
 async def root():
     return {
         "name": "Free Coding API",
+        "version": "1.2.0",
         "model": MODEL_ID,
         "features": {
             "prefill_response": "Supported - Include assistant message at end for output control",
             "thinking": "Supported - Enable with thinking: {type: 'enabled'}",
+            "streaming": "Supported - Both OpenAI and Anthropic formats",
+            "computer_use": "Supported - CUA with sheikh-computer-use-preview model"
         },
         "compatibility": {
             "openai": "v1 Chat Completions API",
+            "anthropic": "Messages API (2023-06-01)",
+            "computer_use": "Anthropic Computer Use API compatible"
         },
         "endpoints": {
             "openai_chat": "/v1/chat/completions",
             "anthropic_messages": "/v1/messages",
+            "computer_use": "/v1/cua",
+            "computer_use_alt": "/v1/computer-use",
             "models": "/v1/models"
         },
+        "cua_model": "sheikh-computer-use-preview",
         "docs": "/docs"
     }