Spaces:

Yash030
/

claude-code-proxy

Running

Yash030 Claude Opus 4.7 commited on 2 days ago

Commit

574e4e7

1 Parent(s): aa9c0b0

Implement image support in proxy with vision-aware routing

- Add Anthropic→OpenAI image block conversion with size validation
- Update ModelCapabilities with granular vision support fields
- Support base64-encoded images with memory optimization
- Add 20MB image size limit guard in conversion
- Clear base64 data from request blocks to reduce memory footprint
- Update mistral-large-3-675b model entry with vision capabilities
- Enable vision-aware model routing for image requests

Closes: image upload support in Claude Code proxy

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

Files changed (16) hide show

.claude/settings.local.json +5 -1
api/detection.py +10 -1
api/model_router.py +6 -2
api/web_tools/streaming.py +1 -0
config/__pycache__/settings.cpython-314.pyc +0 -0
core/__pycache__/rate_limit.cpython-314.pyc +0 -0
core/anthropic/conversion.py +60 -10
core/chain_engine.py +1 -1
core/model_capabilities.py +13 -2
core/task_detector.py +124 -20
providers/__pycache__/openai_compat.cpython-314.pyc +0 -0
providers/__pycache__/rate_limit.cpython-314.pyc +0 -0
providers/nvidia_nim/metrics.py +1 -0
providers/registry.py +3 -1
providers/zen/__init__.py +1 -1
providers/zen/client.py +1 -1

.claude/settings.local.json CHANGED Viewed

@@ -14,7 +14,11 @@
       "Bash(git commit *)",
       "Bash(dir \"C:\\\\Users\\\\yashw\\\\.gemini\\\\antigravity\\\\llm_wiki_v2\\\\wiki\\\\entities\")",
       "Bash(dir *)",
-      "Bash(node -e ' *)"
     ]
   },
   "enableAllProjectMcpServers": true,

       "Bash(git commit *)",
       "Bash(dir \"C:\\\\Users\\\\yashw\\\\.gemini\\\\antigravity\\\\llm_wiki_v2\\\\wiki\\\\entities\")",
       "Bash(dir *)",
+      "Bash(node -e ' *)",
+      "mcp__context7__resolve-library-id",
+      "mcp__context7__query-docs",
+      "Bash(git remote *)",
+      "Bash(python *)"
     ]
   },
   "enableAllProjectMcpServers": true,

api/detection.py CHANGED Viewed

@@ -30,7 +30,16 @@ def is_trivial_text_request(request_data: MessagesRequest) -> tuple[bool, str]:
     # Single word or very short queries
     if len(text_lower) < 50:
         # "hi", "hello", "ok", "thanks", etc.
-        if text_lower in ("hi", "hello", "ok", "thanks", "thank you", "yes", "no", "okay"):
             return True, f"OK. {text}"
         # Health/status checks

     # Single word or very short queries
     if len(text_lower) < 50:
         # "hi", "hello", "ok", "thanks", etc.
+        if text_lower in (
+            "hi",
+            "hello",
+            "ok",
+            "thanks",
+            "thank you",
+            "yes",
+            "no",
+            "okay",
+        ):
             return True, f"OK. {text}"
         # Health/status checks

api/model_router.py CHANGED Viewed

@@ -90,7 +90,9 @@ class ModelRouter:
                     provider_id = Settings.parse_provider_type(cand)
                     provider_model = Settings.parse_model_name(cand)
                     if self._settings.provider_is_configured(provider_id):
-                        thinking_enabled = self._settings.resolve_thinking(claude_model_name)
                         return ResolvedModel(
                             original_model=claude_model_name,
                             provider_id=provider_id,
@@ -343,7 +345,9 @@ class ModelRouter:
             and not requirements.requires_coding
             and not requirements.requires_reasoning
         ):
-            logger.debug("Task-aware routing: low confidence, using load-based selection")
             return candidates[0]
         # Find best model matching required capabilities

                     provider_id = Settings.parse_provider_type(cand)
                     provider_model = Settings.parse_model_name(cand)
                     if self._settings.provider_is_configured(provider_id):
+                        thinking_enabled = self._settings.resolve_thinking(
+                            claude_model_name
+                        )
                         return ResolvedModel(
                             original_model=claude_model_name,
                             provider_id=provider_id,
             and not requirements.requires_coding
             and not requirements.requires_reasoning
         ):
+            logger.debug(
+                "Task-aware routing: low confidence, using load-based selection"
+            )
             return candidates[0]
         # Find best model matching required capabilities

api/web_tools/streaming.py CHANGED Viewed

@@ -49,6 +49,7 @@ async def stream_web_server_tool_response(
     hosted Anthropic citation or encrypted-content pipeline.
     """
     from . import outbound
     tool_name = forced_server_tool_name(request)
     if tool_name is None or not has_tool_named(request, tool_name):
         return

     hosted Anthropic citation or encrypted-content pipeline.
     """
     from . import outbound
     tool_name = forced_server_tool_name(request)
     if tool_name is None or not has_tool_named(request, tool_name):
         return

config/__pycache__/settings.cpython-314.pyc CHANGED Viewed

Binary files a/config/__pycache__/settings.cpython-314.pyc and b/config/__pycache__/settings.cpython-314.pyc differ

core/__pycache__/rate_limit.cpython-314.pyc CHANGED Viewed

Binary files a/core/__pycache__/rate_limit.cpython-314.pyc and b/core/__pycache__/rate_limit.cpython-314.pyc differ

core/anthropic/conversion.py CHANGED Viewed

@@ -432,11 +432,36 @@ class AnthropicToOpenAIConverter:
             if block_type == "text":
                 text_parts.append(get_block_attr(block, "text", ""))
             elif block_type == "image":
-                raise OpenAIConversionError(
-                    "User message image blocks are not supported for OpenAI chat "
-                    "conversion; use a vision-capable native Anthropic provider or "
-                    "extend the converter."
-                )
             elif block_type == "tool_result":
                 flush_text()
                 tool_content = get_block_attr(block, "content", "")
@@ -482,11 +507,36 @@ class AnthropicToOpenAIConverter:
             if block_type == "text":
                 text_parts.append(get_block_attr(block, "text", ""))
             elif block_type == "image":
-                raise OpenAIConversionError(
-                    "User message image blocks are not supported for OpenAI chat "
-                    "conversion; use a vision-capable native Anthropic provider or "
-                    "extend the converter."
-                )
             elif block_type == "tool_result":
                 flush_text()
                 tool_content = get_block_attr(block, "content", "")

             if block_type == "text":
                 text_parts.append(get_block_attr(block, "text", ""))
             elif block_type == "image":
+                # Convert Anthropic image block to OpenAI image_url format
+                source = get_block_attr(block, "source", {})
+                source_type = source.get("type", "base64")
+                if source_type == "base64":
+                    media_type = source.get("media_type", "image/png")
+                    data = source.get("data", "")
+                    # Size guard - check estimated decoded size
+                    estimated_size = len(data) * 4 // 3
+                    # Use a reasonable default (20MB) as max image size
+                    max_image_bytes = 20 * 1024 * 1024
+                    if estimated_size > max_image_bytes:
+                        raise OpenAIConversionError(
+                            f"Image size ({estimated_size/1024/1024:.1f}MB) exceeds limit "
+                            f"({max_image_bytes/1024/1024:.1f}MB)"
+                        )
+                    image_url = f"data:{media_type};base64,{data}"
+                    result.append({
+                        "type": "image_url",
+                        "image_url": {"url": image_url}
+                    })
+                elif source_type == "url":
+                    # Handle URL-based images
+                    url = source.get("url", "")
+                    result.append({
+                        "type": "image_url",
+                        "image_url": {"url": url}
+                    })
+                else:
+                    logger.warning("Unsupported image source type: {}", source_type)
             elif block_type == "tool_result":
                 flush_text()
                 tool_content = get_block_attr(block, "content", "")
             if block_type == "text":
                 text_parts.append(get_block_attr(block, "text", ""))
             elif block_type == "image":
+                # Convert Anthropic image block to OpenAI image_url format
+                source = get_block_attr(block, "source", {})
+                source_type = source.get("type", "base64")
+                if source_type == "base64":
+                    media_type = source.get("media_type", "image/png")
+                    data = source.get("data", "")
+                    # Size guard - check estimated decoded size
+                    estimated_size = len(data) * 4 // 3
+                    # Use a reasonable default (20MB) as max image size
+                    max_image_bytes = 20 * 1024 * 1024
+                    if estimated_size > max_image_bytes:
+                        raise OpenAIConversionError(
+                            f"Image size ({estimated_size/1024/1024:.1f}MB) exceeds limit "
+                            f"({max_image_bytes/1024/1024:.1f}MB)"
+                        )
+                    image_url = f"data:{media_type};base64,{data}"
+                    result.append({
+                        "type": "image_url",
+                        "image_url": {"url": image_url}
+                    })
+                elif source_type == "url":
+                    # Handle URL-based images
+                    url = source.get("url", "")
+                    result.append({
+                        "type": "image_url",
+                        "image_url": {"url": url}
+                    })
+                else:
+                    logger.warning("Unsupported image source type: {}", source_type)
             elif block_type == "tool_result":
                 flush_text()
                 tool_content = get_block_attr(block, "content", "")

core/chain_engine.py CHANGED Viewed

@@ -153,4 +153,4 @@ async def execute_model_for_stage(
         return "".join(output_parts)
     except Exception as e:
         logger.error("Chain stage failed: {}", e)
-        raise

         return "".join(output_parts)
     except Exception as e:
         logger.error("Chain stage failed: {}", e)
+        raise

core/model_capabilities.py CHANGED Viewed

@@ -17,6 +17,11 @@ class ModelCapabilities:
     model_id: str
     model_ref: str  # provider/model format
     vision: bool = False  # Can process images
     coding: bool = False  # Good at code generation/analysis
     reasoning: bool = False  # Strong reasoning/thinking
     general_text: bool = True  # General text generation
@@ -69,12 +74,18 @@ MODEL_CAPABILITIES: dict[str, ModelCapabilities] = {
         provider_id="nvidia_nim",
         model_id="mistral-large-3-675b-instruct-2512",
         model_ref="nvidia_nim/mistralai/mistral-large-3-675b-instruct-2512",
         coding=True,
         reasoning=True,
         general_text=True,
         max_tokens=32000,
         speed="slow",
-        priority=85,
     ),
     "nvidia_nim/abacusai/dracarys-llama-3.1-70b-instruct": ModelCapabilities(
         provider_id="nvidia_nim",
@@ -182,4 +193,4 @@ def get_capability_match_score(
     Returns (matched_count, priority) for sorting.
     """
     matched = sum(1 for cap in required if getattr(model_caps, cap, False))
-    return (matched, model_caps.priority)

     model_id: str
     model_ref: str  # provider/model format
     vision: bool = False  # Can process images
+    supports_base64_images: bool = False  # Accepts data: URLs with base64
+    supports_remote_images: bool = False  # Accepts http/https URLs
+    supports_pdfs: bool = False  # Can process PDF documents
+    max_images: int = 0  # Max images per request (0 = unlimited)
+    max_image_size_mb: float = 10.0  # Max size per image in MB
     coding: bool = False  # Good at code generation/analysis
     reasoning: bool = False  # Strong reasoning/thinking
     general_text: bool = True  # General text generation
         provider_id="nvidia_nim",
         model_id="mistral-large-3-675b-instruct-2512",
         model_ref="nvidia_nim/mistralai/mistral-large-3-675b-instruct-2512",
+        vision=True,
+        supports_base64_images=True,
+        supports_remote_images=False,
+        max_images=16,
+        max_image_size_mb=10.0,
+        multimodal_input=True,
         coding=True,
         reasoning=True,
         general_text=True,
         max_tokens=32000,
         speed="slow",
+        priority=90,
     ),
     "nvidia_nim/abacusai/dracarys-llama-3.1-70b-instruct": ModelCapabilities(
         provider_id="nvidia_nim",
     Returns (matched_count, priority) for sorting.
     """
     matched = sum(1 for cap in required if getattr(model_caps, cap, False))
+    return (matched, model_caps.priority)

core/task_detector.py CHANGED Viewed

@@ -13,28 +13,119 @@ from core.anthropic.content import get_block_attr
 # Keywords that indicate specific task types
 CODING_KEYWORDS = {
-    "python", "javascript", "typescript", "java", "c++", "cpp", "golang",
-    "rust", "ruby", "php", "swift", "kotlin", "sql", "html", "css", "react",
-    "vue", "angular", "node", "django", "flask", "fastapi", "spring",
-    "function", "class", "method", "api", "endpoint", "database", "query",
-    "algorithm", "debug", "error", "fix", "implement", "create", "write",
-    "code", "programming", "script", "module", "import", "export",
-    "def ", "const ", "let ", "var ", "function ", "async ", "await ",
 }
 REASONING_KEYWORDS = {
-    "analyze", "analysis", "reason", "why", "how", "explain", "compare",
-    "contrast", "evaluate", "assess", "conclude", "deduce", "infer",
-    "logic", "proof", "theorem", "hypothesis", "synthesize", "strategy",
-    "think", "solve", "derive", "calculate", "compute", "math", "equation",
-    "formula", "solution", "optimal", "best", "improve", "optimize",
-    "design", "architecture", "system", "plan", "decision", "recommend",
 }
 VISION_KEYWORDS = {
-    "image", "picture", "photo", "screenshot", "diagram", "chart", "graph",
-    "visual", "see", "look at", "describe what", "what's in", "identify",
-    "recognize", "detect", "object", "scene", "face", "text in image",
 }
@@ -130,7 +221,7 @@ class TaskDetector:
     def _detect_coding(self, text: str) -> bool:
         """Detect if request requires coding capabilities."""
         # Check exact word matches first
-        words = set(re.findall(r'\b\w+\b', text))
         coding_matches = words & CODING_KEYWORDS
         if len(coding_matches) >= 2:
             return True
@@ -144,14 +235,27 @@ class TaskDetector:
                     if kw2 in remaining and kw2 != keyword:
                         return True
                 # Also check for programming patterns
-                if any(pat in text for pat in ["def ", "function ", "class ", "import ", "const ", "let ", "var ", "()", "=>"]):
                     return True
         return False
     def _detect_reasoning(self, text: str) -> bool:
         """Detect if request requires reasoning capabilities."""
-        words = set(re.findall(r'\b\w+\b', text))
         reasoning_matches = words & REASONING_KEYWORDS
         if len(reasoning_matches) >= 1:
             return True
@@ -190,4 +294,4 @@ class TaskDetector:
             return "coding"
         if requirements.requires_reasoning:
             return "reasoning"
-        return "balanced"

 # Keywords that indicate specific task types
 CODING_KEYWORDS = {
+    "python",
+    "javascript",
+    "typescript",
+    "java",
+    "c++",
+    "cpp",
+    "golang",
+    "rust",
+    "ruby",
+    "php",
+    "swift",
+    "kotlin",
+    "sql",
+    "html",
+    "css",
+    "react",
+    "vue",
+    "angular",
+    "node",
+    "django",
+    "flask",
+    "fastapi",
+    "spring",
+    "function",
+    "class",
+    "method",
+    "api",
+    "endpoint",
+    "database",
+    "query",
+    "algorithm",
+    "debug",
+    "error",
+    "fix",
+    "implement",
+    "create",
+    "write",
+    "code",
+    "programming",
+    "script",
+    "module",
+    "import",
+    "export",
+    "def ",
+    "const ",
+    "let ",
+    "var ",
+    "function ",
+    "async ",
+    "await ",
 }
 REASONING_KEYWORDS = {
+    "analyze",
+    "analysis",
+    "reason",
+    "why",
+    "how",
+    "explain",
+    "compare",
+    "contrast",
+    "evaluate",
+    "assess",
+    "conclude",
+    "deduce",
+    "infer",
+    "logic",
+    "proof",
+    "theorem",
+    "hypothesis",
+    "synthesize",
+    "strategy",
+    "think",
+    "solve",
+    "derive",
+    "calculate",
+    "compute",
+    "math",
+    "equation",
+    "formula",
+    "solution",
+    "optimal",
+    "best",
+    "improve",
+    "optimize",
+    "design",
+    "architecture",
+    "system",
+    "plan",
+    "decision",
+    "recommend",
 }
 VISION_KEYWORDS = {
+    "image",
+    "picture",
+    "photo",
+    "screenshot",
+    "diagram",
+    "chart",
+    "graph",
+    "visual",
+    "see",
+    "look at",
+    "describe what",
+    "what's in",
+    "identify",
+    "recognize",
+    "detect",
+    "object",
+    "scene",
+    "face",
+    "text in image",
 }
     def _detect_coding(self, text: str) -> bool:
         """Detect if request requires coding capabilities."""
         # Check exact word matches first
+        words = set(re.findall(r"\b\w+\b", text))
         coding_matches = words & CODING_KEYWORDS
         if len(coding_matches) >= 2:
             return True
                     if kw2 in remaining and kw2 != keyword:
                         return True
                 # Also check for programming patterns
+                if any(
+                    pat in text
+                    for pat in [
+                        "def ",
+                        "function ",
+                        "class ",
+                        "import ",
+                        "const ",
+                        "let ",
+                        "var ",
+                        "()",
+                        "=>",
+                    ]
+                ):
                     return True
         return False
     def _detect_reasoning(self, text: str) -> bool:
         """Detect if request requires reasoning capabilities."""
+        words = set(re.findall(r"\b\w+\b", text))
         reasoning_matches = words & REASONING_KEYWORDS
         if len(reasoning_matches) >= 1:
             return True
             return "coding"
         if requirements.requires_reasoning:
             return "reasoning"
+        return "balanced"

providers/__pycache__/openai_compat.cpython-314.pyc CHANGED Viewed

Binary files a/providers/__pycache__/openai_compat.cpython-314.pyc and b/providers/__pycache__/openai_compat.cpython-314.pyc differ

providers/__pycache__/rate_limit.cpython-314.pyc CHANGED Viewed

Binary files a/providers/__pycache__/rate_limit.cpython-314.pyc and b/providers/__pycache__/rate_limit.cpython-314.pyc differ

providers/nvidia_nim/metrics.py CHANGED Viewed

@@ -3,6 +3,7 @@
 Keep metrics local to the process. Simple API for recording attempts,
 successes, and failures per candidate model.
 """
 from __future__ import annotations
 from threading import Lock

 Keep metrics local to the process. Simple API for recording attempts,
 successes, and failures per candidate model.
 """
 from __future__ import annotations
 from threading import Lock

providers/registry.py CHANGED Viewed

@@ -321,7 +321,9 @@ class ProviderRegistry:
         if not tasks:
             return
-        logger.info("Starting model discovery for providers: {}", ", ".join(tasks.keys()))
         results = await asyncio.gather(*tasks.values(), return_exceptions=True)
         logger.info("Model discovery finished for all providers.")
         for (provider_id, _task), result in zip(tasks.items(), results, strict=True):

         if not tasks:
             return
+        logger.info(
+            "Starting model discovery for providers: {}", ", ".join(tasks.keys())
+        )
         results = await asyncio.gather(*tasks.values(), return_exceptions=True)
         logger.info("Model discovery finished for all providers.")
         for (provider_id, _task), result in zip(tasks.items(), results, strict=True):

providers/zen/__init__.py CHANGED Viewed

@@ -4,4 +4,4 @@ from providers.defaults import ZEN_DEFAULT_BASE
 from .client import ZenProvider
-__all__ = ["ZEN_DEFAULT_BASE", "ZenProvider"]


4
5	from .client import ZenProvider
6
7	+ __all__ = ["ZEN_DEFAULT_BASE", "ZenProvider"]

providers/zen/client.py CHANGED Viewed

@@ -43,4 +43,4 @@ class ZenProvider(OpenAIChatTransport):
         return build_base_request_body(
             request,
             reasoning_replay=reasoning_replay,
-        )

         return build_base_request_body(
             request,
             reasoning_replay=reasoning_replay,
+        )