Spaces:
Running on Zero
Running on Zero
Create inference_router.py - Model routing logic
Browse files- inference_router.py +129 -0
inference_router.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""TEQUMSA Inference Router
|
| 2 |
+
|
| 3 |
+
Routes inference requests to optimal model providers based on
|
| 4 |
+
prompt analysis, load balancing, and execution mode.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import json
|
| 9 |
+
import time
|
| 10 |
+
from typing import Dict, Any, Optional
|
| 11 |
+
from dataclasses import dataclass
|
| 12 |
+
from enum import Enum
|
| 13 |
+
|
| 14 |
+
class ModelProvider(Enum):
    """Model providers a request can be routed to.

    Values are the lowercase provider keys used throughout the router
    (cost/latency tables, route decisions).
    """

    CLAUDE = "claude"
    GPT = "gpt"
    GEMINI = "gemini"
    PERPLEXITY = "perplexity"
    # Sentinel: defer the provider choice to the router's heuristics.
    AUTO = "auto"
| 20 |
+
|
| 21 |
+
class ExecutionMode(Enum):
    """Execution modes a routed request may run under.

    NOTE(review): in the code visible here only the "standard" value is
    ever produced (RouteDecision.mode is hard-coded to "standard");
    the other modes are presumably consumed elsewhere — confirm.
    """

    STANDARD = "standard"
    RECURSIVE = "recursive"
    CAUSAL = "causal"
    RDOD = "rdod"
| 26 |
+
|
| 27 |
+
@dataclass
class RouteDecision:
    """Outcome of provider selection for a single prompt."""

    provider: str          # chosen provider key, e.g. "claude"
    confidence: float      # heuristic confidence score for the choice
    reasoning: str         # human-readable explanation of the routing
    mode: str              # execution mode label ("standard" in this code)
    estimated_tokens: int  # rough token estimate for the request
| 34 |
+
|
| 35 |
+
class InferenceRouter:
    """Routes inference requests to optimal model providers.

    Selection is keyword-driven: code-ish prompts go to Claude, creative
    prompts to GPT, complexity-flagged prompts to Claude, everything else
    to Gemini; an explicit (non-"auto") target always wins.
    """

    def __init__(self):
        # Per-provider cost table. NOTE(review): units are presumably
        # USD per 1K tokens — confirm against the billing source.
        self.provider_costs = {
            "claude": {"input": 0.003, "output": 0.015},
            "gpt": {"input": 0.002, "output": 0.008},
            "gemini": {"input": 0.0005, "output": 0.0015},
            "perplexity": {"input": 0.002, "output": 0.008},
        }
        # Rough per-provider latency estimates, in seconds.
        self.provider_latency = {
            "claude": 2.5, "gpt": 2.0, "gemini": 1.8, "perplexity": 3.0,
        }

    def _estimate_tokens(self, prompt: str) -> int:
        """Estimate the token count of *prompt* (~1.3 tokens per word).

        BUG FIX: the original returned ``len(...) * 1.3`` — a float —
        despite the ``int`` annotation; now truncated to int.
        """
        return int(len(prompt.split()) * 1.3)

    def _analyze_prompt(self, prompt: str) -> Dict[str, Any]:
        """Analyze prompt characteristics used by provider selection.

        Returns a dict with raw size metrics plus boolean keyword flags
        (complexity / creative / code) consumed by ``_select_provider``.
        """
        length = len(prompt)
        words = prompt.split()
        complexity_keywords = ["analyze", "reason", "complex", "detailed", "technical"]
        creative_keywords = ["create", "write", "story", "poem", "art"]
        code_keywords = ["code", "function", "program", "debug", "algorithm"]

        # Hoist the lowercase conversion: the original recomputed
        # prompt.lower() once per keyword category.
        lowered = prompt.lower()
        has_complexity = any(kw in lowered for kw in complexity_keywords)
        has_creative = any(kw in lowered for kw in creative_keywords)
        has_code = any(kw in lowered for kw in code_keywords)

        return {
            "length": length,
            "word_count": len(words),
            "has_complexity": has_complexity,
            "has_creative": has_creative,
            "has_code": has_code,
            "is_short": length < 100,
            "is_long": length > 1000,
        }

    def _select_provider(self, analysis: Dict, target: str) -> RouteDecision:
        """Select the optimal provider for an analyzed prompt.

        An explicit *target* other than "auto" is honored verbatim;
        otherwise the keyword flags decide, with code > creative >
        complexity precedence.
        """
        if target == "auto":
            if analysis["has_code"]:
                provider = "claude"
                confidence = 0.9
                reasoning = "Code-related prompts routed to Claude"
            elif analysis["has_creative"]:
                provider = "gpt"
                confidence = 0.85
                reasoning = "Creative prompts routed to GPT"
            elif analysis["has_complexity"]:
                provider = "claude"
                confidence = 0.88
                reasoning = "Complex reasoning routed to Claude"
            else:
                provider = "gemini"
                confidence = 0.8
                reasoning = "Standard prompts routed to Gemini"
        else:
            provider = target
            confidence = 0.95
            reasoning = f"User-specified provider: {target}"

        return RouteDecision(
            provider=provider,
            confidence=confidence,
            reasoning=reasoning,
            mode="standard",
            # BUG FIX: the original called
            # self._estimate_tokens(analysis["length"] * 0.5), passing a
            # float where _estimate_tokens expects a str — float.split()
            # raises AttributeError on every call. Estimate from the
            # already-computed word count using the same ~1.3
            # tokens-per-word heuristic instead.
            estimated_tokens=int(analysis["word_count"] * 1.3),
        )

    def route(self, prompt: str, target_model: str = "auto") -> Dict[str, Any]:
        """Route *prompt* to the optimal provider.

        Returns a JSON-serializable dict with the prompt analysis and the
        chosen route (provider, confidence, reasoning, token/latency/cost
        estimates). Unknown providers fall back to a 2.0 s latency and
        zero-cost estimates via ``dict.get``.
        """
        analysis = self._analyze_prompt(prompt)
        decision = self._select_provider(analysis, target_model)

        return {
            "status": "routed",
            "timestamp": time.time(),
            "analysis": analysis,
            "route": {
                "provider": decision.provider,
                "confidence": decision.confidence,
                "reasoning": decision.reasoning,
                "estimated_tokens": decision.estimated_tokens,
                "latency_estimate": self.provider_latency.get(decision.provider, 2.0),
                "cost_estimate": {
                    "input": self.provider_costs.get(decision.provider, {}).get("input", 0),
                    "output": self.provider_costs.get(decision.provider, {}).get("output", 0),
                },
            },
        }