ayushm98 committed
Commit e5ac5b4 (0 parents)

chore: initialize Cascade project

- Set up Python project with Poetry
- Add LLM provider abstraction (OpenAI, Ollama)
- Add OpenAI-compatible API schemas
- Configure environment template
- Add .gitignore for Python project

.env.example ADDED
@@ -0,0 +1,25 @@
+ # OpenAI Configuration
+ OPENAI_API_KEY=sk-your-api-key-here
+
+ # Ollama Configuration
+ OLLAMA_BASE_URL=http://localhost:11434
+ OLLAMA_MODEL=llama3.2
+
+ # Redis Configuration
+ REDIS_URL=redis://localhost:6379
+
+ # Qdrant Configuration
+ QDRANT_URL=http://localhost:6333
+ QDRANT_COLLECTION=cascade_cache
+
+ # Cache Configuration
+ CACHE_TTL=3600
+ SIMILARITY_THRESHOLD=0.92
+
+ # Server Configuration
+ HOST=0.0.0.0
+ PORT=8000
+ DEBUG=false
+
+ # Logging
+ LOG_LEVEL=INFO
.gitignore ADDED
@@ -0,0 +1,80 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Virtual environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+
+ # IDE
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+ .DS_Store
+
+ # Testing
+ .coverage
+ .pytest_cache/
+ htmlcov/
+ .tox/
+ .nox/
+
+ # Mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Ruff
+ .ruff_cache/
+
+ # Models (large files)
+ models/*.pt
+ models/*.bin
+ models/*.safetensors
+ models/*.onnx
+ ml/artifacts/*.onnx
+ ml/artifacts/*.pt
+
+ # Data
+ ml/data/*.jsonl
+ ml/data/*.csv
+ ml/data/*.parquet
+
+ # Logs
+ *.log
+ logs/
+
+ # Local config
+ .env.local
+
+ # AI Assistant files (never commit these)
+ CLAUDE.md
+ .claude/
+ *claude*
+ *CLAUDE*
+ IMPLEMENTATION_PLAN.md
+
+ # Baseline results
+ baseline_results.json
pyproject.toml ADDED
@@ -0,0 +1,56 @@
+ [tool.poetry]
+ name = "cascade"
+ version = "0.1.0"
+ description = "Intelligent LLM Request Router - Reduce API costs by 60%+ through smart routing and semantic caching"
+ authors = ["Ayush <ayush@example.com>"]
+ readme = "README.md"
+ packages = [{include = "cascade", from = "src"}]
+
+ [tool.poetry.dependencies]
+ python = "^3.11"
+ fastapi = "^0.109.0"
+ uvicorn = {extras = ["standard"], version = "^0.27.0"}
+ httpx = "^0.26.0"
+ pydantic = "^2.5.0"
+ pydantic-settings = "^2.1.0"
+ redis = "^5.0.0"
+ qdrant-client = "^1.7.0"
+ sentence-transformers = "^2.2.0"
+ openai = "^1.10.0"
+ prometheus-client = "^0.19.0"
+ python-dotenv = "^1.0.0"
+
+ [tool.poetry.group.dev.dependencies]
+ pytest = "^7.4.0"
+ pytest-asyncio = "^0.23.0"
+ pytest-cov = "^4.1.0"
+ ruff = "^0.1.0"
+ black = "^24.1.0"
+ mypy = "^1.8.0"
+ pre-commit = "^3.6.0"
+
+ [build-system]
+ requires = ["poetry-core"]
+ build-backend = "poetry.core.masonry.api"
+
+ [tool.ruff]
+ line-length = 100
+ target-version = "py311"
+
+ [tool.ruff.lint]
+ select = ["E", "F", "I", "N", "W", "UP"]
+ ignore = ["E501"]
+
+ [tool.black]
+ line-length = 100
+ target-version = ["py311"]
+
+ [tool.mypy]
+ python_version = "3.11"
+ warn_return_any = true
+ warn_unused_configs = true
+ disallow_untyped_defs = true
+
+ [tool.pytest.ini_options]
+ asyncio_mode = "auto"
+ testpaths = ["tests"]
src/cascade/__init__.py ADDED
@@ -0,0 +1,3 @@
+ """Cascade - Intelligent LLM Request Router."""
+
+ __version__ = "0.1.0"
src/cascade/api/__init__.py ADDED
@@ -0,0 +1 @@
+ """API module for Cascade."""
src/cascade/api/schemas.py ADDED
@@ -0,0 +1,79 @@
+ """OpenAI-compatible request/response schemas."""
+
+ from typing import Literal
+
+ from pydantic import BaseModel, Field
+
+
+ class ChatMessage(BaseModel):
+     """A single message in a chat conversation."""
+
+     role: Literal["system", "user", "assistant"] = Field(
+         ..., description="The role of the message author"
+     )
+     content: str = Field(..., description="The content of the message")
+
+
+ class ChatCompletionRequest(BaseModel):
+     """OpenAI-compatible chat completion request."""
+
+     model: str = Field(default="gpt-4o", description="Model to use for completion")
+     messages: list[ChatMessage] = Field(..., description="List of messages in the conversation")
+     temperature: float = Field(default=0.7, ge=0, le=2, description="Sampling temperature")
+     max_tokens: int | None = Field(default=None, description="Maximum tokens to generate")
+     stream: bool = Field(default=False, description="Whether to stream the response")
+
+     # Cascade-specific options
+     bypass_cache: bool = Field(default=False, description="Skip semantic cache lookup")
+     force_model: str | None = Field(default=None, description="Force specific model (bypass routing)")
+
+
+ class ChatCompletionChoice(BaseModel):
+     """A single completion choice."""
+
+     index: int = Field(..., description="Index of the choice")
+     message: ChatMessage = Field(..., description="The generated message")
+     finish_reason: Literal["stop", "length", "content_filter"] | None = Field(
+         default="stop", description="Reason for completion"
+     )
+
+
+ class UsageInfo(BaseModel):
+     """Token usage information."""
+
+     prompt_tokens: int = Field(..., description="Number of tokens in the prompt")
+     completion_tokens: int = Field(..., description="Number of tokens in the completion")
+     total_tokens: int = Field(..., description="Total tokens used")
+
+
+ class ChatCompletionResponse(BaseModel):
+     """OpenAI-compatible chat completion response."""
+
+     id: str = Field(..., description="Unique identifier for the completion")
+     object: Literal["chat.completion"] = Field(default="chat.completion")
+     created: int = Field(..., description="Unix timestamp of creation")
+     model: str = Field(..., description="Model used for completion")
+     choices: list[ChatCompletionChoice] = Field(..., description="List of completion choices")
+     usage: UsageInfo = Field(..., description="Token usage information")
+
+     # Cascade-specific metadata
+     cascade_metadata: dict | None = Field(
+         default=None,
+         description="Cascade routing metadata (cache_hit, routed_model, cost, etc.)",
+     )
+
+
+ class HealthResponse(BaseModel):
+     """Health check response."""
+
+     status: Literal["healthy", "degraded", "unhealthy"] = Field(..., description="Service status")
+     version: str = Field(..., description="API version")
+     components: dict[str, bool] = Field(..., description="Component health status")
+
+
+ class ErrorResponse(BaseModel):
+     """Error response."""
+
+     error: str = Field(..., description="Error message")
+     code: str = Field(..., description="Error code")
+     details: dict | None = Field(default=None, description="Additional error details")
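
The schemas above are plain Pydantic v2 models, so they can be exercised outside the server. A minimal usage sketch (not part of this commit; the id, token counts, and metadata keys are illustrative):

import time

from cascade.api.schemas import (
    ChatCompletionChoice,
    ChatCompletionRequest,
    ChatCompletionResponse,
    ChatMessage,
    UsageInfo,
)

# Validate an incoming OpenAI-style request, including the Cascade-only flags.
request = ChatCompletionRequest(
    model="gpt-4o-mini",
    messages=[ChatMessage(role="user", content="Summarize HTTP/1.1 in one sentence.")],
    temperature=0.2,
    bypass_cache=False,
)

# Assemble a response the way a route handler might, attaching routing metadata.
response = ChatCompletionResponse(
    id="chatcmpl-local-001",
    created=int(time.time()),
    model=request.model,
    choices=[
        ChatCompletionChoice(
            index=0,
            message=ChatMessage(role="assistant", content="A text protocol for the web."),
        )
    ],
    usage=UsageInfo(prompt_tokens=14, completion_tokens=7, total_tokens=21),
    cascade_metadata={"cache_hit": False, "routed_model": "gpt-4o-mini"},
)
print(response.model_dump_json(indent=2))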
src/cascade/config.py ADDED
@@ -0,0 +1,43 @@
+ """Configuration settings for Cascade."""
+
+ from functools import lru_cache
+
+ from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+ class Settings(BaseSettings):
+     """Application settings loaded from environment variables."""
+
+     model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")
+
+     # OpenAI
+     openai_api_key: str = ""
+
+     # Ollama
+     ollama_base_url: str = "http://localhost:11434"
+     ollama_model: str = "llama3.2"
+
+     # Redis
+     redis_url: str = "redis://localhost:6379"
+
+     # Qdrant
+     qdrant_url: str = "http://localhost:6333"
+     qdrant_collection: str = "cascade_cache"
+
+     # Cache
+     cache_ttl: int = 3600  # 1 hour
+     similarity_threshold: float = 0.92
+
+     # Server
+     host: str = "0.0.0.0"
+     port: int = 8000
+     debug: bool = False
+
+     # Logging
+     log_level: str = "INFO"
+
+
+ @lru_cache
+ def get_settings() -> Settings:
+     """Get cached settings instance."""
+     return Settings()
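
A short sketch of how the settings behave (illustrative, not part of the commit): field names map case-insensitively to the variables in .env.example, and @lru_cache means every caller of get_settings() shares one Settings instance.

import os

from cascade.config import get_settings

# Normally these come from .env; set one in the environment for the example.
os.environ["SIMILARITY_THRESHOLD"] = "0.95"

settings = get_settings()
print(settings.similarity_threshold)   # 0.95 (from the environment)
print(settings.qdrant_collection)      # "cascade_cache" (class default)

# The cache means later environment changes are ignored unless it is cleared.
assert get_settings() is settings
get_settings.cache_clear()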
src/cascade/providers/__init__.py ADDED
@@ -0,0 +1,7 @@
+ """LLM Provider implementations."""
+
+ from cascade.providers.base import LLMProvider, LLMResponse
+ from cascade.providers.ollama_provider import OllamaProvider
+ from cascade.providers.openai_provider import OpenAIProvider
+
+ __all__ = ["LLMProvider", "LLMResponse", "OpenAIProvider", "OllamaProvider"]
src/cascade/providers/base.py ADDED
@@ -0,0 +1,63 @@
+ """Abstract base class for LLM providers."""
+
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class LLMResponse:
+     """Standardized response from any LLM provider."""
+
+     content: str
+     model: str
+     prompt_tokens: int
+     completion_tokens: int
+     finish_reason: str = "stop"
+
+     @property
+     def total_tokens(self) -> int:
+         """Total tokens used."""
+         return self.prompt_tokens + self.completion_tokens
+
+
+ class LLMProvider(ABC):
+     """Abstract base class for LLM providers."""
+
+     @property
+     @abstractmethod
+     def name(self) -> str:
+         """Provider name identifier."""
+         ...
+
+     @property
+     @abstractmethod
+     def available_models(self) -> list[str]:
+         """List of available models for this provider."""
+         ...
+
+     @abstractmethod
+     async def complete(
+         self,
+         messages: list[dict],
+         model: str,
+         temperature: float = 0.7,
+         max_tokens: int | None = None,
+     ) -> LLMResponse:
+         """
+         Generate a chat completion.
+
+         Args:
+             messages: List of message dicts with 'role' and 'content'
+             model: Model identifier
+             temperature: Sampling temperature
+             max_tokens: Maximum tokens to generate
+
+         Returns:
+             LLMResponse with generated content and metadata
+         """
+         ...
+
+     @abstractmethod
+     async def is_available(self) -> bool:
+         """Check if the provider is available and healthy."""
+         ...
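
The ABC pins down the contract every provider must satisfy: two properties plus two async methods returning the shared LLMResponse dataclass. A toy, hypothetical provider for illustration only (EchoProvider is not part of the commit):

from cascade.providers.base import LLMProvider, LLMResponse


class EchoProvider(LLMProvider):
    """Toy provider that echoes the last user message back."""

    @property
    def name(self) -> str:
        return "echo"

    @property
    def available_models(self) -> list[str]:
        return ["echo-1"]

    async def complete(
        self,
        messages: list[dict],
        model: str,
        temperature: float = 0.7,
        max_tokens: int | None = None,
    ) -> LLMResponse:
        content = messages[-1]["content"] if messages else ""
        # A real provider would call an API here; the token counts are rough stand-ins.
        tokens = len(content.split())
        return LLMResponse(
            content=content,
            model=model,
            prompt_tokens=tokens,
            completion_tokens=tokens,
        )

    async def is_available(self) -> bool:
        return True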
src/cascade/providers/ollama_provider.py ADDED
@@ -0,0 +1,120 @@
+ """Ollama provider implementation for local LLM inference."""
+
+ import logging
+
+ import httpx
+
+ from cascade.config import get_settings
+ from cascade.providers.base import LLMProvider, LLMResponse
+
+ logger = logging.getLogger(__name__)
+
+
+ class OllamaProvider(LLMProvider):
+     """Ollama local LLM provider."""
+
+     def __init__(self) -> None:
+         """Initialize Ollama provider."""
+         settings = get_settings()
+         self._base_url = settings.ollama_base_url.rstrip("/")
+         self._default_model = settings.ollama_model
+         self._client = httpx.AsyncClient(timeout=120.0)
+
+     @property
+     def name(self) -> str:
+         """Provider name."""
+         return "ollama"
+
+     @property
+     def available_models(self) -> list[str]:
+         """Available Ollama models (configured default)."""
+         return [self._default_model]
+
+     async def complete(
+         self,
+         messages: list[dict],
+         model: str | None = None,
+         temperature: float = 0.7,
+         max_tokens: int | None = None,
+     ) -> LLMResponse:
+         """
+         Generate chat completion using Ollama API.
+
+         Args:
+             messages: List of message dicts
+             model: Ollama model name (defaults to configured model)
+             temperature: Sampling temperature
+             max_tokens: Maximum tokens to generate (num_predict in Ollama)
+
+         Returns:
+             LLMResponse with generated content
+         """
+         model = model or self._default_model
+         logger.debug(f"Ollama request: model={model}, messages={len(messages)}")
+
+         payload: dict = {
+             "model": model,
+             "messages": messages,
+             "stream": False,
+             "options": {
+                 "temperature": temperature,
+             },
+         }
+         if max_tokens:
+             payload["options"]["num_predict"] = max_tokens
+
+         response = await self._client.post(
+             f"{self._base_url}/api/chat",
+             json=payload,
+         )
+         response.raise_for_status()
+         data = response.json()
+
+         # Ollama response format
+         message = data.get("message", {})
+         content = message.get("content", "")
+
+         # Ollama provides token counts in different fields
+         prompt_tokens = data.get("prompt_eval_count", 0)
+         completion_tokens = data.get("eval_count", 0)
+
+         return LLMResponse(
+             content=content,
+             model=model,
+             prompt_tokens=prompt_tokens,
+             completion_tokens=completion_tokens,
+             finish_reason="stop",
+         )
+
+     async def is_available(self) -> bool:
+         """Check if Ollama is running and model is available."""
+         try:
+             response = await self._client.get(f"{self._base_url}/api/tags")
+             if response.status_code != 200:
+                 return False
+
+             data = response.json()
+             models = [m.get("name", "").split(":")[0] for m in data.get("models", [])]
+
+             # Check if our configured model is available
+             model_base = self._default_model.split(":")[0]
+             available = model_base in models
+
+             if not available:
+                 logger.warning(
+                     f"Ollama model '{self._default_model}' not found. "
+                     f"Available: {models}. Run: ollama pull {self._default_model}"
+                 )
+
+             return available
+
+         except httpx.ConnectError:
+             logger.warning("Ollama not running. Start with: ollama serve")
+             return False
+         except Exception as e:
+             logger.warning(f"Ollama availability check failed: {e}")
+             return False
+
+     async def close(self) -> None:
+         """Close the HTTP client."""
+         await self._client.aclose()
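
Driving the provider directly looks roughly like this (assumes a local Ollama with the configured model pulled; illustrative, not part of the commit):

import asyncio

from cascade.providers.ollama_provider import OllamaProvider


async def main() -> None:
    provider = OllamaProvider()
    try:
        if not await provider.is_available():
            print("Ollama is not reachable or the model is missing; see the logged hint.")
            return
        result = await provider.complete(
            messages=[{"role": "user", "content": "What is 2 + 2?"}],
            temperature=0.0,
        )
        print(result.content, result.total_tokens)
    finally:
        await provider.close()


asyncio.run(main())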
src/cascade/providers/openai_provider.py ADDED
@@ -0,0 +1,88 @@
+ """OpenAI provider implementation."""
+
+ import logging
+
+ from openai import AsyncOpenAI
+
+ from cascade.config import get_settings
+ from cascade.providers.base import LLMProvider, LLMResponse
+
+ logger = logging.getLogger(__name__)
+
+
+ class OpenAIProvider(LLMProvider):
+     """OpenAI API provider."""
+
+     MODELS = [
+         "gpt-4o",
+         "gpt-4o-mini",
+         "gpt-4-turbo",
+         "gpt-3.5-turbo",
+     ]
+
+     def __init__(self) -> None:
+         """Initialize OpenAI provider."""
+         settings = get_settings()
+         self._client = AsyncOpenAI(api_key=settings.openai_api_key)
+
+     @property
+     def name(self) -> str:
+         """Provider name."""
+         return "openai"
+
+     @property
+     def available_models(self) -> list[str]:
+         """Available OpenAI models."""
+         return self.MODELS
+
+     async def complete(
+         self,
+         messages: list[dict],
+         model: str = "gpt-4o",
+         temperature: float = 0.7,
+         max_tokens: int | None = None,
+     ) -> LLMResponse:
+         """
+         Generate chat completion using OpenAI API.
+
+         Args:
+             messages: List of message dicts
+             model: OpenAI model identifier
+             temperature: Sampling temperature
+             max_tokens: Maximum tokens to generate
+
+         Returns:
+             LLMResponse with generated content
+         """
+         logger.debug(f"OpenAI request: model={model}, messages={len(messages)}")
+
+         kwargs: dict = {
+             "model": model,
+             "messages": messages,
+             "temperature": temperature,
+         }
+         if max_tokens:
+             kwargs["max_tokens"] = max_tokens
+
+         response = await self._client.chat.completions.create(**kwargs)
+
+         choice = response.choices[0]
+         usage = response.usage
+
+         return LLMResponse(
+             content=choice.message.content or "",
+             model=response.model,
+             prompt_tokens=usage.prompt_tokens if usage else 0,
+             completion_tokens=usage.completion_tokens if usage else 0,
+             finish_reason=choice.finish_reason or "stop",
+         )
+
+     async def is_available(self) -> bool:
+         """Check if OpenAI API is available."""
+         try:
+             # Simple models list check
+             await self._client.models.list()
+             return True
+         except Exception as e:
+             logger.warning(f"OpenAI availability check failed: {e}")
+             return False
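
With both providers in place, the routing idea in the project description can be sketched as a simple availability-based fallback. This policy is only an illustration; the actual router is not part of this commit, and complete_with_fallback is a hypothetical helper:

import asyncio

from cascade.providers import LLMResponse, OllamaProvider, OpenAIProvider


async def complete_with_fallback(messages: list[dict]) -> LLMResponse:
    """Prefer the local Ollama provider when healthy, else fall back to OpenAI."""
    local = OllamaProvider()
    remote = OpenAIProvider()
    try:
        if await local.is_available():
            return await local.complete(messages=messages)
        return await remote.complete(messages=messages, model="gpt-4o-mini")
    finally:
        await local.close()


async def main() -> None:
    reply = await complete_with_fallback([{"role": "user", "content": "Hello!"}])
    print(reply.model, "->", reply.content)


asyncio.run(main())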