ayushm98 committed
Commit e5ac5b4 (0 parents)

chore: initialize Cascade project

- Set up Python project with Poetry
- Add LLM provider abstraction (OpenAI, Ollama)
- Add OpenAI-compatible API schemas
- Configure environment template
- Add .gitignore for Python project

.env.example ADDED
@@ -0,0 +1,25 @@
+ # OpenAI Configuration
+ OPENAI_API_KEY=sk-your-api-key-here
+
+ # Ollama Configuration
+ OLLAMA_BASE_URL=http://localhost:11434
+ OLLAMA_MODEL=llama3.2
+
+ # Redis Configuration
+ REDIS_URL=redis://localhost:6379
+
+ # Qdrant Configuration
+ QDRANT_URL=http://localhost:6333
+ QDRANT_COLLECTION=cascade_cache
+
+ # Cache Configuration
+ CACHE_TTL=3600
+ SIMILARITY_THRESHOLD=0.92
+
+ # Server Configuration
+ HOST=0.0.0.0
+ PORT=8000
+ DEBUG=false
+
+ # Logging
+ LOG_LEVEL=INFO
.gitignore ADDED
@@ -0,0 +1,80 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Virtual environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+
+ # IDE
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+ .DS_Store
+
+ # Testing
+ .coverage
+ .pytest_cache/
+ htmlcov/
+ .tox/
+ .nox/
+
+ # Mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Ruff
+ .ruff_cache/
+
+ # Models (large files)
+ models/*.pt
+ models/*.bin
+ models/*.safetensors
+ models/*.onnx
+ ml/artifacts/*.onnx
+ ml/artifacts/*.pt
+
+ # Data
+ ml/data/*.jsonl
+ ml/data/*.csv
+ ml/data/*.parquet
+
+ # Logs
+ *.log
+ logs/
+
+ # Local config
+ .env.local
+
+ # AI Assistant files (never commit these)
+ CLAUDE.md
+ .claude/
+ *claude*
+ *CLAUDE*
+ IMPLEMENTATION_PLAN.md
+
+ # Baseline results
+ baseline_results.json
pyproject.toml ADDED
@@ -0,0 +1,56 @@
+ [tool.poetry]
+ name = "cascade"
+ version = "0.1.0"
+ description = "Intelligent LLM Request Router - Reduce API costs by 60%+ through smart routing and semantic caching"
+ authors = ["Ayush <ayush@example.com>"]
+ readme = "README.md"
+ packages = [{include = "cascade", from = "src"}]
+
+ [tool.poetry.dependencies]
+ python = "^3.11"
+ fastapi = "^0.109.0"
+ uvicorn = {extras = ["standard"], version = "^0.27.0"}
+ httpx = "^0.26.0"
+ pydantic = "^2.5.0"
+ pydantic-settings = "^2.1.0"
+ redis = "^5.0.0"
+ qdrant-client = "^1.7.0"
+ sentence-transformers = "^2.2.0"
+ openai = "^1.10.0"
+ prometheus-client = "^0.19.0"
+ python-dotenv = "^1.0.0"
+
+ [tool.poetry.group.dev.dependencies]
+ pytest = "^7.4.0"
+ pytest-asyncio = "^0.23.0"
+ pytest-cov = "^4.1.0"
+ ruff = "^0.1.0"
+ black = "^24.1.0"
+ mypy = "^1.8.0"
+ pre-commit = "^3.6.0"
+
+ [build-system]
+ requires = ["poetry-core"]
+ build-backend = "poetry.core.masonry.api"
+
+ [tool.ruff]
+ line-length = 100
+ target-version = "py311"
+
+ [tool.ruff.lint]
+ select = ["E", "F", "I", "N", "W", "UP"]
+ ignore = ["E501"]
+
+ [tool.black]
+ line-length = 100
+ target-version = ["py311"]
+
+ [tool.mypy]
+ python_version = "3.11"
+ warn_return_any = true
+ warn_unused_configs = true
+ disallow_untyped_defs = true
+
+ [tool.pytest.ini_options]
+ asyncio_mode = "auto"
+ testpaths = ["tests"]
src/cascade/__init__.py ADDED
@@ -0,0 +1,3 @@
+ """Cascade - Intelligent LLM Request Router."""
+
+ __version__ = "0.1.0"
src/cascade/api/__init__.py ADDED
@@ -0,0 +1 @@
+ """API module for Cascade."""
src/cascade/api/schemas.py ADDED
@@ -0,0 +1,79 @@
+ """OpenAI-compatible request/response schemas."""
+
+ from typing import Literal
+
+ from pydantic import BaseModel, Field
+
+
+ class ChatMessage(BaseModel):
+     """A single message in a chat conversation."""
+
+     role: Literal["system", "user", "assistant"] = Field(
+         ..., description="The role of the message author"
+     )
+     content: str = Field(..., description="The content of the message")
+
+
+ class ChatCompletionRequest(BaseModel):
+     """OpenAI-compatible chat completion request."""
+
+     model: str = Field(default="gpt-4o", description="Model to use for completion")
+     messages: list[ChatMessage] = Field(..., description="List of messages in the conversation")
+     temperature: float = Field(default=0.7, ge=0, le=2, description="Sampling temperature")
+     max_tokens: int | None = Field(default=None, description="Maximum tokens to generate")
+     stream: bool = Field(default=False, description="Whether to stream the response")
+
+     # Cascade-specific options
+     bypass_cache: bool = Field(default=False, description="Skip semantic cache lookup")
+     force_model: str | None = Field(default=None, description="Force specific model (bypass routing)")
+
+
+ class ChatCompletionChoice(BaseModel):
+     """A single completion choice."""
+
+     index: int = Field(..., description="Index of the choice")
+     message: ChatMessage = Field(..., description="The generated message")
+     finish_reason: Literal["stop", "length", "content_filter"] | None = Field(
+         default="stop", description="Reason for completion"
+     )
+
+
+ class UsageInfo(BaseModel):
+     """Token usage information."""
+
+     prompt_tokens: int = Field(..., description="Number of tokens in the prompt")
+     completion_tokens: int = Field(..., description="Number of tokens in the completion")
+     total_tokens: int = Field(..., description="Total tokens used")
+
+
+ class ChatCompletionResponse(BaseModel):
+     """OpenAI-compatible chat completion response."""
+
+     id: str = Field(..., description="Unique identifier for the completion")
+     object: Literal["chat.completion"] = Field(default="chat.completion")
+     created: int = Field(..., description="Unix timestamp of creation")
+     model: str = Field(..., description="Model used for completion")
+     choices: list[ChatCompletionChoice] = Field(..., description="List of completion choices")
+     usage: UsageInfo = Field(..., description="Token usage information")
+
+     # Cascade-specific metadata
+     cascade_metadata: dict | None = Field(
+         default=None,
+         description="Cascade routing metadata (cache_hit, routed_model, cost, etc.)",
+     )
+
+
+ class HealthResponse(BaseModel):
+     """Health check response."""
+
+     status: Literal["healthy", "degraded", "unhealthy"] = Field(..., description="Service status")
+     version: str = Field(..., description="API version")
+     components: dict[str, bool] = Field(..., description="Component health status")
+
+
+ class ErrorResponse(BaseModel):
+     """Error response."""
+
+     error: str = Field(..., description="Error message")
+     code: str = Field(..., description="Error code")
+     details: dict | None = Field(default=None, description="Additional error details")
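
The schemas above are plain Pydantic v2 models, so they can be exercised outside the server. A minimal usage sketch (not part of this commit; the id, token counts, and metadata keys are illustrative):

import time

from cascade.api.schemas import (
    ChatCompletionChoice,
    ChatCompletionRequest,
    ChatCompletionResponse,
    ChatMessage,
    UsageInfo,
)

# Validate an incoming OpenAI-style request, including the Cascade-only flags.
request = ChatCompletionRequest(
    model="gpt-4o-mini",
    messages=[ChatMessage(role="user", content="Summarize HTTP/1.1 in one sentence.")],
    temperature=0.2,
    bypass_cache=False,
)

# Assemble a response the way a route handler might, attaching routing metadata.
response = ChatCompletionResponse(
    id="chatcmpl-local-001",
    created=int(time.time()),
    model=request.model,
    choices=[
        ChatCompletionChoice(
            index=0,
            message=ChatMessage(role="assistant", content="A text protocol for the web."),
        )
    ],
    usage=UsageInfo(prompt_tokens=14, completion_tokens=7, total_tokens=21),
    cascade_metadata={"cache_hit": False, "routed_model": "gpt-4o-mini"},
)
print(response.model_dump_json(indent=2))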
src/cascade/config.py ADDED
@@ -0,0 +1,43 @@
+ """Configuration settings for Cascade."""
+
+ from functools import lru_cache
+
+ from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+ class Settings(BaseSettings):
+     """Application settings loaded from environment variables."""
+
+     model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")
+
+     # OpenAI
+     openai_api_key: str = ""
+
+     # Ollama
+     ollama_base_url: str = "http://localhost:11434"
+     ollama_model: str = "llama3.2"
+
+     # Redis
+     redis_url: str = "redis://localhost:6379"
+
+     # Qdrant
+     qdrant_url: str = "http://localhost:6333"
+     qdrant_collection: str = "cascade_cache"
+
+     # Cache
+     cache_ttl: int = 3600  # 1 hour
+     similarity_threshold: float = 0.92
+
+     # Server
+     host: str = "0.0.0.0"
+     port: int = 8000
+     debug: bool = False
+
+     # Logging
+     log_level: str = "INFO"
+
+
+ @lru_cache
+ def get_settings() -> Settings:
+     """Get cached settings instance."""
+     return Settings()
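
A short sketch of how the settings behave (illustrative, not part of the commit): field names map case-insensitively to the variables in .env.example, and @lru_cache means every caller of get_settings() shares one Settings instance.

import os

from cascade.config import get_settings

# Normally these come from .env; set one in the environment for the example.
os.environ["SIMILARITY_THRESHOLD"] = "0.95"

settings = get_settings()
print(settings.similarity_threshold)   # 0.95 (from the environment)
print(settings.qdrant_collection)      # "cascade_cache" (class default)

# The cache means later environment changes are ignored unless it is cleared.
assert get_settings() is settings
get_settings.cache_clear()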
src/cascade/providers/__init__.py ADDED
@@ -0,0 +1,7 @@
+ """LLM Provider implementations."""
+
+ from cascade.providers.base import LLMProvider, LLMResponse
+ from cascade.providers.ollama_provider import OllamaProvider
+ from cascade.providers.openai_provider import OpenAIProvider
+
+ __all__ = ["LLMProvider", "LLMResponse", "OpenAIProvider", "OllamaProvider"]
src/cascade/providers/base.py ADDED
@@ -0,0 +1,63 @@
+ """Abstract base class for LLM providers."""
+
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class LLMResponse:
+     """Standardized response from any LLM provider."""
+
+     content: str
+     model: str
+     prompt_tokens: int
+     completion_tokens: int
+     finish_reason: str = "stop"
+
+     @property
+     def total_tokens(self) -> int:
+         """Total tokens used."""
+         return self.prompt_tokens + self.completion_tokens
+
+
+ class LLMProvider(ABC):
+     """Abstract base class for LLM providers."""
+
+     @property
+     @abstractmethod
+     def name(self) -> str:
+         """Provider name identifier."""
+         ...
+
+     @property
+     @abstractmethod
+     def available_models(self) -> list[str]:
+         """List of available models for this provider."""
+         ...
+
+     @abstractmethod
+     async def complete(
+         self,
+         messages: list[dict],
+         model: str,
+         temperature: float = 0.7,
+         max_tokens: int | None = None,
+     ) -> LLMResponse:
+         """
+         Generate a chat completion.
+
+         Args:
+             messages: List of message dicts with 'role' and 'content'
+             model: Model identifier
+             temperature: Sampling temperature
+             max_tokens: Maximum tokens to generate
+
+         Returns:
+             LLMResponse with generated content and metadata
+         """
+         ...
+
+     @abstractmethod
+     async def is_available(self) -> bool:
+         """Check if the provider is available and healthy."""
+         ...
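
The ABC pins down the contract every provider must satisfy: two properties plus two async methods returning the shared LLMResponse dataclass. A toy, hypothetical provider for illustration only (EchoProvider is not part of the commit):

from cascade.providers.base import LLMProvider, LLMResponse


class EchoProvider(LLMProvider):
    """Toy provider that echoes the last user message back."""

    @property
    def name(self) -> str:
        return "echo"

    @property
    def available_models(self) -> list[str]:
        return ["echo-1"]

    async def complete(
        self,
        messages: list[dict],
        model: str,
        temperature: float = 0.7,
        max_tokens: int | None = None,
    ) -> LLMResponse:
        content = messages[-1]["content"] if messages else ""
        # A real provider would call an API here; the token counts are rough stand-ins.
        tokens = len(content.split())
        return LLMResponse(
            content=content,
            model=model,
            prompt_tokens=tokens,
            completion_tokens=tokens,
        )

    async def is_available(self) -> bool:
        return True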
src/cascade/providers/ollama_provider.py ADDED
@@ -0,0 +1,120 @@
+ """Ollama provider implementation for local LLM inference."""
+
+ import logging
+
+ import httpx
+
+ from cascade.config import get_settings
+ from cascade.providers.base import LLMProvider, LLMResponse
+
+ logger = logging.getLogger(__name__)
+
+
+ class OllamaProvider(LLMProvider):
+     """Ollama local LLM provider."""
+
+     def __init__(self) -> None:
+         """Initialize Ollama provider."""
+         settings = get_settings()
+         self._base_url = settings.ollama_base_url.rstrip("/")
+         self._default_model = settings.ollama_model
+         self._client = httpx.AsyncClient(timeout=120.0)
+
+     @property
+     def name(self) -> str:
+         """Provider name."""
+         return "ollama"
+
+     @property
+     def available_models(self) -> list[str]:
+         """Available Ollama models (configured default)."""
+         return [self._default_model]
+
+     async def complete(
+         self,
+         messages: list[dict],
+         model: str | None = None,
+         temperature: float = 0.7,
+         max_tokens: int | None = None,
+     ) -> LLMResponse:
+         """
+         Generate chat completion using Ollama API.
+
+         Args:
+             messages: List of message dicts
+             model: Ollama model name (defaults to configured model)
+             temperature: Sampling temperature
+             max_tokens: Maximum tokens to generate (num_predict in Ollama)
+
+         Returns:
+             LLMResponse with generated content
+         """
+         model = model or self._default_model
+         logger.debug(f"Ollama request: model={model}, messages={len(messages)}")
+
+         payload: dict = {
+             "model": model,
+             "messages": messages,
+             "stream": False,
+             "options": {
+                 "temperature": temperature,
+             },
+         }
+         if max_tokens:
+             payload["options"]["num_predict"] = max_tokens
+
+         response = await self._client.post(
+             f"{self._base_url}/api/chat",
+             json=payload,
+         )
+         response.raise_for_status()
+         data = response.json()
+
+         # Ollama response format
+         message = data.get("message", {})
+         content = message.get("content", "")
+
+         # Ollama provides token counts in different fields
+         prompt_tokens = data.get("prompt_eval_count", 0)
+         completion_tokens = data.get("eval_count", 0)
+
+         return LLMResponse(
+             content=content,
+             model=model,
+             prompt_tokens=prompt_tokens,
+             completion_tokens=completion_tokens,
+             finish_reason="stop",
+         )
+
+     async def is_available(self) -> bool:
+         """Check if Ollama is running and model is available."""
+         try:
+             response = await self._client.get(f"{self._base_url}/api/tags")
+             if response.status_code != 200:
+                 return False
+
+             data = response.json()
+             models = [m.get("name", "").split(":")[0] for m in data.get("models", [])]
+
+             # Check if our configured model is available
+             model_base = self._default_model.split(":")[0]
+             available = model_base in models
+
+             if not available:
+                 logger.warning(
+                     f"Ollama model '{self._default_model}' not found. "
+                     f"Available: {models}. Run: ollama pull {self._default_model}"
+                 )
+
+             return available
+
+         except httpx.ConnectError:
+             logger.warning("Ollama not running. Start with: ollama serve")
+             return False
+         except Exception as e:
+             logger.warning(f"Ollama availability check failed: {e}")
+             return False
+
+     async def close(self) -> None:
+         """Close the HTTP client."""
+         await self._client.aclose()
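
Driving the provider directly looks roughly like this (assumes a local Ollama with the configured model pulled; illustrative, not part of the commit):

import asyncio

from cascade.providers.ollama_provider import OllamaProvider


async def main() -> None:
    provider = OllamaProvider()
    try:
        if not await provider.is_available():
            print("Ollama is not reachable or the model is missing; see the logged hint.")
            return
        result = await provider.complete(
            messages=[{"role": "user", "content": "What is 2 + 2?"}],
            temperature=0.0,
        )
        print(result.content, result.total_tokens)
    finally:
        await provider.close()


asyncio.run(main())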
src/cascade/providers/openai_provider.py ADDED
@@ -0,0 +1,88 @@
+ """OpenAI provider implementation."""
+
+ import logging
+
+ from openai import AsyncOpenAI
+
+ from cascade.config import get_settings
+ from cascade.providers.base import LLMProvider, LLMResponse
+
+ logger = logging.getLogger(__name__)
+
+
+ class OpenAIProvider(LLMProvider):
+     """OpenAI API provider."""
+
+     MODELS = [
+         "gpt-4o",
+         "gpt-4o-mini",
+         "gpt-4-turbo",
+         "gpt-3.5-turbo",
+     ]
+
+     def __init__(self) -> None:
+         """Initialize OpenAI provider."""
+         settings = get_settings()
+         self._client = AsyncOpenAI(api_key=settings.openai_api_key)
+
+     @property
+     def name(self) -> str:
+         """Provider name."""
+         return "openai"
+
+     @property
+     def available_models(self) -> list[str]:
+         """Available OpenAI models."""
+         return self.MODELS
+
+     async def complete(
+         self,
+         messages: list[dict],
+         model: str = "gpt-4o",
+         temperature: float = 0.7,
+         max_tokens: int | None = None,
+     ) -> LLMResponse:
+         """
+         Generate chat completion using OpenAI API.
+
+         Args:
+             messages: List of message dicts
+             model: OpenAI model identifier
+             temperature: Sampling temperature
+             max_tokens: Maximum tokens to generate
+
+         Returns:
+             LLMResponse with generated content
+         """
+         logger.debug(f"OpenAI request: model={model}, messages={len(messages)}")
+
+         kwargs: dict = {
+             "model": model,
+             "messages": messages,
+             "temperature": temperature,
+         }
+         if max_tokens:
+             kwargs["max_tokens"] = max_tokens
+
+         response = await self._client.chat.completions.create(**kwargs)
+
+         choice = response.choices[0]
+         usage = response.usage
+
+         return LLMResponse(
+             content=choice.message.content or "",
+             model=response.model,
+             prompt_tokens=usage.prompt_tokens if usage else 0,
+             completion_tokens=usage.completion_tokens if usage else 0,
+             finish_reason=choice.finish_reason or "stop",
+         )
+
+     async def is_available(self) -> bool:
+         """Check if OpenAI API is available."""
+         try:
+             # Simple models list check
+             await self._client.models.list()
+             return True
+         except Exception as e:
+             logger.warning(f"OpenAI availability check failed: {e}")
+             return False
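
With both providers in place, the routing idea in the project description can be sketched as a simple availability-based fallback. This policy is only an illustration; the actual router is not part of this commit, and complete_with_fallback is a hypothetical helper:

import asyncio

from cascade.providers import LLMResponse, OllamaProvider, OpenAIProvider


async def complete_with_fallback(messages: list[dict]) -> LLMResponse:
    """Prefer the local Ollama provider when healthy, else fall back to OpenAI."""
    local = OllamaProvider()
    remote = OpenAIProvider()
    try:
        if await local.is_available():
            return await local.complete(messages=messages)
        return await remote.complete(messages=messages, model="gpt-4o-mini")
    finally:
        await local.close()


async def main() -> None:
    reply = await complete_with_fallback([{"role": "user", "content": "Hello!"}])
    print(reply.model, "->", reply.content)


asyncio.run(main())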