Spaces:
Sleeping
Sleeping
feat: upgrade LLMOpt to V2 ML-powered architecture
Browse files- .env.example +12 -0
- README.md +166 -470
- data/complexity_training_data.json +3052 -51
- llmopt/analyzer/query_analyzer.py +70 -27
- llmopt/cache/__init__.py +1 -0
- llmopt/cache/semantic_cache.py +204 -0
- llmopt/core.py +82 -1
- llmopt/engine/optimization_engine.py +139 -5
- llmopt/estimator/complexity_estimator.py +50 -4
- llmopt/evaluation/__init__.py +1 -0
- llmopt/evaluation/evaluator.py +124 -0
- llmopt/optimizer/prompt_optimizer.py +43 -4
- llmopt/router/model_router.py +6 -6
- pyproject.toml +1 -1
- scripts/fix_json.py +21 -0
- tests/test_pipeline.py +2 -2
.env.example
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# LLMOpt Environment Variables
|
| 2 |
+
|
| 3 |
+
# OpenAI
|
| 4 |
+
OPENAI_API_KEY=your_openai_api_key_here
|
| 5 |
+
|
| 6 |
+
# Anthropic
|
| 7 |
+
ANTHROPIC_API_KEY=your_anthropic_api_key_here
|
| 8 |
+
|
| 9 |
+
# Redis Semantic Cache (V2)
|
| 10 |
+
# Option 1: Local Docker -> redis://localhost:6379
|
| 11 |
+
# Option 2: Redis Cloud -> redis://default:password@endpoint.redis-cloud.com:12345
|
| 12 |
+
REDIS_URL=redis://localhost:6379
|
README.md
CHANGED
|
@@ -1,100 +1,132 @@
|
|
| 1 |
-
# LLMOpt
|
| 2 |
|
| 3 |
-
> **
|
| 4 |
|
| 5 |
-
|
| 6 |
|
| 7 |
-
|
| 8 |
-
Your App → llmopt.generate(query) → [Analyze → Estimate → Optimize → Compress → Route] → LLM API → Response
|
| 9 |
-
```
|
| 10 |
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
## Table of Contents
|
| 14 |
|
| 15 |
-
- [
|
| 16 |
-
- [
|
| 17 |
-
- [
|
|
|
|
| 18 |
- [Python SDK Usage](#python-sdk-usage)
|
| 19 |
-
- [REST API](#rest-api)
|
| 20 |
-
- [Architecture](#architecture)
|
| 21 |
-
- [Training the ML Complexity Model](#training-the-ml-complexity-model)
|
| 22 |
- [Supported Providers & Models](#supported-providers--models)
|
| 23 |
-
- [
|
| 24 |
-
- [Project Structure](#project-structure)
|
| 25 |
|
| 26 |
---
|
| 27 |
|
| 28 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
-
|
| 31 |
-
# 1. Clone and install
|
| 32 |
-
git clone <repo>
|
| 33 |
-
cd llmopt
|
| 34 |
-
pip install -e ".[ml]"
|
| 35 |
|
| 36 |
-
#
|
| 37 |
-
cp config/.env.example config/.env
|
| 38 |
-
# Edit config/.env with your keys
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
---
|
| 50 |
|
| 51 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
### Requirements
|
| 54 |
- Python 3.10+
|
| 55 |
-
- At least one API key (OpenAI, Anthropic, Google, Mistral, DeepSeek) OR
|
| 56 |
|
| 57 |
-
###
|
| 58 |
|
| 59 |
```bash
|
| 60 |
-
#
|
| 61 |
-
|
|
|
|
| 62 |
|
| 63 |
-
#
|
| 64 |
pip install -e ".[ml]"
|
| 65 |
|
| 66 |
-
# Core
|
| 67 |
-
pip install -e
|
| 68 |
|
| 69 |
-
#
|
| 70 |
-
pip install -e ".[ml,local
|
| 71 |
```
|
| 72 |
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
## Configuration
|
| 76 |
-
|
| 77 |
-
Copy the example config and fill in the API keys for providers you want to use. Only keys for providers you need are required.
|
| 78 |
|
| 79 |
```bash
|
| 80 |
cp config/.env.example config/.env
|
| 81 |
```
|
| 82 |
|
| 83 |
```env
|
| 84 |
-
# config/.env
|
| 85 |
-
|
| 86 |
OPENAI_API_KEY=sk-...
|
| 87 |
ANTHROPIC_API_KEY=sk-ant-...
|
| 88 |
GEMINI_API_KEY=AIza...
|
| 89 |
-
MISTRAL_API_KEY=...
|
| 90 |
-
DEEPSEEK_API_KEY=sk-...
|
| 91 |
-
|
| 92 |
-
# Local models (optional)
|
| 93 |
OLLAMA_API_BASE=http://localhost:11434
|
| 94 |
|
| 95 |
-
#
|
| 96 |
-
|
| 97 |
-
LOG_LEVEL=WARNING
|
| 98 |
```
|
| 99 |
|
| 100 |
---
|
|
@@ -108,475 +140,139 @@ from llmopt import LLMOpt
|
|
| 108 |
|
| 109 |
client = LLMOpt()
|
| 110 |
|
|
|
|
| 111 |
result = client.generate(
|
| 112 |
query="Explain the difference between TCP and UDP",
|
| 113 |
-
budget_mode="balanced" # "cheap" | "balanced" | "quality"
|
| 114 |
)
|
| 115 |
|
| 116 |
print(result.response)
|
| 117 |
-
print(f"Model used
|
| 118 |
-
print(f"Cost
|
| 119 |
print(f"Tokens saved: {result.tokens_saved}")
|
| 120 |
```
|
| 121 |
|
| 122 |
-
###
|
| 123 |
-
|
| 124 |
-
| Mode | Behaviour |
|
| 125 |
-
|------|-----------|
|
| 126 |
-
| `cheap` | Aggressively prefers cheapest capable model. Enables compression. |
|
| 127 |
-
| `balanced` | Balances cost and quality. Default. |
|
| 128 |
-
| `quality` | Prioritizes response quality. Picks highest-capability feasible model. |
|
| 129 |
-
|
| 130 |
-
### Full Options
|
| 131 |
|
| 132 |
```python
|
| 133 |
result = client.generate(
|
| 134 |
-
query="Design a distributed rate limiter",
|
| 135 |
-
budget_mode="
|
| 136 |
-
|
| 137 |
-
# Hard cost cap — never spend more than this per request (USD)
|
| 138 |
-
max_cost_per_request=0.005,
|
| 139 |
|
| 140 |
-
#
|
| 141 |
-
|
| 142 |
|
| 143 |
# Provider filtering
|
| 144 |
-
exclude_providers=["openai"],
|
| 145 |
-
only_providers=["anthropic"],
|
| 146 |
-
|
| 147 |
-
# Use only local Ollama models (free, no API calls)
|
| 148 |
-
prefer_local=False,
|
| 149 |
-
|
| 150 |
-
# Pass conversation history for multi-turn chats
|
| 151 |
-
conversation_history=[
|
| 152 |
-
{"role": "user", "content": "I'm building a SaaS API"},
|
| 153 |
-
{"role": "assistant", "content": "Happy to help! What are the requirements?"},
|
| 154 |
-
],
|
| 155 |
|
| 156 |
-
|
|
|
|
| 157 |
|
| 158 |
-
# dry_run=True → runs full pipeline but skips actual API call
|
| 159 |
dry_run=False,
|
| 160 |
)
|
| 161 |
-
```
|
| 162 |
-
|
| 163 |
-
### Explainability
|
| 164 |
-
|
| 165 |
-
```python
|
| 166 |
-
# Get a full explanation of routing decisions WITHOUT making an API call
|
| 167 |
-
explanation = client.explain(
|
| 168 |
-
query="Implement a LRU cache in Python",
|
| 169 |
-
budget_mode="cheap"
|
| 170 |
-
)
|
| 171 |
-
|
| 172 |
-
print(explanation["complexity"])
|
| 173 |
-
# {'score': 0.62, 'tier': 'hard', 'required_coding': 0.74, ...}
|
| 174 |
-
|
| 175 |
-
print(explanation["optimization"])
|
| 176 |
-
# {'selected_model': 'deepseek-chat', 'provider': 'deepseek', ...}
|
| 177 |
-
|
| 178 |
-
# Or use the result object's explain() method after generate()
|
| 179 |
-
result = client.generate("...", dry_run=True)
|
| 180 |
-
print(result.explain())
|
| 181 |
-
```
|
| 182 |
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
=======================================================
|
| 186 |
-
LLMOpt Decision Explanation
|
| 187 |
-
=======================================================
|
| 188 |
-
Query complexity : 0.623 (hard)
|
| 189 |
-
Primary domain : code
|
| 190 |
-
Required reasoning: 0.62
|
| 191 |
-
Required coding : 0.75
|
| 192 |
-
Required math : 0.00
|
| 193 |
-
|
| 194 |
-
Selected model : deepseek-chat (deepseek)
|
| 195 |
-
Fallback model : claude-3-5-haiku-20241022
|
| 196 |
-
Compression : yes
|
| 197 |
-
System prompt : concise
|
| 198 |
-
|
| 199 |
-
Scoring rationale:
|
| 200 |
-
• model=deepseek-chat
|
| 201 |
-
• capability=0.887
|
| 202 |
-
• cost_norm=0.0192
|
| 203 |
-
• J=-0.607 (α=0.4,β=0.2,γ=0.4)
|
| 204 |
-
|
| 205 |
-
Tokens : 82 in / 600 out
|
| 206 |
-
Tokens saved : 12 (14.6% compression)
|
| 207 |
-
Cost : $0.000180
|
| 208 |
-
Cost saved : $0.006220 vs GPT-4o baseline
|
| 209 |
-
Latency : 1240ms
|
| 210 |
-
=======================================================
|
| 211 |
-
```
|
| 212 |
-
|
| 213 |
-
### Streaming
|
| 214 |
-
|
| 215 |
-
```python
|
| 216 |
-
for chunk in client.stream("Explain async/await in Python"):
|
| 217 |
-
print(chunk, end="", flush=True)
|
| 218 |
-
```
|
| 219 |
-
|
| 220 |
-
### Exploring the Model Registry
|
| 221 |
-
|
| 222 |
-
```python
|
| 223 |
-
# List all models sorted by cost
|
| 224 |
-
for model in client.registry.sorted_by_cost():
|
| 225 |
-
print(f"{model.model_name:35} ${model.input_cost_per_1k:.5f}/1k in capability={model.capability_score:.3f}")
|
| 226 |
-
|
| 227 |
-
# Find cheapest model that can handle high complexity
|
| 228 |
-
best = client.registry.cheapest_capable(complexity=0.85, min_coding=0.80)
|
| 229 |
-
print(best.model_name)
|
| 230 |
-
|
| 231 |
-
# Get only Anthropic models
|
| 232 |
-
for m in client.registry.by_provider("anthropic"):
|
| 233 |
-
print(m.model_name)
|
| 234 |
```
|
| 235 |
|
| 236 |
---
|
| 237 |
|
| 238 |
-
## REST API
|
| 239 |
|
| 240 |
-
|
| 241 |
|
|
|
|
| 242 |
```bash
|
| 243 |
-
python run.py
|
| 244 |
-
# or
|
| 245 |
-
python run.py --host 0.0.0.0 --port 8000 --reload
|
| 246 |
```
|
| 247 |
|
| 248 |
-
###
|
| 249 |
-
|
| 250 |
-
#### `POST /generate`
|
| 251 |
-
|
| 252 |
-
Full pipeline — analyze, optimize, route, return response.
|
| 253 |
-
|
| 254 |
```bash
|
| 255 |
curl -X POST http://localhost:8000/generate \
|
| 256 |
-H "Content-Type: application/json" \
|
| 257 |
-d '{
|
| 258 |
-
"query": "
|
| 259 |
-
"budget_mode": "balanced"
|
|
|
|
| 260 |
}'
|
| 261 |
```
|
| 262 |
|
| 263 |
-
Response:
|
| 264 |
-
```json
|
| 265 |
-
{
|
| 266 |
-
"response": "Quicksort is a divide-and-conquer sorting algorithm...",
|
| 267 |
-
"model_used": "gpt-4o-mini",
|
| 268 |
-
"provider": "openai",
|
| 269 |
-
"input_tokens": 87,
|
| 270 |
-
"output_tokens": 312,
|
| 271 |
-
"total_tokens": 399,
|
| 272 |
-
"estimated_cost": 0.000201,
|
| 273 |
-
"tokens_saved": 14,
|
| 274 |
-
"cost_saved": 0.003891,
|
| 275 |
-
"compression_ratio": 0.12,
|
| 276 |
-
"complexity_score": 0.38,
|
| 277 |
-
"complexity_tier": "medium",
|
| 278 |
-
"latency_ms": 1243
|
| 279 |
-
}
|
| 280 |
-
```
|
| 281 |
-
|
| 282 |
-
**All request options:**
|
| 283 |
```json
|
| 284 |
{
|
| 285 |
-
"
|
| 286 |
-
"
|
| 287 |
-
"
|
| 288 |
-
"
|
| 289 |
-
"
|
| 290 |
-
"
|
| 291 |
-
"
|
| 292 |
-
"
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
"
|
| 297 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 298 |
}
|
| 299 |
```
|
| 300 |
|
| 301 |
-
#### `POST /explain`
|
| 302 |
-
|
| 303 |
-
Returns routing decision **without** calling any LLM API. Free to use.
|
| 304 |
-
|
| 305 |
-
```bash
|
| 306 |
-
curl -X POST http://localhost:8000/explain \
|
| 307 |
-
-H "Content-Type: application/json" \
|
| 308 |
-
-d '{"query": "Implement Dijkstra'\''s algorithm", "budget_mode": "cheap"}'
|
| 309 |
-
```
|
| 310 |
-
|
| 311 |
-
#### `GET /models`
|
| 312 |
-
|
| 313 |
-
List all registered models with pricing and capability scores.
|
| 314 |
-
|
| 315 |
-
```bash
|
| 316 |
-
curl http://localhost:8000/models
|
| 317 |
-
```
|
| 318 |
-
|
| 319 |
-
#### `GET /health`
|
| 320 |
-
|
| 321 |
-
```bash
|
| 322 |
-
curl http://localhost:8000/health
|
| 323 |
-
# {"status": "ok", "version": "0.1.0"}
|
| 324 |
-
```
|
| 325 |
-
|
| 326 |
-
#### `POST /stream`
|
| 327 |
-
|
| 328 |
-
Server-sent stream of response tokens. Same request body as `/generate`.
|
| 329 |
-
|
| 330 |
-
---
|
| 331 |
-
|
| 332 |
-
## Architecture
|
| 333 |
-
|
| 334 |
-
```
|
| 335 |
-
┌─────────────────────────────────────────────────────────┐
|
| 336 |
-
│ LLMOpt Client │
|
| 337 |
-
│ llmopt.generate(query, budget_mode, constraints, ...) │
|
| 338 |
-
└──────────────────────┬──────────────────────────────────┘
|
| 339 |
-
│
|
| 340 |
-
┌─────────────▼─────────────┐
|
| 341 |
-
│ Query Analyzer │ Extracts features:
|
| 342 |
-
│ analyzer/query_analyzer.py│ domain, structure,
|
| 343 |
-
│ │ complexity signals
|
| 344 |
-
└─────────────┬─────────────┘
|
| 345 |
-
│ QueryFeatures
|
| 346 |
-
┌─────────────▼─────────────┐
|
| 347 |
-
│ Complexity Estimator │ C(q) ∈ [0,1]
|
| 348 |
-
│ estimator/complexity_ │ Heuristic (V1) or
|
| 349 |
-
│ estimator.py │ ML model (trained)
|
| 350 |
-
└─────────────┬─────────────┘
|
| 351 |
-
│ ComplexityResult
|
| 352 |
-
┌─────────────▼─────────────┐
|
| 353 |
-
│ Optimization Engine │ Solves:
|
| 354 |
-
│ engine/optimization_ │ min J(x) = α·Cost
|
| 355 |
-
│ engine.py │ + β·Tokens - γ·Quality
|
| 356 |
-
└─────────────┬─────────────┘
|
| 357 |
-
│ OptimizationResult (model, config)
|
| 358 |
-
┌─────────────▼─────────────┐
|
| 359 |
-
│ Prompt Optimizer │ Reduces tokens:
|
| 360 |
-
│ optimizer/prompt_ │ filler removal,
|
| 361 |
-
│ optimizer.py │ whitespace, rewrites
|
| 362 |
-
└─────────────┬─────────────┘
|
| 363 |
-
│ OptimizedPrompt
|
| 364 |
-
┌─────────────▼─────────────┐
|
| 365 |
-
│ Model Router │ Provider abstraction
|
| 366 |
-
│ router/model_router.py │ via LiteLLM
|
| 367 |
-
└─────────────┬─────────────┘
|
| 368 |
-
│
|
| 369 |
-
┌─────────────▼─────────────┐
|
| 370 |
-
│ LLM Provider API │ OpenAI / Anthropic /
|
| 371 |
-
│ (OpenAI, Anthropic, │ Google / Mistral /
|
| 372 |
-
│ Google, Mistral, │ DeepSeek / Ollama
|
| 373 |
-
│ DeepSeek, Ollama) │
|
| 374 |
-
└───────────────────────────┘
|
| 375 |
-
```
|
| 376 |
-
|
| 377 |
-
### Decision Flow
|
| 378 |
-
|
| 379 |
-
1. **Query Analyzer** extracts ~20 features: domain flags (code/math/reasoning), structural signals (code blocks, math notation, multi-step), and output length estimate.
|
| 380 |
-
|
| 381 |
-
2. **Complexity Estimator** converts features into `C(q) ∈ [0, 1]` using a weighted scoring formula. With the optional ML model trained, it uses a GradientBoosting regressor instead.
|
| 382 |
-
|
| 383 |
-
3. **Optimization Engine** solves `min J(x) = α·Cost + β·Tokens - γ·Quality` where weights α/β/γ shift based on `budget_mode`. It filters models by capability constraints, applies hard filters (cost caps, latency), then scores remaining candidates.
|
| 384 |
-
|
| 385 |
-
4. **Prompt Optimizer** reduces input tokens by removing filler phrases, normalizing whitespace, rewriting verbose instructions, and selecting the right system prompt style.
|
| 386 |
-
|
| 387 |
-
5. **Model Router** dispatches via LiteLLM to the selected provider, returning a `RoutedResponse` with actual token usage and cost.
|
| 388 |
-
|
| 389 |
-
---
|
| 390 |
-
|
| 391 |
-
## Training the ML Complexity Model
|
| 392 |
-
|
| 393 |
-
The heuristic estimator works well but a trained model will be more accurate, especially for edge cases.
|
| 394 |
-
|
| 395 |
-
### Step 1 — Add labeled examples
|
| 396 |
-
|
| 397 |
-
Edit `data/complexity_training_data.json`:
|
| 398 |
-
```json
|
| 399 |
-
[
|
| 400 |
-
{"query": "What is Python?", "complexity": 0.05, "tier": "trivial", "domain": "factual"},
|
| 401 |
-
{"query": "Implement a B-tree from scratch", "complexity": 0.88, "tier": "expert", "domain": "code"},
|
| 402 |
-
...
|
| 403 |
-
]
|
| 404 |
-
```
|
| 405 |
-
|
| 406 |
-
The more examples you add (especially from your actual traffic), the better the model.
|
| 407 |
-
|
| 408 |
-
### Step 2 — Train
|
| 409 |
-
|
| 410 |
-
```bash
|
| 411 |
-
python scripts/train_complexity_model.py
|
| 412 |
-
```
|
| 413 |
-
|
| 414 |
-
Output:
|
| 415 |
-
```
|
| 416 |
-
Loading training data...
|
| 417 |
-
50 labeled examples loaded
|
| 418 |
-
Training GradientBoostingRegressor...
|
| 419 |
-
|
| 420 |
-
Train R² : 0.988
|
| 421 |
-
Test R² : 0.821 ← improves with more data
|
| 422 |
-
CV R² mean: 0.794 ± 0.12
|
| 423 |
-
MAE : 0.048
|
| 424 |
-
|
| 425 |
-
Model saved to: data/complexity_model.pkl
|
| 426 |
-
```
|
| 427 |
-
|
| 428 |
-
### Step 3 — Activate
|
| 429 |
-
|
| 430 |
-
The trained model is auto-detected at startup when `data/complexity_model.pkl` exists.
|
| 431 |
-
|
| 432 |
-
To plug it into the `ComplexityEstimator`:
|
| 433 |
-
|
| 434 |
-
```python
|
| 435 |
-
# In llmopt/estimator/complexity_estimator.py, add to __init__:
|
| 436 |
-
from scripts.train_complexity_model import load_model, predict
|
| 437 |
-
|
| 438 |
-
class ComplexityEstimator:
|
| 439 |
-
def __init__(self):
|
| 440 |
-
bundle = load_model()
|
| 441 |
-
if bundle:
|
| 442 |
-
self._ml_model, self._feature_extractor = bundle
|
| 443 |
-
else:
|
| 444 |
-
self._ml_model = None
|
| 445 |
-
|
| 446 |
-
def estimate(self, features: QueryFeatures) -> ComplexityResult:
|
| 447 |
-
if self._ml_model:
|
| 448 |
-
score = predict(features.raw_query, self._ml_model, self._feature_extractor)
|
| 449 |
-
# ... continue with score
|
| 450 |
-
else:
|
| 451 |
-
# heuristic fallback
|
| 452 |
-
```
|
| 453 |
-
|
| 454 |
-
> **Note:** With only 50 training examples, test R² will be low. Collect 500+ labeled examples from real queries for production use.
|
| 455 |
-
|
| 456 |
---
|
| 457 |
|
| 458 |
## Supported Providers & Models
|
| 459 |
|
|
|
|
|
|
|
| 460 |
| Model | Provider | Input $/1k | Output $/1k | Capability | Best For |
|
| 461 |
|-------|----------|-----------|------------|------------|----------|
|
| 462 |
-
| `gpt-4o` |
|
| 463 |
-
| `gpt-4o-mini` |
|
| 464 |
-
| `
|
| 465 |
-
| `claude-3-5-
|
| 466 |
-
| `
|
| 467 |
-
| `
|
| 468 |
-
| `
|
| 469 |
-
| `
|
| 470 |
-
| `mistral-small-latest` | mistral | $0.001 | $0.003 | 0.686 | European data |
|
| 471 |
-
| `mistral-large-latest` | mistral | $0.003 | $0.009 | 0.852 | EU + quality |
|
| 472 |
-
| `deepseek-chat` | deepseek | $0.00014 | $0.00028 | 0.887 | Best value math/code |
|
| 473 |
-
| `llama3.1:8b` | ollama | FREE | FREE | 0.657 | Private/local |
|
| 474 |
-
| `llama3.1:70b` | ollama | FREE | FREE | 0.823 | Local high-quality |
|
| 475 |
-
|
| 476 |
-
Add or update models in `data/model_registry.json` — no code changes needed.
|
| 477 |
-
|
| 478 |
-
---
|
| 479 |
-
|
| 480 |
-
## Extending LLMOpt
|
| 481 |
|
| 482 |
-
|
| 483 |
|
| 484 |
-
|
| 485 |
-
2. Add the LiteLLM model string to `_LITELLM_MODEL_MAP` in `router/model_router.py`
|
| 486 |
-
3. Add the API key env var to `config/.env.example`
|
| 487 |
-
|
| 488 |
-
### Add a new model to an existing provider
|
| 489 |
-
|
| 490 |
-
Just add an entry to `data/model_registry.json`:
|
| 491 |
-
```json
|
| 492 |
-
{
|
| 493 |
-
"model_name": "gpt-5",
|
| 494 |
-
"provider": "openai",
|
| 495 |
-
"input_cost_per_1k": 0.01,
|
| 496 |
-
"output_cost_per_1k": 0.03,
|
| 497 |
-
"context_window": 256000,
|
| 498 |
-
"reasoning_score": 0.98,
|
| 499 |
-
"coding_score": 0.97,
|
| 500 |
-
"math_score": 0.96,
|
| 501 |
-
"instruction_following_score": 0.98,
|
| 502 |
-
"latency_score": 0.65,
|
| 503 |
-
"max_complexity": 1.0,
|
| 504 |
-
"notes": "Next-gen flagship"
|
| 505 |
-
}
|
| 506 |
-
```
|
| 507 |
-
|
| 508 |
-
### Replace the complexity estimator with an ML model
|
| 509 |
|
| 510 |
-
|
| 511 |
|
| 512 |
-
|
| 513 |
|
| 514 |
-
Add a Redis-backed cache layer before the router:
|
| 515 |
```python
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
|
|
|
| 519 |
```
|
| 520 |
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
# Single layer
|
| 530 |
-
pytest tests/test_pipeline.py::TestOptimizationEngine -v
|
| 531 |
-
```
|
| 532 |
|
| 533 |
-
--
|
|
|
|
|
|
|
|
|
|
| 534 |
|
| 535 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 536 |
|
|
|
|
|
|
|
| 537 |
```
|
| 538 |
-
llmopt/
|
| 539 |
-
├── config/
|
| 540 |
-
│ └── .env.example ← Copy to .env, add API keys
|
| 541 |
-
├── data/
|
| 542 |
-
│ ├── model_registry.json ← Model specs + pricing (edit to add models)
|
| 543 |
-
│ ├── complexity_training_data.json ← Labeled examples for ML model
|
| 544 |
-
│ └── complexity_model.pkl ← Generated by train script
|
| 545 |
-
├── llmopt/
|
| 546 |
-
│ ├── core.py ← Main LLMOpt client (start here)
|
| 547 |
-
│ ├── analyzer/
|
| 548 |
-
│ │ └── query_analyzer.py ← Feature extraction from raw queries
|
| 549 |
-
│ ├── estimator/
|
| 550 |
-
│ │ └── complexity_estimator.py ← C(q) scoring
|
| 551 |
-
│ ├── engine/
|
| 552 |
-
│ │ └── optimization_engine.py ← Model selection + objective fn
|
| 553 |
-
│ ├── optimizer/
|
| 554 |
-
│ │ └── prompt_optimizer.py ← Token compression
|
| 555 |
-
│ ├── router/
|
| 556 |
-
│ │ └── model_router.py ← LiteLLM provider abstraction
|
| 557 |
-
│ ├── registry/
|
| 558 |
-
│ │ └── model_registry.py ← Model registry loader + queries
|
| 559 |
-
│ └── api/
|
| 560 |
-
│ └── app.py ← FastAPI REST API
|
| 561 |
-
├── scripts/
|
| 562 |
-
│ └── train_complexity_model.py ← Train GBR complexity estimator
|
| 563 |
-
├── tests/
|
| 564 |
-
│ └── test_pipeline.py ← 35 unit + integration tests
|
| 565 |
-
├── run.py ← Server entry point
|
| 566 |
-
└── pyproject.toml
|
| 567 |
-
```
|
| 568 |
-
|
| 569 |
-
---
|
| 570 |
-
|
| 571 |
-
## V2 / V3 Roadmap
|
| 572 |
-
|
| 573 |
-
The architecture is designed so each layer can be upgraded independently:
|
| 574 |
-
|
| 575 |
-
| Component | V1 (current) | V2 | V3 |
|
| 576 |
-
|-----------|-------------|----|----|
|
| 577 |
-
| Query Analyzer | Heuristic + regex | DistilBERT classifier | MiniLM embeddings |
|
| 578 |
-
| Complexity Estimator | Weighted scoring | GBR regressor (50+ samples) | Pairwise ranking model |
|
| 579 |
-
| Optimization Engine | Rule-based + objective fn | Bayesian optimization | Contextual bandits |
|
| 580 |
-
| Prompt Optimizer | Whitespace + filler removal | Semantic compression | Learned token pruning |
|
| 581 |
-
| Caching | None | Semantic cache (Redis) | Distributed cache |
|
| 582 |
-
| Evaluation | None | LLM-as-a-judge | Human preference modeling |
|
|
|
|
| 1 |
+
# LLMOpt: The Adaptive Inference Optimization Framework (V2)
|
| 2 |
|
| 3 |
+
> **Intelligent Routing. Minimal Latency. Maximum ROI.**
|
| 4 |
|
| 5 |
+
In the era of sprawling Large Language Models (LLMs), routing every query to a flagship model like GPT-4o or Claude 3.5 Sonnet is financially unsustainable and computationally wasteful.
|
| 6 |
|
| 7 |
+
**LLMOpt** is an enterprise-grade middleware layer that sits between your application and your LLM providers. By dynamically analyzing the semantic complexity of incoming queries, LLMOpt automatically selects the most cost-effective model capable of handling the request, compresses context windows to reduce token waste, and caches responses—all while giving you full observability into its decision-making process.
|
|
|
|
|
|
|
| 8 |
|
| 9 |
+
```text
|
| 10 |
+
Your App → llmopt.generate(query)
|
| 11 |
+
→ [Semantic Cache → NLI Analyze → GBR Estimate → Bayesian Optimize → LLMLingua Compress → Route]
|
| 12 |
+
→ LLM API → Response
|
| 13 |
+
```
|
| 14 |
|
| 15 |
## Table of Contents
|
| 16 |
|
| 17 |
+
- [The V2 Architecture](#the-v2-architecture)
|
| 18 |
+
- [Core ML Components](#core-ml-components)
|
| 19 |
+
- [Graceful Degradation](#graceful-degradation)
|
| 20 |
+
- [Quick Start & Installation](#quick-start--installation)
|
| 21 |
- [Python SDK Usage](#python-sdk-usage)
|
| 22 |
+
- [REST API Integration](#rest-api-integration)
|
|
|
|
|
|
|
| 23 |
- [Supported Providers & Models](#supported-providers--models)
|
| 24 |
+
- [Explainability & Observability](#explainability--observability)
|
|
|
|
| 25 |
|
| 26 |
---
|
| 27 |
|
| 28 |
+
## The V2 Architecture
|
| 29 |
+
|
| 30 |
+
LLMOpt V2 has transitioned from a static, heuristic-based router to a fully **Machine Learning-powered pipeline**. The framework acts as an intelligent funnel, progressively optimizing the request before it ever reaches an LLM provider.
|
| 31 |
+
|
| 32 |
+
```mermaid
|
| 33 |
+
flowchart TD
|
| 34 |
+
A[Incoming Query] --> B(Semantic Cache)
|
| 35 |
+
B -->|Cache Hit| Z[Return Response]
|
| 36 |
+
B -->|Cache Miss| C(Query Analyzer)
|
| 37 |
+
C --> D(Complexity Estimator)
|
| 38 |
+
D --> E(Optimization Engine)
|
| 39 |
+
E --> F(Prompt Optimizer)
|
| 40 |
+
F --> G(Model Router)
|
| 41 |
+
G --> H((LLM Provider))
|
| 42 |
+
H --> I[LLM-as-a-Judge Evaluator]
|
| 43 |
+
I -->|Feedback Loop| E
|
| 44 |
+
I --> Z
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
### Pipeline Stages
|
| 48 |
+
1. **Semantic Cache**: Checks Redis for highly similar past queries.
|
| 49 |
+
2. **Query Analyzer**: Extracts structural features and semantic domains from the prompt.
|
| 50 |
+
3. **Complexity Estimator**: Predicts the cognitive load required to answer the query (0.0 to 1.0).
|
| 51 |
+
4. **Optimization Engine**: Minimizes a cost/quality objective function to pick the perfect model.
|
| 52 |
+
5. **Prompt Optimizer**: Intelligently compresses the prompt to shed unnecessary tokens.
|
| 53 |
+
6. **Model Router**: Dispatches the request via LiteLLM to OpenAI, Anthropic, Google, Ollama, etc.
|
| 54 |
+
7. **Evaluator (Optional)**: Scores the response quality and feeds it back to the optimization engine.
|
| 55 |
|
| 56 |
+
---
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
+
## Core ML Components
|
|
|
|
|
|
|
| 59 |
|
| 60 |
+
The V2 release introduces state-of-the-art machine learning to every layer of the pipeline:
|
| 61 |
+
|
| 62 |
+
### 1. Zero-Shot NLI Query Analyzer
|
| 63 |
+
Instead of relying on brittle regex patterns to determine if a query is asking for "code" or "math," LLMOpt uses the `cross-encoder/nli-distilroberta-base` model from the Hugging Face Hub as a zero-shot classifier. It categorizes query intent on the fly by semantic inference, without requiring a labeled dataset.
|
| 64 |
+
|
| 65 |
+
### 2. Sentence-Transformer Semantic Cache
|
| 66 |
+
Before spending API credits, the framework embeds the incoming query using a lightweight, local `all-MiniLM-L6-v2` model and compares it against a Redis-backed vector store using cosine similarity. If an existing query matches with >95% similarity, the cached response is served at **$0.00 cost** and near-zero latency.
|
| 67 |
+
|
| 68 |
+
### 3. Gradient Boosting Complexity Estimator
|
| 69 |
+
To predict how "hard" a query is, LLMOpt leverages a `scikit-learn` Gradient Boosting Regressor (GBR) trained on hundreds of annotated examples. It accurately scales the required capability threshold, ensuring that "What is Python?" gets routed to a fast, cheap model, while "Implement a distributed Paxos consensus algorithm" gets routed to a flagship reasoning model.
|
| 70 |
+
|
| 71 |
+
### 4. Bayesian Weight Optimization (Optuna)
|
| 72 |
+
The Optimization Engine selects models by minimizing the objective function:
|
| 73 |
+
`J(x) = α·Cost + β·Tokens - γ·Quality`
|
| 74 |
+
Instead of hardcoding `α`, `β`, and `γ`, LLMOpt integrates **Optuna**. Using the quality scores reported by the LLM evaluator, Optuna applies Bayesian optimization to continuously tune these weights toward the best observed trade-off between response quality and cost. This is an empirical search rather than a guarantee of optimality, and it learns only from requests that were actually evaluated (`evaluate=True`).
|
| 75 |
+
|
| 76 |
+
### 5. LLMLingua Semantic Compression
|
| 77 |
+
Large context windows are expensive. LLMOpt integrates Microsoft's LLMLingua-2 to perform semantic token pruning. It identifies and removes non-essential tokens (filler words, redundant context) from the prompt while preserving the core semantic meaning, reducing input costs by up to 40% before the LLM is even called.
|
| 78 |
+
|
| 79 |
+
### 6. LLM-as-a-Judge Evaluation Loop
|
| 80 |
+
When explicitly requested (`evaluate=True`), LLMOpt uses a highly efficient judge model (`gpt-4o-mini`) to score the returned response across Accuracy, Completeness, Clarity, and Conciseness. This score is automatically fed back into the Bayesian Optimizer to improve future routing decisions.
|
| 81 |
|
| 82 |
---
|
| 83 |
|
| 84 |
+
## Graceful Degradation
|
| 85 |
+
|
| 86 |
+
Enterprise systems must be resilient. **LLMOpt is designed to never crash if an ML dependency is missing or unavailable.**
|
| 87 |
+
|
| 88 |
+
If you choose not to install the heavy `[ml]` dependencies (like PyTorch or sentence-transformers), or if your Redis cache goes offline, LLMOpt silently and seamlessly **falls back to its robust V1 heuristic rules**. This ensures that your application continues to route requests efficiently under all circumstances.
|
| 89 |
+
|
| 90 |
+
---
|
| 91 |
+
|
| 92 |
+
## Quick Start & Installation
|
| 93 |
|
| 94 |
### Requirements
|
| 95 |
- Python 3.10+
|
| 96 |
+
- At least one API key (OpenAI, Anthropic, Google, Mistral, DeepSeek) OR a local Ollama instance.
|
| 97 |
|
| 98 |
+
### Installation
|
| 99 |
|
| 100 |
```bash
|
| 101 |
+
# Clone the repository
|
| 102 |
+
git clone <repo_url>
|
| 103 |
+
cd llmopt
|
| 104 |
|
| 105 |
+
# Install with all Machine Learning capabilities (Highly Recommended for V2)
|
| 106 |
pip install -e ".[ml]"
|
| 107 |
|
| 108 |
+
# Install Core only (uses V1 heuristic fallbacks)
|
| 109 |
+
pip install -e .
|
| 110 |
|
| 111 |
+
# Install with Local Model support
|
| 112 |
+
pip install -e ".[ml,local]"
|
| 113 |
```
|
| 114 |
|
| 115 |
+
### Configuration
|
| 116 |
+
Copy the environment template and add your API keys. You only need to provide keys for the providers you intend to use.
|
|
|
|
|
|
|
|
|
|
| 117 |
|
| 118 |
```bash
|
| 119 |
cp config/.env.example config/.env
|
| 120 |
```
|
| 121 |
|
| 122 |
```env
|
|
|
|
|
|
|
| 123 |
OPENAI_API_KEY=sk-...
|
| 124 |
ANTHROPIC_API_KEY=sk-ant-...
|
| 125 |
GEMINI_API_KEY=AIza...
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
OLLAMA_API_BASE=http://localhost:11434
|
| 127 |
|
| 128 |
+
# Needed only for V2 semantic caching (optional: LLMOpt keeps working without the cache if Redis is unavailable)
|
| 129 |
+
REDIS_URL=redis://localhost:6379/0
|
|
|
|
| 130 |
```
|
| 131 |
|
| 132 |
---
|
|
|
|
| 140 |
|
| 141 |
client = LLMOpt()
|
| 142 |
|
| 143 |
+
# The framework handles analysis, optimization, and routing automatically
|
| 144 |
result = client.generate(
|
| 145 |
query="Explain the difference between TCP and UDP",
|
| 146 |
+
budget_mode="balanced" # Options: "cheap" | "balanced" | "quality"
|
| 147 |
)
|
| 148 |
|
| 149 |
print(result.response)
|
| 150 |
+
print(f"Model used : {result.model_used}")
|
| 151 |
+
print(f"Cost : ${result.estimated_cost:.6f}")
|
| 152 |
print(f"Tokens saved: {result.tokens_saved}")
|
| 153 |
```
|
| 154 |
|
| 155 |
+
### Advanced Constraints & Evaluation
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
|
| 157 |
```python
|
| 158 |
result = client.generate(
|
| 159 |
+
query="Design a highly available distributed rate limiter.",
|
| 160 |
+
budget_mode="quality",
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
+
# Hard cap — never spend more than this per request (USD)
|
| 163 |
+
max_cost_per_request=0.01,
|
| 164 |
|
| 165 |
# Provider filtering
|
| 166 |
+
exclude_providers=["openai"],
|
| 167 |
+
only_providers=["anthropic", "google"],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
+
# Opt-in to the LLM-as-a-judge feedback loop
|
| 170 |
+
evaluate=True,
|
| 171 |
|
| 172 |
+
# dry_run=True → runs full optimization pipeline but skips the actual API call
|
| 173 |
dry_run=False,
|
| 174 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
|
| 176 |
+
if result.evaluation:
|
| 177 |
+
print(f"Quality Score: {result.evaluation.overall}/10")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
```
|
| 179 |
|
| 180 |
---
|
| 181 |
|
| 182 |
+
## REST API Integration
|
| 183 |
|
| 184 |
+
LLMOpt includes a built-in FastAPI server for easy integration into non-Python architectures.
|
| 185 |
|
| 186 |
+
### Start the server
|
| 187 |
```bash
|
| 188 |
+
python run.py --host 0.0.0.0 --port 8000
|
|
|
|
|
|
|
| 189 |
```
|
| 190 |
|
| 191 |
+
### `POST /generate`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
```bash
|
| 193 |
curl -X POST http://localhost:8000/generate \
|
| 194 |
-H "Content-Type: application/json" \
|
| 195 |
-d '{
|
| 196 |
+
"query": "Write a recursive Fibonacci function in Rust",
|
| 197 |
+
"budget_mode": "balanced",
|
| 198 |
+
"evaluate": true
|
| 199 |
}'
|
| 200 |
```
|
| 201 |
|
| 202 |
+
**Response payload includes deep insights into the optimization process:**
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
```json
|
| 204 |
{
|
| 205 |
+
"response": "Here is the Rust implementation...",
|
| 206 |
+
"model_used": "claude-3-5-haiku-20241022",
|
| 207 |
+
"provider": "anthropic",
|
| 208 |
+
"input_tokens": 105,
|
| 209 |
+
"output_tokens": 342,
|
| 210 |
+
"total_tokens": 447,
|
| 211 |
+
"estimated_cost": 0.001452,
|
| 212 |
+
"tokens_saved": 28,
|
| 213 |
+
"compression_ratio": 0.21,
|
| 214 |
+
"complexity_score": 0.62,
|
| 215 |
+
"complexity_tier": "hard",
|
| 216 |
+
"latency_ms": 1140,
|
| 217 |
+
"evaluation": {
|
| 218 |
+
"overall": 9.5,
|
| 219 |
+
"accuracy": 10.0,
|
| 220 |
+
"feedback": "The code is idiomatic and correctly implements recursion."
|
| 221 |
+
}
|
| 222 |
}
|
| 223 |
```
|
| 224 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
---
|
| 226 |
|
| 227 |
## Supported Providers & Models
|
| 228 |
|
| 229 |
+
The routing engine dynamically compares models across providers based on their unified capability scores and per-token pricing. Add or update models simply by modifying `data/model_registry.json`.
|
| 230 |
+
|
| 231 |
| Model | Provider | Input $/1k | Output $/1k | Capability | Best For |
|
| 232 |
|-------|----------|-----------|------------|------------|----------|
|
| 233 |
+
| `gpt-4o` | OpenAI | $0.0025 | $0.010 | 0.930 | Complex reasoning |
|
| 234 |
+
| `gpt-4o-mini` | OpenAI | $0.00015 | $0.0006 | 0.784 | Balanced tasks |
|
| 235 |
+
| `claude-3-5-sonnet-20241022` | Anthropic | $0.003 | $0.015 | 0.934 | Coding, analysis |
|
| 236 |
+
| `claude-3-5-haiku-20241022` | Anthropic | $0.0008 | $0.004 | 0.794 | Fast tasks |
|
| 237 |
+
| `gemini-1.5-flash` | Google | $0.000075 | $0.0003 | 0.742 | Cheapest cloud |
|
| 238 |
+
| `mistral-large-latest` | Mistral | $0.003 | $0.009 | 0.852 | EU + quality |
|
| 239 |
+
| `deepseek-chat` | DeepSeek | $0.00014 | $0.00028 | 0.887 | Best value math/code |
|
| 240 |
+
| `llama3.1:70b` | Ollama | FREE | FREE | 0.823 | Local high-quality |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
|
| 242 |
+
*(See the registry file for the complete list of supported models.)*
|
| 243 |
|
| 244 |
+
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
|
| 246 |
+
## Explainability & Observability
|
| 247 |
|
| 248 |
+
Unlike black-box routing systems, LLMOpt is fully transparent. You can ask the framework to explain exactly why it chose a specific model for a given query without making a paid API call (use `client.explain()`, `dry_run=True`, or the `/explain` endpoint).
|
| 249 |
|
|
|
|
| 250 |
```python
|
| 251 |
+
explanation = client.explain(
|
| 252 |
+
query="What is the capital of France?",
|
| 253 |
+
budget_mode="cheap"
|
| 254 |
+
)
|
| 255 |
```
|
| 256 |
|
| 257 |
+
**Explanation Output:**
|
| 258 |
+
```text
|
| 259 |
+
=======================================================
|
| 260 |
+
LLMOpt Decision Explanation
|
| 261 |
+
=======================================================
|
| 262 |
+
Query complexity : 0.050 (trivial)
|
| 263 |
+
Primary domain : factual
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
|
| 265 |
+
Selected model : gemini-1.5-flash (google)
|
| 266 |
+
Fallback model : gpt-4o-mini
|
| 267 |
+
Compression : yes
|
| 268 |
+
System prompt : minimal
|
| 269 |
|
| 270 |
+
Scoring rationale:
|
| 271 |
+
• model=gemini-1.5-flash
|
| 272 |
+
• capability=0.742
|
| 273 |
+
• cost_norm=0.0042
|
| 274 |
+
• J=-0.124 (α=0.6,β=0.3,γ=0.1)
|
| 275 |
|
| 276 |
+
Cost saved : $0.009850 vs GPT-4o baseline
|
| 277 |
+
=======================================================
|
| 278 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/complexity_training_data.json
CHANGED
|
@@ -1,52 +1,3053 @@
|
|
| 1 |
[
|
| 2 |
-
{
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
{
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
{
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
{
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
{
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
{
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
{
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
{
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
{
|
| 51 |
-
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
[
|
| 2 |
+
{
|
| 3 |
+
"query": "What is Python?",
|
| 4 |
+
"complexity": 0.05,
|
| 5 |
+
"tier": "trivial",
|
| 6 |
+
"domain": "factual"
|
| 7 |
+
},
|
| 8 |
+
{
|
| 9 |
+
"query": "What is the capital of France?",
|
| 10 |
+
"complexity": 0.04,
|
| 11 |
+
"tier": "trivial",
|
| 12 |
+
"domain": "factual"
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"query": "Who invented the telephone?",
|
| 16 |
+
"complexity": 0.05,
|
| 17 |
+
"tier": "trivial",
|
| 18 |
+
"domain": "factual"
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"query": "What does HTTP stand for?",
|
| 22 |
+
"complexity": 0.06,
|
| 23 |
+
"tier": "trivial",
|
| 24 |
+
"domain": "factual"
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"query": "Translate 'hello' to Spanish",
|
| 28 |
+
"complexity": 0.07,
|
| 29 |
+
"tier": "trivial",
|
| 30 |
+
"domain": "translation"
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"query": "What year was Python created?",
|
| 34 |
+
"complexity": 0.04,
|
| 35 |
+
"tier": "trivial",
|
| 36 |
+
"domain": "factual"
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"query": "Is Java object-oriented?",
|
| 40 |
+
"complexity": 0.07,
|
| 41 |
+
"tier": "trivial",
|
| 42 |
+
"domain": "factual"
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"query": "What is RAM?",
|
| 46 |
+
"complexity": 0.05,
|
| 47 |
+
"tier": "trivial",
|
| 48 |
+
"domain": "factual"
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"query": "Summarize this paragraph: The quick brown fox...",
|
| 52 |
+
"complexity": 0.18,
|
| 53 |
+
"tier": "easy",
|
| 54 |
+
"domain": "summarization"
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"query": "Explain what a variable is in programming",
|
| 58 |
+
"complexity": 0.15,
|
| 59 |
+
"tier": "easy",
|
| 60 |
+
"domain": "code"
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"query": "What is the difference between a list and a tuple in Python?",
|
| 64 |
+
"complexity": 0.22,
|
| 65 |
+
"tier": "easy",
|
| 66 |
+
"domain": "code"
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"query": "Write a simple hello world in JavaScript",
|
| 70 |
+
"complexity": 0.2,
|
| 71 |
+
"tier": "easy",
|
| 72 |
+
"domain": "code"
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"query": "What is recursion? Give a simple example",
|
| 76 |
+
"complexity": 0.25,
|
| 77 |
+
"tier": "easy",
|
| 78 |
+
"domain": "code"
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"query": "Translate this paragraph to French",
|
| 82 |
+
"complexity": 0.2,
|
| 83 |
+
"tier": "easy",
|
| 84 |
+
"domain": "translation"
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"query": "Summarize the key points of agile methodology",
|
| 88 |
+
"complexity": 0.25,
|
| 89 |
+
"tier": "easy",
|
| 90 |
+
"domain": "summarization"
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"query": "Explain binary search with a code example",
|
| 94 |
+
"complexity": 0.4,
|
| 95 |
+
"tier": "medium",
|
| 96 |
+
"domain": "code"
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"query": "Write a Python function to check if a number is prime",
|
| 100 |
+
"complexity": 0.38,
|
| 101 |
+
"tier": "medium",
|
| 102 |
+
"domain": "code"
|
| 103 |
+
},
|
| 104 |
+
{
|
| 105 |
+
"query": "Compare REST and GraphQL APIs",
|
| 106 |
+
"complexity": 0.42,
|
| 107 |
+
"tier": "medium",
|
| 108 |
+
"domain": "reasoning"
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"query": "Explain the CAP theorem in distributed systems",
|
| 112 |
+
"complexity": 0.48,
|
| 113 |
+
"tier": "medium",
|
| 114 |
+
"domain": "reasoning"
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"query": "Write SQL to find duplicate rows in a table",
|
| 118 |
+
"complexity": 0.4,
|
| 119 |
+
"tier": "medium",
|
| 120 |
+
"domain": "code"
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"query": "Explain gradient descent step by step",
|
| 124 |
+
"complexity": 0.5,
|
| 125 |
+
"tier": "medium",
|
| 126 |
+
"domain": "math"
|
| 127 |
+
},
|
| 128 |
+
{
|
| 129 |
+
"query": "What is the time complexity of quicksort? Explain with an example",
|
| 130 |
+
"complexity": 0.45,
|
| 131 |
+
"tier": "medium",
|
| 132 |
+
"domain": "code"
|
| 133 |
+
},
|
| 134 |
+
{
|
| 135 |
+
"query": "Design a rate limiter for an API",
|
| 136 |
+
"complexity": 0.62,
|
| 137 |
+
"tier": "hard",
|
| 138 |
+
"domain": "code"
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"query": "Implement a LRU cache in Python",
|
| 142 |
+
"complexity": 0.6,
|
| 143 |
+
"tier": "hard",
|
| 144 |
+
"domain": "code"
|
| 145 |
+
},
|
| 146 |
+
{
|
| 147 |
+
"query": "Explain the Transformer architecture in detail",
|
| 148 |
+
"complexity": 0.7,
|
| 149 |
+
"tier": "hard",
|
| 150 |
+
"domain": "science"
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"query": "Write a comprehensive tutorial on Docker and Kubernetes",
|
| 154 |
+
"complexity": 0.68,
|
| 155 |
+
"tier": "hard",
|
| 156 |
+
"domain": "code"
|
| 157 |
+
},
|
| 158 |
+
{
|
| 159 |
+
"query": "Analyze the pros and cons of microservices vs monoliths",
|
| 160 |
+
"complexity": 0.65,
|
| 161 |
+
"tier": "hard",
|
| 162 |
+
"domain": "reasoning"
|
| 163 |
+
},
|
| 164 |
+
{
|
| 165 |
+
"query": "Derive the backpropagation equations from first principles",
|
| 166 |
+
"complexity": 0.8,
|
| 167 |
+
"tier": "hard",
|
| 168 |
+
"domain": "math"
|
| 169 |
+
},
|
| 170 |
+
{
|
| 171 |
+
"query": "Design the Paxos consensus algorithm",
|
| 172 |
+
"complexity": 0.92,
|
| 173 |
+
"tier": "expert",
|
| 174 |
+
"domain": "reasoning"
|
| 175 |
+
},
|
| 176 |
+
{
|
| 177 |
+
"query": "Prove that P ≠ NP (or outline the key open problems)",
|
| 178 |
+
"complexity": 0.98,
|
| 179 |
+
"tier": "expert",
|
| 180 |
+
"domain": "math"
|
| 181 |
+
},
|
| 182 |
+
{
|
| 183 |
+
"query": "Design a distributed SQL database from scratch",
|
| 184 |
+
"complexity": 0.95,
|
| 185 |
+
"tier": "expert",
|
| 186 |
+
"domain": "code"
|
| 187 |
+
},
|
| 188 |
+
{
|
| 189 |
+
"query": "Implement a compiler for a simple language in Python",
|
| 190 |
+
"complexity": 0.9,
|
| 191 |
+
"tier": "expert",
|
| 192 |
+
"domain": "code"
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"query": "Explain quantum entanglement and Bell's inequality with math",
|
| 196 |
+
"complexity": 0.88,
|
| 197 |
+
"tier": "expert",
|
| 198 |
+
"domain": "science"
|
| 199 |
+
},
|
| 200 |
+
{
|
| 201 |
+
"query": "Write a full-stack web app with React and FastAPI",
|
| 202 |
+
"complexity": 0.85,
|
| 203 |
+
"tier": "expert",
|
| 204 |
+
"domain": "code"
|
| 205 |
+
},
|
| 206 |
+
{
|
| 207 |
+
"query": "Analyze the ethical implications of AI in healthcare",
|
| 208 |
+
"complexity": 0.72,
|
| 209 |
+
"tier": "hard",
|
| 210 |
+
"domain": "reasoning"
|
| 211 |
+
},
|
| 212 |
+
{
|
| 213 |
+
"query": "Compare BERT and GPT architectures in depth",
|
| 214 |
+
"complexity": 0.75,
|
| 215 |
+
"tier": "hard",
|
| 216 |
+
"domain": "science"
|
| 217 |
+
},
|
| 218 |
+
{
|
| 219 |
+
"query": "What is async/await in Python?",
|
| 220 |
+
"complexity": 0.28,
|
| 221 |
+
"tier": "easy",
|
| 222 |
+
"domain": "code"
|
| 223 |
+
},
|
| 224 |
+
{
|
| 225 |
+
"query": "Explain SOLID principles with examples",
|
| 226 |
+
"complexity": 0.45,
|
| 227 |
+
"tier": "medium",
|
| 228 |
+
"domain": "code"
|
| 229 |
+
},
|
| 230 |
+
{
|
| 231 |
+
"query": "Write a regex to validate email addresses",
|
| 232 |
+
"complexity": 0.3,
|
| 233 |
+
"tier": "easy",
|
| 234 |
+
"domain": "code"
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"query": "Design a URL shortener system like bit.ly",
|
| 238 |
+
"complexity": 0.7,
|
| 239 |
+
"tier": "hard",
|
| 240 |
+
"domain": "code"
|
| 241 |
+
},
|
| 242 |
+
{
|
| 243 |
+
"query": "Implement a red-black tree in C++",
|
| 244 |
+
"complexity": 0.8,
|
| 245 |
+
"tier": "hard",
|
| 246 |
+
"domain": "code"
|
| 247 |
+
},
|
| 248 |
+
{
|
| 249 |
+
"query": "Explain Bayesian inference with an example",
|
| 250 |
+
"complexity": 0.65,
|
| 251 |
+
"tier": "hard",
|
| 252 |
+
"domain": "math"
|
| 253 |
+
},
|
| 254 |
+
{
|
| 255 |
+
"query": "Write a neural network from scratch in numpy",
|
| 256 |
+
"complexity": 0.82,
|
| 257 |
+
"tier": "expert",
|
| 258 |
+
"domain": "code"
|
| 259 |
+
},
|
| 260 |
+
{
|
| 261 |
+
"query": "Translate this legal document to Spanish",
|
| 262 |
+
"complexity": 0.35,
|
| 263 |
+
"tier": "medium",
|
| 264 |
+
"domain": "translation"
|
| 265 |
+
},
|
| 266 |
+
{
|
| 267 |
+
"query": "Summarize this 50-page research paper",
|
| 268 |
+
"complexity": 0.45,
|
| 269 |
+
"tier": "medium",
|
| 270 |
+
"domain": "summarization"
|
| 271 |
+
},
|
| 272 |
+
{
|
| 273 |
+
"query": "Debate the pros and cons of nuclear energy",
|
| 274 |
+
"complexity": 0.6,
|
| 275 |
+
"tier": "hard",
|
| 276 |
+
"domain": "reasoning"
|
| 277 |
+
},
|
| 278 |
+
{
|
| 279 |
+
"query": "Write a creative short story about time travel",
|
| 280 |
+
"complexity": 0.42,
|
| 281 |
+
"tier": "medium",
|
| 282 |
+
"domain": "creative"
|
| 283 |
+
},
|
| 284 |
+
{
|
| 285 |
+
"query": "Explain what a closure is in JavaScript",
|
| 286 |
+
"complexity": 0.3,
|
| 287 |
+
"tier": "easy",
|
| 288 |
+
"domain": "code"
|
| 289 |
+
},
|
| 290 |
+
{
|
| 291 |
+
"query": "What is the difference between TCP and UDP?",
|
| 292 |
+
"complexity": 0.22,
|
| 293 |
+
"tier": "easy",
|
| 294 |
+
"domain": "factual"
|
| 295 |
+
},
|
| 296 |
+
{
|
| 297 |
+
"query": "Prove the Pythagorean theorem",
|
| 298 |
+
"complexity": 0.55,
|
| 299 |
+
"tier": "medium",
|
| 300 |
+
"domain": "math"
|
| 301 |
+
},
|
| 302 |
+
{
|
| 303 |
+
"query": "What is Python?",
|
| 304 |
+
"complexity": 0.05,
|
| 305 |
+
"tier": "trivial",
|
| 306 |
+
"domain": "factual",
|
| 307 |
+
"model": "gpt-3.5-turbo"
|
| 308 |
+
},
|
| 309 |
+
{
|
| 310 |
+
"query": "What is the capital of France?",
|
| 311 |
+
"complexity": 0.04,
|
| 312 |
+
"tier": "trivial",
|
| 313 |
+
"domain": "factual",
|
| 314 |
+
"model": "claude-3-haiku-20240307"
|
| 315 |
+
},
|
| 316 |
+
{
|
| 317 |
+
"query": "Who invented the telephone?",
|
| 318 |
+
"complexity": 0.05,
|
| 319 |
+
"tier": "trivial",
|
| 320 |
+
"domain": "factual",
|
| 321 |
+
"model": "llama3.2:3b"
|
| 322 |
+
},
|
| 323 |
+
{
|
| 324 |
+
"query": "What does HTTP stand for?",
|
| 325 |
+
"complexity": 0.06,
|
| 326 |
+
"tier": "trivial",
|
| 327 |
+
"domain": "factual",
|
| 328 |
+
"model": "gemini-1.5-flash"
|
| 329 |
+
},
|
| 330 |
+
{
|
| 331 |
+
"query": "Translate 'hello' to Spanish",
|
| 332 |
+
"complexity": 0.07,
|
| 333 |
+
"tier": "trivial",
|
| 334 |
+
"domain": "translation",
|
| 335 |
+
"model": "mistral-small-latest"
|
| 336 |
+
},
|
| 337 |
+
{
|
| 338 |
+
"query": "What year was Python created?",
|
| 339 |
+
"complexity": 0.04,
|
| 340 |
+
"tier": "trivial",
|
| 341 |
+
"domain": "factual",
|
| 342 |
+
"model": "gpt-4o-mini"
|
| 343 |
+
},
|
| 344 |
+
{
|
| 345 |
+
"query": "Is Java object-oriented?",
|
| 346 |
+
"complexity": 0.07,
|
| 347 |
+
"tier": "trivial",
|
| 348 |
+
"domain": "factual",
|
| 349 |
+
"model": "llama3.1:8b"
|
| 350 |
+
},
|
| 351 |
+
{
|
| 352 |
+
"query": "What is RAM?",
|
| 353 |
+
"complexity": 0.05,
|
| 354 |
+
"tier": "trivial",
|
| 355 |
+
"domain": "factual",
|
| 356 |
+
"model": "claude-3-5-haiku-20241022"
|
| 357 |
+
},
|
| 358 |
+
{
|
| 359 |
+
"query": "What is 2+2?",
|
| 360 |
+
"complexity": 0.02,
|
| 361 |
+
"tier": "trivial",
|
| 362 |
+
"domain": "math",
|
| 363 |
+
"model": "gpt-3.5-turbo"
|
| 364 |
+
},
|
| 365 |
+
{
|
| 366 |
+
"query": "Name one planet in our solar system",
|
| 367 |
+
"complexity": 0.03,
|
| 368 |
+
"tier": "trivial",
|
| 369 |
+
"domain": "science",
|
| 370 |
+
"model": "llama3.2:3b"
|
| 371 |
+
},
|
| 372 |
+
{
|
| 373 |
+
"query": "Summarize this: 'The quick brown fox jumps over the lazy dog.'",
|
| 374 |
+
"complexity": 0.18,
|
| 375 |
+
"tier": "easy",
|
| 376 |
+
"domain": "summarization",
|
| 377 |
+
"model": "gpt-4o-mini"
|
| 378 |
+
},
|
| 379 |
+
{
|
| 380 |
+
"query": "Explain what a variable is in programming",
|
| 381 |
+
"complexity": 0.15,
|
| 382 |
+
"tier": "easy",
|
| 383 |
+
"domain": "code",
|
| 384 |
+
"model": "claude-3-haiku-20240307"
|
| 385 |
+
},
|
| 386 |
+
{
|
| 387 |
+
"query": "What is the difference between a list and a tuple in Python?",
|
| 388 |
+
"complexity": 0.22,
|
| 389 |
+
"tier": "easy",
|
| 390 |
+
"domain": "code",
|
| 391 |
+
"model": "gemini-1.5-flash"
|
| 392 |
+
},
|
| 393 |
+
{
|
| 394 |
+
"query": "Write a simple hello world in JavaScript",
|
| 395 |
+
"complexity": 0.2,
|
| 396 |
+
"tier": "easy",
|
| 397 |
+
"domain": "code",
|
| 398 |
+
"model": "mistral-small-latest"
|
| 399 |
+
},
|
| 400 |
+
{
|
| 401 |
+
"query": "What is recursion? Give a simple example",
|
| 402 |
+
"complexity": 0.25,
|
| 403 |
+
"tier": "easy",
|
| 404 |
+
"domain": "code",
|
| 405 |
+
"model": "llama3.1:8b"
|
| 406 |
+
},
|
| 407 |
+
{
|
| 408 |
+
"query": "Translate 'Good morning, how are you?' to French",
|
| 409 |
+
"complexity": 0.2,
|
| 410 |
+
"tier": "easy",
|
| 411 |
+
"domain": "translation",
|
| 412 |
+
"model": "gpt-4o-mini"
|
| 413 |
+
},
|
| 414 |
+
{
|
| 415 |
+
"query": "Summarize the key points of agile methodology",
|
| 416 |
+
"complexity": 0.25,
|
| 417 |
+
"tier": "easy",
|
| 418 |
+
"domain": "summarization",
|
| 419 |
+
"model": "claude-3-5-haiku-20241022"
|
| 420 |
+
},
|
| 421 |
+
{
|
| 422 |
+
"query": "Solve for x: 2x + 5 = 15",
|
| 423 |
+
"complexity": 0.12,
|
| 424 |
+
"tier": "easy",
|
| 425 |
+
"domain": "math",
|
| 426 |
+
"model": "gpt-3.5-turbo"
|
| 427 |
+
},
|
| 428 |
+
{
|
| 429 |
+
"query": "What is photosynthesis? Explain briefly",
|
| 430 |
+
"complexity": 0.17,
|
| 431 |
+
"tier": "easy",
|
| 432 |
+
"domain": "science",
|
| 433 |
+
"model": "gemini-1.5-flash"
|
| 434 |
+
},
|
| 435 |
+
{
|
| 436 |
+
"query": "Write a haiku about spring",
|
| 437 |
+
"complexity": 0.21,
|
| 438 |
+
"tier": "easy",
|
| 439 |
+
"domain": "creative",
|
| 440 |
+
"model": "llama3.1:70b"
|
| 441 |
+
},
|
| 442 |
+
{
|
| 443 |
+
"query": "Why is the sky blue?",
|
| 444 |
+
"complexity": 0.18,
|
| 445 |
+
"tier": "easy",
|
| 446 |
+
"domain": "reasoning",
|
| 447 |
+
"model": "mistral-large-latest"
|
| 448 |
+
},
|
| 449 |
+
{
|
| 450 |
+
"query": "Explain binary search with a code example",
|
| 451 |
+
"complexity": 0.4,
|
| 452 |
+
"tier": "medium",
|
| 453 |
+
"domain": "code",
|
| 454 |
+
"model": "deepseek-chat"
|
| 455 |
+
},
|
| 456 |
+
{
|
| 457 |
+
"query": "Write a Python function to check if a number is prime",
|
| 458 |
+
"complexity": 0.38,
|
| 459 |
+
"tier": "medium",
|
| 460 |
+
"domain": "code",
|
| 461 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 462 |
+
},
|
| 463 |
+
{
|
| 464 |
+
"query": "Compare REST and GraphQL APIs",
|
| 465 |
+
"complexity": 0.42,
|
| 466 |
+
"tier": "medium",
|
| 467 |
+
"domain": "reasoning",
|
| 468 |
+
"model": "gpt-4o"
|
| 469 |
+
},
|
| 470 |
+
{
|
| 471 |
+
"query": "Explain the CAP theorem in distributed systems",
|
| 472 |
+
"complexity": 0.48,
|
| 473 |
+
"tier": "medium",
|
| 474 |
+
"domain": "reasoning",
|
| 475 |
+
"model": "gemini-1.5-pro"
|
| 476 |
+
},
|
| 477 |
+
{
|
| 478 |
+
"query": "Write SQL to find duplicate rows in a table",
|
| 479 |
+
"complexity": 0.4,
|
| 480 |
+
"tier": "medium",
|
| 481 |
+
"domain": "code",
|
| 482 |
+
"model": "llama3.1:70b"
|
| 483 |
+
},
|
| 484 |
+
{
|
| 485 |
+
"query": "Explain gradient descent step by step",
|
| 486 |
+
"complexity": 0.5,
|
| 487 |
+
"tier": "medium",
|
| 488 |
+
"domain": "math",
|
| 489 |
+
"model": "deepseek-chat"
|
| 490 |
+
},
|
| 491 |
+
{
|
| 492 |
+
"query": "What is the time complexity of quicksort? Explain with an example",
|
| 493 |
+
"complexity": 0.45,
|
| 494 |
+
"tier": "medium",
|
| 495 |
+
"domain": "code",
|
| 496 |
+
"model": "gpt-4o"
|
| 497 |
+
},
|
| 498 |
+
{
|
| 499 |
+
"query": "Translate this technical document abstract to German",
|
| 500 |
+
"complexity": 0.35,
|
| 501 |
+
"tier": "medium",
|
| 502 |
+
"domain": "translation",
|
| 503 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 504 |
+
},
|
| 505 |
+
{
|
| 506 |
+
"query": "Summarize the plot of '1984' by George Orwell",
|
| 507 |
+
"complexity": 0.37,
|
| 508 |
+
"tier": "medium",
|
| 509 |
+
"domain": "summarization",
|
| 510 |
+
"model": "mistral-large-latest"
|
| 511 |
+
},
|
| 512 |
+
{
|
| 513 |
+
"query": "Write a short story about a robot learning to dream",
|
| 514 |
+
"complexity": 0.44,
|
| 515 |
+
"tier": "medium",
|
| 516 |
+
"domain": "creative",
|
| 517 |
+
"model": "gpt-4o"
|
| 518 |
+
},
|
| 519 |
+
{
|
| 520 |
+
"query": "Explain the theory of relativity in simple terms",
|
| 521 |
+
"complexity": 0.46,
|
| 522 |
+
"tier": "medium",
|
| 523 |
+
"domain": "science",
|
| 524 |
+
"model": "gemini-1.5-pro"
|
| 525 |
+
},
|
| 526 |
+
{
|
| 527 |
+
"query": "What is the difference between supervised and unsupervised learning?",
|
| 528 |
+
"complexity": 0.36,
|
| 529 |
+
"tier": "medium",
|
| 530 |
+
"domain": "factual",
|
| 531 |
+
"model": "claude-3-5-haiku-20241022"
|
| 532 |
+
},
|
| 533 |
+
{
|
| 534 |
+
"query": "Solve the quadratic equation: x^2 - 5x + 6 = 0",
|
| 535 |
+
"complexity": 0.32,
|
| 536 |
+
"tier": "medium",
|
| 537 |
+
"domain": "math",
|
| 538 |
+
"model": "llama3.1:70b"
|
| 539 |
+
},
|
| 540 |
+
{
|
| 541 |
+
"query": "Design a rate limiter for an API",
|
| 542 |
+
"complexity": 0.62,
|
| 543 |
+
"tier": "hard",
|
| 544 |
+
"domain": "code",
|
| 545 |
+
"model": "gpt-4o"
|
| 546 |
+
},
|
| 547 |
+
{
|
| 548 |
+
"query": "Implement an LRU cache in Python",
|
| 549 |
+
"complexity": 0.6,
|
| 550 |
+
"tier": "hard",
|
| 551 |
+
"domain": "code",
|
| 552 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 553 |
+
},
|
| 554 |
+
{
|
| 555 |
+
"query": "Explain the Transformer architecture in detail",
|
| 556 |
+
"complexity": 0.7,
|
| 557 |
+
"tier": "hard",
|
| 558 |
+
"domain": "science",
|
| 559 |
+
"model": "deepseek-chat"
|
| 560 |
+
},
|
| 561 |
+
{
|
| 562 |
+
"query": "Write a comprehensive tutorial on Docker and Kubernetes",
|
| 563 |
+
"complexity": 0.68,
|
| 564 |
+
"tier": "hard",
|
| 565 |
+
"domain": "code",
|
| 566 |
+
"model": "gemini-1.5-pro"
|
| 567 |
+
},
|
| 568 |
+
{
|
| 569 |
+
"query": "Analyze the pros and cons of microservices vs monoliths",
|
| 570 |
+
"complexity": 0.65,
|
| 571 |
+
"tier": "hard",
|
| 572 |
+
"domain": "reasoning",
|
| 573 |
+
"model": "gpt-4o"
|
| 574 |
+
},
|
| 575 |
+
{
|
| 576 |
+
"query": "Derive the backpropagation equations from first principles",
|
| 577 |
+
"complexity": 0.8,
|
| 578 |
+
"tier": "expert",
|
| 579 |
+
"domain": "math",
|
| 580 |
+
"model": "deepseek-chat"
|
| 581 |
+
},
|
| 582 |
+
{
|
| 583 |
+
"query": "Prove that the square root of 2 is irrational",
|
| 584 |
+
"complexity": 0.58,
|
| 585 |
+
"tier": "hard",
|
| 586 |
+
"domain": "math",
|
| 587 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 588 |
+
},
|
| 589 |
+
{
|
| 590 |
+
"query": "Translate this legal contract summary to Japanese",
|
| 591 |
+
"complexity": 0.55,
|
| 592 |
+
"tier": "hard",
|
| 593 |
+
"domain": "translation",
|
| 594 |
+
"model": "gpt-4o"
|
| 595 |
+
},
|
| 596 |
+
{
|
| 597 |
+
"query": "Summarize the latest advances in quantum computing (2023-2024)",
|
| 598 |
+
"complexity": 0.66,
|
| 599 |
+
"tier": "hard",
|
| 600 |
+
"domain": "summarization",
|
| 601 |
+
"model": "gemini-1.5-pro"
|
| 602 |
+
},
|
| 603 |
+
{
|
| 604 |
+
"query": "Write a poem in the style of Edgar Allan Poe about AI",
|
| 605 |
+
"complexity": 0.63,
|
| 606 |
+
"tier": "hard",
|
| 607 |
+
"domain": "creative",
|
| 608 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 609 |
+
},
|
| 610 |
+
{
|
| 611 |
+
"query": "Explain the ethics of autonomous weapons systems",
|
| 612 |
+
"complexity": 0.67,
|
| 613 |
+
"tier": "hard",
|
| 614 |
+
"domain": "reasoning",
|
| 615 |
+
"model": "gpt-4o"
|
| 616 |
+
},
|
| 617 |
+
{
|
| 618 |
+
"query": "What is the role of the Golgi apparatus in cells?",
|
| 619 |
+
"complexity": 0.42,
|
| 620 |
+
"tier": "medium",
|
| 621 |
+
"domain": "science",
|
| 622 |
+
"model": "mistral-large-latest"
|
| 623 |
+
},
|
| 624 |
+
{
|
| 625 |
+
"query": "Write a script to scrape a website and extract all links",
|
| 626 |
+
"complexity": 0.52,
|
| 627 |
+
"tier": "medium",
|
| 628 |
+
"domain": "code",
|
| 629 |
+
"model": "deepseek-chat"
|
| 630 |
+
},
|
| 631 |
+
{
|
| 632 |
+
"query": "Explain the concept of idempotency in REST APIs",
|
| 633 |
+
"complexity": 0.39,
|
| 634 |
+
"tier": "medium",
|
| 635 |
+
"domain": "code",
|
| 636 |
+
"model": "gpt-4o-mini"
|
| 637 |
+
},
|
| 638 |
+
{
|
| 639 |
+
"query": "Describe the water cycle for a 10-year-old",
|
| 640 |
+
"complexity": 0.19,
|
| 641 |
+
"tier": "easy",
|
| 642 |
+
"domain": "science",
|
| 643 |
+
"model": "llama3.2:3b"
|
| 644 |
+
},
|
| 645 |
+
{
|
| 646 |
+
"query": "What is a neural network? Give a simple analogy",
|
| 647 |
+
"complexity": 0.24,
|
| 648 |
+
"tier": "easy",
|
| 649 |
+
"domain": "science",
|
| 650 |
+
"model": "claude-3-haiku-20240307"
|
| 651 |
+
},
|
| 652 |
+
{
|
| 653 |
+
"query": "Solve: If a train travels 60 mph for 2 hours, how far does it go?",
|
| 654 |
+
"complexity": 0.08,
|
| 655 |
+
"tier": "trivial",
|
| 656 |
+
"domain": "math",
|
| 657 |
+
"model": "gpt-3.5-turbo"
|
| 658 |
+
},
|
| 659 |
+
{
|
| 660 |
+
"query": "Name three data types in Python",
|
| 661 |
+
"complexity": 0.06,
|
| 662 |
+
"tier": "trivial",
|
| 663 |
+
"domain": "code",
|
| 664 |
+
"model": "llama3.2:3b"
|
| 665 |
+
},
|
| 666 |
+
{
|
| 667 |
+
"query": "Translate 'Thank you very much' to Italian",
|
| 668 |
+
"complexity": 0.07,
|
| 669 |
+
"tier": "trivial",
|
| 670 |
+
"domain": "translation",
|
| 671 |
+
"model": "gemini-1.5-flash"
|
| 672 |
+
},
|
| 673 |
+
{
|
| 674 |
+
"query": "What is the main ingredient in guacamole?",
|
| 675 |
+
"complexity": 0.03,
|
| 676 |
+
"tier": "trivial",
|
| 677 |
+
"domain": "factual",
|
| 678 |
+
"model": "mistral-small-latest"
|
| 679 |
+
},
|
| 680 |
+
{
|
| 681 |
+
"query": "Write a for loop that prints numbers 1 to 10 in Python",
|
| 682 |
+
"complexity": 0.16,
|
| 683 |
+
"tier": "easy",
|
| 684 |
+
"domain": "code",
|
| 685 |
+
"model": "gpt-4o-mini"
|
| 686 |
+
},
|
| 687 |
+
{
|
| 688 |
+
"query": "Explain what a deadlock is in concurrency",
|
| 689 |
+
"complexity": 0.33,
|
| 690 |
+
"tier": "medium",
|
| 691 |
+
"domain": "code",
|
| 692 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 693 |
+
},
|
| 694 |
+
{
|
| 695 |
+
"query": "Compare socialism and capitalism",
|
| 696 |
+
"complexity": 0.47,
|
| 697 |
+
"tier": "medium",
|
| 698 |
+
"domain": "reasoning",
|
| 699 |
+
"model": "gpt-4o"
|
| 700 |
+
},
|
| 701 |
+
{
|
| 702 |
+
"query": "What is the significance of the Higgs boson discovery?",
|
| 703 |
+
"complexity": 0.56,
|
| 704 |
+
"tier": "medium",
|
| 705 |
+
"domain": "science",
|
| 706 |
+
"model": "deepseek-chat"
|
| 707 |
+
},
|
| 708 |
+
{
|
| 709 |
+
"query": "Write a SQL query to join three tables",
|
| 710 |
+
"complexity": 0.41,
|
| 711 |
+
"tier": "medium",
|
| 712 |
+
"domain": "code",
|
| 713 |
+
"model": "llama3.1:70b"
|
| 714 |
+
},
|
| 715 |
+
{
|
| 716 |
+
"query": "Create a simple JavaScript function that returns the current date",
|
| 717 |
+
"complexity": 0.14,
|
| 718 |
+
"tier": "easy",
|
| 719 |
+
"domain": "code",
|
| 720 |
+
"model": "mistral-small-latest"
|
| 721 |
+
},
|
| 722 |
+
{
|
| 723 |
+
"query": "What is the Pythagorean theorem? Provide an example",
|
| 724 |
+
"complexity": 0.23,
|
| 725 |
+
"tier": "easy",
|
| 726 |
+
"domain": "math",
|
| 727 |
+
"model": "gemini-1.5-flash"
|
| 728 |
+
},
|
| 729 |
+
{
|
| 730 |
+
"query": "Summarize the main ideas of Stoic philosophy",
|
| 731 |
+
"complexity": 0.49,
|
| 732 |
+
"tier": "medium",
|
| 733 |
+
"domain": "summarization",
|
| 734 |
+
"model": "claude-3-5-haiku-20241022"
|
| 735 |
+
},
|
| 736 |
+
{
|
| 737 |
+
"query": "Write a recipe for chocolate chip cookies in poetic form",
|
| 738 |
+
"complexity": 0.51,
|
| 739 |
+
"tier": "medium",
|
| 740 |
+
"domain": "creative",
|
| 741 |
+
"model": "gpt-4o-mini"
|
| 742 |
+
},
|
| 743 |
+
{
|
| 744 |
+
"query": "Explain the difference between CPU and GPU",
|
| 745 |
+
"complexity": 0.27,
|
| 746 |
+
"tier": "easy",
|
| 747 |
+
"domain": "factual",
|
| 748 |
+
"model": "llama3.1:8b"
|
| 749 |
+
},
|
| 750 |
+
{
|
| 751 |
+
"query": "Solve: What is the derivative of x^3?",
|
| 752 |
+
"complexity": 0.31,
|
| 753 |
+
"tier": "medium",
|
| 754 |
+
"domain": "math",
|
| 755 |
+
"model": "deepseek-chat"
|
| 756 |
+
},
|
| 757 |
+
{
|
| 758 |
+
"query": "Design a simple task scheduler in Python",
|
| 759 |
+
"complexity": 0.59,
|
| 760 |
+
"tier": "medium",
|
| 761 |
+
"domain": "code",
|
| 762 |
+
"model": "gpt-4o"
|
| 763 |
+
},
|
| 764 |
+
{
|
| 765 |
+
"query": "Explain the concept of 'impostor syndrome' in the workplace",
|
| 766 |
+
"complexity": 0.34,
|
| 767 |
+
"tier": "easy",
|
| 768 |
+
"domain": "reasoning",
|
| 769 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 770 |
+
},
|
| 771 |
+
{
|
| 772 |
+
"query": "What are the three laws of robotics?",
|
| 773 |
+
"complexity": 0.09,
|
| 774 |
+
"tier": "trivial",
|
| 775 |
+
"domain": "science",
|
| 776 |
+
"model": "gpt-3.5-turbo"
|
| 777 |
+
},
|
| 778 |
+
{
|
| 779 |
+
"query": "Translate 'The food was delicious' to Mandarin Chinese",
|
| 780 |
+
"complexity": 0.26,
|
| 781 |
+
"tier": "easy",
|
| 782 |
+
"domain": "translation",
|
| 783 |
+
"model": "gemini-1.5-pro"
|
| 784 |
+
},
|
| 785 |
+
{
|
| 786 |
+
"query": "Write a CSS snippet to center a div",
|
| 787 |
+
"complexity": 0.13,
|
| 788 |
+
"tier": "easy",
|
| 789 |
+
"domain": "code",
|
| 790 |
+
"model": "mistral-small-latest"
|
| 791 |
+
},
|
| 792 |
+
{
|
| 793 |
+
"query": "Explain the Drake equation",
|
| 794 |
+
"complexity": 0.44,
|
| 795 |
+
"tier": "medium",
|
| 796 |
+
"domain": "science",
|
| 797 |
+
"model": "llama3.1:70b"
|
| 798 |
+
},
|
| 799 |
+
{
|
| 800 |
+
"query": "What is the difference between error and exception in programming?",
|
| 801 |
+
"complexity": 0.21,
|
| 802 |
+
"tier": "easy",
|
| 803 |
+
"domain": "code",
|
| 804 |
+
"model": "gpt-4o-mini"
|
| 805 |
+
},
|
| 806 |
+
{
|
| 807 |
+
"query": "Solve: 15% of 200",
|
| 808 |
+
"complexity": 0.05,
|
| 809 |
+
"tier": "trivial",
|
| 810 |
+
"domain": "math",
|
| 811 |
+
"model": "llama3.2:3b"
|
| 812 |
+
},
|
| 813 |
+
{
|
| 814 |
+
"query": "Write a haiku about debugging",
|
| 815 |
+
"complexity": 0.28,
|
| 816 |
+
"tier": "easy",
|
| 817 |
+
"domain": "creative",
|
| 818 |
+
"model": "claude-3-haiku-20240307"
|
| 819 |
+
},
|
| 820 |
+
{
|
| 821 |
+
"query": "Explain the concept of 'technical debt'",
|
| 822 |
+
"complexity": 0.37,
|
| 823 |
+
"tier": "medium",
|
| 824 |
+
"domain": "code",
|
| 825 |
+
"model": "gpt-4o"
|
| 826 |
+
},
|
| 827 |
+
{
|
| 828 |
+
"query": "What are the benefits of using TypeScript over JavaScript?",
|
| 829 |
+
"complexity": 0.29,
|
| 830 |
+
"tier": "easy",
|
| 831 |
+
"domain": "code",
|
| 832 |
+
"model": "mistral-large-latest"
|
| 833 |
+
},
|
| 834 |
+
{
|
| 835 |
+
"query": "Summarize the plot of 'The Great Gatsby' in one paragraph",
|
| 836 |
+
"complexity": 0.38,
|
| 837 |
+
"tier": "medium",
|
| 838 |
+
"domain": "summarization",
|
| 839 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 840 |
+
},
|
| 841 |
+
{
|
| 842 |
+
"query": "Translate the following error message to Spanish: 'File not found'",
|
| 843 |
+
"complexity": 0.08,
|
| 844 |
+
"tier": "trivial",
|
| 845 |
+
"domain": "translation",
|
| 846 |
+
"model": "gemini-1.5-flash"
|
| 847 |
+
},
|
| 848 |
+
{
|
| 849 |
+
"query": "Write a Python decorator that measures execution time",
|
| 850 |
+
"complexity": 0.55,
|
| 851 |
+
"tier": "medium",
|
| 852 |
+
"domain": "code",
|
| 853 |
+
"model": "deepseek-chat"
|
| 854 |
+
},
|
| 855 |
+
{
|
| 856 |
+
"query": "Explain the concept of 'virtue ethics'",
|
| 857 |
+
"complexity": 0.46,
|
| 858 |
+
"tier": "medium",
|
| 859 |
+
"domain": "reasoning",
|
| 860 |
+
"model": "gpt-4o"
|
| 861 |
+
},
|
| 862 |
+
{
|
| 863 |
+
"query": "What is the function of the mitochondria?",
|
| 864 |
+
"complexity": 0.11,
|
| 865 |
+
"tier": "easy",
|
| 866 |
+
"domain": "science",
|
| 867 |
+
"model": "llama3.1:8b"
|
| 868 |
+
},
|
| 869 |
+
{
|
| 870 |
+
"query": "Solve for x: log(x) = 2",
|
| 871 |
+
"complexity": 0.3,
|
| 872 |
+
"tier": "medium",
|
| 873 |
+
"domain": "math",
|
| 874 |
+
"model": "gemini-1.5-pro"
|
| 875 |
+
},
|
| 876 |
+
{
|
| 877 |
+
"query": "Design a simple REST API for a todo list",
|
| 878 |
+
"complexity": 0.57,
|
| 879 |
+
"tier": "medium",
|
| 880 |
+
"domain": "code",
|
| 881 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 882 |
+
},
|
| 883 |
+
{
|
| 884 |
+
"query": "Explain the difference between let, const, and var in JavaScript",
|
| 885 |
+
"complexity": 0.23,
|
| 886 |
+
"tier": "easy",
|
| 887 |
+
"domain": "code",
|
| 888 |
+
"model": "gpt-4o-mini"
|
| 889 |
+
},
|
| 890 |
+
{
|
| 891 |
+
"query": "Write a limerick about a programmer",
|
| 892 |
+
"complexity": 0.32,
|
| 893 |
+
"tier": "easy",
|
| 894 |
+
"domain": "creative",
|
| 895 |
+
"model": "llama3.1:70b"
|
| 896 |
+
},
|
| 897 |
+
{
|
| 898 |
+
"query": "What is the greenhouse effect? Explain simply",
|
| 899 |
+
"complexity": 0.19,
|
| 900 |
+
"tier": "easy",
|
| 901 |
+
"domain": "science",
|
| 902 |
+
"model": "claude-3-haiku-20240307"
|
| 903 |
+
},
|
| 904 |
+
{
|
| 905 |
+
"query": "Translate 'I would like to book a flight' to French",
|
| 906 |
+
"complexity": 0.16,
|
| 907 |
+
"tier": "easy",
|
| 908 |
+
"domain": "translation",
|
| 909 |
+
"model": "mistral-small-latest"
|
| 910 |
+
},
|
| 911 |
+
{
|
| 912 |
+
"query": "What is the difference between a stack and a queue?",
|
| 913 |
+
"complexity": 0.24,
|
| 914 |
+
"tier": "easy",
|
| 915 |
+
"domain": "code",
|
| 916 |
+
"model": "gpt-3.5-turbo"
|
| 917 |
+
},
|
| 918 |
+
{
|
| 919 |
+
"query": "Explain the concept of 'opportunity cost' in economics",
|
| 920 |
+
"complexity": 0.28,
|
| 921 |
+
"tier": "easy",
|
| 922 |
+
"domain": "reasoning",
|
| 923 |
+
"model": "gemini-1.5-flash"
|
| 924 |
+
},
|
| 925 |
+
{
|
| 926 |
+
"query": "Write a binary search algorithm in Python",
|
| 927 |
+
"complexity": 0.43,
|
| 928 |
+
"tier": "medium",
|
| 929 |
+
"domain": "code",
|
| 930 |
+
"model": "deepseek-chat"
|
| 931 |
+
},
|
| 932 |
+
{
|
| 933 |
+
"query": "Summarize the key innovations of the Renaissance",
|
| 934 |
+
"complexity": 0.41,
|
| 935 |
+
"tier": "medium",
|
| 936 |
+
"domain": "summarization",
|
| 937 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 938 |
+
},
|
| 939 |
+
{
|
| 940 |
+
"query": "Solve: What is the area of a circle with radius 5?",
|
| 941 |
+
"complexity": 0.1,
|
| 942 |
+
"tier": "easy",
|
| 943 |
+
"domain": "math",
|
| 944 |
+
"model": "llama3.2:3b"
|
| 945 |
+
},
|
| 946 |
+
{
|
| 947 |
+
"query": "Create a simple HTML page with a button",
|
| 948 |
+
"complexity": 0.12,
|
| 949 |
+
"tier": "easy",
|
| 950 |
+
"domain": "code",
|
| 951 |
+
"model": "gpt-4o-mini"
|
| 952 |
+
},
|
| 953 |
+
{
|
| 954 |
+
"query": "Explain the concept of 'black swan events'",
|
| 955 |
+
"complexity": 0.45,
|
| 956 |
+
"tier": "medium",
|
| 957 |
+
"domain": "reasoning",
|
| 958 |
+
"model": "gpt-4o"
|
| 959 |
+
},
|
| 960 |
+
{
|
| 961 |
+
"query": "What is CRISPR used for?",
|
| 962 |
+
"complexity": 0.22,
|
| 963 |
+
"tier": "easy",
|
| 964 |
+
"domain": "science",
|
| 965 |
+
"model": "gemini-1.5-pro"
|
| 966 |
+
},
|
| 967 |
+
{
|
| 968 |
+
"query": "Write a regular expression to match an email address",
|
| 969 |
+
"complexity": 0.34,
|
| 970 |
+
"tier": "medium",
|
| 971 |
+
"domain": "code",
|
| 972 |
+
"model": "mistral-large-latest"
|
| 973 |
+
},
|
| 974 |
+
{
|
| 975 |
+
"query": "Translate 'Where is the nearest hospital?' to German",
|
| 976 |
+
"complexity": 0.15,
|
| 977 |
+
"tier": "easy",
|
| 978 |
+
"domain": "translation",
|
| 979 |
+
"model": "claude-3-haiku-20240307"
|
| 980 |
+
},
|
| 981 |
+
{
|
| 982 |
+
"query": "Write a short story about a character who wakes up with amnesia",
|
| 983 |
+
"complexity": 0.49,
|
| 984 |
+
"tier": "medium",
|
| 985 |
+
"domain": "creative",
|
| 986 |
+
"model": "llama3.1:70b"
|
| 987 |
+
},
|
| 988 |
+
{
|
| 989 |
+
"query": "Explain the difference between correlation and causation",
|
| 990 |
+
"complexity": 0.33,
|
| 991 |
+
"tier": "medium",
|
| 992 |
+
"domain": "reasoning",
|
| 993 |
+
"model": "deepseek-chat"
|
| 994 |
+
},
|
| 995 |
+
{
|
| 996 |
+
"query": "What is a SAT solver used for in computer science?",
|
| 997 |
+
"complexity": 0.52,
|
| 998 |
+
"tier": "medium",
|
| 999 |
+
"domain": "code",
|
| 1000 |
+
"model": "gpt-4o"
|
| 1001 |
+
},
|
| 1002 |
+
{
|
| 1003 |
+
"query": "What is an API?",
|
| 1004 |
+
"complexity": 0.06,
|
| 1005 |
+
"tier": "trivial",
|
| 1006 |
+
"domain": "factual",
|
| 1007 |
+
"model": "llama3.2:3b"
|
| 1008 |
+
},
|
| 1009 |
+
{
|
| 1010 |
+
"query": "Who wrote 'Romeo and Juliet'?",
|
| 1011 |
+
"complexity": 0.04,
|
| 1012 |
+
"tier": "trivial",
|
| 1013 |
+
"domain": "factual",
|
| 1014 |
+
"model": "gpt-3.5-turbo"
|
| 1015 |
+
},
|
| 1016 |
+
{
|
| 1017 |
+
"query": "What does CSS stand for?",
|
| 1018 |
+
"complexity": 0.05,
|
| 1019 |
+
"tier": "trivial",
|
| 1020 |
+
"domain": "factual",
|
| 1021 |
+
"model": "claude-3-haiku-20240307"
|
| 1022 |
+
},
|
| 1023 |
+
{
|
| 1024 |
+
"query": "Translate 'good night' to German",
|
| 1025 |
+
"complexity": 0.06,
|
| 1026 |
+
"tier": "trivial",
|
| 1027 |
+
"domain": "translation",
|
| 1028 |
+
"model": "gemini-1.5-flash"
|
| 1029 |
+
},
|
| 1030 |
+
{
|
| 1031 |
+
"query": "What is the square root of 64?",
|
| 1032 |
+
"complexity": 0.03,
|
| 1033 |
+
"tier": "trivial",
|
| 1034 |
+
"domain": "math",
|
| 1035 |
+
"model": "mistral-small-latest"
|
| 1036 |
+
},
|
| 1037 |
+
{
|
| 1038 |
+
"query": "What color is the sky on a clear day?",
|
| 1039 |
+
"complexity": 0.02,
|
| 1040 |
+
"tier": "trivial",
|
| 1041 |
+
"domain": "science",
|
| 1042 |
+
"model": "llama3.1:8b"
|
| 1043 |
+
},
|
| 1044 |
+
{
|
| 1045 |
+
"query": "What does SQL stand for?",
|
| 1046 |
+
"complexity": 0.06,
|
| 1047 |
+
"tier": "trivial",
|
| 1048 |
+
"domain": "factual",
|
| 1049 |
+
"model": "gpt-4o-mini"
|
| 1050 |
+
},
|
| 1051 |
+
{
|
| 1052 |
+
"query": "Name one mammal that can fly",
|
| 1053 |
+
"complexity": 0.04,
|
| 1054 |
+
"tier": "trivial",
|
| 1055 |
+
"domain": "science",
|
| 1056 |
+
"model": "claude-3-5-haiku-20241022"
|
| 1057 |
+
},
|
| 1058 |
+
{
|
| 1059 |
+
"query": "Translate 'I love you' to French",
|
| 1060 |
+
"complexity": 0.05,
|
| 1061 |
+
"tier": "trivial",
|
| 1062 |
+
"domain": "translation",
|
| 1063 |
+
"model": "llama3.2:3b"
|
| 1064 |
+
},
|
| 1065 |
+
{
|
| 1066 |
+
"query": "What is the largest ocean on Earth?",
|
| 1067 |
+
"complexity": 0.03,
|
| 1068 |
+
"tier": "trivial",
|
| 1069 |
+
"domain": "factual",
|
| 1070 |
+
"model": "gpt-3.5-turbo"
|
| 1071 |
+
},
|
| 1072 |
+
{
|
| 1073 |
+
"query": "Explain what a function is in programming",
|
| 1074 |
+
"complexity": 0.16,
|
| 1075 |
+
"tier": "easy",
|
| 1076 |
+
"domain": "code",
|
| 1077 |
+
"model": "gemini-1.5-flash"
|
| 1078 |
+
},
|
| 1079 |
+
{
|
| 1080 |
+
"query": "Write a simple for loop in C++ that prints 0 to 9",
|
| 1081 |
+
"complexity": 0.19,
|
| 1082 |
+
"tier": "easy",
|
| 1083 |
+
"domain": "code",
|
| 1084 |
+
"model": "mistral-small-latest"
|
| 1085 |
+
},
|
| 1086 |
+
{
|
| 1087 |
+
"query": "What is the difference between '==' and '===' in JavaScript?",
|
| 1088 |
+
"complexity": 0.21,
|
| 1089 |
+
"tier": "easy",
|
| 1090 |
+
"domain": "code",
|
| 1091 |
+
"model": "llama3.1:8b"
|
| 1092 |
+
},
|
| 1093 |
+
{
|
| 1094 |
+
"query": "Translate 'The weather is nice today' to Italian",
|
| 1095 |
+
"complexity": 0.18,
|
| 1096 |
+
"tier": "easy",
|
| 1097 |
+
"domain": "translation",
|
| 1098 |
+
"model": "gpt-4o-mini"
|
| 1099 |
+
},
|
| 1100 |
+
{
|
| 1101 |
+
"query": "Summarize the water cycle in two sentences",
|
| 1102 |
+
"complexity": 0.17,
|
| 1103 |
+
"tier": "easy",
|
| 1104 |
+
"domain": "summarization",
|
| 1105 |
+
"model": "claude-3-haiku-20240307"
|
| 1106 |
+
},
|
| 1107 |
+
{
|
| 1108 |
+
"query": "Solve for y: 3y - 7 = 11",
|
| 1109 |
+
"complexity": 0.11,
|
| 1110 |
+
"tier": "easy",
|
| 1111 |
+
"domain": "math",
|
| 1112 |
+
"model": "llama3.2:3b"
|
| 1113 |
+
},
|
| 1114 |
+
{
|
| 1115 |
+
"query": "Why do we have seasons?",
|
| 1116 |
+
"complexity": 0.19,
|
| 1117 |
+
"tier": "easy",
|
| 1118 |
+
"domain": "science",
|
| 1119 |
+
"model": "gpt-3.5-turbo"
|
| 1120 |
+
},
|
| 1121 |
+
{
|
| 1122 |
+
"query": "Write a two-line poem about the moon",
|
| 1123 |
+
"complexity": 0.14,
|
| 1124 |
+
"tier": "easy",
|
| 1125 |
+
"domain": "creative",
|
| 1126 |
+
"model": "claude-3-5-haiku-20241022"
|
| 1127 |
+
},
|
| 1128 |
+
{
|
| 1129 |
+
"query": "Explain why ice floats on water",
|
| 1130 |
+
"complexity": 0.2,
|
| 1131 |
+
"tier": "easy",
|
| 1132 |
+
"domain": "reasoning",
|
| 1133 |
+
"model": "gemini-1.5-pro"
|
| 1134 |
+
},
|
| 1135 |
+
{
|
| 1136 |
+
"query": "What is a stack overflow in programming?",
|
| 1137 |
+
"complexity": 0.23,
|
| 1138 |
+
"tier": "easy",
|
| 1139 |
+
"domain": "code",
|
| 1140 |
+
"model": "mistral-large-latest"
|
| 1141 |
+
},
|
| 1142 |
+
{
|
| 1143 |
+
"query": "Write a Python function to find the maximum of three numbers",
|
| 1144 |
+
"complexity": 0.27,
|
| 1145 |
+
"tier": "easy",
|
| 1146 |
+
"domain": "code",
|
| 1147 |
+
"model": "deepseek-chat"
|
| 1148 |
+
},
|
| 1149 |
+
{
|
| 1150 |
+
"query": "What is the difference between a class and an object?",
|
| 1151 |
+
"complexity": 0.25,
|
| 1152 |
+
"tier": "easy",
|
| 1153 |
+
"domain": "code",
|
| 1154 |
+
"model": "gpt-4o-mini"
|
| 1155 |
+
},
|
| 1156 |
+
{
|
| 1157 |
+
"query": "Translate 'Where is the bathroom?' to Spanish",
|
| 1158 |
+
"complexity": 0.12,
|
| 1159 |
+
"tier": "easy",
|
| 1160 |
+
"domain": "translation",
|
| 1161 |
+
"model": "llama3.1:70b"
|
| 1162 |
+
},
|
| 1163 |
+
{
|
| 1164 |
+
"query": "Summarize the main idea of the movie 'Inception'",
|
| 1165 |
+
"complexity": 0.29,
|
| 1166 |
+
"tier": "easy",
|
| 1167 |
+
"domain": "summarization",
|
| 1168 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 1169 |
+
},
|
| 1170 |
+
{
|
| 1171 |
+
"query": "Solve: What is 25% of 80?",
|
| 1172 |
+
"complexity": 0.08,
|
| 1173 |
+
"tier": "trivial",
|
| 1174 |
+
"domain": "math",
|
| 1175 |
+
"model": "gpt-4o"
|
| 1176 |
+
},
|
| 1177 |
+
{
|
| 1178 |
+
"query": "Write a short story about a lost key",
|
| 1179 |
+
"complexity": 0.35,
|
| 1180 |
+
"tier": "medium",
|
| 1181 |
+
"domain": "creative",
|
| 1182 |
+
"model": "gpt-4o"
|
| 1183 |
+
},
|
| 1184 |
+
{
|
| 1185 |
+
"query": "Explain how a memoization works with an example in JavaScript",
|
| 1186 |
+
"complexity": 0.44,
|
| 1187 |
+
"tier": "medium",
|
| 1188 |
+
"domain": "code",
|
| 1189 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 1190 |
+
},
|
| 1191 |
+
{
|
| 1192 |
+
"query": "Compare TCP and UDP protocols in detail",
|
| 1193 |
+
"complexity": 0.43,
|
| 1194 |
+
"tier": "medium",
|
| 1195 |
+
"domain": "code",
|
| 1196 |
+
"model": "deepseek-chat"
|
| 1197 |
+
},
|
| 1198 |
+
{
|
| 1199 |
+
"query": "What is the halting problem? Why is it important?",
|
| 1200 |
+
"complexity": 0.51,
|
| 1201 |
+
"tier": "medium",
|
| 1202 |
+
"domain": "reasoning",
|
| 1203 |
+
"model": "gpt-4o"
|
| 1204 |
+
},
|
| 1205 |
+
{
|
| 1206 |
+
"query": "Derive the formula for the area of a circle",
|
| 1207 |
+
"complexity": 0.37,
|
| 1208 |
+
"tier": "medium",
|
| 1209 |
+
"domain": "math",
|
| 1210 |
+
"model": "gemini-1.5-pro"
|
| 1211 |
+
},
|
| 1212 |
+
{
|
| 1213 |
+
"query": "Explain the second law of thermodynamics",
|
| 1214 |
+
"complexity": 0.53,
|
| 1215 |
+
"tier": "medium",
|
| 1216 |
+
"domain": "science",
|
| 1217 |
+
"model": "llama3.1:70b"
|
| 1218 |
+
},
|
| 1219 |
+
{
|
| 1220 |
+
"query": "Write a SQL query to get the top 5 highest paid employees",
|
| 1221 |
+
"complexity": 0.39,
|
| 1222 |
+
"tier": "medium",
|
| 1223 |
+
"domain": "code",
|
| 1224 |
+
"model": "mistral-large-latest"
|
| 1225 |
+
},
|
| 1226 |
+
{
|
| 1227 |
+
"query": "Translate this business email to Japanese: 'Dear Sir, we appreciate your prompt response.'",
|
| 1228 |
+
"complexity": 0.48,
|
| 1229 |
+
"tier": "medium",
|
| 1230 |
+
"domain": "translation",
|
| 1231 |
+
"model": "gpt-4o"
|
| 1232 |
+
},
|
| 1233 |
+
{
|
| 1234 |
+
"query": "Summarize the plot of 'The Odyssey' in 100 words",
|
| 1235 |
+
"complexity": 0.42,
|
| 1236 |
+
"tier": "medium",
|
| 1237 |
+
"domain": "summarization",
|
| 1238 |
+
"model": "claude-3-5-haiku-20241022"
|
| 1239 |
+
},
|
| 1240 |
+
{
|
| 1241 |
+
"query": "Write a sonnet about artificial intelligence",
|
| 1242 |
+
"complexity": 0.54,
|
| 1243 |
+
"tier": "medium",
|
| 1244 |
+
"domain": "creative",
|
| 1245 |
+
"model": "deepseek-chat"
|
| 1246 |
+
},
|
| 1247 |
+
{
|
| 1248 |
+
"query": "Explain the concept of race conditions in multithreading",
|
| 1249 |
+
"complexity": 0.46,
|
| 1250 |
+
"tier": "medium",
|
| 1251 |
+
"domain": "code",
|
| 1252 |
+
"model": "gpt-4o-mini"
|
| 1253 |
+
},
|
| 1254 |
+
{
|
| 1255 |
+
"query": "What is the difference between deep learning and traditional machine learning?",
|
| 1256 |
+
"complexity": 0.47,
|
| 1257 |
+
"tier": "medium",
|
| 1258 |
+
"domain": "science",
|
| 1259 |
+
"model": "gemini-1.5-pro"
|
| 1260 |
+
},
|
| 1261 |
+
{
|
| 1262 |
+
"query": "Solve the integral of x^2 dx",
|
| 1263 |
+
"complexity": 0.36,
|
| 1264 |
+
"tier": "medium",
|
| 1265 |
+
"domain": "math",
|
| 1266 |
+
"model": "llama3.1:8b"
|
| 1267 |
+
},
|
| 1268 |
+
{
|
| 1269 |
+
"query": "Implement a binary tree traversal in Python (in-order)",
|
| 1270 |
+
"complexity": 0.56,
|
| 1271 |
+
"tier": "medium",
|
| 1272 |
+
"domain": "code",
|
| 1273 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 1274 |
+
},
|
| 1275 |
+
{
|
| 1276 |
+
"query": "Design a simple load balancer algorithm",
|
| 1277 |
+
"complexity": 0.58,
|
| 1278 |
+
"tier": "medium",
|
| 1279 |
+
"domain": "code",
|
| 1280 |
+
"model": "gpt-4o"
|
| 1281 |
+
},
|
| 1282 |
+
{
|
| 1283 |
+
"query": "Write a regular expression to validate a US phone number",
|
| 1284 |
+
"complexity": 0.41,
|
| 1285 |
+
"tier": "medium",
|
| 1286 |
+
"domain": "code",
|
| 1287 |
+
"model": "mistral-small-latest"
|
| 1288 |
+
},
|
| 1289 |
+
{
|
| 1290 |
+
"query": "Explain the prisoner's dilemma and its implications",
|
| 1291 |
+
"complexity": 0.49,
|
| 1292 |
+
"tier": "medium",
|
| 1293 |
+
"domain": "reasoning",
|
| 1294 |
+
"model": "deepseek-chat"
|
| 1295 |
+
},
|
| 1296 |
+
{
|
| 1297 |
+
"query": "What is the Golden Ratio? Provide examples",
|
| 1298 |
+
"complexity": 0.34,
|
| 1299 |
+
"tier": "medium",
|
| 1300 |
+
"domain": "math",
|
| 1301 |
+
"model": "gpt-4o-mini"
|
| 1302 |
+
},
|
| 1303 |
+
{
|
| 1304 |
+
"query": "Translate a medical prescription summary to German",
|
| 1305 |
+
"complexity": 0.52,
|
| 1306 |
+
"tier": "medium",
|
| 1307 |
+
"domain": "translation",
|
| 1308 |
+
"model": "llama3.1:70b"
|
| 1309 |
+
},
|
| 1310 |
+
{
|
| 1311 |
+
"query": "Summarize the key arguments in Plato's 'Republic'",
|
| 1312 |
+
"complexity": 0.57,
|
| 1313 |
+
"tier": "hard",
|
| 1314 |
+
"domain": "summarization",
|
| 1315 |
+
"model": "gpt-4o"
|
| 1316 |
+
},
|
| 1317 |
+
{
|
| 1318 |
+
"query": "Write a dark fantasy short story about a cursed mirror",
|
| 1319 |
+
"complexity": 0.64,
|
| 1320 |
+
"tier": "hard",
|
| 1321 |
+
"domain": "creative",
|
| 1322 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 1323 |
+
},
|
| 1324 |
+
{
|
| 1325 |
+
"query": "Implement a concurrent web scraper in Python with asyncio",
|
| 1326 |
+
"complexity": 0.69,
|
| 1327 |
+
"tier": "hard",
|
| 1328 |
+
"domain": "code",
|
| 1329 |
+
"model": "deepseek-chat"
|
| 1330 |
+
},
|
| 1331 |
+
{
|
| 1332 |
+
"query": "Explain the proof of Fermat's Last Theorem at a high level",
|
| 1333 |
+
"complexity": 0.78,
|
| 1334 |
+
"tier": "hard",
|
| 1335 |
+
"domain": "math",
|
| 1336 |
+
"model": "gpt-4o"
|
| 1337 |
+
},
|
| 1338 |
+
{
|
| 1339 |
+
"query": "Analyze the performance implications of columnar vs row-based storage",
|
| 1340 |
+
"complexity": 0.66,
|
| 1341 |
+
"tier": "hard",
|
| 1342 |
+
"domain": "reasoning",
|
| 1343 |
+
"model": "gemini-1.5-pro"
|
| 1344 |
+
},
|
| 1345 |
+
{
|
| 1346 |
+
"query": "Describe the architecture of a distributed key-value store like DynamoDB",
|
| 1347 |
+
"complexity": 0.72,
|
| 1348 |
+
"tier": "hard",
|
| 1349 |
+
"domain": "code",
|
| 1350 |
+
"model": "llama3.1:70b"
|
| 1351 |
+
},
|
| 1352 |
+
{
|
| 1353 |
+
"query": "What is the Bellman equation in reinforcement learning?",
|
| 1354 |
+
"complexity": 0.63,
|
| 1355 |
+
"tier": "hard",
|
| 1356 |
+
"domain": "science",
|
| 1357 |
+
"model": "mistral-large-latest"
|
| 1358 |
+
},
|
| 1359 |
+
{
|
| 1360 |
+
"query": "Translate a complex legal disclaimer to French",
|
| 1361 |
+
"complexity": 0.61,
|
| 1362 |
+
"tier": "hard",
|
| 1363 |
+
"domain": "translation",
|
| 1364 |
+
"model": "gpt-4o"
|
| 1365 |
+
},
|
| 1366 |
+
{
|
| 1367 |
+
"query": "Design a real-time chat system supporting 1 million concurrent users",
|
| 1368 |
+
"complexity": 0.75,
|
| 1369 |
+
"tier": "hard",
|
| 1370 |
+
"domain": "code",
|
| 1371 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 1372 |
+
},
|
| 1373 |
+
{
|
| 1374 |
+
"query": "Explain the concept of zero-knowledge proofs",
|
| 1375 |
+
"complexity": 0.7,
|
| 1376 |
+
"tier": "hard",
|
| 1377 |
+
"domain": "science",
|
| 1378 |
+
"model": "deepseek-chat"
|
| 1379 |
+
},
|
| 1380 |
+
{
|
| 1381 |
+
"query": "Write a detailed guide on optimizing Python code with C extensions",
|
| 1382 |
+
"complexity": 0.68,
|
| 1383 |
+
"tier": "hard",
|
| 1384 |
+
"domain": "code",
|
| 1385 |
+
"model": "gpt-4o"
|
| 1386 |
+
},
|
| 1387 |
+
{
|
| 1388 |
+
"query": "Prove that there are infinitely many prime numbers",
|
| 1389 |
+
"complexity": 0.59,
|
| 1390 |
+
"tier": "hard",
|
| 1391 |
+
"domain": "math",
|
| 1392 |
+
"model": "gemini-1.5-pro"
|
| 1393 |
+
},
|
| 1394 |
+
{
|
| 1395 |
+
"query": "What is the 'butterfly effect' in chaos theory?",
|
| 1396 |
+
"complexity": 0.55,
|
| 1397 |
+
"tier": "medium",
|
| 1398 |
+
"domain": "science",
|
| 1399 |
+
"model": "llama3.1:8b"
|
| 1400 |
+
},
|
| 1401 |
+
{
|
| 1402 |
+
"query": "Write a memoization decorator in Python",
|
| 1403 |
+
"complexity": 0.53,
|
| 1404 |
+
"tier": "medium",
|
| 1405 |
+
"domain": "code",
|
| 1406 |
+
"model": "claude-3-5-haiku-20241022"
|
| 1407 |
+
},
|
| 1408 |
+
{
|
| 1409 |
+
"query": "Explain the difference between a process and a thread",
|
| 1410 |
+
"complexity": 0.31,
|
| 1411 |
+
"tier": "easy",
|
| 1412 |
+
"domain": "code",
|
| 1413 |
+
"model": "gpt-4o-mini"
|
| 1414 |
+
},
|
| 1415 |
+
{
|
| 1416 |
+
"query": "Translate 'Congratulations on your new job' to Portuguese",
|
| 1417 |
+
"complexity": 0.13,
|
| 1418 |
+
"tier": "easy",
|
| 1419 |
+
"domain": "translation",
|
| 1420 |
+
"model": "mistral-small-latest"
|
| 1421 |
+
},
|
| 1422 |
+
{
|
| 1423 |
+
"query": "What is the role of the nucleus in a cell?",
|
| 1424 |
+
"complexity": 0.15,
|
| 1425 |
+
"tier": "easy",
|
| 1426 |
+
"domain": "science",
|
| 1427 |
+
"model": "gemini-1.5-flash"
|
| 1428 |
+
},
|
| 1429 |
+
{
|
| 1430 |
+
"query": "Write a Java program to reverse a string",
|
| 1431 |
+
"complexity": 0.26,
|
| 1432 |
+
"tier": "easy",
|
| 1433 |
+
"domain": "code",
|
| 1434 |
+
"model": "llama3.2:3b"
|
| 1435 |
+
},
|
| 1436 |
+
{
|
| 1437 |
+
"query": "Solve for x: 4x^2 - 16 = 0",
|
| 1438 |
+
"complexity": 0.28,
|
| 1439 |
+
"tier": "easy",
|
| 1440 |
+
"domain": "math",
|
| 1441 |
+
"model": "gpt-3.5-turbo"
|
| 1442 |
+
},
|
| 1443 |
+
{
|
| 1444 |
+
"query": "Write a haiku about winter",
|
| 1445 |
+
"complexity": 0.15,
|
| 1446 |
+
"tier": "easy",
|
| 1447 |
+
"domain": "creative",
|
| 1448 |
+
"model": "claude-3-haiku-20240307"
|
| 1449 |
+
},
|
| 1450 |
+
{
|
| 1451 |
+
"query": "Explain why the sky is red at sunset",
|
| 1452 |
+
"complexity": 0.27,
|
| 1453 |
+
"tier": "easy",
|
| 1454 |
+
"domain": "reasoning",
|
| 1455 |
+
"model": "gpt-4o-mini"
|
| 1456 |
+
},
|
| 1457 |
+
{
|
| 1458 |
+
"query": "What is an abstract class in Java?",
|
| 1459 |
+
"complexity": 0.29,
|
| 1460 |
+
"tier": "easy",
|
| 1461 |
+
"domain": "code",
|
| 1462 |
+
"model": "mistral-large-latest"
|
| 1463 |
+
},
|
| 1464 |
+
{
|
| 1465 |
+
"query": "Write a simple HTML form with two input fields",
|
| 1466 |
+
"complexity": 0.18,
|
| 1467 |
+
"tier": "easy",
|
| 1468 |
+
"domain": "code",
|
| 1469 |
+
"model": "llama3.1:70b"
|
| 1470 |
+
},
|
| 1471 |
+
{
|
| 1472 |
+
"query": "Translate 'I need a doctor' to Korean",
|
| 1473 |
+
"complexity": 0.17,
|
| 1474 |
+
"tier": "easy",
|
| 1475 |
+
"domain": "translation",
|
| 1476 |
+
"model": "deepseek-chat"
|
| 1477 |
+
},
|
| 1478 |
+
{
|
| 1479 |
+
"query": "Summarize the plot of 'Hamlet' in three sentences",
|
| 1480 |
+
"complexity": 0.32,
|
| 1481 |
+
"tier": "easy",
|
| 1482 |
+
"domain": "summarization",
|
| 1483 |
+
"model": "gpt-4o"
|
| 1484 |
+
},
|
| 1485 |
+
{
|
| 1486 |
+
"query": "What is the difference between GET and POST in HTTP?",
|
| 1487 |
+
"complexity": 0.22,
|
| 1488 |
+
"tier": "easy",
|
| 1489 |
+
"domain": "code",
|
| 1490 |
+
"model": "gemini-1.5-pro"
|
| 1491 |
+
},
|
| 1492 |
+
{
|
| 1493 |
+
"query": "Write a C program to calculate factorial using recursion",
|
| 1494 |
+
"complexity": 0.4,
|
| 1495 |
+
"tier": "medium",
|
| 1496 |
+
"domain": "code",
|
| 1497 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 1498 |
+
},
|
| 1499 |
+
{
|
| 1500 |
+
"query": "Explain the concept of 'sharding' in databases",
|
| 1501 |
+
"complexity": 0.48,
|
| 1502 |
+
"tier": "medium",
|
| 1503 |
+
"domain": "code",
|
| 1504 |
+
"model": "gpt-4o"
|
| 1505 |
+
},
|
| 1506 |
+
{
|
| 1507 |
+
"query": "What is the difference between L1 and L2 regularization?",
|
| 1508 |
+
"complexity": 0.54,
|
| 1509 |
+
"tier": "medium",
|
| 1510 |
+
"domain": "science",
|
| 1511 |
+
"model": "deepseek-chat"
|
| 1512 |
+
},
|
| 1513 |
+
{
|
| 1514 |
+
"query": "Solve the system of equations: 2x + y = 10, x - y = 2",
|
| 1515 |
+
"complexity": 0.33,
|
| 1516 |
+
"tier": "medium",
|
| 1517 |
+
"domain": "math",
|
| 1518 |
+
"model": "llama3.1:70b"
|
| 1519 |
+
},
|
| 1520 |
+
{
|
| 1521 |
+
"query": "Write a Python script to download an image from a URL",
|
| 1522 |
+
"complexity": 0.36,
|
| 1523 |
+
"tier": "medium",
|
| 1524 |
+
"domain": "code",
|
| 1525 |
+
"model": "mistral-large-latest"
|
| 1526 |
+
},
|
| 1527 |
+
{
|
| 1528 |
+
"query": "Explain the Turing test and its criticisms",
|
| 1529 |
+
"complexity": 0.45,
|
| 1530 |
+
"tier": "medium",
|
| 1531 |
+
"domain": "reasoning",
|
| 1532 |
+
"model": "gemini-1.5-pro"
|
| 1533 |
+
},
|
| 1534 |
+
{
|
| 1535 |
+
"query": "Translate 'The system is currently offline' to Russian",
|
| 1536 |
+
"complexity": 0.39,
|
| 1537 |
+
"tier": "medium",
|
| 1538 |
+
"domain": "translation",
|
| 1539 |
+
"model": "claude-3-5-haiku-20241022"
|
| 1540 |
+
},
|
| 1541 |
+
{
|
| 1542 |
+
"query": "Summarize the main findings of the Human Genome Project",
|
| 1543 |
+
"complexity": 0.5,
|
| 1544 |
+
"tier": "medium",
|
| 1545 |
+
"domain": "science",
|
| 1546 |
+
"model": "gpt-4o-mini"
|
| 1547 |
+
},
|
| 1548 |
+
{
|
| 1549 |
+
"query": "Write a villanelle about lost time",
|
| 1550 |
+
"complexity": 0.62,
|
| 1551 |
+
"tier": "hard",
|
| 1552 |
+
"domain": "creative",
|
| 1553 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 1554 |
+
},
|
| 1555 |
+
{
|
| 1556 |
+
"query": "Implement a connection pool in Go",
|
| 1557 |
+
"complexity": 0.71,
|
| 1558 |
+
"tier": "hard",
|
| 1559 |
+
"domain": "code",
|
| 1560 |
+
"model": "deepseek-chat"
|
| 1561 |
+
},
|
| 1562 |
+
{
|
| 1563 |
+
"query": "Explain the concept of 'eventual consistency' in distributed systems",
|
| 1564 |
+
"complexity": 0.65,
|
| 1565 |
+
"tier": "hard",
|
| 1566 |
+
"domain": "reasoning",
|
| 1567 |
+
"model": "gpt-4o"
|
| 1568 |
+
},
|
| 1569 |
+
{
|
| 1570 |
+
"query": "Derive the Black-Scholes equation for option pricing",
|
| 1571 |
+
"complexity": 0.85,
|
| 1572 |
+
"tier": "expert",
|
| 1573 |
+
"domain": "math",
|
| 1574 |
+
"model": "gpt-4o"
|
| 1575 |
+
},
|
| 1576 |
+
{
|
| 1577 |
+
"query": "Design a distributed consensus protocol like Raft",
|
| 1578 |
+
"complexity": 0.88,
|
| 1579 |
+
"tier": "expert",
|
| 1580 |
+
"domain": "code",
|
| 1581 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 1582 |
+
},
|
| 1583 |
+
{
|
| 1584 |
+
"query": "Explain the holographic principle in theoretical physics",
|
| 1585 |
+
"complexity": 0.92,
|
| 1586 |
+
"tier": "expert",
|
| 1587 |
+
"domain": "science",
|
| 1588 |
+
"model": "deepseek-chat"
|
| 1589 |
+
},
|
| 1590 |
+
{
|
| 1591 |
+
"query": "Write a full compiler frontend for a small language in Rust",
|
| 1592 |
+
"complexity": 0.95,
|
| 1593 |
+
"tier": "expert",
|
| 1594 |
+
"domain": "code",
|
| 1595 |
+
"model": "gemini-1.5-pro"
|
| 1596 |
+
},
|
| 1597 |
+
{
|
| 1598 |
+
"query": "Prove the Riemann Hypothesis (outline the main approach)",
|
| 1599 |
+
"complexity": 0.99,
|
| 1600 |
+
"tier": "expert",
|
| 1601 |
+
"domain": "math",
|
| 1602 |
+
"model": "gpt-4o"
|
| 1603 |
+
},
|
| 1604 |
+
{
|
| 1605 |
+
"query": "Analyze the security of the TLS 1.3 handshake",
|
| 1606 |
+
"complexity": 0.82,
|
| 1607 |
+
"tier": "expert",
|
| 1608 |
+
"domain": "reasoning",
|
| 1609 |
+
"model": "llama3.1:70b"
|
| 1610 |
+
},
|
| 1611 |
+
{
|
| 1612 |
+
"query": "Implement a B+ tree index from scratch",
|
| 1613 |
+
"complexity": 0.87,
|
| 1614 |
+
"tier": "expert",
|
| 1615 |
+
"domain": "code",
|
| 1616 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 1617 |
+
},
|
| 1618 |
+
{
|
| 1619 |
+
"query": "Translate a complex patent document to Chinese",
|
| 1620 |
+
"complexity": 0.76,
|
| 1621 |
+
"tier": "hard",
|
| 1622 |
+
"domain": "translation",
|
| 1623 |
+
"model": "gpt-4o"
|
| 1624 |
+
},
|
| 1625 |
+
{
|
| 1626 |
+
"query": "What is the difference between Bayesian and frequentist statistics?",
|
| 1627 |
+
"complexity": 0.67,
|
| 1628 |
+
"tier": "hard",
|
| 1629 |
+
"domain": "math",
|
| 1630 |
+
"model": "deepseek-chat"
|
| 1631 |
+
},
|
| 1632 |
+
{
|
| 1633 |
+
"query": "Write a high-performance WebSocket server in C++",
|
| 1634 |
+
"complexity": 0.79,
|
| 1635 |
+
"tier": "hard",
|
| 1636 |
+
"domain": "code",
|
| 1637 |
+
"model": "gpt-4o"
|
| 1638 |
+
},
|
| 1639 |
+
{
|
| 1640 |
+
"query": "Explain the many-worlds interpretation of quantum mechanics",
|
| 1641 |
+
"complexity": 0.73,
|
| 1642 |
+
"tier": "hard",
|
| 1643 |
+
"domain": "science",
|
| 1644 |
+
"model": "llama3.1:70b"
|
| 1645 |
+
},
|
| 1646 |
+
{
|
| 1647 |
+
"query": "Design a URL shortening service like TinyURL (full design)",
|
| 1648 |
+
"complexity": 0.69,
|
| 1649 |
+
"tier": "hard",
|
| 1650 |
+
"domain": "code",
|
| 1651 |
+
"model": "gemini-1.5-pro"
|
| 1652 |
+
},
|
| 1653 |
+
{
|
| 1654 |
+
"query": "Prove the central limit theorem",
|
| 1655 |
+
"complexity": 0.84,
|
| 1656 |
+
"tier": "expert",
|
| 1657 |
+
"domain": "math",
|
| 1658 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 1659 |
+
},
|
| 1660 |
+
{
|
| 1661 |
+
"query": "Write a distributed task queue using Redis",
|
| 1662 |
+
"complexity": 0.74,
|
| 1663 |
+
"tier": "hard",
|
| 1664 |
+
"domain": "code",
|
| 1665 |
+
"model": "deepseek-chat"
|
| 1666 |
+
},
|
| 1667 |
+
{
|
| 1668 |
+
"query": "Explain the concept of 'transfer learning' in neural networks",
|
| 1669 |
+
"complexity": 0.56,
|
| 1670 |
+
"tier": "medium",
|
| 1671 |
+
"domain": "science",
|
| 1672 |
+
"model": "gpt-4o"
|
| 1673 |
+
},
|
| 1674 |
+
{
|
| 1675 |
+
"query": "Solve the traveling salesman problem using dynamic programming",
|
| 1676 |
+
"complexity": 0.77,
|
| 1677 |
+
"tier": "hard",
|
| 1678 |
+
"domain": "code",
|
| 1679 |
+
"model": "llama3.1:70b"
|
| 1680 |
+
},
|
| 1681 |
+
{
|
| 1682 |
+
"query": "Translate a software license agreement to Spanish",
|
| 1683 |
+
"complexity": 0.58,
|
| 1684 |
+
"tier": "hard",
|
| 1685 |
+
"domain": "translation",
|
| 1686 |
+
"model": "gpt-4o-mini"
|
| 1687 |
+
},
|
| 1688 |
+
{
|
| 1689 |
+
"query": "Write a detailed critique of the OpenAI GPT-4 architecture",
|
| 1690 |
+
"complexity": 0.7,
|
| 1691 |
+
"tier": "hard",
|
| 1692 |
+
"domain": "reasoning",
|
| 1693 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 1694 |
+
},
|
| 1695 |
+
{
|
| 1696 |
+
"query": "What is the curse of dimensionality in machine learning?",
|
| 1697 |
+
"complexity": 0.51,
|
| 1698 |
+
"tier": "medium",
|
| 1699 |
+
"domain": "science",
|
| 1700 |
+
"model": "gemini-1.5-pro"
|
| 1701 |
+
},
|
| 1702 |
+
{
|
| 1703 |
+
"query": "Fix this Python code: `def add(a,b): return a-b` – it should add, not subtract.",
|
| 1704 |
+
"complexity": 0.11,
|
| 1705 |
+
"tier": "easy",
|
| 1706 |
+
"domain": "code",
|
| 1707 |
+
"model": "gpt-3.5-turbo"
|
| 1708 |
+
},
|
| 1709 |
+
{
|
| 1710 |
+
"query": "Write a Python one-liner to reverse a string.",
|
| 1711 |
+
"complexity": 0.13,
|
| 1712 |
+
"tier": "easy",
|
| 1713 |
+
"domain": "code",
|
| 1714 |
+
"model": "claude-3-haiku-20240307"
|
| 1715 |
+
},
|
| 1716 |
+
{
|
| 1717 |
+
"query": "Why does this JavaScript code print 'undefined'? `var x; console.log(x);`",
|
| 1718 |
+
"complexity": 0.09,
|
| 1719 |
+
"tier": "trivial",
|
| 1720 |
+
"domain": "code",
|
| 1721 |
+
"model": "llama3.2:3b"
|
| 1722 |
+
},
|
| 1723 |
+
{
|
| 1724 |
+
"query": "Implement a function `is_palindrome(s)` in Python that ignores spaces and case.",
|
| 1725 |
+
"complexity": 0.32,
|
| 1726 |
+
"tier": "medium",
|
| 1727 |
+
"domain": "code",
|
| 1728 |
+
"model": "gemini-1.5-flash"
|
| 1729 |
+
},
|
| 1730 |
+
{
|
| 1731 |
+
"query": "Debug this SQL: `SELECT * FORM users WHERE name = 'John';`",
|
| 1732 |
+
"complexity": 0.07,
|
| 1733 |
+
"tier": "trivial",
|
| 1734 |
+
"domain": "code",
|
| 1735 |
+
"model": "mistral-small-latest"
|
| 1736 |
+
},
|
| 1737 |
+
{
|
| 1738 |
+
"query": "Write a recursive function to compute the nth Fibonacci number in Java.",
|
| 1739 |
+
"complexity": 0.34,
|
| 1740 |
+
"tier": "medium",
|
| 1741 |
+
"domain": "code",
|
| 1742 |
+
"model": "gpt-4o-mini"
|
| 1743 |
+
},
|
| 1744 |
+
{
|
| 1745 |
+
"query": "What is the output of `console.log(1 + '2' + 3)` in JavaScript? Explain.",
|
| 1746 |
+
"complexity": 0.15,
|
| 1747 |
+
"tier": "easy",
|
| 1748 |
+
"domain": "code",
|
| 1749 |
+
"model": "llama3.1:8b"
|
| 1750 |
+
},
|
| 1751 |
+
{
|
| 1752 |
+
"query": "Write a C function to swap two integers using pointers.",
|
| 1753 |
+
"complexity": 0.28,
|
| 1754 |
+
"tier": "easy",
|
| 1755 |
+
"domain": "code",
|
| 1756 |
+
"model": "claude-3-5-haiku-20241022"
|
| 1757 |
+
},
|
| 1758 |
+
{
|
| 1759 |
+
"query": "Fix the memory leak in this C++ snippet: `int* p = new int; p = new int; delete p;`",
|
| 1760 |
+
"complexity": 0.45,
|
| 1761 |
+
"tier": "medium",
|
| 1762 |
+
"domain": "code",
|
| 1763 |
+
"model": "deepseek-chat"
|
| 1764 |
+
},
|
| 1765 |
+
{
|
| 1766 |
+
"query": "Implement a queue using two stacks in Python.",
|
| 1767 |
+
"complexity": 0.52,
|
| 1768 |
+
"tier": "medium",
|
| 1769 |
+
"domain": "code",
|
| 1770 |
+
"model": "gpt-4o"
|
| 1771 |
+
},
|
| 1772 |
+
{
|
| 1773 |
+
"query": "Write a regex to extract all email addresses from a text.",
|
| 1774 |
+
"complexity": 0.38,
|
| 1775 |
+
"tier": "medium",
|
| 1776 |
+
"domain": "code",
|
| 1777 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 1778 |
+
},
|
| 1779 |
+
{
|
| 1780 |
+
"query": "Why does this infinite loop happen? `for i in range(10): i -= 1`",
|
| 1781 |
+
"complexity": 0.21,
|
| 1782 |
+
"tier": "easy",
|
| 1783 |
+
"domain": "code",
|
| 1784 |
+
"model": "gemini-1.5-pro"
|
| 1785 |
+
},
|
| 1786 |
+
{
|
| 1787 |
+
"query": "Write a Python decorator that caches return values of a function.",
|
| 1788 |
+
"complexity": 0.58,
|
| 1789 |
+
"tier": "hard",
|
| 1790 |
+
"domain": "code",
|
| 1791 |
+
"model": "llama3.1:70b"
|
| 1792 |
+
},
|
| 1793 |
+
{
|
| 1794 |
+
"query": "Convert this list comprehension to a for loop: `[x**2 for x in range(10) if x%2==0]`",
|
| 1795 |
+
"complexity": 0.19,
|
| 1796 |
+
"tier": "easy",
|
| 1797 |
+
"domain": "code",
|
| 1798 |
+
"model": "gpt-3.5-turbo"
|
| 1799 |
+
},
|
| 1800 |
+
{
|
| 1801 |
+
"query": "Write a SQL query to find employees who earn more than their managers.",
|
| 1802 |
+
"complexity": 0.44,
|
| 1803 |
+
"tier": "medium",
|
| 1804 |
+
"domain": "code",
|
| 1805 |
+
"model": "mistral-large-latest"
|
| 1806 |
+
},
|
| 1807 |
+
{
|
| 1808 |
+
"query": "Implement a binary search in a sorted array (any language).",
|
| 1809 |
+
"complexity": 0.36,
|
| 1810 |
+
"tier": "medium",
|
| 1811 |
+
"domain": "code",
|
| 1812 |
+
"model": "deepseek-chat"
|
| 1813 |
+
},
|
| 1814 |
+
{
|
| 1815 |
+
"query": "Debug this Python: `print('Hello' + 123)` – what error and how to fix?",
|
| 1816 |
+
"complexity": 0.1,
|
| 1817 |
+
"tier": "trivial",
|
| 1818 |
+
"domain": "code",
|
| 1819 |
+
"model": "llama3.2:3b"
|
| 1820 |
+
},
|
| 1821 |
+
{
|
| 1822 |
+
"query": "Write a function that merges two sorted lists into one sorted list (O(n)).",
|
| 1823 |
+
"complexity": 0.41,
|
| 1824 |
+
"tier": "medium",
|
| 1825 |
+
"domain": "code",
|
| 1826 |
+
"model": "gpt-4o-mini"
|
| 1827 |
+
},
|
| 1828 |
+
{
|
| 1829 |
+
"query": "Explain why `[1,2,3].map(parseInt)` returns `[1, NaN, NaN]` in JavaScript.",
|
| 1830 |
+
"complexity": 0.47,
|
| 1831 |
+
"tier": "medium",
|
| 1832 |
+
"domain": "code",
|
| 1833 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 1834 |
+
},
|
| 1835 |
+
{
|
| 1836 |
+
"query": "Write a simple HTML page that fetches data from a REST API and displays it.",
|
| 1837 |
+
"complexity": 0.39,
|
| 1838 |
+
"tier": "medium",
|
| 1839 |
+
"domain": "code",
|
| 1840 |
+
"model": "gemini-1.5-pro"
|
| 1841 |
+
},
|
| 1842 |
+
{
|
| 1843 |
+
"query": "Implement a singleton pattern in Python.",
|
| 1844 |
+
"complexity": 0.35,
|
| 1845 |
+
"tier": "medium",
|
| 1846 |
+
"domain": "code",
|
| 1847 |
+
"model": "gpt-4o"
|
| 1848 |
+
},
|
| 1849 |
+
{
|
| 1850 |
+
"query": "Fix the race condition in this multi‑threaded Python code (pseudo).",
|
| 1851 |
+
"complexity": 0.63,
|
| 1852 |
+
"tier": "hard",
|
| 1853 |
+
"domain": "code",
|
| 1854 |
+
"model": "deepseek-chat"
|
| 1855 |
+
},
|
| 1856 |
+
{
|
| 1857 |
+
"query": "Write a recursive descent parser for simple arithmetic expressions (+, -, *, /).",
|
| 1858 |
+
"complexity": 0.71,
|
| 1859 |
+
"tier": "hard",
|
| 1860 |
+
"domain": "code",
|
| 1861 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 1862 |
+
},
|
| 1863 |
+
{
|
| 1864 |
+
"query": "Why does this C code crash? `int arr[5]; arr[10] = 42;`",
|
| 1865 |
+
"complexity": 0.08,
|
| 1866 |
+
"tier": "trivial",
|
| 1867 |
+
"domain": "code",
|
| 1868 |
+
"model": "gpt-3.5-turbo"
|
| 1869 |
+
},
|
| 1870 |
+
{
|
| 1871 |
+
"query": "Write a Python generator that yields the Fibonacci sequence infinitely.",
|
| 1872 |
+
"complexity": 0.33,
|
| 1873 |
+
"tier": "medium",
|
| 1874 |
+
"domain": "code",
|
| 1875 |
+
"model": "llama3.1:8b"
|
| 1876 |
+
},
|
| 1877 |
+
{
|
| 1878 |
+
"query": "Implement an LRU cache using `OrderedDict` in Python.",
|
| 1879 |
+
"complexity": 0.59,
|
| 1880 |
+
"tier": "hard",
|
| 1881 |
+
"domain": "code",
|
| 1882 |
+
"model": "gemini-1.5-pro"
|
| 1883 |
+
},
|
| 1884 |
+
{
|
| 1885 |
+
"query": "What does `*args` and `**kwargs` do in Python? Give an example.",
|
| 1886 |
+
"complexity": 0.25,
|
| 1887 |
+
"tier": "easy",
|
| 1888 |
+
"domain": "code",
|
| 1889 |
+
"model": "mistral-small-latest"
|
| 1890 |
+
},
|
| 1891 |
+
{
|
| 1892 |
+
"query": "Write a JavaScript function that throttles another function (limit calls per second).",
|
| 1893 |
+
"complexity": 0.55,
|
| 1894 |
+
"tier": "medium",
|
| 1895 |
+
"domain": "code",
|
| 1896 |
+
"model": "gpt-4o"
|
| 1897 |
+
},
|
| 1898 |
+
{
|
| 1899 |
+
"query": "Debug this SQL injection vulnerability: `\"SELECT * FROM users WHERE id = \" + user_id`",
|
| 1900 |
+
"complexity": 0.3,
|
| 1901 |
+
"tier": "easy",
|
| 1902 |
+
"domain": "code",
|
| 1903 |
+
"model": "claude-3-haiku-20240307"
|
| 1904 |
+
},
|
| 1905 |
+
{
|
| 1906 |
+
"query": "Implement a simple HTTP server in Python using `sockets`.",
|
| 1907 |
+
"complexity": 0.62,
|
| 1908 |
+
"tier": "hard",
|
| 1909 |
+
"domain": "code",
|
| 1910 |
+
"model": "deepseek-chat"
|
| 1911 |
+
},
|
| 1912 |
+
{
|
| 1913 |
+
"query": "Explain the output: `console.log([] + []); console.log([] + {}); console.log({} + []);`",
|
| 1914 |
+
"complexity": 0.49,
|
| 1915 |
+
"tier": "medium",
|
| 1916 |
+
"domain": "code",
|
| 1917 |
+
"model": "llama3.1:70b"
|
| 1918 |
+
},
|
| 1919 |
+
{
|
| 1920 |
+
"query": "Write a C++ program that reverses a linked list.",
|
| 1921 |
+
"complexity": 0.46,
|
| 1922 |
+
"tier": "medium",
|
| 1923 |
+
"domain": "code",
|
| 1924 |
+
"model": "gpt-4o-mini"
|
| 1925 |
+
},
|
| 1926 |
+
{
|
| 1927 |
+
"query": "Convert this `try/except` to using `contextlib.suppress` in Python.",
|
| 1928 |
+
"complexity": 0.27,
|
| 1929 |
+
"tier": "easy",
|
| 1930 |
+
"domain": "code",
|
| 1931 |
+
"model": "claude-3-5-haiku-20241022"
|
| 1932 |
+
},
|
| 1933 |
+
{
|
| 1934 |
+
"query": "Implement the Sieve of Eratosthenes in Java.",
|
| 1935 |
+
"complexity": 0.42,
|
| 1936 |
+
"tier": "medium",
|
| 1937 |
+
"domain": "code",
|
| 1938 |
+
"model": "gemini-1.5-flash"
|
| 1939 |
+
},
|
| 1940 |
+
{
|
| 1941 |
+
"query": "Why does `0.1 + 0.2 !== 0.3` in JavaScript? Explain floating point.",
|
| 1942 |
+
"complexity": 0.35,
|
| 1943 |
+
"tier": "medium",
|
| 1944 |
+
"domain": "code",
|
| 1945 |
+
"model": "mistral-large-latest"
|
| 1946 |
+
},
|
| 1947 |
+
{
|
| 1948 |
+
"query": "Write a Python script to find duplicate files in a directory (by hash).",
|
| 1949 |
+
"complexity": 0.51,
|
| 1950 |
+
"tier": "medium",
|
| 1951 |
+
"domain": "code",
|
| 1952 |
+
"model": "deepseek-chat"
|
| 1953 |
+
},
|
| 1954 |
+
{
|
| 1955 |
+
"query": "Fix the deadlock in this pseudocode: two threads lock A then B, and B then A.",
|
| 1956 |
+
"complexity": 0.66,
|
| 1957 |
+
"tier": "hard",
|
| 1958 |
+
"domain": "code",
|
| 1959 |
+
"model": "gpt-4o"
|
| 1960 |
+
},
|
| 1961 |
+
{
|
| 1962 |
+
"query": "Implement a simple event emitter in JavaScript (Node.js style).",
|
| 1963 |
+
"complexity": 0.48,
|
| 1964 |
+
"tier": "medium",
|
| 1965 |
+
"domain": "code",
|
| 1966 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 1967 |
+
},
|
| 1968 |
+
{
|
| 1969 |
+
"query": "What is tail recursion? Convert this factorial to tail‑recursive: `def fact(n): return 1 if n==0 else n*fact(n-1)`",
|
| 1970 |
+
"complexity": 0.37,
|
| 1971 |
+
"tier": "medium",
|
| 1972 |
+
"domain": "code",
|
| 1973 |
+
"model": "llama3.1:8b"
|
| 1974 |
+
},
|
| 1975 |
+
{
|
| 1976 |
+
"query": "Write a Go routine that computes the sum of squares concurrently.",
|
| 1977 |
+
"complexity": 0.54,
|
| 1978 |
+
"tier": "medium",
|
| 1979 |
+
"domain": "code",
|
| 1980 |
+
"model": "gpt-4o-mini"
|
| 1981 |
+
},
|
| 1982 |
+
{
|
| 1983 |
+
"query": "Translate 'The server is down' to Arabic",
|
| 1984 |
+
"complexity": 0.14,
|
| 1985 |
+
"tier": "easy",
|
| 1986 |
+
"domain": "translation",
|
| 1987 |
+
"model": "gemini-1.5-flash"
|
| 1988 |
+
},
|
| 1989 |
+
{
|
| 1990 |
+
"query": "Summarize the concept of 'reference counting' in memory management.",
|
| 1991 |
+
"complexity": 0.28,
|
| 1992 |
+
"tier": "easy",
|
| 1993 |
+
"domain": "summarization",
|
| 1994 |
+
"model": "claude-3-haiku-20240307"
|
| 1995 |
+
},
|
| 1996 |
+
{
|
| 1997 |
+
"query": "Solve `∫ x e^x dx`",
|
| 1998 |
+
"complexity": 0.43,
|
| 1999 |
+
"tier": "medium",
|
| 2000 |
+
"domain": "math",
|
| 2001 |
+
"model": "deepseek-chat"
|
| 2002 |
+
},
|
| 2003 |
+
{
|
| 2004 |
+
"query": "Explain why black holes evaporate (Hawking radiation).",
|
| 2005 |
+
"complexity": 0.61,
|
| 2006 |
+
"tier": "hard",
|
| 2007 |
+
"domain": "science",
|
| 2008 |
+
"model": "gpt-4o"
|
| 2009 |
+
},
|
| 2010 |
+
{
|
| 2011 |
+
"query": "Write a haiku about a segfault",
|
| 2012 |
+
"complexity": 0.22,
|
| 2013 |
+
"tier": "easy",
|
| 2014 |
+
"domain": "creative",
|
| 2015 |
+
"model": "llama3.2:3b"
|
| 2016 |
+
},
|
| 2017 |
+
{
|
| 2018 |
+
"query": "What is the difference between `malloc` and `calloc` in C?",
|
| 2019 |
+
"complexity": 0.17,
|
| 2020 |
+
"tier": "easy",
|
| 2021 |
+
"domain": "factual",
|
| 2022 |
+
"model": "gpt-3.5-turbo"
|
| 2023 |
+
},
|
| 2024 |
+
{
|
| 2025 |
+
"query": "Write a Python script to fetch JSON from an API and pretty‑print it.",
|
| 2026 |
+
"complexity": 0.24,
|
| 2027 |
+
"tier": "easy",
|
| 2028 |
+
"domain": "code",
|
| 2029 |
+
"model": "mistral-small-latest"
|
| 2030 |
+
},
|
| 2031 |
+
{
|
| 2032 |
+
"query": "Debug this React hook: `useEffect(() => { setCount(count+1) }, [])` – why infinite loop?",
|
| 2033 |
+
"complexity": 0.4,
|
| 2034 |
+
"tier": "medium",
|
| 2035 |
+
"domain": "code",
|
| 2036 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 2037 |
+
},
|
| 2038 |
+
{
|
| 2039 |
+
"query": "Implement a deep copy function for nested dictionaries in Python.",
|
| 2040 |
+
"complexity": 0.39,
|
| 2041 |
+
"tier": "medium",
|
| 2042 |
+
"domain": "code",
|
| 2043 |
+
"model": "gemini-1.5-pro"
|
| 2044 |
+
},
|
| 2045 |
+
{
|
| 2046 |
+
"query": "Explain the `volatile` keyword in Java.",
|
| 2047 |
+
"complexity": 0.44,
|
| 2048 |
+
"tier": "medium",
|
| 2049 |
+
"domain": "code",
|
| 2050 |
+
"model": "gpt-4o-mini"
|
| 2051 |
+
},
|
| 2052 |
+
{
|
| 2053 |
+
"query": "Write a SQL query to delete duplicate rows keeping one copy.",
|
| 2054 |
+
"complexity": 0.41,
|
| 2055 |
+
"tier": "medium",
|
| 2056 |
+
"domain": "code",
|
| 2057 |
+
"model": "llama3.1:70b"
|
| 2058 |
+
},
|
| 2059 |
+
{
|
| 2060 |
+
"query": "Why does this Python code raise `UnboundLocalError`? `x = 10; def foo(): print(x); x=5`",
|
| 2061 |
+
"complexity": 0.26,
|
| 2062 |
+
"tier": "easy",
|
| 2063 |
+
"domain": "code",
|
| 2064 |
+
"model": "deepseek-chat"
|
| 2065 |
+
},
|
| 2066 |
+
{
|
| 2067 |
+
"query": "Implement a simple key‑value store with TTL (time‑to‑live) in Python.",
|
| 2068 |
+
"complexity": 0.57,
|
| 2069 |
+
"tier": "hard",
|
| 2070 |
+
"domain": "code",
|
| 2071 |
+
"model": "gpt-4o"
|
| 2072 |
+
},
|
| 2073 |
+
{
|
| 2074 |
+
"query": "Translate 'The warranty is void if seal is broken' to Mandarin.",
|
| 2075 |
+
"complexity": 0.36,
|
| 2076 |
+
"tier": "medium",
|
| 2077 |
+
"domain": "translation",
|
| 2078 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 2079 |
+
},
|
| 2080 |
+
{
|
| 2081 |
+
"query": "Summarize the paper 'Attention is All You Need' in 5 bullet points.",
|
| 2082 |
+
"complexity": 0.49,
|
| 2083 |
+
"tier": "medium",
|
| 2084 |
+
"domain": "summarization",
|
| 2085 |
+
"model": "gemini-1.5-pro"
|
| 2086 |
+
},
|
| 2087 |
+
{
|
| 2088 |
+
"query": "Solve `det([[1,2],[3,4]])` (determinant).",
|
| 2089 |
+
"complexity": 0.18,
|
| 2090 |
+
"tier": "easy",
|
| 2091 |
+
"domain": "math",
|
| 2092 |
+
"model": "llama3.1:8b"
|
| 2093 |
+
},
|
| 2094 |
+
{
|
| 2095 |
+
"query": "Explain the chemical process of rusting.",
|
| 2096 |
+
"complexity": 0.23,
|
| 2097 |
+
"tier": "easy",
|
| 2098 |
+
"domain": "science",
|
| 2099 |
+
"model": "gpt-3.5-turbo"
|
| 2100 |
+
},
|
| 2101 |
+
{
|
| 2102 |
+
"query": "Write a limerick about a null pointer",
|
| 2103 |
+
"complexity": 0.29,
|
| 2104 |
+
"tier": "easy",
|
| 2105 |
+
"domain": "creative",
|
| 2106 |
+
"model": "claude-3-5-haiku-20241022"
|
| 2107 |
+
},
|
| 2108 |
+
{
|
| 2109 |
+
"query": "What is a JIT compiler? Give an example runtime.",
|
| 2110 |
+
"complexity": 0.34,
|
| 2111 |
+
"tier": "medium",
|
| 2112 |
+
"domain": "factual",
|
| 2113 |
+
"model": "mistral-large-latest"
|
| 2114 |
+
},
|
| 2115 |
+
{
|
| 2116 |
+
"query": "Write a C# method that reads a CSV file and returns a list of objects.",
|
| 2117 |
+
"complexity": 0.42,
|
| 2118 |
+
"tier": "medium",
|
| 2119 |
+
"domain": "code",
|
| 2120 |
+
"model": "deepseek-chat"
|
| 2121 |
+
},
|
| 2122 |
+
{
|
| 2123 |
+
"query": "Fix the JSON syntax error: `{name: 'John', age: 30}`",
|
| 2124 |
+
"complexity": 0.06,
|
| 2125 |
+
"tier": "trivial",
|
| 2126 |
+
"domain": "code",
|
| 2127 |
+
"model": "llama3.2:3b"
|
| 2128 |
+
},
|
| 2129 |
+
{
|
| 2130 |
+
"query": "Implement a rate limiter using the token bucket algorithm in Python.",
|
| 2131 |
+
"complexity": 0.65,
|
| 2132 |
+
"tier": "hard",
|
| 2133 |
+
"domain": "code",
|
| 2134 |
+
"model": "gpt-4o"
|
| 2135 |
+
},
|
| 2136 |
+
{
|
| 2137 |
+
"query": "Explain the output `(0 == '0')` vs `(0 === '0')` in JavaScript.",
|
| 2138 |
+
"complexity": 0.16,
|
| 2139 |
+
"tier": "easy",
|
| 2140 |
+
"domain": "code",
|
| 2141 |
+
"model": "gpt-4o-mini"
|
| 2142 |
+
},
|
| 2143 |
+
{
|
| 2144 |
+
"query": "Write a Bash script to find all `.log` files older than 7 days and compress them.",
|
| 2145 |
+
"complexity": 0.38,
|
| 2146 |
+
"tier": "medium",
|
| 2147 |
+
"domain": "code",
|
| 2148 |
+
"model": "gemini-1.5-flash"
|
| 2149 |
+
},
|
| 2150 |
+
{
|
| 2151 |
+
"query": "Debug this Python multiprocessing code that hangs: `p = Pool(); p.map(f, range(10)); p.close()` (missing join).",
|
| 2152 |
+
"complexity": 0.33,
|
| 2153 |
+
"tier": "medium",
|
| 2154 |
+
"domain": "code",
|
| 2155 |
+
"model": "claude-3-haiku-20240307"
|
| 2156 |
+
},
|
| 2157 |
+
{
|
| 2158 |
+
"query": "Write a regular expression to match a valid IPv4 address.",
|
| 2159 |
+
"complexity": 0.48,
|
| 2160 |
+
"tier": "medium",
|
| 2161 |
+
"domain": "code",
|
| 2162 |
+
"model": "llama3.1:70b"
|
| 2163 |
+
},
|
| 2164 |
+
{
|
| 2165 |
+
"query": "Translate 'Please sign here' to Russian.",
|
| 2166 |
+
"complexity": 0.11,
|
| 2167 |
+
"tier": "easy",
|
| 2168 |
+
"domain": "translation",
|
| 2169 |
+
"model": "mistral-small-latest"
|
| 2170 |
+
},
|
| 2171 |
+
{
|
| 2172 |
+
"query": "Summarize the concept of 'copy‑on‑write' in operating systems.",
|
| 2173 |
+
"complexity": 0.37,
|
| 2174 |
+
"tier": "medium",
|
| 2175 |
+
"domain": "summarization",
|
| 2176 |
+
"model": "deepseek-chat"
|
| 2177 |
+
},
|
| 2178 |
+
{
|
| 2179 |
+
"query": "Solve `dy/dx = y` with initial condition y(0)=1.",
|
| 2180 |
+
"complexity": 0.3,
|
| 2181 |
+
"tier": "medium",
|
| 2182 |
+
"domain": "math",
|
| 2183 |
+
"model": "gpt-4o"
|
| 2184 |
+
},
|
| 2185 |
+
{
|
| 2186 |
+
"query": "Explain the Doppler effect with an example.",
|
| 2187 |
+
"complexity": 0.27,
|
| 2188 |
+
"tier": "easy",
|
| 2189 |
+
"domain": "science",
|
| 2190 |
+
"model": "gemini-1.5-pro"
|
| 2191 |
+
},
|
| 2192 |
+
{
|
| 2193 |
+
"query": "Write a tanka (5 lines) about a broken build pipeline.",
|
| 2194 |
+
"complexity": 0.35,
|
| 2195 |
+
"tier": "medium",
|
| 2196 |
+
"domain": "creative",
|
| 2197 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 2198 |
+
},
|
| 2199 |
+
{
|
| 2200 |
+
"query": "What is the difference between `INNER JOIN` and `LEFT JOIN`?",
|
| 2201 |
+
"complexity": 0.21,
|
| 2202 |
+
"tier": "easy",
|
| 2203 |
+
"domain": "factual",
|
| 2204 |
+
"model": "gpt-3.5-turbo"
|
| 2205 |
+
},
|
| 2206 |
+
{
|
| 2207 |
+
"query": "Write a Python script that watches a directory for new files and processes them.",
|
| 2208 |
+
"complexity": 0.56,
|
| 2209 |
+
"tier": "medium",
|
| 2210 |
+
"domain": "code",
|
| 2211 |
+
"model": "llama3.1:8b"
|
| 2212 |
+
},
|
| 2213 |
+
{
|
| 2214 |
+
"query": "Fix the off‑by‑one error: `for i in range(1, len(arr)): if arr[i] > arr[i-1]:` – correct.",
|
| 2215 |
+
"complexity": 0.12,
|
| 2216 |
+
"tier": "easy",
|
| 2217 |
+
"domain": "code",
|
| 2218 |
+
"model": "gpt-4o-mini"
|
| 2219 |
+
},
|
| 2220 |
+
{
|
| 2221 |
+
"query": "Implement a Bloom filter in Python (simple version).",
|
| 2222 |
+
"complexity": 0.6,
|
| 2223 |
+
"tier": "hard",
|
| 2224 |
+
"domain": "code",
|
| 2225 |
+
"model": "deepseek-chat"
|
| 2226 |
+
},
|
| 2227 |
+
{
|
| 2228 |
+
"query": "Explain why `'5' - 3` works but `'5' + 3` gives different results in JavaScript.",
|
| 2229 |
+
"complexity": 0.31,
|
| 2230 |
+
"tier": "medium",
|
| 2231 |
+
"domain": "code",
|
| 2232 |
+
"model": "claude-3-5-haiku-20241022"
|
| 2233 |
+
},
|
| 2234 |
+
{
|
| 2235 |
+
"query": "Write a Golang function that reads a file line by line.",
|
| 2236 |
+
"complexity": 0.26,
|
| 2237 |
+
"tier": "easy",
|
| 2238 |
+
"domain": "code",
|
| 2239 |
+
"model": "gemini-1.5-flash"
|
| 2240 |
+
},
|
| 2241 |
+
{
|
| 2242 |
+
"query": "Translate 'This is a confidential document' to German.",
|
| 2243 |
+
"complexity": 0.2,
|
| 2244 |
+
"tier": "easy",
|
| 2245 |
+
"domain": "translation",
|
| 2246 |
+
"model": "mistral-large-latest"
|
| 2247 |
+
},
|
| 2248 |
+
{
|
| 2249 |
+
"query": "Summarize the key differences between TCP and UDP.",
|
| 2250 |
+
"complexity": 0.24,
|
| 2251 |
+
"tier": "easy",
|
| 2252 |
+
"domain": "summarization",
|
| 2253 |
+
"model": "gpt-4o"
|
| 2254 |
+
},
|
| 2255 |
+
{
|
| 2256 |
+
"query": "Solve the eigenvalue problem for matrix `[[2, 1], [1, 2]]`.",
|
| 2257 |
+
"complexity": 0.53,
|
| 2258 |
+
"tier": "medium",
|
| 2259 |
+
"domain": "math",
|
| 2260 |
+
"model": "llama3.1:70b"
|
| 2261 |
+
},
|
| 2262 |
+
{
|
| 2263 |
+
"query": "Explain the double‑slit experiment in quantum mechanics.",
|
| 2264 |
+
"complexity": 0.58,
|
| 2265 |
+
"tier": "hard",
|
| 2266 |
+
"domain": "science",
|
| 2267 |
+
"model": "deepseek-chat"
|
| 2268 |
+
},
|
| 2269 |
+
{
|
| 2270 |
+
"query": "Write a sonnet about a race condition (14 lines).",
|
| 2271 |
+
"complexity": 0.63,
|
| 2272 |
+
"tier": "hard",
|
| 2273 |
+
"domain": "creative",
|
| 2274 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 2275 |
+
},
|
| 2276 |
+
{
|
| 2277 |
+
"query": "What is the purpose of `__slots__` in Python?",
|
| 2278 |
+
"complexity": 0.41,
|
| 2279 |
+
"tier": "medium",
|
| 2280 |
+
"domain": "factual",
|
| 2281 |
+
"model": "gpt-4o-mini"
|
| 2282 |
+
},
|
| 2283 |
+
{
|
| 2284 |
+
"query": "Write a Rust function that takes a string and returns the first word.",
|
| 2285 |
+
"complexity": 0.32,
|
| 2286 |
+
"tier": "medium",
|
| 2287 |
+
"domain": "code",
|
| 2288 |
+
"model": "gemini-1.5-pro"
|
| 2289 |
+
},
|
| 2290 |
+
{
|
| 2291 |
+
"query": "Fix the SQL injection in this PHP code: `$query = \"SELECT * FROM users WHERE id = $_GET[id]\";`",
|
| 2292 |
+
"complexity": 0.29,
|
| 2293 |
+
"tier": "easy",
|
| 2294 |
+
"domain": "code",
|
| 2295 |
+
"model": "gpt-3.5-turbo"
|
| 2296 |
+
},
|
| 2297 |
+
{
|
| 2298 |
+
"query": "Implement a priority queue using a binary heap in Java.",
|
| 2299 |
+
"complexity": 0.55,
|
| 2300 |
+
"tier": "medium",
|
| 2301 |
+
"domain": "code",
|
| 2302 |
+
"model": "deepseek-chat"
|
| 2303 |
+
},
|
| 2304 |
+
{
|
| 2305 |
+
"query": "Why does `a = []; a.append(a)` cause infinite recursion when printed? Explain.",
|
| 2306 |
+
"complexity": 0.34,
|
| 2307 |
+
"tier": "medium",
|
| 2308 |
+
"domain": "code",
|
| 2309 |
+
"model": "llama3.1:8b"
|
| 2310 |
+
},
|
| 2311 |
+
{
|
| 2312 |
+
"query": "Write a JavaScript function that deep freezes an object.",
|
| 2313 |
+
"complexity": 0.46,
|
| 2314 |
+
"tier": "medium",
|
| 2315 |
+
"domain": "code",
|
| 2316 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 2317 |
+
},
|
| 2318 |
+
{
|
| 2319 |
+
"query": "Translate 'Temperature exceeds safe limit' to Japanese.",
|
| 2320 |
+
"complexity": 0.25,
|
| 2321 |
+
"tier": "easy",
|
| 2322 |
+
"domain": "translation",
|
| 2323 |
+
"model": "gpt-4o"
|
| 2324 |
+
},
|
| 2325 |
+
{
|
| 2326 |
+
"query": "Summarize the plot of 'The Metamorphosis' by Kafka.",
|
| 2327 |
+
"complexity": 0.33,
|
| 2328 |
+
"tier": "medium",
|
| 2329 |
+
"domain": "summarization",
|
| 2330 |
+
"model": "gemini-1.5-flash"
|
| 2331 |
+
},
|
| 2332 |
+
{
|
| 2333 |
+
"query": "Solve `lim x→0 (sin x)/x`.",
|
| 2334 |
+
"complexity": 0.22,
|
| 2335 |
+
"tier": "easy",
|
| 2336 |
+
"domain": "math",
|
| 2337 |
+
"model": "mistral-small-latest"
|
| 2338 |
+
},
|
| 2339 |
+
{
|
| 2340 |
+
"query": "Explain why the sky appears blue (Rayleigh scattering).",
|
| 2341 |
+
"complexity": 0.28,
|
| 2342 |
+
"tier": "easy",
|
| 2343 |
+
"domain": "science",
|
| 2344 |
+
"model": "claude-3-haiku-20240307"
|
| 2345 |
+
},
|
| 2346 |
+
{
|
| 2347 |
+
"query": "Write a seven‑line poem about a memory leak.",
|
| 2348 |
+
"complexity": 0.38,
|
| 2349 |
+
"tier": "medium",
|
| 2350 |
+
"domain": "creative",
|
| 2351 |
+
"model": "llama3.2:3b"
|
| 2352 |
+
},
|
| 2353 |
+
{
|
| 2354 |
+
"query": "What is the difference between a thread and a coroutine?",
|
| 2355 |
+
"complexity": 0.43,
|
| 2356 |
+
"tier": "medium",
|
| 2357 |
+
"domain": "factual",
|
| 2358 |
+
"model": "gpt-4o-mini"
|
| 2359 |
+
},
|
| 2360 |
+
{
|
| 2361 |
+
"query": "Write a Python script that converts a CSV to JSON.",
|
| 2362 |
+
"complexity": 0.31,
|
| 2363 |
+
"tier": "medium",
|
| 2364 |
+
"domain": "code",
|
| 2365 |
+
"model": "deepseek-chat"
|
| 2366 |
+
},
|
| 2367 |
+
{
|
| 2368 |
+
"query": "Fix the race condition in this Python async code: `async def f(): global x; x+=1` (use lock).",
|
| 2369 |
+
"complexity": 0.51,
|
| 2370 |
+
"tier": "medium",
|
| 2371 |
+
"domain": "code",
|
| 2372 |
+
"model": "gpt-4o"
|
| 2373 |
+
},
|
| 2374 |
+
{
|
| 2375 |
+
"query": "Translate 'The quick brown fox jumps over the lazy dog' to Spanish.",
|
| 2376 |
+
"complexity": 0.12,
|
| 2377 |
+
"tier": "easy",
|
| 2378 |
+
"domain": "translation",
|
| 2379 |
+
"model": "gpt-3.5-turbo"
|
| 2380 |
+
},
|
| 2381 |
+
{
|
| 2382 |
+
"query": "Paraphrase this sentence: 'Despite the rain, the event was a huge success.'",
|
| 2383 |
+
"complexity": 0.18,
|
| 2384 |
+
"tier": "easy",
|
| 2385 |
+
"domain": "translation",
|
| 2386 |
+
"model": "claude-3-haiku-20240307"
|
| 2387 |
+
},
|
| 2388 |
+
{
|
| 2389 |
+
"query": "Translate 'Where is the nearest metro station?' to French.",
|
| 2390 |
+
"complexity": 0.1,
|
| 2391 |
+
"tier": "trivial",
|
| 2392 |
+
"domain": "translation",
|
| 2393 |
+
"model": "llama3.2:3b"
|
| 2394 |
+
},
|
| 2395 |
+
{
|
| 2396 |
+
"query": "Correct the grammar: 'He don't know nothing about that.'",
|
| 2397 |
+
"complexity": 0.09,
|
| 2398 |
+
"tier": "trivial",
|
| 2399 |
+
"domain": "translation",
|
| 2400 |
+
"model": "gemini-1.5-flash"
|
| 2401 |
+
},
|
| 2402 |
+
{
|
| 2403 |
+
"query": "Translate 'I would like to order a vegetarian pizza' to Italian.",
|
| 2404 |
+
"complexity": 0.14,
|
| 2405 |
+
"tier": "easy",
|
| 2406 |
+
"domain": "translation",
|
| 2407 |
+
"model": "mistral-small-latest"
|
| 2408 |
+
},
|
| 2409 |
+
{
|
| 2410 |
+
"query": "Rewrite this in formal English: 'Hey, can you send me the doc ASAP?'",
|
| 2411 |
+
"complexity": 0.16,
|
| 2412 |
+
"tier": "easy",
|
| 2413 |
+
"domain": "translation",
|
| 2414 |
+
"model": "gpt-4o-mini"
|
| 2415 |
+
},
|
| 2416 |
+
{
|
| 2417 |
+
"query": "Translate 'The system is under maintenance' to German.",
|
| 2418 |
+
"complexity": 0.11,
|
| 2419 |
+
"tier": "easy",
|
| 2420 |
+
"domain": "translation",
|
| 2421 |
+
"model": "llama3.1:8b"
|
| 2422 |
+
},
|
| 2423 |
+
{
|
| 2424 |
+
"query": "Convert this passive voice to active: 'The report was written by John.'",
|
| 2425 |
+
"complexity": 0.08,
|
| 2426 |
+
"tier": "trivial",
|
| 2427 |
+
"domain": "translation",
|
| 2428 |
+
"model": "claude-3-5-haiku-20241022"
|
| 2429 |
+
},
|
| 2430 |
+
{
|
| 2431 |
+
"query": "Translate 'What is your name?' to Mandarin Chinese (pinyin).",
|
| 2432 |
+
"complexity": 0.13,
|
| 2433 |
+
"tier": "easy",
|
| 2434 |
+
"domain": "translation",
|
| 2435 |
+
"model": "deepseek-chat"
|
| 2436 |
+
},
|
| 2437 |
+
{
|
| 2438 |
+
"query": "Paraphrase the idiom 'It's raining cats and dogs' into plain English.",
|
| 2439 |
+
"complexity": 0.15,
|
| 2440 |
+
"tier": "easy",
|
| 2441 |
+
"domain": "translation",
|
| 2442 |
+
"model": "gpt-4o"
|
| 2443 |
+
},
|
| 2444 |
+
{
|
| 2445 |
+
"query": "Translate 'Please call back later' to Japanese.",
|
| 2446 |
+
"complexity": 0.15,
|
| 2447 |
+
"tier": "easy",
|
| 2448 |
+
"domain": "translation",
|
| 2449 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 2450 |
+
},
|
| 2451 |
+
{
|
| 2452 |
+
"query": "Change this sentence to past tense: 'I go to the gym every day.'",
|
| 2453 |
+
"complexity": 0.06,
|
| 2454 |
+
"tier": "trivial",
|
| 2455 |
+
"domain": "translation",
|
| 2456 |
+
"model": "gemini-1.5-pro"
|
| 2457 |
+
},
|
| 2458 |
+
{
|
| 2459 |
+
"query": "Translate 'The price does not include tax' to Portuguese.",
|
| 2460 |
+
"complexity": 0.12,
|
| 2461 |
+
"tier": "easy",
|
| 2462 |
+
"domain": "translation",
|
| 2463 |
+
"model": "mistral-large-latest"
|
| 2464 |
+
},
|
| 2465 |
+
{
|
| 2466 |
+
"query": "Summarize this paragraph in 10 words: 'Machine learning is a subset of artificial intelligence that enables systems to learn from data.'",
|
| 2467 |
+
"complexity": 0.19,
|
| 2468 |
+
"tier": "easy",
|
| 2469 |
+
"domain": "summarization",
|
| 2470 |
+
"model": "llama3.1:70b"
|
| 2471 |
+
},
|
| 2472 |
+
{
|
| 2473 |
+
"query": "Translate 'I have a meeting at 3 PM' to Russian.",
|
| 2474 |
+
"complexity": 0.14,
|
| 2475 |
+
"tier": "easy",
|
| 2476 |
+
"domain": "translation",
|
| 2477 |
+
"model": "gpt-3.5-turbo"
|
| 2478 |
+
},
|
| 2479 |
+
{
|
| 2480 |
+
"query": "Explain the difference between 'affect' and 'effect' with examples.",
|
| 2481 |
+
"complexity": 0.17,
|
| 2482 |
+
"tier": "easy",
|
| 2483 |
+
"domain": "factual",
|
| 2484 |
+
"model": "claude-3-haiku-20240307"
|
| 2485 |
+
},
|
| 2486 |
+
{
|
| 2487 |
+
"query": "Translate 'Your session has expired' to Arabic.",
|
| 2488 |
+
"complexity": 0.16,
|
| 2489 |
+
"tier": "easy",
|
| 2490 |
+
"domain": "translation",
|
| 2491 |
+
"model": "deepseek-chat"
|
| 2492 |
+
},
|
| 2493 |
+
{
|
| 2494 |
+
"query": "Rewrite this sentence more concisely: 'Due to the fact that it was raining, we canceled the picnic.'",
|
| 2495 |
+
"complexity": 0.11,
|
| 2496 |
+
"tier": "easy",
|
| 2497 |
+
"domain": "translation",
|
| 2498 |
+
"model": "gemini-1.5-flash"
|
| 2499 |
+
},
|
| 2500 |
+
{
|
| 2501 |
+
"query": "Translate 'Can you help me with this problem?' to Hindi (Romanized).",
|
| 2502 |
+
"complexity": 0.18,
|
| 2503 |
+
"tier": "easy",
|
| 2504 |
+
"domain": "translation",
|
| 2505 |
+
"model": "gpt-4o-mini"
|
| 2506 |
+
},
|
| 2507 |
+
{
|
| 2508 |
+
"query": "Change this to a question: 'She knows the answer.'",
|
| 2509 |
+
"complexity": 0.05,
|
| 2510 |
+
"tier": "trivial",
|
| 2511 |
+
"domain": "translation",
|
| 2512 |
+
"model": "llama3.2:3b"
|
| 2513 |
+
},
|
| 2514 |
+
{
|
| 2515 |
+
"query": "Translate 'The file is corrupted' to Korean.",
|
| 2516 |
+
"complexity": 0.15,
|
| 2517 |
+
"tier": "easy",
|
| 2518 |
+
"domain": "translation",
|
| 2519 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 2520 |
+
},
|
| 2521 |
+
{
|
| 2522 |
+
"query": "Paraphrase this technical sentence: 'Authentication via OAuth 2.0 provides delegated access.'",
|
| 2523 |
+
"complexity": 0.34,
|
| 2524 |
+
"tier": "medium",
|
| 2525 |
+
"domain": "translation",
|
| 2526 |
+
"model": "gpt-4o"
|
| 2527 |
+
},
|
| 2528 |
+
{
|
| 2529 |
+
"query": "Translate a short poem from English to French (preserve rhyme).",
|
| 2530 |
+
"complexity": 0.45,
|
| 2531 |
+
"tier": "medium",
|
| 2532 |
+
"domain": "translation",
|
| 2533 |
+
"model": "mistral-large-latest"
|
| 2534 |
+
},
|
| 2535 |
+
{
|
| 2536 |
+
"query": "Detect the language of this text: 'Bonjour, comment ça va?' and translate to English.",
|
| 2537 |
+
"complexity": 0.2,
|
| 2538 |
+
"tier": "easy",
|
| 2539 |
+
"domain": "translation",
|
| 2540 |
+
"model": "gemini-1.5-pro"
|
| 2541 |
+
},
|
| 2542 |
+
{
|
| 2543 |
+
"query": "Write a grammatically correct sentence using 'their', 'there', and 'they're'.",
|
| 2544 |
+
"complexity": 0.12,
|
| 2545 |
+
"tier": "easy",
|
| 2546 |
+
"domain": "creative",
|
| 2547 |
+
"model": "llama3.1:8b"
|
| 2548 |
+
},
|
| 2549 |
+
{
|
| 2550 |
+
"query": "Translate 'The application has encountered an unexpected error' to German.",
|
| 2551 |
+
"complexity": 0.22,
|
| 2552 |
+
"tier": "easy",
|
| 2553 |
+
"domain": "translation",
|
| 2554 |
+
"model": "gpt-3.5-turbo"
|
| 2555 |
+
},
|
| 2556 |
+
{
|
| 2557 |
+
"query": "Rewrite this in the imperative mood: 'You should read the instructions carefully.'",
|
| 2558 |
+
"complexity": 0.09,
|
| 2559 |
+
"tier": "trivial",
|
| 2560 |
+
"domain": "translation",
|
| 2561 |
+
"model": "claude-3-haiku-20240307"
|
| 2562 |
+
},
|
| 2563 |
+
{
|
| 2564 |
+
"query": "Translate 'I apologize for the inconvenience' to Spanish.",
|
| 2565 |
+
"complexity": 0.13,
|
| 2566 |
+
"tier": "easy",
|
| 2567 |
+
"domain": "translation",
|
| 2568 |
+
"model": "mistral-small-latest"
|
| 2569 |
+
},
|
| 2570 |
+
{
|
| 2571 |
+
"query": "Convert this direct speech to indirect: He said he would come tomorrow.",
|
| 2572 |
+
"complexity": 0.16,
|
| 2573 |
+
"tier": "easy",
|
| 2574 |
+
"domain": "translation",
|
| 2575 |
+
"model": "deepseek-chat"
|
| 2576 |
+
},
|
| 2577 |
+
{
|
| 2578 |
+
"query": "Translate 'Please proceed to gate B12' to Japanese.",
|
| 2579 |
+
"complexity": 0.17,
|
| 2580 |
+
"tier": "easy",
|
| 2581 |
+
"domain": "translation",
|
| 2582 |
+
"model": "gpt-4o"
|
| 2583 |
+
},
|
| 2584 |
+
{
|
| 2585 |
+
"query": "Simplify this legalese: 'The party of the first part shall indemnify the party of the second part.'",
|
| 2586 |
+
"complexity": 0.41,
|
| 2587 |
+
"tier": "medium",
|
| 2588 |
+
"domain": "translation",
|
| 2589 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 2590 |
+
},
|
| 2591 |
+
{
|
| 2592 |
+
"query": "Translate 'The deadline is Friday at 5 PM' to Italian.",
|
| 2593 |
+
"complexity": 0.14,
|
| 2594 |
+
"tier": "easy",
|
| 2595 |
+
"domain": "translation",
|
| 2596 |
+
"model": "gemini-1.5-flash"
|
| 2597 |
+
},
|
| 2598 |
+
{
|
| 2599 |
+
"query": "Explain what a 'malapropism' is and give an example.",
|
| 2600 |
+
"complexity": 0.26,
|
| 2601 |
+
"tier": "easy",
|
| 2602 |
+
"domain": "factual",
|
| 2603 |
+
"model": "llama3.1:70b"
|
| 2604 |
+
},
|
| 2605 |
+
{
|
| 2606 |
+
"query": "Translate a business email subject: 'Q3 Financial Results Attached' to French.",
|
| 2607 |
+
"complexity": 0.18,
|
| 2608 |
+
"tier": "easy",
|
| 2609 |
+
"domain": "translation",
|
| 2610 |
+
"model": "gpt-4o-mini"
|
| 2611 |
+
},
|
| 2612 |
+
{
|
| 2613 |
+
"query": "Change this to future perfect tense: 'I finish the project.'",
|
| 2614 |
+
"complexity": 0.1,
|
| 2615 |
+
"tier": "trivial",
|
| 2616 |
+
"domain": "translation",
|
| 2617 |
+
"model": "llama3.2:3b"
|
| 2618 |
+
},
|
| 2619 |
+
{
|
| 2620 |
+
"query": "Translate 'The server will reboot in 5 minutes' to Portuguese.",
|
| 2621 |
+
"complexity": 0.15,
|
| 2622 |
+
"tier": "easy",
|
| 2623 |
+
"domain": "translation",
|
| 2624 |
+
"model": "mistral-large-latest"
|
| 2625 |
+
},
|
| 2626 |
+
{
|
| 2627 |
+
"query": "Paraphrase this scientific abstract (2 sentences) for a general audience.",
|
| 2628 |
+
"complexity": 0.39,
|
| 2629 |
+
"tier": "medium",
|
| 2630 |
+
"domain": "summarization",
|
| 2631 |
+
"model": "deepseek-chat"
|
| 2632 |
+
},
|
| 2633 |
+
{
|
| 2634 |
+
"query": "Translate 'Your payment was successful' to Russian.",
|
| 2635 |
+
"complexity": 0.13,
|
| 2636 |
+
"tier": "easy",
|
| 2637 |
+
"domain": "translation",
|
| 2638 |
+
"model": "gpt-3.5-turbo"
|
| 2639 |
+
},
|
| 2640 |
+
{
|
| 2641 |
+
"query": "Rewrite this sentence in the negative form: 'Everyone attended the meeting.'",
|
| 2642 |
+
"complexity": 0.07,
|
| 2643 |
+
"tier": "trivial",
|
| 2644 |
+
"domain": "translation",
|
| 2645 |
+
"model": "claude-3-5-haiku-20241022"
|
| 2646 |
+
},
|
| 2647 |
+
{
|
| 2648 |
+
"query": "Translate 'Please do not touch the glass' to Mandarin.",
|
| 2649 |
+
"complexity": 0.14,
|
| 2650 |
+
"tier": "easy",
|
| 2651 |
+
"domain": "translation",
|
| 2652 |
+
"model": "gemini-1.5-pro"
|
| 2653 |
+
},
|
| 2654 |
+
{
|
| 2655 |
+
"query": "Correct the spelling: 'recieve', 'seperate', 'definately'",
|
| 2656 |
+
"complexity": 0.06,
|
| 2657 |
+
"tier": "trivial",
|
| 2658 |
+
"domain": "translation",
|
| 2659 |
+
"model": "llama3.1:8b"
|
| 2660 |
+
},
|
| 2661 |
+
{
|
| 2662 |
+
"query": "Translate 'The operation was completed successfully' to Arabic.",
|
| 2663 |
+
"complexity": 0.18,
|
| 2664 |
+
"tier": "easy",
|
| 2665 |
+
"domain": "translation",
|
| 2666 |
+
"model": "gpt-4o"
|
| 2667 |
+
},
|
| 2668 |
+
{
|
| 2669 |
+
"query": "Convert this bullet list into a coherent paragraph.",
|
| 2670 |
+
"complexity": 0.28,
|
| 2671 |
+
"tier": "easy",
|
| 2672 |
+
"domain": "summarization",
|
| 2673 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 2674 |
+
},
|
| 2675 |
+
{
|
| 2676 |
+
"query": "Translate 'What time does the store close?' to German.",
|
| 2677 |
+
"complexity": 0.11,
|
| 2678 |
+
"tier": "easy",
|
| 2679 |
+
"domain": "translation",
|
| 2680 |
+
"model": "mistral-small-latest"
|
| 2681 |
+
},
|
| 2682 |
+
{
|
| 2683 |
+
"query": "Explain the difference between 'who' and 'whom' with examples.",
|
| 2684 |
+
"complexity": 0.22,
|
| 2685 |
+
"tier": "easy",
|
| 2686 |
+
"domain": "factual",
|
| 2687 |
+
"model": "deepseek-chat"
|
| 2688 |
+
},
|
| 2689 |
+
{
|
| 2690 |
+
"query": "Translate 'This feature is not yet implemented' to Spanish.",
|
| 2691 |
+
"complexity": 0.16,
|
| 2692 |
+
"tier": "easy",
|
| 2693 |
+
"domain": "translation",
|
| 2694 |
+
"model": "gpt-4o-mini"
|
| 2695 |
+
},
|
| 2696 |
+
{
|
| 2697 |
+
"query": "Rewrite this sentence using a simile: 'Her voice was loud.'",
|
| 2698 |
+
"complexity": 0.2,
|
| 2699 |
+
"tier": "easy",
|
| 2700 |
+
"domain": "creative",
|
| 2701 |
+
"model": "gemini-1.5-flash"
|
| 2702 |
+
},
|
| 2703 |
+
{
|
| 2704 |
+
"query": "Translate 'Please verify your email address' to Japanese.",
|
| 2705 |
+
"complexity": 0.17,
|
| 2706 |
+
"tier": "easy",
|
| 2707 |
+
"domain": "translation",
|
| 2708 |
+
"model": "llama3.2:3b"
|
| 2709 |
+
},
|
| 2710 |
+
{
|
| 2711 |
+
"query": "Change this from first person to third person: 'I think this solution is optimal.'",
|
| 2712 |
+
"complexity": 0.15,
|
| 2713 |
+
"tier": "easy",
|
| 2714 |
+
"domain": "translation",
|
| 2715 |
+
"model": "gpt-3.5-turbo"
|
| 2716 |
+
},
|
| 2717 |
+
{
|
| 2718 |
+
"query": "Translate a chat message: 'BRB, gonna grab coffee' to formal English.",
|
| 2719 |
+
"complexity": 0.12,
|
| 2720 |
+
"tier": "easy",
|
| 2721 |
+
"domain": "translation",
|
| 2722 |
+
"model": "claude-3-haiku-20240307"
|
| 2723 |
+
},
|
| 2724 |
+
{
|
| 2725 |
+
"query": "Paraphrase 'The new update includes several security patches' without changing meaning.",
|
| 2726 |
+
"complexity": 0.19,
|
| 2727 |
+
"tier": "easy",
|
| 2728 |
+
"domain": "translation",
|
| 2729 |
+
"model": "mistral-large-latest"
|
| 2730 |
+
},
|
| 2731 |
+
{
|
| 2732 |
+
"query": "Translate 'The connection has timed out' to Italian.",
|
| 2733 |
+
"complexity": 0.15,
|
| 2734 |
+
"tier": "easy",
|
| 2735 |
+
"domain": "translation",
|
| 2736 |
+
"model": "gpt-4o"
|
| 2737 |
+
},
|
| 2738 |
+
{
|
| 2739 |
+
"query": "Write a haiku about the translator's dilemma.",
|
| 2740 |
+
"complexity": 0.31,
|
| 2741 |
+
"tier": "medium",
|
| 2742 |
+
"domain": "creative",
|
| 2743 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 2744 |
+
},
|
| 2745 |
+
{
|
| 2746 |
+
"query": "Translate 'Access denied' to French.",
|
| 2747 |
+
"complexity": 0.06,
|
| 2748 |
+
"tier": "trivial",
|
| 2749 |
+
"domain": "translation",
|
| 2750 |
+
"model": "gemini-1.5-pro"
|
| 2751 |
+
},
|
| 2752 |
+
{
|
| 2753 |
+
"query": "Explain the concept of 'cognates' in linguistics with examples.",
|
| 2754 |
+
"complexity": 0.35,
|
| 2755 |
+
"tier": "medium",
|
| 2756 |
+
"domain": "science",
|
| 2757 |
+
"model": "deepseek-chat"
|
| 2758 |
+
},
|
| 2759 |
+
{
|
| 2760 |
+
"query": "Translate a short warning: 'High voltage, risk of electric shock' to Spanish.",
|
| 2761 |
+
"complexity": 0.18,
|
| 2762 |
+
"tier": "easy",
|
| 2763 |
+
"domain": "translation",
|
| 2764 |
+
"model": "llama3.1:70b"
|
| 2765 |
+
},
|
| 2766 |
+
{
|
| 2767 |
+
"query": "Rewrite this in colloquial English: 'I am extremely fatigued.'",
|
| 2768 |
+
"complexity": 0.1,
|
| 2769 |
+
"tier": "trivial",
|
| 2770 |
+
"domain": "translation",
|
| 2771 |
+
"model": "gpt-4o-mini"
|
| 2772 |
+
},
|
| 2773 |
+
{
|
| 2774 |
+
"query": "Translate 'The package will arrive within 2 business days' to German.",
|
| 2775 |
+
"complexity": 0.19,
|
| 2776 |
+
"tier": "easy",
|
| 2777 |
+
"domain": "translation",
|
| 2778 |
+
"model": "mistral-small-latest"
|
| 2779 |
+
},
|
| 2780 |
+
{
|
| 2781 |
+
"query": "Change this sentence to exclamatory: 'It is a beautiful day.'",
|
| 2782 |
+
"complexity": 0.05,
|
| 2783 |
+
"tier": "trivial",
|
| 2784 |
+
"domain": "translation",
|
| 2785 |
+
"model": "claude-3-5-haiku-20241022"
|
| 2786 |
+
},
|
| 2787 |
+
{
|
| 2788 |
+
"query": "Translate 'Please enter your password' to Korean.",
|
| 2789 |
+
"complexity": 0.13,
|
| 2790 |
+
"tier": "easy",
|
| 2791 |
+
"domain": "translation",
|
| 2792 |
+
"model": "gpt-3.5-turbo"
|
| 2793 |
+
},
|
| 2794 |
+
{
|
| 2795 |
+
"query": "Summarize the difference between British and American English spelling (5 examples).",
|
| 2796 |
+
"complexity": 0.27,
|
| 2797 |
+
"tier": "easy",
|
| 2798 |
+
"domain": "summarization",
|
| 2799 |
+
"model": "gemini-1.5-flash"
|
| 2800 |
+
},
|
| 2801 |
+
{
|
| 2802 |
+
"query": "Translate 'I don't understand this instruction' to Portuguese.",
|
| 2803 |
+
"complexity": 0.14,
|
| 2804 |
+
"tier": "easy",
|
| 2805 |
+
"domain": "translation",
|
| 2806 |
+
"model": "llama3.1:8b"
|
| 2807 |
+
},
|
| 2808 |
+
{
|
| 2809 |
+
"query": "Paraphrase this corporate jargon: 'We'll circle back on that action item.'",
|
| 2810 |
+
"complexity": 0.22,
|
| 2811 |
+
"tier": "easy",
|
| 2812 |
+
"domain": "translation",
|
| 2813 |
+
"model": "gpt-4o"
|
| 2814 |
+
},
|
| 2815 |
+
{
|
| 2816 |
+
"query": "Translate 'Your account has been locked' to Arabic.",
|
| 2817 |
+
"complexity": 0.18,
|
| 2818 |
+
"tier": "easy",
|
| 2819 |
+
"domain": "translation",
|
| 2820 |
+
"model": "deepseek-chat"
|
| 2821 |
+
},
|
| 2822 |
+
{
|
| 2823 |
+
"query": "Write a sentence using the word 'ubiquitous' correctly.",
|
| 2824 |
+
"complexity": 0.12,
|
| 2825 |
+
"tier": "easy",
|
| 2826 |
+
"domain": "creative",
|
| 2827 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 2828 |
+
},
|
| 2829 |
+
{
|
| 2830 |
+
"query": "Translate 'The meeting was rescheduled to Monday' to Italian.",
|
| 2831 |
+
"complexity": 0.16,
|
| 2832 |
+
"tier": "easy",
|
| 2833 |
+
"domain": "translation",
|
| 2834 |
+
"model": "mistral-large-latest"
|
| 2835 |
+
},
|
| 2836 |
+
{
|
| 2837 |
+
"query": "Change this from active to passive: 'The chef cooked a delicious meal.'",
|
| 2838 |
+
"complexity": 0.08,
|
| 2839 |
+
"tier": "trivial",
|
| 2840 |
+
"domain": "translation",
|
| 2841 |
+
"model": "gpt-4o-mini"
|
| 2842 |
+
},
|
| 2843 |
+
{
|
| 2844 |
+
"query": "Translate 'Please wait while we process your request' to Japanese.",
|
| 2845 |
+
"complexity": 0.2,
|
| 2846 |
+
"tier": "easy",
|
| 2847 |
+
"domain": "translation",
|
| 2848 |
+
"model": "gemini-1.5-pro"
|
| 2849 |
+
},
|
| 2850 |
+
{
|
| 2851 |
+
"query": "Explain the term 'portmanteau' and give three examples.",
|
| 2852 |
+
"complexity": 0.25,
|
| 2853 |
+
"tier": "easy",
|
| 2854 |
+
"domain": "factual",
|
| 2855 |
+
"model": "llama3.2:3b"
|
| 2856 |
+
},
|
| 2857 |
+
{
|
| 2858 |
+
"query": "Translate a legal disclaimer: 'Not responsible for lost or stolen items' to French.",
|
| 2859 |
+
"complexity": 0.33,
|
| 2860 |
+
"tier": "medium",
|
| 2861 |
+
"domain": "translation",
|
| 2862 |
+
"model": "gpt-3.5-turbo"
|
| 2863 |
+
},
|
| 2864 |
+
{
|
| 2865 |
+
"query": "Rewrite this sentence without using the word 'very': 'She was very tired.'",
|
| 2866 |
+
"complexity": 0.09,
|
| 2867 |
+
"tier": "trivial",
|
| 2868 |
+
"domain": "translation",
|
| 2869 |
+
"model": "claude-3-haiku-20240307"
|
| 2870 |
+
},
|
| 2871 |
+
{
|
| 2872 |
+
"query": "Translate 'Your session will expire in 10 minutes' to Russian.",
|
| 2873 |
+
"complexity": 0.17,
|
| 2874 |
+
"tier": "easy",
|
| 2875 |
+
"domain": "translation",
|
| 2876 |
+
"model": "deepseek-chat"
|
| 2877 |
+
},
|
| 2878 |
+
{
|
| 2879 |
+
"query": "Convert this to a rhetorical question: 'You should know better.'",
|
| 2880 |
+
"complexity": 0.11,
|
| 2881 |
+
"tier": "easy",
|
| 2882 |
+
"domain": "translation",
|
| 2883 |
+
"model": "gpt-4o"
|
| 2884 |
+
},
|
| 2885 |
+
{
|
| 2886 |
+
"query": "Translate 'The file size exceeds the limit' to Spanish.",
|
| 2887 |
+
"complexity": 0.14,
|
| 2888 |
+
"tier": "easy",
|
| 2889 |
+
"domain": "translation",
|
| 2890 |
+
"model": "mistral-small-latest"
|
| 2891 |
+
},
|
| 2892 |
+
{
|
| 2893 |
+
"query": "Summarize the plot of 'The Tower of Babel' story in two sentences.",
|
| 2894 |
+
"complexity": 0.25,
|
| 2895 |
+
"tier": "easy",
|
| 2896 |
+
"domain": "summarization",
|
| 2897 |
+
"model": "gemini-1.5-flash"
|
| 2898 |
+
},
|
| 2899 |
+
{
|
| 2900 |
+
"query": "Translate 'Please accept our sincere apologies' to German.",
|
| 2901 |
+
"complexity": 0.16,
|
| 2902 |
+
"tier": "easy",
|
| 2903 |
+
"domain": "translation",
|
| 2904 |
+
"model": "llama3.1:70b"
|
| 2905 |
+
},
|
| 2906 |
+
{
|
| 2907 |
+
"query": "Paraphrase this proverb: 'A bird in the hand is worth two in the bush.'",
|
| 2908 |
+
"complexity": 0.2,
|
| 2909 |
+
"tier": "easy",
|
| 2910 |
+
"domain": "translation",
|
| 2911 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 2912 |
+
},
|
| 2913 |
+
{
|
| 2914 |
+
"query": "Translate 'The system will restart automatically' to Mandarin.",
|
| 2915 |
+
"complexity": 0.18,
|
| 2916 |
+
"tier": "easy",
|
| 2917 |
+
"domain": "translation",
|
| 2918 |
+
"model": "gpt-4o-mini"
|
| 2919 |
+
},
|
| 2920 |
+
{
|
| 2921 |
+
"query": "Fix the subject‑verb agreement: 'The list of items are on the table.'",
|
| 2922 |
+
"complexity": 0.07,
|
| 2923 |
+
"tier": "trivial",
|
| 2924 |
+
"domain": "translation",
|
| 2925 |
+
"model": "llama3.2:3b"
|
| 2926 |
+
},
|
| 2927 |
+
{
|
| 2928 |
+
"query": "Translate 'I've attached the document for your review' to Italian.",
|
| 2929 |
+
"complexity": 0.17,
|
| 2930 |
+
"tier": "easy",
|
| 2931 |
+
"domain": "translation",
|
| 2932 |
+
"model": "gpt-3.5-turbo"
|
| 2933 |
+
},
|
| 2934 |
+
{
|
| 2935 |
+
"query": "Explain the difference between denotation and connotation with examples.",
|
| 2936 |
+
"complexity": 0.29,
|
| 2937 |
+
"tier": "easy",
|
| 2938 |
+
"domain": "factual",
|
| 2939 |
+
"model": "gemini-1.5-pro"
|
| 2940 |
+
},
|
| 2941 |
+
{
|
| 2942 |
+
"query": "Translate 'Your request has been received' to Korean.",
|
| 2943 |
+
"complexity": 0.14,
|
| 2944 |
+
"tier": "easy",
|
| 2945 |
+
"domain": "translation",
|
| 2946 |
+
"model": "mistral-large-latest"
|
| 2947 |
+
},
|
| 2948 |
+
{
|
| 2949 |
+
"query": "Rewrite this sentence using alliteration: 'The dog ran fast.'",
|
| 2950 |
+
"complexity": 0.21,
|
| 2951 |
+
"tier": "easy",
|
| 2952 |
+
"domain": "creative",
|
| 2953 |
+
"model": "deepseek-chat"
|
| 2954 |
+
},
|
| 2955 |
+
{
|
| 2956 |
+
"query": "Translate 'The store is closed on Sundays' to Portuguese.",
|
| 2957 |
+
"complexity": 0.13,
|
| 2958 |
+
"tier": "easy",
|
| 2959 |
+
"domain": "translation",
|
| 2960 |
+
"model": "claude-3-5-haiku-20241022"
|
| 2961 |
+
},
|
| 2962 |
+
{
|
| 2963 |
+
"query": "Change this from singular to plural: 'The child is playing.'",
|
| 2964 |
+
"complexity": 0.04,
|
| 2965 |
+
"tier": "trivial",
|
| 2966 |
+
"domain": "translation",
|
| 2967 |
+
"model": "gpt-4o"
|
| 2968 |
+
},
|
| 2969 |
+
{
|
| 2970 |
+
"query": "Translate 'Please note that prices are subject to change' to French.",
|
| 2971 |
+
"complexity": 0.24,
|
| 2972 |
+
"tier": "easy",
|
| 2973 |
+
"domain": "translation",
|
| 2974 |
+
"model": "llama3.1:8b"
|
| 2975 |
+
},
|
| 2976 |
+
{
|
| 2977 |
+
"query": "Paraphrase this headline: 'Tech Giant Announces Record Profits Amid Layoffs'",
|
| 2978 |
+
"complexity": 0.26,
|
| 2979 |
+
"tier": "easy",
|
| 2980 |
+
"domain": "summarization",
|
| 2981 |
+
"model": "gpt-4o-mini"
|
| 2982 |
+
},
|
| 2983 |
+
{
|
| 2984 |
+
"query": "Translate 'The shipment is delayed due to weather' to German.",
|
| 2985 |
+
"complexity": 0.19,
|
| 2986 |
+
"tier": "easy",
|
| 2987 |
+
"domain": "translation",
|
| 2988 |
+
"model": "mistral-small-latest"
|
| 2989 |
+
},
|
| 2990 |
+
{
|
| 2991 |
+
"query": "Write a sentence that is grammatically correct but semantically nonsensical.",
|
| 2992 |
+
"complexity": 0.23,
|
| 2993 |
+
"tier": "easy",
|
| 2994 |
+
"domain": "creative",
|
| 2995 |
+
"model": "claude-3-haiku-20240307"
|
| 2996 |
+
},
|
| 2997 |
+
{
|
| 2998 |
+
"query": "Translate 'Your feedback is valuable to us' to Japanese.",
|
| 2999 |
+
"complexity": 0.16,
|
| 3000 |
+
"tier": "easy",
|
| 3001 |
+
"domain": "translation",
|
| 3002 |
+
"model": "gemini-1.5-flash"
|
| 3003 |
+
},
|
| 3004 |
+
{
|
| 3005 |
+
"query": "Explain what a 'double negative' is and why it's often avoided in standard English.",
|
| 3006 |
+
"complexity": 0.2,
|
| 3007 |
+
"tier": "easy",
|
| 3008 |
+
"domain": "factual",
|
| 3009 |
+
"model": "deepseek-chat"
|
| 3010 |
+
},
|
| 3011 |
+
{
|
| 3012 |
+
"query": "Translate 'The server is experiencing high load' to Arabic.",
|
| 3013 |
+
"complexity": 0.21,
|
| 3014 |
+
"tier": "easy",
|
| 3015 |
+
"domain": "translation",
|
| 3016 |
+
"model": "gpt-4o"
|
| 3017 |
+
},
|
| 3018 |
+
{
|
| 3019 |
+
"query": "Rewrite this sentence as a conditional: 'You didn't water the plant, so it died.'",
|
| 3020 |
+
"complexity": 0.14,
|
| 3021 |
+
"tier": "easy",
|
| 3022 |
+
"domain": "translation",
|
| 3023 |
+
"model": "llama3.1:70b"
|
| 3024 |
+
},
|
| 3025 |
+
{
|
| 3026 |
+
"query": "Translate 'Congratulations on your promotion' to Spanish.",
|
| 3027 |
+
"complexity": 0.11,
|
| 3028 |
+
"tier": "easy",
|
| 3029 |
+
"domain": "translation",
|
| 3030 |
+
"model": "claude-3-5-sonnet-20241022"
|
| 3031 |
+
},
|
| 3032 |
+
{
|
| 3033 |
+
"query": "Summarize the concept of 'code‑switching' in linguistics in one sentence.",
|
| 3034 |
+
"complexity": 0.28,
|
| 3035 |
+
"tier": "easy",
|
| 3036 |
+
"domain": "science",
|
| 3037 |
+
"model": "gpt-4o-mini"
|
| 3038 |
+
},
|
| 3039 |
+
{
|
| 3040 |
+
"query": "Translate 'Please do not disturb' to Italian.",
|
| 3041 |
+
"complexity": 0.09,
|
| 3042 |
+
"tier": "trivial",
|
| 3043 |
+
"domain": "translation",
|
| 3044 |
+
"model": "gemini-1.5-pro"
|
| 3045 |
+
},
|
| 3046 |
+
{
|
| 3047 |
+
"query": "Convert this sentence to use the subjunctive mood: 'I wish I was there.' (correction)",
|
| 3048 |
+
"complexity": 0.15,
|
| 3049 |
+
"tier": "easy",
|
| 3050 |
+
"domain": "translation",
|
| 3051 |
+
"model": "mistral-large-latest"
|
| 3052 |
+
}
|
| 3053 |
+
]
|
llmopt/analyzer/query_analyzer.py
CHANGED
|
@@ -9,9 +9,12 @@ implementing the same QueryFeatures interface.
|
|
| 9 |
from __future__ import annotations
|
| 10 |
|
| 11 |
import re
|
|
|
|
| 12 |
from dataclasses import dataclass, field
|
| 13 |
from typing import Optional
|
| 14 |
|
|
|
|
|
|
|
| 15 |
|
| 16 |
# ---------------------------------------------------------------------------
|
| 17 |
# Feature schema
|
|
@@ -55,7 +58,8 @@ class QueryFeatures:
|
|
| 55 |
|
| 56 |
# Derived
|
| 57 |
primary_domain: str = "general"
|
| 58 |
-
domain_scores: dict = field(default_factory=dict)
|
|
|
|
| 59 |
|
| 60 |
def to_dict(self) -> dict:
|
| 61 |
return self.__dict__.copy()
|
|
@@ -147,11 +151,29 @@ class QueryAnalyzer:
|
|
| 147 |
"""
|
| 148 |
Extracts semantic + structural features from a raw query string.
|
| 149 |
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
- The returned `QueryFeatures` dataclass is the stable interface.
|
| 153 |
"""
|
| 154 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
def analyze(self, query: str) -> QueryFeatures:
|
| 156 |
q = query.strip()
|
| 157 |
ql = q.lower()
|
|
@@ -185,30 +207,51 @@ class QueryAnalyzer:
|
|
| 185 |
# ------------------------------------------------------------------
|
| 186 |
|
| 187 |
def _domain_features(self, ql: str, f: QueryFeatures) -> None:
|
| 188 |
-
scores: dict[str, float] = {
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
f.domain_scores = scores
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
f.
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
|
| 213 |
def _keyword_score(self, ql: str, keywords: set) -> float:
|
| 214 |
"""Fraction of keywords found; capped to avoid over-scoring long queries."""
|
|
@@ -278,4 +321,4 @@ class QueryAnalyzer:
|
|
| 278 |
if not scores or max(scores.values()) == 0:
|
| 279 |
f.primary_domain = "general"
|
| 280 |
return
|
| 281 |
-
f.primary_domain = max(scores, key=scores
|
|
|
|
| 9 |
from __future__ import annotations
|
| 10 |
|
| 11 |
import re
|
| 12 |
+
import logging
|
| 13 |
from dataclasses import dataclass, field
|
| 14 |
from typing import Optional
|
| 15 |
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
|
| 19 |
# ---------------------------------------------------------------------------
|
| 20 |
# Feature schema
|
|
|
|
| 58 |
|
| 59 |
# Derived
|
| 60 |
primary_domain: str = "general"
|
| 61 |
+
domain_scores: dict[str, float] = field(default_factory=dict)
|
| 62 |
+
_expert_signal: bool = False
|
| 63 |
|
| 64 |
def to_dict(self) -> dict:
|
| 65 |
return self.__dict__.copy()
|
|
|
|
| 151 |
"""
|
| 152 |
Extracts semantic + structural features from a raw query string.
|
| 153 |
|
| 154 |
+
V2: Uses HuggingFace zero-shot classification if transformers is installed.
|
| 155 |
+
Falls back to V1 heuristics if not available.
|
|
|
|
| 156 |
"""
|
| 157 |
|
| 158 |
+
def __init__(self):
    """
    Set up the analyzer and try to load the optional ML domain classifier.

    ``ml_classifier`` stays ``None`` when ``transformers`` is missing or the
    model cannot be loaded; the analyzer then uses the V1 keyword heuristics.
    """
    # Candidate labels handed to the zero-shot classifier.
    self.ml_labels = [
        "code",
        "math",
        "science",
        "creative",
        "reasoning",
        "summarization",
        "translation",
        "factual",
    ]
    self.ml_classifier = None

    # Keyword arguments for the HuggingFace pipeline factory.
    # NOTE(review): device=-1 is presumably "run on CPU" — confirm against
    # the transformers pipeline documentation.
    pipeline_kwargs = {
        "model": "cross-encoder/nli-distilroberta-base",
        "device": -1,
    }

    try:
        # Imported lazily so that transformers remains an optional dependency.
        from transformers import pipeline  # type: ignore
        logger.info("Loading ML Zero-Shot Classifier for Query Analyzer...")
        self.ml_classifier = pipeline("zero-shot-classification", **pipeline_kwargs)
    except ImportError:
        logger.info("transformers not found, using V1 heuristic Query Analyzer.")
    except Exception as e:
        # Best effort: any load failure degrades to the heuristic analyzer.
        logger.warning(f"Failed to load ML classifier: {e}. Falling back to V1.")
|
| 176 |
+
|
| 177 |
def analyze(self, query: str) -> QueryFeatures:
|
| 178 |
q = query.strip()
|
| 179 |
ql = q.lower()
|
|
|
|
| 207 |
# ------------------------------------------------------------------
|
| 208 |
|
| 209 |
def _domain_features(self, ql: str, f: QueryFeatures) -> None:
    """
    Score the query per domain and set the ``domain_*`` flags on ``f``.

    Scores come from the zero-shot classifier when it is loaded and inference
    succeeds; otherwise they come from the V1 keyword heuristics. The flag
    thresholds follow the scorer that actually produced the scores, so a
    failed ML inference no longer applies the ML cutoff to keyword scores.

    Args:
        ql: Lower-cased query text, used by the keyword heuristics.
        f: Feature object to update in place. Reads ``raw_query``,
            ``has_math_notation``, ``has_question_mark`` and ``token_count``;
            writes ``domain_scores`` and every ``domain_*`` flag.
    """
    ml_cutoff = 0.4  # minimum classifier score for a domain to count
    scores: dict[str, float] = {}

    if self.ml_classifier:
        try:
            result = self.ml_classifier(f.raw_query, self.ml_labels, multi_label=True)
            scores = dict(zip(result["labels"], result["scores"]))
        except Exception as e:
            logger.warning(f"ML inference failed: {e}. Falling back to V1.")
            scores = {}

    # True only when the classifier really produced the scores below.
    used_ml = bool(scores)

    if not used_ml:
        scores = {
            "code": self._keyword_score(ql, _CODE_KEYWORDS),
            "math": self._keyword_score(ql, _MATH_KEYWORDS),
            "science": self._keyword_score(ql, _SCIENCE_KEYWORDS),
            "creative": self._keyword_score(ql, _CREATIVE_KEYWORDS),
            "reasoning": self._keyword_score(ql, _REASONING_KEYWORDS),
            "summarization": self._keyword_score(ql, _SUMMARIZATION_KEYWORDS),
            "translation": self._keyword_score(ql, _TRANSLATION_KEYWORDS),
        }

    f.domain_scores = scores

    # Heuristic scores count as soon as any keyword matched (> 0); classifier
    # scores must clear the ML cutoff.
    cutoff = ml_cutoff if used_ml else 0
    f.domain_code = scores.get("code", 0) > cutoff
    f.domain_math = scores.get("math", 0) > cutoff or f.has_math_notation
    f.domain_science = scores.get("science", 0) > cutoff
    f.domain_creative = scores.get("creative", 0) > cutoff
    f.domain_reasoning = scores.get("reasoning", 0) > cutoff
    f.domain_summarization = scores.get("summarization", 0) > cutoff
    f.domain_translation = scores.get("translation", 0) > cutoff

    if used_ml:
        f.domain_factual = scores.get("factual", 0) > cutoff
    else:
        # The heuristics have no "factual" keyword set: treat a short
        # question with weak signal in every other domain as factual.
        f.domain_factual = (
            f.has_question_mark
            and sum(scores.values()) < 0.5
            and f.token_count < 25
        )
|
| 255 |
|
| 256 |
def _keyword_score(self, ql: str, keywords: set) -> float:
|
| 257 |
"""Fraction of keywords found; capped to avoid over-scoring long queries."""
|
|
|
|
| 321 |
if not scores or max(scores.values()) == 0:
|
| 322 |
f.primary_domain = "general"
|
| 323 |
return
|
| 324 |
+
f.primary_domain = max(scores.keys(), key=lambda k: scores[k])
|
llmopt/cache/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Init file
|
llmopt/cache/semantic_cache.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import json
|
| 3 |
+
import hashlib
|
| 4 |
+
from typing import Optional, Any
|
| 5 |
+
|
| 6 |
+
logger = logging.getLogger(__name__)


class SemanticCache:
    """
    Semantic Cache powered by Redis and sentence-transformers.

    Each entry stores the original query, its embedding and the LLM response
    as one JSON string under a ``llmopt:cache:<md5>`` key. A lookup embeds the
    incoming query and returns the response of the most similar cached query
    when the cosine similarity reaches ``similarity_threshold``.

    Recommended redis.conf / Redis server settings:
        maxmemory 240mb
        maxmemory-policy allkeys-lfu
        lfu-decay-time 5
        lfu-log-factor 10

    Automatically disables itself when Redis or ML dependencies are unavailable.
    """

    # Namespace shared by every key this cache writes.
    _KEY_PREFIX = "llmopt:cache:"
    # Hint for how many keys Redis should examine per SCAN iteration.
    _SCAN_BATCH = 500

    def __init__(self, redis_url: Optional[str] = None, similarity_threshold: float = 0.95):
        """
        Connect to Redis and load the embedding model.

        Args:
            redis_url: Redis connection URL. When falsy the cache is disabled.
            similarity_threshold: Minimum cosine similarity for a cache hit.

        Never raises for missing dependencies or connection failures; in those
        cases ``enabled`` stays False and ``get``/``set`` become no-ops.
        """
        self.enabled = False
        self.similarity_threshold = similarity_threshold
        self.redis: Any = None
        self.model: Any = None
        self.cosine_similarity: Any = None
        self.np: Any = None

        if not redis_url:
            logger.info("SemanticCache: No Redis URL provided. Cache disabled.")
            return

        # Try connecting to Redis
        try:
            import redis  # type: ignore
            self.redis = redis.Redis.from_url(redis_url, decode_responses=True)
            self.redis.ping()
        except ImportError:
            logger.warning("SemanticCache: 'redis' package not installed. Cache disabled.")
            return
        except Exception as e:
            # The URL may embed credentials, so never log it verbatim.
            logger.warning(
                f"SemanticCache: Failed to connect to Redis at {self._redact_url(redis_url)}: {e}"
            )
            self.redis = None
            return

        # Try loading sentence-transformers + sklearn
        try:
            from sentence_transformers import SentenceTransformer  # type: ignore
            import numpy as np  # type: ignore
            from sklearn.metrics.pairwise import cosine_similarity

            self.cosine_similarity = cosine_similarity
            self.np = np
            logger.info("SemanticCache: Loading embedding model (all-MiniLM-L6-v2)...")
            self.model = SentenceTransformer("all-MiniLM-L6-v2")
            self.enabled = True
            logger.info("SemanticCache: Successfully initialized and connected to Redis!")
        except ImportError:
            logger.warning(
                "SemanticCache: 'sentence-transformers' or 'scikit-learn' not installed. Cache disabled."
            )
            self.redis = None
        except Exception as e:
            logger.warning(f"SemanticCache: Failed to load ML models: {e}")
            self.redis = None

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _redact_url(url: str) -> str:
        """Return ``url`` with any ``user:password@`` part replaced by ``***@``."""
        from urllib.parse import urlsplit, urlunsplit

        try:
            parts = urlsplit(url)
        except ValueError:
            return "<unparseable redis url>"
        if "@" not in parts.netloc:
            return url
        host = parts.netloc.rsplit("@", 1)[1]
        return urlunsplit(parts._replace(netloc=f"***@{host}"))

    def _is_within_memory_limit(self, safety_ratio: float = 0.90) -> bool:
        """
        Returns False when Redis has consumed >= safety_ratio of its maxmemory.
        Prevents new writes from pushing Redis over its configured maxmemory.
        Returns True when no maxmemory is configured, and fails open (returns
        True) if the info call itself errors.
        """
        try:
            info = self.redis.info("memory")
            used = info["used_memory"]
            max_mem = info.get("maxmemory", 0)
            if max_mem == 0:
                # No maxmemory configured — rely solely on the eviction policy.
                return True
            within = (used / max_mem) < safety_ratio
            if not within:
                logger.warning(
                    f"SemanticCache: Memory at {used / max_mem:.1%} of limit "
                    f"({used / 1_048_576:.1f} MB / {max_mem / 1_048_576:.1f} MB). "
                    "Skipping write."
                )
            return within
        except Exception as e:
            logger.warning(f"SemanticCache: Memory check failed (failing open): {e}")
            return True

    @staticmethod
    def _cache_key(query: str) -> str:
        """Stable, cross-process MD5 key for a query string."""
        query_hash = hashlib.md5(query.encode("utf-8")).hexdigest()
        return f"llmopt:cache:{query_hash}"

    @staticmethod
    def _ttl_for_response(response: str) -> int:
        """
        Longer, richer responses get a longer TTL — they are more expensive to
        regenerate and therefore more valuable to keep around.

          > 500 chars → 7 days  (604 800 s)
          ≤ 500 chars → 3 days  (259 200 s)
        """
        return 604_800 if len(response) > 500 else 259_200

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def get(self, query: str) -> Optional[str]:
        """
        Return the cached LLM response for a semantically similar query, or
        None on a cache miss.

        Keys are enumerated with incremental SCAN (never the blocking KEYS
        command) and their values fetched through one pipeline. The lookup
        still compares against every cached entry, so its cost grows linearly
        with the cache size. Corrupt or incompatible entries are skipped, and
        any Redis or model error is logged and reported as a miss.
        """
        if not self.enabled:
            return None

        try:
            query_embedding = self.model.encode([query])[0]

            keys = list(
                self.redis.scan_iter(match=f"{self._KEY_PREFIX}*", count=self._SCAN_BATCH)
            )
            if not keys:
                return None

            # Batch-fetch all entries in one round trip
            pipe = self.redis.pipeline()
            for key in keys:
                pipe.get(key)
            payloads = pipe.execute()

            expected_dim = len(query_embedding)
            responses = []
            embeddings = []
            for payload in payloads:
                if not payload:
                    # Key expired or was evicted between SCAN and GET.
                    continue
                try:
                    entry = json.loads(payload)
                    embedding = entry["embedding"]
                    response = entry["response"]
                    compatible = len(embedding) == expected_dim
                except (ValueError, KeyError, TypeError):
                    # One bad entry must not poison the whole lookup.
                    continue
                if compatible:
                    embeddings.append(embedding)
                    responses.append(response)

            if not responses:
                return None

            # One vectorised similarity call against every cached embedding.
            sims = self.cosine_similarity([query_embedding], self.np.array(embeddings))[0]
            best = int(self.np.argmax(sims))
            highest_sim = float(sims[best])

            if highest_sim >= self.similarity_threshold:
                logger.info(f"SemanticCache HIT! Similarity: {highest_sim:.3f}")
                # Reuse the payload already fetched; a second GET could race
                # with expiry or eviction of the winning key.
                return responses[best]

        except Exception as e:
            logger.warning(f"SemanticCache GET error: {e}")

        return None

    def set(self, query: str, response: str) -> None:
        """
        Embed and store a query/response pair.

        Skips the write when Redis is near its memory ceiling so that the
        eviction policy never has to evict a hot entry just to absorb a
        brand-new one. Errors are logged and swallowed.
        """
        if not self.enabled:
            return

        # Guard: don't write when we are close to the configured maxmemory
        if not self._is_within_memory_limit(safety_ratio=0.90):
            return

        try:
            query_embedding = self.model.encode([query])[0]
            key = self._cache_key(query)
            ttl = self._ttl_for_response(response)

            data = {
                "query": query,
                "embedding": query_embedding.tolist(),
                "response": response,
            }

            # Single SET with EX: value and expiry are written atomically.
            self.redis.set(key, json.dumps(data), ex=ttl)

            logger.debug(
                f"SemanticCache SET: key={key} ttl={ttl}s "
                f"response_len={len(response)}"
            )

        except Exception as e:
            logger.warning(f"SemanticCache SET error: {e}")
|
llmopt/core.py
CHANGED
|
@@ -15,6 +15,7 @@ from __future__ import annotations
|
|
| 15 |
|
| 16 |
import logging
|
| 17 |
import time
|
|
|
|
| 18 |
from dataclasses import dataclass
|
| 19 |
from pathlib import Path
|
| 20 |
from typing import Optional
|
|
@@ -25,6 +26,9 @@ from llmopt.engine.optimization_engine import OptimizationEngine, OptimizationRe
|
|
| 25 |
from llmopt.optimizer.prompt_optimizer import PromptOptimizer, OptimizedPrompt
|
| 26 |
from llmopt.router.model_router import ModelRouter, RoutedResponse
|
| 27 |
from llmopt.registry.model_registry import ModelRegistry
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
logger = logging.getLogger(__name__)
|
| 30 |
|
|
@@ -57,6 +61,7 @@ class GenerateResult:
|
|
| 57 |
optimization: OptimizationResult
|
| 58 |
optimized_prompt: OptimizedPrompt
|
| 59 |
latency_ms: float
|
|
|
|
| 60 |
|
| 61 |
def explain(self) -> str:
|
| 62 |
"""Human-readable explanation of routing decisions."""
|
|
@@ -140,6 +145,20 @@ class LLMOpt:
|
|
| 140 |
self.engine = OptimizationEngine(self.registry)
|
| 141 |
self.optimizer = PromptOptimizer()
|
| 142 |
self.router = ModelRouter(ollama_base_url=ollama_base_url)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
|
| 144 |
# ------------------------------------------------------------------
|
| 145 |
# Primary API
|
|
@@ -157,6 +176,7 @@ class LLMOpt:
|
|
| 157 |
conversation_history: Optional[list[dict]] = None,
|
| 158 |
temperature: float = 0.7,
|
| 159 |
dry_run: bool = False,
|
|
|
|
| 160 |
) -> GenerateResult:
|
| 161 |
"""
|
| 162 |
Full pipeline: analyze → estimate → optimize → compress → route → return.
|
|
@@ -184,6 +204,50 @@ class LLMOpt:
|
|
| 184 |
complexity = self.estimator.estimate(features)
|
| 185 |
logger.debug(f"Complexity: {complexity.score:.3f} ({complexity.tier})")
|
| 186 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
# 3. Build constraints
|
| 188 |
constraints = UserConstraints(
|
| 189 |
budget_mode=budget_mode,
|
|
@@ -229,9 +293,12 @@ class LLMOpt:
|
|
| 229 |
input_cost_per_1k=model_spec.input_cost_per_1k,
|
| 230 |
output_cost_per_1k=model_spec.output_cost_per_1k,
|
| 231 |
)
|
| 232 |
-
|
| 233 |
latency_ms = (time.perf_counter() - t0) * 1000
|
| 234 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
# 8. Compute savings vs baseline
|
| 236 |
baseline_cost = (
|
| 237 |
self._BASELINE_INPUT_COST * routed.input_tokens / 1000
|
|
@@ -239,6 +306,19 @@ class LLMOpt:
|
|
| 239 |
)
|
| 240 |
cost_saved = max(0.0, baseline_cost - routed.estimated_cost)
|
| 241 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
return GenerateResult(
|
| 243 |
response=routed.content,
|
| 244 |
model_used=routed.model_used,
|
|
@@ -255,6 +335,7 @@ class LLMOpt:
|
|
| 255 |
optimization=optimization,
|
| 256 |
optimized_prompt=optimized_prompt,
|
| 257 |
latency_ms=round(latency_ms, 1),
|
|
|
|
| 258 |
)
|
| 259 |
|
| 260 |
# ------------------------------------------------------------------
|
|
|
|
| 15 |
|
| 16 |
import logging
|
| 17 |
import time
|
| 18 |
+
import os
|
| 19 |
from dataclasses import dataclass
|
| 20 |
from pathlib import Path
|
| 21 |
from typing import Optional
|
|
|
|
| 26 |
from llmopt.optimizer.prompt_optimizer import PromptOptimizer, OptimizedPrompt
|
| 27 |
from llmopt.router.model_router import ModelRouter, RoutedResponse
|
| 28 |
from llmopt.registry.model_registry import ModelRegistry
|
| 29 |
+
from llmopt.cache.semantic_cache import SemanticCache
|
| 30 |
+
from llmopt.evaluation.evaluator import LLMJudge, EvaluationResult
|
| 31 |
+
import os
|
| 32 |
|
| 33 |
logger = logging.getLogger(__name__)
|
| 34 |
|
|
|
|
| 61 |
optimization: OptimizationResult
|
| 62 |
optimized_prompt: OptimizedPrompt
|
| 63 |
latency_ms: float
|
| 64 |
+
evaluation: Optional[EvaluationResult] = None
|
| 65 |
|
| 66 |
def explain(self) -> str:
|
| 67 |
"""Human-readable explanation of routing decisions."""
|
|
|
|
| 145 |
self.engine = OptimizationEngine(self.registry)
|
| 146 |
self.optimizer = PromptOptimizer()
|
| 147 |
self.router = ModelRouter(ollama_base_url=ollama_base_url)
|
| 148 |
+
|
| 149 |
+
# Initialize Semantic Cache (reads REDIS_URL from env if available)
|
| 150 |
+
# Using python-dotenv to ensure .env is loaded
|
| 151 |
+
try:
|
| 152 |
+
from dotenv import load_dotenv # type: ignore
|
| 153 |
+
# Attempt to load from both the root and config/.env
|
| 154 |
+
load_dotenv()
|
| 155 |
+
load_dotenv("config/.env")
|
| 156 |
+
except ImportError:
|
| 157 |
+
pass
|
| 158 |
+
|
| 159 |
+
redis_url = os.environ.get("REDIS_URL")
|
| 160 |
+
self.cache = SemanticCache(redis_url=redis_url)
|
| 161 |
+
self.judge = LLMJudge(judge_model="gpt-4o-mini")
|
| 162 |
|
| 163 |
# ------------------------------------------------------------------
|
| 164 |
# Primary API
|
|
|
|
| 176 |
conversation_history: Optional[list[dict]] = None,
|
| 177 |
temperature: float = 0.7,
|
| 178 |
dry_run: bool = False,
|
| 179 |
+
evaluate: bool = False,
|
| 180 |
) -> GenerateResult:
|
| 181 |
"""
|
| 182 |
Full pipeline: analyze → estimate → optimize → compress → route → return.
|
|
|
|
| 204 |
complexity = self.estimator.estimate(features)
|
| 205 |
logger.debug(f"Complexity: {complexity.score:.3f} ({complexity.tier})")
|
| 206 |
|
| 207 |
+
# 2.5 Check Semantic Cache
|
| 208 |
+
if not dry_run and not conversation_history:
|
| 209 |
+
cached_response = self.cache.get(query)
|
| 210 |
+
if cached_response:
|
| 211 |
+
latency_ms = (time.perf_counter() - t0) * 1000
|
| 212 |
+
logger.info("Returning cached response directly.")
|
| 213 |
+
|
| 214 |
+
constraints = UserConstraints(budget_mode=budget_mode)
|
| 215 |
+
optimization = self.engine.optimize(
|
| 216 |
+
complexity=complexity,
|
| 217 |
+
output_length_bucket=features.estimated_output_length,
|
| 218 |
+
constraints=constraints,
|
| 219 |
+
)
|
| 220 |
+
optimized_prompt = self.optimizer.optimize(
|
| 221 |
+
query=query,
|
| 222 |
+
system_prompt_style=optimization.system_prompt_style,
|
| 223 |
+
compression_enabled=optimization.compression_enabled,
|
| 224 |
+
conversation_history=conversation_history,
|
| 225 |
+
)
|
| 226 |
+
|
| 227 |
+
# Baseline cost for metrics calculation
|
| 228 |
+
baseline_cost = (
|
| 229 |
+
self._BASELINE_INPUT_COST * optimization.estimated_input_tokens / 1000
|
| 230 |
+
+ self._BASELINE_OUTPUT_COST * optimization.estimated_output_tokens / 1000
|
| 231 |
+
)
|
| 232 |
+
|
| 233 |
+
return GenerateResult(
|
| 234 |
+
response=cached_response,
|
| 235 |
+
model_used="redis-semantic-cache",
|
| 236 |
+
provider="cache",
|
| 237 |
+
input_tokens=0,
|
| 238 |
+
output_tokens=0,
|
| 239 |
+
total_tokens=0,
|
| 240 |
+
estimated_cost=0.0,
|
| 241 |
+
tokens_saved=optimized_prompt.tokens_saved,
|
| 242 |
+
cost_saved=round(baseline_cost, 6),
|
| 243 |
+
compression_ratio=optimized_prompt.compression_ratio,
|
| 244 |
+
query_features=features,
|
| 245 |
+
complexity=complexity,
|
| 246 |
+
optimization=optimization,
|
| 247 |
+
optimized_prompt=optimized_prompt,
|
| 248 |
+
latency_ms=round(latency_ms, 1),
|
| 249 |
+
)
|
| 250 |
+
|
| 251 |
# 3. Build constraints
|
| 252 |
constraints = UserConstraints(
|
| 253 |
budget_mode=budget_mode,
|
|
|
|
| 293 |
input_cost_per_1k=model_spec.input_cost_per_1k,
|
| 294 |
output_cost_per_1k=model_spec.output_cost_per_1k,
|
| 295 |
)
|
|
|
|
| 296 |
latency_ms = (time.perf_counter() - t0) * 1000
|
| 297 |
|
| 298 |
+
# Save to cache
|
| 299 |
+
if not dry_run and not conversation_history:
|
| 300 |
+
self.cache.set(query, routed.content)
|
| 301 |
+
|
| 302 |
# 8. Compute savings vs baseline
|
| 303 |
baseline_cost = (
|
| 304 |
self._BASELINE_INPUT_COST * routed.input_tokens / 1000
|
|
|
|
| 306 |
)
|
| 307 |
cost_saved = max(0.0, baseline_cost - routed.estimated_cost)
|
| 308 |
|
| 309 |
+
# 9. Evaluate (if requested) and feed Bayesian optimizer
|
| 310 |
+
evaluation = None
|
| 311 |
+
if evaluate and not dry_run:
|
| 312 |
+
evaluation = self.judge.evaluate(query, routed.content)
|
| 313 |
+
if evaluation:
|
| 314 |
+
α, β, γ = self.engine.bayes.get_weights(constraints.budget_mode)
|
| 315 |
+
self.engine.bayes.record_outcome(
|
| 316 |
+
budget_mode=constraints.budget_mode,
|
| 317 |
+
alpha=α, beta=β, gamma=γ,
|
| 318 |
+
actual_cost=routed.estimated_cost,
|
| 319 |
+
quality_score=evaluation.overall,
|
| 320 |
+
)
|
| 321 |
+
|
| 322 |
return GenerateResult(
|
| 323 |
response=routed.content,
|
| 324 |
model_used=routed.model_used,
|
|
|
|
| 335 |
optimization=optimization,
|
| 336 |
optimized_prompt=optimized_prompt,
|
| 337 |
latency_ms=round(latency_ms, 1),
|
| 338 |
+
evaluation=evaluation,
|
| 339 |
)
|
| 340 |
|
| 341 |
# ------------------------------------------------------------------
|
llmopt/engine/optimization_engine.py
CHANGED
|
@@ -9,18 +9,27 @@ Subject to:
|
|
| 9 |
Quality >= threshold
|
| 10 |
Latency <= max_latency
|
| 11 |
|
| 12 |
-
V1: deterministic rule engine with
|
| 13 |
-
V2
|
|
|
|
| 14 |
"""
|
| 15 |
|
| 16 |
from __future__ import annotations
|
| 17 |
|
|
|
|
|
|
|
| 18 |
from dataclasses import dataclass
|
|
|
|
| 19 |
from typing import Optional
|
| 20 |
|
| 21 |
from llmopt.registry.model_registry import ModelRegistry, ModelSpec
|
| 22 |
from llmopt.estimator.complexity_estimator import ComplexityResult
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
# ---------------------------------------------------------------------------
|
| 26 |
# User constraints schema
|
|
@@ -35,8 +44,8 @@ class UserConstraints:
|
|
| 35 |
max_cost_per_request: Optional[float] = None # USD hard cap
|
| 36 |
max_latency_score: Optional[float] = None # lower = faster model
|
| 37 |
quality_threshold: float = 0.60 # min acceptable quality proxy
|
| 38 |
-
exclude_providers: list[str] = None
|
| 39 |
-
only_providers: list[str] = None
|
| 40 |
prefer_local: bool = False # prefer Ollama models
|
| 41 |
compression_enabled: Optional[bool] = None # None = auto-decide
|
| 42 |
|
|
@@ -89,6 +98,127 @@ _MAX_TOKENS_MAP = {
|
|
| 89 |
}
|
| 90 |
|
| 91 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
# ---------------------------------------------------------------------------
|
| 93 |
# Engine
|
| 94 |
# ---------------------------------------------------------------------------
|
|
@@ -97,10 +227,13 @@ class OptimizationEngine:
|
|
| 97 |
"""
|
| 98 |
Core decision engine. Selects model + config that minimizes
|
| 99 |
J(x) = α·Cost + β·Tokens - γ·Quality under user constraints.
|
|
|
|
|
|
|
| 100 |
"""
|
| 101 |
|
| 102 |
def __init__(self, registry: ModelRegistry):
|
| 103 |
self.registry = registry
|
|
|
|
| 104 |
|
| 105 |
def optimize(
|
| 106 |
self,
|
|
@@ -111,7 +244,8 @@ class OptimizationEngine:
|
|
| 111 |
if constraints is None:
|
| 112 |
constraints = UserConstraints()
|
| 113 |
|
| 114 |
-
α, β, γ =
|
|
|
|
| 115 |
|
| 116 |
# --- 1. Build candidate set ---
|
| 117 |
candidates = self.registry.capable_of(
|
|
|
|
| 9 |
Quality >= threshold
|
| 10 |
Latency <= max_latency
|
| 11 |
|
| 12 |
+
V1: deterministic rule engine with fixed per-budget-mode weights.
|
| 13 |
+
V2: BayesianWeightOptimizer (Optuna) learns α,β,γ from past outcomes.
|
| 14 |
+
Falls back to V1 fixed weights if optuna is not installed.
|
| 15 |
"""
|
| 16 |
|
| 17 |
from __future__ import annotations
|
| 18 |
|
| 19 |
+
import json
|
| 20 |
+
import logging
|
| 21 |
from dataclasses import dataclass
|
| 22 |
+
from pathlib import Path
|
| 23 |
from typing import Optional
|
| 24 |
|
| 25 |
from llmopt.registry.model_registry import ModelRegistry, ModelSpec
|
| 26 |
from llmopt.estimator.complexity_estimator import ComplexityResult
|
| 27 |
|
| 28 |
+
logger = logging.getLogger(__name__)
|
| 29 |
+
|
| 30 |
+
DATA_DIR = Path(__file__).parent.parent.parent / "data"
|
| 31 |
+
BAYES_STUDY_PATH = DATA_DIR / "bayesian_study.json"
|
| 32 |
+
|
| 33 |
|
| 34 |
# ---------------------------------------------------------------------------
|
| 35 |
# User constraints schema
|
|
|
|
| 44 |
max_cost_per_request: Optional[float] = None # USD hard cap
|
| 45 |
max_latency_score: Optional[float] = None # lower = faster model
|
| 46 |
quality_threshold: float = 0.60 # min acceptable quality proxy
|
| 47 |
+
exclude_providers: Optional[list[str]] = None # e.g. ["ollama"] for cloud-only
|
| 48 |
+
only_providers: Optional[list[str]] = None # e.g. ["openai"]
|
| 49 |
prefer_local: bool = False # prefer Ollama models
|
| 50 |
compression_enabled: Optional[bool] = None # None = auto-decide
|
| 51 |
|
|
|
|
| 98 |
}
|
| 99 |
|
| 100 |
|
| 101 |
+
# ---------------------------------------------------------------------------
|
| 102 |
+
# Bayesian Weight Optimizer (V2)
|
| 103 |
+
# ---------------------------------------------------------------------------
|
| 104 |
+
|
| 105 |
+
class BayesianWeightOptimizer:
|
| 106 |
+
"""
|
| 107 |
+
Uses Optuna to find optimal α,β,γ weights for J(x) based on
|
| 108 |
+
accumulated feedback (cost vs quality trade-offs from past requests).
|
| 109 |
+
|
| 110 |
+
Falls back gracefully to fixed V1 weights if optuna is not installed.
|
| 111 |
+
"""
|
| 112 |
+
|
| 113 |
+
def __init__(self):
    """
    Detect whether Optuna is importable and prepare the per-mode study cache.

    Sets ``_optuna_available`` to True only when ``optuna`` imports
    successfully; otherwise the optimizer serves the V1 fixed weights.
    """
    # One Optuna study per budget_mode, filled in lazily by get_weights().
    self._studies: dict = {}

    optuna_found = False
    try:
        import optuna  # type: ignore
        # Keep Optuna's own log output down to warnings and above.
        optuna.logging.set_verbosity(optuna.logging.WARNING)
        optuna_found = True
        logger.info("BayesianWeightOptimizer: Optuna available. Using Bayesian weight tuning.")
    except ImportError:
        logger.info("BayesianWeightOptimizer: Optuna not installed. Using V1 fixed weights.")

    self._optuna_available = optuna_found
|
| 123 |
+
|
| 124 |
+
def get_weights(self, budget_mode: str) -> tuple[float, float, float]:
    """
    Returns (α, β, γ) weights for the given budget mode.

    Uses the best parameters of an Optuna study seeded from the recorded
    history when Optuna is available and at least 5 outcomes exist for the
    mode. Otherwise returns the V1 fixed weights for ``budget_mode``, or the
    "balanced" weights if the mode is unknown.

    The study is cached per budget mode only after it has been seeded
    completely. A malformed history record therefore cannot leave a
    half-seeded study behind for later calls to reuse.

    Args:
        budget_mode: Key into the fixed weight table and the history store.

    Returns:
        The (alpha, beta, gamma) weights to use in J(x).
    """
    fallback = _BUDGET_WEIGHTS.get(budget_mode, _BUDGET_WEIGHTS["balanced"])

    if not self._optuna_available:
        return fallback

    # Load saved trials
    history = self._load_history(budget_mode)
    if len(history) < 5:
        # Not enough data yet — use V1 defaults but still warm up
        logger.debug(f"Bayesian: Only {len(history)} trials for '{budget_mode}', using V1 defaults.")
        return fallback

    try:
        import optuna  # type: ignore

        study = self._studies.get(budget_mode)
        if study is None:
            # Search space the historical parameters are validated against.
            distributions = {
                "alpha": optuna.distributions.FloatDistribution(0.05, 0.90),
                "beta": optuna.distributions.FloatDistribution(0.05, 0.60),
                "gamma": optuna.distributions.FloatDistribution(0.05, 0.90),
            }
            study = optuna.create_study(direction="minimize")
            # Seed with historical trials
            for trial_data in history:
                study.add_trial(
                    optuna.trial.create_trial(
                        params={
                            "alpha": trial_data["alpha"],
                            "beta": trial_data["beta"],
                            "gamma": trial_data["gamma"],
                        },
                        distributions=distributions,
                        value=trial_data["outcome"],
                    )
                )
            # Publish only a fully seeded study; if seeding raised above,
            # nothing is cached and the next call seeds from scratch.
            self._studies[budget_mode] = study

        best = study.best_params
        alpha = best["alpha"]
        beta = best["beta"]
        gamma = best["gamma"]
        logger.debug(f"Bayesian weights for '{budget_mode}': α={alpha:.3f} β={beta:.3f} γ={gamma:.3f}")
        return alpha, beta, gamma

    except Exception as e:
        logger.warning(f"Bayesian weight retrieval failed: {e}. Using V1 defaults.")
        return fallback
|
| 171 |
+
|
| 172 |
+
def record_outcome(
|
| 173 |
+
self,
|
| 174 |
+
budget_mode: str,
|
| 175 |
+
alpha: float, beta: float, gamma: float,
|
| 176 |
+
actual_cost: float,
|
| 177 |
+
quality_score: float,
|
| 178 |
+
) -> None:
|
| 179 |
+
"""
|
| 180 |
+
Records the outcome of a request. The 'outcome' score is what
|
| 181 |
+
we want to minimize: actual cost weighted against quality.
|
| 182 |
+
Call this after receiving a response + evaluation score.
|
| 183 |
+
"""
|
| 184 |
+
# Composite outcome: high cost = bad, low quality = bad
|
| 185 |
+
# Normalise: assume max_cost ~$0.02, quality in [1,10] → [0,1]
|
| 186 |
+
cost_norm = min(actual_cost / 0.02, 1.0)
|
| 187 |
+
quality_norm = quality_score / 10.0
|
| 188 |
+
outcome = cost_norm - quality_norm # minimise this
|
| 189 |
+
|
| 190 |
+
history = self._load_history(budget_mode)
|
| 191 |
+
history.append({
|
| 192 |
+
"alpha": alpha, "beta": beta, "gamma": gamma,
|
| 193 |
+
"actual_cost": actual_cost,
|
| 194 |
+
"quality_score": quality_score,
|
| 195 |
+
"outcome": outcome,
|
| 196 |
+
})
|
| 197 |
+
self._save_history(budget_mode, history)
|
| 198 |
+
# Invalidate the in-memory study so it reloads next time
|
| 199 |
+
self._studies.pop(budget_mode, None)
|
| 200 |
+
|
| 201 |
+
def _load_history(self, budget_mode: str) -> list:
|
| 202 |
+
if not BAYES_STUDY_PATH.exists():
|
| 203 |
+
return []
|
| 204 |
+
try:
|
| 205 |
+
data = json.loads(BAYES_STUDY_PATH.read_text())
|
| 206 |
+
return data.get(budget_mode, [])
|
| 207 |
+
except Exception:
|
| 208 |
+
return []
|
| 209 |
+
|
| 210 |
+
def _save_history(self, budget_mode: str, history: list) -> None:
|
| 211 |
+
existing = {}
|
| 212 |
+
if BAYES_STUDY_PATH.exists():
|
| 213 |
+
try:
|
| 214 |
+
existing = json.loads(BAYES_STUDY_PATH.read_text())
|
| 215 |
+
except Exception:
|
| 216 |
+
pass
|
| 217 |
+
existing[budget_mode] = history
|
| 218 |
+
BAYES_STUDY_PATH.parent.mkdir(parents=True, exist_ok=True)
|
| 219 |
+
BAYES_STUDY_PATH.write_text(json.dumps(existing, indent=2))
|
| 220 |
+
|
| 221 |
+
|
| 222 |
# ---------------------------------------------------------------------------
|
| 223 |
# Engine
|
| 224 |
# ---------------------------------------------------------------------------
|
|
|
|
| 227 |
"""
|
| 228 |
Core decision engine. Selects model + config that minimizes
|
| 229 |
J(x) = α·Cost + β·Tokens - γ·Quality under user constraints.
|
| 230 |
+
|
| 231 |
+
V2: Uses BayesianWeightOptimizer to learn optimal α,β,γ weights over time.
|
| 232 |
"""
|
| 233 |
|
| 234 |
def __init__(self, registry: ModelRegistry):
|
| 235 |
self.registry = registry
|
| 236 |
+
self.bayes = BayesianWeightOptimizer()
|
| 237 |
|
| 238 |
def optimize(
|
| 239 |
self,
|
|
|
|
| 244 |
if constraints is None:
|
| 245 |
constraints = UserConstraints()
|
| 246 |
|
| 247 |
+
α, β, γ = self.bayes.get_weights(constraints.budget_mode)
|
| 248 |
+
logger.debug(f"Using weights α={α:.3f} β={β:.3f} γ={γ:.3f} for mode '{constraints.budget_mode}'")
|
| 249 |
|
| 250 |
# --- 1. Build candidate set ---
|
| 251 |
candidates = self.registry.capable_of(
|
llmopt/estimator/complexity_estimator.py
CHANGED
|
@@ -7,11 +7,15 @@ V2 hook: swap in a supervised regressor or pairwise ranking model.
|
|
| 7 |
|
| 8 |
from __future__ import annotations
|
| 9 |
|
|
|
|
| 10 |
from dataclasses import dataclass
|
|
|
|
| 11 |
from typing import Optional
|
| 12 |
|
| 13 |
from llmopt.analyzer.query_analyzer import QueryFeatures
|
| 14 |
|
|
|
|
|
|
|
| 15 |
|
| 16 |
# ---------------------------------------------------------------------------
|
| 17 |
# Result schema
|
|
@@ -83,12 +87,45 @@ class ComplexityEstimator:
|
|
| 83 |
"""
|
| 84 |
Produces a normalized complexity score from QueryFeatures.
|
| 85 |
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
calibrated from labeled data in V2.
|
| 89 |
"""
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
def estimate(self, features: QueryFeatures) -> ComplexityResult:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
score = 0.0
|
| 93 |
rationale: list[str] = []
|
| 94 |
|
|
@@ -139,13 +176,22 @@ class ComplexityEstimator:
|
|
| 139 |
estimated_input_tokens = features.token_count * 1.3 + 50 # rough overhead
|
| 140 |
estimated_output_tokens = _OUTPUT_TOKENS_MAP.get(features.estimated_output_length, 300)
|
| 141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
return ComplexityResult(
|
| 143 |
score=round(score, 4),
|
| 144 |
tier=self._tier(score),
|
| 145 |
required_reasoning=round(required_reasoning, 3),
|
| 146 |
required_coding=round(required_coding, 3),
|
| 147 |
required_math=round(required_math, 3),
|
| 148 |
-
rationale=
|
| 149 |
estimated_input_tokens=int(estimated_input_tokens),
|
| 150 |
estimated_output_tokens=estimated_output_tokens,
|
| 151 |
)
|
|
|
|
| 7 |
|
| 8 |
from __future__ import annotations
|
| 9 |
|
| 10 |
+
import logging
|
| 11 |
from dataclasses import dataclass
|
| 12 |
+
from pathlib import Path
|
| 13 |
from typing import Optional
|
| 14 |
|
| 15 |
from llmopt.analyzer.query_analyzer import QueryFeatures
|
| 16 |
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
|
| 20 |
# ---------------------------------------------------------------------------
|
| 21 |
# Result schema
|
|
|
|
| 87 |
"""
|
| 88 |
Produces a normalized complexity score from QueryFeatures.
|
| 89 |
|
| 90 |
+
V2: Auto-detects and uses a trained GBR model if data/complexity_model.pkl exists.
|
| 91 |
+
Falls back to V1 weighted heuristic scoring if the model is not found.
|
|
|
|
| 92 |
"""
|
| 93 |
|
| 94 |
+
def __init__(self):
    """
    Try to load the trained complexity model; fall back to heuristics.

    ``load_model`` from ``scripts.train_complexity_model`` is called and,
    when it returns a truthy value, that value is unpacked into
    ``(model, feature_extractor)``. Any failure while importing or loading
    is logged and leaves both attributes as None, which makes ``estimate``
    use the V1 heuristic path.
    """
    self._ml_model = None
    self._feature_extractor = None
    try:
        import sys
        from pathlib import Path

        # The training script lives outside the package, three levels up.
        project_root = str(Path(__file__).parent.parent.parent)
        # Register the root only once: inserting unconditionally would add a
        # duplicate sys.path entry for every estimator that is constructed.
        if project_root not in sys.path:
            sys.path.insert(0, project_root)
        from scripts.train_complexity_model import load_model

        bundle = load_model()
        if bundle:
            self._ml_model, self._feature_extractor = bundle
            logger.info("ComplexityEstimator: Using trained GBR ML model.")
        else:
            logger.info("ComplexityEstimator: No trained model found, using V1 heuristics.")
    except Exception as e:
        # Model loading is optional; never let it break construction.
        logger.warning(f"ComplexityEstimator: Failed to load ML model: {e}. Using V1 heuristics.")
|
| 111 |
+
|
| 112 |
def estimate(self, features: QueryFeatures) -> ComplexityResult:
    """
    Score a query, preferring the trained model over the V1 heuristics.

    When both a model and a feature extractor are loaded, the extractor is
    applied to ``features.raw_query``, the model's first prediction is
    clamped to [0.0, 1.0] and turned into a result. If that path is
    unavailable or raises, the heuristic estimate is returned instead.
    """
    model = self._ml_model
    extract = self._feature_extractor

    if not (model and extract):
        return self._heuristic_estimate(features)

    try:
        # The extractor output is reshaped into a single-row batch.
        sample = extract(features.raw_query).reshape(1, -1)
        ml_score = float(model.predict(sample)[0])
        ml_score = max(0.0, min(ml_score, 1.0))
        logger.debug(f"GBR complexity score: {ml_score:.4f}")
        return self._build_result(features, ml_score)
    except Exception as e:
        logger.warning(f"ML complexity prediction failed: {e}. Falling back to V1.")

    # Reached only when the ML path raised.
    return self._heuristic_estimate(features)
|
| 127 |
+
|
| 128 |
+
def _heuristic_estimate(self, features: QueryFeatures) -> ComplexityResult:
|
| 129 |
score = 0.0
|
| 130 |
rationale: list[str] = []
|
| 131 |
|
|
|
|
| 176 |
estimated_input_tokens = features.token_count * 1.3 + 50 # rough overhead
|
| 177 |
estimated_output_tokens = _OUTPUT_TOKENS_MAP.get(features.estimated_output_length, 300)
|
| 178 |
|
| 179 |
+
return self._build_result(features, score)
|
| 180 |
+
|
| 181 |
+
def _build_result(self, features: QueryFeatures, score: float) -> ComplexityResult:
    """
    Assemble a ComplexityResult for ``score``.

    Shared by the ML and heuristic paths: the capability requirements and
    tier are derived from ``features`` and ``score`` by the class's helper
    methods, and token estimates come from ``features``.
    """
    # NOTE(review): the rationale is rebuilt from the score alone; this
    # method receives no rationale list from its caller.
    reasoning, coding, math_need = (
        round(requirement(features, score), 3)
        for requirement in (
            self._required_reasoning,
            self._required_coding,
            self._required_math,
        )
    )
    # Input estimate: token count scaled by 1.3 plus a flat 50 tokens.
    input_tokens = int(features.token_count * 1.3 + 50)
    output_tokens = _OUTPUT_TOKENS_MAP.get(features.estimated_output_length, 300)

    return ComplexityResult(
        score=round(score, 4),
        tier=self._tier(score),
        required_reasoning=reasoning,
        required_coding=coding,
        required_math=math_need,
        rationale=[f"score={score:.4f}"],
        estimated_input_tokens=input_tokens,
        estimated_output_tokens=output_tokens,
    )
|
llmopt/evaluation/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Evaluation module
|
llmopt/evaluation/evaluator.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Evaluation — LLM-as-a-Judge framework.
|
| 3 |
+
|
| 4 |
+
Uses gpt-4o-mini to evaluate the quality of responses generated by cheaper models.
|
| 5 |
+
Only runs when explicitly requested (evaluate=True in generate()).
|
| 6 |
+
"""
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
from dataclasses import dataclass
|
| 11 |
+
from typing import Optional
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
_JUDGE_PROMPT = """You are an objective AI quality evaluator.
|
| 16 |
+
|
| 17 |
+
A user asked:
|
| 18 |
+
"{query}"
|
| 19 |
+
|
| 20 |
+
An AI assistant responded:
|
| 21 |
+
"{response}"
|
| 22 |
+
|
| 23 |
+
Please evaluate this response on the following criteria and provide a score from 1-10 for each:
|
| 24 |
+
1. **Accuracy**: Is the information correct and factually accurate?
|
| 25 |
+
2. **Completeness**: Does it fully answer the question?
|
| 26 |
+
3. **Clarity**: Is it clear and easy to understand?
|
| 27 |
+
4. **Conciseness**: Is it appropriately concise without being too brief?
|
| 28 |
+
|
| 29 |
+
Also provide an **Overall Score** from 1-10.
|
| 30 |
+
|
| 31 |
+
Respond ONLY in this exact JSON format:
|
| 32 |
+
{{
|
| 33 |
+
"accuracy": <1-10>,
|
| 34 |
+
"completeness": <1-10>,
|
| 35 |
+
"clarity": <1-10>,
|
| 36 |
+
"conciseness": <1-10>,
|
| 37 |
+
"overall": <1-10>,
|
| 38 |
+
"feedback": "<one sentence summary of the main strength or weakness>"
|
| 39 |
+
}}"""
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@dataclass
class EvaluationResult:
    """Scores and feedback produced by the judge model for one response."""

    accuracy: float
    completeness: float
    clarity: float
    conciseness: float
    overall: float
    feedback: str
    judge_model: str

    def to_dict(self) -> dict:
        """Return the declared fields as a new dict (field name -> value)."""
        # asdict walks the declared dataclass fields, so attributes attached
        # to an instance after construction are not included.
        from dataclasses import asdict

        return asdict(self)

    def __str__(self) -> str:
        """Return a multi-line, human-readable summary of the scores."""
        report = [
            f"Evaluation (judge={self.judge_model})",
            f" Overall : {self.overall}/10",
            f" Accuracy : {self.accuracy}/10",
            f" Completeness: {self.completeness}/10",
            f" Clarity : {self.clarity}/10",
            f" Conciseness: {self.conciseness}/10",
            f" Feedback : {self.feedback}",
        ]
        return "\n".join(report)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class LLMJudge:
    """
    Evaluates LLM responses using a judge model (default: gpt-4o-mini).

    Evaluation is disabled when litellm cannot be imported, and any failure
    during a single evaluation is logged and reported as None rather than
    raised.
    """

    def __init__(self, judge_model: str = "gpt-4o-mini"):
        """Remember the judge model and disable evaluation if litellm is missing."""
        self.judge_model = judge_model
        self.enabled = True
        try:
            import litellm  # type: ignore # noqa: F401
        except ImportError:
            logger.warning("LLMJudge: litellm not installed. Evaluation disabled.")
            self.enabled = False

    @staticmethod
    def _parse_scores(raw: str) -> "Optional[dict]":
        """
        Extract the JSON object from a judge reply.

        Takes the text from the first '{' to the last '}', which tolerates
        markdown fences (with any language tag) and prose around the object.
        Returns None when no object is found, it does not parse, or the
        parsed value is not a JSON object.
        """
        import json

        start = raw.find("{")
        end = raw.rfind("}")
        if start == -1 or end < start:
            return None
        try:
            parsed = json.loads(raw[start:end + 1])
        except ValueError:
            return None
        return parsed if isinstance(parsed, dict) else None

    def evaluate(self, query: str, response: str) -> "Optional[EvaluationResult]":
        """
        Evaluate a query-response pair using the judge model.

        Returns an EvaluationResult built from the judge's JSON reply; a
        score missing from the reply is recorded as 0. Returns None if
        evaluation is disabled, the call fails, or the reply cannot be parsed.
        """
        if not self.enabled:
            return None

        try:
            import litellm  # type: ignore

            prompt = _JUDGE_PROMPT.format(query=query, response=response)
            result = litellm.completion(
                model=self.judge_model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0,
                max_tokens=300,
            )

            # The content may be None; treat that as an empty reply instead
            # of parsing the literal string "None".
            content = result.choices[0].message.content  # type: ignore
            scores = self._parse_scores(str(content or "").strip())
            if scores is None:
                logger.warning("LLMJudge: Evaluation failed: judge reply contained no JSON object")
                return None

            return EvaluationResult(
                accuracy=float(scores.get("accuracy", 0)),
                completeness=float(scores.get("completeness", 0)),
                clarity=float(scores.get("clarity", 0)),
                conciseness=float(scores.get("conciseness", 0)),
                overall=float(scores.get("overall", 0)),
                feedback=str(scores.get("feedback", "")),
                judge_model=self.judge_model,
            )

        except Exception as e:
            # Judging is optional: never let a judge failure reach the caller.
            logger.warning(f"LLMJudge: Evaluation failed: {e}")
            return None
|
llmopt/optimizer/prompt_optimizer.py
CHANGED
|
@@ -9,16 +9,19 @@ V1 strategy:
|
|
| 9 |
- Conversation history summarization (stub)
|
| 10 |
- System prompt selection
|
| 11 |
|
| 12 |
-
V2
|
| 13 |
-
|
| 14 |
"""
|
| 15 |
|
| 16 |
from __future__ import annotations
|
| 17 |
|
|
|
|
| 18 |
import re
|
| 19 |
from dataclasses import dataclass
|
| 20 |
from typing import Optional
|
| 21 |
|
|
|
|
|
|
|
| 22 |
|
| 23 |
@dataclass
|
| 24 |
class OptimizedPrompt:
|
|
@@ -78,9 +81,27 @@ _INSTRUCTION_REWRITES = [
|
|
| 78 |
class PromptOptimizer:
|
| 79 |
"""
|
| 80 |
Optimizes prompts to minimize token usage.
|
| 81 |
-
|
|
|
|
|
|
|
| 82 |
"""
|
| 83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
def optimize(
|
| 85 |
self,
|
| 86 |
query: str,
|
|
@@ -126,6 +147,24 @@ class PromptOptimizer:
|
|
| 126 |
|
| 127 |
def _compress(self, text: str) -> tuple[str, list[str]]:
|
| 128 |
techniques = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
original = text
|
| 130 |
|
| 131 |
# 1. Whitespace normalization
|
|
@@ -190,7 +229,7 @@ class PromptOptimizer:
|
|
| 190 |
The fallback is accurate to within ~10% for English text.
|
| 191 |
"""
|
| 192 |
try:
|
| 193 |
-
import tiktoken
|
| 194 |
enc = tiktoken.get_encoding("cl100k_base")
|
| 195 |
return len(enc.encode(text))
|
| 196 |
except Exception:
|
|
|
|
| 9 |
- Conversation history summarization (stub)
|
| 10 |
- System prompt selection
|
| 11 |
|
| 12 |
+
V2: LLMLingua semantic compression (if llmlingua installed)
|
| 13 |
+
Falls back to V1 heuristics if not available.
|
| 14 |
"""
|
| 15 |
|
| 16 |
from __future__ import annotations
|
| 17 |
|
| 18 |
+
import logging
|
| 19 |
import re
|
| 20 |
from dataclasses import dataclass
|
| 21 |
from typing import Optional
|
| 22 |
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
|
| 26 |
@dataclass
|
| 27 |
class OptimizedPrompt:
|
|
|
|
| 81 |
class PromptOptimizer:
|
| 82 |
"""
|
| 83 |
Optimizes prompts to minimize token usage.
|
| 84 |
+
|
| 85 |
+
V2: Uses LLMLingua for semantic compression when installed.
|
| 86 |
+
Falls back to V1 heuristic compression (filler removal, rewrites) if not.
|
| 87 |
"""
|
| 88 |
|
| 89 |
+
def __init__(self):
    """
    Set up the optional LLMLingua compressor.

    ``self._llmlingua`` stays None when the llmlingua package cannot be
    imported or the compressor fails to load; in that case a message is
    logged and the V1 heuristic compression is used instead.
    """
    self._llmlingua = None
    try:
        from llmlingua import PromptCompressor  # type: ignore

        logger.info("PromptOptimizer: Loading LLMLingua compressor...")
        compressor_options = {
            "model_name": "microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank",
            "use_llmlingua2": True,
            "device_map": "cpu",
        }
        self._llmlingua = PromptCompressor(**compressor_options)
        logger.info("PromptOptimizer: LLMLingua ready!")
    except ImportError:
        logger.info("PromptOptimizer: llmlingua not installed. Using V1 heuristic compression.")
    except Exception as load_error:
        logger.warning(f"PromptOptimizer: Failed to load LLMLingua: {load_error}. Using V1 heuristics.")
|
| 104 |
+
|
| 105 |
def optimize(
|
| 106 |
self,
|
| 107 |
query: str,
|
|
|
|
| 147 |
|
| 148 |
def _compress(self, text: str) -> tuple[str, list[str]]:
|
| 149 |
techniques = []
|
| 150 |
+
|
| 151 |
+
# --- V2: LLMLingua Semantic Compression ---
|
| 152 |
+
if self._llmlingua and len(text.split()) > 15:
|
| 153 |
+
try:
|
| 154 |
+
result = self._llmlingua.compress_prompt(
|
| 155 |
+
[text],
|
| 156 |
+
rate=0.6, # Keep 60% of tokens
|
| 157 |
+
force_tokens=["?"], # Always keep question marks
|
| 158 |
+
)
|
| 159 |
+
compressed = result["compressed_prompt"].strip()
|
| 160 |
+
# Only use if it actually saved tokens and isn't empty
|
| 161 |
+
if compressed and len(compressed.split()) < len(text.split()):
|
| 162 |
+
techniques.append("llmlingua_semantic_compression")
|
| 163 |
+
return compressed, techniques
|
| 164 |
+
except Exception as e:
|
| 165 |
+
logger.warning(f"LLMLingua compression failed: {e}. Falling back to V1.")
|
| 166 |
+
|
| 167 |
+
# --- V1: Heuristic Compression ---
|
| 168 |
original = text
|
| 169 |
|
| 170 |
# 1. Whitespace normalization
|
|
|
|
| 229 |
The fallback is accurate to within ~10% for English text.
|
| 230 |
"""
|
| 231 |
try:
|
| 232 |
+
import tiktoken # type: ignore
|
| 233 |
enc = tiktoken.get_encoding("cl100k_base")
|
| 234 |
return len(enc.encode(text))
|
| 235 |
except Exception:
|
llmopt/router/model_router.py
CHANGED
|
@@ -104,8 +104,8 @@ class ModelRouter:
|
|
| 104 |
|
| 105 |
def _configure_litellm(self) -> None:
|
| 106 |
try:
|
| 107 |
-
import litellm
|
| 108 |
-
litellm.set_verbose = False
|
| 109 |
# Set Ollama base URL so litellm knows where to find local models
|
| 110 |
os.environ.setdefault("OLLAMA_API_BASE", self.ollama_base_url)
|
| 111 |
except ImportError:
|
|
@@ -150,7 +150,7 @@ class ModelRouter:
|
|
| 150 |
usage = getattr(response, "usage", None)
|
| 151 |
input_tokens = getattr(usage, "prompt_tokens", 0) if usage else 0
|
| 152 |
output_tokens = getattr(usage, "completion_tokens", 0) if usage else 0
|
| 153 |
-
content = response.choices[0].message.content or ""
|
| 154 |
|
| 155 |
# Cost calculation using actual token usage
|
| 156 |
estimated_cost = (
|
|
@@ -171,7 +171,7 @@ class ModelRouter:
|
|
| 171 |
)
|
| 172 |
|
| 173 |
def _call_litellm(self, model: str, messages: list[dict], **kwargs) -> object:
|
| 174 |
-
import litellm
|
| 175 |
return litellm.completion(model=model, messages=messages, **kwargs)
|
| 176 |
|
| 177 |
# ------------------------------------------------------------------
|
|
@@ -188,7 +188,7 @@ class ModelRouter:
|
|
| 188 |
"""Generator that yields text chunks as they arrive."""
|
| 189 |
litellm_model = _LITELLM_MODEL_MAP.get(model_name, model_name)
|
| 190 |
try:
|
| 191 |
-
import litellm
|
| 192 |
for chunk in litellm.completion(
|
| 193 |
model=litellm_model,
|
| 194 |
messages=messages,
|
|
@@ -196,7 +196,7 @@ class ModelRouter:
|
|
| 196 |
temperature=temperature,
|
| 197 |
stream=True,
|
| 198 |
):
|
| 199 |
-
delta = chunk.choices[0].delta
|
| 200 |
text = getattr(delta, "content", "") or ""
|
| 201 |
if text:
|
| 202 |
yield text
|
|
|
|
| 104 |
|
| 105 |
def _configure_litellm(self) -> None:
|
| 106 |
try:
|
| 107 |
+
import litellm # type: ignore
|
| 108 |
+
litellm.set_verbose = False # type: ignore
|
| 109 |
# Set Ollama base URL so litellm knows where to find local models
|
| 110 |
os.environ.setdefault("OLLAMA_API_BASE", self.ollama_base_url)
|
| 111 |
except ImportError:
|
|
|
|
| 150 |
usage = getattr(response, "usage", None)
|
| 151 |
input_tokens = getattr(usage, "prompt_tokens", 0) if usage else 0
|
| 152 |
output_tokens = getattr(usage, "completion_tokens", 0) if usage else 0
|
| 153 |
+
content = response.choices[0].message.content or "" # type: ignore
|
| 154 |
|
| 155 |
# Cost calculation using actual token usage
|
| 156 |
estimated_cost = (
|
|
|
|
| 171 |
)
|
| 172 |
|
| 173 |
def _call_litellm(self, model: str, messages: list[dict], **kwargs) -> object:
|
| 174 |
+
import litellm # type: ignore
|
| 175 |
return litellm.completion(model=model, messages=messages, **kwargs)
|
| 176 |
|
| 177 |
# ------------------------------------------------------------------
|
|
|
|
| 188 |
"""Generator that yields text chunks as they arrive."""
|
| 189 |
litellm_model = _LITELLM_MODEL_MAP.get(model_name, model_name)
|
| 190 |
try:
|
| 191 |
+
import litellm # type: ignore
|
| 192 |
for chunk in litellm.completion(
|
| 193 |
model=litellm_model,
|
| 194 |
messages=messages,
|
|
|
|
| 196 |
temperature=temperature,
|
| 197 |
stream=True,
|
| 198 |
):
|
| 199 |
+
delta = chunk.choices[0].delta # type: ignore
|
| 200 |
text = getattr(delta, "content", "") or ""
|
| 201 |
if text:
|
| 202 |
yield text
|
pyproject.toml
CHANGED
|
@@ -16,7 +16,7 @@ dependencies = [
|
|
| 16 |
]
|
| 17 |
|
| 18 |
[project.optional-dependencies]
|
| 19 |
-
ml = ["scikit-learn", "numpy", "pandas"]
|
| 20 |
local = ["ollama"]
|
| 21 |
dev = ["pytest", "black", "isort"]
|
| 22 |
|
|
|
|
| 16 |
]
|
| 17 |
|
| 18 |
[project.optional-dependencies]
|
| 19 |
+
ml = ["scikit-learn", "numpy", "pandas", "transformers", "torch", "redis", "sentence-transformers", "llmlingua", "optuna"]
|
| 20 |
local = ["ollama"]
|
| 21 |
dev = ["pytest", "black", "isort"]
|
| 22 |
|
scripts/fix_json.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
|
| 3 |
+
# Path of the training data file, relative to the current working directory.
DATA_PATH = 'data/complexity_training_data.json'
# Text that identifies the line to repair.
BROKEN_MARKER = 'Convert this direct speech to indirect'
# Replacement written over the first line that contains BROKEN_MARKER.
FIXED_LINE = ' "query": "Convert this direct speech to indirect: He said he would come tomorrow.",'


def repair_query_line(content: str) -> "tuple[str, int]":
    """
    Replace the first line containing BROKEN_MARKER with FIXED_LINE.

    Returns the repaired text and the 1-based number of the replaced line,
    or the unchanged text and 0 when no line contains the marker.
    """
    lines = content.split('\n')
    for index, line in enumerate(lines):
        if BROKEN_MARKER in line:
            lines[index] = FIXED_LINE
            return '\n'.join(lines), index + 1
    return content, 0


def main() -> None:
    """Repair the data file in place if the repaired text is valid JSON."""
    with open(DATA_PATH, 'r', encoding='utf-8') as f:
        content = f.read()

    content, line_no = repair_query_line(content)
    if line_no:
        print(f'Fixed line {line_no}')

    # Validate before opening the file for writing, so an invalid result
    # never overwrites the original data.
    try:
        data = json.loads(content)
    except json.JSONDecodeError as e:
        print(f'Still broken: {e}')
        return

    print(f'JSON valid: {len(data)} records')
    with open(DATA_PATH, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print('Saved!')


if __name__ == '__main__':
    main()
|
tests/test_pipeline.py
CHANGED
|
@@ -9,7 +9,7 @@ from pathlib import Path
|
|
| 9 |
ROOT = Path(__file__).parent.parent
|
| 10 |
sys.path.insert(0, str(ROOT))
|
| 11 |
|
| 12 |
-
import pytest
|
| 13 |
from llmopt.analyzer.query_analyzer import QueryAnalyzer
|
| 14 |
from llmopt.estimator.complexity_estimator import ComplexityEstimator
|
| 15 |
from llmopt.engine.optimization_engine import OptimizationEngine, UserConstraints
|
|
@@ -120,7 +120,7 @@ class TestQueryAnalyzer:
|
|
| 120 |
class TestComplexityEstimator:
|
| 121 |
CASES = [
|
| 122 |
("What is Python?", 0.0, 0.30),
|
| 123 |
-
("Write a hello world in JavaScript", 0.0, 0.
|
| 124 |
("Explain binary search with code", 0.0, 0.65), # medium query
|
| 125 |
("Design Paxos consensus algorithm", 0.50, 1.0),
|
| 126 |
("Prove Fermat's last theorem", 0.50, 1.0),
|
|
|
|
| 9 |
ROOT = Path(__file__).parent.parent
|
| 10 |
sys.path.insert(0, str(ROOT))
|
| 11 |
|
| 12 |
+
import pytest # type: ignore
|
| 13 |
from llmopt.analyzer.query_analyzer import QueryAnalyzer
|
| 14 |
from llmopt.estimator.complexity_estimator import ComplexityEstimator
|
| 15 |
from llmopt.engine.optimization_engine import OptimizationEngine, UserConstraints
|
|
|
|
| 120 |
class TestComplexityEstimator:
|
| 121 |
CASES = [
|
| 122 |
("What is Python?", 0.0, 0.30),
|
| 123 |
+
("Write a hello world in JavaScript", 0.0, 0.80), # ML model scores generation signals higher
|
| 124 |
("Explain binary search with code", 0.0, 0.65), # medium query
|
| 125 |
("Design Paxos consensus algorithm", 0.50, 1.0),
|
| 126 |
("Prove Fermat's last theorem", 0.50, 1.0),
|