# File size: 5,145 Bytes
# Revision: 59e6760
from typing import Optional, List, Dict, Any
from pydantic import BaseModel, Field
# Models
class OptimizeRequest(BaseModel):
    """
    🧠 Explicit optimization request: user provides all pipeline configs manually.

    Every field is optional and carries a default, so an empty request body
    validates. The list-valued fields (``retriever``, ``embedding_model``,
    ``strategy``, ``chunk_sizes``, ``overlaps``, ``rerankers``) each accept
    several candidate values.
    """

    # NOTE(review): the pictographs in the docstring and descriptions were
    # restored from mis-decoded bytes. Where the surviving bytes were
    # ambiguous a fitting glyph was chosen -- confirm against the upstream file.
    docs_path: Optional[str] = Field(
        default="data/docs",
        description="📂 Folder containing your documents for RAG optimization. Example: 'data/docs'",
    )
    retriever: Optional[List[str]] = Field(
        description="🔍 Retriever type to use. Example: 'bm25', 'faiss', 'chroma'",
        default=['faiss'],
    )
    embedding_model: Optional[List[str]] = Field(
        description="🧠 Embedding model name or path. Example: 'sentence-transformers/all-MiniLM-L6-v2'",
        default=['sentence-transformers/all-MiniLM-L6-v2'],
    )
    strategy: Optional[List[str]] = Field(
        description="🎯 RAG strategy name. Example: 'fixed', 'token', 'sentence'",
        default=['fixed'],
    )
    chunk_sizes: Optional[List[int]] = Field(
        description="📏 List of chunk sizes to evaluate. Example: [200, 400, 600]",
        default=[200, 400, 600],
    )
    overlaps: Optional[List[int]] = Field(
        description="🔁 List of overlap values to test. Example: [50, 100, 200]",
        default=[50, 100, 200],
    )
    rerankers: Optional[List[str]] = Field(
        default=["mmr"],
        description="⚖️ Rerankers to apply after retrieval. Default: ['mmr']",
    )
    search_type: Optional[str] = Field(
        default="grid",
        description="🔍 Search method to explore parameter space. Options: 'grid', 'random', 'bayesian'",
    )
    trials: Optional[int] = Field(
        default=5,
        description="🧪 Number of optimization trials to run.",
    )
    metric: Optional[str] = Field(
        default="faithfulness",
        description="📊 Evaluation metric for optimization. Options: 'faithfulness'",
    )
    # NOTE(review): the field default is 'generate', so omitting the field does
    # not select the default file mentioned in the description -- confirm how a
    # blank value is handled by the consumer of this model.
    validation_choice: Optional[str] = Field(
        default='generate',
        description=(
            "✅ Validation data source. Options:\n"
            " - Leave blank → use default 'validation_qa.json' if available\n"
            " - 'generate' → auto-generate a validation QA file from your docs\n"
            " - Path to a local JSON file (e.g. 'data/validation_qa.json')\n"
            " - Hugging Face dataset ID (e.g. 'squad')"
        ),
    )
    llm_model: Optional[str] = Field(
        default="gemini-2.5-flash-lite",
        description="🤖 LLM used to generate QA dataset when validation_choice='generate'. Example: 'gemini-pro', 'gpt-4o-mini'",
    )
class AutotuneRequest(BaseModel):
    """
    Auto-tuning request: a single embedding model plus search settings.

    Every field is optional and carries a default, so an empty request body
    validates. This model has no retriever, strategy, chunk-size, overlap or
    reranker fields; it exposes ``num_chunk_pairs`` instead.
    """

    # NOTE(review): the pictographs in the descriptions were restored from
    # mis-decoded bytes. Where the surviving bytes were ambiguous a fitting
    # glyph was chosen -- confirm against the upstream file.
    docs_path: Optional[str] = Field(
        default="data/docs",
        description="📂 Folder containing your documents for RAG optimization. Example: 'data/docs'",
    )
    embedding_model: Optional[str] = Field(
        default="sentence-transformers/all-MiniLM-L6-v2",
        description="🧠 Embedding model name or path. Example: 'sentence-transformers/all-MiniLM-L6-v2'",
    )
    num_chunk_pairs: Optional[int] = Field(
        default=5,
        description="🔢 Number of chunk pairs to analyze for tuning.",
    )
    metric: Optional[str] = Field(
        default="faithfulness",
        description="📊 Evaluation metric for optimization. Options: 'faithfulness'",
    )
    search_type: Optional[str] = Field(
        default="grid",
        description="🔍 Search method to explore parameter space. Options: 'grid', 'random', 'bayesian'",
    )
    trials: Optional[int] = Field(
        default=5,
        description="🧪 Number of optimization trials to run.",
    )
    # NOTE(review): this description names 'validation_qa.jsonl' as the default
    # file while its own example path ends in '.json' -- confirm which extension
    # is correct. The field default is 'generate', so omitting the field does
    # not select that default file.
    validation_choice: Optional[str] = Field(
        default='generate',
        description=(
            "✅ Validation data source. Options:\n"
            " - Leave blank → use default 'validation_qa.jsonl' if available\n"
            " - 'generate' → auto-generate a validation QA file from your docs\n"
            " - Path to a local JSON file (e.g. 'data/validation_qa.json')\n"
            " - Hugging Face dataset ID (e.g. 'squad')"
        ),
    )
    llm_model: Optional[str] = Field(
        default="gemini-2.5-flash-lite",
        description="🤖 LLM used to generate QA dataset when validation_choice='generate'. Example: 'gemini-pro', 'gpt-4o-mini'",
    )
class QARequest(BaseModel):
    """
    🧩 Generates a validation QA dataset for RAG evaluation.

    All fields are non-optional types with defaults: they may be omitted from
    the request, but an explicit null is not an accepted value.
    """

    # NOTE(review): the pictographs in the docstring and descriptions were
    # restored from mis-decoded bytes. Where the surviving bytes were
    # ambiguous a fitting glyph was chosen -- confirm against the upstream file.
    docs_path: str = Field(
        description="📂 Folder containing your documents to generate QA pairs from. Example: 'data/docs'",
        default='data/docs',
    )
    llm_model: str = Field(
        default="gemini-2.5-flash-lite",
        description="🤖 LLM model used for question generation. Example: 'gemini-2.5-flash-lite', 'gpt-4o-mini'",
    )
    batch_size: int = Field(
        default=5,
        description="📦 Number of documents processed per generation batch.",
    )
    # NOTE(review): nothing in this model enforces min_q <= max_q; presumably
    # the consumer checks it -- verify.
    min_q: int = Field(
        default=3,
        description="❓ Minimum number of questions per document.",
    )
    max_q: int = Field(
        default=25,
        description="❓ Maximum number of questions per document.",
    )
|