|
|
from pydantic import BaseModel, Field |
|
|
from typing import List, Dict, Literal, Optional, Any |
|
|
from dataclasses import dataclass, asdict |
|
|
from datetime import datetime |
|
|
|
|
|
class RouteQuery(BaseModel):
    """Route a user query to the most relevant content type.

    Pydantic schema with a single constrained choice; the ``Field``
    description spells out when each route applies.  The phrasing of
    that description suggests it doubles as guidance for an LLM
    structured-output call — TODO confirm against the caller.
    """

    # Allowed routes: "course", "program", or "both" (fallback for
    # mixed/unclear questions).  Required field (``...`` = no default).
    content_type: Literal["course", "program", "both"] = Field(
        ...,
        description="Route to: 'course' for specific course questions, 'program' for program questions, 'both' when the question involves both or is unclear"
    )
|
|
|
|
|
@dataclass
class DocumentMetadata:
    """Metadata for processed documents.

    Plain value container; field semantics are defined by the document
    processing pipeline that populates it (not visible here).
    """

    source: str    # where the document came from (path/URL — confirm with producer)
    type: str      # NOTE: shadows builtin ``type``; kept as-is for caller compatibility
    category: str
    doc_type: str  # document kind; relationship to ``type`` above is set by the producer
    filename: str  # original file name of the document
    # Only meaningful for course documents, presumably — hence optional.
    course_code: Optional[str] = None
|
|
|
|
|
@dataclass
class QueryResult:
    """Result of a RAG query.

    Bundles the generated answer with its retrieval context plus
    optional diagnostics gathered during processing.
    """

    answer: str                  # final generated answer text
    source_documents: List[Any]  # retrieved documents backing the answer (project document type — confirm)
    content_type: str            # route that produced this result; presumably "course"/"program"/"both"
    processing_time: Optional[float] = None         # elapsed seconds, if measured
    generated_queries: Optional[List[str]] = None   # rewritten/expanded queries used for retrieval, if any
    retrieval_stats: Optional[Dict[str, Any]] = None  # extra retrieval diagnostics, if collected
|
|
|
|
|
@dataclass
class ChatInteraction:
    """Single chat interaction for logging.

    Each field groups one facet of the interaction as a free-form
    dictionary; the exact keys are chosen by the logging caller and
    are not constrained here.
    """

    timestamp: str                # timestamp string; format set by the caller
    query: Dict[str, Any]         # details of the user query
    retrieval: Dict[str, Any]     # details of the retrieval step
    response: Dict[str, str]      # details of the generated response
    performance: Dict[str, Any]   # timing / performance measurements
    chat_context: Dict[str, Any]  # conversation context at the time of the query
    system_info: Dict[str, Any]   # system/model/config information
|
|
|
|
|
@dataclass
class RetrievalStats:
    """Statistics about document retrieval."""

    total_documents: int            # number of documents retrieved
    document_types: Dict[str, int]  # count of retrieved documents per type
    search_config: Dict[str, Any]   # search parameters used for this retrieval
    queries_used: List[str]         # queries actually issued against the store
|
|
|
|
|
class EmbeddingConfig(BaseModel):
    """Configuration for embeddings.

    Defaults target the OpenAI ``text-embedding-3-small`` model.
    """

    model: str = "text-embedding-3-small"  # embedding model identifier
    chunk_size: int = 1000    # chunk size for embedding; unit (chars vs tokens) set by the splitter — confirm
    max_retries: int = 3      # retry attempts for failed requests
    request_timeout: int = 60  # per-request timeout, presumably in seconds
|
|
|
|
|
class ModelConfig(BaseModel):
    """Configuration for LLM models."""

    model_name: str = "gpt-4o-mini"   # chat model identifier
    temperature: float = 0.1          # low temperature → mostly deterministic output
    max_tokens: Optional[int] = None  # None defers to the provider's own limit
|
|
|
|
|
class VectorStoreConfig(BaseModel):
    """Configuration for vector store.

    The defaults point at a Chroma-style on-disk store.
    """

    persist_directory: str = "./data/chroma"  # on-disk persistence location
    collection_name: str = "course_docs"      # collection to read/write
    # default_factory avoids sharing one mutable dict between instances.
    # "hnsw:space": "cosine" selects the HNSW distance metric in Chroma.
    collection_metadata: Dict[str, str] = Field(default_factory=lambda: {"hnsw:space": "cosine"})
|
|
|
|
|
class RetrievalConfig(BaseModel):
    """Configuration for retrieval."""

    # "mmr" = maximal marginal relevance (diversity-aware search), as
    # supported by e.g. LangChain retrievers — confirm against the store used.
    search_type: str = "mmr"
    # Documents to return per routed content type; keys mirror the
    # "course"/"program"/"both" routing values used elsewhere in this module.
    k_values: Dict[str, int] = Field(default_factory=lambda: {
        "course": 6,
        "program": 15,
        "both": 15
    })
    # Presumably fetch_k (candidate pool before MMR re-ranking) is
    # k * this multiplier — TODO confirm at the call site.
    fetch_k_multiplier: int = 3
|
|
|
|
|
@dataclass
class ProcessingStats:
    """Counters gathered while processing documents.

    Tracks how many source documents were handled (split by course vs
    program), how many chunks were produced, and the elapsed time.
    """

    total_documents: int    # documents processed overall
    courses_processed: int  # course documents handled
    programs_processed: int  # program documents handled
    chunks_created: int     # text chunks produced
    processing_time: float  # elapsed processing time

    def to_dict(self) -> Dict[str, Any]:
        """Serialize every field into a plain ``dict``."""
        as_mapping = asdict(self)
        return as_mapping
|
|
|
|
|
class ChatMemoryMessage(BaseModel):
    """Message in chat memory."""

    role: str                        # speaker role (e.g. "user"/"assistant") — not validated here
    content: str                     # message text
    timestamp: Optional[str] = None  # optional timestamp string; format set by the caller
|
|
|
|
|
class SystemStatus(BaseModel):
    """System status information."""

    database_initialized: bool = False  # whether the vector database has been set up
    documents_loaded: int = 0           # number of documents currently loaded
    model_version: str = ""             # LLM model identifier in use
    embedding_version: str = ""         # embedding model identifier in use
    last_updated: Optional[str] = None  # timestamp string of the last update, if known