File size: 3,174 Bytes
8629355
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from pydantic import BaseModel, Field
from typing import List, Dict, Literal, Optional, Any
from dataclasses import dataclass, asdict
from datetime import datetime

class RouteQuery(BaseModel):
    """Route a user query to the most relevant content type.

    Pydantic model with a single required field constrained to three
    literal values; presumably used as a structured-output schema for an
    LLM-based router — confirm against the caller.
    """
    # Required (Ellipsis default): validation fails if no value is supplied.
    content_type: Literal["course", "program", "both"] = Field(
        ...,
        description="Route to: 'course' for specific course questions, 'program' for program questions, 'both' when the question involves both or is unclear"
    )

@dataclass
class DocumentMetadata:
    """Metadata for processed documents.

    Plain value object attached to ingested documents; all fields are
    required except ``course_code``.
    """
    source: str  # source identifier of the document (presumably a path — verify against caller)
    type: str  # 'markdown' or 'pdf'
    category: str  # 'courses' or 'programs'
    doc_type: str  # 'course' or 'program'
    filename: str  # name of the originating file
    course_code: Optional[str] = None  # NOTE(review): likely set only when doc_type == 'course' — confirm

@dataclass
class QueryResult:
    """Result of a RAG query.

    Bundles the generated answer with the documents that supported it and
    optional diagnostics about how retrieval was performed.
    """
    answer: str  # final answer text returned to the user
    source_documents: List[Any]  # List of Document objects (project type, hence Any)
    content_type: str  # routing outcome, e.g. 'course' / 'program' / 'both'
    processing_time: Optional[float] = None  # end-to-end duration; units not shown here — presumably seconds
    generated_queries: Optional[List[str]] = None  # alternate queries used for retrieval, if any
    retrieval_stats: Optional[Dict[str, Any]] = None  # free-form retrieval diagnostics

@dataclass
class ChatInteraction:
    """Single chat interaction for logging.

    Each field is a pre-structured section of the log record; the dict
    schemas are defined by the logger that builds these — not visible here.
    """
    timestamp: str  # when the interaction occurred (format set by caller)
    query: Dict[str, Any]  # details of the user query
    retrieval: Dict[str, Any]  # details of the retrieval step
    response: Dict[str, str]  # generated response data (string values only)
    performance: Dict[str, Any]  # timing/performance measurements
    chat_context: Dict[str, Any]  # conversation context at the time of the query
    system_info: Dict[str, Any]  # environment/system metadata

@dataclass
class RetrievalStats:
    """Statistics about document retrieval.

    Summary of a single retrieval pass, suitable for logging or for the
    ``retrieval_stats`` field of ``QueryResult``.
    """
    total_documents: int  # number of documents retrieved
    document_types: Dict[str, int]  # count of retrieved documents per type
    search_config: Dict[str, Any]  # search parameters used for this pass
    queries_used: List[str]  # the query strings actually issued

class EmbeddingConfig(BaseModel):
    """Configuration for embeddings.

    Defaults target the OpenAI ``text-embedding-3-small`` model; all
    fields can be overridden at construction time.
    """
    model: str = "text-embedding-3-small"  # embedding model identifier
    chunk_size: int = 1000  # batch/chunk size passed to the embedder
    max_retries: int = 3  # retry attempts on failed embedding requests
    request_timeout: int = 60  # per-request timeout — presumably seconds

class ModelConfig(BaseModel):
    """Configuration for LLM models."""
    model_name: str = "gpt-4o-mini"  # chat/completion model identifier
    temperature: float = 0.1  # low temperature favors deterministic answers
    max_tokens: Optional[int] = None  # None defers to the provider's default limit

class VectorStoreConfig(BaseModel):
    """Configuration for vector store.

    Defaults describe a local Chroma collection using cosine distance.
    """
    persist_directory: str = "./data/chroma"  # on-disk location of the store
    collection_name: str = "course_docs"
    # default_factory so each instance gets its own dict (mutable default);
    # "hnsw:space" selects the HNSW distance metric in Chroma.
    collection_metadata: Dict[str, str] = Field(default_factory=lambda: {"hnsw:space": "cosine"})

class RetrievalConfig(BaseModel):
    """Configuration for retrieval."""
    search_type: str = "mmr"  # maximal-marginal-relevance search by default
    # Per-route top-k: fewer docs for narrow 'course' questions, more for
    # broader 'program'/'both' questions. default_factory avoids a shared dict.
    k_values: Dict[str, int] = Field(default_factory=lambda: {
        "course": 6,
        "program": 15,
        "both": 15
    })
    # Candidate pool size multiplier: fetch_k = k * fetch_k_multiplier — TODO confirm usage at call site
    fetch_k_multiplier: int = 3

@dataclass
class ProcessingStats:
    """Statistics about document processing.

    Simple counters collected during ingestion, plus the total processing
    duration (units not specified here — presumably seconds).
    """
    total_documents: int  # all documents handled in the run
    courses_processed: int  # documents classified as courses
    programs_processed: int  # documents classified as programs
    chunks_created: int  # text chunks produced from those documents
    processing_time: float  # total duration of the run

    def to_dict(self) -> Dict[str, Any]:
        """Return the stats as a plain ``{field_name: value}`` dict."""
        # asdict is sufficient here: the dataclass is flat (no nesting).
        stats_dict: Dict[str, Any] = asdict(self)
        return stats_dict

class ChatMemoryMessage(BaseModel):
    """Message in chat memory."""
    role: str  # message author, e.g. 'user' / 'assistant' — not constrained here
    content: str  # message text
    timestamp: Optional[str] = None  # optional creation time (format set by caller)

class SystemStatus(BaseModel):
    """System status information.

    Snapshot of the RAG system's readiness; all defaults represent an
    uninitialized system.
    """
    database_initialized: bool = False  # whether the vector store is ready
    documents_loaded: int = 0  # number of documents currently indexed
    model_version: str = ""  # LLM identifier in use
    embedding_version: str = ""  # embedding model identifier in use
    last_updated: Optional[str] = None  # timestamp of the last status change, if known