faerazo committed
Commit 8629355 · verified · 1 Parent(s): 50af053

Initial commit to HFS

.dockerignore ADDED
@@ -0,0 +1,72 @@
+ # Git
+ .git
+ .gitignore
+ .gitattributes
+
+ # Python
+ __pycache__
+ *.pyc
+ *.pyo
+ *.pyd
+ .Python
+ *.so
+ .coverage
+ .coverage.*
+ coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.log
+ .pytest_cache/
+ .hypothesis/
+
+ # Virtual environments
+ venv/
+ env/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # OS
+ .DS_Store
+ .DS_Store?
+ ._*
+ .Spotlight-V100
+ .Trashes
+ ehthumbs.db
+ Thumbs.db
+
+ # Environment files
+ .env
+ .env.local
+ .env.*.local
+
+ # Node modules (if any)
+ node_modules/
+
+ # Documentation
+ README.md
+ docs/
+ *.md
+ !README.md
+
+ # Temporary files
+ *.tmp
+ *.temp
+
+ # Build artifacts
+ build/
+ dist/
+ *.egg-info/
+
+ # Other
+ chat_history.json
+ *.log
+ *.pid
Dockerfile ADDED
@@ -0,0 +1,46 @@
+ # GuPT - Gothenburg University RAG System
+ # Optimized Docker build for Hugging Face Spaces
+ # Uses environment.yml for dependencies
+
+ FROM python:3.11-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     gcc \
+     g++ \
+     curl \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy environment file and install dependencies
+ COPY environment.yml .
+ # Extract pip dependencies from environment.yml and install them
+ RUN grep -A 100 "pip:" environment.yml | grep " -" | sed 's/ - //' > requirements.txt && \
+     pip install --no-cache-dir -r requirements.txt
+
+ # Copy application code
+ COPY . .
+
+ # Create non-root user for security (required by Hugging Face Spaces)
+ RUN useradd --create-home --shell /bin/bash --uid 1000 user
+ RUN chown -R user:user /app
+ USER user
+
+ # Set environment variables
+ ENV PYTHONPATH=/app
+ ENV PYTHONDONTWRITEBYTECODE=1
+ ENV PYTHONUNBUFFERED=1
+ ENV GRADIO_SERVER_NAME=0.0.0.0
+ ENV GRADIO_SERVER_PORT=7860
+
+ # Expose port 7860 (required by Hugging Face Spaces)
+ EXPOSE 7860
+
+ # Health check
+ HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
+     CMD curl -f http://localhost:7860/ || exit 1
+
+ # Command to run the application
+ CMD ["python", "src/main.py", "--host", "0.0.0.0", "--port", "7860"]
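
The RUN step above derives requirements.txt from environment.yml at build time. For illustration only, a rough Python equivalent of that extraction (a sketch, not part of the build; it assumes PyYAML is installed, which the project itself does not list as a dependency):

import yaml  # assumption: PyYAML is available in the environment running this sketch

with open("environment.yml") as f:
    env = yaml.safe_load(f)

# The pip requirements live in a nested dict inside the "dependencies" list.
pip_deps = []
for dep in env.get("dependencies", []):
    if isinstance(dep, dict) and "pip" in dep:
        pip_deps.extend(dep["pip"])

with open("requirements.txt", "w") as f:
    f.write("\n".join(pip_deps) + "\n")
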
environment.yml ADDED
@@ -0,0 +1,32 @@
+ name: gupt
+ channels:
+   - conda-forge
+   - defaults
+ dependencies:
+   - python=3.11
+   - pip
+   - jupyter
+   - pip:
+     - langchain==0.3.26
+     - langchain-openai==0.3.27
+     - langchain-community==0.3.27
+     - langchain-core==0.3.68
+     - langchain-chroma==0.1.4
+     - langchain-text-splitters==0.3.8
+     - openai==1.95.1
+     - chromadb==0.5.23
+     - gradio==5.22.0
+     - python-dotenv==1.1.1
+     - numpy==1.26.4
+     - pandas==2.2.3
+     - rouge-score==0.1.2
+     - sentence-transformers==3.3.0
+     - bert-score==0.3.13
+     - scikit-learn==1.5.2
+     - typing-extensions==4.12.2
+     - pydantic==2.11.7
+     - pypdf==5.1.0
+     - requests==2.32.3
+     - urllib3==2.2.3
+     - charset-normalizer==3.4.0
+     - posthog==3.7.2
requirements.txt ADDED
@@ -0,0 +1,23 @@
+ langchain==0.3.26
+ langchain-openai==0.3.27
+ langchain-community==0.3.27
+ langchain-core==0.3.68
+ langchain-chroma==0.1.4
+ langchain-text-splitters==0.3.8
+ openai==1.95.1
+ chromadb==0.5.23
+ gradio==5.22.0
+ python-dotenv==1.1.1
+ numpy==1.26.4
+ pandas==2.2.3
+ rouge-score==0.1.2
+ sentence-transformers==3.3.0
+ bert-score==0.3.13
+ scikit-learn==1.5.2
+ typing-extensions==4.12.2
+ pydantic==2.11.7
+ pypdf==5.1.0
+ requests==2.32.3
+ urllib3==2.2.3
+ charset-normalizer==3.4.0
+ posthog==3.7.2
src/chat_logger.py ADDED
@@ -0,0 +1,257 @@
1
+ import os
2
+ import json
3
+ import time
4
+ from datetime import datetime
5
+ from typing import List, Dict, Any, Optional
6
+ from dataclasses import asdict
7
+
8
+ from models import ChatInteraction, RetrievalStats
9
+ from config import Config
10
+
11
+ class ChatLogger:
12
+ """Handles logging of chat interactions with enhanced metadata."""
13
+
14
+ def __init__(self, log_file: str = None):
15
+ """Initialize the chat logger.
16
+
17
+ Args:
18
+ log_file: Path to the log file. If None, uses config default.
19
+ """
20
+ self.log_file = log_file or Config.LOG_FILE
21
+ self._initialize_log_file()
22
+
23
+ def _initialize_log_file(self):
24
+ """Create log file if it doesn't exist."""
25
+ if not os.path.exists(self.log_file):
26
+ with open(self.log_file, 'w') as f:
27
+ json.dump([], f)
28
+
29
+ def log_interaction(self,
30
+ question: str,
31
+ answer: str,
32
+ source_documents: List[Any],
33
+ content_type: str,
34
+ generated_queries: List[str],
35
+ processing_time: float,
36
+ chat_history: List[Any],
37
+ system_info: Dict[str, Any]) -> None:
38
+ """Log a complete chat interaction with detailed metadata.
39
+
40
+ Args:
41
+ question: The user's question
42
+ answer: The generated answer
43
+ source_documents: Retrieved documents
44
+ content_type: The routing type (course/program/both)
45
+ generated_queries: List of generated query variations
46
+ processing_time: Time taken to process the query
47
+ chat_history: Chat memory messages
48
+ system_info: System configuration info
49
+ """
50
+ try:
51
+ # Prepare retrieval statistics
52
+ retrieval_stats = self._prepare_retrieval_stats(
53
+ source_documents, content_type, generated_queries
54
+ )
55
+
56
+ # Prepare chat context
57
+ chat_context = self._prepare_chat_context(chat_history)
58
+
59
+ # Create interaction data
60
+ interaction_data = {
61
+ "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
62
+ "query": {
63
+ "original_question": question,
64
+ "content_type": content_type,
65
+ "generated_queries": generated_queries
66
+ },
67
+ "retrieval": retrieval_stats,
68
+ "response": {
69
+ "answer": answer
70
+ },
71
+ "performance": {
72
+ "processing_time": processing_time,
73
+ "tokens_used": None # TODO: Add token usage if available
74
+ },
75
+ "chat_context": chat_context,
76
+ "system_info": system_info
77
+ }
78
+
79
+ # Read existing logs
80
+ with open(self.log_file, 'r') as f:
81
+ logs = json.load(f)
82
+
83
+ # Add new log
84
+ logs.append(interaction_data)
85
+
86
+ # Write back to file
87
+ with open(self.log_file, 'w') as f:
88
+ json.dump(logs, f, indent=2)
89
+
90
+ except Exception as e:
91
+ print(f"Error logging interaction: {str(e)}")
92
+
93
+ def _prepare_retrieval_stats(self,
94
+ source_documents: List[Any],
95
+ content_type: str,
96
+ generated_queries: List[str]) -> Dict[str, Any]:
97
+ """Prepare retrieval statistics for logging.
98
+
99
+ Args:
100
+ source_documents: Retrieved documents
101
+ content_type: The routing type
102
+ generated_queries: Generated query variations
103
+
104
+ Returns:
105
+ Dictionary with retrieval statistics
106
+ """
107
+ # Count document types
108
+ document_types = {
109
+ "course": 0,
110
+ "program": 0,
111
+ "unknown": 0
112
+ }
113
+
114
+ documents_info = []
115
+ for doc in source_documents:
116
+ doc_type = doc.metadata.get("doc_type", "unknown")
117
+ document_types[doc_type] = document_types.get(doc_type, 0) + 1
118
+
119
+ documents_info.append({
120
+ "content": doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content,
121
+ "metadata": doc.metadata,
122
+ "source": os.path.basename(doc.metadata.get("source", ""))
123
+ })
124
+
125
+ return {
126
+ "total_documents": len(source_documents),
127
+ "documents": documents_info,
128
+ "document_types": document_types,
129
+ "generated_queries": generated_queries,
130
+ "routing_type": content_type
131
+ }
132
+
133
+ def _prepare_chat_context(self, chat_history: List[Any]) -> Dict[str, Any]:
134
+ """Prepare chat context for logging.
135
+
136
+ Args:
137
+ chat_history: Chat memory messages
138
+
139
+ Returns:
140
+ Dictionary with chat context information
141
+ """
142
+ context_messages = []
143
+
144
+ if chat_history:
145
+ # Get last few messages for context
146
+ recent_messages = chat_history[-6:] # Last 6 messages (3 pairs)
147
+
148
+ for msg in recent_messages:
149
+ if hasattr(msg, 'type') and hasattr(msg, 'content'):
150
+ context_messages.append({
151
+ "role": msg.type,
152
+ "content": msg.content[:500] + "..." if len(msg.content) > 500 else msg.content
153
+ })
154
+
155
+ return {
156
+ "chat_history": context_messages,
157
+ "memory_window_size": Config.MEMORY_WINDOW_SIZE,
158
+ "total_messages": len(chat_history) if chat_history else 0
159
+ }
160
+
161
+ def get_recent_interactions(self, limit: int = 10) -> List[Dict[str, Any]]:
162
+ """Get recent chat interactions.
163
+
164
+ Args:
165
+ limit: Maximum number of interactions to return
166
+
167
+ Returns:
168
+ List of recent interactions
169
+ """
170
+ try:
171
+ with open(self.log_file, 'r') as f:
172
+ logs = json.load(f)
173
+
174
+ # Return most recent interactions
175
+ return logs[-limit:] if len(logs) > limit else logs
176
+
177
+ except Exception as e:
178
+ print(f"Error reading recent interactions: {str(e)}")
179
+ return []
180
+
181
+ def get_stats(self) -> Dict[str, Any]:
182
+ """Get statistics about logged interactions.
183
+
184
+ Returns:
185
+ Dictionary with interaction statistics
186
+ """
187
+ try:
188
+ with open(self.log_file, 'r') as f:
189
+ logs = json.load(f)
190
+
191
+ if not logs:
192
+ return {"total_interactions": 0}
193
+
194
+ # Calculate statistics
195
+ total_interactions = len(logs)
196
+ content_types = {}
197
+ avg_processing_time = 0
198
+
199
+ for log in logs:
200
+ # Count content types
201
+ content_type = log.get("query", {}).get("content_type", "unknown")
202
+ content_types[content_type] = content_types.get(content_type, 0) + 1
203
+
204
+ # Sum processing times
205
+ processing_time = log.get("performance", {}).get("processing_time", 0)
206
+ if processing_time:
207
+ avg_processing_time += processing_time
208
+
209
+ # Calculate average processing time
210
+ if total_interactions > 0:
211
+ avg_processing_time = avg_processing_time / total_interactions
212
+
213
+ return {
214
+ "total_interactions": total_interactions,
215
+ "content_type_distribution": content_types,
216
+ "average_processing_time": avg_processing_time,
217
+ "last_interaction": logs[-1].get("timestamp") if logs else None
218
+ }
219
+
220
+ except Exception as e:
221
+ print(f"Error calculating stats: {str(e)}")
222
+ return {"error": str(e)}
223
+
224
+ def clear_logs(self) -> bool:
225
+ """Clear all logged interactions.
226
+
227
+ Returns:
228
+ True if successful, False otherwise
229
+ """
230
+ try:
231
+ with open(self.log_file, 'w') as f:
232
+ json.dump([], f)
233
+ return True
234
+ except Exception as e:
235
+ print(f"Error clearing logs: {str(e)}")
236
+ return False
237
+
238
+ def export_logs(self, output_file: str) -> bool:
239
+ """Export logs to a different file.
240
+
241
+ Args:
242
+ output_file: Path to the output file
243
+
244
+ Returns:
245
+ True if successful, False otherwise
246
+ """
247
+ try:
248
+ with open(self.log_file, 'r') as f:
249
+ logs = json.load(f)
250
+
251
+ with open(output_file, 'w') as f:
252
+ json.dump(logs, f, indent=2)
253
+
254
+ return True
255
+ except Exception as e:
256
+ print(f"Error exporting logs: {str(e)}")
257
+ return False
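
A minimal usage sketch for ChatLogger, assuming it is run from the src/ directory; the question, queries, and system_info values below are illustrative:

from chat_logger import ChatLogger

logger = ChatLogger(log_file="chat_history.json")
logger.log_interaction(
    question="What are the prerequisites for DIT134?",
    answer="...",
    source_documents=[],          # normally the retrieved Document objects
    content_type="course",
    generated_queries=["DIT134 prerequisites", "DIT134 entry requirements"],
    processing_time=1.42,
    chat_history=[],
    system_info={"model": "gpt-4.1-mini"},
)
print(logger.get_stats())
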
src/config.py ADDED
@@ -0,0 +1,221 @@
1
+ import os
2
+ from typing import Dict, Any
3
+ from dotenv import load_dotenv
4
+
5
+ # Load environment variables
6
+ load_dotenv()
7
+
8
+ class Config:
9
+ """Application configuration settings."""
10
+
11
+ # API Keys
12
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
13
+ FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
14
+
15
+ # Model Configuration
16
+ MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4.1-mini-2025-04-14")
17
+ EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")
18
+ TEMPERATURE = float(os.getenv("TEMPERATURE", "0.1"))
19
+ MAX_TOKENS = int(os.getenv("MAX_TOKENS", "2000"))
20
+
21
+ # Database Configuration
22
+ CHROMA_DB_PATH = os.getenv("CHROMA_DB_PATH", "./data/chroma")
23
+ COLLECTION_NAME = os.getenv("COLLECTION_NAME", "course_docs")
24
+
25
+ # Text Splitting Configuration
26
+ CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "2000"))
27
+ CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "200"))
28
+
29
+ # Retrieval Configuration
30
+ RETRIEVAL_K_VALUES = {
31
+ "course": int(os.getenv("RETRIEVAL_K_COURSE", "20")),
32
+ "program": int(os.getenv("RETRIEVAL_K_PROGRAM", "15")),
33
+ "both": int(os.getenv("RETRIEVAL_K_BOTH", "25"))
34
+ }
35
+
36
+ # Embedding Configuration
37
+ EMBEDDING_CHUNK_SIZE = int(os.getenv("EMBEDDING_CHUNK_SIZE", "1000"))
38
+ EMBEDDING_MAX_RETRIES = int(os.getenv("EMBEDDING_MAX_RETRIES", "3"))
39
+ EMBEDDING_REQUEST_TIMEOUT = int(os.getenv("EMBEDDING_REQUEST_TIMEOUT", "60"))
40
+
41
+ # Memory Configuration
42
+ MEMORY_WINDOW_SIZE = int(os.getenv("MEMORY_WINDOW_SIZE", "3"))
43
+
44
+ # Logging Configuration
45
+ LOG_FILE = os.getenv("LOG_FILE", "chat_history.json")
46
+ LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
47
+ DEBUG_MODE = os.getenv("DEBUG_MODE", "false").lower() == "true"
48
+
49
+ # Directory Paths
50
+ DATA_BASE_PATH = os.getenv("DATA_BASE_PATH", "./data")
51
+ COURSES_MD_PATH = os.getenv("COURSES_MD_PATH", "data/courses/md")
52
+ COURSES_PDF_PATH = os.getenv("COURSES_PDF_PATH", "data/courses/pdf")
53
+ PROGRAMS_MD_PATH = os.getenv("PROGRAMS_MD_PATH", "data/programs/md")
54
+ PROGRAMS_PDF_PATH = os.getenv("PROGRAMS_PDF_PATH", "data/programs/pdf")
55
+
56
+ # Interface Configuration
57
+ GRADIO_PORT = int(os.getenv("GRADIO_PORT", "7860"))
58
+ GRADIO_SHARE = os.getenv("GRADIO_SHARE", "false").lower() == "true"
59
+
60
+ # Telemetry Configuration
61
+ LANGCHAIN_TRACING_V2 = os.getenv("LANGCHAIN_TRACING_V2", "false").lower() == "true"
62
+ ANONYMIZED_TELEMETRY = os.getenv("ANONYMIZED_TELEMETRY", "false").lower() == "true"
63
+ POSTHOG_DISABLED = os.getenv("POSTHOG_DISABLED", "true").lower() == "true"
64
+ CHROMA_TELEMETRY_DISABLED = os.getenv("CHROMA_TELEMETRY_DISABLED", "true").lower() == "true"
65
+ DO_NOT_TRACK = os.getenv("DO_NOT_TRACK", "1")
66
+
67
+ class PromptTemplates:
68
+ """Centralized prompt templates."""
69
+
70
+ COURSE_QUERY_TEMPLATE = """You are an AI language model assistant. Your task is to generate five
71
+ different versions of the given user question to retrieve relevant documents about university COURSES.
72
+
73
+ Follow these guidelines:
74
+ 1. Focus on different aspects: content, prerequisites, learning outcomes, examination methods
75
+ 2. Use different phrasings and synonyms
76
+ 3. Include the course code or name if present in the original question
77
+ 4. Make queries both more specific and more general than the original
78
+ 5. Ensure each query is semantically meaningful and complete
79
+
80
+ Original question: {question}
81
+
82
+ Generate 5 different versions, one per line."""
83
+
84
+ PROGRAM_QUERY_TEMPLATE = """You are an AI language model assistant. Your task is to generate five
85
+ different versions of the given user question to retrieve relevant documents about university PROGRAMS.
86
+
87
+ Follow these guidelines:
88
+ 1. Focus on different aspects: program structure, career opportunities, admission requirements, outcomes
89
+ 2. Use different phrasings and synonyms
90
+ 3. Include the program name if present in the original question
91
+ 4. Make queries both more specific and more general than the original
92
+ 5. Consider both overall program information and specific details
93
+
94
+ Original question: {question}
95
+
96
+ Generate 5 different versions, one per line."""
97
+
98
+ GENERAL_QUERY_TEMPLATE = """You are an AI language model assistant. Your task is to generate five
99
+ different versions of the given user question to retrieve relevant documents about both university COURSES and PROGRAMS.
100
+
101
+ Follow these guidelines:
102
+ 1. Balance between course-specific and program-level information
103
+ 2. Include variations that focus on how courses fit into programs
104
+ 3. Use different phrasings and synonyms
105
+ 4. Make queries both more specific and more general than the original
106
+ 5. Maintain the original intent while exploring different aspects
107
+
108
+ Original question: {question}
109
+
110
+ Generate 5 different versions, one per line."""
111
+
112
+ ROUTER_SYSTEM_TEMPLATE = """You are an expert at routing user questions about university education to the appropriate content type.
113
+ Your task is to determine whether the question is about:
114
+ 1. A specific COURSE or course-related information
115
+ 2. A specific PROGRAM or program-related information
116
+ 3. BOTH when the question involves both courses and programs or when it's unclear
117
+
118
+ Examples:
119
+ - "What are the prerequisites for DIT134?" -> course
120
+ - "Tell me about the Software Engineering program" -> program
121
+ - "What courses are included in the Data Science master's?" -> both
122
+ - "How many credits do I need?" -> both"""
123
+
124
+ SYSTEM_TEMPLATE = """You are a helpful course and program information assistant for Gothenburg University.
125
+ Your role is to provide accurate information about courses and programs based ONLY on the provided course and program documents.
126
+
127
+ Important rules to follow:
128
+ 1. Only answer questions about courses that are explicitly mentioned in the provided documents
129
+ 2. If a course is not in the documents, clearly state that you don't have information about that course
130
+ 3. Base your answers solely on the content from the course documents
131
+ 4. If you're unsure about any information, say so explicitly
132
+ 5. When discussing course content, prerequisites, or evaluation methods, quote directly from the source documents when possible
133
+ 6. Include the course code (e.g., DIT123) when referring to courses
134
+ 7. For listing questions (e.g., "What programs are available?", "List all courses in X"):
135
+ - ALWAYS check the sources list first
136
+ - THOROUGHLY examine EACH source document listed in the sources
137
+ - List EVERY program or course mentioned in ANY of the retrieved documents
138
+ - Do not skip any programs even if they seem similar to others
139
+ - Include program/course codes when available
140
+ - Group items logically (e.g., by degree level: Bachelor's, Master's)
141
+ - Double-check the sources list against your response to ensure no programs were missed
142
+ 8. For questions asking about all programs from a specific school/department:
143
+ - List ALL programs from the retrieved documents
144
+ - Cross-reference the sources list with your response to ensure completeness
145
+ - Include full program names and codes
146
+ - Organize by degree level (Bachelor's/Master's)
147
+ - Specify the credit amount if available
148
+ - Before finishing your response, verify that you've included every program from every source listed
149
+
150
+ Context from documents: {context}
151
+
152
+ Current conversation history: {chat_history}
153
+
154
+ Human question: {question}
155
+
156
+ Remember:
157
+ 1. When asked to list programs or courses, THOROUGHLY check all retrieved documents and include EVERY relevant item.
158
+ 2. Do not summarize or skip any programs/courses found in the sources.
159
+ 3. Always cross-reference your final list against the sources to ensure nothing was missed.
160
+ 4. If you see a source in the list that contains "programme" or "program" in its name, make sure that program is included in your response.
161
+
162
+ Please provide a response based strictly on the above context. If the information isn't in the context, say so."""
163
+
164
+ @classmethod
165
+ def get_query_template(cls, content_type: str) -> str:
166
+ """Get the appropriate query template based on content type."""
167
+ templates = {
168
+ "course": cls.COURSE_QUERY_TEMPLATE,
169
+ "program": cls.PROGRAM_QUERY_TEMPLATE,
170
+ "both": cls.GENERAL_QUERY_TEMPLATE
171
+ }
172
+ return templates.get(content_type, cls.GENERAL_QUERY_TEMPLATE)
173
+
174
+ class AppConstants:
175
+ """Application constants."""
176
+
177
+ ROUTING_INFO = {
178
+ "course": "🎓 Course-specific response:",
179
+ "program": "📚 Program-specific response:",
180
+ "both": "🏫 General education response:"
181
+ }
182
+
183
+ EXAMPLE_QUERIES = [
184
+ "What is the Applied Data Science program about?",
185
+ "What are the prerequisites for Applied Machine Learning?",
186
+ "Tell me about courses in the Master's Program in Management.",
187
+ "List all master's programs in the School of Business, Economics and Law.",
188
+ "What programs are available in Computer Science?"
189
+ ]
190
+
191
+ SUPPORTED_FILE_ENCODINGS = ['utf-8', 'iso-8859-1', 'latin1']
192
+ SUPPORTED_FILE_EXTENSIONS = {
193
+ 'markdown': ['.md'],
194
+ 'pdf': ['.pdf']
195
+ }
196
+
197
+ BATCH_SIZE = 50 # For processing documents in batches
198
+
199
+ def setup_telemetry():
200
+ """Set up telemetry environment variables to prevent warnings."""
201
+ # Set LangChain telemetry environment variables
202
+ os.environ["LANGCHAIN_TRACING_V2"] = str(Config.LANGCHAIN_TRACING_V2).lower()
203
+ os.environ["ANONYMIZED_TELEMETRY"] = str(Config.ANONYMIZED_TELEMETRY).lower()
204
+ os.environ["POSTHOG_DISABLED"] = str(Config.POSTHOG_DISABLED).lower()
205
+
206
+ # Set ChromaDB telemetry environment variables
207
+ os.environ["CHROMA_TELEMETRY_DISABLED"] = "true"
208
+ os.environ["CHROMA_TELEMETRY"] = "false"
209
+
210
+ # Additional telemetry controls
211
+ os.environ["DO_NOT_TRACK"] = "1"
212
+
213
+ def validate_config():
214
+ """Validate that required configuration is present."""
215
+ if not Config.OPENAI_API_KEY:
216
+ raise ValueError("OpenAI API key not found in environment variables")
217
+
218
+ # Setup telemetry to prevent warnings
219
+ setup_telemetry()
220
+
221
+ return True
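
A short sketch of how this configuration module is typically consumed; it assumes OPENAI_API_KEY is set (for example via a .env file), since validate_config() raises otherwise:

from config import Config, PromptTemplates, validate_config

validate_config()                           # checks the API key and sets the telemetry variables
print(Config.MODEL_NAME)                    # "gpt-4.1-mini-2025-04-14" unless MODEL_NAME overrides it
print(Config.RETRIEVAL_K_VALUES["course"])  # 20 by default

# Pick the multi-query prompt for a routed content type; unknown types fall back to the general template.
template = PromptTemplates.get_query_template("course")
print(template.format(question="What is DIT134 about?"))
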
src/document_processor.py ADDED
@@ -0,0 +1,349 @@
1
+ import os
2
+ import re
3
+ import time
4
+ from typing import List, Optional, Dict, Any
5
+ from pathlib import Path
6
+
7
+ from langchain_community.document_loaders import PyPDFLoader
8
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
9
+ from langchain_core.documents import Document
10
+
11
+ from config import Config, AppConstants
12
+ from models import DocumentMetadata, ProcessingStats
13
+
14
+ class DocumentProcessor:
15
+ """Handles document loading, processing, and chunking."""
16
+
17
+ def __init__(self, base_path: str = None):
18
+ """Initialize the document processor.
19
+
20
+ Args:
21
+ base_path: Base path for document directories
22
+ """
23
+ self.base_path = base_path or Config.DATA_BASE_PATH
24
+ self.text_splitter = RecursiveCharacterTextSplitter(
25
+ chunk_size=Config.CHUNK_SIZE,
26
+ chunk_overlap=Config.CHUNK_OVERLAP,
27
+ length_function=len,
28
+ separators=["\n\n", "\n", " ", ""]
29
+ )
30
+
31
+ def process_all_documents(self) -> List[Document]:
32
+ """Process both markdown and PDF documents from courses and programs directories.
33
+
34
+ Returns:
35
+ List of processed documents with proper metadata
36
+ """
37
+ start_time = time.time()
38
+
39
+ documents = {
40
+ 'courses': [],
41
+ 'programs': []
42
+ }
43
+
44
+ # Define paths for different document types
45
+ paths = self._get_document_paths()
46
+
47
+ # Create directories if they don't exist
48
+ self._ensure_directories_exist(paths)
49
+
50
+ # Process documents by category
51
+ for category in ['courses', 'programs']:
52
+ # Process markdown files
53
+ md_path = paths[f'{category}_md']
54
+ if os.path.exists(md_path):
55
+ documents[category].extend(self._process_markdown_files(md_path, category))
56
+
57
+ # Process PDF files
58
+ pdf_path = paths[f'{category}_pdf']
59
+ if os.path.exists(pdf_path):
60
+ documents[category].extend(self._process_pdf_files(pdf_path, category))
61
+
62
+ print(f"Processed {len(documents[category])} {category} documents")
63
+
64
+ # Combine all documents while maintaining their metadata
65
+ all_documents = documents['courses'] + documents['programs']
66
+
67
+ # Create processing stats
68
+ processing_time = time.time() - start_time
69
+ stats = ProcessingStats(
70
+ total_documents=len(all_documents),
71
+ courses_processed=len(documents['courses']),
72
+ programs_processed=len(documents['programs']),
73
+ chunks_created=0, # Will be updated after chunking
74
+ processing_time=processing_time
75
+ )
76
+
77
+ print(f"Total documents processed: {len(all_documents)}")
78
+ print(f"Courses: {len(documents['courses'])}, Programs: {len(documents['programs'])}")
79
+ print(f"Processing time: {processing_time:.2f} seconds")
80
+
81
+ return all_documents
82
+
83
+ def chunk_documents(self, documents: List[Document]) -> List[Document]:
84
+ """Split documents into chunks for embedding.
85
+
86
+ Args:
87
+ documents: List of documents to chunk
88
+
89
+ Returns:
90
+ List of document chunks
91
+ """
92
+ print(f"Splitting {len(documents)} documents into chunks...")
93
+ chunks = self.text_splitter.split_documents(documents)
94
+ print(f"Created {len(chunks)} document chunks")
95
+ return chunks
96
+
97
+ def _get_document_paths(self) -> Dict[str, str]:
98
+ """Get paths for different document types.
99
+
100
+ Returns:
101
+ Dictionary with document paths
102
+ """
103
+ return {
104
+ 'courses_md': os.path.join(self.base_path, Config.COURSES_MD_PATH),
105
+ 'courses_pdf': os.path.join(self.base_path, Config.COURSES_PDF_PATH),
106
+ 'programs_md': os.path.join(self.base_path, Config.PROGRAMS_MD_PATH),
107
+ 'programs_pdf': os.path.join(self.base_path, Config.PROGRAMS_PDF_PATH)
108
+ }
109
+
110
+ def _ensure_directories_exist(self, paths: Dict[str, str]) -> None:
111
+ """Ensure all document directories exist.
112
+
113
+ Args:
114
+ paths: Dictionary of paths to create
115
+ """
116
+ for path in paths.values():
117
+ if not os.path.exists(path):
118
+ os.makedirs(path, exist_ok=True)
119
+ print(f"Created directory: {path}")
120
+
121
+ def _process_markdown_files(self, path: str, category: str) -> List[Document]:
122
+ """Process markdown files in a directory.
123
+
124
+ Args:
125
+ path: Path to the markdown files directory
126
+ category: Type of documents ('courses' or 'programs')
127
+
128
+ Returns:
129
+ List of processed markdown documents with metadata
130
+ """
131
+ documents = []
132
+
133
+ if not os.path.exists(path):
134
+ print(f"Warning: Markdown directory {path} does not exist")
135
+ return documents
136
+
137
+ for filename in os.listdir(path):
138
+ if filename.endswith('.md'):
139
+ file_path = os.path.join(path, filename)
140
+ try:
141
+ content = self._read_file_with_fallback_encoding(file_path)
142
+
143
+ # Create metadata
144
+ metadata = {
145
+ 'source': file_path,
146
+ 'type': 'markdown',
147
+ 'category': category,
148
+ 'doc_type': category.rstrip('s'), # 'course' or 'program'
149
+ 'filename': filename
150
+ }
151
+
152
+ # Extract course code if it's a course document
153
+ if category == 'courses':
154
+ code = self._extract_course_code(filename, content)
155
+ if code:
156
+ metadata['course_code'] = code
157
+
158
+ doc = Document(
159
+ page_content=content,
160
+ metadata=metadata
161
+ )
162
+ documents.append(doc)
163
+
164
+ except Exception as e:
165
+ print(f"Error processing markdown file {filename}: {str(e)}")
166
+
167
+ return documents
168
+
169
+ def _process_pdf_files(self, path: str, category: str) -> List[Document]:
170
+ """Process PDF files in a directory.
171
+
172
+ Args:
173
+ path: Path to the PDF files directory
174
+ category: Type of documents ('courses' or 'programs')
175
+
176
+ Returns:
177
+ List of processed PDF documents with metadata
178
+ """
179
+ documents = []
180
+
181
+ if not os.path.exists(path):
182
+ print(f"Warning: PDF directory {path} does not exist")
183
+ return documents
184
+
185
+ for filename in os.listdir(path):
186
+ if filename.endswith('.pdf'):
187
+ file_path = os.path.join(path, filename)
188
+ try:
189
+ loader = PyPDFLoader(file_path)
190
+ pdf_docs = loader.load()
191
+
192
+ # Create base metadata
193
+ metadata = {
194
+ 'type': 'pdf',
195
+ 'category': category,
196
+ 'doc_type': category.rstrip('s'), # 'course' or 'program'
197
+ 'filename': filename
198
+ }
199
+
200
+ # Add course code if it exists and it's a course document
201
+ if category == 'courses' and pdf_docs:
202
+ code = self._extract_course_code(filename, pdf_docs[0].page_content)
203
+ if code:
204
+ metadata['course_code'] = code
205
+
206
+ # Add metadata to each page
207
+ for doc in pdf_docs:
208
+ doc.metadata.update(metadata)
209
+
210
+ documents.extend(pdf_docs)
211
+
212
+ except Exception as e:
213
+ print(f"Error processing PDF {filename}: {str(e)}")
214
+
215
+ return documents
216
+
217
+ def _read_file_with_fallback_encoding(self, file_path: str) -> str:
218
+ """Read a file with fallback encodings.
219
+
220
+ Args:
221
+ file_path: Path to the file to read
222
+
223
+ Returns:
224
+ File content as string
225
+
226
+ Raises:
227
+ ValueError: If the file cannot be read with any supported encoding
228
+ """
229
+ for encoding in AppConstants.SUPPORTED_FILE_ENCODINGS:
230
+ try:
231
+ with open(file_path, 'r', encoding=encoding) as f:
232
+ return f.read()
233
+ except UnicodeDecodeError:
234
+ continue
235
+
236
+ raise ValueError(f"Failed to read {file_path} with any supported encoding")
237
+
238
+ def _extract_course_code(self, filename: str, content: str) -> Optional[str]:
239
+ """Extract course code from filename or content if possible.
240
+
241
+ Args:
242
+ filename: Name of the file
243
+ content: Content of the document
244
+
245
+ Returns:
246
+ Course code if found, None otherwise
247
+ """
248
+ # Try to extract from filename first (e.g., "DIT134-advanced-programming.pdf")
249
+ code_match = re.search(r'([A-Z]{3}\d{3})', filename)
250
+ if code_match:
251
+ return code_match.group(1)
252
+
253
+ # Try to extract from content (first occurrence)
254
+ code_match = re.search(r'([A-Z]{3}\d{3})', content[:1000]) # Search in first 1000 chars
255
+ if code_match:
256
+ return code_match.group(1)
257
+
258
+ return None
259
+
260
+ def get_document_stats(self, documents: List[Document]) -> Dict[str, Any]:
261
+ """Get statistics about processed documents.
262
+
263
+ Args:
264
+ documents: List of processed documents
265
+
266
+ Returns:
267
+ Dictionary with document statistics
268
+ """
269
+ stats = {
270
+ 'total_documents': len(documents),
271
+ 'by_category': {},
272
+ 'by_type': {},
273
+ 'by_doc_type': {},
274
+ 'course_codes': set(),
275
+ 'total_content_length': 0
276
+ }
277
+
278
+ for doc in documents:
279
+ metadata = doc.metadata
280
+
281
+ # Count by category
282
+ category = metadata.get('category', 'unknown')
283
+ stats['by_category'][category] = stats['by_category'].get(category, 0) + 1
284
+
285
+ # Count by file type
286
+ file_type = metadata.get('type', 'unknown')
287
+ stats['by_type'][file_type] = stats['by_type'].get(file_type, 0) + 1
288
+
289
+ # Count by document type
290
+ doc_type = metadata.get('doc_type', 'unknown')
291
+ stats['by_doc_type'][doc_type] = stats['by_doc_type'].get(doc_type, 0) + 1
292
+
293
+ # Collect course codes
294
+ if metadata.get('course_code'):
295
+ stats['course_codes'].add(metadata['course_code'])
296
+
297
+ # Sum content length
298
+ stats['total_content_length'] += len(doc.page_content)
299
+
300
+ # Convert set to list for JSON serialization
301
+ stats['course_codes'] = list(stats['course_codes'])
302
+ stats['unique_course_codes'] = len(stats['course_codes'])
303
+
304
+ return stats
305
+
306
+ def validate_documents(self, documents: List[Document]) -> Dict[str, Any]:
307
+ """Validate processed documents for common issues.
308
+
309
+ Args:
310
+ documents: List of documents to validate
311
+
312
+ Returns:
313
+ Dictionary with validation results
314
+ """
315
+ validation_results = {
316
+ 'total_documents': len(documents),
317
+ 'issues': [],
318
+ 'warnings': [],
319
+ 'valid_documents': 0,
320
+ 'empty_documents': 0,
321
+ 'missing_metadata': 0
322
+ }
323
+
324
+ for i, doc in enumerate(documents):
325
+ # Check for empty content
326
+ if not doc.page_content or len(doc.page_content.strip()) == 0:
327
+ validation_results['empty_documents'] += 1
328
+ validation_results['issues'].append(f"Document {i}: Empty content")
329
+ continue
330
+
331
+ # Check for essential metadata
332
+ required_metadata = ['source', 'type', 'category', 'doc_type', 'filename']
333
+ missing_fields = [field for field in required_metadata if not doc.metadata.get(field)]
334
+
335
+ if missing_fields:
336
+ validation_results['missing_metadata'] += 1
337
+ validation_results['warnings'].append(
338
+ f"Document {i}: Missing metadata fields: {missing_fields}"
339
+ )
340
+
341
+ # Check content length
342
+ if len(doc.page_content) < 50:
343
+ validation_results['warnings'].append(
344
+ f"Document {i}: Very short content ({len(doc.page_content)} chars)"
345
+ )
346
+
347
+ validation_results['valid_documents'] += 1
348
+
349
+ return validation_results
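
A usage sketch for DocumentProcessor, assuming it is run from src/; missing data directories are created automatically, in which case the resulting lists are simply empty:

from document_processor import DocumentProcessor

processor = DocumentProcessor()               # defaults to Config.DATA_BASE_PATH
documents = processor.process_all_documents()
chunks = processor.chunk_documents(documents)

print(processor.get_document_stats(documents)["unique_course_codes"])
print(processor.validate_documents(documents)["warnings"][:5])
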
src/interface.py ADDED
@@ -0,0 +1,208 @@
1
+ import gradio as gr
2
+ from typing import List, Dict, Any
3
+ from rag_service import RAGService
4
+ from config import Config, AppConstants
5
+
6
+ class RAGInterface:
7
+ """Gradio interface for the RAG application."""
8
+
9
+ def __init__(self, rag_service: RAGService):
10
+ """Initialize the interface.
11
+
12
+ Args:
13
+ rag_service: The RAG service instance
14
+ """
15
+ self.rag_service = rag_service
16
+ self.interface = None
17
+
18
+ def process_query(self, message: str, history: List[Dict[str, str]]) -> str:
19
+ """Process a single query in the chat interface.
20
+
21
+ Args:
22
+ message: User's message
23
+ history: Chat history in OpenAI-style format with 'role' and 'content' keys
24
+
25
+ Returns:
26
+ Assistant's response
27
+ """
28
+ try:
29
+ # Query the RAG service
30
+ result = self.rag_service.query(message)
31
+
32
+ # Format response with routing information
33
+ content_type = result.content_type
34
+ answer = result.answer
35
+
36
+ # Add routing indicator
37
+ routing_prefix = AppConstants.ROUTING_INFO.get(content_type, "")
38
+ if routing_prefix:
39
+ return f"{routing_prefix}\n\n{answer}"
40
+ else:
41
+ return answer
42
+
43
+ except Exception as e:
44
+ error_msg = f"❌ Error: {str(e)}"
45
+ print(f"Interface error: {error_msg}")
46
+ return error_msg
47
+
48
+ def get_system_info(self) -> str:
49
+ """Get formatted system information.
50
+
51
+ Returns:
52
+ Formatted system status string
53
+ """
54
+ try:
55
+ status = self.rag_service.get_system_status()
56
+
57
+ # Format the information nicely
58
+ info = f"""
59
+ ### 📊 System Status
60
+
61
+ **Database Status:** {'✅ Initialized' if status['database_initialized'] else '❌ Not Initialized'}
62
+ **Documents Loaded:** {status['documents_loaded']}
63
+ **Model Version:** {status['model_version']}
64
+ **Embedding Model:** {status['embedding_version']}
65
+ **Conversation Length:** {status['conversation_length']} messages
66
+
67
+ ### 🔍 Search Configuration
68
+
69
+ **Course Queries:** Top {Config.RETRIEVAL_K_VALUES['course']} matches
70
+ **Program Queries:** Top {Config.RETRIEVAL_K_VALUES['program']} matches
71
+ **Mixed Queries:** Top {Config.RETRIEVAL_K_VALUES['both']} matches
72
+ **Search Type:** MMR (Maximal Marginal Relevance)
73
+
74
+ ### 📚 Query Types
75
+
76
+ **🎓 Course Queries**
77
+ - Specific course information
78
+ - Prerequisites and requirements
79
+ - Learning outcomes
80
+ - Course content and structure
81
+
82
+ **📚 Program Queries**
83
+ - Program overviews and structure
84
+ - Available programs by department
85
+ - Program requirements and outcomes
86
+ - Career opportunities
87
+
88
+ **🏫 General Queries**
89
+ - Courses within programs
90
+ - Department offerings
91
+ - Combined course/program information
92
+ - Cross-referencing content
93
+ """
94
+ return info.strip()
95
+
96
+ except Exception as e:
97
+ return f"Error getting system info: {str(e)}"
98
+
99
+
100
+
101
+
102
+
103
+ def create_interface(self) -> gr.Blocks:
104
+ """Create and configure the Gradio interface.
105
+
106
+ Returns:
107
+ Configured Gradio Blocks interface
108
+ """
109
+ # Create the interface
110
+ with gr.Blocks(theme=gr.themes.Soft()) as interface:
111
+ gr.Markdown("""
112
+ # GuPT: Gothenburg University Information Assistant
113
+ Ask questions about Gothenburg University's courses and programs.
114
+ """)
115
+
116
+ with gr.Row(equal_height=True):
117
+ # Chat column (2/3 of width)
118
+ with gr.Column(scale=2):
119
+ chat_interface = gr.ChatInterface(
120
+ fn=self.process_query,
121
+ examples=AppConstants.EXAMPLE_QUERIES,
122
+ css="""
123
+ div.message-wrap { height: 600px !important; overflow-y: auto; }
124
+ details { margin-top: 10px; }
125
+ summary { cursor: pointer; color: #2A6BB0; }
126
+ summary:hover { text-decoration: underline; }
127
+ """,
128
+ type="messages"
129
+ )
130
+
131
+ # Info column (1/3 of width)
132
+ with gr.Column(scale=1):
133
+ # Get system status for static display
134
+ status = self.rag_service.get_system_status()
135
+
136
+ gr.Markdown(f"""
137
+ ### Document Collection
138
+ - Documents Loaded: {status['documents_loaded']}
139
+ - Database Status: {'✅ Initialized' if status['database_initialized'] else '❌ Not Ready'}
140
+ - Model: {status['model_version']}
141
+
142
+ ### Search Configuration
143
+ - Using MMR for diverse results
144
+ - Course queries: top {Config.RETRIEVAL_K_VALUES['course']} matches
145
+ - Program queries: top {Config.RETRIEVAL_K_VALUES['program']} matches
146
+ - Mixed queries: top {Config.RETRIEVAL_K_VALUES['both']} matches
147
+
148
+ ### Query Types
149
+
150
+ 🎓 **Course Queries**
151
+ - Specific course information
152
+ - Prerequisites and requirements
153
+ - Learning outcomes
154
+
155
+ 📚 **Program Queries**
156
+ - Program overviews
157
+ - Available programs by department
158
+ - Program requirements
159
+
160
+ 🏫 **General Queries**
161
+ - Courses within programs
162
+ - Department offerings
163
+ - Combined course/program information
164
+ """)
165
+
166
+ self.interface = interface
167
+ return interface
168
+
169
+ def launch(self, **kwargs):
170
+ """Launch the Gradio interface.
171
+
172
+ Args:
173
+ **kwargs: Additional arguments for Gradio launch
174
+ """
175
+ if not self.interface:
176
+ self.create_interface()
177
+
178
+ # Default launch parameters
179
+ launch_params = {
180
+ "share": False,
181
+ "server_name": "0.0.0.0",
182
+ "server_port": 7860,
183
+ "show_error": True,
184
+ "quiet": False
185
+ }
186
+
187
+ # Update with any provided parameters
188
+ launch_params.update(kwargs)
189
+
190
+ print(f"🚀 Launching GuPT interface...")
191
+ print(f"📍 Server: {launch_params['server_name']}:{launch_params['server_port']}")
192
+
193
+ try:
194
+ self.interface.launch(**launch_params)
195
+ except Exception as e:
196
+ print(f"❌ Error launching interface: {str(e)}")
197
+ raise
198
+
199
+ def create_interface(rag_service: RAGService) -> RAGInterface:
200
+ """Factory function to create a RAG interface.
201
+
202
+ Args:
203
+ rag_service: The RAG service instance
204
+
205
+ Returns:
206
+ Configured RAGInterface instance
207
+ """
208
+ return RAGInterface(rag_service)
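
The interface is normally launched from main.py; a minimal standalone sketch, assuming a valid OPENAI_API_KEY and existing document data (load_documents() may create new embeddings and incur API costs):

from rag_service import RAGService
from interface import create_interface

service = RAGService()
service.load_documents()

ui = create_interface(service)
ui.create_interface()
ui.launch(server_port=7860, share=False)
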
src/main.py ADDED
@@ -0,0 +1,250 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ GuPT: Gothenburg University Information Assistant
4
+ Main entry point for the restructured RAG application.
5
+
6
+ This is the modernized version using:
7
+ - LCEL (LangChain Expression Language)
8
+ - Modular architecture
9
+ - Better error handling
10
+ - Enhanced logging
11
+ """
12
+
13
+ import sys
14
+ import time
15
+ import argparse
16
+ from typing import Optional
17
+
18
+ # Local imports
19
+ from config import Config, validate_config
20
+ from rag_service import RAGService
21
+ from interface import create_interface
22
+
23
+ def parse_arguments():
24
+ """Parse command line arguments."""
25
+ parser = argparse.ArgumentParser(
26
+ description="GuPT: Gothenburg University Information Assistant",
27
+ formatter_class=argparse.RawDescriptionHelpFormatter,
28
+ epilog="""
29
+ Examples:
30
+ python main.py # Launch with default settings
31
+ python main.py --no-share # Launch without sharing
32
+ python main.py --port 8080 # Launch on port 8080
33
+ python main.py --rebuild-db # Force rebuild of vector database
34
+ """
35
+ )
36
+
37
+ # Interface options
38
+ parser.add_argument(
39
+ "--share",
40
+ action="store_true",
41
+ default=False,
42
+ help="Share the interface via Gradio public link"
43
+ )
44
+ parser.add_argument(
45
+ "--no-share",
46
+ action="store_true",
47
+ default=False,
48
+ help="Explicitly disable sharing (default)"
49
+ )
50
+ parser.add_argument(
51
+ "--port",
52
+ type=int,
53
+ default=7860,
54
+ help="Port to run the interface on (default: 7860)"
55
+ )
56
+ parser.add_argument(
57
+ "--host",
58
+ type=str,
59
+ default="0.0.0.0",
60
+ help="Host to bind to (default: 0.0.0.0)"
61
+ )
62
+
63
+ # Database options
64
+ parser.add_argument(
65
+ "--rebuild-db",
66
+ action="store_true",
67
+ help="Force rebuild of the vector database"
68
+ )
69
+ parser.add_argument(
70
+ "--db-path",
71
+ type=str,
72
+ default=None,
73
+ help=f"Custom path for vector database (default: {Config.CHROMA_DB_PATH})"
74
+ )
75
+
76
+ # Debug options
77
+ parser.add_argument(
78
+ "--debug",
79
+ action="store_true",
80
+ help="Enable debug mode with verbose output"
81
+ )
82
+ parser.add_argument(
83
+ "--quiet",
84
+ action="store_true",
85
+ help="Suppress non-essential output"
86
+ )
87
+
88
+ return parser.parse_args()
89
+
90
+ def print_banner():
91
+ """Print application banner."""
92
+ banner = """
93
+ ╔══════════════════════════════════════════════════════════════╗
94
+ ║ ║
95
+ ║ 🎓 GuPT - Gothenburg University Information Assistant ║
96
+ ║ ║
97
+ ║ Built with: LangChain + OpenAI + Gradio ║
98
+ ║ ║
99
+ ╚══════════════════════════════════════════════════════════════╝
100
+ """
101
+ print(banner)
102
+
103
+ def check_prerequisites() -> bool:
104
+ """Check if all prerequisites are met.
105
+
106
+ Returns:
107
+ True if all prerequisites are met, False otherwise
108
+ """
109
+ try:
110
+ # Validate configuration
111
+ validate_config()
112
+ print("✅ Configuration validated")
113
+
114
+ # Check if required directories exist
115
+ import os
116
+ data_dirs = [
117
+ Config.COURSES_MD_PATH,
118
+ Config.COURSES_PDF_PATH,
119
+ Config.PROGRAMS_MD_PATH,
120
+ Config.PROGRAMS_PDF_PATH
121
+ ]
122
+
123
+ missing_dirs = []
124
+ for dir_path in data_dirs:
125
+ if not os.path.exists(dir_path):
126
+ missing_dirs.append(dir_path)
127
+
128
+ if missing_dirs:
129
+ print("⚠️ Warning: Some data directories are missing:")
130
+ for dir_path in missing_dirs:
131
+ print(f" - {dir_path}")
132
+ print(" The system will create them automatically if needed.")
133
+
134
+ print("✅ Prerequisites check completed")
135
+ return True
136
+
137
+ except Exception as e:
138
+ print(f"❌ Prerequisites check failed: {str(e)}")
139
+ return False
140
+
141
+ def initialize_rag_service(args) -> Optional[RAGService]:
142
+ """Initialize the RAG service.
143
+
144
+ Args:
145
+ args: Parsed command line arguments
146
+
147
+ Returns:
148
+ Initialized RAG service or None if failed
149
+ """
150
+ try:
151
+ print("🔧 Initializing RAG service...")
152
+
153
+ # Create RAG service
154
+ rag_service = RAGService()
155
+
156
+ print("📚 Loading documents and vector store...")
157
+ start_time = time.time()
158
+
159
+ # Handle database rebuild
160
+ if args.rebuild_db:
161
+ print("🔄 Rebuilding vector database...")
162
+ import shutil
163
+ import os
164
+ if os.path.exists(Config.CHROMA_DB_PATH):
165
+ shutil.rmtree(Config.CHROMA_DB_PATH)
166
+ print(f" Removed existing database at {Config.CHROMA_DB_PATH}")
167
+
168
+ # Load documents
169
+ num_chunks = rag_service.load_documents()
170
+ load_time = time.time() - start_time
171
+
172
+ print(f"✅ RAG service initialized successfully!")
173
+ print(f" 📊 Processed {num_chunks} document chunks")
174
+ print(f" ⏱️ Loading time: {load_time:.2f} seconds")
175
+
176
+ return rag_service
177
+
178
+ except Exception as e:
179
+ print(f"❌ Failed to initialize RAG service: {str(e)}")
180
+ return None
181
+
182
+ def main():
183
+ """Main entry point."""
184
+ # Parse arguments
185
+ args = parse_arguments()
186
+
187
+ # Set up quiet mode
188
+ if args.quiet:
189
+ import os
190
+ # Redirect stdout to devnull for quiet mode
191
+ # We'll still print important messages to stderr
192
+ pass
193
+
194
+ # Print banner unless in quiet mode
195
+ if not args.quiet:
196
+ print_banner()
197
+
198
+ try:
199
+ # Check prerequisites
200
+ if not check_prerequisites():
201
+ print("❌ Prerequisites check failed. Please fix the issues and try again.")
202
+ sys.exit(1)
203
+
204
+ # Initialize RAG service
205
+ rag_service = initialize_rag_service(args)
206
+ if not rag_service:
207
+ print("❌ Failed to initialize RAG service. Exiting.")
208
+ sys.exit(1)
209
+
210
+ # Create and launch interface
211
+ print("🚀 Creating Gradio interface...")
212
+ interface_wrapper = create_interface(rag_service)
213
+
214
+ # Determine share setting
215
+ share = args.share and not args.no_share
216
+
217
+ # Launch parameters
218
+ launch_params = {
219
+ "share": share,
220
+ "server_name": args.host,
221
+ "server_port": args.port,
222
+ "show_error": True,
223
+ "quiet": args.quiet
224
+ }
225
+
226
+ print(f"🌐 Launching interface...")
227
+ if not args.quiet:
228
+ print(f" 📍 Local URL: http://{args.host}:{args.port}")
229
+ if share:
230
+ print(f" 🌍 Public sharing: Enabled")
231
+ else:
232
+ print(f" 🔒 Public sharing: Disabled")
233
+
234
+ # Launch the interface
235
+ interface_wrapper.create_interface()
236
+ interface_wrapper.launch(**launch_params)
237
+
238
+ except KeyboardInterrupt:
239
+ print("\n👋 Shutting down gracefully...")
240
+ sys.exit(0)
241
+
242
+ except Exception as e:
243
+ print(f"❌ Unexpected error: {str(e)}")
244
+ if args.debug:
245
+ import traceback
246
+ traceback.print_exc()
247
+ sys.exit(1)
248
+
249
+ if __name__ == "__main__":
250
+ main()
src/models.py ADDED
@@ -0,0 +1,106 @@
1
+ from pydantic import BaseModel, Field
2
+ from typing import List, Dict, Literal, Optional, Any
3
+ from dataclasses import dataclass, asdict
4
+ from datetime import datetime
5
+
6
+ class RouteQuery(BaseModel):
7
+ """Route a user query to the most relevant content type."""
8
+ content_type: Literal["course", "program", "both"] = Field(
9
+ ...,
10
+ description="Route to: 'course' for specific course questions, 'program' for program questions, 'both' when the question involves both or is unclear"
11
+ )
12
+
13
+ @dataclass
14
+ class DocumentMetadata:
15
+ """Metadata for processed documents."""
16
+ source: str
17
+ type: str # 'markdown' or 'pdf'
18
+ category: str # 'courses' or 'programs'
19
+ doc_type: str # 'course' or 'program'
20
+ filename: str
21
+ course_code: Optional[str] = None
22
+
23
+ @dataclass
24
+ class QueryResult:
25
+ """Result of a RAG query."""
26
+ answer: str
27
+ source_documents: List[Any] # List of Document objects
28
+ content_type: str
29
+ processing_time: Optional[float] = None
30
+ generated_queries: Optional[List[str]] = None
31
+ retrieval_stats: Optional[Dict[str, Any]] = None
32
+
33
+ @dataclass
34
+ class ChatInteraction:
35
+ """Single chat interaction for logging."""
36
+ timestamp: str
37
+ query: Dict[str, Any]
38
+ retrieval: Dict[str, Any]
39
+ response: Dict[str, str]
40
+ performance: Dict[str, Any]
41
+ chat_context: Dict[str, Any]
42
+ system_info: Dict[str, Any]
43
+
44
+ @dataclass
45
+ class RetrievalStats:
46
+ """Statistics about document retrieval."""
47
+ total_documents: int
48
+ document_types: Dict[str, int]
49
+ search_config: Dict[str, Any]
50
+ queries_used: List[str]
51
+
52
+ class EmbeddingConfig(BaseModel):
53
+ """Configuration for embeddings."""
54
+ model: str = "text-embedding-3-small"
55
+ chunk_size: int = 1000
56
+ max_retries: int = 3
57
+ request_timeout: int = 60
58
+
59
+ class ModelConfig(BaseModel):
60
+ """Configuration for LLM models."""
61
+ model_name: str = "gpt-4o-mini"
62
+ temperature: float = 0.1
63
+ max_tokens: Optional[int] = None
64
+
65
+ class VectorStoreConfig(BaseModel):
66
+ """Configuration for vector store."""
67
+ persist_directory: str = "./data/chroma"
68
+ collection_name: str = "course_docs"
69
+ collection_metadata: Dict[str, str] = Field(default_factory=lambda: {"hnsw:space": "cosine"})
70
+
71
+ class RetrievalConfig(BaseModel):
72
+ """Configuration for retrieval."""
73
+ search_type: str = "mmr"
74
+ k_values: Dict[str, int] = Field(default_factory=lambda: {
75
+ "course": 6,
76
+ "program": 15,
77
+ "both": 15
78
+ })
79
+ fetch_k_multiplier: int = 3
80
+
81
+ @dataclass
82
+ class ProcessingStats:
83
+ """Statistics about document processing."""
84
+ total_documents: int
85
+ courses_processed: int
86
+ programs_processed: int
87
+ chunks_created: int
88
+ processing_time: float
89
+
90
+ def to_dict(self) -> Dict[str, Any]:
91
+ """Convert to dictionary."""
92
+ return asdict(self)
93
+
94
+ class ChatMemoryMessage(BaseModel):
95
+ """Message in chat memory."""
96
+ role: str
97
+ content: str
98
+ timestamp: Optional[str] = None
99
+
100
+ class SystemStatus(BaseModel):
101
+ """System status information."""
102
+ database_initialized: bool = False
103
+ documents_loaded: int = 0
104
+ model_version: str = ""
105
+ embedding_version: str = ""
106
+ last_updated: Optional[str] = None
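
These models are plain containers; a brief sketch of how a few of them are instantiated, with illustrative values:

from models import RouteQuery, QueryResult, ProcessingStats

route = RouteQuery(content_type="course")
result = QueryResult(answer="...", source_documents=[], content_type=route.content_type)
stats = ProcessingStats(total_documents=10, courses_processed=8,
                        programs_processed=2, chunks_created=120, processing_time=3.5)
print(stats.to_dict())
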
src/rag_service.py ADDED
@@ -0,0 +1,569 @@
+ import os
+ import time
+ from typing import List, Dict, Any, Optional
+
+ # LangChain imports using modern patterns
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
+ from langchain_chroma import Chroma
+ from langchain_core.documents import Document
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.runnables import RunnablePassthrough, RunnableParallel
+ from langchain_core.messages import HumanMessage, AIMessage
+
+ # Local imports
+ from config import Config, PromptTemplates, validate_config
+ from models import RouteQuery, QueryResult, RetrievalStats
+ from document_processor import DocumentProcessor
+ from chat_logger import ChatLogger
+
+ class RAGService:
+     """Modern RAG service using LangChain Expression Language (LCEL)."""
+
+     def __init__(self, base_path: str = None):
+         """Initialize the RAG service.
+
+         Args:
+             base_path: Base path for documents and vector store
+         """
+         # Validate configuration
+         validate_config()
+
+         self.base_path = base_path or Config.DATA_BASE_PATH
+         self.chat_logger = ChatLogger()
+         self.conversation_memory = []  # Simple in-memory conversation storage
+
+         # Initialize components
+         self._initialize_models()
+         self._initialize_vector_store()
+         self._setup_chains()
+
+         # Track last generated queries for logging
+         self.last_generated_queries = []
+
+     def _initialize_models(self):
+         """Initialize LLM and embedding models."""
+         print("Initializing AI models...")
+
+         # Initialize LLM
+         self.llm = ChatOpenAI(
+             model=Config.MODEL_NAME,
+             temperature=Config.TEMPERATURE,
+             api_key=Config.OPENAI_API_KEY
+         )
+
+         # Initialize embeddings with better error handling
+         self.embeddings = OpenAIEmbeddings(
+             api_key=Config.OPENAI_API_KEY,
+             model=Config.EMBEDDING_MODEL,
+             chunk_size=Config.EMBEDDING_CHUNK_SIZE,
+             max_retries=Config.EMBEDDING_MAX_RETRIES,
+             request_timeout=Config.EMBEDDING_REQUEST_TIMEOUT
+         )
+
+         print("✅ AI models initialized successfully")
+
+     def _initialize_vector_store(self):
+         """Initialize the vector store (empty initially)."""
+         self.vector_store = None
+         print("Vector store placeholder initialized")
+
+     def _setup_chains(self):
+         """Set up all the LCEL chains."""
+         print("Setting up LangChain LCEL chains...")
+
+         # Router chain
+         router_prompt = ChatPromptTemplate.from_messages([
+             ("system", PromptTemplates.ROUTER_SYSTEM_TEMPLATE),
+             ("human", "{question}")
+         ])
+         self.router_chain = router_prompt | self.llm.with_structured_output(RouteQuery)
+
+         # Query generation chains for different content types
+         self.query_generation_chains = {}
+         for content_type in ["course", "program", "both"]:
+             template = PromptTemplates.get_query_template(content_type)
+             prompt = ChatPromptTemplate.from_template(template)
+             self.query_generation_chains[content_type] = prompt | self.llm | StrOutputParser()
+
+         # Main QA chain
+         qa_prompt = ChatPromptTemplate.from_messages([
+             ("system", PromptTemplates.SYSTEM_TEMPLATE),
+             ("human", "{question}")
+         ])
+
+         # This will be completed when vector store is loaded
+         self.qa_chain = None
+
+         print("✅ LCEL chains set up successfully")
+
+     def load_documents(self) -> int:
+         """Load and process documents, create or load vector store.
+
+         Returns:
+             Number of document chunks processed
+         """
+         try:
+             print(f"Checking for existing database at: {Config.CHROMA_DB_PATH}")
+
+             if os.path.exists(Config.CHROMA_DB_PATH) and os.listdir(Config.CHROMA_DB_PATH):
+                 print("Existing database found, attempting to load...")
+                 count = self._load_existing_database()
+                 if count == 0:
+                     print("⚠️ Existing database is empty, rebuilding...")
+                     return self._create_new_database()
+                 return count
+             else:
+                 print("No existing database found, creating new one...")
+                 return self._create_new_database()
+
+         except Exception as e:
+             print(f"Error loading documents: {str(e)}")
+             raise
+
+     def _load_existing_database(self) -> int:
+         """Load existing vector database.
+
+         Returns:
+             Number of documents in the database
+         """
+         print("Loading existing embeddings from Chroma database...")
+
+         try:
+             self.vector_store = Chroma(
+                 persist_directory=Config.CHROMA_DB_PATH,
+                 embedding_function=self.embeddings,
+                 collection_metadata={"hnsw:space": "cosine"},
+                 collection_name=Config.COLLECTION_NAME
+             )
+
+             # Get collection size
+             collection_data = self.vector_store.get()
+             collection_size = len(collection_data['ids'])
+
+             if collection_size == 0:
+                 print("Database exists but is empty")
+                 return 0
+
+             print(f"✅ Loaded {collection_size} existing document chunks from database")
+             self._setup_qa_chain()
+             return collection_size
+
+         except Exception as e:
+             print(f"Error loading existing database: {str(e)}")
+             return 0
+
+     def _create_new_database(self) -> int:
+         """Create new vector database from documents.
+
+         Returns:
+             Number of document chunks processed
+         """
+         print("Creating new embeddings (this will incur OpenAI API costs)...")
+
+         # Process documents
+         processor = DocumentProcessor(self.base_path)
+         documents = processor.process_all_documents()
+
+         if not documents:
+             raise ValueError("No documents found to process")
+
+         # Chunk documents
+         chunks = processor.chunk_documents(documents)
+
+         # Initialize empty vector store
+         self.vector_store = Chroma(
+             embedding_function=self.embeddings,
+             persist_directory=Config.CHROMA_DB_PATH,
+             collection_metadata={"hnsw:space": "cosine"},
+             collection_name=Config.COLLECTION_NAME
+         )
+
+         # Process documents in batches to avoid token limits
+         total_processed = self._process_documents_in_batches(chunks)
+
+         print(f"✅ Database creation completed! Processed {total_processed} documents.")
+         self._setup_qa_chain()
+         return total_processed
+
+     def _process_documents_in_batches(self, chunks: List[Document]) -> int:
+         """Process documents in batches to avoid API limits.
+
+         Args:
+             chunks: List of document chunks to process
+
+         Returns:
+             Number of successfully processed chunks
+         """
+         batch_size = Config.BATCH_SIZE
+         total_processed = 0
+
+         print(f"Processing {len(chunks)} document chunks in batches of {batch_size}...")
+
+         for i in range(0, len(chunks), batch_size):
+             batch = chunks[i:i + batch_size]
+             batch_num = i // batch_size + 1
+             total_batches = (len(chunks) + batch_size - 1) // batch_size
+
+             print(f"Processing batch {batch_num}/{total_batches} ({len(batch)} documents)")
+
+             try:
+                 self.vector_store.add_documents(batch)
+                 total_processed += len(batch)
+                 print(f"✅ Successfully processed {len(batch)} documents (Total: {total_processed})")
+
+                 # Small delay to be nice to the API
+                 time.sleep(1)
+
+             except Exception as e:
+                 print(f"❌ Error processing batch {batch_num}: {str(e)}")
+                 # Continue with next batch instead of failing completely
+                 continue
+
+         return total_processed
+
+     def _setup_qa_chain(self):
+         """Set up the main QA chain with retriever."""
+         if not self.vector_store:
+             raise ValueError("Vector store not initialized")
+
+         # Create the main QA chain using LCEL
+         qa_prompt = ChatPromptTemplate.from_messages([
+             ("system", PromptTemplates.SYSTEM_TEMPLATE),
+             ("human", "{question}")
+         ])
+
+         def format_docs(docs):
+             """Format retrieved documents for the prompt."""
+             return "\n\n".join([d.page_content for d in docs])
+
+         def format_chat_history(memory):
+             """Format chat history for the prompt."""
+             if not memory:
+                 return "No previous conversation."
+
+             formatted = []
+             for msg in memory[-6:]:  # Last 6 messages (3 exchanges)
+                 if isinstance(msg, dict):
+                     role = msg.get('role', 'unknown')
+                     content = msg.get('content', '')
+                 elif hasattr(msg, 'type') and hasattr(msg, 'content'):
+                     role = msg.type
+                     content = msg.content
+                 else:
+                     continue
+                 formatted.append(f"{role}: {content}")
+
+             return "\n".join(formatted)
+
+         # Create retriever (will be configured per query)
+         self.base_retriever = self.vector_store.as_retriever()
+
+         # The QA chain will be constructed per query with specific retriever config
+         self.qa_prompt = qa_prompt
+         self.format_docs = format_docs
+         self.format_chat_history = format_chat_history
+
+         print("✅ QA chain set up successfully")
+
+     def route_query(self, question: str) -> str:
+         """Route the query to determine content type.
+
+         Args:
+             question: User's question
+
+         Returns:
+             Content type: 'course', 'program', or 'both'
+         """
+         try:
+             result = self.router_chain.invoke({"question": question})
+             return result.content_type
+         except Exception as e:
+             print(f"Error in query routing: {str(e)}")
+             return "both"  # Default to both if routing fails
+
+     def generate_query_variations(self, question: str, content_type: str) -> List[str]:
+         """Generate multiple query variations for better retrieval.
+
+         Args:
+             question: Original question
+             content_type: Content type from routing
+
+         Returns:
+             List of query variations
+         """
+         try:
+             chain = self.query_generation_chains[content_type]
+             variations = chain.invoke({"question": question})
+
+             # Process and clean the variations
+             queries = [q.strip() for q in variations.split('\n') if q.strip()]
+
+             # Always include the original question
+             if question not in queries:
+                 queries.append(question)
+
+             # Store for logging
+             self.last_generated_queries = queries
+
+             return queries
+
+         except Exception as e:
+             print(f"Error generating query variations: {str(e)}")
+             # Fallback to original question
+             self.last_generated_queries = [question]
+             return [question]
+
+     def retrieve_documents(self, question: str, content_type: str) -> List[Document]:
+         """Retrieve relevant documents using multiple query variations.
+
+         Args:
+             question: Original question
+             content_type: Content type from routing
+
+         Returns:
+             List of relevant documents
+         """
+         if not self.vector_store:
+             raise ValueError("Vector store not initialized. Please load documents first.")
+
+         # Generate query variations
+         queries = self.generate_query_variations(question, content_type)
+
+         print(f"\nGenerated queries for '{question}':")
+         for q in queries:
+             print(f" • {q}")
+
+         # Configure retriever based on content type
+         k = Config.RETRIEVAL_K_VALUES[content_type]
+
+         # Create metadata filter if needed
+         search_kwargs = {
+             "k": k,
+             "fetch_k": k * 3  # Fetch more candidates for MMR
+         }
+
+         if content_type != "both":
+             search_kwargs["filter"] = {"doc_type": content_type}
+
+         # Configure retriever
+         retriever = self.vector_store.as_retriever(
+             search_type="mmr",
+             search_kwargs=search_kwargs
+         )
+
+         # Retrieve documents for each query variation
+         all_docs = []
+         for query in queries:
+             try:
+                 docs = retriever.invoke(query)
+                 all_docs.extend(docs)
+             except Exception as e:
+                 print(f"Error retrieving for query '{query}': {str(e)}")
+                 continue
+
+         # Remove duplicates while preserving order
+         unique_docs = []
+         seen_content = set()
+
+         for doc in all_docs:
+             # Create a unique identifier from content and source
+             doc_id = f"{doc.page_content[:100]}_{doc.metadata.get('source', '')}"
+             if doc_id not in seen_content:
+                 seen_content.add(doc_id)
+                 unique_docs.append(doc)
+
+         # Log retrieval statistics
+         doc_types = [doc.metadata.get('doc_type', 'unknown') for doc in unique_docs]
+         print(f"\nRetrieved {len(unique_docs)} unique documents:")
+         print(f" • Courses: {doc_types.count('course')}")
+         print(f" • Programs: {doc_types.count('program')}")
+
+         return unique_docs
+
+     def query(self, question: str) -> QueryResult:
+         """Process a user query and return response.
+
+         Args:
+             question: User's question
+
+         Returns:
+             QueryResult with answer and metadata
+         """
+         if not self.vector_store:
+             raise ValueError("Model not initialized. Please load documents first.")
+
+         start_time = time.time()
+
+         try:
+             # Route the query
+             content_type = self.route_query(question)
+             print(f"Query routed as: {content_type}")
+
+             # Retrieve relevant documents
+             docs = self.retrieve_documents(question, content_type)
+
+             # Format context and chat history
+             context = self.format_docs(docs)
+             chat_history = self.format_chat_history(self.conversation_memory)
+
+             # Generate answer using LCEL
+             chain = self.qa_prompt | self.llm | StrOutputParser()
+             answer = chain.invoke({
+                 "context": context,
+                 "question": question,
+                 "chat_history": chat_history
+             })
+
+             # Update conversation memory
+             self.conversation_memory.extend([
+                 {"role": "human", "content": question},
+                 {"role": "assistant", "content": answer}
+             ])
+
+             # Keep memory within reasonable size
+             if len(self.conversation_memory) > 12:  # Keep last 6 exchanges
+                 self.conversation_memory = self.conversation_memory[-12:]
+
+             # Format sources
+             sources = self._format_sources(docs)
+             if sources:
+                 answer += sources
+
+             # Calculate processing time
+             processing_time = time.time() - start_time
+
+             # Create result
+             result = QueryResult(
+                 answer=answer,
+                 source_documents=docs,
+                 content_type=content_type,
+                 processing_time=processing_time,
+                 generated_queries=self.last_generated_queries
+             )
+
+             # Log the interaction
+             self._log_interaction(question, result)
+
+             return result
+
+         except Exception as e:
+             error_msg = f"Error processing query: {str(e)}"
+             print(error_msg)
+             return QueryResult(
+                 answer=error_msg,
+                 source_documents=[],
+                 content_type="error",
+                 processing_time=time.time() - start_time
+             )
+
+     def _format_sources(self, docs: List[Document]) -> str:
+         """Format source documents for display.
+
+         Args:
+             docs: Retrieved documents
+
+         Returns:
+             Formatted sources string
+         """
+         if not docs:
+             return ""
+
+         # Get unique sources
+         sources = list(set(
+             os.path.basename(doc.metadata.get("source", ""))
+             for doc in docs if doc.metadata.get("source")
+         ))
+         sources = sorted(sources)
+
+         if not sources:
+             return ""
+
+         sources_text = ""
+         if len(sources) > 2:
+             # Show only first 2 sources with expandable section for more
+             visible_sources = sources[:2]
+             hidden_sources = sources[2:]
+             sources_text += "\n\nSources:"
+             for source in visible_sources:
+                 sources_text += f"\n• {source}"
+             sources_text += f"\n<details><summary>**See {len(hidden_sources)} more sources...**</summary>\n"
+             for source in hidden_sources:
+                 sources_text += f"\n• {source}"
+             sources_text += "\n</details>"
+         else:
+             # If 2 or fewer sources, show all
+             sources_text += "\n\nSources:"
+             for source in sources:
+                 sources_text += f"\n• {source}"
+
+         return sources_text
+
+     def _log_interaction(self, question: str, result: QueryResult):
+         """Log the interaction for analysis.
+
+         Args:
+             question: User's question
+             result: Query result
+         """
+         try:
+             system_info = {
+                 "model_version": Config.MODEL_NAME,
+                 "embedding_version": Config.EMBEDDING_MODEL,
+                 "search_config": {
+                     "search_type": "mmr",
+                     "k_value": Config.RETRIEVAL_K_VALUES.get(result.content_type),
+                     "content_type": result.content_type
+                 }
+             }
+
+             self.chat_logger.log_interaction(
+                 question=question,
+                 answer=result.answer,
+                 source_documents=result.source_documents,
+                 content_type=result.content_type,
+                 generated_queries=result.generated_queries or [],
+                 processing_time=result.processing_time or 0,
+                 chat_history=self.conversation_memory,
+                 system_info=system_info
+             )
+         except Exception as e:
+             print(f"Error logging interaction: {str(e)}")
+
+     def get_system_status(self) -> Dict[str, Any]:
+         """Get current system status.
+
+         Returns:
+             Dictionary with system status information
+         """
+         status = {
+             "database_initialized": self.vector_store is not None,
+             "model_version": Config.MODEL_NAME,
+             "embedding_version": Config.EMBEDDING_MODEL,
+             "conversation_length": len(self.conversation_memory),
+             "last_queries": self.last_generated_queries
+         }
+
+         if self.vector_store:
+             try:
+                 collection_data = self.vector_store.get()
+                 status["documents_loaded"] = len(collection_data['ids'])
+             except Exception:  # avoid a bare except so interrupts still propagate
+                 status["documents_loaded"] = "unknown"
+         else:
+             status["documents_loaded"] = 0
+
+         return status
+
+     def clear_conversation_memory(self):
+         """Clear the conversation memory."""
+         self.conversation_memory = []
+         print("Conversation memory cleared")
+
+     def get_conversation_history(self) -> List[Dict[str, str]]:
+         """Get the current conversation history.
+
+         Returns:
+             List of conversation messages
+         """
+         return self.conversation_memory.copy()
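
As a rough usage sketch of the class added above (not part of the committed file): the module name `rag_service` and the example question below are assumptions, and the `Config` values plus `OPENAI_API_KEY` must already be provided as `config.py` expects.

# Hypothetical driver script, assuming the file above is saved as rag_service.py
from rag_service import RAGService

service = RAGService()             # validates config, builds models and LCEL chains
chunks = service.load_documents()  # loads the existing Chroma store or embeds documents
print(f"Vector store ready with {chunks} chunks")

result = service.query("Which courses cover machine learning?")
print(result.answer)               # answer text with the Sources section appended
print(result.content_type, result.processing_time)

service.clear_conversation_memory()  # reset the in-memory chat history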