faerazo committed
Commit 8629355 · verified · 1 Parent(s): 50af053

Initial commit to HFS

.dockerignore ADDED
@@ -0,0 +1,72 @@
+ # Git
+ .git
+ .gitignore
+ .gitattributes
+
+ # Python
+ __pycache__
+ *.pyc
+ *.pyo
+ *.pyd
+ .Python
+ *.so
+ .coverage
+ .coverage.*
+ coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.log
+ .pytest_cache/
+ .hypothesis/
+
+ # Virtual environments
+ venv/
+ env/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # OS
+ .DS_Store
+ .DS_Store?
+ ._*
+ .Spotlight-V100
+ .Trashes
+ ehthumbs.db
+ Thumbs.db
+
+ # Environment files
+ .env
+ .env.local
+ .env.*.local
+
+ # Node modules (if any)
+ node_modules/
+
+ # Documentation
+ README.md
+ docs/
+ *.md
+ !README.md
+
+ # Temporary files
+ *.tmp
+ *.temp
+
+ # Build artifacts
+ build/
+ dist/
+ *.egg-info/
+
+ # Other
+ chat_history.json
+ *.log
+ *.pid
Dockerfile ADDED
@@ -0,0 +1,46 @@
+ # GuPT - Gothenburg University RAG System
+ # Optimized Docker build for Hugging Face Spaces
+ # Uses environment.yml for dependencies
+
+ FROM python:3.11-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y \
+     gcc \
+     g++ \
+     curl \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy environment file and install dependencies
+ COPY environment.yml .
+ # Extract pip dependencies from environment.yml and install them
+ RUN grep -A 100 "pip:" environment.yml | grep " -" | sed 's/ - //' > requirements.txt && \
+     pip install --no-cache-dir -r requirements.txt
+
+ # Copy application code
+ COPY . .
+
+ # Create non-root user for security (required by Hugging Face Spaces)
+ RUN useradd --create-home --shell /bin/bash --uid 1000 user
+ RUN chown -R user:user /app
+ USER user
+
+ # Set environment variables
+ ENV PYTHONPATH=/app
+ ENV PYTHONDONTWRITEBYTECODE=1
+ ENV PYTHONUNBUFFERED=1
+ ENV GRADIO_SERVER_NAME=0.0.0.0
+ ENV GRADIO_SERVER_PORT=7860
+
+ # Expose port 7860 (required by Hugging Face Spaces)
+ EXPOSE 7860
+
+ # Health check
+ HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
+     CMD curl -f http://localhost:7860/ || exit 1
+
+ # Command to run the application
+ CMD ["python", "src/main.py", "--host", "0.0.0.0", "--port", "7860"]
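
The RUN step above derives requirements.txt from environment.yml at build time. For illustration only, a rough Python equivalent of that extraction (a sketch, not part of the build; it assumes PyYAML is installed, which the project itself does not list as a dependency):

import yaml  # assumption: PyYAML is available in the environment running this sketch

with open("environment.yml") as f:
    env = yaml.safe_load(f)

# The pip requirements live in a nested dict inside the "dependencies" list.
pip_deps = []
for dep in env.get("dependencies", []):
    if isinstance(dep, dict) and "pip" in dep:
        pip_deps.extend(dep["pip"])

with open("requirements.txt", "w") as f:
    f.write("\n".join(pip_deps) + "\n")
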
environment.yml ADDED
@@ -0,0 +1,32 @@
+ name: gupt
+ channels:
+   - conda-forge
+   - defaults
+ dependencies:
+   - python=3.11
+   - pip
+   - jupyter
+   - pip:
+     - langchain==0.3.26
+     - langchain-openai==0.3.27
+     - langchain-community==0.3.27
+     - langchain-core==0.3.68
+     - langchain-chroma==0.1.4
+     - langchain-text-splitters==0.3.8
+     - openai==1.95.1
+     - chromadb==0.5.23
+     - gradio==5.22.0
+     - python-dotenv==1.1.1
+     - numpy==1.26.4
+     - pandas==2.2.3
+     - rouge-score==0.1.2
+     - sentence-transformers==3.3.0
+     - bert-score==0.3.13
+     - scikit-learn==1.5.2
+     - typing-extensions==4.12.2
+     - pydantic==2.11.7
+     - pypdf==5.1.0
+     - requests==2.32.3
+     - urllib3==2.2.3
+     - charset-normalizer==3.4.0
+     - posthog==3.7.2
requirements.txt ADDED
@@ -0,0 +1,23 @@
+ langchain==0.3.26
+ langchain-openai==0.3.27
+ langchain-community==0.3.27
+ langchain-core==0.3.68
+ langchain-chroma==0.1.4
+ langchain-text-splitters==0.3.8
+ openai==1.95.1
+ chromadb==0.5.23
+ gradio==5.22.0
+ python-dotenv==1.1.1
+ numpy==1.26.4
+ pandas==2.2.3
+ rouge-score==0.1.2
+ sentence-transformers==3.3.0
+ bert-score==0.3.13
+ scikit-learn==1.5.2
+ typing-extensions==4.12.2
+ pydantic==2.11.7
+ pypdf==5.1.0
+ requests==2.32.3
+ urllib3==2.2.3
+ charset-normalizer==3.4.0
+ posthog==3.7.2
src/chat_logger.py ADDED
@@ -0,0 +1,257 @@
1
+ import os
2
+ import json
3
+ import time
4
+ from datetime import datetime
5
+ from typing import List, Dict, Any, Optional
6
+ from dataclasses import asdict
7
+
8
+ from models import ChatInteraction, RetrievalStats
9
+ from config import Config
10
+
11
+ class ChatLogger:
12
+ """Handles logging of chat interactions with enhanced metadata."""
13
+
14
+ def __init__(self, log_file: str = None):
15
+ """Initialize the chat logger.
16
+
17
+ Args:
18
+ log_file: Path to the log file. If None, uses config default.
19
+ """
20
+ self.log_file = log_file or Config.LOG_FILE
21
+ self._initialize_log_file()
22
+
23
+ def _initialize_log_file(self):
24
+ """Create log file if it doesn't exist."""
25
+ if not os.path.exists(self.log_file):
26
+ with open(self.log_file, 'w') as f:
27
+ json.dump([], f)
28
+
29
+ def log_interaction(self,
30
+ question: str,
31
+ answer: str,
32
+ source_documents: List[Any],
33
+ content_type: str,
34
+ generated_queries: List[str],
35
+ processing_time: float,
36
+ chat_history: List[Any],
37
+ system_info: Dict[str, Any]) -> None:
38
+ """Log a complete chat interaction with detailed metadata.
39
+
40
+ Args:
41
+ question: The user's question
42
+ answer: The generated answer
43
+ source_documents: Retrieved documents
44
+ content_type: The routing type (course/program/both)
45
+ generated_queries: List of generated query variations
46
+ processing_time: Time taken to process the query
47
+ chat_history: Chat memory messages
48
+ system_info: System configuration info
49
+ """
50
+ try:
51
+ # Prepare retrieval statistics
52
+ retrieval_stats = self._prepare_retrieval_stats(
53
+ source_documents, content_type, generated_queries
54
+ )
55
+
56
+ # Prepare chat context
57
+ chat_context = self._prepare_chat_context(chat_history)
58
+
59
+ # Create interaction data
60
+ interaction_data = {
61
+ "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
62
+ "query": {
63
+ "original_question": question,
64
+ "content_type": content_type,
65
+ "generated_queries": generated_queries
66
+ },
67
+ "retrieval": retrieval_stats,
68
+ "response": {
69
+ "answer": answer
70
+ },
71
+ "performance": {
72
+ "processing_time": processing_time,
73
+ "tokens_used": None # TODO: Add token usage if available
74
+ },
75
+ "chat_context": chat_context,
76
+ "system_info": system_info
77
+ }
78
+
79
+ # Read existing logs
80
+ with open(self.log_file, 'r') as f:
81
+ logs = json.load(f)
82
+
83
+ # Add new log
84
+ logs.append(interaction_data)
85
+
86
+ # Write back to file
87
+ with open(self.log_file, 'w') as f:
88
+ json.dump(logs, f, indent=2)
89
+
90
+ except Exception as e:
91
+ print(f"Error logging interaction: {str(e)}")
92
+
93
+ def _prepare_retrieval_stats(self,
94
+ source_documents: List[Any],
95
+ content_type: str,
96
+ generated_queries: List[str]) -> Dict[str, Any]:
97
+ """Prepare retrieval statistics for logging.
98
+
99
+ Args:
100
+ source_documents: Retrieved documents
101
+ content_type: The routing type
102
+ generated_queries: Generated query variations
103
+
104
+ Returns:
105
+ Dictionary with retrieval statistics
106
+ """
107
+ # Count document types
108
+ document_types = {
109
+ "course": 0,
110
+ "program": 0,
111
+ "unknown": 0
112
+ }
113
+
114
+ documents_info = []
115
+ for doc in source_documents:
116
+ doc_type = doc.metadata.get("doc_type", "unknown")
117
+ document_types[doc_type] = document_types.get(doc_type, 0) + 1
118
+
119
+ documents_info.append({
120
+ "content": doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content,
121
+ "metadata": doc.metadata,
122
+ "source": os.path.basename(doc.metadata.get("source", ""))
123
+ })
124
+
125
+ return {
126
+ "total_documents": len(source_documents),
127
+ "documents": documents_info,
128
+ "document_types": document_types,
129
+ "generated_queries": generated_queries,
130
+ "routing_type": content_type
131
+ }
132
+
133
+ def _prepare_chat_context(self, chat_history: List[Any]) -> Dict[str, Any]:
134
+ """Prepare chat context for logging.
135
+
136
+ Args:
137
+ chat_history: Chat memory messages
138
+
139
+ Returns:
140
+ Dictionary with chat context information
141
+ """
142
+ context_messages = []
143
+
144
+ if chat_history:
145
+ # Get last few messages for context
146
+ recent_messages = chat_history[-6:] # Last 6 messages (3 pairs)
147
+
148
+ for msg in recent_messages:
149
+ if hasattr(msg, 'type') and hasattr(msg, 'content'):
150
+ context_messages.append({
151
+ "role": msg.type,
152
+ "content": msg.content[:500] + "..." if len(msg.content) > 500 else msg.content
153
+ })
154
+
155
+ return {
156
+ "chat_history": context_messages,
157
+ "memory_window_size": Config.MEMORY_WINDOW_SIZE,
158
+ "total_messages": len(chat_history) if chat_history else 0
159
+ }
160
+
161
+ def get_recent_interactions(self, limit: int = 10) -> List[Dict[str, Any]]:
162
+ """Get recent chat interactions.
163
+
164
+ Args:
165
+ limit: Maximum number of interactions to return
166
+
167
+ Returns:
168
+ List of recent interactions
169
+ """
170
+ try:
171
+ with open(self.log_file, 'r') as f:
172
+ logs = json.load(f)
173
+
174
+ # Return most recent interactions
175
+ return logs[-limit:] if len(logs) > limit else logs
176
+
177
+ except Exception as e:
178
+ print(f"Error reading recent interactions: {str(e)}")
179
+ return []
180
+
181
+ def get_stats(self) -> Dict[str, Any]:
182
+ """Get statistics about logged interactions.
183
+
184
+ Returns:
185
+ Dictionary with interaction statistics
186
+ """
187
+ try:
188
+ with open(self.log_file, 'r') as f:
189
+ logs = json.load(f)
190
+
191
+ if not logs:
192
+ return {"total_interactions": 0}
193
+
194
+ # Calculate statistics
195
+ total_interactions = len(logs)
196
+ content_types = {}
197
+ avg_processing_time = 0
198
+
199
+ for log in logs:
200
+ # Count content types
201
+ content_type = log.get("query", {}).get("content_type", "unknown")
202
+ content_types[content_type] = content_types.get(content_type, 0) + 1
203
+
204
+ # Sum processing times
205
+ processing_time = log.get("performance", {}).get("processing_time", 0)
206
+ if processing_time:
207
+ avg_processing_time += processing_time
208
+
209
+ # Calculate average processing time
210
+ if total_interactions > 0:
211
+ avg_processing_time = avg_processing_time / total_interactions
212
+
213
+ return {
214
+ "total_interactions": total_interactions,
215
+ "content_type_distribution": content_types,
216
+ "average_processing_time": avg_processing_time,
217
+ "last_interaction": logs[-1].get("timestamp") if logs else None
218
+ }
219
+
220
+ except Exception as e:
221
+ print(f"Error calculating stats: {str(e)}")
222
+ return {"error": str(e)}
223
+
224
+ def clear_logs(self) -> bool:
225
+ """Clear all logged interactions.
226
+
227
+ Returns:
228
+ True if successful, False otherwise
229
+ """
230
+ try:
231
+ with open(self.log_file, 'w') as f:
232
+ json.dump([], f)
233
+ return True
234
+ except Exception as e:
235
+ print(f"Error clearing logs: {str(e)}")
236
+ return False
237
+
238
+ def export_logs(self, output_file: str) -> bool:
239
+ """Export logs to a different file.
240
+
241
+ Args:
242
+ output_file: Path to the output file
243
+
244
+ Returns:
245
+ True if successful, False otherwise
246
+ """
247
+ try:
248
+ with open(self.log_file, 'r') as f:
249
+ logs = json.load(f)
250
+
251
+ with open(output_file, 'w') as f:
252
+ json.dump(logs, f, indent=2)
253
+
254
+ return True
255
+ except Exception as e:
256
+ print(f"Error exporting logs: {str(e)}")
257
+ return False
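
A minimal usage sketch for ChatLogger, assuming it is run from the src/ directory; the question, queries, and system_info values below are illustrative:

from chat_logger import ChatLogger

logger = ChatLogger(log_file="chat_history.json")
logger.log_interaction(
    question="What are the prerequisites for DIT134?",
    answer="...",
    source_documents=[],          # normally the retrieved Document objects
    content_type="course",
    generated_queries=["DIT134 prerequisites", "DIT134 entry requirements"],
    processing_time=1.42,
    chat_history=[],
    system_info={"model": "gpt-4.1-mini"},
)
print(logger.get_stats())
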
src/config.py ADDED
@@ -0,0 +1,221 @@
1
+ import os
2
+ from typing import Dict, Any
3
+ from dotenv import load_dotenv
4
+
5
+ # Load environment variables
6
+ load_dotenv()
7
+
8
+ class Config:
9
+ """Application configuration settings."""
10
+
11
+ # API Keys
12
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
13
+ FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
14
+
15
+ # Model Configuration
16
+ MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4.1-mini-2025-04-14")
17
+ EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")
18
+ TEMPERATURE = float(os.getenv("TEMPERATURE", "0.1"))
19
+ MAX_TOKENS = int(os.getenv("MAX_TOKENS", "2000"))
20
+
21
+ # Database Configuration
22
+ CHROMA_DB_PATH = os.getenv("CHROMA_DB_PATH", "./data/chroma")
23
+ COLLECTION_NAME = os.getenv("COLLECTION_NAME", "course_docs")
24
+
25
+ # Text Splitting Configuration
26
+ CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "2000"))
27
+ CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "200"))
28
+
29
+ # Retrieval Configuration
30
+ RETRIEVAL_K_VALUES = {
31
+ "course": int(os.getenv("RETRIEVAL_K_COURSE", "20")),
32
+ "program": int(os.getenv("RETRIEVAL_K_PROGRAM", "15")),
33
+ "both": int(os.getenv("RETRIEVAL_K_BOTH", "25"))
34
+ }
35
+
36
+ # Embedding Configuration
37
+ EMBEDDING_CHUNK_SIZE = int(os.getenv("EMBEDDING_CHUNK_SIZE", "1000"))
38
+ EMBEDDING_MAX_RETRIES = int(os.getenv("EMBEDDING_MAX_RETRIES", "3"))
39
+ EMBEDDING_REQUEST_TIMEOUT = int(os.getenv("EMBEDDING_REQUEST_TIMEOUT", "60"))
40
+
41
+ # Memory Configuration
42
+ MEMORY_WINDOW_SIZE = int(os.getenv("MEMORY_WINDOW_SIZE", "3"))
43
+
44
+ # Logging Configuration
45
+ LOG_FILE = os.getenv("LOG_FILE", "chat_history.json")
46
+ LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
47
+ DEBUG_MODE = os.getenv("DEBUG_MODE", "false").lower() == "true"
48
+
49
+ # Directory Paths
50
+ DATA_BASE_PATH = os.getenv("DATA_BASE_PATH", "./data")
51
+ COURSES_MD_PATH = os.getenv("COURSES_MD_PATH", "data/courses/md")
52
+ COURSES_PDF_PATH = os.getenv("COURSES_PDF_PATH", "data/courses/pdf")
53
+ PROGRAMS_MD_PATH = os.getenv("PROGRAMS_MD_PATH", "data/programs/md")
54
+ PROGRAMS_PDF_PATH = os.getenv("PROGRAMS_PDF_PATH", "data/programs/pdf")
55
+
56
+ # Interface Configuration
57
+ GRADIO_PORT = int(os.getenv("GRADIO_PORT", "7860"))
58
+ GRADIO_SHARE = os.getenv("GRADIO_SHARE", "false").lower() == "true"
59
+
60
+ # Telemetry Configuration
61
+ LANGCHAIN_TRACING_V2 = os.getenv("LANGCHAIN_TRACING_V2", "false").lower() == "true"
62
+ ANONYMIZED_TELEMETRY = os.getenv("ANONYMIZED_TELEMETRY", "false").lower() == "true"
63
+ POSTHOG_DISABLED = os.getenv("POSTHOG_DISABLED", "true").lower() == "true"
64
+ CHROMA_TELEMETRY_DISABLED = os.getenv("CHROMA_TELEMETRY_DISABLED", "true").lower() == "true"
65
+ DO_NOT_TRACK = os.getenv("DO_NOT_TRACK", "1")
66
+
67
+ class PromptTemplates:
68
+ """Centralized prompt templates."""
69
+
70
+ COURSE_QUERY_TEMPLATE = """You are an AI language model assistant. Your task is to generate five
71
+ different versions of the given user question to retrieve relevant documents about university COURSES.
72
+
73
+ Follow these guidelines:
74
+ 1. Focus on different aspects: content, prerequisites, learning outcomes, examination methods
75
+ 2. Use different phrasings and synonyms
76
+ 3. Include the course code or name if present in the original question
77
+ 4. Make queries both more specific and more general than the original
78
+ 5. Ensure each query is semantically meaningful and complete
79
+
80
+ Original question: {question}
81
+
82
+ Generate 5 different versions, one per line."""
83
+
84
+ PROGRAM_QUERY_TEMPLATE = """You are an AI language model assistant. Your task is to generate five
85
+ different versions of the given user question to retrieve relevant documents about university PROGRAMS.
86
+
87
+ Follow these guidelines:
88
+ 1. Focus on different aspects: program structure, career opportunities, admission requirements, outcomes
89
+ 2. Use different phrasings and synonyms
90
+ 3. Include the program name if present in the original question
91
+ 4. Make queries both more specific and more general than the original
92
+ 5. Consider both overall program information and specific details
93
+
94
+ Original question: {question}
95
+
96
+ Generate 5 different versions, one per line."""
97
+
98
+ GENERAL_QUERY_TEMPLATE = """You are an AI language model assistant. Your task is to generate five
99
+ different versions of the given user question to retrieve relevant documents about both university COURSES and PROGRAMS.
100
+
101
+ Follow these guidelines:
102
+ 1. Balance between course-specific and program-level information
103
+ 2. Include variations that focus on how courses fit into programs
104
+ 3. Use different phrasings and synonyms
105
+ 4. Make queries both more specific and more general than the original
106
+ 5. Maintain the original intent while exploring different aspects
107
+
108
+ Original question: {question}
109
+
110
+ Generate 5 different versions, one per line."""
111
+
112
+ ROUTER_SYSTEM_TEMPLATE = """You are an expert at routing user questions about university education to the appropriate content type.
113
+ Your task is to determine whether the question is about:
114
+ 1. A specific COURSE or course-related information
115
+ 2. A specific PROGRAM or program-related information
116
+ 3. BOTH when the question involves both courses and programs or when it's unclear
117
+
118
+ Examples:
119
+ - "What are the prerequisites for DIT134?" -> course
120
+ - "Tell me about the Software Engineering program" -> program
121
+ - "What courses are included in the Data Science master's?" -> both
122
+ - "How many credits do I need?" -> both"""
123
+
124
+ SYSTEM_TEMPLATE = """You are a helpful course and program information assistant for Gothenburg University.
125
+ Your role is to provide accurate information about courses and programs based ONLY on the provided course and program documents.
126
+
127
+ Important rules to follow:
128
+ 1. Only answer questions about courses that are explicitly mentioned in the provided documents
129
+ 2. If a course is not in the documents, clearly state that you don't have information about that course
130
+ 3. Base your answers solely on the content from the course documents
131
+ 4. If you're unsure about any information, say so explicitly
132
+ 5. When discussing course content, prerequisites, or evaluation methods, quote directly from the source documents when possible
133
+ 6. Include the course code (e.g., DIT123) when referring to courses
134
+ 7. For listing questions (e.g., "What programs are available?", "List all courses in X"):
135
+ - ALWAYS check the sources list first
136
+ - THOROUGHLY examine EACH source document listed in the sources
137
+ - List EVERY program or course mentioned in ANY of the retrieved documents
138
+ - Do not skip any programs even if they seem similar to others
139
+ - Include program/course codes when available
140
+ - Group items logically (e.g., by degree level: Bachelor's, Master's)
141
+ - Double-check the sources list against your response to ensure no programs were missed
142
+ 8. For questions asking about all programs from a specific school/department:
143
+ - List ALL programs from the retrieved documents
144
+ - Cross-reference the sources list with your response to ensure completeness
145
+ - Include full program names and codes
146
+ - Organize by degree level (Bachelor's/Master's)
147
+ - Specify the credit amount if available
148
+ - Before finishing your response, verify that you've included every program from every source listed
149
+
150
+ Context from documents: {context}
151
+
152
+ Current conversation history: {chat_history}
153
+
154
+ Human question: {question}
155
+
156
+ Remember:
157
+ 1. When asked to list programs or courses, THOROUGHLY check all retrieved documents and include EVERY relevant item.
158
+ 2. Do not summarize or skip any programs/courses found in the sources.
159
+ 3. Always cross-reference your final list against the sources to ensure nothing was missed.
160
+ 4. If you see a source in the list that contains "programme" or "program" in its name, make sure that program is included in your response.
161
+
162
+ Please provide a response based strictly on the above context. If the information isn't in the context, say so."""
163
+
164
+ @classmethod
165
+ def get_query_template(cls, content_type: str) -> str:
166
+ """Get the appropriate query template based on content type."""
167
+ templates = {
168
+ "course": cls.COURSE_QUERY_TEMPLATE,
169
+ "program": cls.PROGRAM_QUERY_TEMPLATE,
170
+ "both": cls.GENERAL_QUERY_TEMPLATE
171
+ }
172
+ return templates.get(content_type, cls.GENERAL_QUERY_TEMPLATE)
173
+
174
+ class AppConstants:
175
+ """Application constants."""
176
+
177
+ ROUTING_INFO = {
178
+ "course": "🎓 Course-specific response:",
179
+ "program": "📚 Program-specific response:",
180
+ "both": "🏫 General education response:"
181
+ }
182
+
183
+ EXAMPLE_QUERIES = [
184
+ "What is the Applied Data Science program about?",
185
+ "What are the prerequisites for Applied Machine Learning?",
186
+ "Tell me about courses in the Master's Program in Management.",
187
+ "List all master's programs in the School of Business, Economics and Law.",
188
+ "What programs are available in Computer Science?"
189
+ ]
190
+
191
+ SUPPORTED_FILE_ENCODINGS = ['utf-8', 'iso-8859-1', 'latin1']
192
+ SUPPORTED_FILE_EXTENSIONS = {
193
+ 'markdown': ['.md'],
194
+ 'pdf': ['.pdf']
195
+ }
196
+
197
+ BATCH_SIZE = 50 # For processing documents in batches
198
+
199
+ def setup_telemetry():
200
+ """Set up telemetry environment variables to prevent warnings."""
201
+ # Set LangChain telemetry environment variables
202
+ os.environ["LANGCHAIN_TRACING_V2"] = str(Config.LANGCHAIN_TRACING_V2).lower()
203
+ os.environ["ANONYMIZED_TELEMETRY"] = str(Config.ANONYMIZED_TELEMETRY).lower()
204
+ os.environ["POSTHOG_DISABLED"] = str(Config.POSTHOG_DISABLED).lower()
205
+
206
+ # Set ChromaDB telemetry environment variables
207
+ os.environ["CHROMA_TELEMETRY_DISABLED"] = "true"
208
+ os.environ["CHROMA_TELEMETRY"] = "false"
209
+
210
+ # Additional telemetry controls
211
+ os.environ["DO_NOT_TRACK"] = "1"
212
+
213
+ def validate_config():
214
+ """Validate that required configuration is present."""
215
+ if not Config.OPENAI_API_KEY:
216
+ raise ValueError("OpenAI API key not found in environment variables")
217
+
218
+ # Setup telemetry to prevent warnings
219
+ setup_telemetry()
220
+
221
+ return True
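
A short sketch of how this configuration module is typically consumed; it assumes OPENAI_API_KEY is set (for example via a .env file), since validate_config() raises otherwise:

from config import Config, PromptTemplates, validate_config

validate_config()                           # checks the API key and sets the telemetry variables
print(Config.MODEL_NAME)                    # "gpt-4.1-mini-2025-04-14" unless MODEL_NAME overrides it
print(Config.RETRIEVAL_K_VALUES["course"])  # 20 by default

# Pick the multi-query prompt for a routed content type; unknown types fall back to the general template.
template = PromptTemplates.get_query_template("course")
print(template.format(question="What is DIT134 about?"))
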
src/document_processor.py ADDED
@@ -0,0 +1,349 @@
1
+ import os
2
+ import re
3
+ import time
4
+ from typing import List, Optional, Dict, Any
5
+ from pathlib import Path
6
+
7
+ from langchain_community.document_loaders import PyPDFLoader
8
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
9
+ from langchain_core.documents import Document
10
+
11
+ from config import Config, AppConstants
12
+ from models import DocumentMetadata, ProcessingStats
13
+
14
+ class DocumentProcessor:
15
+ """Handles document loading, processing, and chunking."""
16
+
17
+ def __init__(self, base_path: str = None):
18
+ """Initialize the document processor.
19
+
20
+ Args:
21
+ base_path: Base path for document directories
22
+ """
23
+ self.base_path = base_path or Config.DATA_BASE_PATH
24
+ self.text_splitter = RecursiveCharacterTextSplitter(
25
+ chunk_size=Config.CHUNK_SIZE,
26
+ chunk_overlap=Config.CHUNK_OVERLAP,
27
+ length_function=len,
28
+ separators=["\n\n", "\n", " ", ""]
29
+ )
30
+
31
+ def process_all_documents(self) -> List[Document]:
32
+ """Process both markdown and PDF documents from courses and programs directories.
33
+
34
+ Returns:
35
+ List of processed documents with proper metadata
36
+ """
37
+ start_time = time.time()
38
+
39
+ documents = {
40
+ 'courses': [],
41
+ 'programs': []
42
+ }
43
+
44
+ # Define paths for different document types
45
+ paths = self._get_document_paths()
46
+
47
+ # Create directories if they don't exist
48
+ self._ensure_directories_exist(paths)
49
+
50
+ # Process documents by category
51
+ for category in ['courses', 'programs']:
52
+ # Process markdown files
53
+ md_path = paths[f'{category}_md']
54
+ if os.path.exists(md_path):
55
+ documents[category].extend(self._process_markdown_files(md_path, category))
56
+
57
+ # Process PDF files
58
+ pdf_path = paths[f'{category}_pdf']
59
+ if os.path.exists(pdf_path):
60
+ documents[category].extend(self._process_pdf_files(pdf_path, category))
61
+
62
+ print(f"Processed {len(documents[category])} {category} documents")
63
+
64
+ # Combine all documents while maintaining their metadata
65
+ all_documents = documents['courses'] + documents['programs']
66
+
67
+ # Create processing stats
68
+ processing_time = time.time() - start_time
69
+ stats = ProcessingStats(
70
+ total_documents=len(all_documents),
71
+ courses_processed=len(documents['courses']),
72
+ programs_processed=len(documents['programs']),
73
+ chunks_created=0, # Will be updated after chunking
74
+ processing_time=processing_time
75
+ )
76
+
77
+ print(f"Total documents processed: {len(all_documents)}")
78
+ print(f"Courses: {len(documents['courses'])}, Programs: {len(documents['programs'])}")
79
+ print(f"Processing time: {processing_time:.2f} seconds")
80
+
81
+ return all_documents
82
+
83
+ def chunk_documents(self, documents: List[Document]) -> List[Document]:
84
+ """Split documents into chunks for embedding.
85
+
86
+ Args:
87
+ documents: List of documents to chunk
88
+
89
+ Returns:
90
+ List of document chunks
91
+ """
92
+ print(f"Splitting {len(documents)} documents into chunks...")
93
+ chunks = self.text_splitter.split_documents(documents)
94
+ print(f"Created {len(chunks)} document chunks")
95
+ return chunks
96
+
97
+ def _get_document_paths(self) -> Dict[str, str]:
98
+ """Get paths for different document types.
99
+
100
+ Returns:
101
+ Dictionary with document paths
102
+ """
103
+ return {
104
+ 'courses_md': os.path.join(self.base_path, Config.COURSES_MD_PATH),
105
+ 'courses_pdf': os.path.join(self.base_path, Config.COURSES_PDF_PATH),
106
+ 'programs_md': os.path.join(self.base_path, Config.PROGRAMS_MD_PATH),
107
+ 'programs_pdf': os.path.join(self.base_path, Config.PROGRAMS_PDF_PATH)
108
+ }
109
+
110
+ def _ensure_directories_exist(self, paths: Dict[str, str]) -> None:
111
+ """Ensure all document directories exist.
112
+
113
+ Args:
114
+ paths: Dictionary of paths to create
115
+ """
116
+ for path in paths.values():
117
+ if not os.path.exists(path):
118
+ os.makedirs(path, exist_ok=True)
119
+ print(f"Created directory: {path}")
120
+
121
+ def _process_markdown_files(self, path: str, category: str) -> List[Document]:
122
+ """Process markdown files in a directory.
123
+
124
+ Args:
125
+ path: Path to the markdown files directory
126
+ category: Type of documents ('courses' or 'programs')
127
+
128
+ Returns:
129
+ List of processed markdown documents with metadata
130
+ """
131
+ documents = []
132
+
133
+ if not os.path.exists(path):
134
+ print(f"Warning: Markdown directory {path} does not exist")
135
+ return documents
136
+
137
+ for filename in os.listdir(path):
138
+ if filename.endswith('.md'):
139
+ file_path = os.path.join(path, filename)
140
+ try:
141
+ content = self._read_file_with_fallback_encoding(file_path)
142
+
143
+ # Create metadata
144
+ metadata = {
145
+ 'source': file_path,
146
+ 'type': 'markdown',
147
+ 'category': category,
148
+ 'doc_type': category.rstrip('s'), # 'course' or 'program'
149
+ 'filename': filename
150
+ }
151
+
152
+ # Extract course code if it's a course document
153
+ if category == 'courses':
154
+ code = self._extract_course_code(filename, content)
155
+ if code:
156
+ metadata['course_code'] = code
157
+
158
+ doc = Document(
159
+ page_content=content,
160
+ metadata=metadata
161
+ )
162
+ documents.append(doc)
163
+
164
+ except Exception as e:
165
+ print(f"Error processing markdown file {filename}: {str(e)}")
166
+
167
+ return documents
168
+
169
+ def _process_pdf_files(self, path: str, category: str) -> List[Document]:
170
+ """Process PDF files in a directory.
171
+
172
+ Args:
173
+ path: Path to the PDF files directory
174
+ category: Type of documents ('courses' or 'programs')
175
+
176
+ Returns:
177
+ List of processed PDF documents with metadata
178
+ """
179
+ documents = []
180
+
181
+ if not os.path.exists(path):
182
+ print(f"Warning: PDF directory {path} does not exist")
183
+ return documents
184
+
185
+ for filename in os.listdir(path):
186
+ if filename.endswith('.pdf'):
187
+ file_path = os.path.join(path, filename)
188
+ try:
189
+ loader = PyPDFLoader(file_path)
190
+ pdf_docs = loader.load()
191
+
192
+ # Create base metadata
193
+ metadata = {
194
+ 'type': 'pdf',
195
+ 'category': category,
196
+ 'doc_type': category.rstrip('s'), # 'course' or 'program'
197
+ 'filename': filename
198
+ }
199
+
200
+ # Add course code if it exists and it's a course document
201
+ if category == 'courses' and pdf_docs:
202
+ code = self._extract_course_code(filename, pdf_docs[0].page_content)
203
+ if code:
204
+ metadata['course_code'] = code
205
+
206
+ # Add metadata to each page
207
+ for doc in pdf_docs:
208
+ doc.metadata.update(metadata)
209
+
210
+ documents.extend(pdf_docs)
211
+
212
+ except Exception as e:
213
+ print(f"Error processing PDF {filename}: {str(e)}")
214
+
215
+ return documents
216
+
217
+ def _read_file_with_fallback_encoding(self, file_path: str) -> str:
218
+ """Read a file with fallback encodings.
219
+
220
+ Args:
221
+ file_path: Path to the file to read
222
+
223
+ Returns:
224
+ File content as string
225
+
226
+ Raises:
227
+ ValueError: If the file cannot be read with any supported encoding
228
+ """
229
+ for encoding in AppConstants.SUPPORTED_FILE_ENCODINGS:
230
+ try:
231
+ with open(file_path, 'r', encoding=encoding) as f:
232
+ return f.read()
233
+ except UnicodeDecodeError:
234
+ continue
235
+
236
+ raise ValueError(f"Failed to read {file_path} with any supported encoding")
237
+
238
+ def _extract_course_code(self, filename: str, content: str) -> Optional[str]:
239
+ """Extract course code from filename or content if possible.
240
+
241
+ Args:
242
+ filename: Name of the file
243
+ content: Content of the document
244
+
245
+ Returns:
246
+ Course code if found, None otherwise
247
+ """
248
+ # Try to extract from filename first (e.g., "DIT134-advanced-programming.pdf")
249
+ code_match = re.search(r'([A-Z]{3}\d{3})', filename)
250
+ if code_match:
251
+ return code_match.group(1)
252
+
253
+ # Try to extract from content (first occurrence)
254
+ code_match = re.search(r'([A-Z]{3}\d{3})', content[:1000]) # Search in first 1000 chars
255
+ if code_match:
256
+ return code_match.group(1)
257
+
258
+ return None
259
+
260
+ def get_document_stats(self, documents: List[Document]) -> Dict[str, Any]:
261
+ """Get statistics about processed documents.
262
+
263
+ Args:
264
+ documents: List of processed documents
265
+
266
+ Returns:
267
+ Dictionary with document statistics
268
+ """
269
+ stats = {
270
+ 'total_documents': len(documents),
271
+ 'by_category': {},
272
+ 'by_type': {},
273
+ 'by_doc_type': {},
274
+ 'course_codes': set(),
275
+ 'total_content_length': 0
276
+ }
277
+
278
+ for doc in documents:
279
+ metadata = doc.metadata
280
+
281
+ # Count by category
282
+ category = metadata.get('category', 'unknown')
283
+ stats['by_category'][category] = stats['by_category'].get(category, 0) + 1
284
+
285
+ # Count by file type
286
+ file_type = metadata.get('type', 'unknown')
287
+ stats['by_type'][file_type] = stats['by_type'].get(file_type, 0) + 1
288
+
289
+ # Count by document type
290
+ doc_type = metadata.get('doc_type', 'unknown')
291
+ stats['by_doc_type'][doc_type] = stats['by_doc_type'].get(doc_type, 0) + 1
292
+
293
+ # Collect course codes
294
+ if metadata.get('course_code'):
295
+ stats['course_codes'].add(metadata['course_code'])
296
+
297
+ # Sum content length
298
+ stats['total_content_length'] += len(doc.page_content)
299
+
300
+ # Convert set to list for JSON serialization
301
+ stats['course_codes'] = list(stats['course_codes'])
302
+ stats['unique_course_codes'] = len(stats['course_codes'])
303
+
304
+ return stats
305
+
306
+ def validate_documents(self, documents: List[Document]) -> Dict[str, Any]:
307
+ """Validate processed documents for common issues.
308
+
309
+ Args:
310
+ documents: List of documents to validate
311
+
312
+ Returns:
313
+ Dictionary with validation results
314
+ """
315
+ validation_results = {
316
+ 'total_documents': len(documents),
317
+ 'issues': [],
318
+ 'warnings': [],
319
+ 'valid_documents': 0,
320
+ 'empty_documents': 0,
321
+ 'missing_metadata': 0
322
+ }
323
+
324
+ for i, doc in enumerate(documents):
325
+ # Check for empty content
326
+ if not doc.page_content or len(doc.page_content.strip()) == 0:
327
+ validation_results['empty_documents'] += 1
328
+ validation_results['issues'].append(f"Document {i}: Empty content")
329
+ continue
330
+
331
+ # Check for essential metadata
332
+ required_metadata = ['source', 'type', 'category', 'doc_type', 'filename']
333
+ missing_fields = [field for field in required_metadata if not doc.metadata.get(field)]
334
+
335
+ if missing_fields:
336
+ validation_results['missing_metadata'] += 1
337
+ validation_results['warnings'].append(
338
+ f"Document {i}: Missing metadata fields: {missing_fields}"
339
+ )
340
+
341
+ # Check content length
342
+ if len(doc.page_content) < 50:
343
+ validation_results['warnings'].append(
344
+ f"Document {i}: Very short content ({len(doc.page_content)} chars)"
345
+ )
346
+
347
+ validation_results['valid_documents'] += 1
348
+
349
+ return validation_results
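
A usage sketch for DocumentProcessor, assuming it is run from src/; missing data directories are created automatically, in which case the resulting lists are simply empty:

from document_processor import DocumentProcessor

processor = DocumentProcessor()               # defaults to Config.DATA_BASE_PATH
documents = processor.process_all_documents()
chunks = processor.chunk_documents(documents)

print(processor.get_document_stats(documents)["unique_course_codes"])
print(processor.validate_documents(documents)["warnings"][:5])
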
src/interface.py ADDED
@@ -0,0 +1,208 @@
1
+ import gradio as gr
2
+ from typing import List, Dict, Any
3
+ from rag_service import RAGService
4
+ from config import Config, AppConstants
5
+
6
+ class RAGInterface:
7
+ """Gradio interface for the RAG application."""
8
+
9
+ def __init__(self, rag_service: RAGService):
10
+ """Initialize the interface.
11
+
12
+ Args:
13
+ rag_service: The RAG service instance
14
+ """
15
+ self.rag_service = rag_service
16
+ self.interface = None
17
+
18
+ def process_query(self, message: str, history: List[Dict[str, str]]) -> str:
19
+ """Process a single query in the chat interface.
20
+
21
+ Args:
22
+ message: User's message
23
+ history: Chat history in OpenAI-style format with 'role' and 'content' keys
24
+
25
+ Returns:
26
+ Assistant's response
27
+ """
28
+ try:
29
+ # Query the RAG service
30
+ result = self.rag_service.query(message)
31
+
32
+ # Format response with routing information
33
+ content_type = result.content_type
34
+ answer = result.answer
35
+
36
+ # Add routing indicator
37
+ routing_prefix = AppConstants.ROUTING_INFO.get(content_type, "")
38
+ if routing_prefix:
39
+ return f"{routing_prefix}\n\n{answer}"
40
+ else:
41
+ return answer
42
+
43
+ except Exception as e:
44
+ error_msg = f"❌ Error: {str(e)}"
45
+ print(f"Interface error: {error_msg}")
46
+ return error_msg
47
+
48
+ def get_system_info(self) -> str:
49
+ """Get formatted system information.
50
+
51
+ Returns:
52
+ Formatted system status string
53
+ """
54
+ try:
55
+ status = self.rag_service.get_system_status()
56
+
57
+ # Format the information nicely
58
+ info = f"""
59
+ ### 📊 System Status
60
+
61
+ **Database Status:** {'✅ Initialized' if status['database_initialized'] else '❌ Not Initialized'}
62
+ **Documents Loaded:** {status['documents_loaded']}
63
+ **Model Version:** {status['model_version']}
64
+ **Embedding Model:** {status['embedding_version']}
65
+ **Conversation Length:** {status['conversation_length']} messages
66
+
67
+ ### 🔍 Search Configuration
68
+
69
+ **Course Queries:** Top {Config.RETRIEVAL_K_VALUES['course']} matches
70
+ **Program Queries:** Top {Config.RETRIEVAL_K_VALUES['program']} matches
71
+ **Mixed Queries:** Top {Config.RETRIEVAL_K_VALUES['both']} matches
72
+ **Search Type:** MMR (Maximal Marginal Relevance)
73
+
74
+ ### 📚 Query Types
75
+
76
+ **🎓 Course Queries**
77
+ - Specific course information
78
+ - Prerequisites and requirements
79
+ - Learning outcomes
80
+ - Course content and structure
81
+
82
+ **📚 Program Queries**
83
+ - Program overviews and structure
84
+ - Available programs by department
85
+ - Program requirements and outcomes
86
+ - Career opportunities
87
+
88
+ **🏫 General Queries**
89
+ - Courses within programs
90
+ - Department offerings
91
+ - Combined course/program information
92
+ - Cross-referencing content
93
+ """
94
+ return info.strip()
95
+
96
+ except Exception as e:
97
+ return f"Error getting system info: {str(e)}"
98
+
99
+
100
+
101
+
102
+
103
+ def create_interface(self) -> gr.Blocks:
104
+ """Create and configure the Gradio interface.
105
+
106
+ Returns:
107
+ Configured Gradio Blocks interface
108
+ """
109
+ # Create the interface
110
+ with gr.Blocks(theme=gr.themes.Soft()) as interface:
111
+ gr.Markdown("""
112
+ # GuPT: Gothenburg University Information Assistant
113
+ Ask questions about Gothenburg University's courses and programs.
114
+ """)
115
+
116
+ with gr.Row(equal_height=True):
117
+ # Chat column (2/3 of width)
118
+ with gr.Column(scale=2):
119
+ chat_interface = gr.ChatInterface(
120
+ fn=self.process_query,
121
+ examples=AppConstants.EXAMPLE_QUERIES,
122
+ css="""
123
+ div.message-wrap { height: 600px !important; overflow-y: auto; }
124
+ details { margin-top: 10px; }
125
+ summary { cursor: pointer; color: #2A6BB0; }
126
+ summary:hover { text-decoration: underline; }
127
+ """,
128
+ type="messages"
129
+ )
130
+
131
+ # Info column (1/3 of width)
132
+ with gr.Column(scale=1):
133
+ # Get system status for static display
134
+ status = self.rag_service.get_system_status()
135
+
136
+ gr.Markdown(f"""
137
+ ### Document Collection
138
+ - Documents Loaded: {status['documents_loaded']}
139
+ - Database Status: {'✅ Initialized' if status['database_initialized'] else '❌ Not Ready'}
140
+ - Model: {status['model_version']}
141
+
142
+ ### Search Configuration
143
+ - Using MMR for diverse results
144
+ - Course queries: top {Config.RETRIEVAL_K_VALUES['course']} matches
145
+ - Program queries: top {Config.RETRIEVAL_K_VALUES['program']} matches
146
+ - Mixed queries: top {Config.RETRIEVAL_K_VALUES['both']} matches
147
+
148
+ ### Query Types
149
+
150
+ 🎓 **Course Queries**
151
+ - Specific course information
152
+ - Prerequisites and requirements
153
+ - Learning outcomes
154
+
155
+ 📚 **Program Queries**
156
+ - Program overviews
157
+ - Available programs by department
158
+ - Program requirements
159
+
160
+ 🏫 **General Queries**
161
+ - Courses within programs
162
+ - Department offerings
163
+ - Combined course/program information
164
+ """)
165
+
166
+ self.interface = interface
167
+ return interface
168
+
169
+ def launch(self, **kwargs):
170
+ """Launch the Gradio interface.
171
+
172
+ Args:
173
+ **kwargs: Additional arguments for Gradio launch
174
+ """
175
+ if not self.interface:
176
+ self.create_interface()
177
+
178
+ # Default launch parameters
179
+ launch_params = {
180
+ "share": False,
181
+ "server_name": "0.0.0.0",
182
+ "server_port": 7860,
183
+ "show_error": True,
184
+ "quiet": False
185
+ }
186
+
187
+ # Update with any provided parameters
188
+ launch_params.update(kwargs)
189
+
190
+ print(f"🚀 Launching GuPT interface...")
191
+ print(f"📍 Server: {launch_params['server_name']}:{launch_params['server_port']}")
192
+
193
+ try:
194
+ self.interface.launch(**launch_params)
195
+ except Exception as e:
196
+ print(f"❌ Error launching interface: {str(e)}")
197
+ raise
198
+
199
+ def create_interface(rag_service: RAGService) -> RAGInterface:
200
+ """Factory function to create a RAG interface.
201
+
202
+ Args:
203
+ rag_service: The RAG service instance
204
+
205
+ Returns:
206
+ Configured RAGInterface instance
207
+ """
208
+ return RAGInterface(rag_service)
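
The interface is normally launched from main.py; a minimal standalone sketch, assuming a valid OPENAI_API_KEY and existing document data (load_documents() may create new embeddings and incur API costs):

from rag_service import RAGService
from interface import create_interface

service = RAGService()
service.load_documents()

ui = create_interface(service)
ui.create_interface()
ui.launch(server_port=7860, share=False)
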
src/main.py ADDED
@@ -0,0 +1,250 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ GuPT: Gothenburg University Information Assistant
4
+ Main entry point for the restructured RAG application.
5
+
6
+ This is the modernized version using:
7
+ - LCEL (LangChain Expression Language)
8
+ - Modular architecture
9
+ - Better error handling
10
+ - Enhanced logging
11
+ """
12
+
13
+ import sys
14
+ import time
15
+ import argparse
16
+ from typing import Optional
17
+
18
+ # Local imports
19
+ from config import Config, validate_config
20
+ from rag_service import RAGService
21
+ from interface import create_interface
22
+
23
+ def parse_arguments():
24
+ """Parse command line arguments."""
25
+ parser = argparse.ArgumentParser(
26
+ description="GuPT: Gothenburg University Information Assistant",
27
+ formatter_class=argparse.RawDescriptionHelpFormatter,
28
+ epilog="""
29
+ Examples:
30
+ python main.py # Launch with default settings
31
+ python main.py --no-share # Launch without sharing
32
+ python main.py --port 8080 # Launch on port 8080
33
+ python main.py --rebuild-db # Force rebuild of vector database
34
+ """
35
+ )
36
+
37
+ # Interface options
38
+ parser.add_argument(
39
+ "--share",
40
+ action="store_true",
41
+ default=False,
42
+ help="Share the interface via Gradio public link"
43
+ )
44
+ parser.add_argument(
45
+ "--no-share",
46
+ action="store_true",
47
+ default=False,
48
+ help="Explicitly disable sharing (default)"
49
+ )
50
+ parser.add_argument(
51
+ "--port",
52
+ type=int,
53
+ default=7860,
54
+ help="Port to run the interface on (default: 7860)"
55
+ )
56
+ parser.add_argument(
57
+ "--host",
58
+ type=str,
59
+ default="0.0.0.0",
60
+ help="Host to bind to (default: 0.0.0.0)"
61
+ )
62
+
63
+ # Database options
64
+ parser.add_argument(
65
+ "--rebuild-db",
66
+ action="store_true",
67
+ help="Force rebuild of the vector database"
68
+ )
69
+ parser.add_argument(
70
+ "--db-path",
71
+ type=str,
72
+ default=None,
73
+ help=f"Custom path for vector database (default: {Config.CHROMA_DB_PATH})"
74
+ )
75
+
76
+ # Debug options
77
+ parser.add_argument(
78
+ "--debug",
79
+ action="store_true",
80
+ help="Enable debug mode with verbose output"
81
+ )
82
+ parser.add_argument(
83
+ "--quiet",
84
+ action="store_true",
85
+ help="Suppress non-essential output"
86
+ )
87
+
88
+ return parser.parse_args()
89
+
90
+ def print_banner():
91
+ """Print application banner."""
92
+ banner = """
93
+ ╔══════════════════════════════════════════════════════════════╗
94
+ ║ ║
95
+ ║ 🎓 GuPT - Gothenburg University Information Assistant ║
96
+ ║ ║
97
+ ║ Built with: LangChain + OpenAI + Gradio ║
98
+ ║ ║
99
+ ╚══════════════════════════════════════════════════════════════╝
100
+ """
101
+ print(banner)
102
+
103
+ def check_prerequisites() -> bool:
104
+ """Check if all prerequisites are met.
105
+
106
+ Returns:
107
+ True if all prerequisites are met, False otherwise
108
+ """
109
+ try:
110
+ # Validate configuration
111
+ validate_config()
112
+ print("✅ Configuration validated")
113
+
114
+ # Check if required directories exist
115
+ import os
116
+ data_dirs = [
117
+ Config.COURSES_MD_PATH,
118
+ Config.COURSES_PDF_PATH,
119
+ Config.PROGRAMS_MD_PATH,
120
+ Config.PROGRAMS_PDF_PATH
121
+ ]
122
+
123
+ missing_dirs = []
124
+ for dir_path in data_dirs:
125
+ if not os.path.exists(dir_path):
126
+ missing_dirs.append(dir_path)
127
+
128
+ if missing_dirs:
129
+ print("⚠️ Warning: Some data directories are missing:")
130
+ for dir_path in missing_dirs:
131
+ print(f" - {dir_path}")
132
+ print(" The system will create them automatically if needed.")
133
+
134
+ print("✅ Prerequisites check completed")
135
+ return True
136
+
137
+ except Exception as e:
138
+ print(f"❌ Prerequisites check failed: {str(e)}")
139
+ return False
140
+
141
+ def initialize_rag_service(args) -> Optional[RAGService]:
142
+ """Initialize the RAG service.
143
+
144
+ Args:
145
+ args: Parsed command line arguments
146
+
147
+ Returns:
148
+ Initialized RAG service or None if failed
149
+ """
150
+ try:
151
+ print("🔧 Initializing RAG service...")
152
+
153
+ # Create RAG service
154
+ rag_service = RAGService()
155
+
156
+ print("📚 Loading documents and vector store...")
157
+ start_time = time.time()
158
+
159
+ # Handle database rebuild
160
+ if args.rebuild_db:
161
+ print("🔄 Rebuilding vector database...")
162
+ import shutil
163
+ import os
164
+ if os.path.exists(Config.CHROMA_DB_PATH):
165
+ shutil.rmtree(Config.CHROMA_DB_PATH)
166
+ print(f" Removed existing database at {Config.CHROMA_DB_PATH}")
167
+
168
+ # Load documents
169
+ num_chunks = rag_service.load_documents()
170
+ load_time = time.time() - start_time
171
+
172
+ print(f"✅ RAG service initialized successfully!")
173
+ print(f" 📊 Processed {num_chunks} document chunks")
174
+ print(f" ⏱️ Loading time: {load_time:.2f} seconds")
175
+
176
+ return rag_service
177
+
178
+ except Exception as e:
179
+ print(f"❌ Failed to initialize RAG service: {str(e)}")
180
+ return None
181
+
182
+ def main():
183
+ """Main entry point."""
184
+ # Parse arguments
185
+ args = parse_arguments()
186
+
187
+ # Set up quiet mode
188
+ if args.quiet:
189
+ import os
190
+ # Redirect stdout to devnull for quiet mode
191
+ # We'll still print important messages to stderr
192
+ pass
193
+
194
+ # Print banner unless in quiet mode
195
+ if not args.quiet:
196
+ print_banner()
197
+
198
+ try:
199
+ # Check prerequisites
200
+ if not check_prerequisites():
201
+ print("❌ Prerequisites check failed. Please fix the issues and try again.")
202
+ sys.exit(1)
203
+
204
+ # Initialize RAG service
205
+ rag_service = initialize_rag_service(args)
206
+ if not rag_service:
207
+ print("❌ Failed to initialize RAG service. Exiting.")
208
+ sys.exit(1)
209
+
210
+ # Create and launch interface
211
+ print("🚀 Creating Gradio interface...")
212
+ interface_wrapper = create_interface(rag_service)
213
+
214
+ # Determine share setting
215
+ share = args.share and not args.no_share
216
+
217
+ # Launch parameters
218
+ launch_params = {
219
+ "share": share,
220
+ "server_name": args.host,
221
+ "server_port": args.port,
222
+ "show_error": True,
223
+ "quiet": args.quiet
224
+ }
225
+
226
+ print(f"🌐 Launching interface...")
227
+ if not args.quiet:
228
+ print(f" 📍 Local URL: http://{args.host}:{args.port}")
229
+ if share:
230
+ print(f" 🌍 Public sharing: Enabled")
231
+ else:
232
+ print(f" 🔒 Public sharing: Disabled")
233
+
234
+ # Launch the interface
235
+ interface_wrapper.create_interface()
236
+ interface_wrapper.launch(**launch_params)
237
+
238
+ except KeyboardInterrupt:
239
+ print("\n👋 Shutting down gracefully...")
240
+ sys.exit(0)
241
+
242
+ except Exception as e:
243
+ print(f"❌ Unexpected error: {str(e)}")
244
+ if args.debug:
245
+ import traceback
246
+ traceback.print_exc()
247
+ sys.exit(1)
248
+
249
+ if __name__ == "__main__":
250
+ main()
src/models.py ADDED
@@ -0,0 +1,106 @@
1
+ from pydantic import BaseModel, Field
2
+ from typing import List, Dict, Literal, Optional, Any
3
+ from dataclasses import dataclass, asdict
4
+ from datetime import datetime
5
+
6
+ class RouteQuery(BaseModel):
7
+ """Route a user query to the most relevant content type."""
8
+ content_type: Literal["course", "program", "both"] = Field(
9
+ ...,
10
+ description="Route to: 'course' for specific course questions, 'program' for program questions, 'both' when the question involves both or is unclear"
11
+ )
12
+
13
+ @dataclass
14
+ class DocumentMetadata:
15
+ """Metadata for processed documents."""
16
+ source: str
17
+ type: str # 'markdown' or 'pdf'
18
+ category: str # 'courses' or 'programs'
19
+ doc_type: str # 'course' or 'program'
20
+ filename: str
21
+ course_code: Optional[str] = None
22
+
23
+ @dataclass
24
+ class QueryResult:
25
+ """Result of a RAG query."""
26
+ answer: str
27
+ source_documents: List[Any] # List of Document objects
28
+ content_type: str
29
+ processing_time: Optional[float] = None
30
+ generated_queries: Optional[List[str]] = None
31
+ retrieval_stats: Optional[Dict[str, Any]] = None
32
+
33
+ @dataclass
34
+ class ChatInteraction:
35
+ """Single chat interaction for logging."""
36
+ timestamp: str
37
+ query: Dict[str, Any]
38
+ retrieval: Dict[str, Any]
39
+ response: Dict[str, str]
40
+ performance: Dict[str, Any]
41
+ chat_context: Dict[str, Any]
42
+ system_info: Dict[str, Any]
43
+
44
+ @dataclass
45
+ class RetrievalStats:
46
+ """Statistics about document retrieval."""
47
+ total_documents: int
48
+ document_types: Dict[str, int]
49
+ search_config: Dict[str, Any]
50
+ queries_used: List[str]
51
+
52
+ class EmbeddingConfig(BaseModel):
53
+ """Configuration for embeddings."""
54
+ model: str = "text-embedding-3-small"
55
+ chunk_size: int = 1000
56
+ max_retries: int = 3
57
+ request_timeout: int = 60
58
+
59
+ class ModelConfig(BaseModel):
60
+ """Configuration for LLM models."""
61
+ model_name: str = "gpt-4o-mini"
62
+ temperature: float = 0.1
63
+ max_tokens: Optional[int] = None
64
+
65
+ class VectorStoreConfig(BaseModel):
66
+ """Configuration for vector store."""
67
+ persist_directory: str = "./data/chroma"
68
+ collection_name: str = "course_docs"
69
+ collection_metadata: Dict[str, str] = Field(default_factory=lambda: {"hnsw:space": "cosine"})
70
+
71
+ class RetrievalConfig(BaseModel):
72
+ """Configuration for retrieval."""
73
+ search_type: str = "mmr"
74
+ k_values: Dict[str, int] = Field(default_factory=lambda: {
75
+ "course": 6,
76
+ "program": 15,
77
+ "both": 15
78
+ })
79
+ fetch_k_multiplier: int = 3
80
+
81
+ @dataclass
82
+ class ProcessingStats:
83
+ """Statistics about document processing."""
84
+ total_documents: int
85
+ courses_processed: int
86
+ programs_processed: int
87
+ chunks_created: int
88
+ processing_time: float
89
+
90
+ def to_dict(self) -> Dict[str, Any]:
91
+ """Convert to dictionary."""
92
+ return asdict(self)
93
+
94
+ class ChatMemoryMessage(BaseModel):
95
+ """Message in chat memory."""
96
+ role: str
97
+ content: str
98
+ timestamp: Optional[str] = None
99
+
100
+ class SystemStatus(BaseModel):
101
+ """System status information."""
102
+ database_initialized: bool = False
103
+ documents_loaded: int = 0
104
+ model_version: str = ""
105
+ embedding_version: str = ""
106
+ last_updated: Optional[str] = None
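
These models are plain containers; a brief sketch of how a few of them are instantiated, with illustrative values:

from models import RouteQuery, QueryResult, ProcessingStats

route = RouteQuery(content_type="course")
result = QueryResult(answer="...", source_documents=[], content_type=route.content_type)
stats = ProcessingStats(total_documents=10, courses_processed=8,
                        programs_processed=2, chunks_created=120, processing_time=3.5)
print(stats.to_dict())
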
src/rag_service.py ADDED
@@ -0,0 +1,569 @@
+ import os
+ import time
+ from typing import List, Dict, Any, Optional
+
+ # LangChain imports using modern patterns
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
+ from langchain_chroma import Chroma
+ from langchain_core.documents import Document
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_core.runnables import RunnablePassthrough, RunnableParallel
+ from langchain_core.messages import HumanMessage, AIMessage
+
+ # Local imports
+ from config import Config, PromptTemplates, validate_config
+ from models import RouteQuery, QueryResult, RetrievalStats
+ from document_processor import DocumentProcessor
+ from chat_logger import ChatLogger
+
+ class RAGService:
+     """Modern RAG service using LangChain Expression Language (LCEL)."""
+
+     def __init__(self, base_path: str = None):
+         """Initialize the RAG service.
+
+         Args:
+             base_path: Base path for documents and vector store
+         """
+         # Validate configuration
+         validate_config()
+
+         self.base_path = base_path or Config.DATA_BASE_PATH
+         self.chat_logger = ChatLogger()
+         self.conversation_memory = []  # Simple in-memory conversation storage
+
+         # Initialize components
+         self._initialize_models()
+         self._initialize_vector_store()
+         self._setup_chains()
+
+         # Track last generated queries for logging
+         self.last_generated_queries = []
+
+     def _initialize_models(self):
+         """Initialize LLM and embedding models."""
+         print("Initializing AI models...")
+
+         # Initialize LLM
+         self.llm = ChatOpenAI(
+             model=Config.MODEL_NAME,
+             temperature=Config.TEMPERATURE,
+             api_key=Config.OPENAI_API_KEY
+         )
+
+         # Initialize embeddings with better error handling
+         self.embeddings = OpenAIEmbeddings(
+             api_key=Config.OPENAI_API_KEY,
+             model=Config.EMBEDDING_MODEL,
+             chunk_size=Config.EMBEDDING_CHUNK_SIZE,
+             max_retries=Config.EMBEDDING_MAX_RETRIES,
+             request_timeout=Config.EMBEDDING_REQUEST_TIMEOUT
+         )
+
+         print("✅ AI models initialized successfully")
+
+     def _initialize_vector_store(self):
+         """Initialize the vector store (empty initially)."""
+         self.vector_store = None
+         print("Vector store placeholder initialized")
+
+     def _setup_chains(self):
+         """Set up all the LCEL chains."""
+         print("Setting up LangChain LCEL chains...")
+
+         # Router chain
+         router_prompt = ChatPromptTemplate.from_messages([
+             ("system", PromptTemplates.ROUTER_SYSTEM_TEMPLATE),
+             ("human", "{question}")
+         ])
+         self.router_chain = router_prompt | self.llm.with_structured_output(RouteQuery)
+
+         # Query generation chains for different content types
+         self.query_generation_chains = {}
+         for content_type in ["course", "program", "both"]:
+             template = PromptTemplates.get_query_template(content_type)
+             prompt = ChatPromptTemplate.from_template(template)
+             self.query_generation_chains[content_type] = prompt | self.llm | StrOutputParser()
+
+         # Main QA chain
+         qa_prompt = ChatPromptTemplate.from_messages([
+             ("system", PromptTemplates.SYSTEM_TEMPLATE),
+             ("human", "{question}")
+         ])
+
+         # This will be completed when vector store is loaded
+         self.qa_chain = None
+
+         print("✅ LCEL chains set up successfully")
+
+     def load_documents(self) -> int:
+         """Load and process documents, create or load vector store.
+
+         Returns:
+             Number of document chunks processed
+         """
+         try:
+             print(f"Checking for existing database at: {Config.CHROMA_DB_PATH}")
+
+             if os.path.exists(Config.CHROMA_DB_PATH) and os.listdir(Config.CHROMA_DB_PATH):
+                 print("Existing database found, attempting to load...")
+                 count = self._load_existing_database()
+                 if count == 0:
+                     print("⚠️ Existing database is empty, rebuilding...")
+                     return self._create_new_database()
+                 return count
+             else:
+                 print("No existing database found, creating new one...")
+                 return self._create_new_database()
+
+         except Exception as e:
+             print(f"Error loading documents: {str(e)}")
+             raise
+
+     def _load_existing_database(self) -> int:
+         """Load existing vector database.
+
+         Returns:
+             Number of documents in the database
+         """
+         print("Loading existing embeddings from Chroma database...")
+
+         try:
+             self.vector_store = Chroma(
+                 persist_directory=Config.CHROMA_DB_PATH,
+                 embedding_function=self.embeddings,
+                 collection_metadata={"hnsw:space": "cosine"},
+                 collection_name=Config.COLLECTION_NAME
+             )
+
+             # Get collection size
+             collection_data = self.vector_store.get()
+             collection_size = len(collection_data['ids'])
+
+             if collection_size == 0:
+                 print("Database exists but is empty")
+                 return 0
+
+             print(f"✅ Loaded {collection_size} existing document chunks from database")
+             self._setup_qa_chain()
+             return collection_size
+
+         except Exception as e:
+             print(f"Error loading existing database: {str(e)}")
+             return 0
+
+     def _create_new_database(self) -> int:
+         """Create new vector database from documents.
+
+         Returns:
+             Number of document chunks processed
+         """
+         print("Creating new embeddings (this will incur OpenAI API costs)...")
+
+         # Process documents
+         processor = DocumentProcessor(self.base_path)
+         documents = processor.process_all_documents()
+
+         if not documents:
+             raise ValueError("No documents found to process")
+
+         # Chunk documents
+         chunks = processor.chunk_documents(documents)
+
+         # Initialize empty vector store
+         self.vector_store = Chroma(
+             embedding_function=self.embeddings,
+             persist_directory=Config.CHROMA_DB_PATH,
+             collection_metadata={"hnsw:space": "cosine"},
+             collection_name=Config.COLLECTION_NAME
+         )
+
+         # Process documents in batches to avoid token limits
+         total_processed = self._process_documents_in_batches(chunks)
+
+         print(f"✅ Database creation completed! Processed {total_processed} documents.")
+         self._setup_qa_chain()
+         return total_processed
+
+     def _process_documents_in_batches(self, chunks: List[Document]) -> int:
+         """Process documents in batches to avoid API limits.
+
+         Args:
+             chunks: List of document chunks to process
+
+         Returns:
+             Number of successfully processed chunks
+         """
+         batch_size = Config.BATCH_SIZE
+         total_processed = 0
+
+         print(f"Processing {len(chunks)} document chunks in batches of {batch_size}...")
+
+         for i in range(0, len(chunks), batch_size):
+             batch = chunks[i:i + batch_size]
+             batch_num = i // batch_size + 1
+             total_batches = (len(chunks) + batch_size - 1) // batch_size
+
+             print(f"Processing batch {batch_num}/{total_batches} ({len(batch)} documents)")
+
+             try:
+                 self.vector_store.add_documents(batch)
+                 total_processed += len(batch)
+                 print(f"✅ Successfully processed {len(batch)} documents (Total: {total_processed})")
+
+                 # Small delay to be nice to the API
+                 time.sleep(1)
+
+             except Exception as e:
+                 print(f"❌ Error processing batch {batch_num}: {str(e)}")
+                 # Continue with next batch instead of failing completely
+                 continue
+
+         return total_processed
+
+     def _setup_qa_chain(self):
+         """Set up the main QA chain with retriever."""
+         if not self.vector_store:
+             raise ValueError("Vector store not initialized")
+
+         # Create the main QA chain using LCEL
+         qa_prompt = ChatPromptTemplate.from_messages([
+             ("system", PromptTemplates.SYSTEM_TEMPLATE),
+             ("human", "{question}")
+         ])
+
+         def format_docs(docs):
+             """Format retrieved documents for the prompt."""
+             return "\n\n".join([d.page_content for d in docs])
+
+         def format_chat_history(memory):
+             """Format chat history for the prompt."""
+             if not memory:
+                 return "No previous conversation."
+
+             formatted = []
+             for msg in memory[-6:]:  # Last 6 messages (3 exchanges)
+                 if isinstance(msg, dict):
+                     role = msg.get('role', 'unknown')
+                     content = msg.get('content', '')
+                 elif hasattr(msg, 'type') and hasattr(msg, 'content'):
+                     role = msg.type
+                     content = msg.content
+                 else:
+                     continue
+                 formatted.append(f"{role}: {content}")
+
+             return "\n".join(formatted)
+
+         # Create retriever (will be configured per query)
+         self.base_retriever = self.vector_store.as_retriever()
+
+         # The QA chain will be constructed per query with specific retriever config
+         self.qa_prompt = qa_prompt
+         self.format_docs = format_docs
+         self.format_chat_history = format_chat_history
+
+         print("✅ QA chain set up successfully")
+
+     def route_query(self, question: str) -> str:
+         """Route the query to determine content type.
+
+         Args:
+             question: User's question
+
+         Returns:
+             Content type: 'course', 'program', or 'both'
+         """
+         try:
+             result = self.router_chain.invoke({"question": question})
+             return result.content_type
+         except Exception as e:
+             print(f"Error in query routing: {str(e)}")
+             return "both"  # Default to both if routing fails
+
+     def generate_query_variations(self, question: str, content_type: str) -> List[str]:
+         """Generate multiple query variations for better retrieval.
+
+         Args:
+             question: Original question
+             content_type: Content type from routing
+
+         Returns:
+             List of query variations
+         """
+         try:
+             chain = self.query_generation_chains[content_type]
+             variations = chain.invoke({"question": question})
+
+             # Process and clean the variations
+             queries = [q.strip() for q in variations.split('\n') if q.strip()]
+
+             # Always include the original question
+             if question not in queries:
+                 queries.append(question)
+
+             # Store for logging
+             self.last_generated_queries = queries
+
+             return queries
+
+         except Exception as e:
+             print(f"Error generating query variations: {str(e)}")
+             # Fallback to original question
+             self.last_generated_queries = [question]
+             return [question]
+
+     def retrieve_documents(self, question: str, content_type: str) -> List[Document]:
+         """Retrieve relevant documents using multiple query variations.
+
+         Args:
+             question: Original question
+             content_type: Content type from routing
+
+         Returns:
+             List of relevant documents
+         """
+         if not self.vector_store:
+             raise ValueError("Vector store not initialized. Please load documents first.")
+
+         # Generate query variations
+         queries = self.generate_query_variations(question, content_type)
+
+         print(f"\nGenerated queries for '{question}':")
+         for q in queries:
+             print(f" • {q}")
+
+         # Configure retriever based on content type
+         k = Config.RETRIEVAL_K_VALUES[content_type]
+
+         # Create metadata filter if needed
+         search_kwargs = {
+             "k": k,
+             "fetch_k": k * 3  # Fetch more candidates for MMR
+         }
+
+         if content_type != "both":
+             search_kwargs["filter"] = {"doc_type": content_type}
+
+         # Configure retriever
+         retriever = self.vector_store.as_retriever(
+             search_type="mmr",
+             search_kwargs=search_kwargs
+         )
+
+         # Retrieve documents for each query variation
+         all_docs = []
+         for query in queries:
+             try:
+                 docs = retriever.invoke(query)
+                 all_docs.extend(docs)
+             except Exception as e:
+                 print(f"Error retrieving for query '{query}': {str(e)}")
+                 continue
+
+         # Remove duplicates while preserving order
+         unique_docs = []
+         seen_content = set()
+
+         for doc in all_docs:
+             # Create a unique identifier from content and source
+             doc_id = f"{doc.page_content[:100]}_{doc.metadata.get('source', '')}"
+             if doc_id not in seen_content:
+                 seen_content.add(doc_id)
+                 unique_docs.append(doc)
+
+         # Log retrieval statistics
+         doc_types = [doc.metadata.get('doc_type', 'unknown') for doc in unique_docs]
+         print(f"\nRetrieved {len(unique_docs)} unique documents:")
+         print(f" • Courses: {doc_types.count('course')}")
+         print(f" • Programs: {doc_types.count('program')}")
+
+         return unique_docs
+
+     def query(self, question: str) -> QueryResult:
+         """Process a user query and return response.
+
+         Args:
+             question: User's question
+
+         Returns:
+             QueryResult with answer and metadata
+         """
+         if not self.vector_store:
+             raise ValueError("Model not initialized. Please load documents first.")
+
+         start_time = time.time()
+
+         try:
+             # Route the query
+             content_type = self.route_query(question)
+             print(f"Query routed as: {content_type}")
+
+             # Retrieve relevant documents
+             docs = self.retrieve_documents(question, content_type)
+
+             # Format context and chat history
+             context = self.format_docs(docs)
+             chat_history = self.format_chat_history(self.conversation_memory)
+
+             # Generate answer using LCEL
+             chain = self.qa_prompt | self.llm | StrOutputParser()
+             answer = chain.invoke({
+                 "context": context,
+                 "question": question,
+                 "chat_history": chat_history
+             })
+
+             # Update conversation memory
+             self.conversation_memory.extend([
+                 {"role": "human", "content": question},
+                 {"role": "assistant", "content": answer}
+             ])
+
+             # Keep memory within reasonable size
+             if len(self.conversation_memory) > 12:  # Keep last 6 exchanges
+                 self.conversation_memory = self.conversation_memory[-12:]
+
+             # Format sources
+             sources = self._format_sources(docs)
+             if sources:
+                 answer += sources
+
+             # Calculate processing time
+             processing_time = time.time() - start_time
+
+             # Create result
+             result = QueryResult(
+                 answer=answer,
+                 source_documents=docs,
+                 content_type=content_type,
+                 processing_time=processing_time,
+                 generated_queries=self.last_generated_queries
+             )
+
+             # Log the interaction
+             self._log_interaction(question, result)
+
+             return result
+
+         except Exception as e:
+             error_msg = f"Error processing query: {str(e)}"
+             print(error_msg)
+             return QueryResult(
+                 answer=error_msg,
+                 source_documents=[],
+                 content_type="error",
+                 processing_time=time.time() - start_time
+             )
+
+     def _format_sources(self, docs: List[Document]) -> str:
+         """Format source documents for display.
+
+         Args:
+             docs: Retrieved documents
+
+         Returns:
+             Formatted sources string
+         """
+         if not docs:
+             return ""
+
+         # Get unique sources
+         sources = list(set(
+             os.path.basename(doc.metadata.get("source", ""))
+             for doc in docs if doc.metadata.get("source")
+         ))
+         sources = sorted(sources)
+
+         if not sources:
+             return ""
+
+         sources_text = ""
+         if len(sources) > 2:
+             # Show only first 2 sources with expandable section for more
+             visible_sources = sources[:2]
+             hidden_sources = sources[2:]
+             sources_text += "\n\nSources:"
+             for source in visible_sources:
+                 sources_text += f"\n• {source}"
+             sources_text += f"\n<details><summary>**See {len(hidden_sources)} more sources...**</summary>\n"
+             for source in hidden_sources:
+                 sources_text += f"\n• {source}"
+             sources_text += "\n</details>"
+         else:
+             # If 2 or fewer sources, show all
+             sources_text += "\n\nSources:"
+             for source in sources:
+                 sources_text += f"\n• {source}"
+
+         return sources_text
+
+     def _log_interaction(self, question: str, result: QueryResult):
+         """Log the interaction for analysis.
+
+         Args:
+             question: User's question
+             result: Query result
+         """
+         try:
+             system_info = {
+                 "model_version": Config.MODEL_NAME,
+                 "embedding_version": Config.EMBEDDING_MODEL,
+                 "search_config": {
+                     "search_type": "mmr",
+                     "k_value": Config.RETRIEVAL_K_VALUES.get(result.content_type),
+                     "content_type": result.content_type
+                 }
+             }
+
+             self.chat_logger.log_interaction(
+                 question=question,
+                 answer=result.answer,
+                 source_documents=result.source_documents,
+                 content_type=result.content_type,
+                 generated_queries=result.generated_queries or [],
+                 processing_time=result.processing_time or 0,
+                 chat_history=self.conversation_memory,
+                 system_info=system_info
+             )
+         except Exception as e:
+             print(f"Error logging interaction: {str(e)}")
+
+     def get_system_status(self) -> Dict[str, Any]:
+         """Get current system status.
+
+         Returns:
+             Dictionary with system status information
+         """
+         status = {
+             "database_initialized": self.vector_store is not None,
+             "model_version": Config.MODEL_NAME,
+             "embedding_version": Config.EMBEDDING_MODEL,
+             "conversation_length": len(self.conversation_memory),
+             "last_queries": self.last_generated_queries
+         }
+
+         if self.vector_store:
+             try:
+                 collection_data = self.vector_store.get()
+                 status["documents_loaded"] = len(collection_data['ids'])
+             except Exception:  # avoid a bare except so interrupts still propagate
+                 status["documents_loaded"] = "unknown"
+         else:
+             status["documents_loaded"] = 0
+
+         return status
+
+     def clear_conversation_memory(self):
+         """Clear the conversation memory."""
+         self.conversation_memory = []
+         print("Conversation memory cleared")
+
+     def get_conversation_history(self) -> List[Dict[str, str]]:
+         """Get the current conversation history.
+
+         Returns:
+             List of conversation messages
+         """
+         return self.conversation_memory.copy()
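
As a rough usage sketch of the class added above (not part of the committed file): the module name `rag_service` and the example question below are assumptions, and the `Config` values plus `OPENAI_API_KEY` must already be provided as `config.py` expects.

# Hypothetical driver script, assuming the file above is saved as rag_service.py
from rag_service import RAGService

service = RAGService()             # validates config, builds models and LCEL chains
chunks = service.load_documents()  # loads the existing Chroma store or embeds documents
print(f"Vector store ready with {chunks} chunks")

result = service.query("Which courses cover machine learning?")
print(result.answer)               # answer text with the Sources section appended
print(result.content_type, result.processing_time)

service.clear_conversation_memory()  # reset the in-memory chat history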