| | |
| | """ |
| | American University Academic Advisor Chatbot |
| | =========================================== |
| | |
| | A RAG-based chatbot system that answers questions about American University academic programs, |
| | leveraging ChromaDB for vector retrieval and Mistral 7B for response generation. |
| | |
| | Features: |
| | --------- |
| | - Course requirement pattern recognition: Distinguishes between required courses, alternative |
| | options ("take either X or Y"), option groups, and true electives |
| | - Academic terminology matching: Connects student questions using "required" to program |
| | descriptions using "must complete" |
| | - Specialized formatting for course requirements: Organizes courses by type with clear labels |
| | - Response generation using Mistral 7B: Creates natural language responses with source citations |
| | - Conversation history tracking: Maintains context across multiple questions |
| | |
| | Usage: |
| | ------ |
| | 1. Command line: |
| | python chatbot.py |
| | |
| | 2. Import in another script: |
| | from chatbot import ask_question |
| | result = ask_question("What are the required courses for the Data Science program?") |
| | print(result["response"]) |
| | |
| | 3. Clear conversation history: |
| | from chatbot import clear_conversation |
| | clear_conversation() |
| | |
| | Requirements: |
| | ------------ |
| | - Python 3.8+ |
| | - ChromaDB for vector storage and retrieval |
| | - Hugging Face API access for Mistral 7B |
| | - Keyring (optional) for secure API key storage |
| | |
| | Configuration: |
| | ------------- |
| | The system needs a Hugging Face API key for generating responses. Set it using: |
| | |
| | keyring.set_password("HF_API_KEY", "rressler", "<your_api_key>") |
| | |
Or create a .env file with:
| | |
| | HF_API_KEY=<your_api_key> |
| | |
| | Note: |
| | ----- |
| | This implementation is designed specifically for academic program queries that |
| | involve distinguishing between required courses and alternatives. It uses |
| | specialized detection for patterns like "STAT-320 or STAT-302" to correctly |
| | inform students about their course options. |
| | """ |
| |
|
| | |
| |
|
| | import os |
| | import sys |
| | import re |
| | from pathlib import Path |
| | import logging |
| | import requests |
| | import json |
| | import math |
| | import warnings |
| | from typing import List, Dict, Tuple, Any, Optional |
| |
|
| | |
# Silence FutureWarnings emitted by third-party dependencies so they do not
# clutter interactive chatbot sessions.
warnings.filterwarnings("ignore", category=FutureWarning)

# Project-local helpers: logging setup, ChromaDB access, and Hugging Face auth.
from utils.logging_utils import setup_logging
from utils.chroma_utils import get_chroma_manager
from utils.auth_utils import authenticate_huggingface

# Module-wide logger; writes to chatbot.log.
logger = setup_logging(logger_name="Chatbot", log_filename="chatbot.log")
| |
|
def configure_api_credentials() -> Tuple[Optional[str], str, Optional[Dict[str, str]]]:
    """
    Configure Hugging Face API credentials using a unified method.

    The API key and request headers come from the shared
    authenticate_huggingface() helper; the Mistral endpoint can be
    overridden via the MISTRAL_API_URL environment variable.

    Returns:
        Tuple: (API key, Model URL, Headers)

    Raises:
        Exception: re-raised from authenticate_huggingface() after logging.
    """
    default_url = (
        "https://api-inference.huggingface.co/models/"
        "mistralai/Mistral-7B-Instruct-v0.3"
    )
    try:
        api_key, request_headers = authenticate_huggingface()
    except Exception as exc:
        logger.warning(f"Authentication failed: {exc}")
        raise
    return api_key, os.getenv("MISTRAL_API_URL", default_url), request_headers
| |
|
| | |
# Configure API credentials once at import time; fall back to None so the
# module can still be imported (e.g. for retrieval-only use) without a key.
try:
    HF_API_KEY, MISTRAL_API_URL, MISTRAL_HEADERS = configure_api_credentials()
except Exception as e:
    logger.error(f"Failed to configure API credentials: {e}")
    HF_API_KEY, MISTRAL_API_URL, MISTRAL_HEADERS = None, None, None

# Shared ChromaDB manager reused by all retrieval helpers in this module.
global_chroma_manager = get_chroma_manager(model_size="medium")
# Was a leftover debug print() to stdout; route through the logger instead.
logger.debug("Chroma manager type: %s", type(global_chroma_manager))
| |
|
def classify_course_level(course_code):
    """
    Classify course level based on course number.

    Args:
        course_code (str): The course code (e.g., "MATH-221", "STAT-615").
            Accepts "DEPT-NUM", "DEPT NUM", "DEPT.NUM", or "DEPTNUM" forms.

    Returns:
        dict: Dictionary with course_level and level_description. Anything
            unparseable yields the "unknown" classification rather than
            raising.
    """
    classification = {
        "course_level": "unknown",
        "level_description": "Unknown course level"
    }

    try:
        # Split into (department, number) on the first recognized separator;
        # fall back to a letters-then-digits regex for compact "DEPTNUM" codes.
        # (Uses the module-level `re` import; the old function-local
        # `import re` shadowing it has been removed.)
        if '-' in course_code:
            parts = course_code.split('-')
        elif ' ' in course_code:
            parts = course_code.split(' ')
        elif '.' in course_code:
            parts = course_code.split('.')
        else:
            match = re.match(r'^([A-Za-z]+)(\d+)$', course_code)
            if match:
                parts = [match.group(1), match.group(2)]
            else:
                return classification

        if len(parts) < 2:
            return classification

        # Keep only the digits of the number part (drops suffixes like "427A");
        # int("") raises ValueError, which is handled below.
        course_num_str = ''.join(c for c in parts[1].strip() if c.isdigit())
        course_num = int(course_num_str)

        # Numbering convention: <500 undergraduate, 500s graduate open to
        # undergraduates, 600s core graduate, 700s advanced graduate.
        if course_num <= 499:
            classification["course_level"] = "undergraduate"
            classification["level_description"] = "Undergraduate course"
        elif 500 <= course_num <= 599:
            classification["course_level"] = "graduate_open"
            classification["level_description"] = "Graduate course open to qualified undergraduate students"
        elif 600 <= course_num <= 699:
            classification["course_level"] = "graduate_core"
            classification["level_description"] = "Core graduate course for the master's degree in the field of study"
        elif 700 <= course_num <= 799:
            classification["course_level"] = "graduate_advanced"
            classification["level_description"] = "Advanced graduate course"
        else:
            classification["course_level"] = "other"
            classification["level_description"] = f"Course number {course_num} outside standard classification"

    except (ValueError, TypeError, AttributeError):
        # Best effort: unparseable or non-string input falls through to
        # "unknown". Narrowed from a silent `except Exception` so genuine
        # bugs are no longer swallowed.
        pass

    return classification
| |
|
def extract_courses_from_results(results):
    """
    Extract course information from the query results with level classification.

    Args:
        results (dict): Results from ChromaDB query (nested lists under
            "documents" and "metadatas", one inner list per query).

    Returns:
        list: List of course objects with code, title, credits, type, and
            level classification. Duplicate course codes across documents
            are reported only once (first occurrence wins).
    """
    # DEPT-NUM, a title up to the first "(", and optional "(credits)".
    course_pattern = re.compile(
        r'([A-Z]{2,4}-\d{3})\s+([^(]+)(?:\s*\((\d+(?:\.\d+)?)\))?'
    )

    extracted = []
    seen_codes = set()

    for doc, metadata in zip(results["documents"][0], results["metadatas"][0]):
        section_type = metadata.get("section_type", "unknown")

        # Match line by line so a title never bleeds across newlines.
        for line in doc.split('\n'):
            for code, title, credits in course_pattern.findall(line):
                code = code.strip()
                if code in seen_codes:
                    continue
                seen_codes.add(code)

                level_info = classify_course_level(code)
                extracted.append({
                    "code": code,
                    "title": title.strip(),
                    "credits": credits if credits else "N/A",
                    "type": section_type,
                    "course_level": level_info["course_level"],
                    "level_description": level_info["level_description"],
                })

    return extracted
| |
|
def format_courses_for_display(courses):
    """
    Format the courses into a readable string with level information.

    Courses are grouped by requirement type (required / alternative options /
    option groups / electives) and, within each group, ordered and sub-headed
    by course level.

    Args:
        courses (list): List of course dicts as produced by
            extract_courses_from_results (keys: code, title, credits, type,
            course_level, level_description; small_option_group entries may
            also carry group_id).

    Returns:
        str: Formatted string with course information grouped by type and level
    """
    if not courses:
        return "No courses found."

    # Display order of course levels within each section (previously this
    # table and the sort/group loop were duplicated four times).
    level_priority = {
        "undergraduate": 1,
        "graduate_open": 2,
        "graduate_core": 3,
        "graduate_advanced": 4,
        "other": 5,
        "unknown": 6,
    }

    def _sorted_by_level(course_list):
        """Order courses by level_priority; unrecognized levels sort last."""
        return sorted(
            course_list,
            key=lambda c: level_priority.get(c.get("course_level", "unknown"), 999),
        )

    def _append_level_grouped(output, course_list):
        """Append course lines, inserting a level sub-header when the level changes."""
        current_level = None
        for course in _sorted_by_level(course_list):
            level = course.get("course_level", "unknown")
            level_desc = course.get("level_description", "")
            if level != current_level:
                current_level = level
                if level_desc:
                    output.append(f"\n{level_desc.upper()}:")
            output.append(f"- {course['code']} {course['title']} ({course['credits']})")

    grouped_courses = {
        "required_courses": [],
        "elective_courses": [],
        "option_group": [],
        "small_option_group": [],
    }
    for course in courses:
        course_type = course["type"]
        if course_type in grouped_courses:
            grouped_courses[course_type].append(course)

    output = []

    if grouped_courses["required_courses"]:
        output.append("**Required Courses:**")
        output.append("These courses must be completed by all students in the program:")
        _append_level_grouped(output, grouped_courses["required_courses"])
        output.append("")

    if grouped_courses["small_option_group"]:
        output.append("**Alternative Course Options:**")
        output.append("Students must complete ONE course from each of these groups:")

        # Bucket by group_id; a course without one inherits the previous
        # course's id (the first group defaults to 1).
        group_id = 1
        groups = {}
        for course in grouped_courses["small_option_group"]:
            group_id = course.get("group_id", group_id)
            groups.setdefault(group_id, []).append(course)

        for group_id, course_list in groups.items():
            output.append(f"\nOption Group {group_id}:")
            for course in _sorted_by_level(course_list):
                level_desc = course.get("level_description", "")
                output.append(
                    f"- {course['code']} {course['title']} ({course['credits']}) - {level_desc}"
                )
        output.append("")

    if grouped_courses["option_group"]:
        output.append("**Option Groups:**")
        output.append("Students must select courses from the following groups according to program requirements:")
        _append_level_grouped(output, grouped_courses["option_group"])
        output.append("")

    if grouped_courses["elective_courses"]:
        output.append("**Elective Courses:**")
        output.append("Students may choose from these optional courses to fulfill elective requirements:")
        _append_level_grouped(output, grouped_courses["elective_courses"])

    return "\n".join(output)
| |
|
def process_program_query(query, program_name=None):
    """
    Check if the query is about program requirements or courses and extract program name.

    Args:
        query (str): The user's query
        program_name (str, optional): Pre-identified program name

    Returns:
        dict: Query-intent info with keys is_course_query (bool),
            course_type ('required_courses', 'elective_courses', 'all', or
            None), program_name (str or None), and query_type ("invalid",
            "general", "course_requirements", or "program_requirements").
    """
    # Single entry log (previously two duplicate INFO lines, one carrying a
    # stale hard-coded source-line reference).
    logger.info(f"[process_program_query] Processing query: {repr(query)}")

    # Defensive guard: the regex matching below requires a string.
    if not isinstance(query, str):
        logger.warning(f"Query is not a string! Got {type(query)}: {repr(query)}")
        return {
            "is_course_query": False,
            "course_type": None,
            "program_name": program_name,
            "query_type": "invalid"
        }

    query_lower = query.lower()
    result = {
        "is_course_query": False,
        "course_type": None,
        "program_name": program_name,
        "query_type": "general"
    }

    course_query_patterns = [
        # "what are the required/elective/... courses for <program>"
        # (two groups: course type, program name)
        r'what(?:\s+are)?(?:\s+the)?\s+(required|core|elective|optional|must[\s-]complete)\s+courses\s+for\s+(?:the\s+)?(.+?)(?:\s+program|\s+major|\s+degree|\s+minor)?$',

        # "<program> program requirements/courses" (one group: program name)
        r'(?:the\s+)?(.+?)(?:\s+program|\s+major|\s+degree|\s+minor)(?:\s+requirements|(?:\s+)courses)',

        # "what courses do I need to take for <program>" (one group)
        r'what\s+courses\s+(?:do\s+I|does\s+one|should\s+I)\s+(?:need\s+to|have\s+to|must)\s+(?:take|complete)\s+for\s+(?:the\s+)?(.+?)(?:\s+program|\s+major|\s+degree|\s+minor)?',

        # "what do I have to complete for <program>" (one group)
        r'what(?:\s+courses)?\s+(?:do\s+I|does\s+one|should\s+I)\s+(?:have\s+to|need\s+to|must)\s+complete\s+for\s+(?:the\s+)?(.+?)(?:\s+program|\s+major|\s+degree|\s+minor)?'
    ]

    for pattern in course_query_patterns:
        match = re.search(pattern, query_lower)
        if match:
            result["is_course_query"] = True

            if len(match.groups()) > 1:
                # Two-group pattern: group 1 is the course type, group 2 the program.
                course_type = match.group(1)
                program_name = match.group(2)

                if course_type in ['required', 'core', 'must-complete', 'must complete']:
                    result["course_type"] = 'required_courses'
                elif course_type in ['elective', 'optional']:
                    result["course_type"] = 'elective_courses'
                else:
                    result["course_type"] = 'all'

                result["program_name"] = program_name
                result["query_type"] = "course_requirements"
                break
            elif len(match.groups()) == 1:
                # One-group pattern: only the program name is captured.
                program_name = match.group(1)
                result["program_name"] = program_name
                result["course_type"] = 'all'
                result["query_type"] = "program_requirements"
                break

    return result
| |
|
def expand_query_with_academic_terms(query):
    """
    Expand the query with alternate academic terminology to improve retrieval.

    Identifies key terms in the query (case-insensitive substring match) and
    appends common academic synonyms/alternate phrasings, focusing especially
    on course-requirement terminology, so a question phrased with "required"
    can still retrieve program text phrased with "must complete".

    Args:
        query (str): The original user query

    Returns:
        str: The query with matched synonym lists appended, or the original
            query unchanged when no mapped term occurs in it.
    """
    academic_term_mappings = {
        "required": ["must complete", "must take", "mandatory", "core", "required", "requirement", "capstone"],
        "elective": ["optional", "elective", "choice", "select from"],
        "prerequisite": ["prereq", "prerequisite", "before taking", "prior to"],
        "corequisite": ["coreq", "corequisite", "concurrent", "alongside"],
        "credit": ["credit hour", "credit", "unit"],
        "major": ["major", "program", "degree", "concentration"],
        "minor": ["minor", "secondary field"],
        "course": ["course", "class", "subject"]
    }

    # (Removed a per-call INFO log of the raw query: it duplicated the
    # caller's logging and added noise on every retrieval.)
    query_lower = query.lower()
    expanded_terms = []
    for term, synonyms in academic_term_mappings.items():
        if term in query_lower:
            expanded_terms.extend(synonyms)

    if expanded_terms:
        # Append rather than replace: keeps the original wording's weight in
        # the embedding while broadening recall.
        return f"{query} {' '.join(expanded_terms)}"

    return query
| |
|
def get_program_courses(program_name, course_type='all', n_results=10):
    """
    Get specific course information for a program based on course type.

    Args:
        program_name (str): Name of the academic program
        course_type (str): Type of courses to retrieve ('required_courses',
                          'elective_courses', 'option_group', 'small_option_group', or 'all')
        n_results (int): Number of results to return

    Returns:
        dict: Results containing course information
    """
    chroma_manager = global_chroma_manager

    # Filter on section type; 'all' accepts any of the four course-bearing
    # section types.
    if course_type == 'all':
        section_filter = {
            "$or": [
                {"section_type": "required_courses"},
                {"section_type": "elective_courses"},
                {"section_type": "option_group"},
                {"section_type": "small_option_group"}
            ]
        }
    else:
        section_filter = {"section_type": course_type}

    if program_name and program_name.lower() != "any":
        query = f"{course_type} for {program_name} program"

        # ChromaDB `where` filters require exactly one top-level operator, so
        # all conditions are combined under a single "$and" (the previous
        # version mixed "$or"/"$and"/plain keys in one dict).
        # NOTE(review): "$contains" on metadata fields is assumed to be
        # supported by the chroma_utils wrapper -- TODO confirm (stock Chroma
        # metadata filters use $eq/$in; $contains applies to documents).
        where_clause = {
            "$and": [
                section_filter,
                {"type": "program"},
                {"$or": [
                    {"program_name": {"$contains": program_name.lower()}},
                    {"parent_title": {"$contains": program_name.lower()}}
                ]}
            ]
        }
    else:
        query = f"{course_type}"
        where_clause = {"$and": [section_filter, {"type": "program"}]}

    # Broaden the query with academic synonyms before embedding.
    expanded_query = expand_query_with_academic_terms(query)

    results = chroma_manager.query(
        query_text=expanded_query,
        where=where_clause,
        n_results=n_results
    )

    return results
| |
|
def get_program_course_information(program_name, course_type='all'):
    """
    Get formatted course information for a program.

    Convenience wrapper: queries ChromaDB, extracts course records, and
    renders them for display in one call.

    Args:
        program_name (str): Name of the academic program
        course_type (str): Type of courses to retrieve

    Returns:
        str: Formatted course information
    """
    query_results = get_program_courses(program_name, course_type, n_results=15)
    return format_courses_for_display(extract_courses_from_results(query_results))
| |
|
| | |
def extract_validated_program_requirements(soup, program_name, department, url, debug_mode=False):
    """
    Extract program requirements with strict validation to avoid mixing electives with requirements.
    Carefully differentiates between similarly named programs.

    Walks requirement-related headers (h2/h3/h4) in the parsed catalog page,
    gathers each header's sibling text up to the next header, and classifies
    the section as capstone / electives / core by keywords in its title.

    Args:
        soup (BeautifulSoup): Parsed HTML content
        program_name (str): Name of the program
        department (str): Department name
        url (str): URL of the page
        debug_mode (bool): Whether to log debug information

    Returns:
        dict: Validated program requirements (core_requirements, electives,
            capstone, total_credits, program_type, core_coverage for
            BS Data Science)
    """
    logger.info(f"Extracting validated requirements for: {program_name}")

    # Result skeleton; sections are appended below as they are discovered.
    requirements = {
        "program_name": program_name,
        "department": department,
        "url": url,
        "core_requirements": [],
        "electives": [],
        "capstone": None,
        "total_credits": 0
    }

    # Classify the program into a known type so program-specific validation
    # (e.g. the BS Data Science capstone/core checks) can be applied.
    # Note: only exact normalized matches hit the named types; everything
    # else falls through to "OTHER".
    normalized_program = program_name.lower().strip()

    if normalized_program == "bs data science" or normalized_program == "b.s. data science":
        program_type = "BS_DATA_SCIENCE"
    elif normalized_program == "bs data sciences" or normalized_program == "b.s. data sciences":
        program_type = "BS_DATA_SCIENCES"
    elif normalized_program == "ms data science" or normalized_program == "m.s. data science":
        program_type = "MS_DATA_SCIENCE"
    elif normalized_program == "ms data sciences" or normalized_program == "m.s. data sciences":
        program_type = "MS_DATA_SCIENCES"
    else:
        program_type = "OTHER"

    requirements["program_type"] = program_type

    if debug_mode:
        logger.debug(f"Identified program type: {program_type}")

    # Raw section records (title/content/type) kept for all matched headers,
    # including those classified as "unknown".
    requirement_sections = []

    # Headers whose text mentions any requirement-related keyword.
    requirement_headers = soup.find_all(['h2', 'h3', 'h4'], string=lambda text: text and any(keyword in text.lower()
        for keyword in ['requirement', 'core', 'foundation', 'required', 'curriculum',
                        'major', 'course', 'capstone', 'thesis', 'project', 'elective']))

    for header in requirement_headers:
        section_title = header.get_text(strip=True)
        section_content = []

        # Collect sibling nodes (tags and bare strings) until the next
        # h2/h3/h4 header ends this section.
        current = header.next_sibling
        while current and not (hasattr(current, 'name') and current.name in ['h2', 'h3', 'h4']):
            if hasattr(current, 'get_text'):
                text = current.get_text(strip=True)
                if text:
                    section_content.append(text)
            elif isinstance(current, str) and current.strip():
                section_content.append(current.strip())

            current = current.next_sibling

        if section_content:
            section_text = ' '.join(section_content)

            section_type = "unknown"

            # Classification order matters: capstone keywords win over
            # elective keywords, which win over core keywords.
            if any(keyword in section_title.lower() for keyword in ['capstone', 'thesis', 'project', 'senior']):
                section_type = "capstone"
                # NOTE: if several capstone-like sections exist, the last one
                # encountered overwrites requirements["capstone"].
                requirements["capstone"] = {
                    "title": section_title,
                    "content": section_text,
                    "courses": extract_course_codes(section_text)
                }

                # BS Data Science capstone must mention STAT-427 to be
                # considered validated; other programs are accepted as-is.
                if program_type == "BS_DATA_SCIENCE":
                    if "stat-427" in section_text.lower() or "stat 427" in section_text.lower():
                        requirements["capstone"]["validated"] = True
                        requirements["capstone"]["credits"] = 3
                        requirements["capstone"]["course_title"] = "Statistical Machine Learning"
                    else:
                        requirements["capstone"]["validated"] = False
                else:
                    requirements["capstone"]["validated"] = True

            elif any(keyword in section_title.lower() for keyword in ['elective', 'optional', 'choose']):
                section_type = "electives"
                requirements["electives"].append({
                    "title": section_title,
                    "content": section_text,
                    "courses": extract_course_codes(section_text)
                })

            elif any(keyword in section_title.lower() for keyword in ['requirement', 'core', 'required', 'foundation']):
                section_type = "core"
                requirements["core_requirements"].append({
                    "title": section_title,
                    "content": section_text,
                    "courses": extract_course_codes(section_text)
                })

            # Keep every section (even "unknown") in the flat list.
            requirement_sections.append({
                "title": section_title,
                "content": section_text,
                "type": section_type
            })

    # Total-credit extraction: first pattern that matches the page text wins.
    credit_patterns = [
        r'total\s+of\s+(\d+)\s+credit',
        r'(\d+)\s+credits?\s+(?:are|is)\s+required',
        r'requires\s+(\d+)\s+credits?',
        r'minimum\s+of\s+(\d+)\s+credits?'
    ]

    full_text = soup.get_text()
    for pattern in credit_patterns:
        match = re.search(pattern, full_text, re.IGNORECASE)
        if match:
            try:
                requirements["total_credits"] = int(match.group(1))
                break
            except ValueError:
                pass

    # Program-specific validation: check how many of the expected BS Data
    # Science core courses actually appear in the extracted core sections.
    if program_type == "BS_DATA_SCIENCE":
        expected_core_courses = [
            "MATH-221", "MATH-222", "MATH-313", "STAT-203", "STAT-302",
            "CSC-280", "DATA-320", "STAT-412", "STAT-415"
        ]

        found_courses = []
        for section in requirements["core_requirements"]:
            for course in section["courses"]:
                course_clean = clean_course_code(course)
                if course_clean in expected_core_courses and course_clean not in found_courses:
                    found_courses.append(course_clean)

        # core_coverage is the fraction of expected core courses found;
        # missing_courses is only surfaced in debug logging.
        missing_courses = [c for c in expected_core_courses if c not in found_courses]
        requirements["core_coverage"] = len(found_courses) / len(expected_core_courses)

        if debug_mode:
            logger.debug(f"Found {len(found_courses)}/{len(expected_core_courses)} expected core courses")
            if missing_courses:
                logger.debug(f"Missing core courses: {', '.join(missing_courses)}")

    elif program_type == "MS_DATA_SCIENCE":
        # No MS-specific validation implemented yet.
        pass

    logger.info(f"Extracted {len(requirements['core_requirements'])} core requirement sections, {len(requirements['electives'])} elective sections")

    return requirements
| |
|
def extract_course_codes(text):
    """Extract course codes from text, normalized to DEPT-NUM form."""
    # 2-4 letter department prefix, optional space/hyphen separator, and a
    # 3-4 digit number with optional section letter (e.g. "STAT-427", "csc 280").
    found = re.findall(r'([A-Z]{2,4})[\s\-]?(\d{3,4}[A-Z]?)', text, re.IGNORECASE)
    # Uppercase the department; the number part keeps its original case.
    return [f"{dept.upper()}-{num}" for dept, num in found]
| |
|
def clean_course_code(course_code):
    """Standardize course code format to DEPT-NUM; return the input unchanged if unparseable."""
    parsed = re.match(r'([A-Z]{2,4})[\s\-]?(\d{3,4}[A-Z]?)', course_code, re.IGNORECASE)
    if not parsed:
        return course_code
    return f"{parsed.group(1).upper()}-{parsed.group(2)}"
| |
|
| | |
def retrieve_validated_program_requirements(chroma_manager, program_name, debug_mode=False):
    """
    Retrieve and validate program requirements from ChromaDB.

    First tries a dedicated "program_summary" document for the program; if
    none is found, falls back to assembling requirements from individual
    program sections.

    Args:
        chroma_manager: ChromaDB manager instance
        program_name (str): Name of the program to retrieve
        debug_mode (bool): Whether to log debug information

    Returns:
        dict: Validated program requirements, or None when no matching
            sections exist.
    """
    # Exact-match program-type classification (same table as in
    # extract_validated_program_requirements); unmatched names → "OTHER".
    normalized_program = program_name.lower().strip()

    if normalized_program == "bs data science" or normalized_program == "b.s. data science":
        program_type = "BS_DATA_SCIENCE"
    elif normalized_program == "bs data sciences" or normalized_program == "b.s. data sciences":
        program_type = "BS_DATA_SCIENCES"
    elif normalized_program == "ms data science" or normalized_program == "m.s. data science":
        program_type = "MS_DATA_SCIENCE"
    elif normalized_program == "ms data sciences" or normalized_program == "m.s. data sciences":
        program_type = "MS_DATA_SCIENCES"
    else:
        program_type = "OTHER"

    if debug_mode:
        logger.debug(f"Retrieving requirements for: {program_name} (Type: {program_type})")

    query = f"requirements for {program_name}"

    # Preferred path: a single pre-built program summary document.
    summary_results = chroma_manager.query(
        query_text=query,
        n_results=5,
        metadata_filter={"program_name": program_name, "type": "program", "section_type": "program_summary"}
    )

    if summary_results and len(summary_results['ids']) > 0:
        if debug_mode:
            logger.debug(f"Found program summary for {program_name}")

        # NOTE(review): this treats results['documents'][0] as the summary
        # text itself, whereas extract_courses_from_results treats
        # results["documents"][0] as a *list* of documents. Confirm that the
        # chroma_utils wrapper flattens query results for metadata_filter
        # queries; if not, the substring checks below operate on a list.
        summary_text = summary_results['documents'][0]

        requirements = {
            "program_name": program_name,
            "program_type": program_type,
            "department": summary_results['metadatas'][0].get('department', 'Unknown Department'),
            "core_requirements": [],
            "electives": [],
            "capstone": None
        }

        # The summary document uses literal "REQUIRED COURSES" /
        # "ELECTIVE COURSES" markers to delimit sections.
        if "REQUIRED COURSES" in summary_text:
            core_section = summary_text.split("REQUIRED COURSES")[1].split("ELECTIVE COURSES")[0] if "ELECTIVE COURSES" in summary_text else summary_text.split("REQUIRED COURSES")[1]
            requirements["core_requirements"] = [{
                "title": "Major Requirements",
                "content": core_section,
                "courses": extract_course_codes(core_section)
            }]

        if "ELECTIVE COURSES" in summary_text:
            elective_section = summary_text.split("ELECTIVE COURSES")[1]
            requirements["electives"] = [{
                "title": "Elective Courses",
                "content": elective_section,
                "courses": extract_course_codes(elective_section)
            }]

        return requirements

    # Fallback path: gather individual program sections and classify them.
    section_results = chroma_manager.query(
        query_text=query,
        n_results=10,
        metadata_filter={"program_name": program_name, "type": "program"}
    )

    if not section_results or len(section_results['ids']) == 0:
        logger.warning(f"No results found for {program_name} requirements")
        return None

    requirements = {
        "program_name": program_name,
        "program_type": program_type,
        "department": section_results['metadatas'][0].get('department', 'Unknown Department'),
        "core_requirements": [],
        "electives": [],
        "capstone": None
    }

    # Classify each retrieved section by its stored section_type metadata;
    # capstone detection falls back to keywords in the title.
    for i, doc in enumerate(section_results['documents']):
        metadata = section_results['metadatas'][i]
        section_type = metadata.get('section_type', 'unknown')
        title = metadata.get('title', f"Section {i+1}")

        if section_type in ['required_courses', 'option_group']:
            requirements["core_requirements"].append({
                "title": title,
                "content": doc,
                "courses": extract_course_codes(doc)
            })
        elif section_type == 'elective_courses':
            requirements["electives"].append({
                "title": title,
                "content": doc,
                "courses": extract_course_codes(doc)
            })
        elif "capstone" in title.lower() or "senior" in title.lower():
            # Last matching section wins if several capstone-like titles occur.
            requirements["capstone"] = {
                "title": title,
                "content": doc,
                "courses": extract_course_codes(doc)
            }

    return requirements
| |
|
| | |
def generate_accurate_requirements_response(requirements, program_name):
    """
    Generate an accurate response about program requirements.
    Enhanced to handle the updated classification where required electives and minors
    are properly included in the required_courses category.

    Args:
        requirements (dict): Validated program requirements
        program_name (str): Name of the program

    Returns:
        str: Formatted response with accurate requirements
    """
    if not requirements:
        return f"I'm sorry, but I couldn't find specific requirements for the {program_name} program. Please check the department website for the most up-to-date information."

    lines = [f"# {program_name} Requirements", ""]

    def _emit_body(section):
        """Append a section's extracted course codes, or its raw content when none were found."""
        if section.get("courses"):
            lines.extend(f"- {course}" for course in section["courses"])
        else:
            lines.append(section["content"])

    if requirements.get("department"):
        lines.append(f"**Department:** {requirements['department']}")
        lines.append("")

    if requirements.get("total_credits"):
        lines.append(f"**Total Credits Required:** {requirements['total_credits']}")
        lines.append("")

    # Core requirements, de-duplicated by section title.
    if requirements.get("core_requirements"):
        lines.append("## Core Requirements")
        shown_titles = set()
        for section in requirements["core_requirements"]:
            if section['title'] in shown_titles:
                continue
            shown_titles.add(section['title'])
            lines.append(f"**{section['title']}**")
            _emit_body(section)
            lines.append("")

    # Capstone: the validated BS Data Science capstone is pinned to STAT-427.
    if requirements.get("capstone"):
        capstone = requirements["capstone"]
        lines.append("## Capstone Experience")
        lines.append(f"**{capstone['title']}**")
        if requirements.get("program_type", "OTHER") == "BS_DATA_SCIENCE" and capstone.get("validated", False):
            lines.append("**STAT-427: Statistical Machine Learning (3 credits)**")
            lines.append("This course serves as the capstone experience for the Data Science program.")
        else:
            _emit_body(capstone)
        lines.append("")

    # Minor requirement, unless a core section already covers it.
    if requirements.get("minor_requirement") and not any(
        "minor" in section['title'].lower() for section in requirements.get("core_requirements", [])
    ):
        minor = requirements["minor_requirement"]
        lines.append("## Minor or Second Major Requirement")
        lines.append(f"**{minor['title']}**")
        lines.append(minor["content"])
        lines.append("")

    # Elective sections: elective-titled core sections first, then the
    # dedicated electives list, de-duplicated by title.
    elective_sections = []
    seen_elective_titles = set()

    for section in requirements.get("core_requirements") or []:
        if 'elective' in section['title'].lower() and section['title'] not in seen_elective_titles:
            elective_sections.append(section)
            seen_elective_titles.add(section['title'])

    for section in requirements.get("electives") or []:
        if section['title'] not in seen_elective_titles:
            elective_sections.append(section)
            seen_elective_titles.add(section['title'])

    if elective_sections:
        lines.append("## Elective Requirements")
        for section in elective_sections:
            lines.append(f"**{section['title']}**")
            _emit_body(section)
            lines.append("")

    if requirements.get("option_groups"):
        lines.append("## Option Groups")
        for section in requirements["option_groups"]:
            lines.append(f"**{section['title']}**")
            _emit_body(section)
            lines.append("")

    lines.append("*Note: These requirements are subject to change. Please consult with an academic advisor or refer to the official program documentation for the most current information.*")

    return "\n".join(lines)
| |
|
| | |
| | |
| | |
| | |
| |
|
class AcademicChatbot:
    """
    A RAG-based chatbot for answering questions about academic programs and courses
    using Mistral 7B model and ChromaDB for retrieval.
    """

    def __init__(self):
        """Initialize the chatbot with ChromaDB and model configuration.

        Raises:
            ValueError: If the Mistral API headers are not configured
                (e.g. the Hugging Face API key is missing).
        """
        # Shared module-level ChromaDB manager and its collection.
        self.chroma_manager = global_chroma_manager
        self.collection = self.chroma_manager.get_collection()

        # Hugging Face inference endpoint configuration for Mistral 7B.
        self.api_url = MISTRAL_API_URL
        self.headers = MISTRAL_HEADERS
        # Running chat transcript as {"role": ..., "content": ...} dicts.
        self.conversation_history = []

        # Fail fast when credentials are absent so callers don't hit the
        # API later with unusable headers.
        if not self.headers:
            logger.warning("Mistral API headers not properly configured. Regenerate API credentials.")
            raise ValueError("Failed to initialize Mistral API headers. Check API key configuration.")
| | |
| | def add_message(self, role: str, content: str): |
| | """Add a message to the conversation history.""" |
| | self.conversation_history.append({"role": role, "content": content}) |
| | |
| | def clear_history(self): |
| | """Clear the conversation history.""" |
| | self.conversation_history = [] |
| | |
| | def get_history(self): |
| | """Get the conversation history.""" |
| | return self.conversation_history |
| | |
| | def get_url_from_metadata(self, metadata): |
| | """Extract URL from metadata, checking multiple possible field names.""" |
| | |
| | url_field_names = ['url', 'course_url', 'source_url', 'link', 'href', 'source'] |
| | |
| | for field in url_field_names: |
| | if field in metadata and metadata[field]: |
| | return metadata[field] |
| | |
| | |
| | return '' |
| | |
    def retrieve_context(self, query: str, n_results: int = 8) -> Tuple[List[str], List[Dict[str, Any]]]:
        """
        Retrieve diverse and relevant documents from ChromaDB based on the query.

        Args:
            query: The user's question
            n_results: Number of documents to retrieve

        Returns:
            Tuple containing (contexts, metadata)
        """
        logger.info(f"Retrieving context for query: {query}")

        # Add related academic phrasing so e.g. "required" also matches
        # program text written as "must complete".
        expanded_query = expand_query_with_academic_terms(query)
        logger.info(f"Expanded query: {expanded_query}")

        # Over-fetch (3x, capped at 25) so the diversification step below
        # has a pool to draw from.
        retrieve_count = min(n_results * 3, 25)
        results = self.chroma_manager.query(expanded_query, n_results=retrieve_count)

        contexts = []
        metadata_list = []

        if 'documents' in results and results['documents']:
            documents = results['documents'][0]
            metadatas = results['metadatas'][0] if 'metadatas' in results and results['metadatas'] else [{}] * len(documents)

            # NOTE(review): these two sets are never written to below —
            # apparently leftovers from an earlier de-duplication approach.
            seen_urls = set()
            seen_titles = set()

            # Group hits by (url, title) so a single source page cannot
            # dominate the final context window.
            doc_groups = {}
            for doc, meta in zip(documents, metadatas):
                url = meta.get('url', '') if meta else ''
                title = meta.get('title', '') if meta else ''
                key = (url, title)

                if key not in doc_groups:
                    doc_groups[key] = []

                doc_groups[key].append((doc, meta))

            # Round-robin: take one document per source per pass until we
            # have n_results documents or the pool is exhausted.
            while len(contexts) < n_results and doc_groups:
                for key in list(doc_groups.keys()):
                    if doc_groups[key]:
                        doc, meta = doc_groups[key].pop(0)
                        contexts.append(doc)
                        metadata_list.append(meta)

                        if not doc_groups[key]:
                            del doc_groups[key]

                        if len(contexts) >= n_results:
                            break

            # Backfill with remaining documents in original relevance order
            # if the round-robin came up short.
            if len(contexts) < n_results:
                i = 0
                while len(contexts) < n_results and i < len(documents):
                    if documents[i] not in contexts:
                        contexts.append(documents[i])
                        metadata_list.append(metadatas[i])
                    i += 1

        logger.info(f"Retrieved {len(contexts)} context documents")

        return contexts, metadata_list
| |
|
| | def merge_program_documents(self, docs, metas, max_chars=15000): |
| | """Merge documents by category to create comprehensive context.""" |
| | |
| | categories = { |
| | "comprehensive": {"content": "", "sources": []}, |
| | "core": {"content": "", "sources": []}, |
| | "electives": {"content": "", "sources": []}, |
| | "minor": {"content": "", "sources": []}, |
| | "capstone": {"content": "", "sources": []}, |
| | "ethics": {"content": "", "sources": []}, |
| | "admission": {"content": "", "sources": []}, |
| | "au_core": {"content": "", "sources": []}, |
| | "university_requirements": {"content": "", "sources": []}, |
| | "major_requirements": {"content": "", "sources": []}, |
| | "other": {"content": "", "sources": []} |
| | } |
| | |
| | |
| | for i, (doc, meta) in enumerate(zip(docs, metas)): |
| | title = meta.get("title", "").lower() if meta else "" |
| | |
| | |
| | if "complete" in title and "requirements" in title: |
| | category = "comprehensive" |
| | elif "elective" in title: |
| | category = "electives" |
| | elif "minor" in title or "second major" in title: |
| | category = "minor" |
| | elif "capstone" in title: |
| | category = "capstone" |
| | elif "ethics" in title: |
| | category = "ethics" |
| | elif "admission" in title or "apply" in title: |
| | category = "admission" |
| | elif "au core" in title or "general education" in title: |
| | category = "au_core" |
| | elif "university requirement" in title: |
| | category = "university_requirements" |
| | elif "major requirement" in title: |
| | category = "major_requirements" |
| | elif any(term in title for term in ["statistics", "data science essentials", "intermediate"]): |
| | category = "core" |
| | else: |
| | category = "other" |
| | |
| | |
| | categories[category]["content"] += f"\n\n## {meta.get('title', '')}\n{doc}" |
| | categories[category]["sources"].append(i) |
| | |
| | |
| | output_docs = [] |
| | output_metas = [] |
| | source_indices = set() |
| | |
| | |
| | if categories["comprehensive"]["content"]: |
| | output_docs.append(categories["comprehensive"]["content"]) |
| | output_metas.append({"title": "Complete Program Requirements"}) |
| | source_indices.update(categories["comprehensive"]["sources"]) |
| | |
| | |
| | general_content = "# General Program Requirements\n" |
| | general_sources = [] |
| | |
| | |
| | for cat_name, display_name in [ |
| | ("university_requirements", "University Requirements"), |
| | ("au_core", "AU Core Requirements"), |
| | ("admission", "Admission Requirements") |
| | ]: |
| | if categories[cat_name]["content"]: |
| | general_content += f"\n\n# {display_name}\n{categories[cat_name]['content']}" |
| | general_sources.extend(categories[cat_name]["sources"]) |
| | |
| | |
| | if general_content.strip() != "# General Program Requirements": |
| | output_docs.append(general_content) |
| | output_metas.append({"title": "General Requirements"}) |
| | source_indices.update(general_sources) |
| | |
| | |
| | major_content = "# Major Requirements\n" |
| | |
| | |
| | if categories["major_requirements"]["content"]: |
| | major_content += categories["major_requirements"]["content"] |
| | |
| | |
| | if categories["core"]["content"]: |
| | major_content += "\n\n# Core Course Requirements\n" + categories["core"]["content"] |
| | |
| | |
| | if major_content.strip() != "# Major Requirements": |
| | output_docs.append(major_content) |
| | output_metas.append({"title": "Major Requirements"}) |
| | source_indices.update(categories["major_requirements"]["sources"]) |
| | source_indices.update(categories["core"]["sources"]) |
| | |
| | |
| | additional_content = "# Additional Program Requirements\n" |
| | additional_sources = [] |
| | |
| | |
| | for cat_name, display_name in [ |
| | ("electives", "Elective Requirements"), |
| | ("minor", "Minor or Second Major Requirements"), |
| | ("capstone", "Capstone Requirements"), |
| | ("ethics", "Ethics Requirements") |
| | ]: |
| | if categories[cat_name]["content"]: |
| | additional_content += f"\n\n# {display_name}\n{categories[cat_name]['content']}" |
| | additional_sources.extend(categories[cat_name]["sources"]) |
| | |
| | |
| | if additional_content.strip() != "# Additional Program Requirements": |
| | output_docs.append(additional_content) |
| | output_metas.append({"title": "Additional Requirements"}) |
| | source_indices.update(additional_sources) |
| | |
| | |
| | total_chars = sum(len(doc) for doc in output_docs) |
| | |
| | |
| | if categories["other"]["content"] and total_chars + len(categories["other"]["content"]) <= max_chars: |
| | other_content = "# Other Program Information\n" + categories["other"]["content"] |
| | output_docs.append(other_content) |
| | output_metas.append({"title": "Other Information"}) |
| | source_indices.update(categories["other"]["sources"]) |
| | |
| | |
| | all_sources = [] |
| | for i in range(len(metas)): |
| | all_sources.append(metas[i]) |
| | |
| | logger.info(f"Merged {len(docs)} documents into {len(output_docs)} comprehensive documents (Total chars: {sum(len(d) for d in output_docs)})") |
| | |
| | return output_docs, all_sources |
| |
|
| | |
| | def trim_documents(self, docs, metas, max_chars=12000): |
| | """Trim documents to avoid token overload while ensuring all requirements are included.""" |
| | output_docs, output_metas = [], [] |
| | total_chars = 0 |
| | |
| | |
| | query = getattr(self, "current_query", None) |
| | query_info = process_program_query(query) if isinstance(query, str) else None |
| | if query_info: |
| | logger.info(f"[trim_documents] query_info: {query_info} | program_name: {query_info.get('program_name')}") |
| | program_name = (query_info.get("program_name") or "").lower() if query_info else "" |
| | |
| | |
| | if program_name: |
| | |
| | comprehensive_index = None |
| | for i, meta in enumerate(metas): |
| | title = meta.get("title", "").lower() if meta else "" |
| | if "complete" in title and "requirement" in title and program_name in meta.get("program_name", "").lower(): |
| | comprehensive_index = i |
| | break |
| | |
| | if comprehensive_index is not None and total_chars + len(docs[comprehensive_index]) <= max_chars: |
| | output_docs.append(docs[comprehensive_index]) |
| | output_metas.append(metas[comprehensive_index]) |
| | total_chars += len(docs[comprehensive_index]) |
| | |
| | |
| | for i, meta in enumerate(metas): |
| | |
| | if i == comprehensive_index: |
| | continue |
| | |
| | |
| | is_required = meta.get("section_type", "") == "required_courses" |
| | is_this_program = program_name in meta.get("program_name", "").lower() |
| | |
| | |
| | if is_required and is_this_program and total_chars + len(docs[i]) <= max_chars: |
| | output_docs.append(docs[i]) |
| | output_metas.append(metas[i]) |
| | total_chars += len(docs[i]) |
| | |
| | |
| | has_minor = any("minor" in meta.get("title", "").lower() or "second major" in meta.get("title", "").lower() |
| | for meta in output_metas) |
| | |
| | if not has_minor: |
| | for i, meta in enumerate(metas): |
| | if "minor" in meta.get("title", "").lower() or "second major" in meta.get("title", "").lower(): |
| | if total_chars + len(docs[i]) <= max_chars: |
| | output_docs.append(docs[i]) |
| | output_metas.append(metas[i]) |
| | total_chars += len(docs[i]) |
| | break |
| | |
| | |
| | has_capstone = any("capstone" in meta.get("title", "").lower() for meta in output_metas) |
| | |
| | if not has_capstone: |
| | for i, meta in enumerate(metas): |
| | if "capstone" in meta.get("title", "").lower(): |
| | if total_chars + len(docs[i]) <= max_chars: |
| | output_docs.append(docs[i]) |
| | output_metas.append(metas[i]) |
| | total_chars += len(docs[i]) |
| | break |
| | |
| | |
| | has_electives = any("elective" in meta.get("title", "").lower() for meta in output_metas) |
| | |
| | if not has_electives: |
| | for i, meta in enumerate(metas): |
| | if "elective" in meta.get("title", "").lower(): |
| | if total_chars + len(docs[i]) <= max_chars: |
| | output_docs.append(docs[i]) |
| | output_metas.append(metas[i]) |
| | total_chars += len(docs[i]) |
| | break |
| | |
| | |
| | |
| | if not output_docs: |
| | for doc, meta in zip(docs, metas): |
| | |
| | if len(output_docs) == 0 or total_chars + len(doc) <= max_chars: |
| | output_docs.append(doc) |
| | output_metas.append(meta) |
| | total_chars += len(doc) |
| | else: |
| | break |
| | |
| | logger.info(f"Trimmed documents from {len(docs)} to {len(output_docs)} (Total chars: {total_chars})") |
| | return output_docs, output_metas |
| |
|
| | def generate_response(self, query: str, contexts: List[str], |
| | metadata: List[Dict[str, Any]], temperature: float = 0.7) -> str: |
| | """ |
| | Generate a response using Mistral 7B with retrieved contexts. |
| | |
| | Args: |
| | query: The user's question |
| | contexts: Retrieved document contents |
| | metadata: Metadata for the retrieved documents |
| | temperature: Controls randomness in generation |
| | |
| | Returns: |
| | Generated response |
| | """ |
| | logger.info(f"Generating response for query: {query}") |
| |
|
| | |
| | self.current_query = query |
| | if not isinstance(query, str) or not query.strip(): |
| | logger.warning("Query is missing or not a string.") |
| | return "No query provided." |
| | |
| | |
| | query_info = process_program_query(query) |
| | |
| | if query_info["is_course_query"] and query_info["program_name"]: |
| | logger.info(f"Detected course query for program: {query_info['program_name']}, type: {query_info['course_type']}") |
| | |
| | |
| | try: |
| | |
| | requirements = retrieve_validated_program_requirements( |
| | self.chroma_manager, |
| | query_info["program_name"], |
| | debug_mode=False |
| | ) |
| | |
| | |
| | if requirements: |
| | logger.info(f"Using validated requirements for {query_info['program_name']}") |
| | response = generate_accurate_requirements_response( |
| | requirements, |
| | query_info["program_name"] |
| | ) |
| | |
| | |
| | sources = [] |
| | for i, meta in enumerate(metadata): |
| | if meta: |
| | title = meta.get("title", "") |
| | url = self.get_url_from_metadata(meta) |
| | |
| | if url: |
| | if title: |
| | citation = f"[{i+1}] {title} - {url}" |
| | else: |
| | citation = f"[{i+1}] Program information - {url}" |
| | else: |
| | if title: |
| | citation = f"[{i+1}] {title}" |
| | else: |
| | citation = f"[{i+1}] Program information" |
| | |
| | sources.append(citation) |
| | |
| | if sources: |
| | |
| | used_source_indexes = set() |
| | for i in range(len(sources)): |
| | |
| | if f"[{i+1}]" in response: |
| | used_source_indexes.add(i) |
| | |
| | |
| | if used_source_indexes: |
| | response += "\n\nSources Referenced in Response:" |
| | for i in sorted(used_source_indexes): |
| | response += f"\n{sources[i]}" |
| | |
| | |
| | response += "\n\nAll Retrieved Sources:" |
| | for source in sources: |
| | response += f"\n{source}" |
| | |
| | return response |
| | |
| | except Exception as e: |
| | logger.error(f"Error using validated requirements approach: {str(e)}") |
| | |
| | |
| | |
| | try: |
| | program_courses = get_program_course_information( |
| | query_info["program_name"], |
| | query_info["course_type"] |
| | ) |
| | |
| | |
| | if program_courses and "No courses found" not in program_courses: |
| | program_name = query_info["program_name"].title() |
| | |
| | |
| | response = f"Here's information about the {program_name} program courses:\n\n{program_courses}" |
| | |
| | |
| | sources = [] |
| | for i, meta in enumerate(metadata): |
| | if meta: |
| | title = meta.get("title", "") |
| | url = self.get_url_from_metadata(meta) |
| | |
| | if url: |
| | if title: |
| | citation = f"[{i+1}] {title} - {url}" |
| | else: |
| | citation = f"[{i+1}] Program information - {url}" |
| | else: |
| | if title: |
| | citation = f"[{i+1}] {title}" |
| | else: |
| | citation = f"[{i+1}] Program information" |
| | |
| | sources.append(citation) |
| | |
| | if sources: |
| | |
| | used_source_indexes = set() |
| | for i in range(len(sources)): |
| | |
| | if f"[{i+1}]" in response: |
| | used_source_indexes.add(i) |
| | |
| | |
| | if used_source_indexes: |
| | response += "\n\nSources Referenced in Response:" |
| | for i in sorted(used_source_indexes): |
| | response += f"\n{sources[i]}" |
| | |
| | |
| | response += "\n\nAll Retrieved Sources:" |
| | for source in sources: |
| | response += f"\n{source}" |
| | |
| | return response |
| | except Exception as e: |
| | logger.error(f"Error handling specialized course query: {str(e)}") |
| | |
| | |
| | |
| | |
| | if query_info["is_course_query"] and query_info["program_name"]: |
| | contexts, metadata = self.merge_program_documents(contexts, metadata, max_chars=12000) |
| | else: |
| | |
| | contexts, metadata = self.trim_documents(contexts, metadata, max_chars=10000) |
| | |
| | |
| | enhanced_contexts = [] |
| | for i, (doc, meta) in enumerate(zip(contexts, metadata)): |
| | source_type = meta.get("type", "document") |
| | title = meta.get("title", "") |
| | url = self.get_url_from_metadata(meta) |
| | |
| | |
| | doc_preview = doc[:1500] + ("..." if len(doc) > 1500 else "") |
| | |
| | |
| | doc_header = f"Document {i+1} ({source_type.capitalize()}" |
| | if title: |
| | doc_header += f": {title}" |
| | if url: |
| | doc_header += f" - {url}" |
| | doc_header += "):" |
| | |
| | enhanced_contexts.append(f"{doc_header}\n{doc_preview}") |
| | |
| | |
| | history_text = "" |
| | if self.conversation_history: |
| | recent_history = self.conversation_history[-3:] |
| | if recent_history: |
| | history_text = "### Recent Conversation:\n" |
| | for msg in recent_history: |
| | role = "User" if msg["role"] == "user" else "Assistant" |
| | history_text += f"{role}: {msg['content']}\n\n" |
| | |
| | |
| | context_text = "\n\n".join(enhanced_contexts) |
| | prompt = f"""You are an AI assistant answering questions about American University's academic programs and courses. |
| | Use the following documents as your primary source of information. |
| | |
| | Important rules: |
| | - If the answer is not explicitly stated, you may reason from the information provided, but explain your reasoning. |
| | - Courses marked as "must be completed", "prerequisites", or "required" are mandatory. |
| | - When you see "one of the following" or "either X or Y", students must choose exactly one course from the options. |
| | - When you see "option group", students must select some number of courses from that group. |
| | - Courses listed as electives form a group from which a certain number must be completed, but not every course. |
| | - Always mention the source document when including specific information. |
| | - If you don't know or the information is not in the documents, be honest about it. |
| | - For Data Science programs, STAT-427 (Statistical Machine Learning) is the 3-credit capstone course. |
| | - Undergraduate courses have numbers 499 and below, graduate courses open to qualified undergraduates have numbers 500-599, |
| | core graduate courses have numbers 600-699, and advanced graduate courses have numbers 700-799. |
| | |
| | {history_text if history_text else ""} |
| | |
| | ### Context: |
| | {context_text} |
| | |
| | ### Question: |
| | {query} |
| | |
| | """ |
| | |
| | |
| | logger.info(f"Processing query in process_program_query instructions: {repr(query)}") |
| | if isinstance(query, str) and ("course requirement" in query.lower() or "program requirement" in query.lower()): |
| | prompt += """ |
| | |
| | IMPORTANT: Your response should include ALL required components for this degree program. |
| | Ensure you cover all sections mentioned in the documents, including: |
| | - All core course requirements with their credit hours |
| | - Any elective requirements with credit hours |
| | - Any minor or second major requirements |
| | - Any capstone or project requirements |
| | |
| | Present requirements in a clear, organized format that makes the degree structure easy to understand. |
| | DO NOT OMIT any requirements or sections mentioned in the documents. |
| | """ |
| |
|
| | prompt += "\n\n### Answer:" |
| | logger.info(f"Processed query in instructions: {repr(query)}") |
| | |
| | payload = { |
| | "inputs": prompt, |
| | "parameters": { |
| | "max_new_tokens": 4000, |
| | "temperature": temperature, |
| | "top_p": 0.85, |
| | "do_sample": True |
| | } |
| | } |
| | |
| | try: |
| | response = requests.post(self.api_url, headers=self.headers, json=payload) |
| | |
| | if response.status_code == 200: |
| | |
| | generated_text = response.json()[0]["generated_text"] |
| | answer = generated_text.split("### Answer:")[-1].strip() |
| | |
| | |
| | |
| | if "\n\nSources:" in answer: |
| | answer = answer.split("\n\nSources:")[0].strip() |
| | |
| | |
| | sources = [] |
| | for i, meta in enumerate(metadata): |
| | if meta: |
| | source_type = meta.get("type", "document") |
| | title = meta.get("title", "") |
| | url = self.get_url_from_metadata(meta) |
| | |
| | |
| | if url: |
| | if title: |
| | citation = f"[{i+1}] {title} - {url}" |
| | else: |
| | citation = f"[{i+1}] {source_type.capitalize()} - {url}" |
| | else: |
| | if title: |
| | citation = f"[{i+1}] {title}" |
| | else: |
| | citation = f"[{i+1}] {source_type.capitalize()}" |
| |
|
| | sources.append(citation) |
| | |
| | |
| | if sources: |
| | |
| | used_source_indexes = set() |
| | for i in range(len(sources)): |
| | |
| | if f"[{i+1}]" in answer: |
| | used_source_indexes.add(i) |
| | |
| | |
| | if used_source_indexes: |
| | answer += "\n\nSources Referenced in Response:" |
| | for i in sorted(used_source_indexes): |
| | answer += f"\n{sources[i]}" |
| | |
| | |
| | answer += "\n\nAll Retrieved Sources:" |
| | for source in sources: |
| | answer += f"\n{source}" |
| | |
| | return answer |
| | else: |
| | error_msg = f"Error: {response.status_code}, {response.text}" |
| | logger.error(error_msg) |
| | return error_msg |
| | |
| | except Exception as e: |
| | error_msg = f"Exception during response generation: {str(e)}" |
| | logger.error(error_msg) |
| | return error_msg |
| | |
| | def add_document(self, text: str, metadata: Dict[str, Any], doc_id: Optional[str] = None) -> str: |
| | """Add a document to the ChromaDB collection.""" |
| | return self.chroma_manager.add_document(text, metadata, doc_id) |
| | |
| | def get_collection_info(self) -> Dict[str, Any]: |
| | """Get information about the ChromaDB collection.""" |
| | return self.collection.get() |
| | |
    def ask(self, query: str, n_results: int = 8, temperature: float = 0.7) -> Dict[str, Any]:
        """
        Process a query and return a response with relevant context.

        Args:
            query: The user's question
            n_results: Number of documents to retrieve
            temperature: Controls randomness in generation

        Returns:
            Dictionary with keys "response", "contexts", "metadata",
            and "history" (the full conversation so far).
        """
        # Record the user's turn before retrieval so history stays ordered.
        self.add_message("user", query)

        contexts, metadata = self.retrieve_context(query, n_results)

        if not contexts:
            response = "I couldn't find any relevant information to answer your question. Could you please rephrase or ask about a different topic related to American University's programs or courses?"
        else:
            response = self.generate_response(query, contexts, metadata, temperature)

        # Hard cap on response size to protect downstream consumers.
        if len(response) > 15000:
            response = response[:14800] + "...\n\n[Response truncated due to length. Please ask for specific details if needed.]"

        self.add_message("assistant", response)

        return {
            "response": response,
            "contexts": contexts,
            "metadata": metadata,
            "history": self.conversation_history
        }
| |
|
| | |
# NOTE(review): this definition is dead code — it is shadowed by the
# identically-named ask_question defined after `chatbot` is created below
# (which defaults n_results to 10). Consider removing one of the two.
def ask_question(query: str, n_results: int = 8, temperature: float = 0.7) -> Dict[str, Any]:
    """Ask a question to the chatbot."""
    return chatbot.ask(query, n_results, temperature)
| |
|
| | |
# Module-level singleton used by the convenience wrappers below.
chatbot = AcademicChatbot()
| |
|
| | |
def ask_question(query: str, n_results: int = 10, temperature: float = 0.7) -> Dict[str, Any]:
    """Forward *query* to the module-level chatbot and return its result dict."""
    return chatbot.ask(query, n_results=n_results, temperature=temperature)
| |
|
| | |
# NOTE(review): this definition is shadowed by a later clear_conversation
# near the bottom of this module — ensure the final binding actually
# clears the history.
def clear_conversation():
    """Clear the conversation history."""
    chatbot.clear_history()
| |
|
| | |
def add_document(text: str, metadata: Dict[str, Any], doc_id: Optional[str] = None) -> str:
    """Add a document to the module-level chatbot's collection and return its id."""
    return chatbot.add_document(text, metadata, doc_id)
| |
|
| | |
def split_long_response(response: str, max_chunk_size: int = 3500) -> List[str]:
    """
    Split a long response into manageable chunks while preserving whole sentences.

    Args:
        response (str): The full response text
        max_chunk_size (int): Maximum size of each chunk in characters,
            measured before the part-number suffix is appended

    Returns:
        List[str]: List of response chunks; multi-chunk output carries
        "(Continued...)" / "(End of response...)" part markers
    """
    import re

    # Short responses pass through untouched (no part suffix added).
    if len(response) <= max_chunk_size:
        return [response]

    # Split on whitespace that follows sentence-ending punctuation.
    sentences = re.split(r'(?<=[.!?])\s+', response)

    chunks = []
    current_chunk = []
    current_chunk_length = 0

    for sentence in sentences:
        # Flush the current chunk when adding this sentence would overflow.
        # BUGFIX: only flush when there is content — previously a sentence
        # longer than max_chunk_size produced an empty leading chunk.
        if current_chunk and current_chunk_length + len(sentence) > max_chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_chunk_length = 0

        current_chunk.append(sentence)
        current_chunk_length += len(sentence) + 1  # +1 for the joining space

    # Flush the trailing partial chunk.
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    # Annotate each chunk with its part number for multi-message delivery.
    for i in range(len(chunks)):
        if i < len(chunks) - 1:
            chunks[i] += f"\n\n(Continued in next message - Part {i+1}/{len(chunks)})"
        else:
            chunks[i] += f"\n\n(End of response - Part {i+1}/{len(chunks)})"

    return chunks
| |
|
def generate_response_with_mistral(prompt, temperature):
    """
    Generate response using Mistral 7B via Hugging Face API.

    Args:
        prompt: Fully formatted prompt for the model
        temperature: Sampling temperature for response generation

    Returns:
        Generated response as a string, or an error message on failure

    Raises:
        ValueError: If no Hugging Face API key is configured.
    """
    if not HF_API_KEY:
        raise ValueError("Hugging Face API key not found. Please configure credentials.")

    # BUGFIX: an unreachable duplicate of the try/except block below (placed
    # after the return statements) has been removed.
    try:
        client = InferenceClient(
            "mistralai/Mistral-7B-Instruct-v0.3",
            token=HF_API_KEY
        )

        # Stop generating if the model starts hallucinating the next user turn.
        response = client.text_generation(
            prompt,
            max_new_tokens=4096,
            temperature=temperature,
            stop_sequences=["\n\nUser:"],
        )

        return response.strip()

    except Exception as e:
        error_msg = f"Error generating response with Mistral: {e}"
        logger.error(error_msg)
        return error_msg
| |
|
def clear_conversation():
    """
    Clear the conversation history of the module-level chatbot.

    BUGFIX: this later definition shadows the working clear_conversation
    defined earlier in the module, and previously did nothing (`pass`) —
    so the exported clear_conversation never cleared anything. It now
    delegates to the chatbot instance.
    """
    chatbot.clear_history()
| |
|
| | |
def get_full_response_chunks(result):
    """
    Retrieve all chunks of a potentially long response.

    Args:
        result (Dict): Result from ask_question

    Returns:
        List[str]: All response chunks
    """
    if 'full_response_chunks' in result:
        return result['full_response_chunks']
    # Fall back to the single-message response (empty string if absent).
    return [result.get('response', '')]
| |
|
def initialize_chatbot():
    """
    Build the chatbot's startup payload: a welcome message plus its label.

    Returns:
        Dict[str, str]: Initial chatbot response
    """
    greeting = """Welcome to the American University Academic Advisor Chatbot!

I'm here to help you with information about:
- Academic programs
- Course details
- Program requirements
- Academic policies

What would you like to know about American University's academic offerings?

Some example questions you can ask:
- Tell me about the Data Science program
- What are the requirements for a Data Science major?
- What courses are required for a Statistics minor?
- Can you help me understand the AU Core curriculum?

Feel free to ask, and I'll do my best to provide comprehensive and helpful information!"""

    return {
        "response": greeting,
        "sources": "AU Academic Advisor Chatbot - Initial Welcome Message",
    }
| |
|
def get_chatbot_info():
    """
    Describe the chatbot's data sources, capabilities and limitations.

    Returns:
        Dict[str, str]: Chatbot information
    """
    details = """π€ AU Academic Advisor Chatbot Information

Data Sources:
- American University's official website
- Course catalog
- Program description pages
- Academic department information

Technologies Used:
- Retrieval-Augmented Generation (RAG)
- Mistral 7B Language Model
- ChromaDB Vector Database
- Sentence Transformers for Embedding

Capabilities:
- Retrieve detailed information about academic programs
- Explain course requirements
- Provide insights into academic policies
- Offer guidance on course selection

Limitations:
- Information is based on available web sources
- Might not reflect the most recent updates
- Recommended to verify critical information with official AU sources

Developed as a student research project to assist with academic advising.
"""

    return {
        "response": details,
        "sources": "AU Academic Advisor Chatbot - System Information",
    }
| |
|
def interactive_chat():
    """
    Run an interactive chat session in the command line.

    Loops on user input until 'quit'/'exit'/'q'; 'info' prints system
    information. Answers are fetched via ask_question and printed along
    with their retrieved sources.
    """
    print("π€ AU Academic Advisor Chatbot - Interactive Mode")
    print("Type 'quit', 'exit', or 'q' to end the conversation.")
    print("Type 'info' to get information about the chatbot.\n")

    # One-time welcome banner.
    init_response = initialize_chatbot()
    print("π€ ", init_response["response"])
    print("\n--- How can I help you today? ---\n")

    while True:
        try:
            user_query = input("You: ").strip()

            # Exit commands.
            if user_query.lower() in ['quit', 'exit', 'q']:
                print("\nπ€ Thank you for using the AU Academic Advisor Chatbot. Goodbye!")
                break

            # Built-in help command.
            if user_query.lower() == 'info':
                info_response = get_chatbot_info()
                print("π€ ", info_response["response"])
                continue

            if user_query:
                print("\nπ€ Thinking...\n")
                response = ask_question(user_query)

                # Prefer the assembled multi-part response when present.
                if "full_response" in response:
                    print("π€ ", response["full_response"])
                else:
                    print("π€ ", response["response"])

                # List retrieved sources below the answer.
                # NOTE(review): assumes every metadata entry is a dict — a
                # None entry would raise here; confirm upstream guarantees.
                if "metadata" in response and response["metadata"]:
                    print("\n--- Sources ---")
                    for i, meta in enumerate(response["metadata"]):
                        source = meta.get('url', 'Unknown Source')
                        title = meta.get('title', 'Untitled')
                        print(f"{i+1}. {title} - {source}")

                print("\n")

        except KeyboardInterrupt:
            # Ctrl-C interrupts the current turn but keeps the session alive.
            print("\n\nπ€ Chat interrupted. Type 'quit' to exit.")
        except Exception as e:
            print(f"\nπ€ An error occurred: {e}")
| |
|
| | |
# Script entry point: run the CLI chat loop and surface any fatal error
# with a full traceback for easier debugging.
if __name__ == "__main__":
    try:
        interactive_chat()
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        import traceback
        traceback.print_exc()
| |
|
| | |
# Public API of this module.
# NOTE(review): add_document and get_full_response_chunks are defined but
# not exported here — confirm whether that is intentional.
__all__ = [
    'ask_question',
    'initialize_chatbot',
    'get_chatbot_info',
    'clear_conversation',
    'split_long_response',
    'interactive_chat'
]
| |
|