| |
| """ |
| American University Academic Advisor Chatbot |
| =========================================== |
| |
| A RAG-based chatbot system that answers questions about American University academic programs, |
| leveraging ChromaDB for vector retrieval and Mistral 7B for response generation. |
| |
| Features: |
| --------- |
| - Course requirement pattern recognition: Distinguishes between required courses, alternative |
| options ("take either X or Y"), option groups, and true electives |
| - Academic terminology matching: Connects student questions using "required" to program |
| descriptions using "must complete" |
| - Specialized formatting for course requirements: Organizes courses by type with clear labels |
| - Response generation using Mistral 7B: Creates natural language responses with source citations |
| - Conversation history tracking: Maintains context across multiple questions |
| |
| Usage: |
| ------ |
| 1. Command line: |
| python chatbot.py |
| |
| 2. Import in another script: |
| from chatbot import ask_question |
| result = ask_question("What are the required courses for the Data Science program?") |
| print(result["response"]) |
| |
| 3. Clear conversation history: |
| from chatbot import clear_conversation |
| clear_conversation() |
| |
| Requirements: |
| ------------ |
| - Python 3.8+ |
| - ChromaDB for vector storage and retrieval |
| - Hugging Face API access for Mistral 7B |
| - Keyring (optional) for secure API key storage |
| |
| Configuration: |
| ------------- |
| The system needs a Hugging Face API key for generating responses. Set it using: |
| |
| keyring.set_password("HF_API_KEY", "rressler", "<your_api_key>") |
| |
Or create a .env file with:
| |
| HF_API_KEY=<your_api_key> |
| |
| Note: |
| ----- |
| This implementation is designed specifically for academic program queries that |
| involve distinguishing between required courses and alternatives. It uses |
| specialized detection for patterns like "STAT-320 or STAT-302" to correctly |
| inform students about their course options. |
| """ |
|
|
| |
|
|
| import os |
| import sys |
| import re |
| from pathlib import Path |
| import logging |
| import requests |
| import json |
| import math |
| import warnings |
| from typing import List, Dict, Tuple, Any, Optional |
|
|
| |
| warnings.filterwarnings("ignore", category=FutureWarning) |
|
|
| |
| from utils.logging_utils import setup_logging |
| from utils.chroma_utils import get_chroma_manager |
| from utils.auth_utils import authenticate_huggingface |
|
|
| |
| logger = setup_logging(logger_name="Chatbot", log_filename="chatbot.log") |
|
|
def configure_api_credentials() -> Tuple[Optional[str], str, Optional[Dict[str, str]]]:
    """
    Resolve Hugging Face API credentials and the Mistral endpoint URL.

    Returns:
        Tuple: (API key, Model URL, Headers)

    Raises:
        Exception: re-raised from the authentication helper after a warning
            is logged, so the caller decides how to degrade.
    """
    try:
        api_key, request_headers = authenticate_huggingface()
    except Exception as exc:
        logger.warning(f"Authentication failed: {exc}")
        raise

    # The endpoint can be overridden via the environment; default to the
    # hosted Mistral-7B-Instruct inference endpoint.
    endpoint = os.getenv(
        "MISTRAL_API_URL",
        "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
    )

    return api_key, endpoint, request_headers
|
|
| |
# Resolve API credentials once at import time; fall back to None so the
# module can still be imported (e.g. for offline utilities) when
# authentication is unavailable.
try:
    HF_API_KEY, MISTRAL_API_URL, MISTRAL_HEADERS = configure_api_credentials()
except Exception as e:
    logger.error(f"Failed to configure API credentials: {e}")
    HF_API_KEY, MISTRAL_API_URL, MISTRAL_HEADERS = None, None, None


# Shared ChromaDB manager used by all retrieval helpers in this module.
global_chroma_manager = get_chroma_manager(model_size="medium")
# Was a bare print() to stdout at import time; keep the diagnostic but send
# it through the module logger instead.
logger.debug("Initialized Chroma manager: %s", type(global_chroma_manager))
|
|
def classify_course_level(course_code):
    """
    Classify course level based on course number.

    Args:
        course_code (str): The course code (e.g., "MATH-221", "STAT-615")

    Returns:
        dict: Dictionary with course_level and level_description; both stay
        at their "unknown" defaults when the code cannot be parsed.
    """
    classification = {
        "course_level": "unknown",
        "level_description": "Unknown course level"
    }

    try:
        # Split department prefix from the numeric part; accept several
        # separator styles ("MATH-221", "MATH 221", "MATH.221", "MATH221").
        if '-' in course_code:
            parts = course_code.split('-')
        elif ' ' in course_code:
            parts = course_code.split(' ')
        elif '.' in course_code:
            parts = course_code.split('.')
        else:
            # No separator at all: letters immediately followed by digits.
            match = re.match(r'^([A-Za-z]+)(\d+)$', course_code)
            if match:
                parts = [match.group(1), match.group(2)]
            else:
                return classification

        if len(parts) < 2:
            return classification

        # Keep only digits so suffixed numbers like "427W" still parse.
        course_num_str = ''.join(c for c in parts[1].strip() if c.isdigit())
        course_num = int(course_num_str)

        # Course numbering convention: <=499 undergraduate, 500s open to
        # qualified undergraduates, 600s master's core, 700s advanced.
        if course_num <= 499:
            classification["course_level"] = "undergraduate"
            classification["level_description"] = "Undergraduate course"
        elif 500 <= course_num <= 599:
            classification["course_level"] = "graduate_open"
            classification["level_description"] = "Graduate course open to qualified undergraduate students"
        elif 600 <= course_num <= 699:
            classification["course_level"] = "graduate_core"
            classification["level_description"] = "Core graduate course for the master's degree in the field of study"
        elif 700 <= course_num <= 799:
            classification["course_level"] = "graduate_advanced"
            classification["level_description"] = "Advanced graduate course"
        else:
            classification["course_level"] = "other"
            classification["level_description"] = f"Course number {course_num} outside standard classification"

    except (TypeError, ValueError, AttributeError, IndexError):
        # Malformed input (None, empty number, non-string): report the
        # "unknown" default rather than raising.
        pass

    return classification
|
|
def extract_courses_from_results(results):
    """
    Extract course information from the query results with level classification.

    Args:
        results (dict): Results from ChromaDB query

    Returns:
        list: List of course objects with code, title, credits, type, and level classification
    """
    # Matches e.g. "STAT-427 Statistical Machine Learning (3)"; the credit
    # group is optional, so findall yields ('', …) when it is absent.
    course_pattern = r'([A-Z]{2,4}-\d{3})\s+([^(]+)(?:\s*\((\d+(?:\.\d+)?)\))?'

    courses = []
    seen_codes = set()

    for doc, metadata in zip(results["documents"][0], results["metadatas"][0]):
        section_type = metadata.get("section_type", "unknown")

        for line in doc.split('\n'):
            for raw_code, raw_title, raw_credits in re.findall(course_pattern, line):
                code = raw_code.strip()
                # Keep only the first occurrence of each course code.
                if code in seen_codes:
                    continue
                seen_codes.add(code)

                level_info = classify_course_level(code)
                courses.append({
                    "code": code,
                    "title": raw_title.strip(),
                    "credits": raw_credits if raw_credits else "N/A",
                    "type": section_type,
                    "course_level": level_info["course_level"],
                    "level_description": level_info["level_description"]
                })

    return courses
|
|
# Display order for course levels within each requirement group.
_LEVEL_PRIORITY = {
    "undergraduate": 1,
    "graduate_open": 2,
    "graduate_core": 3,
    "graduate_advanced": 4,
    "other": 5,
    "unknown": 6
}


def _sort_by_level(course_list):
    """Order courses from undergraduate up through advanced graduate level."""
    return sorted(
        course_list,
        key=lambda c: _LEVEL_PRIORITY.get(c.get("course_level", "unknown"), 999)
    )


def _append_level_grouped(output, course_list):
    """Append courses to *output*, grouped under upper-cased level headers."""
    current_level = None
    for course in _sort_by_level(course_list):
        level = course.get("course_level", "unknown")
        level_desc = course.get("level_description", "")
        # Emit a header line each time the level changes.
        if level != current_level:
            current_level = level
            if level_desc:
                output.append(f"\n{level_desc.upper()}:")
        output.append(f"- {course['code']} {course['title']} ({course['credits']})")


def format_courses_for_display(courses):
    """
    Format the courses into a readable string with level information.

    Args:
        courses (list): List of course objects

    Returns:
        str: Formatted string with course information grouped by type and level
    """
    if not courses:
        return "No courses found."

    # Bucket courses by requirement type; unrecognized types are dropped.
    grouped = {
        "required_courses": [],
        "elective_courses": [],
        "option_group": [],
        "small_option_group": []
    }
    for course in courses:
        if course["type"] in grouped:
            grouped[course["type"]].append(course)

    output = []

    if grouped["required_courses"]:
        output.append("**Required Courses:**")
        output.append("These courses must be completed by all students in the program:")
        _append_level_grouped(output, grouped["required_courses"])
        output.append("")

    if grouped["small_option_group"]:
        output.append("**Alternative Course Options:**")
        output.append("Students must complete ONE course from each of these groups:")

        # Cluster alternatives by group_id; courses without one inherit the
        # most recently seen id (initially 1), preserving source-data order.
        group_id = 1
        groups = {}
        for course in grouped["small_option_group"]:
            group_id = course.get("group_id", group_id)
            groups.setdefault(group_id, []).append(course)

        for gid, course_list in groups.items():
            output.append(f"\nOption Group {gid}:")
            for course in _sort_by_level(course_list):
                level_desc = course.get("level_description", "")
                output.append(f"- {course['code']} {course['title']} ({course['credits']}) - {level_desc}")

        output.append("")

    if grouped["option_group"]:
        output.append("**Option Groups:**")
        output.append("Students must select courses from the following groups according to program requirements:")
        _append_level_grouped(output, grouped["option_group"])
        output.append("")

    if grouped["elective_courses"]:
        output.append("**Elective Courses:**")
        output.append("Students may choose from these optional courses to fulfill elective requirements:")
        _append_level_grouped(output, grouped["elective_courses"])

    return "\n".join(output)
|
|
def process_program_query(query, program_name=None):
    """
    Check if the query is about program requirements or courses and extract program name.

    Args:
        query (str): The user's query
        program_name (str, optional): Pre-identified program name

    Returns:
        dict: Information about the query intent and program; query_type is
        "invalid" for non-string input, otherwise "general",
        "course_requirements", or "program_requirements".
    """
    # Single consolidated debug log (previously duplicated, with a stale
    # hard-coded line-number reference).
    logger.info(f"[process_program_query] Got query: {repr(query)} | Type: {type(query)}")

    # Defensive guard: the regex matching below requires a string.
    if not isinstance(query, str):
        logger.warning(f"Query is not a string! Got {type(query)}: {repr(query)}")
        return {
            "is_course_query": False,
            "course_type": None,
            "program_name": program_name,
            "query_type": "invalid"
        }

    query_lower = query.lower()
    result = {
        "is_course_query": False,
        "course_type": None,
        "program_name": program_name,
        "query_type": "general"
    }

    # Ordered from most to least specific; the first matching pattern wins.
    course_query_patterns = [
        # "what are the required courses for X" -> (course_type, program)
        r'what(?:\s+are)?(?:\s+the)?\s+(required|core|elective|optional|must[\s-]complete)\s+courses\s+for\s+(?:the\s+)?(.+?)(?:\s+program|\s+major|\s+degree|\s+minor)?$',
        # "the X program requirements/courses" -> (program,)
        r'(?:the\s+)?(.+?)(?:\s+program|\s+major|\s+degree|\s+minor)(?:\s+requirements|(?:\s+)courses)',
        # "what courses do I need to take for X" -> (program,)
        r'what\s+courses\s+(?:do\s+I|does\s+one|should\s+I)\s+(?:need\s+to|have\s+to|must)\s+(?:take|complete)\s+for\s+(?:the\s+)?(.+?)(?:\s+program|\s+major|\s+degree|\s+minor)?',
        # "what do I have to complete for X" -> (program,)
        r'what(?:\s+courses)?\s+(?:do\s+I|does\s+one|should\s+I)\s+(?:have\s+to|need\s+to|must)\s+complete\s+for\s+(?:the\s+)?(.+?)(?:\s+program|\s+major|\s+degree|\s+minor)?'
    ]

    for pattern in course_query_patterns:
        match = re.search(pattern, query_lower)
        if not match:
            continue

        result["is_course_query"] = True

        if len(match.groups()) > 1:
            # Two-group pattern: first group is the course-type qualifier,
            # second is the program name.
            course_type = match.group(1)
            result["program_name"] = match.group(2)

            if course_type in ['required', 'core', 'must-complete', 'must complete']:
                result["course_type"] = 'required_courses'
            elif course_type in ['elective', 'optional']:
                result["course_type"] = 'elective_courses'
            else:
                result["course_type"] = 'all'

            result["query_type"] = "course_requirements"
        else:
            # Single-group pattern: only the program name is captured.
            result["program_name"] = match.group(1)
            result["course_type"] = 'all'
            result["query_type"] = "program_requirements"
        break

    return result
|
|
def expand_query_with_academic_terms(query):
    """
    Expand the query with alternate academic terminology to improve retrieval.

    This function identifies key terms in the query and adds synonyms/alternate
    phrasings that are common in academic contexts, focusing especially on
    course requirement terminology.

    Args:
        query (str): The original user query

    Returns:
        str: Expanded query with alternate terminology (unchanged when no
        trigger term is present)
    """
    # Trigger term -> synonyms appended to the query when the trigger appears.
    synonym_table = {
        "required": ["must complete", "must take", "mandatory", "core", "required", "requirement", "capstone"],
        "elective": ["optional", "elective", "choice", "select from"],
        "prerequisite": ["prereq", "prerequisite", "before taking", "prior to"],
        "corequisite": ["coreq", "corequisite", "concurrent", "alongside"],
        "credit": ["credit hour", "credit", "unit"],
        "major": ["major", "program", "degree", "concentration"],
        "minor": ["minor", "secondary field"],
        "course": ["course", "class", "subject"]
    }

    logger.info(f"Processing query for mapped terms: {repr(query)}")
    lowered = query.lower()

    # Collect the synonym lists for every trigger found in the query.
    extra_terms = []
    for trigger, synonyms in synonym_table.items():
        if trigger in lowered:
            extra_terms.extend(synonyms)

    if not extra_terms:
        return query

    return f"{query} {' '.join(extra_terms)}"
|
|
def get_program_courses(program_name, course_type='all', n_results=10):
    """
    Get specific course information for a program based on course type.

    Args:
        program_name (str): Name of the academic program
        course_type (str): Type of courses to retrieve ('required_courses',
                          'elective_courses', 'option_group', 'small_option_group', or 'all')
        n_results (int): Number of results to return

    Returns:
        dict: Results containing course information
    """
    chroma_manager = global_chroma_manager

    # Build metadata conditions as a flat list, then combine them once.
    # ChromaDB "where" filters must have exactly one top-level field or
    # operator, so a dict mixing "$or"/"$and"/plain keys (as the previous
    # in-place merge produced) is rejected by the backend.
    if course_type == 'all':
        section_condition = {
            "$or": [
                {"section_type": "required_courses"},
                {"section_type": "elective_courses"},
                {"section_type": "option_group"},
                {"section_type": "small_option_group"}
            ]
        }
    else:
        section_condition = {"section_type": course_type}

    conditions = [section_condition, {"type": "program"}]

    if program_name and program_name.lower() != "any":
        query = f"{course_type} for {program_name} program"
        # NOTE(review): "$contains" is a document-content operator in stock
        # ChromaDB, not a metadata-where operator — confirm the chroma_utils
        # wrapper translates this before it reaches the client.
        conditions.append({
            "$or": [
                {"program_name": {"$contains": program_name.lower()}},
                {"parent_title": {"$contains": program_name.lower()}}
            ]
        })
    else:
        query = f"{course_type}"

    where_clause = conditions[0] if len(conditions) == 1 else {"$and": conditions}

    # Enrich the query text with synonymous academic phrasing.
    expanded_query = expand_query_with_academic_terms(query)

    results = chroma_manager.query(
        query_text=expanded_query,
        where=where_clause,
        n_results=n_results
    )

    return results
|
|
def get_program_course_information(program_name, course_type='all'):
    """
    Get formatted course information for a program.

    Args:
        program_name (str): Name of the academic program
        course_type (str): Type of courses to retrieve

    Returns:
        str: Formatted course information
    """
    # Retrieve a generous number of candidates, pull out the course entries,
    # then render them grouped by requirement type and level.
    raw_results = get_program_courses(program_name, course_type, n_results=15)
    extracted = extract_courses_from_results(raw_results)
    return format_courses_for_display(extracted)
|
|
| |
def extract_validated_program_requirements(soup, program_name, department, url, debug_mode=False):
    """
    Extract program requirements with strict validation to avoid mixing electives with requirements.
    Carefully differentiates between similarly named programs.

    Args:
        soup (BeautifulSoup): Parsed HTML content
        program_name (str): Name of the program
        department (str): Department name
        url (str): URL of the page
        debug_mode (bool): Whether to log debug information

    Returns:
        dict: Validated program requirements
    """
    logger.info(f"Extracting validated requirements for: {program_name}")

    # Result skeleton; sections are appended as they are discovered below.
    requirements = {
        "program_name": program_name,
        "department": department,
        "url": url,
        "core_requirements": [],
        "electives": [],
        "capstone": None,
        "total_credits": 0
    }

    # Disambiguate similarly named programs (e.g. "Data Science" vs
    # "Data Sciences") by exact comparison of the normalized name.
    normalized_program = program_name.lower().strip()

    if normalized_program == "bs data science" or normalized_program == "b.s. data science":
        program_type = "BS_DATA_SCIENCE"
    elif normalized_program == "bs data sciences" or normalized_program == "b.s. data sciences":
        program_type = "BS_DATA_SCIENCES"
    elif normalized_program == "ms data science" or normalized_program == "m.s. data science":
        program_type = "MS_DATA_SCIENCE"
    elif normalized_program == "ms data sciences" or normalized_program == "m.s. data sciences":
        program_type = "MS_DATA_SCIENCES"
    else:
        # Any other program gets generic (non-validated) treatment.
        program_type = "OTHER"

    requirements["program_type"] = program_type

    if debug_mode:
        logger.debug(f"Identified program type: {program_type}")

    requirement_sections = []

    # Locate headers whose text suggests a requirements-related section.
    requirement_headers = soup.find_all(['h2', 'h3', 'h4'], string=lambda text: text and any(keyword in text.lower()
        for keyword in ['requirement', 'core', 'foundation', 'required', 'curriculum',
                        'major', 'course', 'capstone', 'thesis', 'project', 'elective']))

    for header in requirement_headers:
        section_title = header.get_text(strip=True)
        section_content = []

        # Collect sibling nodes until the next header of equal rank.
        current = header.next_sibling
        while current and not (hasattr(current, 'name') and current.name in ['h2', 'h3', 'h4']):
            if hasattr(current, 'get_text'):
                text = current.get_text(strip=True)
                if text:
                    section_content.append(text)
            elif isinstance(current, str) and current.strip():
                section_content.append(current.strip())

            current = current.next_sibling

        if section_content:
            section_text = ' '.join(section_content)

            section_type = "unknown"

            # Capstone/thesis sections are checked first so their keywords
            # take precedence over the generic "requirement" keywords below.
            if any(keyword in section_title.lower() for keyword in ['capstone', 'thesis', 'project', 'senior']):
                section_type = "capstone"
                requirements["capstone"] = {
                    "title": section_title,
                    "content": section_text,
                    "courses": extract_course_codes(section_text)
                }

                if program_type == "BS_DATA_SCIENCE":
                    # For BS Data Science the capstone is known to be
                    # STAT-427; validate the section against that fact.
                    if "stat-427" in section_text.lower() or "stat 427" in section_text.lower():
                        requirements["capstone"]["validated"] = True
                        requirements["capstone"]["credits"] = 3
                        requirements["capstone"]["course_title"] = "Statistical Machine Learning"
                    else:
                        requirements["capstone"]["validated"] = False
                else:
                    # No reference capstone known for other programs.
                    requirements["capstone"]["validated"] = True

            elif any(keyword in section_title.lower() for keyword in ['elective', 'optional', 'choose']):
                section_type = "electives"
                requirements["electives"].append({
                    "title": section_title,
                    "content": section_text,
                    "courses": extract_course_codes(section_text)
                })

            elif any(keyword in section_title.lower() for keyword in ['requirement', 'core', 'required', 'foundation']):
                section_type = "core"
                requirements["core_requirements"].append({
                    "title": section_title,
                    "content": section_text,
                    "courses": extract_course_codes(section_text)
                })

            requirement_sections.append({
                "title": section_title,
                "content": section_text,
                "type": section_type
            })

    # Pull the total credit count from the page body, trying several phrasings;
    # the first pattern that yields an integer wins.
    credit_patterns = [
        r'total\s+of\s+(\d+)\s+credit',
        r'(\d+)\s+credits?\s+(?:are|is)\s+required',
        r'requires\s+(\d+)\s+credits?',
        r'minimum\s+of\s+(\d+)\s+credits?'
    ]

    full_text = soup.get_text()
    for pattern in credit_patterns:
        match = re.search(pattern, full_text, re.IGNORECASE)
        if match:
            try:
                requirements["total_credits"] = int(match.group(1))
                break
            except ValueError:
                pass

    if program_type == "BS_DATA_SCIENCE":
        # Cross-check the extracted core sections against the expected core
        # course list and record the fraction found as "core_coverage".
        expected_core_courses = [
            "MATH-221", "MATH-222", "MATH-313", "STAT-203", "STAT-302",
            "CSC-280", "DATA-320", "STAT-412", "STAT-415"
        ]

        found_courses = []
        for section in requirements["core_requirements"]:
            for course in section["courses"]:
                course_clean = clean_course_code(course)
                if course_clean in expected_core_courses and course_clean not in found_courses:
                    found_courses.append(course_clean)

        missing_courses = [c for c in expected_core_courses if c not in found_courses]
        requirements["core_coverage"] = len(found_courses) / len(expected_core_courses)

        if debug_mode:
            logger.debug(f"Found {len(found_courses)}/{len(expected_core_courses)} expected core courses")
            if missing_courses:
                logger.debug(f"Missing core courses: {', '.join(missing_courses)}")

    elif program_type == "MS_DATA_SCIENCE":
        # TODO: add validation against the MS core once the expected list is known.
        pass

    logger.info(f"Extracted {len(requirements['core_requirements'])} core requirement sections, {len(requirements['electives'])} elective sections")

    return requirements
|
|
def extract_course_codes(text):
    """Return every course code found in *text*, normalized to DEPT-NUM form."""
    # 2-4 letter department prefix, optional space/hyphen separator, 3-4
    # digit number with an optional trailing letter (e.g. "427W").
    code_re = re.compile(r'([A-Z]{2,4})[\s\-]?(\d{3,4}[A-Z]?)', re.IGNORECASE)
    return [f"{dept.upper()}-{num}" for dept, num in code_re.findall(text)]
|
|
def clean_course_code(course_code):
    """Standardize course code format to DEPT-NUM."""
    m = re.match(r'([A-Z]{2,4})[\s\-]?(\d{3,4}[A-Z]?)', course_code, re.IGNORECASE)
    # Fall back to the input unchanged when it does not look like a code.
    return f"{m.group(1).upper()}-{m.group(2)}" if m else course_code
|
|
| |
def retrieve_validated_program_requirements(chroma_manager, program_name, debug_mode=False):
    """
    Retrieve and validate program requirements from ChromaDB.

    Tries a pre-built "program_summary" document first; when none exists,
    falls back to assembling the requirements from individual program
    section documents.

    Args:
        chroma_manager: ChromaDB manager instance
        program_name (str): Name of the program to retrieve
        debug_mode (bool): Whether to log debug information

    Returns:
        dict: Validated program requirements, or None if nothing was found
    """
    # Disambiguate similarly named programs by exact normalized-name match
    # (kept in sync with extract_validated_program_requirements).
    normalized_program = program_name.lower().strip()

    if normalized_program == "bs data science" or normalized_program == "b.s. data science":
        program_type = "BS_DATA_SCIENCE"
    elif normalized_program == "bs data sciences" or normalized_program == "b.s. data sciences":
        program_type = "BS_DATA_SCIENCES"
    elif normalized_program == "ms data science" or normalized_program == "m.s. data science":
        program_type = "MS_DATA_SCIENCE"
    elif normalized_program == "ms data sciences" or normalized_program == "m.s. data sciences":
        program_type = "MS_DATA_SCIENCES"
    else:
        # Any other program gets generic treatment.
        program_type = "OTHER"

    if debug_mode:
        logger.debug(f"Retrieving requirements for: {program_name} (Type: {program_type})")

    query = f"requirements for {program_name}"

    # First attempt: a single consolidated "program_summary" document.
    summary_results = chroma_manager.query(
        query_text=query,
        n_results=5,
        metadata_filter={"program_name": program_name, "type": "program", "section_type": "program_summary"}
    )

    if summary_results and len(summary_results['ids']) > 0:
        if debug_mode:
            logger.debug(f"Found program summary for {program_name}")

        # NOTE(review): elsewhere in this module query results are indexed as
        # documents[0][i] (nested lists); here documents[0] is treated as the
        # summary text itself — confirm the chroma_utils wrapper flattens
        # results for metadata_filter queries.
        summary_text = summary_results['documents'][0]

        requirements = {
            "program_name": program_name,
            "program_type": program_type,
            "department": summary_results['metadatas'][0].get('department', 'Unknown Department'),
            "core_requirements": [],
            "electives": [],
            "capstone": None
        }

        # The summary document uses literal "REQUIRED COURSES" /
        # "ELECTIVE COURSES" headings as section delimiters.
        if "REQUIRED COURSES" in summary_text:
            core_section = summary_text.split("REQUIRED COURSES")[1].split("ELECTIVE COURSES")[0] if "ELECTIVE COURSES" in summary_text else summary_text.split("REQUIRED COURSES")[1]
            requirements["core_requirements"] = [{
                "title": "Major Requirements",
                "content": core_section,
                "courses": extract_course_codes(core_section)
            }]

        if "ELECTIVE COURSES" in summary_text:
            elective_section = summary_text.split("ELECTIVE COURSES")[1]
            requirements["electives"] = [{
                "title": "Elective Courses",
                "content": elective_section,
                "courses": extract_course_codes(elective_section)
            }]

        return requirements

    # Fallback: gather individual program section documents.
    section_results = chroma_manager.query(
        query_text=query,
        n_results=10,
        metadata_filter={"program_name": program_name, "type": "program"}
    )

    if not section_results or len(section_results['ids']) == 0:
        logger.warning(f"No results found for {program_name} requirements")
        return None

    requirements = {
        "program_name": program_name,
        "program_type": program_type,
        "department": section_results['metadatas'][0].get('department', 'Unknown Department'),
        "core_requirements": [],
        "electives": [],
        "capstone": None
    }

    # Route each retrieved section into core / electives / capstone buckets
    # based on its stored section_type (or title keywords for capstones).
    for i, doc in enumerate(section_results['documents']):
        metadata = section_results['metadatas'][i]
        section_type = metadata.get('section_type', 'unknown')
        title = metadata.get('title', f"Section {i+1}")

        if section_type in ['required_courses', 'option_group']:
            requirements["core_requirements"].append({
                "title": title,
                "content": doc,
                "courses": extract_course_codes(doc)
            })
        elif section_type == 'elective_courses':
            requirements["electives"].append({
                "title": title,
                "content": doc,
                "courses": extract_course_codes(doc)
            })
        elif "capstone" in title.lower() or "senior" in title.lower():
            requirements["capstone"] = {
                "title": title,
                "content": doc,
                "courses": extract_course_codes(doc)
            }

    return requirements
|
|
| |
def generate_accurate_requirements_response(requirements, program_name):
    """
    Generate an accurate response about program requirements.
    Enhanced to handle the updated classification where required electives and minors
    are properly included in the required_courses category.

    Args:
        requirements (dict): Validated program requirements
        program_name (str): Name of the program

    Returns:
        str: Formatted response with accurate requirements
    """
    if not requirements:
        return f"I'm sorry, but I couldn't find specific requirements for the {program_name} program. Please check the department website for the most up-to-date information."

    lines = [f"# {program_name} Requirements", ""]

    def emit_section_body(section):
        # Prefer the extracted course list; fall back to the raw section text.
        if section.get("courses"):
            for course_code in section["courses"]:
                lines.append(f"- {course_code}")
        else:
            lines.append(section["content"])

    if requirements.get("department"):
        lines.extend([f"**Department:** {requirements['department']}", ""])

    if requirements.get("total_credits"):
        lines.extend([f"**Total Credits Required:** {requirements['total_credits']}", ""])

    core_sections = requirements.get("core_requirements")
    if core_sections:
        lines.append("## Core Requirements")
        shown_titles = set()
        for section in core_sections:
            # Skip duplicate section titles.
            if section['title'] in shown_titles:
                continue
            lines.append(f"**{section['title']}**")
            shown_titles.add(section['title'])
            emit_section_body(section)
            lines.append("")

    capstone = requirements.get("capstone")
    if capstone:
        lines.append("## Capstone Experience")
        lines.append(f"**{capstone['title']}**")
        # The BS Data Science capstone gets a fixed, validated description.
        if requirements.get("program_type", "OTHER") == "BS_DATA_SCIENCE" and capstone.get("validated", False):
            lines.append("**STAT-427: Statistical Machine Learning (3 credits)**")
            lines.append("This course serves as the capstone experience for the Data Science program.")
        elif capstone.get("courses"):
            for course_code in capstone["courses"]:
                lines.append(f"- {course_code}")
        else:
            lines.append(capstone["content"])
        lines.append("")

    # A standalone minor section is only shown when no core section already
    # covers the minor requirement.
    minor = requirements.get("minor_requirement")
    if minor and not any(
        "minor" in section['title'].lower() for section in requirements.get("core_requirements", [])
    ):
        lines.append("## Minor or Second Major Requirement")
        lines.append(f"**{minor['title']}**")
        lines.append(minor["content"])
        lines.append("")

    # Gather elective-flavored sections from both buckets, de-duplicated by title.
    elective_sections = []
    seen_titles = set()
    for section in requirements.get("core_requirements") or []:
        if 'elective' in section['title'].lower() and section['title'] not in seen_titles:
            elective_sections.append(section)
            seen_titles.add(section['title'])
    for section in requirements.get("electives") or []:
        if section['title'] not in seen_titles:
            elective_sections.append(section)
            seen_titles.add(section['title'])

    if elective_sections:
        lines.append("## Elective Requirements")
        for section in elective_sections:
            lines.append(f"**{section['title']}**")
            emit_section_body(section)
            lines.append("")

    if requirements.get("option_groups"):
        lines.append("## Option Groups")
        for section in requirements["option_groups"]:
            lines.append(f"**{section['title']}**")
            emit_section_body(section)
            lines.append("")

    lines.append("*Note: These requirements are subject to change. Please consult with an academic advisor or refer to the official program documentation for the most current information.*")

    return "\n".join(lines)
|
|
| |
| |
| |
| |
|
|
| class AcademicChatbot: |
| """ |
| A RAG-based chatbot for answering questions about academic programs and courses |
| using Mistral 7B model and ChromaDB for retrieval. |
| """ |
| |
| |
    def __init__(self):
        """Initialize the chatbot with ChromaDB and model configuration.

        Raises:
            ValueError: if Mistral API headers were never configured (i.e.
                module-level credential setup failed).
        """
        # Share the module-level Chroma manager instead of building a new one.
        self.chroma_manager = global_chroma_manager
        self.collection = self.chroma_manager.get_collection()

        # Credentials were resolved once at module import time.
        self.api_url = MISTRAL_API_URL
        self.headers = MISTRAL_HEADERS
        self.conversation_history = []

        # Fail fast when authentication never succeeded.
        if not self.headers:
            logger.warning("Mistral API headers not properly configured. Regenerate API credentials.")
            raise ValueError("Failed to initialize Mistral API headers. Check API key configuration.")
| |
| def add_message(self, role: str, content: str): |
| """Add a message to the conversation history.""" |
| self.conversation_history.append({"role": role, "content": content}) |
| |
| def clear_history(self): |
| """Clear the conversation history.""" |
| self.conversation_history = [] |
| |
| def get_history(self): |
| """Get the conversation history.""" |
| return self.conversation_history |
| |
| def get_url_from_metadata(self, metadata): |
| """Extract URL from metadata, checking multiple possible field names.""" |
| |
| url_field_names = ['url', 'course_url', 'source_url', 'link', 'href', 'source'] |
| |
| for field in url_field_names: |
| if field in metadata and metadata[field]: |
| return metadata[field] |
| |
| |
| return '' |
| |
| def retrieve_context(self, query: str, n_results: int = 8) -> Tuple[List[str], List[Dict[str, Any]]]: |
| """ |
| Retrieve diverse and relevant documents from ChromaDB based on the query. |
| |
| Args: |
| query: The user's question |
| n_results: Number of documents to retrieve |
| |
| Returns: |
| Tuple containing (contexts, metadata) |
| """ |
| logger.info(f"Retrieving context for query: {query}") |
| |
| |
| expanded_query = expand_query_with_academic_terms(query) |
| logger.info(f"Expanded query: {expanded_query}") |
| |
| |
| retrieve_count = min(n_results * 3, 25) |
| results = self.chroma_manager.query(expanded_query, n_results=retrieve_count) |
| |
| |
| contexts = [] |
| metadata_list = [] |
| |
| if 'documents' in results and results['documents']: |
| documents = results['documents'][0] |
| metadatas = results['metadatas'][0] if 'metadatas' in results and results['metadatas'] else [{}] * len(documents) |
| |
| |
| seen_urls = set() |
| seen_titles = set() |
| |
| |
| doc_groups = {} |
| for doc, meta in zip(documents, metadatas): |
| url = meta.get('url', '') if meta else '' |
| title = meta.get('title', '') if meta else '' |
| key = (url, title) |
| |
| if key not in doc_groups: |
| doc_groups[key] = [] |
| |
| doc_groups[key].append((doc, meta)) |
| |
| |
| while len(contexts) < n_results and doc_groups: |
| for key in list(doc_groups.keys()): |
| if doc_groups[key]: |
| doc, meta = doc_groups[key].pop(0) |
| contexts.append(doc) |
| metadata_list.append(meta) |
| |
| if not doc_groups[key]: |
| del doc_groups[key] |
| |
| if len(contexts) >= n_results: |
| break |
| |
| |
| if len(contexts) < n_results: |
| i = 0 |
| while len(contexts) < n_results and i < len(documents): |
| if documents[i] not in contexts: |
| contexts.append(documents[i]) |
| metadata_list.append(metadatas[i]) |
| i += 1 |
| |
| logger.info(f"Retrieved {len(contexts)} context documents") |
| |
| return contexts, metadata_list |
|
|
| def merge_program_documents(self, docs, metas, max_chars=15000): |
| """Merge documents by category to create comprehensive context.""" |
| |
| categories = { |
| "comprehensive": {"content": "", "sources": []}, |
| "core": {"content": "", "sources": []}, |
| "electives": {"content": "", "sources": []}, |
| "minor": {"content": "", "sources": []}, |
| "capstone": {"content": "", "sources": []}, |
| "ethics": {"content": "", "sources": []}, |
| "admission": {"content": "", "sources": []}, |
| "au_core": {"content": "", "sources": []}, |
| "university_requirements": {"content": "", "sources": []}, |
| "major_requirements": {"content": "", "sources": []}, |
| "other": {"content": "", "sources": []} |
| } |
| |
| |
| for i, (doc, meta) in enumerate(zip(docs, metas)): |
| title = meta.get("title", "").lower() if meta else "" |
| |
| |
| if "complete" in title and "requirements" in title: |
| category = "comprehensive" |
| elif "elective" in title: |
| category = "electives" |
| elif "minor" in title or "second major" in title: |
| category = "minor" |
| elif "capstone" in title: |
| category = "capstone" |
| elif "ethics" in title: |
| category = "ethics" |
| elif "admission" in title or "apply" in title: |
| category = "admission" |
| elif "au core" in title or "general education" in title: |
| category = "au_core" |
| elif "university requirement" in title: |
| category = "university_requirements" |
| elif "major requirement" in title: |
| category = "major_requirements" |
| elif any(term in title for term in ["statistics", "data science essentials", "intermediate"]): |
| category = "core" |
| else: |
| category = "other" |
| |
| |
| categories[category]["content"] += f"\n\n## {meta.get('title', '')}\n{doc}" |
| categories[category]["sources"].append(i) |
| |
| |
| output_docs = [] |
| output_metas = [] |
| source_indices = set() |
| |
| |
| if categories["comprehensive"]["content"]: |
| output_docs.append(categories["comprehensive"]["content"]) |
| output_metas.append({"title": "Complete Program Requirements"}) |
| source_indices.update(categories["comprehensive"]["sources"]) |
| |
| |
| general_content = "# General Program Requirements\n" |
| general_sources = [] |
| |
| |
| for cat_name, display_name in [ |
| ("university_requirements", "University Requirements"), |
| ("au_core", "AU Core Requirements"), |
| ("admission", "Admission Requirements") |
| ]: |
| if categories[cat_name]["content"]: |
| general_content += f"\n\n# {display_name}\n{categories[cat_name]['content']}" |
| general_sources.extend(categories[cat_name]["sources"]) |
| |
| |
| if general_content.strip() != "# General Program Requirements": |
| output_docs.append(general_content) |
| output_metas.append({"title": "General Requirements"}) |
| source_indices.update(general_sources) |
| |
| |
| major_content = "# Major Requirements\n" |
| |
| |
| if categories["major_requirements"]["content"]: |
| major_content += categories["major_requirements"]["content"] |
| |
| |
| if categories["core"]["content"]: |
| major_content += "\n\n# Core Course Requirements\n" + categories["core"]["content"] |
| |
| |
| if major_content.strip() != "# Major Requirements": |
| output_docs.append(major_content) |
| output_metas.append({"title": "Major Requirements"}) |
| source_indices.update(categories["major_requirements"]["sources"]) |
| source_indices.update(categories["core"]["sources"]) |
| |
| |
| additional_content = "# Additional Program Requirements\n" |
| additional_sources = [] |
| |
| |
| for cat_name, display_name in [ |
| ("electives", "Elective Requirements"), |
| ("minor", "Minor or Second Major Requirements"), |
| ("capstone", "Capstone Requirements"), |
| ("ethics", "Ethics Requirements") |
| ]: |
| if categories[cat_name]["content"]: |
| additional_content += f"\n\n# {display_name}\n{categories[cat_name]['content']}" |
| additional_sources.extend(categories[cat_name]["sources"]) |
| |
| |
| if additional_content.strip() != "# Additional Program Requirements": |
| output_docs.append(additional_content) |
| output_metas.append({"title": "Additional Requirements"}) |
| source_indices.update(additional_sources) |
| |
| |
| total_chars = sum(len(doc) for doc in output_docs) |
| |
| |
| if categories["other"]["content"] and total_chars + len(categories["other"]["content"]) <= max_chars: |
| other_content = "# Other Program Information\n" + categories["other"]["content"] |
| output_docs.append(other_content) |
| output_metas.append({"title": "Other Information"}) |
| source_indices.update(categories["other"]["sources"]) |
| |
| |
| all_sources = [] |
| for i in range(len(metas)): |
| all_sources.append(metas[i]) |
| |
| logger.info(f"Merged {len(docs)} documents into {len(output_docs)} comprehensive documents (Total chars: {sum(len(d) for d in output_docs)})") |
| |
| return output_docs, all_sources |
|
|
| |
| def trim_documents(self, docs, metas, max_chars=12000): |
| """Trim documents to avoid token overload while ensuring all requirements are included.""" |
| output_docs, output_metas = [], [] |
| total_chars = 0 |
| |
| |
| query = getattr(self, "current_query", None) |
| query_info = process_program_query(query) if isinstance(query, str) else None |
| if query_info: |
| logger.info(f"[trim_documents] query_info: {query_info} | program_name: {query_info.get('program_name')}") |
| program_name = (query_info.get("program_name") or "").lower() if query_info else "" |
| |
| |
| if program_name: |
| |
| comprehensive_index = None |
| for i, meta in enumerate(metas): |
| title = meta.get("title", "").lower() if meta else "" |
| if "complete" in title and "requirement" in title and program_name in meta.get("program_name", "").lower(): |
| comprehensive_index = i |
| break |
| |
| if comprehensive_index is not None and total_chars + len(docs[comprehensive_index]) <= max_chars: |
| output_docs.append(docs[comprehensive_index]) |
| output_metas.append(metas[comprehensive_index]) |
| total_chars += len(docs[comprehensive_index]) |
| |
| |
| for i, meta in enumerate(metas): |
| |
| if i == comprehensive_index: |
| continue |
| |
| |
| is_required = meta.get("section_type", "") == "required_courses" |
| is_this_program = program_name in meta.get("program_name", "").lower() |
| |
| |
| if is_required and is_this_program and total_chars + len(docs[i]) <= max_chars: |
| output_docs.append(docs[i]) |
| output_metas.append(metas[i]) |
| total_chars += len(docs[i]) |
| |
| |
| has_minor = any("minor" in meta.get("title", "").lower() or "second major" in meta.get("title", "").lower() |
| for meta in output_metas) |
| |
| if not has_minor: |
| for i, meta in enumerate(metas): |
| if "minor" in meta.get("title", "").lower() or "second major" in meta.get("title", "").lower(): |
| if total_chars + len(docs[i]) <= max_chars: |
| output_docs.append(docs[i]) |
| output_metas.append(metas[i]) |
| total_chars += len(docs[i]) |
| break |
| |
| |
| has_capstone = any("capstone" in meta.get("title", "").lower() for meta in output_metas) |
| |
| if not has_capstone: |
| for i, meta in enumerate(metas): |
| if "capstone" in meta.get("title", "").lower(): |
| if total_chars + len(docs[i]) <= max_chars: |
| output_docs.append(docs[i]) |
| output_metas.append(metas[i]) |
| total_chars += len(docs[i]) |
| break |
| |
| |
| has_electives = any("elective" in meta.get("title", "").lower() for meta in output_metas) |
| |
| if not has_electives: |
| for i, meta in enumerate(metas): |
| if "elective" in meta.get("title", "").lower(): |
| if total_chars + len(docs[i]) <= max_chars: |
| output_docs.append(docs[i]) |
| output_metas.append(metas[i]) |
| total_chars += len(docs[i]) |
| break |
| |
| |
| |
| if not output_docs: |
| for doc, meta in zip(docs, metas): |
| |
| if len(output_docs) == 0 or total_chars + len(doc) <= max_chars: |
| output_docs.append(doc) |
| output_metas.append(meta) |
| total_chars += len(doc) |
| else: |
| break |
| |
| logger.info(f"Trimmed documents from {len(docs)} to {len(output_docs)} (Total chars: {total_chars})") |
| return output_docs, output_metas |
|
|
| def generate_response(self, query: str, contexts: List[str], |
| metadata: List[Dict[str, Any]], temperature: float = 0.7) -> str: |
| """ |
| Generate a response using Mistral 7B with retrieved contexts. |
| |
| Args: |
| query: The user's question |
| contexts: Retrieved document contents |
| metadata: Metadata for the retrieved documents |
| temperature: Controls randomness in generation |
| |
| Returns: |
| Generated response |
| """ |
| logger.info(f"Generating response for query: {query}") |
|
|
| |
| self.current_query = query |
| if not isinstance(query, str) or not query.strip(): |
| logger.warning("Query is missing or not a string.") |
| return "No query provided." |
| |
| |
| query_info = process_program_query(query) |
| |
| if query_info["is_course_query"] and query_info["program_name"]: |
| logger.info(f"Detected course query for program: {query_info['program_name']}, type: {query_info['course_type']}") |
| |
| |
| try: |
| |
| requirements = retrieve_validated_program_requirements( |
| self.chroma_manager, |
| query_info["program_name"], |
| debug_mode=False |
| ) |
| |
| |
| if requirements: |
| logger.info(f"Using validated requirements for {query_info['program_name']}") |
| response = generate_accurate_requirements_response( |
| requirements, |
| query_info["program_name"] |
| ) |
| |
| |
| sources = [] |
| for i, meta in enumerate(metadata): |
| if meta: |
| title = meta.get("title", "") |
| url = self.get_url_from_metadata(meta) |
| |
| if url: |
| if title: |
| citation = f"[{i+1}] {title} - {url}" |
| else: |
| citation = f"[{i+1}] Program information - {url}" |
| else: |
| if title: |
| citation = f"[{i+1}] {title}" |
| else: |
| citation = f"[{i+1}] Program information" |
| |
| sources.append(citation) |
| |
| if sources: |
| |
| used_source_indexes = set() |
| for i in range(len(sources)): |
| |
| if f"[{i+1}]" in response: |
| used_source_indexes.add(i) |
| |
| |
| if used_source_indexes: |
| response += "\n\nSources Referenced in Response:" |
| for i in sorted(used_source_indexes): |
| response += f"\n{sources[i]}" |
| |
| |
| response += "\n\nAll Retrieved Sources:" |
| for source in sources: |
| response += f"\n{source}" |
| |
| return response |
| |
| except Exception as e: |
| logger.error(f"Error using validated requirements approach: {str(e)}") |
| |
| |
| |
| try: |
| program_courses = get_program_course_information( |
| query_info["program_name"], |
| query_info["course_type"] |
| ) |
| |
| |
| if program_courses and "No courses found" not in program_courses: |
| program_name = query_info["program_name"].title() |
| |
| |
| response = f"Here's information about the {program_name} program courses:\n\n{program_courses}" |
| |
| |
| sources = [] |
| for i, meta in enumerate(metadata): |
| if meta: |
| title = meta.get("title", "") |
| url = self.get_url_from_metadata(meta) |
| |
| if url: |
| if title: |
| citation = f"[{i+1}] {title} - {url}" |
| else: |
| citation = f"[{i+1}] Program information - {url}" |
| else: |
| if title: |
| citation = f"[{i+1}] {title}" |
| else: |
| citation = f"[{i+1}] Program information" |
| |
| sources.append(citation) |
| |
| if sources: |
| |
| used_source_indexes = set() |
| for i in range(len(sources)): |
| |
| if f"[{i+1}]" in response: |
| used_source_indexes.add(i) |
| |
| |
| if used_source_indexes: |
| response += "\n\nSources Referenced in Response:" |
| for i in sorted(used_source_indexes): |
| response += f"\n{sources[i]}" |
| |
| |
| response += "\n\nAll Retrieved Sources:" |
| for source in sources: |
| response += f"\n{source}" |
| |
| return response |
| except Exception as e: |
| logger.error(f"Error handling specialized course query: {str(e)}") |
| |
| |
| |
| |
| if query_info["is_course_query"] and query_info["program_name"]: |
| contexts, metadata = self.merge_program_documents(contexts, metadata, max_chars=12000) |
| else: |
| |
| contexts, metadata = self.trim_documents(contexts, metadata, max_chars=10000) |
| |
| |
| enhanced_contexts = [] |
| for i, (doc, meta) in enumerate(zip(contexts, metadata)): |
| source_type = meta.get("type", "document") |
| title = meta.get("title", "") |
| url = self.get_url_from_metadata(meta) |
| |
| |
| doc_preview = doc[:1500] + ("..." if len(doc) > 1500 else "") |
| |
| |
| doc_header = f"Document {i+1} ({source_type.capitalize()}" |
| if title: |
| doc_header += f": {title}" |
| if url: |
| doc_header += f" - {url}" |
| doc_header += "):" |
| |
| enhanced_contexts.append(f"{doc_header}\n{doc_preview}") |
| |
| |
| history_text = "" |
| if self.conversation_history: |
| recent_history = self.conversation_history[-3:] |
| if recent_history: |
| history_text = "### Recent Conversation:\n" |
| for msg in recent_history: |
| role = "User" if msg["role"] == "user" else "Assistant" |
| history_text += f"{role}: {msg['content']}\n\n" |
| |
| |
| context_text = "\n\n".join(enhanced_contexts) |
| prompt = f"""You are an AI assistant answering questions about American University's academic programs and courses. |
| Use the following documents as your primary source of information. |
| |
| Important rules: |
| - If the answer is not explicitly stated, you may reason from the information provided, but explain your reasoning. |
| - Courses marked as "must be completed", "prerequisites", or "required" are mandatory. |
| - When you see "one of the following" or "either X or Y", students must choose exactly one course from the options. |
| - When you see "option group", students must select some number of courses from that group. |
| - Courses listed as electives form a group from which a certain number must be completed, but not every course. |
| - Always mention the source document when including specific information. |
| - If you don't know or the information is not in the documents, be honest about it. |
| - For Data Science programs, STAT-427 (Statistical Machine Learning) is the 3-credit capstone course. |
| - Undergraduate courses have numbers 499 and below, graduate courses open to qualified undergraduates have numbers 500-599, |
| core graduate courses have numbers 600-699, and advanced graduate courses have numbers 700-799. |
| |
| {history_text if history_text else ""} |
| |
| ### Context: |
| {context_text} |
| |
| ### Question: |
| {query} |
| |
| """ |
| |
| |
| logger.info(f"Processing query in process_program_query instructions: {repr(query)}") |
| if isinstance(query, str) and ("course requirement" in query.lower() or "program requirement" in query.lower()): |
| prompt += """ |
| |
| IMPORTANT: Your response should include ALL required components for this degree program. |
| Ensure you cover all sections mentioned in the documents, including: |
| - All core course requirements with their credit hours |
| - Any elective requirements with credit hours |
| - Any minor or second major requirements |
| - Any capstone or project requirements |
| |
| Present requirements in a clear, organized format that makes the degree structure easy to understand. |
| DO NOT OMIT any requirements or sections mentioned in the documents. |
| """ |
|
|
| prompt += "\n\n### Answer:" |
| logger.info(f"Processed query in instructions: {repr(query)}") |
| |
| payload = { |
| "inputs": prompt, |
| "parameters": { |
| "max_new_tokens": 4000, |
| "temperature": temperature, |
| "top_p": 0.85, |
| "do_sample": True |
| } |
| } |
| |
| try: |
| response = requests.post(self.api_url, headers=self.headers, json=payload) |
| |
| if response.status_code == 200: |
| |
| generated_text = response.json()[0]["generated_text"] |
| answer = generated_text.split("### Answer:")[-1].strip() |
| |
| |
| |
| if "\n\nSources:" in answer: |
| answer = answer.split("\n\nSources:")[0].strip() |
| |
| |
| sources = [] |
| for i, meta in enumerate(metadata): |
| if meta: |
| source_type = meta.get("type", "document") |
| title = meta.get("title", "") |
| url = self.get_url_from_metadata(meta) |
| |
| |
| if url: |
| if title: |
| citation = f"[{i+1}] {title} - {url}" |
| else: |
| citation = f"[{i+1}] {source_type.capitalize()} - {url}" |
| else: |
| if title: |
| citation = f"[{i+1}] {title}" |
| else: |
| citation = f"[{i+1}] {source_type.capitalize()}" |
|
|
| sources.append(citation) |
| |
| |
| if sources: |
| |
| used_source_indexes = set() |
| for i in range(len(sources)): |
| |
| if f"[{i+1}]" in answer: |
| used_source_indexes.add(i) |
| |
| |
| if used_source_indexes: |
| answer += "\n\nSources Referenced in Response:" |
| for i in sorted(used_source_indexes): |
| answer += f"\n{sources[i]}" |
| |
| |
| answer += "\n\nAll Retrieved Sources:" |
| for source in sources: |
| answer += f"\n{source}" |
| |
| return answer |
| else: |
| error_msg = f"Error: {response.status_code}, {response.text}" |
| logger.error(error_msg) |
| return error_msg |
| |
| except Exception as e: |
| error_msg = f"Exception during response generation: {str(e)}" |
| logger.error(error_msg) |
| return error_msg |
| |
| def add_document(self, text: str, metadata: Dict[str, Any], doc_id: Optional[str] = None) -> str: |
| """Add a document to the ChromaDB collection.""" |
| return self.chroma_manager.add_document(text, metadata, doc_id) |
| |
| def get_collection_info(self) -> Dict[str, Any]: |
| """Get information about the ChromaDB collection.""" |
| return self.collection.get() |
| |
| def ask(self, query: str, n_results: int = 8, temperature: float = 0.7) -> Dict[str, Any]: |
| """ |
| Process a query and return a response with relevant context. |
| |
| Args: |
| query: The user's question |
| n_results: Number of documents to retrieve |
| temperature: Controls randomness in generation |
| |
| Returns: |
| Dictionary with response and context information |
| """ |
| |
| self.add_message("user", query) |
| |
| |
| contexts, metadata = self.retrieve_context(query, n_results) |
| |
| |
| if not contexts: |
| response = "I couldn't find any relevant information to answer your question. Could you please rephrase or ask about a different topic related to American University's programs or courses?" |
| else: |
| |
| response = self.generate_response(query, contexts, metadata, temperature) |
| |
| |
| if len(response) > 15000: |
| response = response[:14800] + "...\n\n[Response truncated due to length. Please ask for specific details if needed.]" |
| |
| |
| self.add_message("assistant", response) |
| |
| |
| return { |
| "response": response, |
| "contexts": contexts, |
| "metadata": metadata, |
| "history": self.conversation_history |
| } |
|
|
| |
def ask_question(query: str, n_results: int = 8, temperature: float = 0.7) -> Dict[str, Any]:
    """Ask a question to the chatbot."""
    # NOTE(review): this binding is dead code — an identically-named function
    # defined later in this module (default n_results=10) shadows it at import
    # time. Consider removing one of the two definitions.
    return chatbot.ask(query, n_results, temperature)
|
|
| |
# Module-level singleton instance used by the convenience wrappers below.
# Raises ValueError at import time if Mistral API credentials are missing.
chatbot = AcademicChatbot()
|
|
| |
def ask_question(query: str, n_results: int = 10, temperature: float = 0.7) -> Dict[str, Any]:
    """Forward a question to the module-level chatbot and return its result dict."""
    result = chatbot.ask(query, n_results, temperature)
    return result
|
|
| |
def clear_conversation():
    """Clear the conversation history."""
    # NOTE(review): a later definition of clear_conversation() in this module
    # (a no-op stub) shadows this working one at import time — the stub should
    # be removed or made to delegate like this one does.
    chatbot.clear_history()
|
|
| |
def add_document(text: str, metadata: Dict[str, Any], doc_id: Optional[str] = None) -> str:
    """Store a document in the shared collection via the module-level chatbot."""
    return chatbot.add_document(text, metadata, doc_id=doc_id)
|
|
| |
def split_long_response(response: str, max_chunk_size: int = 3500) -> List[str]:
    """
    Split a long response into manageable chunks while preserving whole sentences.

    Args:
        response (str): The full response text
        max_chunk_size (int): Maximum size of each chunk in characters

    Returns:
        List[str]: List of response chunks. Short responses are returned as a
        single unannotated chunk; split responses carry "(Continued ...)" /
        "(End of response ...)" part markers.
    """
    # Short responses are returned unchanged (no part markers added).
    if len(response) <= max_chunk_size:
        return [response]


    def split_sentences(text):
        # Split on whitespace that follows sentence-ending punctuation.
        import re
        return re.split(r'(?<=[.!?])\s+', text)

    chunks = []
    current_chunk = []
    current_chunk_length = 0

    sentences = split_sentences(response)

    for sentence in sentences:
        # Flush the accumulated chunk when adding this sentence would overflow.
        # Only flush a NON-EMPTY chunk: a single sentence longer than
        # max_chunk_size must not produce a spurious empty leading chunk
        # (bug fix — the previous version flushed unconditionally).
        if current_chunk and current_chunk_length + len(sentence) > max_chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_chunk_length = 0


        current_chunk.append(sentence)
        current_chunk_length += len(sentence) + 1  # +1 for the joining space


    if current_chunk:
        chunks.append(' '.join(current_chunk))


    # Annotate each chunk so readers can follow the sequence.
    for i in range(len(chunks)):
        if i < len(chunks) - 1:
            chunks[i] += f"\n\n(Continued in next message - Part {i+1}/{len(chunks)})"
        else:
            chunks[i] += f"\n\n(End of response - Part {i+1}/{len(chunks)})"

    return chunks
|
|
def generate_response_with_mistral(prompt, temperature):
    """
    Generate response using Mistral 7B via Hugging Face API.

    Args:
        prompt: Fully formatted prompt for the model
        temperature: Sampling temperature for response generation

    Returns:
        Generated response as a string, or an error message string on failure.

    Raises:
        ValueError: If the Hugging Face API key is not configured.
    """
    if not HF_API_KEY:
        raise ValueError("Hugging Face API key not found. Please configure credentials.")

    try:
        client = InferenceClient(
            "mistralai/Mistral-7B-Instruct-v0.3",
            token=HF_API_KEY
        )


        # Stop once the model starts fabricating the next user turn.
        response = client.text_generation(
            prompt,
            max_new_tokens=4096,
            temperature=temperature,
            stop_sequences=["\n\nUser:"],
        )

        return response.strip()

    except Exception as e:
        error_msg = f"Error generating response with Mistral: {e}"
        logger.error(error_msg)
        return error_msg
    # Bug fix: removed an unreachable duplicated try/except block that followed
    # this point — both the success path and the except handler above already
    # return, so that code could never execute.
|
|
def clear_conversation():
    """
    Clear the conversation history of the module-level chatbot.

    Bug fix: this later definition shadows an earlier working
    clear_conversation() in this module, so leaving it as a `pass` stub made
    the documented public API a silent no-op. It now delegates to the
    chatbot's own history reset.
    """
    chatbot.clear_history()
|
|
| |
def get_full_response_chunks(result):
    """
    Return every chunk of a potentially long response.

    Args:
        result (Dict): Result dictionary produced by ask_question

    Returns:
        List[str]: The stored chunk list when present; otherwise a single-item
        list holding the 'response' value (or an empty string).
    """
    if 'full_response_chunks' in result:
        return result['full_response_chunks']
    return [result.get('response', '')]
|
|
def initialize_chatbot():
    """
    Produce the chatbot's opening greeting for a new session.

    Returns:
        Dict[str, str]: The welcome text under "response" and a source label
        under "sources".
    """
    greeting = """Welcome to the American University Academic Advisor Chatbot!

I'm here to help you with information about:
- Academic programs
- Course details
- Program requirements
- Academic policies

What would you like to know about American University's academic offerings?

Some example questions you can ask:
- Tell me about the Data Science program
- What are the requirements for a Data Science major?
- What courses are required for a Statistics minor?
- Can you help me understand the AU Core curriculum?

Feel free to ask, and I'll do my best to provide comprehensive and helpful information!"""

    return {
        "response": greeting,
        "sources": "AU Academic Advisor Chatbot - Initial Welcome Message",
    }
|
|
def get_chatbot_info():
    """
    Describe the chatbot's data sources, technology stack, capabilities and limits.

    Returns:
        Dict[str, str]: The description text under "response" and a source
        label under "sources".
    """
    overview = """π€ AU Academic Advisor Chatbot Information

Data Sources:
- American University's official website
- Course catalog
- Program description pages
- Academic department information

Technologies Used:
- Retrieval-Augmented Generation (RAG)
- Mistral 7B Language Model
- ChromaDB Vector Database
- Sentence Transformers for Embedding

Capabilities:
- Retrieve detailed information about academic programs
- Explain course requirements
- Provide insights into academic policies
- Offer guidance on course selection

Limitations:
- Information is based on available web sources
- Might not reflect the most recent updates
- Recommended to verify critical information with official AU sources

Developed as a student research project to assist with academic advising.
"""

    return {"response": overview, "sources": "AU Academic Advisor Chatbot - System Information"}
|
|
def interactive_chat():
    """
    Run an interactive chat session in the command line.
    Updated to handle multi-part responses.

    Loops reading user input until 'quit'/'exit'/'q'; 'info' prints the
    chatbot description. Each answer is printed along with a numbered
    source list when metadata is available.
    """
    # NOTE(review): "π€" throughout this function looks like a mojibake'd 🤖
    # emoji from a bad encoding round-trip — confirm and re-encode the file.
    print("π€ AU Academic Advisor Chatbot - Interactive Mode")
    print("Type 'quit', 'exit', or 'q' to end the conversation.")
    print("Type 'info' to get information about the chatbot.\n")


    init_response = initialize_chatbot()
    print("π€ ", init_response["response"])
    print("\n--- How can I help you today? ---\n")


    while True:
        try:

            user_query = input("You: ").strip()


            if user_query.lower() in ['quit', 'exit', 'q']:
                print("\nπ€ Thank you for using the AU Academic Advisor Chatbot. Goodbye!")
                break


            if user_query.lower() == 'info':
                info_response = get_chatbot_info()
                print("π€ ", info_response["response"])
                continue


            # Empty input falls through and simply re-prompts.
            if user_query:
                print("\nπ€ Thinking...\n")
                response = ask_question(user_query)


                # Prefer the pre-assembled full response when a caller supplied one.
                if "full_response" in response:
                    print("π€ ", response["full_response"])
                else:
                    print("π€ ", response["response"])


                if "metadata" in response and response["metadata"]:
                    print("\n--- Sources ---")
                    for i, meta in enumerate(response["metadata"]):
                        source = meta.get('url', 'Unknown Source')
                        title = meta.get('title', 'Untitled')
                        print(f"{i+1}. {title} - {source}")

                print("\n")


        except KeyboardInterrupt:
            # Ctrl-C does not exit; the user must type 'quit'.
            print("\n\nπ€ Chat interrupted. Type 'quit' to exit.")
        except Exception as e:
            print(f"\nπ€ An error occurred: {e}")
|
|
| |
# Script entry point: run the command-line chat loop; print a full traceback
# on unexpected failure instead of dying silently.
if __name__ == "__main__":
    try:
        interactive_chat()
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        import traceback
        traceback.print_exc()
|
|
| |
# Public API of this module. Added the previously omitted public helpers
# add_document and get_full_response_chunks so `from chatbot import *`
# exposes the full documented surface (backward compatible — only additions).
__all__ = [
    'ask_question',
    'initialize_chatbot',
    'get_chatbot_info',
    'clear_conversation',
    'split_long_response',
    'interactive_chat',
    'add_document',
    'get_full_response_chunks'
]
|
|