# api/course_loader.py
"""Load per-course material folders from disk into RAG chunks."""
import os
from typing import Dict, List

from api.rag_engine import build_rag_chunks_from_file

# File extensions (lower-cased) that are handed to the RAG chunk builder.
SUPPORTED_EXTS = {".pdf", ".docx", ".pptx", ".md", ".txt"}


def _pick_existing_dir(base: str, candidates: List[str]) -> str:
    """Return the first ``base/<candidate>`` that is an existing directory.

    Candidates are tried in the order given. An empty string is returned
    when none of them exists, which callers treat as "nothing to load".
    """
    for name in candidates:
        p = os.path.join(base, name)
        if os.path.isdir(p):
            return p
    return ""


def load_course_chunks(course_dir: str, course_id: str) -> List[Dict]:
    """
    Load course materials into RAG chunks.

    We keep it simple:
    - resources/  -> doc_type="Resources"
    - syllabus/   -> doc_type="Syllabus"
    - textbooks/  -> doc_type="Textbook"

    Each folder is walked recursively and every file whose extension is in
    SUPPORTED_EXTS is passed to ``build_rag_chunks_from_file``. Every
    returned chunk is tagged with ``chunk["course_id"] = course_id``.

    Files and sub-directories are visited in sorted order so the resulting
    chunk list is reproducible across runs and filesystems.

    Loading is best-effort: a file that raises is reported on stdout and
    skipped, and the remaining files are still processed.

    Returns the chunks of all three folders concatenated in the order
    Resources, Syllabus, Textbook.
    """
    all_chunks: List[Dict] = []

    # allow both lower/upper folder names (so your current Textbooks still works)
    sources = [
        (_pick_existing_dir(course_dir, ["resources", "Resources"]), "Resources"),
        (_pick_existing_dir(course_dir, ["syllabus", "Syllabus"]), "Syllabus"),
        (_pick_existing_dir(course_dir, ["textbooks", "Textbooks"]), "Textbook"),
    ]

    def walk_and_add(folder: str, doc_type: str) -> None:
        """Chunk every supported file below ``folder`` into ``all_chunks``."""
        if not folder:
            return
        for root, dirs, files in os.walk(folder):
            # Sorting in place steers os.walk's (top-down) descent order;
            # raw listing order is arbitrary and filesystem-dependent.
            dirs.sort()
            for fn in sorted(files):
                ext = os.path.splitext(fn)[1].lower()
                if ext not in SUPPORTED_EXTS:
                    continue
                path = os.path.join(root, fn)
                rel = os.path.relpath(path, course_dir)
                try:
                    chunks = build_rag_chunks_from_file(path, doc_type) or []
                    # Tag before extending so a failure part-way through
                    # does not leave half of a file's chunks in the result.
                    for c in chunks:
                        c["course_id"] = course_id
                    all_chunks.extend(chunks)
                    print(f"[course_loader] OK {course_id} {doc_type} {rel} chunks={len(chunks)}")
                except Exception as e:
                    # Deliberately broad: one unreadable file must not abort
                    # loading of the rest of the course.
                    print(f"[course_loader] FAIL {course_id} {doc_type} {rel} err={repr(e)}")

    for folder, doc_type in sources:
        walk_and_add(folder, doc_type)
    return all_chunks


def load_all_courses(courses_root_dir: str) -> Dict[str, List[Dict]]:
    """
    courses_root_dir = .../data/courses
    Each subdir is a course_id (e.g. course_ist345)

    Returns a dict mapping each course_id to the chunks produced by
    ``load_course_chunks`` for that sub-directory. Non-directory entries
    are ignored, and an empty dict is returned when ``courses_root_dir``
    is not a directory. Courses are processed in sorted name order.
    """
    out: Dict[str, List[Dict]] = {}
    if not os.path.isdir(courses_root_dir):
        return out

    for course_id in sorted(os.listdir(courses_root_dir)):
        course_dir = os.path.join(courses_root_dir, course_id)
        if not os.path.isdir(course_dir):
            continue
        chunks = load_course_chunks(course_dir, course_id)
        out[course_id] = chunks
        print(f"[course_loader] {course_id}: {len(chunks)} chunks loaded")
    return out