Spaces:
Sleeping
Sleeping
| # api/course_loader.py | |
| import os | |
| from typing import Dict, List | |
| from api.rag_engine import build_rag_chunks_from_file | |
# File extensions (lowercase, leading dot) that the loader hands to the RAG
# chunker; files with any other extension are skipped.
SUPPORTED_EXTS = {
    ".pdf",
    ".docx",
    ".pptx",
    ".md",
    ".txt",
}
def _pick_existing_dir(base: str, candidates: List[str]) -> str:
    """Return the first ``base/<candidate>`` path that is an existing directory.

    Candidates are probed in the order given. An empty string is returned
    when none of them exists as a directory under ``base``.
    """
    joined_paths = (os.path.join(base, candidate) for candidate in candidates)
    return next((path for path in joined_paths if os.path.isdir(path)), "")
def load_course_chunks(course_dir: str, course_id: str) -> List[Dict]:
    """
    Load course materials into RAG chunks.

    Three sub-folders of ``course_dir`` are scanned recursively, in this order:
    - resources/ -> doc_type="Resources"
    - syllabus/  -> doc_type="Syllabus"
    - textbooks/ -> doc_type="Textbook"

    Each folder is looked up under its lowercase name first, then under its
    capitalized name; a folder that does not exist is skipped. Files whose
    extension is not in SUPPORTED_EXTS are ignored. Every chunk returned by
    build_rag_chunks_from_file() gets a "course_id" key set to ``course_id``
    before it is collected.

    Loading is best-effort: a file that raises while being chunked is reported
    on stdout and skipped, and the remaining files are still processed.

    Args:
        course_dir: Directory holding one course's material folders.
        course_id: Identifier stored under the "course_id" key of each chunk.

    Returns:
        The chunks of all files that loaded successfully. Sub-directories and
        file names are visited in sorted order, so the result order is
        reproducible for a given directory tree.
    """
    all_chunks: List[Dict] = []

    # allow both lower/upper folder names (so your current Textbooks still works)
    resources_dir = _pick_existing_dir(course_dir, ["resources", "Resources"])
    syllabus_dir = _pick_existing_dir(course_dir, ["syllabus", "Syllabus"])
    textbooks_dir = _pick_existing_dir(course_dir, ["textbooks", "Textbooks"])

    def walk_and_add(folder: str, doc_type: str) -> None:
        """Chunk every supported file under ``folder`` into ``all_chunks``."""
        if not folder:
            # _pick_existing_dir returns "" when no candidate folder exists.
            return
        for root, dirs, files in os.walk(folder):
            # os.walk yields names in arbitrary file-system order. Sorting
            # `dirs` in place fixes the order in which sub-directories are
            # descended into; sorting `files` fixes the order within each one.
            dirs.sort()
            for fn in sorted(files):
                ext = os.path.splitext(fn)[1].lower()
                if ext not in SUPPORTED_EXTS:
                    continue
                path = os.path.join(root, fn)
                rel_path = os.path.relpath(path, course_dir)
                try:
                    chunks = build_rag_chunks_from_file(path, doc_type) or []
                    for c in chunks:
                        c["course_id"] = course_id
                except Exception as e:
                    # Deliberate best-effort: one unreadable file must not
                    # abort loading of the rest of the course.
                    print(f"[course_loader] FAIL {course_id} {doc_type} {rel_path} err={repr(e)}")
                else:
                    all_chunks.extend(chunks)
                    print(f"[course_loader] OK {course_id} {doc_type} {rel_path} chunks={len(chunks)}")

    walk_and_add(resources_dir, "Resources")
    walk_and_add(syllabus_dir, "Syllabus")
    walk_and_add(textbooks_dir, "Textbook")
    return all_chunks
def load_all_courses(courses_root_dir: str) -> Dict[str, List[Dict]]:
    """
    Load the chunks of every course found directly under ``courses_root_dir``.

    courses_root_dir = .../data/courses
    The name of each sub-directory is used as its course_id
    (e.g. course_ist345); entries that are not directories are ignored.
    Courses are processed in sorted name order. An empty dict is returned
    when ``courses_root_dir`` is not an existing directory.
    """
    loaded: Dict[str, List[Dict]] = {}
    if os.path.isdir(courses_root_dir):
        entries = os.listdir(courses_root_dir)
        entries.sort()
        for entry in entries:
            entry_path = os.path.join(courses_root_dir, entry)
            if os.path.isdir(entry_path):
                course_chunks = load_course_chunks(entry_path, entry)
                loaded[entry] = course_chunks
                print(f"[course_loader] {entry}: {len(course_chunks)} chunks loaded")
    return loaded