SarahXia0405 committed on
Commit
cfaf6e4
·
verified ·
1 Parent(s): 1f374e6

Create course_loader.py

Browse files
Files changed (1) hide show
  1. api/course_loader.py +69 -0
api/course_loader.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # api/course_loader.py
2
+ import os
3
+ from typing import Dict, List
4
+ from api.rag_engine import build_rag_chunks_from_file
5
+
6
+ SUPPORTED_EXTS = {".pdf", ".docx", ".pptx", ".md", ".txt"}  # lowercase, dot-prefixed extensions the loader accepts; files with any other extension are skipped
7
+
8
def _pick_existing_dir(base: str, candidates: List[str]) -> str:
    """
    Return the first ``base/<candidate>`` path that is an existing directory.

    Candidates are tried in the order given; the first match wins.
    Returns an empty string when none of them exists as a directory.
    """
    joined_paths = (os.path.join(base, candidate) for candidate in candidates)
    # Lazy generator: stops probing the filesystem at the first hit.
    return next((path for path in joined_paths if os.path.isdir(path)), "")
14
+
15
def load_course_chunks(course_dir: str, course_id: str) -> List[Dict]:
    """
    Load one course's materials into a flat list of RAG chunks.

    Three sub-folders of ``course_dir`` are scanned recursively, in this
    order, each mapped to a fixed ``doc_type``:
      - resources/ (or Resources/) -> doc_type="Resources"
      - syllabus/  (or Syllabus/)  -> doc_type="Syllabus"
      - textbooks/ (or Textbooks/) -> doc_type="Textbook"

    A folder that does not exist is skipped. Files whose extension
    (compared case-insensitively) is not in SUPPORTED_EXTS are skipped.
    Every chunk produced by ``build_rag_chunks_from_file`` gets a
    ``"course_id"`` key set to ``course_id``.

    Sub-directories and files are visited in sorted name order so the
    resulting chunk order is reproducible across filesystems.

    Args:
        course_dir: Path of the course folder holding the sub-folders above.
        course_id: Identifier written into each chunk's ``"course_id"`` key.

    Returns:
        All chunks from the three folders, concatenated in scan order.
    """
    all_chunks: List[Dict] = []

    # Accept both lower-case and capitalised folder names; lower-case wins
    # when both exist.
    resources_dir = _pick_existing_dir(course_dir, ["resources", "Resources"])
    syllabus_dir = _pick_existing_dir(course_dir, ["syllabus", "Syllabus"])
    textbooks_dir = _pick_existing_dir(course_dir, ["textbooks", "Textbooks"])

    def walk_and_add(folder: str, doc_type: str) -> None:
        """Append the chunks of every supported file under ``folder``."""
        if not folder:
            # _pick_existing_dir returns "" when the folder is absent.
            return
        for root, dirs, files in os.walk(folder):
            # os.walk yields entries in arbitrary order; sorting ``dirs``
            # in place makes the top-down descent deterministic as well.
            dirs.sort()
            for fn in sorted(files):
                ext = os.path.splitext(fn)[1].lower()
                if ext not in SUPPORTED_EXTS:
                    continue
                path = os.path.join(root, fn)
                # Treat a falsy result (e.g. None) as "no chunks".
                chunks = build_rag_chunks_from_file(path, doc_type) or []
                for c in chunks:
                    # Tag every chunk with the owning course.
                    c["course_id"] = course_id
                all_chunks.extend(chunks)

    walk_and_add(resources_dir, "Resources")
    walk_and_add(syllabus_dir, "Syllabus")
    walk_and_add(textbooks_dir, "Textbook")

    return all_chunks
50
+
51
+
52
def load_all_courses(courses_root_dir: str) -> Dict[str, List[Dict]]:
    """
    Load the chunks of every course found under ``courses_root_dir``.

    ``courses_root_dir`` is expected to look like ``.../data/courses``; each
    immediate sub-directory name is used as a course_id (e.g. course_ist345).
    Non-directory entries are ignored. Courses are processed in sorted name
    order and a one-line summary is printed for each.

    Returns a dict mapping course_id to its chunk list; the dict is empty
    when ``courses_root_dir`` is not an existing directory.
    """
    loaded: Dict[str, List[Dict]] = {}

    if os.path.isdir(courses_root_dir):
        entry_names = os.listdir(courses_root_dir)
        entry_names.sort()
        for entry in entry_names:
            candidate = os.path.join(courses_root_dir, entry)
            if os.path.isdir(candidate):
                course_chunks = load_course_chunks(candidate, entry)
                loaded[entry] = course_chunks
                print(f"[course_loader] {entry}: {len(course_chunks)} chunks loaded")

    return loaded