SarahXia0405 committed on
Commit
6e941de
·
verified ·
1 Parent(s): 682585f

Update api/rag_engine.py

Browse files
Files changed (1) hide show
  1. api/rag_engine.py +35 -23
api/rag_engine.py CHANGED
@@ -8,7 +8,9 @@ Chunk format (MVP):
8
  {
9
  "text": str,
10
  "source_file": str,
11
- "section": str
 
 
12
  }
13
  """
14
 
@@ -20,14 +22,11 @@ from pypdf import PdfReader
20
  from docx import Document
21
  from pptx import Presentation
22
 
23
- # IMPORTANT: now under api/
24
- from api.syllabus_utils import parse_pptx_slides # optional reuse
25
- from api.config import DEFAULT_COURSE_TOPICS
26
 
27
 
28
- # ----------------------------
29
- # Helpers
30
- # ----------------------------
31
  def _clean_text(s: str) -> str:
32
  s = (s or "").replace("\r", "\n")
33
  s = re.sub(r"\n{3,}", "\n\n", s)
@@ -36,9 +35,9 @@ def _clean_text(s: str) -> str:
36
 
37
  def _split_into_chunks(text: str, max_chars: int = 1400) -> List[str]:
38
  """
39
- Simple deterministic chunker:
40
  - split by blank lines
41
- - then pack into <= max_chars
42
  """
43
  text = _clean_text(text)
44
  if not text:
@@ -69,14 +68,18 @@ def _file_label(path: str) -> str:
69
  return os.path.basename(path) if path else "uploaded_file"
70
 
71
 
 
 
 
 
 
 
 
 
72
  # ----------------------------
73
  # Parsers
74
  # ----------------------------
75
  def _parse_pdf_to_text(path: str) -> List[Tuple[str, str]]:
76
- """
77
- Returns list of (section_label, text)
78
- section_label uses page numbers.
79
- """
80
  reader = PdfReader(path)
81
  out: List[Tuple[str, str]] = []
82
  for i, page in enumerate(reader.pages):
@@ -125,7 +128,6 @@ def build_rag_chunks_from_file(path: str, doc_type: str) -> List[Dict]:
125
  ext = os.path.splitext(path)[1].lower()
126
  source_file = _file_label(path)
127
 
128
- # Parse into (section, text blocks)
129
  sections: List[Tuple[str, str]] = []
130
  try:
131
  if ext == ".pdf":
@@ -138,7 +140,6 @@ def build_rag_chunks_from_file(path: str, doc_type: str) -> List[Dict]:
138
  with open(path, "r", encoding="utf-8", errors="ignore") as f:
139
  sections = [("text", _clean_text(f.read()))]
140
  else:
141
- # Unsupported file type: return empty (safe)
142
  print(f"[rag_engine] unsupported file type: {ext}")
143
  return []
144
  except Exception as e:
@@ -147,14 +148,16 @@ def build_rag_chunks_from_file(path: str, doc_type: str) -> List[Dict]:
147
 
148
  chunks: List[Dict] = []
149
  for section, text in sections:
150
- # Split section text into smaller chunks
151
  for j, piece in enumerate(_split_into_chunks(text), start=1):
 
 
152
  chunks.append(
153
  {
154
  "text": piece,
155
  "source_file": source_file,
156
  "section": f"{section}#{j}",
157
  "doc_type": doc_type,
 
158
  }
159
  )
160
 
@@ -162,33 +165,42 @@ def build_rag_chunks_from_file(path: str, doc_type: str) -> List[Dict]:
162
 
163
 
164
  def retrieve_relevant_chunks(
165
- query: str, chunks: List[Dict], k: int = 4, max_context_chars: int = 2800
 
 
 
166
  ) -> Tuple[str, List[Dict]]:
167
  """
168
- Deterministic lightweight retrieval (no embeddings):
169
- - score by token overlap (very fast)
170
  - return top-k chunks concatenated as context
171
  """
172
  query = _clean_text(query)
173
  if not query or not chunks:
174
  return "", []
175
 
176
- q_tokens = set(re.findall(r"[a-zA-Z0-9]+", query.lower()))
177
  if not q_tokens:
178
  return "", []
179
 
180
  scored: List[Tuple[int, Dict]] = []
181
  for c in chunks:
182
- text = (c.get("text") or "")
183
- t_tokens = set(re.findall(r"[a-zA-Z0-9]+", text.lower()))
 
 
 
 
184
  score = len(q_tokens.intersection(t_tokens))
185
  if score > 0:
186
  scored.append((score, c))
187
 
 
 
 
188
  scored.sort(key=lambda x: x[0], reverse=True)
189
  top = [c for _, c in scored[:k]]
190
 
191
- # Build context text
192
  buf_parts: List[str] = []
193
  used: List[Dict] = []
194
  total = 0
 
8
  {
9
  "text": str,
10
  "source_file": str,
11
+ "section": str,
12
+ "doc_type": str,
13
+ "_tokens": frozenset[str] # ✅ precomputed for fast retrieval (in-memory)
14
  }
15
  """
16
 
 
22
  from docx import Document
23
  from pptx import Presentation
24
 
25
+ # precompiled regex for speed
26
+ _WORD_RE = re.compile(r"[a-zA-Z0-9]+")
27
+ _WS_RE = re.compile(r"\s+")
28
 
29
 
 
 
 
30
  def _clean_text(s: str) -> str:
31
  s = (s or "").replace("\r", "\n")
32
  s = re.sub(r"\n{3,}", "\n\n", s)
 
35
 
36
  def _split_into_chunks(text: str, max_chars: int = 1400) -> List[str]:
37
  """
38
+ Deterministic chunker:
39
  - split by blank lines
40
+ - pack into <= max_chars
41
  """
42
  text = _clean_text(text)
43
  if not text:
 
68
  return os.path.basename(path) if path else "uploaded_file"
69
 
70
 
71
def _tokenize(s: str) -> frozenset:
    """Return the set of lowercase alphanumeric tokens found in ``s``.

    The input is lowercased and every maximal run matched by the module-level
    ``_WORD_RE`` pattern becomes one token. ``None`` or an empty string yields
    an empty frozenset, because ``findall`` returns an empty list for them.
    """
    # No separate whitespace-collapsing pass is needed: whitespace never
    # matches _WORD_RE, so findall skips over it and the tokens are the same.
    return frozenset(_WORD_RE.findall((s or "").lower()))
77
+
78
+
79
  # ----------------------------
80
  # Parsers
81
  # ----------------------------
82
  def _parse_pdf_to_text(path: str) -> List[Tuple[str, str]]:
 
 
 
 
83
  reader = PdfReader(path)
84
  out: List[Tuple[str, str]] = []
85
  for i, page in enumerate(reader.pages):
 
128
  ext = os.path.splitext(path)[1].lower()
129
  source_file = _file_label(path)
130
 
 
131
  sections: List[Tuple[str, str]] = []
132
  try:
133
  if ext == ".pdf":
 
140
  with open(path, "r", encoding="utf-8", errors="ignore") as f:
141
  sections = [("text", _clean_text(f.read()))]
142
  else:
 
143
  print(f"[rag_engine] unsupported file type: {ext}")
144
  return []
145
  except Exception as e:
 
148
 
149
  chunks: List[Dict] = []
150
  for section, text in sections:
 
151
  for j, piece in enumerate(_split_into_chunks(text), start=1):
152
+ # ✅ precompute tokens once
153
+ toks = _tokenize(piece)
154
  chunks.append(
155
  {
156
  "text": piece,
157
  "source_file": source_file,
158
  "section": f"{section}#{j}",
159
  "doc_type": doc_type,
160
+ "_tokens": toks,
161
  }
162
  )
163
 
 
165
 
166
 
167
  def retrieve_relevant_chunks(
168
+ query: str,
169
+ chunks: List[Dict],
170
+ k: int = 3, # ✅ smaller default = faster + less prompt
171
+ max_context_chars: int = 2200, # ✅ smaller default = faster
172
  ) -> Tuple[str, List[Dict]]:
173
  """
174
+ Fast deterministic retrieval:
175
+ - score by token overlap using precomputed chunk tokens
176
  - return top-k chunks concatenated as context
177
  """
178
  query = _clean_text(query)
179
  if not query or not chunks:
180
  return "", []
181
 
182
+ q_tokens = _tokenize(query)
183
  if not q_tokens:
184
  return "", []
185
 
186
  scored: List[Tuple[int, Dict]] = []
187
  for c in chunks:
188
+ t_tokens = c.get("_tokens")
189
+ if not t_tokens:
190
+ # fallback if older chunks exist without tokens
191
+ t_tokens = _tokenize(c.get("text") or "")
192
+ c["_tokens"] = t_tokens
193
+
194
  score = len(q_tokens.intersection(t_tokens))
195
  if score > 0:
196
  scored.append((score, c))
197
 
198
+ if not scored:
199
+ return "", []
200
+
201
  scored.sort(key=lambda x: x[0], reverse=True)
202
  top = [c for _, c in scored[:k]]
203
 
 
204
  buf_parts: List[str] = []
205
  used: List[Dict] = []
206
  total = 0