quantumbit Copilot committed on
Commit
25f29e5
·
1 Parent(s): 812b65b

rag value updates

Browse files

Co-authored-by: Copilot <copilot@github.com>

Files changed (4) hide show
  1. .env.example +3 -0
  2. .gitignore +2 -1
  3. app/config.py +3 -0
  4. app/main.py +14 -3
.env.example CHANGED
@@ -11,3 +11,6 @@ RAW_TEXT_DIR=data/raw_text
11
  PDF_TIMEOUT_SEC=60
12
  PDF_MAX_RETRIES=3
13
  PDF_RETRY_BACKOFF_SEC=1.5
 
 
 
 
11
  PDF_TIMEOUT_SEC=60
12
  PDF_MAX_RETRIES=3
13
  PDF_RETRY_BACKOFF_SEC=1.5
14
+ RAG_CHUNK_SIZE=1400
15
+ RAG_CHUNK_OVERLAP=250
16
+ RAG_SYLLABUS_TOP_K=8
.gitignore CHANGED
@@ -45,4 +45,5 @@ build/
45
  prompt.md
46
  stud_info.md
47
  test_db_to_api.py
48
- test.ipynb
 
 
45
  prompt.md
46
  stud_info.md
47
  test_db_to_api.py
48
+ test.ipynb
49
+ ingest_syllabus_to_neon.py
app/config.py CHANGED
@@ -32,6 +32,9 @@ class Settings:
32
  pdf_timeout_sec: int = int(os.getenv("PDF_TIMEOUT_SEC", "60"))
33
  pdf_max_retries: int = int(os.getenv("PDF_MAX_RETRIES", "3"))
34
  pdf_retry_backoff_sec: float = float(os.getenv("PDF_RETRY_BACKOFF_SEC", "1.5"))
 
 
 
35
 
36
 
37
  settings = Settings()
 
32
  pdf_timeout_sec: int = int(os.getenv("PDF_TIMEOUT_SEC", "60"))
33
  pdf_max_retries: int = int(os.getenv("PDF_MAX_RETRIES", "3"))
34
  pdf_retry_backoff_sec: float = float(os.getenv("PDF_RETRY_BACKOFF_SEC", "1.5"))
35
+ rag_chunk_size: int = int(os.getenv("RAG_CHUNK_SIZE", "1400"))
36
+ rag_chunk_overlap: int = int(os.getenv("RAG_CHUNK_OVERLAP", "250"))
37
+ rag_syllabus_top_k: int = int(os.getenv("RAG_SYLLABUS_TOP_K", "8"))
38
 
39
 
40
  settings = Settings()
app/main.py CHANGED
@@ -84,7 +84,11 @@ def process_syllabus(courses: List[CourseInput]) -> SyllabusProcessResponse:
84
  if not syllabus_text:
85
  raise RuntimeError("No text extracted from PDF.")
86
 
87
- chunks = chunk_text(syllabus_text)
 
 
 
 
88
  if not chunks:
89
  raise RuntimeError("Unable to create text chunks from syllabus content.")
90
 
@@ -175,8 +179,12 @@ Rules:
175
  neon_connect_timeout_sec=settings.neon_connect_timeout_sec,
176
  )
177
  query_embedding = gemini.embed_text(req.query, task_type="retrieval_query")
178
- hits = vector_store.search(req.semester, query_embedding, top_k=5)
179
- hits = hits[:5]
 
 
 
 
180
  chunks_passed = len(hits)
181
  syllabus_context = "\n\n---\n\n".join(
182
  [f"[{h.get('course_code', '')}] {h.get('chunk', '')}" for h in hits]
@@ -229,6 +237,9 @@ Rules:
229
  f"Language: {req.lang_code}\n"
230
  f"Student endpoint intent param: {intent if intent in {'attendance', 'result'} else 'not_called'}\n"
231
  f"Student endpoint semester param: {req.semester if intent in {'attendance', 'result'} else 'not_called'}\n"
 
 
 
232
  f"Syllabus chunks passed: {chunks_passed}\n"
233
  "LLM Called: yes\n"
234
  "\n--- Prompt Passed To LLM ---\n"
 
84
  if not syllabus_text:
85
  raise RuntimeError("No text extracted from PDF.")
86
 
87
+ chunks = chunk_text(
88
+ syllabus_text,
89
+ chunk_size=settings.rag_chunk_size,
90
+ overlap=settings.rag_chunk_overlap,
91
+ )
92
  if not chunks:
93
  raise RuntimeError("Unable to create text chunks from syllabus content.")
94
 
 
179
  neon_connect_timeout_sec=settings.neon_connect_timeout_sec,
180
  )
181
  query_embedding = gemini.embed_text(req.query, task_type="retrieval_query")
182
+ hits = vector_store.search(
183
+ req.semester,
184
+ query_embedding,
185
+ top_k=settings.rag_syllabus_top_k,
186
+ )
187
+ hits = hits[: settings.rag_syllabus_top_k]
188
  chunks_passed = len(hits)
189
  syllabus_context = "\n\n---\n\n".join(
190
  [f"[{h.get('course_code', '')}] {h.get('chunk', '')}" for h in hits]
 
237
  f"Language: {req.lang_code}\n"
238
  f"Student endpoint intent param: {intent if intent in {'attendance', 'result'} else 'not_called'}\n"
239
  f"Student endpoint semester param: {req.semester if intent in {'attendance', 'result'} else 'not_called'}\n"
240
+ f"Syllabus top_k configured: {settings.rag_syllabus_top_k}\n"
241
+ f"RAG chunk size configured: {settings.rag_chunk_size}\n"
242
+ f"RAG chunk overlap configured: {settings.rag_chunk_overlap}\n"
243
  f"Syllabus chunks passed: {chunks_passed}\n"
244
  "LLM Called: yes\n"
245
  "\n--- Prompt Passed To LLM ---\n"