Commit ·
25f29e5
1
Parent(s): 812b65b
rag value updates
Browse files
Co-authored-by: Copilot <copilot@github.com>
- .env.example +3 -0
- .gitignore +2 -1
- app/config.py +3 -0
- app/main.py +14 -3
.env.example
CHANGED
|
@@ -11,3 +11,6 @@ RAW_TEXT_DIR=data/raw_text
|
|
| 11 |
PDF_TIMEOUT_SEC=60
|
| 12 |
PDF_MAX_RETRIES=3
|
| 13 |
PDF_RETRY_BACKOFF_SEC=1.5
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
PDF_TIMEOUT_SEC=60
|
| 12 |
PDF_MAX_RETRIES=3
|
| 13 |
PDF_RETRY_BACKOFF_SEC=1.5
|
| 14 |
+
RAG_CHUNK_SIZE=1400
|
| 15 |
+
RAG_CHUNK_OVERLAP=250
|
| 16 |
+
RAG_SYLLABUS_TOP_K=8
|
.gitignore
CHANGED
|
@@ -45,4 +45,5 @@ build/
|
|
| 45 |
prompt.md
|
| 46 |
stud_info.md
|
| 47 |
test_db_to_api.py
|
| 48 |
-
test.ipynb
|
|
|
|
|
|
| 45 |
prompt.md
|
| 46 |
stud_info.md
|
| 47 |
test_db_to_api.py
|
| 48 |
+
test.ipynb
|
| 49 |
+
ingest_syllabus_to_neon.py
|
app/config.py
CHANGED
|
@@ -32,6 +32,9 @@ class Settings:
|
|
| 32 |
pdf_timeout_sec: int = int(os.getenv("PDF_TIMEOUT_SEC", "60"))
|
| 33 |
pdf_max_retries: int = int(os.getenv("PDF_MAX_RETRIES", "3"))
|
| 34 |
pdf_retry_backoff_sec: float = float(os.getenv("PDF_RETRY_BACKOFF_SEC", "1.5"))
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
|
| 37 |
settings = Settings()
|
|
|
|
| 32 |
pdf_timeout_sec: int = int(os.getenv("PDF_TIMEOUT_SEC", "60"))
|
| 33 |
pdf_max_retries: int = int(os.getenv("PDF_MAX_RETRIES", "3"))
|
| 34 |
pdf_retry_backoff_sec: float = float(os.getenv("PDF_RETRY_BACKOFF_SEC", "1.5"))
|
| 35 |
+
rag_chunk_size: int = int(os.getenv("RAG_CHUNK_SIZE", "1400"))
|
| 36 |
+
rag_chunk_overlap: int = int(os.getenv("RAG_CHUNK_OVERLAP", "250"))
|
| 37 |
+
rag_syllabus_top_k: int = int(os.getenv("RAG_SYLLABUS_TOP_K", "8"))
|
| 38 |
|
| 39 |
|
| 40 |
settings = Settings()
|
app/main.py
CHANGED
|
@@ -84,7 +84,11 @@ def process_syllabus(courses: List[CourseInput]) -> SyllabusProcessResponse:
|
|
| 84 |
if not syllabus_text:
|
| 85 |
raise RuntimeError("No text extracted from PDF.")
|
| 86 |
|
| 87 |
-
chunks = chunk_text(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
if not chunks:
|
| 89 |
raise RuntimeError("Unable to create text chunks from syllabus content.")
|
| 90 |
|
|
@@ -175,8 +179,12 @@ Rules:
|
|
| 175 |
neon_connect_timeout_sec=settings.neon_connect_timeout_sec,
|
| 176 |
)
|
| 177 |
query_embedding = gemini.embed_text(req.query, task_type="retrieval_query")
|
| 178 |
-
hits = vector_store.search(
|
| 179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
chunks_passed = len(hits)
|
| 181 |
syllabus_context = "\n\n---\n\n".join(
|
| 182 |
[f"[{h.get('course_code', '')}] {h.get('chunk', '')}" for h in hits]
|
|
@@ -229,6 +237,9 @@ Rules:
|
|
| 229 |
f"Language: {req.lang_code}\n"
|
| 230 |
f"Student endpoint intent param: {intent if intent in {'attendance', 'result'} else 'not_called'}\n"
|
| 231 |
f"Student endpoint semester param: {req.semester if intent in {'attendance', 'result'} else 'not_called'}\n"
|
|
|
|
|
|
|
|
|
|
| 232 |
f"Syllabus chunks passed: {chunks_passed}\n"
|
| 233 |
"LLM Called: yes\n"
|
| 234 |
"\n--- Prompt Passed To LLM ---\n"
|
|
|
|
| 84 |
if not syllabus_text:
|
| 85 |
raise RuntimeError("No text extracted from PDF.")
|
| 86 |
|
| 87 |
+
chunks = chunk_text(
|
| 88 |
+
syllabus_text,
|
| 89 |
+
chunk_size=settings.rag_chunk_size,
|
| 90 |
+
overlap=settings.rag_chunk_overlap,
|
| 91 |
+
)
|
| 92 |
if not chunks:
|
| 93 |
raise RuntimeError("Unable to create text chunks from syllabus content.")
|
| 94 |
|
|
|
|
| 179 |
neon_connect_timeout_sec=settings.neon_connect_timeout_sec,
|
| 180 |
)
|
| 181 |
query_embedding = gemini.embed_text(req.query, task_type="retrieval_query")
|
| 182 |
+
hits = vector_store.search(
|
| 183 |
+
req.semester,
|
| 184 |
+
query_embedding,
|
| 185 |
+
top_k=settings.rag_syllabus_top_k,
|
| 186 |
+
)
|
| 187 |
+
hits = hits[: settings.rag_syllabus_top_k]
|
| 188 |
chunks_passed = len(hits)
|
| 189 |
syllabus_context = "\n\n---\n\n".join(
|
| 190 |
[f"[{h.get('course_code', '')}] {h.get('chunk', '')}" for h in hits]
|
|
|
|
| 237 |
f"Language: {req.lang_code}\n"
|
| 238 |
f"Student endpoint intent param: {intent if intent in {'attendance', 'result'} else 'not_called'}\n"
|
| 239 |
f"Student endpoint semester param: {req.semester if intent in {'attendance', 'result'} else 'not_called'}\n"
|
| 240 |
+
f"Syllabus top_k configured: {settings.rag_syllabus_top_k}\n"
|
| 241 |
+
f"RAG chunk size configured: {settings.rag_chunk_size}\n"
|
| 242 |
+
f"RAG chunk overlap configured: {settings.rag_chunk_overlap}\n"
|
| 243 |
f"Syllabus chunks passed: {chunks_passed}\n"
|
| 244 |
"LLM Called: yes\n"
|
| 245 |
"\n--- Prompt Passed To LLM ---\n"
|