Commit ·
d0220ae
1
Parent(s): 1a1dc00
Fixed the preprocessing endpoint and updated the chat endpoint to return specific, semester-wise results
Browse files- .gitignore +3 -1
- app/main.py +115 -50
- app/models.py +1 -3
- app/services/gemini_service.py +17 -3
- app/services/intent_service.py +91 -0
- app/services/pdf_service.py +52 -1
- app/services/student_service.py +30 -2
.gitignore
CHANGED
|
@@ -33,6 +33,7 @@ data/vector_index/
|
|
| 33 |
*.faiss
|
| 34 |
*.meta.json
|
| 35 |
*.log
|
|
|
|
| 36 |
|
| 37 |
# Hugging Face and model cache
|
| 38 |
.cache/
|
|
@@ -43,4 +44,5 @@ dist/
|
|
| 43 |
build/
|
| 44 |
prompt.md
|
| 45 |
stud_info.md
|
| 46 |
-
test_db_to_api.py
|
|
|
|
|
|
| 33 |
*.faiss
|
| 34 |
*.meta.json
|
| 35 |
*.log
|
| 36 |
+
context.txt
|
| 37 |
|
| 38 |
# Hugging Face and model cache
|
| 39 |
.cache/
|
|
|
|
| 44 |
build/
|
| 45 |
prompt.md
|
| 46 |
stud_info.md
|
| 47 |
+
test_db_to_api.py
|
| 48 |
+
test.ipynb
|
app/main.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import os
|
|
|
|
| 2 |
from typing import List
|
| 3 |
|
| 4 |
from fastapi import FastAPI, HTTPException
|
|
@@ -13,13 +14,27 @@ from app.models import (
|
|
| 13 |
SyllabusProcessResponse,
|
| 14 |
)
|
| 15 |
from app.services.gemini_service import GeminiService
|
|
|
|
| 16 |
from app.services.pdf_service import chunk_text, fetch_pdf_text
|
| 17 |
-
from app.services.rag_service import build_student_documents, mmr_select
|
| 18 |
from app.services.student_service import fetch_student_info
|
| 19 |
from app.vector_store import LocalVectorStore
|
| 20 |
|
| 21 |
|
| 22 |
app = FastAPI(title="GitConnect Chatbot Service", version="0.1.0")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
|
| 25 |
@app.get("/health")
|
|
@@ -103,64 +118,114 @@ def chat(req: ChatRequest) -> ChatResponse:
|
|
| 103 |
except ValueError as exc:
|
| 104 |
raise HTTPException(status_code=500, detail=str(exc)) from exc
|
| 105 |
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
)
|
| 113 |
-
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
try:
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
doc["embedding"] = gemini.embed_text(
|
| 126 |
-
doc["chunk"],
|
| 127 |
-
task_type="retrieval_document",
|
| 128 |
-
)
|
| 129 |
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
query_embedding=query_embedding,
|
| 133 |
-
candidates=combined_candidates,
|
| 134 |
-
top_k=8,
|
| 135 |
-
lambda_param=0.7,
|
| 136 |
-
)
|
| 137 |
-
except Exception as exc:
|
| 138 |
-
raise HTTPException(status_code=500, detail=f"RAG retrieval failed: {exc}") from exc
|
| 139 |
-
|
| 140 |
-
rag_chunks = [f"[{h.get('source', 'unknown')}] {h['chunk']}" for h in hits]
|
| 141 |
-
retrieved_course_codes = sorted(
|
| 142 |
-
list(
|
| 143 |
-
{
|
| 144 |
-
h.get("course_code", "")
|
| 145 |
-
for h in hits
|
| 146 |
-
if str(h.get("course_code", "")).strip()
|
| 147 |
-
}
|
| 148 |
-
)
|
| 149 |
-
)
|
| 150 |
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
except Exception as exc:
|
| 160 |
raise HTTPException(status_code=500, detail=f"LLM response failed: {exc}") from exc
|
| 161 |
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
)
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
from pathlib import Path
|
| 3 |
from typing import List
|
| 4 |
|
| 5 |
from fastapi import FastAPI, HTTPException
|
|
|
|
| 14 |
SyllabusProcessResponse,
|
| 15 |
)
|
| 16 |
from app.services.gemini_service import GeminiService
|
| 17 |
+
from app.services.intent_service import classify_intent
|
| 18 |
from app.services.pdf_service import chunk_text, fetch_pdf_text
|
|
|
|
| 19 |
from app.services.student_service import fetch_student_info
|
| 20 |
from app.vector_store import LocalVectorStore
|
| 21 |
|
| 22 |
|
| 23 |
app = FastAPI(title="GitConnect Chatbot Service", version="0.1.0")
|
| 24 |
+
# Debug file that receives the context of the most recent chat request. It lives
# one directory above the directory containing this module.
_CONTEXT_LOG_PATH = Path(__file__).resolve().parents[1] / "context.txt"


def _write_context_log(content: str) -> None:
    """Overwrite the debug context file with ``content`` on a best-effort basis.

    The file is a debugging aid only, so a failure to write it (missing
    directory, read-only filesystem, permission error, ...) is reported on
    stdout and otherwise ignored instead of propagating to the caller.

    Args:
        content: Full text to store; any previous file content is replaced.
    """
    try:
        _CONTEXT_LOG_PATH.write_text(content, encoding="utf-8")
    except OSError as exc:
        # This runs in the request path after the reply has been built; a
        # logging problem must not turn that reply into an error response.
        print(f"Context log write skipped due to error: {exc}")
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
@app.on_event("startup")
def warmup_embedding_model() -> None:
    """Preload the configured embedding model when the application starts.

    Any error raised while warming up is printed and swallowed so that the
    service still starts.
    """
    try:
        model_name = settings.embedding_model_name
        GeminiService.preload_embedding_model(model_name)
    except Exception as warmup_error:
        # A failed warmup is not fatal: startup continues regardless.
        print(f"Embedding warmup skipped due to error: {warmup_error}")
|
| 38 |
|
| 39 |
|
| 40 |
@app.get("/health")
|
|
|
|
| 118 |
except ValueError as exc:
|
| 119 |
raise HTTPException(status_code=500, detail=str(exc)) from exc
|
| 120 |
|
| 121 |
+
history_text = "\n".join(
|
| 122 |
+
[f"{msg.role}: {msg.content}" for msg in req.history]
|
| 123 |
+
)
|
| 124 |
+
intent, in_scope = classify_intent(req.query)
|
| 125 |
|
| 126 |
+
if not in_scope:
|
| 127 |
+
reply = (
|
| 128 |
+
"I can help only with education-related queries such as syllabus, attendance, "
|
| 129 |
+
"results, study planning, and course guidance."
|
| 130 |
)
|
| 131 |
+
_write_context_log(
|
| 132 |
+
"Intent: out_of_scope\n"
|
| 133 |
+
f"Query: {req.query}\n"
|
| 134 |
+
"LLM Called: no\n"
|
| 135 |
+
f"Response: {reply}\n"
|
| 136 |
+
)
|
| 137 |
+
return ChatResponse(reply_markdown=reply)
|
| 138 |
+
|
| 139 |
+
prompt = ""
|
| 140 |
+
chunks_passed = 0
|
| 141 |
|
| 142 |
try:
|
| 143 |
+
if intent in {"attendance", "result"}:
|
| 144 |
+
student_info = fetch_student_info(
|
| 145 |
+
settings.student_performance_url_template,
|
| 146 |
+
req.student_id,
|
| 147 |
+
semester=req.semester,
|
| 148 |
+
intent=intent,
|
| 149 |
+
)
|
| 150 |
+
prompt = f"""
|
| 151 |
+
You are a college assistant. Respond in language code: {req.lang_code}.
|
| 152 |
+
Return markdown only.
|
| 153 |
+
|
| 154 |
+
Intent: {intent}
|
| 155 |
+
User query: {req.query}
|
| 156 |
+
|
| 157 |
+
Recent chat history:
|
| 158 |
+
{history_text}
|
| 159 |
+
|
| 160 |
+
Student performance context (authoritative):
|
| 161 |
+
{student_info}
|
| 162 |
+
|
| 163 |
+
Rules:
|
| 164 |
+
- Answer only from the provided student performance context.
|
| 165 |
+
- If asked for something unavailable in the context, clearly say it is unavailable.
|
| 166 |
+
- Be concise and practical.
|
| 167 |
+
"""
|
| 168 |
+
elif intent == "syllabus":
|
| 169 |
+
vector_store = LocalVectorStore(settings.vector_data_dir)
|
| 170 |
+
query_embedding = gemini.embed_text(req.query, task_type="retrieval_query")
|
| 171 |
+
hits = vector_store.search(req.semester, query_embedding, top_k=5)
|
| 172 |
+
hits = hits[:5]
|
| 173 |
+
chunks_passed = len(hits)
|
| 174 |
+
syllabus_context = "\n\n---\n\n".join(
|
| 175 |
+
[f"[{h.get('course_code', '')}] {h.get('chunk', '')}" for h in hits]
|
| 176 |
+
)
|
| 177 |
|
| 178 |
+
prompt = f"""
|
| 179 |
+
You are a college assistant. Respond in language code: {req.lang_code}.
|
| 180 |
+
Return markdown only.
|
| 181 |
|
| 182 |
+
Intent: syllabus
|
| 183 |
+
User query: {req.query}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
+
Recent chat history:
|
| 186 |
+
{history_text}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
|
| 188 |
+
Syllabus context (authoritative):
|
| 189 |
+
{syllabus_context}
|
| 190 |
+
|
| 191 |
+
Rules:
|
| 192 |
+
- Answer only from the provided syllabus context.
|
| 193 |
+
- For unit/module queries, list units clearly with headings/bullets.
|
| 194 |
+
- If exact detail is unavailable, state what is missing.
|
| 195 |
+
"""
|
| 196 |
+
else:
|
| 197 |
+
prompt = f"""
|
| 198 |
+
You are a helpful college assistant. Respond in language code: {req.lang_code}.
|
| 199 |
+
Return markdown only.
|
| 200 |
+
|
| 201 |
+
Intent: other (education-related)
|
| 202 |
+
User query: {req.query}
|
| 203 |
+
|
| 204 |
+
Recent chat history:
|
| 205 |
+
{history_text}
|
| 206 |
+
|
| 207 |
+
Rules:
|
| 208 |
+
- Keep the response casual, helpful, and education-focused.
|
| 209 |
+
- Do not answer non-education requests.
|
| 210 |
+
- If needed, ask a brief clarifying question.
|
| 211 |
+
"""
|
| 212 |
+
|
| 213 |
+
reply = gemini.generate_markdown(prompt)
|
| 214 |
except Exception as exc:
|
| 215 |
raise HTTPException(status_code=500, detail=f"LLM response failed: {exc}") from exc
|
| 216 |
|
| 217 |
+
_write_context_log(
|
| 218 |
+
f"Intent: {intent}\n"
|
| 219 |
+
f"Query: {req.query}\n"
|
| 220 |
+
f"Student ID: {req.student_id}\n"
|
| 221 |
+
f"Semester: {req.semester}\n"
|
| 222 |
+
f"Language: {req.lang_code}\n"
|
| 223 |
+
f"Student endpoint intent param: {intent if intent in {'attendance', 'result'} else 'not_called'}\n"
|
| 224 |
+
f"Student endpoint semester param: {req.semester if intent in {'attendance', 'result'} else 'not_called'}\n"
|
| 225 |
+
f"Syllabus chunks passed: {chunks_passed}\n"
|
| 226 |
+
"LLM Called: yes\n"
|
| 227 |
+
"\n--- Prompt Passed To LLM ---\n"
|
| 228 |
+
f"{prompt}\n"
|
| 229 |
)
|
| 230 |
+
|
| 231 |
+
return ChatResponse(reply_markdown=reply)
|
app/models.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from typing import Dict, List, Literal
|
| 2 |
from pydantic import BaseModel, Field, HttpUrl
|
| 3 |
|
| 4 |
|
|
@@ -46,5 +46,3 @@ class ChatRequest(BaseModel):
|
|
| 46 |
|
| 47 |
class ChatResponse(BaseModel):
|
| 48 |
reply_markdown: str
|
| 49 |
-
retrieved_course_codes: List[str]
|
| 50 |
-
student_info: Optional[dict] = None
|
|
|
|
| 1 |
+
from typing import Dict, List, Literal
|
| 2 |
from pydantic import BaseModel, Field, HttpUrl
|
| 3 |
|
| 4 |
|
|
|
|
| 46 |
|
| 47 |
class ChatResponse(BaseModel):
|
| 48 |
reply_markdown: str
|
|
|
|
|
|
app/services/gemini_service.py
CHANGED
|
@@ -45,6 +45,9 @@ Syllabus context:
|
|
| 45 |
raw = self._model.generate_content(prompt).text
|
| 46 |
return self._safe_parse_summary_json(raw)
|
| 47 |
|
|
|
|
|
|
|
|
|
|
| 48 |
def chat_with_context(
|
| 49 |
self,
|
| 50 |
query: str,
|
|
@@ -65,15 +68,20 @@ Respond in language code: {lang_code}
|
|
| 65 |
Supported codes: en, hn, mr, kn.
|
| 66 |
Return the final answer in markdown.
|
| 67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
Student data (attendance, result etc.):
|
| 69 |
{json.dumps(student_info, ensure_ascii=False)}
|
| 70 |
|
| 71 |
Recent chat history:
|
| 72 |
{history_text}
|
| 73 |
|
| 74 |
-
Relevant syllabus context:
|
| 75 |
-
{syllabus_context}
|
| 76 |
-
|
| 77 |
User query:
|
| 78 |
{query}
|
| 79 |
|
|
@@ -107,3 +115,9 @@ Answer guidelines:
|
|
| 107 |
embedding_model_name
|
| 108 |
)
|
| 109 |
return cls._embedding_model_cache[embedding_model_name]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
raw = self._model.generate_content(prompt).text
|
| 46 |
return self._safe_parse_summary_json(raw)
|
| 47 |
|
| 48 |
+
def generate_markdown(self, prompt: str) -> str:
    """Send ``prompt`` to the underlying model and return the response text.

    Args:
        prompt: Complete prompt passed unchanged to ``generate_content``.

    Returns:
        The ``text`` attribute of the model response.
    """
    response = self._model.generate_content(prompt)
    return response.text
|
| 50 |
+
|
| 51 |
def chat_with_context(
|
| 52 |
self,
|
| 53 |
query: str,
|
|
|
|
| 68 |
Supported codes: en, hn, mr, kn.
|
| 69 |
Return the final answer in markdown.
|
| 70 |
|
| 71 |
+
Grounding rules:
|
| 72 |
+
- Prioritize facts from "Relevant syllabus context" for syllabus/unit/module questions.
|
| 73 |
+
- If user asks for units/modules/topics of a course and context includes them, list them clearly.
|
| 74 |
+
- Do not say data is missing unless the relevant syllabus context truly does not contain it.
|
| 75 |
+
|
| 76 |
+
Relevant syllabus context:
|
| 77 |
+
{syllabus_context}
|
| 78 |
+
|
| 79 |
Student data (attendance, result etc.):
|
| 80 |
{json.dumps(student_info, ensure_ascii=False)}
|
| 81 |
|
| 82 |
Recent chat history:
|
| 83 |
{history_text}
|
| 84 |
|
|
|
|
|
|
|
|
|
|
| 85 |
User query:
|
| 86 |
{query}
|
| 87 |
|
|
|
|
| 115 |
embedding_model_name
|
| 116 |
)
|
| 117 |
return cls._embedding_model_cache[embedding_model_name]
|
| 118 |
+
|
| 119 |
+
@classmethod
def preload_embedding_model(cls, embedding_model_name: str) -> None:
    """Fetch the named embedding model and run one throwaway encode on it.

    Args:
        embedding_model_name: Name handed to ``_get_embedding_model``.
    """
    # Encode a dummy string once so the first real query does not pay the
    # model initialization cost.
    cls._get_embedding_model(embedding_model_name).encode(
        "embedding warmup",
        normalize_embeddings=False,
    )
|
app/services/intent_service.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import Literal, Tuple
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
Intent = Literal["attendance", "result", "syllabus", "other"]

_GREETING_WORDS = frozenset(
    {
        "hi",
        "hello",
        "hey",
        "good morning",
        "good afternoon",
        "good evening",
    }
)

_ATTENDANCE_KEYWORDS = frozenset(
    {
        "attendance",
        "attend",
        "absent",
        "present",
        "classes",
        "attendance percentage",
        "overall attendance",
    }
)

_RESULT_KEYWORDS = frozenset(
    {
        "result",
        "results",
        "marks",
        "score",
        "sgpa",
        "cgpa",
        "gpa",
        "grade",
        "ia",
        "exam",
        "performance",
        "passed",
        "fail",
    }
)

_SYLLABUS_KEYWORDS = frozenset(
    {
        "syllabus",
        "unit",
        "units",
        "module",
        "modules",
        "topic",
        "topics",
        "course content",
        "chapters",
        "what is covered",
    }
)

# Education-related words that keep a query in scope without selecting a
# specific intent.
_GENERAL_EDUCATION_KEYWORDS = frozenset(
    {
        "semester",
        "course",
        "subject",
        "study",
        "assignment",
        "project",
        "college",
        "class",
        "exam prep",
    }
)


def _compile_phrases(phrases, *, whole_word: bool):
    """Compile ``phrases`` into one alternation anchored at a word start.

    With ``whole_word=True`` the phrase must also end on a word boundary;
    otherwise the phrase may be the beginning of a longer word, which keeps
    inflected forms such as "attended" or "exams" matching.
    """
    alternatives = "|".join(re.escape(phrase) for phrase in sorted(phrases))
    tail = r"\b" if whole_word else ""
    return re.compile(rf"\b(?:{alternatives}){tail}")


_COURSE_CODE_RE = re.compile(r"\b\d{2}[a-z]{2,}[a-z0-9]*\d+[a-z]?\b", flags=re.I)
_GREETING_RE = _compile_phrases(_GREETING_WORDS, whole_word=True)
_GENERAL_EDUCATION_RE = _compile_phrases(_GENERAL_EDUCATION_KEYWORDS, whole_word=False)

# Order matters: the first intent whose keywords match wins.
_INTENT_PATTERNS = (
    ("attendance", _compile_phrases(_ATTENDANCE_KEYWORDS, whole_word=False)),
    ("result", _compile_phrases(_RESULT_KEYWORDS, whole_word=False)),
    ("syllabus", _compile_phrases(_SYLLABUS_KEYWORDS, whole_word=False)),
)


def classify_intent(query: str) -> Tuple[Intent, bool]:
    """Classify a chat query by keyword and decide whether it is in scope.

    Matching is done on the stripped, lower-cased query. Keywords must start
    at a word boundary and greetings must be whole words, so short keywords
    no longer fire inside unrelated words ("ia" in "india", "hi" in "this",
    "unit" in "community"). Keywords may still be the start of a longer word,
    so e.g. "exam" continues to match "example".

    Args:
        query: Raw user query text.

    Returns:
        A ``(intent, in_scope)`` tuple. ``intent`` is the first of
        attendance, result, syllabus whose keywords match, else ``"other"``.
        ``in_scope`` is true when any intent keyword, general education
        keyword, course-code-like token, or greeting is present. An empty or
        whitespace-only query yields ``("other", False)``.
    """
    q = query.strip().lower()
    if not q:
        return "other", False

    intent: Intent = "other"
    for candidate, pattern in _INTENT_PATTERNS:
        if pattern.search(q):
            intent = candidate
            break

    # Every intent keyword is also an education keyword, so a detected intent
    # already implies the query is in scope.
    in_scope = (
        intent != "other"
        or bool(_COURSE_CODE_RE.search(q))
        or bool(_GENERAL_EDUCATION_RE.search(q))
        or bool(_GREETING_RE.search(q))
    )
    return intent, in_scope
|
app/services/pdf_service.py
CHANGED
|
@@ -1,8 +1,11 @@
|
|
| 1 |
import io
|
| 2 |
import time
|
|
|
|
| 3 |
|
| 4 |
import requests
|
|
|
|
| 5 |
from pypdf import PdfReader
|
|
|
|
| 6 |
|
| 7 |
|
| 8 |
def fetch_pdf_text(
|
|
@@ -12,10 +15,38 @@ def fetch_pdf_text(
|
|
| 12 |
backoff_sec: float = 1.5,
|
| 13 |
) -> str:
|
| 14 |
last_exc: Exception | None = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
for attempt in range(max_retries):
|
| 17 |
try:
|
| 18 |
-
response =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
response.raise_for_status()
|
| 20 |
pdf_stream = io.BytesIO(response.content)
|
| 21 |
reader = PdfReader(pdf_stream)
|
|
@@ -29,6 +60,26 @@ def fetch_pdf_text(
|
|
| 29 |
return "\n\n".join(extracted).strip()
|
| 30 |
except Exception as exc:
|
| 31 |
last_exc = exc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
if attempt < max_retries - 1:
|
| 33 |
sleep_sec = backoff_sec * (2 ** attempt)
|
| 34 |
time.sleep(sleep_sec)
|
|
|
|
| 1 |
import io
|
| 2 |
import time
|
| 3 |
+
import urllib.request
|
| 4 |
|
| 5 |
import requests
|
| 6 |
+
from requests.adapters import HTTPAdapter
|
| 7 |
from pypdf import PdfReader
|
| 8 |
+
from urllib3.util import Retry
|
| 9 |
|
| 10 |
|
| 11 |
def fetch_pdf_text(
|
|
|
|
| 15 |
backoff_sec: float = 1.5,
|
| 16 |
) -> str:
|
| 17 |
last_exc: Exception | None = None
|
| 18 |
+
connect_timeout = min(max(int(timeout / 3), 10), 30)
|
| 19 |
+
read_timeout = max(timeout, 60)
|
| 20 |
+
|
| 21 |
+
session = requests.Session()
|
| 22 |
+
retry_cfg = Retry(
|
| 23 |
+
total=max_retries,
|
| 24 |
+
connect=max_retries,
|
| 25 |
+
read=max_retries,
|
| 26 |
+
backoff_factor=backoff_sec,
|
| 27 |
+
status_forcelist=(429, 500, 502, 503, 504),
|
| 28 |
+
allowed_methods=frozenset(["GET", "HEAD"]),
|
| 29 |
+
)
|
| 30 |
+
adapter = HTTPAdapter(max_retries=retry_cfg)
|
| 31 |
+
session.mount("http://", adapter)
|
| 32 |
+
session.mount("https://", adapter)
|
| 33 |
+
|
| 34 |
+
headers = {
|
| 35 |
+
"User-Agent": (
|
| 36 |
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
| 37 |
+
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
| 38 |
+
"Chrome/124.0.0.0 Safari/537.36"
|
| 39 |
+
),
|
| 40 |
+
"Accept": "application/pdf,*/*;q=0.8",
|
| 41 |
+
}
|
| 42 |
|
| 43 |
for attempt in range(max_retries):
|
| 44 |
try:
|
| 45 |
+
response = session.get(
|
| 46 |
+
pdf_url,
|
| 47 |
+
headers=headers,
|
| 48 |
+
timeout=(connect_timeout, read_timeout),
|
| 49 |
+
)
|
| 50 |
response.raise_for_status()
|
| 51 |
pdf_stream = io.BytesIO(response.content)
|
| 52 |
reader = PdfReader(pdf_stream)
|
|
|
|
| 60 |
return "\n\n".join(extracted).strip()
|
| 61 |
except Exception as exc:
|
| 62 |
last_exc = exc
|
| 63 |
+
|
| 64 |
+
# Fallback path: some hosts behave better with urllib defaults.
|
| 65 |
+
try:
|
| 66 |
+
req = urllib.request.Request(
|
| 67 |
+
pdf_url,
|
| 68 |
+
headers={"User-Agent": headers["User-Agent"]},
|
| 69 |
+
)
|
| 70 |
+
with urllib.request.urlopen(req, timeout=read_timeout) as resp:
|
| 71 |
+
content = resp.read()
|
| 72 |
+
pdf_stream = io.BytesIO(content)
|
| 73 |
+
reader = PdfReader(pdf_stream)
|
| 74 |
+
extracted = []
|
| 75 |
+
for page in reader.pages:
|
| 76 |
+
text = page.extract_text() or ""
|
| 77 |
+
if text.strip():
|
| 78 |
+
extracted.append(text)
|
| 79 |
+
return "\n\n".join(extracted).strip()
|
| 80 |
+
except Exception as fallback_exc:
|
| 81 |
+
last_exc = fallback_exc
|
| 82 |
+
|
| 83 |
if attempt < max_retries - 1:
|
| 84 |
sleep_sec = backoff_sec * (2 ** attempt)
|
| 85 |
time.sleep(sleep_sec)
|
app/services/student_service.py
CHANGED
|
@@ -4,6 +4,8 @@ import requests
|
|
| 4 |
def fetch_student_info(
|
| 5 |
student_performance_url_template: str,
|
| 6 |
student_id: int,
|
|
|
|
|
|
|
| 7 |
timeout: int = 20,
|
| 8 |
) -> dict:
|
| 9 |
if "{student_id}" not in student_performance_url_template:
|
|
@@ -12,11 +14,37 @@ def fetch_student_info(
|
|
| 12 |
)
|
| 13 |
|
| 14 |
student_url = student_performance_url_template.format(student_id=student_id)
|
| 15 |
-
get_resp = requests.get(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
if not get_resp.ok:
|
| 17 |
raise RuntimeError(
|
| 18 |
f"Failed to fetch student info from {student_url}; "
|
| 19 |
f"status {get_resp.status_code}."
|
| 20 |
)
|
| 21 |
|
| 22 |
-
return get_resp.json()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
def fetch_student_info(
|
| 5 |
student_performance_url_template: str,
|
| 6 |
student_id: int,
|
| 7 |
+
semester: int,
|
| 8 |
+
intent: str,
|
| 9 |
timeout: int = 20,
|
| 10 |
) -> dict:
|
| 11 |
if "{student_id}" not in student_performance_url_template:
|
|
|
|
| 14 |
)
|
| 15 |
|
| 16 |
student_url = student_performance_url_template.format(student_id=student_id)
|
| 17 |
+
get_resp = requests.get(
|
| 18 |
+
student_url,
|
| 19 |
+
params={"semester": semester, "intent": intent},
|
| 20 |
+
timeout=timeout,
|
| 21 |
+
)
|
| 22 |
if not get_resp.ok:
|
| 23 |
raise RuntimeError(
|
| 24 |
f"Failed to fetch student info from {student_url}; "
|
| 25 |
f"status {get_resp.status_code}."
|
| 26 |
)
|
| 27 |
|
| 28 |
+
return _filter_student_info_by_intent(get_resp.json(), semester=semester, intent=intent)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _semester_matches(entry: object, semester: int) -> bool:
    """Return True when ``entry`` is a dict whose "semester" equals ``semester``.

    Entries that are not dicts, or whose "semester" value cannot be converted
    to ``int``, never match.
    """
    if not isinstance(entry, dict):
        return False
    try:
        entry_semester = int(entry.get("semester", -1))
    except (TypeError, ValueError):
        return False
    return entry_semester == int(semester)


def _filter_student_info_by_intent(payload: dict, semester: int, intent: str) -> dict:
    """Return a copy of ``payload`` reduced to the data relevant to ``intent``.

    Fallback local filtering in case the backend ignores the intent/semester
    query parameters. ``payload`` and its nested containers are not modified.

    Args:
        payload: Decoded JSON object returned by the student endpoint.
        semester: Semester to keep when filtering per-semester results.
        intent: ``"attendance"`` drops the "results" key; ``"result"`` drops
            the "attendance" key and, when ``results["semesters"]`` is a
            list, keeps only the entries for ``semester``. Any other value
            returns an unfiltered shallow copy.

    Returns:
        A new top-level dict with the filtering applied.
    """
    data = dict(payload)

    if intent == "attendance":
        data.pop("results", None)
        return data

    if intent == "result":
        data.pop("attendance", None)
        results = data.get("results")
        if isinstance(results, dict) and isinstance(results.get("semesters"), list):
            # Rebuild the nested dict instead of assigning into it: ``data``
            # is only a shallow copy, so in-place assignment would also
            # change the caller's payload.
            data["results"] = {
                **results,
                "semesters": [
                    entry
                    for entry in results["semesters"]
                    if _semester_matches(entry, semester)
                ],
            }
        return data

    return data
|