quantumbit Copilot committed on
Commit
d0220ae
·
1 Parent(s): 1a1dc00

Fixed the preprocessing endpoint and updated the chat endpoint to return semester-specific results

Browse files
.gitignore CHANGED
@@ -33,6 +33,7 @@ data/vector_index/
33
  *.faiss
34
  *.meta.json
35
  *.log
 
36
 
37
  # Hugging Face and model cache
38
  .cache/
@@ -43,4 +44,5 @@ dist/
43
  build/
44
  prompt.md
45
  stud_info.md
46
- test_db_to_api.py
 
 
33
  *.faiss
34
  *.meta.json
35
  *.log
36
+ context.txt
37
 
38
  # Hugging Face and model cache
39
  .cache/
 
44
  build/
45
  prompt.md
46
  stud_info.md
47
+ test_db_to_api.py
48
+ test.ipynb
app/main.py CHANGED
@@ -1,4 +1,5 @@
1
  import os
 
2
  from typing import List
3
 
4
  from fastapi import FastAPI, HTTPException
@@ -13,13 +14,27 @@ from app.models import (
13
  SyllabusProcessResponse,
14
  )
15
  from app.services.gemini_service import GeminiService
 
16
  from app.services.pdf_service import chunk_text, fetch_pdf_text
17
- from app.services.rag_service import build_student_documents, mmr_select
18
  from app.services.student_service import fetch_student_info
19
  from app.vector_store import LocalVectorStore
20
 
21
 
22
  app = FastAPI(title="GitConnect Chatbot Service", version="0.1.0")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
 
25
  @app.get("/health")
@@ -103,64 +118,114 @@ def chat(req: ChatRequest) -> ChatResponse:
103
  except ValueError as exc:
104
  raise HTTPException(status_code=500, detail=str(exc)) from exc
105
 
106
- vector_store = LocalVectorStore(settings.vector_data_dir)
 
 
 
107
 
108
- try:
109
- student_info = fetch_student_info(
110
- settings.student_performance_url_template,
111
- req.student_id,
112
  )
113
- except Exception as exc:
114
- raise HTTPException(status_code=502, detail=f"Student info fetch failed: {exc}") from exc
 
 
 
 
 
 
 
 
115
 
116
  try:
117
- query_embedding = gemini.embed_text(req.query, task_type="retrieval_query")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
- syllabus_hits = vector_store.search(req.semester, query_embedding, top_k=20)
120
- for hit in syllabus_hits:
121
- hit["source"] = "syllabus"
122
 
123
- student_docs = build_student_documents(student_info)
124
- for doc in student_docs:
125
- doc["embedding"] = gemini.embed_text(
126
- doc["chunk"],
127
- task_type="retrieval_document",
128
- )
129
 
130
- combined_candidates = syllabus_hits + student_docs
131
- hits = mmr_select(
132
- query_embedding=query_embedding,
133
- candidates=combined_candidates,
134
- top_k=8,
135
- lambda_param=0.7,
136
- )
137
- except Exception as exc:
138
- raise HTTPException(status_code=500, detail=f"RAG retrieval failed: {exc}") from exc
139
-
140
- rag_chunks = [f"[{h.get('source', 'unknown')}] {h['chunk']}" for h in hits]
141
- retrieved_course_codes = sorted(
142
- list(
143
- {
144
- h.get("course_code", "")
145
- for h in hits
146
- if str(h.get("course_code", "")).strip()
147
- }
148
- )
149
- )
150
 
151
- try:
152
- reply = gemini.chat_with_context(
153
- query=req.query,
154
- lang_code=req.lang_code,
155
- history=[msg.model_dump() for msg in req.history],
156
- student_info=student_info,
157
- rag_chunks=rag_chunks,
158
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  except Exception as exc:
160
  raise HTTPException(status_code=500, detail=f"LLM response failed: {exc}") from exc
161
 
162
- return ChatResponse(
163
- reply_markdown=reply,
164
- retrieved_course_codes=retrieved_course_codes,
165
- student_info=student_info,
 
 
 
 
 
 
 
 
166
  )
 
 
 
1
  import os
2
+ from pathlib import Path
3
  from typing import List
4
 
5
  from fastapi import FastAPI, HTTPException
 
14
  SyllabusProcessResponse,
15
  )
16
  from app.services.gemini_service import GeminiService
17
+ from app.services.intent_service import classify_intent
18
  from app.services.pdf_service import chunk_text, fetch_pdf_text
 
19
  from app.services.student_service import fetch_student_info
20
  from app.vector_store import LocalVectorStore
21
 
22
 
23
  app = FastAPI(title="GitConnect Chatbot Service", version="0.1.0")
24
+ _CONTEXT_LOG_PATH = Path(__file__).resolve().parents[1] / "context.txt"
25
+
26
+
27
def _write_context_log(content: str) -> None:
    """Overwrite the debug context file with ``content`` on a best-effort basis.

    The file at ``_CONTEXT_LOG_PATH`` is replaced on every call, so it only
    ever holds the trace of the most recent chat request.

    Args:
        content: Full text to store, written as UTF-8.

    Returns:
        None. Filesystem errors are reported on stdout and swallowed.
    """
    try:
        _CONTEXT_LOG_PATH.write_text(content, encoding="utf-8")
    except OSError as exc:
        # This is diagnostics only: a read-only or full filesystem must not
        # turn an otherwise successful chat reply into an HTTP 500.
        print(f"Context log write skipped due to error: {exc}")
29
+
30
+
31
@app.on_event("startup")
def warmup_embedding_model() -> None:
    """Preload the embedding model when the application starts.

    Any failure is reported on stdout and otherwise ignored so that the
    service still comes up when the warmup cannot complete.
    """
    try:
        model_name = settings.embedding_model_name
        GeminiService.preload_embedding_model(model_name)
    except Exception as warmup_error:
        # A failed warmup is not fatal; startup carries on without it.
        print(f"Embedding warmup skipped due to error: {warmup_error}")
38
 
39
 
40
  @app.get("/health")
 
118
  except ValueError as exc:
119
  raise HTTPException(status_code=500, detail=str(exc)) from exc
120
 
121
+ history_text = "\n".join(
122
+ [f"{msg.role}: {msg.content}" for msg in req.history]
123
+ )
124
+ intent, in_scope = classify_intent(req.query)
125
 
126
+ if not in_scope:
127
+ reply = (
128
+ "I can help only with education-related queries such as syllabus, attendance, "
129
+ "results, study planning, and course guidance."
130
  )
131
+ _write_context_log(
132
+ "Intent: out_of_scope\n"
133
+ f"Query: {req.query}\n"
134
+ "LLM Called: no\n"
135
+ f"Response: {reply}\n"
136
+ )
137
+ return ChatResponse(reply_markdown=reply)
138
+
139
+ prompt = ""
140
+ chunks_passed = 0
141
 
142
  try:
143
+ if intent in {"attendance", "result"}:
144
+ student_info = fetch_student_info(
145
+ settings.student_performance_url_template,
146
+ req.student_id,
147
+ semester=req.semester,
148
+ intent=intent,
149
+ )
150
+ prompt = f"""
151
+ You are a college assistant. Respond in language code: {req.lang_code}.
152
+ Return markdown only.
153
+
154
+ Intent: {intent}
155
+ User query: {req.query}
156
+
157
+ Recent chat history:
158
+ {history_text}
159
+
160
+ Student performance context (authoritative):
161
+ {student_info}
162
+
163
+ Rules:
164
+ - Answer only from the provided student performance context.
165
+ - If asked for something unavailable in the context, clearly say it is unavailable.
166
+ - Be concise and practical.
167
+ """
168
+ elif intent == "syllabus":
169
+ vector_store = LocalVectorStore(settings.vector_data_dir)
170
+ query_embedding = gemini.embed_text(req.query, task_type="retrieval_query")
171
+ hits = vector_store.search(req.semester, query_embedding, top_k=5)
172
+ hits = hits[:5]
173
+ chunks_passed = len(hits)
174
+ syllabus_context = "\n\n---\n\n".join(
175
+ [f"[{h.get('course_code', '')}] {h.get('chunk', '')}" for h in hits]
176
+ )
177
 
178
+ prompt = f"""
179
+ You are a college assistant. Respond in language code: {req.lang_code}.
180
+ Return markdown only.
181
 
182
+ Intent: syllabus
183
+ User query: {req.query}
 
 
 
 
184
 
185
+ Recent chat history:
186
+ {history_text}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
 
188
+ Syllabus context (authoritative):
189
+ {syllabus_context}
190
+
191
+ Rules:
192
+ - Answer only from the provided syllabus context.
193
+ - For unit/module queries, list units clearly with headings/bullets.
194
+ - If exact detail is unavailable, state what is missing.
195
+ """
196
+ else:
197
+ prompt = f"""
198
+ You are a helpful college assistant. Respond in language code: {req.lang_code}.
199
+ Return markdown only.
200
+
201
+ Intent: other (education-related)
202
+ User query: {req.query}
203
+
204
+ Recent chat history:
205
+ {history_text}
206
+
207
+ Rules:
208
+ - Keep the response casual, helpful, and education-focused.
209
+ - Do not answer non-education requests.
210
+ - If needed, ask a brief clarifying question.
211
+ """
212
+
213
+ reply = gemini.generate_markdown(prompt)
214
  except Exception as exc:
215
  raise HTTPException(status_code=500, detail=f"LLM response failed: {exc}") from exc
216
 
217
+ _write_context_log(
218
+ f"Intent: {intent}\n"
219
+ f"Query: {req.query}\n"
220
+ f"Student ID: {req.student_id}\n"
221
+ f"Semester: {req.semester}\n"
222
+ f"Language: {req.lang_code}\n"
223
+ f"Student endpoint intent param: {intent if intent in {'attendance', 'result'} else 'not_called'}\n"
224
+ f"Student endpoint semester param: {req.semester if intent in {'attendance', 'result'} else 'not_called'}\n"
225
+ f"Syllabus chunks passed: {chunks_passed}\n"
226
+ "LLM Called: yes\n"
227
+ "\n--- Prompt Passed To LLM ---\n"
228
+ f"{prompt}\n"
229
  )
230
+
231
+ return ChatResponse(reply_markdown=reply)
app/models.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import Dict, List, Literal, Optional
2
  from pydantic import BaseModel, Field, HttpUrl
3
 
4
 
@@ -46,5 +46,3 @@ class ChatRequest(BaseModel):
46
 
47
  class ChatResponse(BaseModel):
48
  reply_markdown: str
49
- retrieved_course_codes: List[str]
50
- student_info: Optional[dict] = None
 
1
+ from typing import Dict, List, Literal
2
  from pydantic import BaseModel, Field, HttpUrl
3
 
4
 
 
46
 
47
  class ChatResponse(BaseModel):
48
  reply_markdown: str
 
 
app/services/gemini_service.py CHANGED
@@ -45,6 +45,9 @@ Syllabus context:
45
  raw = self._model.generate_content(prompt).text
46
  return self._safe_parse_summary_json(raw)
47
 
 
 
 
48
  def chat_with_context(
49
  self,
50
  query: str,
@@ -65,15 +68,20 @@ Respond in language code: {lang_code}
65
  Supported codes: en, hn, mr, kn.
66
  Return the final answer in markdown.
67
 
 
 
 
 
 
 
 
 
68
  Student data (attendance, result etc.):
69
  {json.dumps(student_info, ensure_ascii=False)}
70
 
71
  Recent chat history:
72
  {history_text}
73
 
74
- Relevant syllabus context:
75
- {syllabus_context}
76
-
77
  User query:
78
  {query}
79
 
@@ -107,3 +115,9 @@ Answer guidelines:
107
  embedding_model_name
108
  )
109
  return cls._embedding_model_cache[embedding_model_name]
 
 
 
 
 
 
 
45
  raw = self._model.generate_content(prompt).text
46
  return self._safe_parse_summary_json(raw)
47
 
48
def generate_markdown(self, prompt: str) -> str:
    """Send ``prompt`` to the configured model and return the reply text.

    Args:
        prompt: Complete prompt passed unchanged to ``generate_content``.

    Returns:
        The ``text`` attribute of the model response.
    """
    response = self._model.generate_content(prompt)
    return response.text
50
+
51
  def chat_with_context(
52
  self,
53
  query: str,
 
68
  Supported codes: en, hn, mr, kn.
69
  Return the final answer in markdown.
70
 
71
+ Grounding rules:
72
+ - Prioritize facts from "Relevant syllabus context" for syllabus/unit/module questions.
73
+ - If user asks for units/modules/topics of a course and context includes them, list them clearly.
74
+ - Do not say data is missing unless the relevant syllabus context truly does not contain it.
75
+
76
+ Relevant syllabus context:
77
+ {syllabus_context}
78
+
79
  Student data (attendance, result etc.):
80
  {json.dumps(student_info, ensure_ascii=False)}
81
 
82
  Recent chat history:
83
  {history_text}
84
 
 
 
 
85
  User query:
86
  {query}
87
 
 
115
  embedding_model_name
116
  )
117
  return cls._embedding_model_cache[embedding_model_name]
118
+
119
@classmethod
def preload_embedding_model(cls, embedding_model_name: str) -> None:
    """Load the named embedding model and run one throwaway encode.

    The single encode call is there so the first real query does not pay
    the model initialization cost.

    Args:
        embedding_model_name: Name passed to ``_get_embedding_model``.
    """
    cls._get_embedding_model(embedding_model_name).encode(
        "embedding warmup",
        normalize_embeddings=False,
    )
app/services/intent_service.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Literal, Tuple
3
+
4
+
5
Intent = Literal["attendance", "result", "syllabus", "other"]

# Greetings are matched as whole words (a stretched last letter such as
# "hii" or "heyy" is accepted) so that "this" or "they" are not greetings.
_GREETING_RE = re.compile(
    r"\b(?:hi+|hello+|hey+|good\s+(?:morning|afternoon|evening))\b"
)

# Looks for tokens shaped like a course code, e.g. "21cs43".
_COURSE_CODE_RE = re.compile(r"\b\d{2}[a-z]{2,}[a-z0-9]*\d+[a-z]?\b", flags=re.I)

_ATTENDANCE_KEYWORDS = (
    "attendance",
    "attend",
    "attended",
    "attending",
    "absent",
    "present",
    "classes",
    "attendance percentage",
    "overall attendance",
)

_RESULT_KEYWORDS = (
    "result",
    "results",
    "marks",
    "score",
    "scored",
    "sgpa",
    "cgpa",
    "gpa",
    "grade",
    "ia",
    "exam",
    "performance",
    "passed",
    "fail",
    "failed",
    "failing",
)

_SYLLABUS_KEYWORDS = (
    "syllabus",
    "unit",
    "units",
    "module",
    "modules",
    "topic",
    "topics",
    "course content",
    "chapters",
    "what is covered",
)

_GENERAL_EDUCATION_KEYWORDS = (
    "semester",
    "course",
    "subject",
    "study",
    "studying",
    "assignment",
    "project",
    "college",
    "class",
    "exam prep",
)


def _compile_keyword_pattern(phrases) -> re.Pattern:
    """Build one regex that matches any of ``phrases`` as whole words."""
    alternatives = sorted(
        (r"\s+".join(re.escape(word) for word in phrase.split()) for phrase in phrases),
        key=len,
        reverse=True,
    )
    # An optional plural suffix and trailing digits keep "grades", "unit2"
    # and "ia1" matching, while the word boundaries reject substrings such
    # as "ia" inside "media" or "unit" inside "community".
    return re.compile(r"\b(?:" + "|".join(alternatives) + r")(?:e?s)?\d*\b")


_ATTENDANCE_RE = _compile_keyword_pattern(_ATTENDANCE_KEYWORDS)
_RESULT_RE = _compile_keyword_pattern(_RESULT_KEYWORDS)
_SYLLABUS_RE = _compile_keyword_pattern(_SYLLABUS_KEYWORDS)
_GENERAL_EDUCATION_RE = _compile_keyword_pattern(_GENERAL_EDUCATION_KEYWORDS)


def classify_intent(query: str) -> Tuple[Intent, bool]:
    """Classify a chat query by keyword and report whether it is in scope.

    Matching is case-insensitive and word-based. When several categories
    match, the priority is attendance, then result, then syllabus.

    Args:
        query: Raw user query text.

    Returns:
        A ``(intent, in_scope)`` tuple. ``in_scope`` is true when the query
        contains an education keyword, a course-code-like token, or a
        greeting. A blank query yields ``("other", False)``.
    """
    q = (query or "").strip().lower()
    if not q:
        return "other", False

    has_attendance = bool(_ATTENDANCE_RE.search(q))
    has_result = bool(_RESULT_RE.search(q))
    has_syllabus = bool(_SYLLABUS_RE.search(q))

    in_scope = (
        has_attendance
        or has_result
        or has_syllabus
        or bool(_GENERAL_EDUCATION_RE.search(q))
        or bool(_COURSE_CODE_RE.search(q))
        or bool(_GREETING_RE.search(q))
    )

    if has_attendance:
        return "attendance", in_scope
    if has_result:
        return "result", in_scope
    if has_syllabus:
        return "syllabus", in_scope
    return "other", in_scope
app/services/pdf_service.py CHANGED
@@ -1,8 +1,11 @@
1
  import io
2
  import time
 
3
 
4
  import requests
 
5
  from pypdf import PdfReader
 
6
 
7
 
8
  def fetch_pdf_text(
@@ -12,10 +15,38 @@ def fetch_pdf_text(
12
  backoff_sec: float = 1.5,
13
  ) -> str:
14
  last_exc: Exception | None = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  for attempt in range(max_retries):
17
  try:
18
- response = requests.get(pdf_url, timeout=timeout)
 
 
 
 
19
  response.raise_for_status()
20
  pdf_stream = io.BytesIO(response.content)
21
  reader = PdfReader(pdf_stream)
@@ -29,6 +60,26 @@ def fetch_pdf_text(
29
  return "\n\n".join(extracted).strip()
30
  except Exception as exc:
31
  last_exc = exc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  if attempt < max_retries - 1:
33
  sleep_sec = backoff_sec * (2 ** attempt)
34
  time.sleep(sleep_sec)
 
1
  import io
2
  import time
3
+ import urllib.request
4
 
5
  import requests
6
+ from requests.adapters import HTTPAdapter
7
  from pypdf import PdfReader
8
+ from urllib3.util import Retry
9
 
10
 
11
  def fetch_pdf_text(
 
15
  backoff_sec: float = 1.5,
16
  ) -> str:
17
  last_exc: Exception | None = None
18
+ connect_timeout = min(max(int(timeout / 3), 10), 30)
19
+ read_timeout = max(timeout, 60)
20
+
21
+ session = requests.Session()
22
+ retry_cfg = Retry(
23
+ total=max_retries,
24
+ connect=max_retries,
25
+ read=max_retries,
26
+ backoff_factor=backoff_sec,
27
+ status_forcelist=(429, 500, 502, 503, 504),
28
+ allowed_methods=frozenset(["GET", "HEAD"]),
29
+ )
30
+ adapter = HTTPAdapter(max_retries=retry_cfg)
31
+ session.mount("http://", adapter)
32
+ session.mount("https://", adapter)
33
+
34
+ headers = {
35
+ "User-Agent": (
36
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
37
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
38
+ "Chrome/124.0.0.0 Safari/537.36"
39
+ ),
40
+ "Accept": "application/pdf,*/*;q=0.8",
41
+ }
42
 
43
  for attempt in range(max_retries):
44
  try:
45
+ response = session.get(
46
+ pdf_url,
47
+ headers=headers,
48
+ timeout=(connect_timeout, read_timeout),
49
+ )
50
  response.raise_for_status()
51
  pdf_stream = io.BytesIO(response.content)
52
  reader = PdfReader(pdf_stream)
 
60
  return "\n\n".join(extracted).strip()
61
  except Exception as exc:
62
  last_exc = exc
63
+
64
+ # Fallback path: some hosts behave better with urllib defaults.
65
+ try:
66
+ req = urllib.request.Request(
67
+ pdf_url,
68
+ headers={"User-Agent": headers["User-Agent"]},
69
+ )
70
+ with urllib.request.urlopen(req, timeout=read_timeout) as resp:
71
+ content = resp.read()
72
+ pdf_stream = io.BytesIO(content)
73
+ reader = PdfReader(pdf_stream)
74
+ extracted = []
75
+ for page in reader.pages:
76
+ text = page.extract_text() or ""
77
+ if text.strip():
78
+ extracted.append(text)
79
+ return "\n\n".join(extracted).strip()
80
+ except Exception as fallback_exc:
81
+ last_exc = fallback_exc
82
+
83
  if attempt < max_retries - 1:
84
  sleep_sec = backoff_sec * (2 ** attempt)
85
  time.sleep(sleep_sec)
app/services/student_service.py CHANGED
@@ -4,6 +4,8 @@ import requests
4
  def fetch_student_info(
5
  student_performance_url_template: str,
6
  student_id: int,
 
 
7
  timeout: int = 20,
8
  ) -> dict:
9
  if "{student_id}" not in student_performance_url_template:
@@ -12,11 +14,37 @@ def fetch_student_info(
12
  )
13
 
14
  student_url = student_performance_url_template.format(student_id=student_id)
15
- get_resp = requests.get(student_url, timeout=timeout)
 
 
 
 
16
  if not get_resp.ok:
17
  raise RuntimeError(
18
  f"Failed to fetch student info from {student_url}; "
19
  f"status {get_resp.status_code}."
20
  )
21
 
22
- return get_resp.json()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  def fetch_student_info(
5
  student_performance_url_template: str,
6
  student_id: int,
7
+ semester: int,
8
+ intent: str,
9
  timeout: int = 20,
10
  ) -> dict:
11
  if "{student_id}" not in student_performance_url_template:
 
14
  )
15
 
16
  student_url = student_performance_url_template.format(student_id=student_id)
17
+ get_resp = requests.get(
18
+ student_url,
19
+ params={"semester": semester, "intent": intent},
20
+ timeout=timeout,
21
+ )
22
  if not get_resp.ok:
23
  raise RuntimeError(
24
  f"Failed to fetch student info from {student_url}; "
25
  f"status {get_resp.status_code}."
26
  )
27
 
28
+ return _filter_student_info_by_intent(get_resp.json(), semester=semester, intent=intent)
29
+
30
+
31
+ def _filter_student_info_by_intent(payload: dict, semester: int, intent: str) -> dict:
32
+ # Fallback local filtering in case backend ignores intent/semester params.
33
+ data = dict(payload)
34
+
35
+ if intent == "attendance":
36
+ data.pop("results", None)
37
+ return data
38
+
39
+ if intent == "result":
40
+ data.pop("attendance", None)
41
+ results = data.get("results")
42
+ if isinstance(results, dict):
43
+ semesters = results.get("semesters")
44
+ if isinstance(semesters, list):
45
+ results["semesters"] = [
46
+ s for s in semesters if int(s.get("semester", -1)) == int(semester)
47
+ ]
48
+ return data
49
+
50
+ return data