Spaces:

quantumbit
/

chatbot-gitconnect

Paused

App Files Files Community

quantumbit Copilot committed 22 days ago

Commit

d0220ae

1 Parent(s): 1a1dc00

Fixed the preprocessing endpoint and updated the chat endpoint to return results specific to the requested semester

Browse files

Files changed (7) hide show

.gitignore +3 -1
app/main.py +115 -50
app/models.py +1 -3
app/services/gemini_service.py +17 -3
app/services/intent_service.py +91 -0
app/services/pdf_service.py +52 -1
app/services/student_service.py +30 -2

.gitignore CHANGED Viewed

@@ -33,6 +33,7 @@ data/vector_index/
 *.faiss
 *.meta.json
 *.log
 # Hugging Face and model cache
 .cache/
@@ -43,4 +44,5 @@ dist/
 build/
 prompt.md
 stud_info.md
-test_db_to_api.py

 *.faiss
 *.meta.json
 *.log
+context.txt
 # Hugging Face and model cache
 .cache/
 build/
 prompt.md
 stud_info.md
+test_db_to_api.py
+test.ipynb

app/main.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import os
 from typing import List
 from fastapi import FastAPI, HTTPException
@@ -13,13 +14,27 @@ from app.models import (
     SyllabusProcessResponse,
 )
 from app.services.gemini_service import GeminiService
 from app.services.pdf_service import chunk_text, fetch_pdf_text
-from app.services.rag_service import build_student_documents, mmr_select
 from app.services.student_service import fetch_student_info
 from app.vector_store import LocalVectorStore
 app = FastAPI(title="GitConnect Chatbot Service", version="0.1.0")
 @app.get("/health")
@@ -103,64 +118,114 @@ def chat(req: ChatRequest) -> ChatResponse:
     except ValueError as exc:
         raise HTTPException(status_code=500, detail=str(exc)) from exc
-    vector_store = LocalVectorStore(settings.vector_data_dir)
-    try:
-        student_info = fetch_student_info(
-            settings.student_performance_url_template,
-            req.student_id,
         )
-    except Exception as exc:
-        raise HTTPException(status_code=502, detail=f"Student info fetch failed: {exc}") from exc
     try:
-        query_embedding = gemini.embed_text(req.query, task_type="retrieval_query")
-        syllabus_hits = vector_store.search(req.semester, query_embedding, top_k=20)
-        for hit in syllabus_hits:
-            hit["source"] = "syllabus"
-        student_docs = build_student_documents(student_info)
-        for doc in student_docs:
-            doc["embedding"] = gemini.embed_text(
-                doc["chunk"],
-                task_type="retrieval_document",
-            )
-        combined_candidates = syllabus_hits + student_docs
-        hits = mmr_select(
-            query_embedding=query_embedding,
-            candidates=combined_candidates,
-            top_k=8,
-            lambda_param=0.7,
-        )
-    except Exception as exc:
-        raise HTTPException(status_code=500, detail=f"RAG retrieval failed: {exc}") from exc
-    rag_chunks = [f"[{h.get('source', 'unknown')}] {h['chunk']}" for h in hits]
-    retrieved_course_codes = sorted(
-        list(
-            {
-                h.get("course_code", "")
-                for h in hits
-                if str(h.get("course_code", "")).strip()
-            }
-        )
-    )
-    try:
-        reply = gemini.chat_with_context(
-            query=req.query,
-            lang_code=req.lang_code,
-            history=[msg.model_dump() for msg in req.history],
-            student_info=student_info,
-            rag_chunks=rag_chunks,
-        )
     except Exception as exc:
         raise HTTPException(status_code=500, detail=f"LLM response failed: {exc}") from exc
-    return ChatResponse(
-        reply_markdown=reply,
-        retrieved_course_codes=retrieved_course_codes,
-        student_info=student_info,
     )

 import os
+from pathlib import Path
 from typing import List
 from fastapi import FastAPI, HTTPException
     SyllabusProcessResponse,
 )
 from app.services.gemini_service import GeminiService
+from app.services.intent_service import classify_intent
 from app.services.pdf_service import chunk_text, fetch_pdf_text
 from app.services.student_service import fetch_student_info
 from app.vector_store import LocalVectorStore
 app = FastAPI(title="GitConnect Chatbot Service", version="0.1.0")
# Debug trace file, located one directory above the package that holds this module.
_CONTEXT_LOG_PATH = Path(__file__).resolve().parents[1] / "context.txt"


def _write_context_log(content: str) -> None:
    """Overwrite the debug context log with ``content`` (best effort).

    The file is replaced on every call, so it only ever holds the trace of
    the most recent call. Failures to write are reported and swallowed so
    that a diagnostics problem (read-only filesystem, missing directory,
    full disk) does not abort the request being served.

    Args:
        content: Full text to store in the log file, written as UTF-8.
    """
    try:
        _CONTEXT_LOG_PATH.write_text(content, encoding="utf-8")
    except OSError as exc:
        # Same reporting style as the startup warmup handler in this module.
        print(f"Context log write skipped due to error: {exc}")
@app.on_event("startup")
def warmup_embedding_model() -> None:
    """Preload the configured embedding model when the application starts.

    Any error raised during the preload is printed and otherwise ignored,
    so the service still starts when the warmup cannot complete.
    """
    try:
        preload = GeminiService.preload_embedding_model
        preload(settings.embedding_model_name)
    except Exception as warmup_error:
        # A failed warmup must not block startup.
        print(f"Embedding warmup skipped due to error: {warmup_error}")
 @app.get("/health")
     except ValueError as exc:
         raise HTTPException(status_code=500, detail=str(exc)) from exc
+    history_text = "\n".join(
+        [f"{msg.role}: {msg.content}" for msg in req.history]
+    )
+    intent, in_scope = classify_intent(req.query)
+    if not in_scope:
+        reply = (
+            "I can help only with education-related queries such as syllabus, attendance, "
+            "results, study planning, and course guidance."
         )
+        _write_context_log(
+            "Intent: out_of_scope\n"
+            f"Query: {req.query}\n"
+            "LLM Called: no\n"
+            f"Response: {reply}\n"
+        )
+        return ChatResponse(reply_markdown=reply)
+    prompt = ""
+    chunks_passed = 0
     try:
+        if intent in {"attendance", "result"}:
+            student_info = fetch_student_info(
+                settings.student_performance_url_template,
+                req.student_id,
+                semester=req.semester,
+                intent=intent,
+            )
+            prompt = f"""
+You are a college assistant. Respond in language code: {req.lang_code}.
+Return markdown only.
+Intent: {intent}
+User query: {req.query}
+Recent chat history:
+{history_text}
+Student performance context (authoritative):
+{student_info}
+Rules:
+- Answer only from the provided student performance context.
+- If asked for something unavailable in the context, clearly say it is unavailable.
+- Be concise and practical.
+"""
+        elif intent == "syllabus":
+            vector_store = LocalVectorStore(settings.vector_data_dir)
+            query_embedding = gemini.embed_text(req.query, task_type="retrieval_query")
+            hits = vector_store.search(req.semester, query_embedding, top_k=5)
+            hits = hits[:5]
+            chunks_passed = len(hits)
+            syllabus_context = "\n\n---\n\n".join(
+                [f"[{h.get('course_code', '')}] {h.get('chunk', '')}" for h in hits]
+            )
+            prompt = f"""
+You are a college assistant. Respond in language code: {req.lang_code}.
+Return markdown only.
+Intent: syllabus
+User query: {req.query}
+Recent chat history:
+{history_text}
+Syllabus context (authoritative):
+{syllabus_context}
+Rules:
+- Answer only from the provided syllabus context.
+- For unit/module queries, list units clearly with headings/bullets.
+- If exact detail is unavailable, state what is missing.
+"""
+        else:
+            prompt = f"""
+You are a helpful college assistant. Respond in language code: {req.lang_code}.
+Return markdown only.
+Intent: other (education-related)
+User query: {req.query}
+Recent chat history:
+{history_text}
+Rules:
+- Keep the response casual, helpful, and education-focused.
+- Do not answer non-education requests.
+- If needed, ask a brief clarifying question.
+"""
+        reply = gemini.generate_markdown(prompt)
     except Exception as exc:
         raise HTTPException(status_code=500, detail=f"LLM response failed: {exc}") from exc
+    _write_context_log(
+        f"Intent: {intent}\n"
+        f"Query: {req.query}\n"
+        f"Student ID: {req.student_id}\n"
+        f"Semester: {req.semester}\n"
+        f"Language: {req.lang_code}\n"
+        f"Student endpoint intent param: {intent if intent in {'attendance', 'result'} else 'not_called'}\n"
+        f"Student endpoint semester param: {req.semester if intent in {'attendance', 'result'} else 'not_called'}\n"
+        f"Syllabus chunks passed: {chunks_passed}\n"
+        "LLM Called: yes\n"
+        "\n--- Prompt Passed To LLM ---\n"
+        f"{prompt}\n"
     )
+    return ChatResponse(reply_markdown=reply)

app/models.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from typing import Dict, List, Literal, Optional
 from pydantic import BaseModel, Field, HttpUrl
@@ -46,5 +46,3 @@ class ChatRequest(BaseModel):
 class ChatResponse(BaseModel):
     reply_markdown: str
-    retrieved_course_codes: List[str]
-    student_info: Optional[dict] = None

+from typing import Dict, List, Literal
 from pydantic import BaseModel, Field, HttpUrl
 class ChatResponse(BaseModel):
     reply_markdown: str

app/services/gemini_service.py CHANGED Viewed

@@ -45,6 +45,9 @@ Syllabus context:
         raw = self._model.generate_content(prompt).text
         return self._safe_parse_summary_json(raw)
     def chat_with_context(
         self,
         query: str,
@@ -65,15 +68,20 @@ Respond in language code: {lang_code}
 Supported codes: en, hn, mr, kn.
 Return the final answer in markdown.
 Student data (attendance, result etc.):
 {json.dumps(student_info, ensure_ascii=False)}
 Recent chat history:
 {history_text}
-Relevant syllabus context:
-{syllabus_context}
 User query:
 {query}
@@ -107,3 +115,9 @@ Answer guidelines:
                 embedding_model_name
             )
         return cls._embedding_model_cache[embedding_model_name]

         raw = self._model.generate_content(prompt).text
         return self._safe_parse_summary_json(raw)
+    def generate_markdown(self, prompt: str) -> str:
+        return self._model.generate_content(prompt).text
     def chat_with_context(
         self,
         query: str,
 Supported codes: en, hn, mr, kn.
 Return the final answer in markdown.
+Grounding rules:
+- Prioritize facts from "Relevant syllabus context" for syllabus/unit/module questions.
+- If user asks for units/modules/topics of a course and context includes them, list them clearly.
+- Do not say data is missing unless the relevant syllabus context truly does not contain it.
+Relevant syllabus context:
+{syllabus_context}
 Student data (attendance, result etc.):
 {json.dumps(student_info, ensure_ascii=False)}
 Recent chat history:
 {history_text}
 User query:
 {query}
                 embedding_model_name
             )
         return cls._embedding_model_cache[embedding_model_name]
+    @classmethod
+    def preload_embedding_model(cls, embedding_model_name: str) -> None:
+        model = cls._get_embedding_model(embedding_model_name)
+        # Warm up once so the first real query does not pay model initialization cost.
+        model.encode("embedding warmup", normalize_embeddings=False)

app/services/intent_service.py ADDED Viewed

	@@ -0,0 +1,91 @@

import re
from typing import Iterable, Literal, Tuple

Intent = Literal["attendance", "result", "syllabus", "other"]

# Greetings count only as whole words/phrases ("hi" must not match "china").
_GREETING_PHRASES = frozenset(
    {
        "hi",
        "hello",
        "hey",
        "good morning",
        "good afternoon",
        "good evening",
    }
)
_ATTENDANCE_KEYWORDS = frozenset(
    {
        "attendance",
        "attend",
        "absent",
        "present",
        "classes",
        "attendance percentage",
        "overall attendance",
    }
)
_RESULT_KEYWORDS = frozenset(
    {
        "result",
        "results",
        "marks",
        "score",
        "sgpa",
        "cgpa",
        "gpa",
        "grade",
        "ia",
        "exam",
        "performance",
        "passed",
        "fail",
    }
)
_SYLLABUS_KEYWORDS = frozenset(
    {
        "syllabus",
        "unit",
        "units",
        "module",
        "modules",
        "topic",
        "topics",
        "course content",
        "chapters",
        "what is covered",
    }
)
# Education-related terms that do not select a specific intent on their own.
_GENERAL_EDUCATION_KEYWORDS = frozenset(
    {
        "semester",
        "course",
        "subject",
        "study",
        "assignment",
        "project",
        "college",
        "class",
        "exam prep",
    }
)


def _compile_terms(terms: Iterable[str], *, whole_word: bool) -> "re.Pattern[str]":
    """Build one regex matching any term that starts at a word boundary.

    With ``whole_word=True`` the term must also end at a word boundary;
    otherwise trailing characters are allowed, so a stem such as "attend"
    still matches "attended".
    """
    alternatives = "|".join(re.escape(term) for term in sorted(terms))
    closing = r"(?!\w)" if whole_word else ""
    return re.compile(rf"(?<!\w)(?:{alternatives}){closing}")


_GREETING_RE = _compile_terms(_GREETING_PHRASES, whole_word=True)
_ATTENDANCE_RE = _compile_terms(_ATTENDANCE_KEYWORDS, whole_word=False)
_RESULT_RE = _compile_terms(_RESULT_KEYWORDS, whole_word=False)
_SYLLABUS_RE = _compile_terms(_SYLLABUS_KEYWORDS, whole_word=False)
_GENERAL_EDUCATION_RE = _compile_terms(_GENERAL_EDUCATION_KEYWORDS, whole_word=False)
_COURSE_CODE_RE = re.compile(r"\b\d{2}[a-z]{2,}[a-z0-9]*\d+[a-z]?\b", flags=re.I)


def classify_intent(query: str) -> Tuple[Intent, bool]:
    """Classify a user query by keyword and report whether it is in scope.

    Matching is anchored at word boundaries rather than done on raw
    substrings: plain ``keyword in text`` checks treated "china" or "this"
    as the greeting "hi" and "india" or "social" as the result keyword "ia",
    which let off-topic queries through and mis-routed them.

    Args:
        query: Raw user message.

    Returns:
        A ``(intent, in_scope)`` pair. ``intent`` is the first match in the
        priority order attendance, result, syllabus, else ``"other"``.
        ``in_scope`` is True when the query contains any education keyword,
        a course-code-like token, or a greeting. An empty or whitespace-only
        query yields ``("other", False)``.
    """
    q = query.strip().lower()
    if not q:
        return "other", False

    # Any intent keyword is also an education keyword, so a hit here always
    # implies the query is in scope.
    if _ATTENDANCE_RE.search(q):
        return "attendance", True
    if _RESULT_RE.search(q):
        return "result", True
    if _SYLLABUS_RE.search(q):
        return "syllabus", True

    in_scope = bool(
        _GENERAL_EDUCATION_RE.search(q)
        or _COURSE_CODE_RE.search(q)
        or _GREETING_RE.search(q)
    )
    return "other", in_scope

app/services/pdf_service.py CHANGED Viewed

@@ -1,8 +1,11 @@
 import io
 import time
 import requests
 from pypdf import PdfReader
 def fetch_pdf_text(
@@ -12,10 +15,38 @@ def fetch_pdf_text(
     backoff_sec: float = 1.5,
 ) -> str:
     last_exc: Exception | None = None
     for attempt in range(max_retries):
         try:
-            response = requests.get(pdf_url, timeout=timeout)
             response.raise_for_status()
             pdf_stream = io.BytesIO(response.content)
             reader = PdfReader(pdf_stream)
@@ -29,6 +60,26 @@ def fetch_pdf_text(
             return "\n\n".join(extracted).strip()
         except Exception as exc:
             last_exc = exc
             if attempt < max_retries - 1:
                 sleep_sec = backoff_sec * (2 ** attempt)
                 time.sleep(sleep_sec)

 import io
 import time
+import urllib.request
 import requests
+from requests.adapters import HTTPAdapter
 from pypdf import PdfReader
+from urllib3.util import Retry
 def fetch_pdf_text(
     backoff_sec: float = 1.5,
 ) -> str:
     last_exc: Exception | None = None
+    connect_timeout = min(max(int(timeout / 3), 10), 30)
+    read_timeout = max(timeout, 60)
+    session = requests.Session()
+    retry_cfg = Retry(
+        total=max_retries,
+        connect=max_retries,
+        read=max_retries,
+        backoff_factor=backoff_sec,
+        status_forcelist=(429, 500, 502, 503, 504),
+        allowed_methods=frozenset(["GET", "HEAD"]),
+    )
+    adapter = HTTPAdapter(max_retries=retry_cfg)
+    session.mount("http://", adapter)
+    session.mount("https://", adapter)
+    headers = {
+        "User-Agent": (
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+            "AppleWebKit/537.36 (KHTML, like Gecko) "
+            "Chrome/124.0.0.0 Safari/537.36"
+        ),
+        "Accept": "application/pdf,*/*;q=0.8",
+    }
     for attempt in range(max_retries):
         try:
+            response = session.get(
+                pdf_url,
+                headers=headers,
+                timeout=(connect_timeout, read_timeout),
+            )
             response.raise_for_status()
             pdf_stream = io.BytesIO(response.content)
             reader = PdfReader(pdf_stream)
             return "\n\n".join(extracted).strip()
         except Exception as exc:
             last_exc = exc
+            # Fallback path: some hosts behave better with urllib defaults.
+            try:
+                req = urllib.request.Request(
+                    pdf_url,
+                    headers={"User-Agent": headers["User-Agent"]},
+                )
+                with urllib.request.urlopen(req, timeout=read_timeout) as resp:
+                    content = resp.read()
+                pdf_stream = io.BytesIO(content)
+                reader = PdfReader(pdf_stream)
+                extracted = []
+                for page in reader.pages:
+                    text = page.extract_text() or ""
+                    if text.strip():
+                        extracted.append(text)
+                return "\n\n".join(extracted).strip()
+            except Exception as fallback_exc:
+                last_exc = fallback_exc
             if attempt < max_retries - 1:
                 sleep_sec = backoff_sec * (2 ** attempt)
                 time.sleep(sleep_sec)

app/services/student_service.py CHANGED Viewed

@@ -4,6 +4,8 @@ import requests
 def fetch_student_info(
     student_performance_url_template: str,
     student_id: int,
     timeout: int = 20,
 ) -> dict:
     if "{student_id}" not in student_performance_url_template:
@@ -12,11 +14,37 @@ def fetch_student_info(
         )
     student_url = student_performance_url_template.format(student_id=student_id)
-    get_resp = requests.get(student_url, timeout=timeout)
     if not get_resp.ok:
         raise RuntimeError(
             f"Failed to fetch student info from {student_url}; "
             f"status {get_resp.status_code}."
         )
-    return get_resp.json()

 def fetch_student_info(
     student_performance_url_template: str,
     student_id: int,
+    semester: int,
+    intent: str,
     timeout: int = 20,
 ) -> dict:
     if "{student_id}" not in student_performance_url_template:
         )
     student_url = student_performance_url_template.format(student_id=student_id)
+    get_resp = requests.get(
+        student_url,
+        params={"semester": semester, "intent": intent},
+        timeout=timeout,
+    )
     if not get_resp.ok:
         raise RuntimeError(
             f"Failed to fetch student info from {student_url}; "
             f"status {get_resp.status_code}."
         )
+    return _filter_student_info_by_intent(get_resp.json(), semester=semester, intent=intent)
def _entry_matches_semester(entry: object, semester: int) -> bool:
    """Return True when ``entry`` is a dict whose "semester" equals ``semester``."""
    if not isinstance(entry, dict):
        return False
    try:
        return int(entry.get("semester", -1)) == semester
    except (TypeError, ValueError):
        # Null or non-numeric semester values cannot match any semester.
        return False


def _filter_student_info_by_intent(payload: dict, semester: int, intent: str) -> dict:
    """Trim a student payload to the part relevant for ``intent``.

    Fallback local filtering in case the backend ignores the intent/semester
    query parameters. The input ``payload`` is never modified; the nested
    "results" dict is rebuilt rather than edited in place.

    Args:
        payload: Decoded JSON object returned by the student endpoint.
        semester: Semester whose results should be kept for the "result" intent.
        intent: "attendance" drops the "results" key; "result" drops the
            "attendance" key and keeps only matching entries of
            ``results["semesters"]``; any other value leaves the data as is.

    Returns:
        A new dict with the irrelevant sections removed. Semester entries that
        are not dicts, or whose "semester" value is not integer-like, are
        dropped instead of raising.
    """
    data = dict(payload)
    if intent == "attendance":
        data.pop("results", None)
        return data
    if intent != "result":
        return data

    data.pop("attendance", None)
    results = data.get("results")
    if isinstance(results, dict):
        semesters = results.get("semesters")
        if isinstance(semesters, list):
            wanted = int(semester)
            # Build a fresh "results" dict so the caller's payload stays intact.
            data["results"] = {
                **results,
                "semesters": [
                    entry
                    for entry in semesters
                    if _entry_matches_semester(entry, wanted)
                ],
            }
    return data