quantumbit Copilot commited on
Commit
fdb66ba
·
0 Parent(s):

initial commit

Browse files

Co-authored-by: Copilot <copilot@github.com>

.dockerignore ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.pyo
4
+ *.pyd
5
+ *.so
6
+ *.egg-info/
7
+ .pytest_cache/
8
+ .mypy_cache/
9
+ .ruff_cache/
10
+
11
+ .venv/
12
+ venv/
13
+ env/
14
+
15
+ .env
16
+ app/.env
17
+ .vscode/
18
+ .git/
19
+ .gitignore
20
+
21
+ data/
22
+ *.log
.env.example ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ GEMINI_API_KEY=your_gemini_api_key
2
+ GEMINI_MODEL=gemini-2.5-flash
3
+ EMBEDDING_MODEL_NAME=sentence-transformers/all-MiniLM-L6-v2
4
+ STUDENT_PERFORMANCE_URL_TEMPLATE=https://git-connect-backend-v2.vercel.app/api/student/{student_id}/performance
5
+ VECTOR_DATA_DIR=data/vector_index
6
+ RAW_TEXT_DIR=data/raw_text
7
+ PDF_TIMEOUT_SEC=60
8
+ PDF_MAX_RETRIES=3
9
+ PDF_RETRY_BACKOFF_SEC=1.5
.github/workflows/deploy-hf-space.yml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Deploy To Hugging Face Space
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ workflow_dispatch:
8
+
9
+ jobs:
10
+ deploy:
11
+ runs-on: ubuntu-latest
12
+
13
+ steps:
14
+ - name: Checkout
15
+ uses: actions/checkout@v4
16
+ with:
17
+ fetch-depth: 0
18
+
19
+ - name: Validate required secrets
20
+ shell: bash
21
+ run: |
22
+ test -n "${{ secrets.HF_TOKEN }}" || (echo "HF_TOKEN is missing" && exit 1)
23
+ test -n "${{ secrets.HF_SPACE_REPO_ID }}" || (echo "HF_SPACE_REPO_ID is missing" && exit 1)
24
+
25
+ - name: Configure git user
26
+ run: |
27
+ git config --global user.name "github-actions[bot]"
28
+ git config --global user.email "github-actions[bot]@users.noreply.github.com"
29
+
30
+ - name: Push repository to Hugging Face Space
31
+ env:
32
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
33
+ HF_SPACE_REPO_ID: ${{ secrets.HF_SPACE_REPO_ID }}
34
+ shell: bash
35
+ run: |
36
+ HF_URL="https://oauth2:${HF_TOKEN}@huggingface.co/spaces/${HF_SPACE_REPO_ID}"
37
+ git remote remove hf 2>/dev/null || true
38
+ git remote add hf "${HF_URL}"
39
+ git push hf HEAD:main --force
.gitignore ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.pyo
5
+ *.pyd
6
+ *.so
7
+ *.egg-info/
8
+ .pytest_cache/
9
+ .mypy_cache/
10
+ .ruff_cache/
11
+
12
+ # Virtual environments
13
+ .venv/
14
+ venv/
15
+ env/
16
+
17
+ # Environment files and secrets
18
+ .env
19
+ app/.env
20
+ *.env.local
21
+
22
+ # IDE/editor
23
+ .vscode/
24
+ .idea/
25
+
26
+ # OS files
27
+ .DS_Store
28
+ Thumbs.db
29
+
30
+ # Runtime and local data
31
+ data/raw_text/
32
+ data/vector_index/
33
+ *.faiss
34
+ *.meta.json
35
+ *.log
36
+
37
+ # Hugging Face and model cache
38
+ .cache/
39
+ .huggingface/
40
+
41
+ # Build artifacts
42
+ dist/
43
+ build/
44
+ prompt.md
45
+ stud_info.md
46
+ test_db_to_api.py
Dockerfile ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.12-slim
2
+
3
+ ENV PYTHONDONTWRITEBYTECODE=1 \
4
+ PYTHONUNBUFFERED=1 \
5
+ PIP_NO_CACHE_DIR=1
6
+
7
+ WORKDIR /app
8
+
9
+ COPY requirements.txt /app/requirements.txt
10
+ RUN pip install --upgrade pip && pip install -r /app/requirements.txt
11
+
12
+ COPY app /app/app
13
+ COPY .env.example /app/.env.example
14
+ COPY README.md /app/README.md
15
+
16
+ EXPOSE 7860
17
+
18
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GitConnect FastAPI Service
2
+
3
+ FastAPI backend with two endpoints:
4
+ - Syllabus processing from PDF URLs with FAISS vector indexing and multilingual course summaries.
5
+ - Chatbot endpoint using student performance data + semester-scoped syllabus RAG with MMR.
6
+
7
+ Embedding setup:
8
+ - Uses local Hugging Face sentence embeddings (default: `sentence-transformers/all-MiniLM-L6-v2`).
9
+ - Uses FAISS (`IndexFlatIP` with L2-normalized vectors) for similarity search.
10
+ - Gemini is used for summarization and chatbot generation.
11
+
12
+ ## Setup
13
+
14
+ 1. Create a virtual environment and activate it.
15
+ 2. Install dependencies:
16
+
17
+ ```bash
18
+ pip install -r requirements.txt
19
+ ```
20
+
21
+ 3. Create `.env` from `.env.example` and fill `GEMINI_API_KEY`.
22
+ 4. Run server:
23
+
24
+ ```bash
25
+ uvicorn app.main:app --reload
26
+ ```
27
+
28
+ ## Endpoints
29
+
30
+ - `GET /health`
31
+ - `POST /api/syllabus/process`
32
+ - `POST /api/chat`
33
+
34
+ ## Deploy to Hugging Face Spaces
35
+
36
+ This repo is configured for Docker Spaces deployment.
37
+
38
+ Files used for deployment:
39
+ - `Dockerfile`
40
+ - `requirements.txt`
41
+ - `app/`
42
+ - `.github/workflows/deploy-hf-space.yml`
43
+
44
+ Files intentionally not pushed from local machine:
45
+ - `.env` and `app/.env`
46
+ - `data/` generated files
47
+ - local caches and `__pycache__/`
48
+
49
+ GitHub Action deployment:
50
+ - Trigger: push to `main` or manual run
51
+ - Workflow file: `.github/workflows/deploy-hf-space.yml`
52
+
53
+ Required GitHub repository secrets:
54
+ - `HF_TOKEN`: Hugging Face write token
55
+ - `HF_SPACE_REPO_ID`: in format `username/space-name`
56
+
57
+ After setting secrets, pushing to `main` will sync this repo to your Hugging Face Space `main` branch.
58
+
59
+ Student performance is fetched from the URL template configured in:
60
+ - `STUDENT_PERFORMANCE_URL_TEMPLATE` (must contain the `{student_id}` placeholder)
61
+ - Default value: `https://git-connect-backend-v2.vercel.app/api/student/{student_id}/performance`
62
+
63
+ ## Sample syllabus request
64
+
65
+ ```json
66
+ [
67
+ {
68
+ "course_code": "22CS501",
69
+ "name": "Database Management Systems",
70
+ "course_type": "theory",
71
+ "syllabus_url": "https://example.com/dbms.pdf",
72
+ "semester": 5
73
+ }
74
+ ]
75
+ ```
76
+
77
+ ## Sample chat request
78
+
79
+ ```json
80
+ {
81
+ "query": "How can I improve attendance this semester?",
82
+ "history": [
83
+ {"role": "user", "content": "Hi"},
84
+ {"role": "assistant", "content": "Hello"}
85
+ ],
86
+ "student_id": 123,
87
+ "lang_code": "en",
88
+ "semester": 5
89
+ }
90
+ ```
app/__init__.py ADDED
File without changes
app/config.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+
4
+ from dotenv import load_dotenv
5
+
6
+ # Load both potential env file locations.
7
+ _APP_DIR = Path(__file__).resolve().parent
8
+ _ROOT_ENV = _APP_DIR.parent / ".env"
9
+ _APP_ENV = _APP_DIR / ".env"
10
+
11
+ load_dotenv(dotenv_path=_ROOT_ENV, override=False)
12
+ load_dotenv(dotenv_path=_APP_ENV, override=True)
13
+
14
+
15
class Settings:
    """Runtime configuration, read once from the process environment.

    Every attribute is evaluated when this class body executes (at import
    time), so environment variables must be in place before the module is
    imported. Numeric settings are converted eagerly and raise ``ValueError``
    if the environment holds a non-numeric value.
    """

    _env = os.environ

    # Gemini credentials and generation model.
    gemini_api_key: str = _env.get("GEMINI_API_KEY", "")
    gemini_model: str = _env.get("GEMINI_MODEL", "gemini-2.5-flash")

    # Local sentence-embedding model used for retrieval.
    embedding_model_name: str = _env.get(
        "EMBEDDING_MODEL_NAME",
        "sentence-transformers/all-MiniLM-L6-v2",
    )

    # URL template for the student-performance backend.
    student_performance_url_template: str = _env.get(
        "STUDENT_PERFORMANCE_URL_TEMPLATE",
        "https://git-connect-backend-v2.vercel.app/api/student/{student_id}/performance",
    )

    # On-disk locations for the vector index and the extracted syllabus text.
    vector_data_dir: str = _env.get("VECTOR_DATA_DIR", "data/vector_index")
    raw_text_dir: str = _env.get("RAW_TEXT_DIR", "data/raw_text")

    # PDF download behaviour: per-request timeout, attempt count, base backoff.
    pdf_timeout_sec: int = int(_env.get("PDF_TIMEOUT_SEC", "60"))
    pdf_max_retries: int = int(_env.get("PDF_MAX_RETRIES", "3"))
    pdf_retry_backoff_sec: float = float(_env.get("PDF_RETRY_BACKOFF_SEC", "1.5"))

    # The alias above is only a shorthand for building the class body.
    del _env
31
+
32
+
33
+ settings = Settings()
app/main.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List
3
+
4
+ from fastapi import FastAPI, HTTPException
5
+
6
+ from app.config import settings
7
+ from app.models import (
8
+ ChatRequest,
9
+ ChatResponse,
10
+ CourseInput,
11
+ CourseProcessError,
12
+ CourseSummary,
13
+ SyllabusProcessResponse,
14
+ )
15
+ from app.services.gemini_service import GeminiService
16
+ from app.services.pdf_service import chunk_text, fetch_pdf_text
17
+ from app.services.rag_service import build_student_documents, mmr_select
18
+ from app.services.student_service import fetch_student_info
19
+ from app.vector_store import LocalVectorStore
20
+
21
+
22
+ app = FastAPI(title="GitConnect Chatbot Service", version="0.1.0")
23
+
24
+
25
@app.get("/health")
def health() -> dict:
    # Liveness probe: always answers with a fixed payload.
    payload = {"status": "ok"}
    return payload
28
+
29
+
30
@app.post("/api/syllabus/process", response_model=SyllabusProcessResponse)
def process_syllabus(courses: List[CourseInput]) -> SyllabusProcessResponse:
    """Download, index and summarise each submitted course syllabus.

    For every course the PDF is fetched, its text is written to
    ``settings.raw_text_dir``, chunked, embedded and upserted into the
    semester's vector index, and a multilingual summary is generated.
    A failure in one course is recorded in ``failed`` and does not stop the
    remaining courses.

    Raises:
        HTTPException: 500 when the Gemini service cannot be constructed
            (for example a missing API key).
    """
    if not courses:
        return SyllabusProcessResponse(
            results=[],
            failed=[],
            total_received=0,
            total_processed=0,
            total_failed=0,
        )

    try:
        gemini = GeminiService(
            settings.gemini_api_key,
            settings.gemini_model,
            settings.embedding_model_name,
        )
    except ValueError as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc

    vector_store = LocalVectorStore(settings.vector_data_dir)
    os.makedirs(settings.raw_text_dir, exist_ok=True)

    results: List[CourseSummary] = []
    failed: List[CourseProcessError] = []

    for course in courses:
        try:
            # NOTE(review): syllabus_url is client-supplied and fetched
            # server-side; consider restricting allowed hosts (SSRF).
            syllabus_text = fetch_pdf_text(
                str(course.syllabus_url),
                timeout=settings.pdf_timeout_sec,
                max_retries=settings.pdf_max_retries,
                backoff_sec=settings.pdf_retry_backoff_sec,
            )
            if not syllabus_text:
                raise RuntimeError("No text extracted from PDF.")

            # course_code comes from the request body; keep only characters
            # that are safe in a file name so values such as "../../x" cannot
            # escape raw_text_dir.
            safe_stem = "".join(
                ch if (ch.isalnum() or ch in "-_") else "_"
                for ch in course.course_code
            )
            raw_path = os.path.join(settings.raw_text_dir, f"{safe_stem}.txt")
            with open(raw_path, "w", encoding="utf-8") as f:
                f.write(syllabus_text)

            chunks = chunk_text(syllabus_text)
            if not chunks:
                raise RuntimeError("Unable to create text chunks from syllabus content.")

            embeddings = [
                gemini.embed_text(chunk, task_type="retrieval_document")
                for chunk in chunks
            ]
            vector_store.upsert_documents(course.semester, course.course_code, chunks, embeddings)

            ai_summary = gemini.summarize_multilingual(course.name, syllabus_text)
            results.append(CourseSummary(course_code=course.course_code, ai_summary=ai_summary))
        except Exception as exc:
            # Best-effort batch: report the failure and continue with the rest.
            failed.append(CourseProcessError(course_code=course.course_code, error=str(exc)))

    return SyllabusProcessResponse(
        results=results,
        failed=failed,
        total_received=len(courses),
        total_processed=len(results),
        total_failed=len(failed),
    )
93
+
94
+
95
@app.post("/api/chat", response_model=ChatResponse)
def chat(req: ChatRequest) -> ChatResponse:
    """Answer a student query using performance data and syllabus retrieval.

    Steps: fetch the student's performance record, embed the query, retrieve
    syllabus chunks for the requested semester, build documents from the
    performance record, pick a diverse subset with MMR, and ask Gemini to
    answer from that context.

    Raises:
        HTTPException: 500 if the Gemini service cannot be constructed,
            retrieval fails, or the LLM call fails; 502 if the student
            performance backend cannot be reached.
    """
    try:
        gemini = GeminiService(
            settings.gemini_api_key,
            settings.gemini_model,
            settings.embedding_model_name,
        )
    except ValueError as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc

    vector_store = LocalVectorStore(settings.vector_data_dir)

    try:
        student_info = fetch_student_info(
            settings.student_performance_url_template,
            req.student_id,
        )
    except Exception as exc:
        raise HTTPException(status_code=502, detail=f"Student info fetch failed: {exc}") from exc

    try:
        query_embedding = gemini.embed_text(req.query, task_type="retrieval_query")

        syllabus_hits = vector_store.search(req.semester, query_embedding, top_k=20)
        for hit in syllabus_hits:
            hit["source"] = "syllabus"
            # mmr_select ignores candidates that have no "embedding". Search
            # hits may come back without one, which would silently drop all
            # syllabus context, so embed any hit that lacks a vector.
            if not hit.get("embedding"):
                hit["embedding"] = gemini.embed_text(
                    hit["chunk"],
                    task_type="retrieval_document",
                )

        student_docs = build_student_documents(student_info)
        for doc in student_docs:
            doc["embedding"] = gemini.embed_text(
                doc["chunk"],
                task_type="retrieval_document",
            )

        combined_candidates = syllabus_hits + student_docs
        hits = mmr_select(
            query_embedding=query_embedding,
            candidates=combined_candidates,
            top_k=8,
            lambda_param=0.7,
        )
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"RAG retrieval failed: {exc}") from exc

    rag_chunks = [f"[{h.get('source', 'unknown')}] {h['chunk']}" for h in hits]
    # Unique, non-blank course codes among the selected chunks, sorted.
    retrieved_course_codes = sorted(
        {
            h.get("course_code", "")
            for h in hits
            if str(h.get("course_code", "")).strip()
        }
    )

    try:
        reply = gemini.chat_with_context(
            query=req.query,
            lang_code=req.lang_code,
            history=[msg.model_dump() for msg in req.history],
            student_info=student_info,
            rag_chunks=rag_chunks,
        )
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"LLM response failed: {exc}") from exc

    return ChatResponse(
        reply_markdown=reply,
        retrieved_course_codes=retrieved_course_codes,
        student_info=student_info,
    )
app/models.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Literal, Optional
2
+ from pydantic import BaseModel, Field, HttpUrl
3
+
4
+
5
+ LangCode = Literal["en", "hn", "mr", "kn"]
6
+
7
+
8
class CourseInput(BaseModel):
    # One item of the syllabus-processing request body.
    # All text fields are required and must be non-empty.
    course_code: str = Field(min_length=1)
    name: str = Field(min_length=1)
    course_type: str = Field(min_length=1)
    # Must parse as an http(s) URL.
    syllabus_url: HttpUrl
    # Required; accepted range is 1 through 12 inclusive.
    semester: int = Field(ge=1, le=12)
14
+
15
+
16
class CourseSummary(BaseModel):
    # Result entry for a successfully processed course.
    course_code: str
    # Summary text keyed by language code (one of the LangCode literals).
    ai_summary: Dict[LangCode, str]
19
+
20
+
21
class CourseProcessError(BaseModel):
    # Result entry for a course whose processing raised an exception.
    course_code: str
    # Stringified exception message.
    error: str
24
+
25
+
26
class SyllabusProcessResponse(BaseModel):
    # Response body of the syllabus-processing endpoint.
    # Per-course outcomes: successes in `results`, failures in `failed`.
    results: List[CourseSummary]
    failed: List[CourseProcessError] = Field(default_factory=list)
    # Counters describing the batch.
    total_received: int
    total_processed: int
    total_failed: int
32
+
33
+
34
class ChatMessage(BaseModel):
    # A single prior conversation turn supplied by the client.
    role: Literal["user", "assistant", "system"]
    content: str
37
+
38
+
39
class ChatRequest(BaseModel):
    # Request body of the chat endpoint.
    query: str
    # Optional prior turns; at most five entries are accepted.
    history: List[ChatMessage] = Field(default_factory=list, max_length=5)
    # Required positive integer identifier.
    student_id: int = Field(ge=1)
    # Language the reply should be written in.
    lang_code: LangCode
    # Required; accepted range is 1 through 12 inclusive.
    semester: int = Field(ge=1, le=12)
45
+
46
+
47
class ChatResponse(BaseModel):
    # Response body of the chat endpoint.
    # Markdown-formatted answer produced by the LLM.
    reply_markdown: str
    # Course codes attached to the context chunks that were selected.
    retrieved_course_codes: List[str]
    # Student performance payload echoed back; may be omitted.
    student_info: Optional[dict] = None
app/services/__init__.py ADDED
File without changes
app/services/gemini_service.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from typing import Dict, List
3
+
4
+ import google.generativeai as genai
5
+ from sentence_transformers import SentenceTransformer
6
+
7
+
8
class GeminiService:
    """Gemini text generation plus a local sentence-embedding model.

    Generation (summaries, chat replies) goes through ``google.generativeai``;
    embeddings are computed locally with a ``SentenceTransformer`` that is
    loaded once per model name and shared across instances.
    """

    # Shared across instances so each embedding model is loaded only once.
    # The annotation is a string so it is not evaluated at class creation.
    _embedding_model_cache: "dict[str, SentenceTransformer]" = {}

    # Language codes every summary must contain, in output order.
    _SUMMARY_LANGS = ("en", "mr", "kn", "hn")

    def __init__(self, api_key: str, model_name: str, embedding_model_name: str) -> None:
        """Configure the Gemini client and resolve the embedding model.

        Raises:
            ValueError: if ``api_key`` is empty.
        """
        if not api_key:
            raise ValueError("GEMINI_API_KEY is not set.")
        genai.configure(api_key=api_key)
        self._model = genai.GenerativeModel(model_name)
        self._embedding_model = self._get_embedding_model(embedding_model_name)

    def embed_text(self, text: str, task_type: str) -> List[float]:
        """Return the embedding of ``text`` as a plain list of floats.

        ``task_type`` is accepted for API compatibility and is not used by
        the local embedding model. Vectors are returned un-normalised.
        """
        _ = task_type
        emb = self._embedding_model.encode(text, normalize_embeddings=False)
        return emb.tolist()

    def summarize_multilingual(self, course_name: str, syllabus_text: str) -> Dict[str, str]:
        """Ask Gemini for en/mr/kn/hn summaries of a syllabus.

        Only the first 12000 characters of ``syllabus_text`` are sent.

        Raises:
            ValueError: if the model reply does not contain parseable JSON
                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        """
        prompt = f"""
You are an academic assistant.
Summarize the course syllabus content for course: {course_name}.

Return STRICT JSON with this exact schema and keys only:
{{
"en": "English summary",
"mr": "Marathi summary",
"kn": "Kannada summary",
"hn": "Hindi summary"
}}

Rules:
- Keep each summary clear for students and parents.
- 80-140 words per language.
- Use only the syllabus context below.

Syllabus context:
{syllabus_text[:12000]}
"""
        raw = self._model.generate_content(prompt).text
        return self._safe_parse_summary_json(raw)

    def chat_with_context(
        self,
        query: str,
        lang_code: str,
        history: List[dict],
        student_info: dict,
        rag_chunks: List[str],
    ) -> str:
        """Generate a markdown reply grounded in student data and context.

        At most the first eight entries of ``rag_chunks`` are included in the
        prompt. ``history`` items are read via their ``role`` and ``content``
        keys.
        """
        history_text = "\n".join(
            [f"{msg.get('role', 'user')}: {msg.get('content', '')}" for msg in history]
        )
        syllabus_context = "\n\n---\n\n".join(rag_chunks[:8])

        prompt = f"""
You are a helpful college assistant chatbot for students and parents.

Respond in language code: {lang_code}
Supported codes: en, hn, mr, kn.
Return the final answer in markdown.

Student data (attendance, result etc.):
{json.dumps(student_info, ensure_ascii=False)}

Recent chat history:
{history_text}

Relevant syllabus context:
{syllabus_context}

User query:
{query}

Answer guidelines:
- Be accurate and grounded in provided info.
- If data is missing, state what is missing.
- Keep response practical and concise.
- Use markdown with bullets or short headings when useful.
"""
        return self._model.generate_content(prompt).text

    def _safe_parse_summary_json(self, raw: str) -> Dict[str, str]:
        """Extract the summary object from a model reply.

        The reply may wrap the JSON in a Markdown code fence (any tag case)
        or surround it with prose, so the text between the first ``{`` and
        the last ``}`` is parsed. Missing or ``null`` languages become
        empty strings.

        Raises:
            ValueError: if no JSON object is present or it cannot be parsed.
        """
        text = (raw or "").strip()
        start = text.find("{")
        end = text.rfind("}")
        if start == -1 or end < start:
            raise ValueError("Model response does not contain a JSON object.")

        parsed = json.loads(text[start : end + 1])

        summary: Dict[str, str] = {}
        for lang in self._SUMMARY_LANGS:
            value = parsed.get(lang)
            # A JSON null must not turn into the literal text "None".
            summary[lang] = "" if value is None else str(value)
        return summary

    @classmethod
    def _get_embedding_model(cls, embedding_model_name: str) -> "SentenceTransformer":
        """Return the cached embedding model, loading it on first use."""
        if embedding_model_name not in cls._embedding_model_cache:
            cls._embedding_model_cache[embedding_model_name] = SentenceTransformer(
                embedding_model_name
            )
        return cls._embedding_model_cache[embedding_model_name]
app/services/pdf_service.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import time
3
+
4
+ import requests
5
+ from pypdf import PdfReader
6
+
7
+
8
def fetch_pdf_text(
    pdf_url: str,
    timeout: int = 60,
    max_retries: int = 3,
    backoff_sec: float = 1.5,
) -> str:
    """Download a PDF and return the text of its non-blank pages.

    Pages are joined with a blank line. Any failure (network, HTTP status,
    PDF parsing) triggers a retry with exponential backoff
    (``backoff_sec * 2**attempt``) until the attempts are used up.

    Args:
        pdf_url: Address of the PDF.
        timeout: Per-request timeout in seconds.
        max_retries: Total number of attempts; values below 1 are treated as
            1 so the download is always tried at least once.
        backoff_sec: Base delay between attempts, in seconds.

    Returns:
        The extracted text; an empty string when no page yields text.

    Raises:
        RuntimeError: when every attempt fails; the last underlying error is
            attached as ``__cause__``.
    """
    attempts = max(1, max_retries)
    last_exc: Exception | None = None

    for attempt in range(attempts):
        try:
            response = requests.get(pdf_url, timeout=timeout)
            response.raise_for_status()
            reader = PdfReader(io.BytesIO(response.content))

            page_texts = [page.extract_text() or "" for page in reader.pages]
            return "\n\n".join(t for t in page_texts if t.strip()).strip()
        except Exception as exc:
            last_exc = exc
            # No sleep after the final attempt.
            if attempt < attempts - 1:
                time.sleep(backoff_sec * (2 ** attempt))

    raise RuntimeError(
        f"Failed to fetch PDF after {attempts} attempts: {last_exc}"
    ) from last_exc
37
+
38
+
39
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 150) -> list[str]:
    """Split text into fixed-size character windows that overlap.

    Runs of whitespace are collapsed to single spaces before splitting.

    Args:
        text: Input text.
        chunk_size: Maximum characters per chunk; must be positive.
        overlap: Characters shared between consecutive chunks. Clamped to
            ``[0, chunk_size - 1]`` so the window always moves forward and
            never skips text.

    Returns:
        The chunks in order; an empty list for blank input.

    Raises:
        ValueError: if ``chunk_size`` is not positive and the text is
            non-blank.
    """
    if not text.strip():
        return []
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")

    clean_text = " ".join(text.split())
    total = len(clean_text)

    # A negative overlap would leave gaps; one >= chunk_size would stall.
    bounded_overlap = min(max(overlap, 0), chunk_size - 1)
    step = chunk_size - bounded_overlap

    chunks: list[str] = []
    for start in range(0, total, step):
        chunks.append(clean_text[start : start + chunk_size])
        # Stop once a window reaches the end so no tail-only duplicate follows.
        if start + chunk_size >= total:
            break
    return chunks
app/services/rag_service.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+
3
+ import numpy as np
4
+
5
+
6
def build_student_documents(student_info: Dict) -> List[Dict]:
    """Turn a student performance payload into retrievable text documents.

    Produces, in order: one profile overview, one document per attendance
    subject, and for each semester a summary followed by one document per
    subject result. Each document has ``source``, ``course_code`` and
    ``chunk`` keys.

    Sections that are missing or explicitly ``None`` in the payload are
    treated as empty instead of raising.
    """
    info = student_info or {}
    attendance = info.get("attendance") or {}
    results = info.get("results") or {}

    docs: List[Dict] = [
        {
            "source": "student_profile",
            "course_code": "",
            "chunk": (
                f"Student profile overview: current CGPA is {results.get('current_cgpa')}. "
                f"Overall attendance is {attendance.get('overall_pct')}% "
                f"with status {attendance.get('overall_status')}."
            ),
        }
    ]

    for subject in attendance.get("subjects") or []:
        docs.append(
            {
                "source": "attendance",
                "course_code": str(subject.get("course_code", "")),
                "chunk": (
                    f"Attendance for {subject.get('subject', '')} ({subject.get('course_code', '')}): "
                    f"{subject.get('attended_classes', 0)}/{subject.get('total_classes', 0)} classes, "
                    f"{subject.get('attendance_pct', 0)}%, status {subject.get('status', 'unknown')}."
                ),
            }
        )

    for sem in results.get("semesters") or []:
        sem_no = sem.get("semester")
        docs.append(
            {
                "source": "results_semester",
                "course_code": "",
                "chunk": (
                    f"Semester {sem_no} performance summary: SGPA {sem.get('sgpa', 'NA')}, "
                    f"CGPA {sem.get('cgpa', 'NA')}."
                ),
            }
        )

        for subject in sem.get("subjects") or []:
            docs.append(
                {
                    "source": "results_subject",
                    "course_code": str(subject.get("course_code", "")),
                    "chunk": (
                        f"Result for {subject.get('subject', '')} ({subject.get('course_code', '')}) "
                        f"in semester {sem_no}: total score {subject.get('total', 'NA')}"
                    ),
                }
            )

    return docs
66
+
67
+
68
def mmr_select(
    query_embedding: List[float],
    candidates: List[Dict],
    top_k: int = 8,
    lambda_param: float = 0.7,
) -> List[Dict]:
    """Pick up to ``top_k`` candidates by Maximal Marginal Relevance.

    Each step selects the candidate maximising
    ``lambda_param * cos(query, c) - (1 - lambda_param) * max cos(c, selected)``.
    The first pick is the candidate most similar to the query.

    Args:
        query_embedding: Query vector.
        candidates: Dicts carrying an ``"embedding"`` vector. Candidates with
            a missing, empty or zero-norm embedding are ignored, as are those
            whose dimension differs from the query's.
        top_k: Maximum number of results; non-positive values yield ``[]``.
        lambda_param: Trade-off between relevance (1.0) and diversity (0.0).

    Returns:
        The selected candidate dicts (same objects), in selection order. If
        the query vector has zero norm, the first ``top_k`` usable candidates
        are returned in input order.
    """
    if top_k <= 0 or not candidates:
        return []

    usable = []
    for cand in candidates:
        emb = cand.get("embedding")
        if emb is None:
            continue
        vec = np.asarray(emb, dtype=np.float32).ravel()
        norm = float(np.linalg.norm(vec))
        # An empty vector has norm 0 and is dropped here as well.
        if norm == 0.0:
            continue
        usable.append((cand, vec / norm))

    if not usable:
        return []

    q = np.asarray(query_embedding, dtype=np.float32).ravel()
    q_norm = float(np.linalg.norm(q))
    if q_norm == 0.0:
        return [cand for cand, _ in usable[:top_k]]

    # Vectors of another dimension cannot be compared with the query.
    usable = [(cand, unit) for cand, unit in usable if unit.shape == q.shape]
    if not usable:
        return []

    units = np.stack([unit for _, unit in usable])
    relevance = units @ (q / q_norm)

    first = int(np.argmax(relevance))
    chosen = [first]
    available = np.ones(len(usable), dtype=bool)
    available[first] = False
    # Highest similarity of every candidate to anything chosen so far;
    # updated incrementally instead of recomputing all pairs each round.
    nearest_chosen = units @ units[first]

    while len(chosen) < top_k and available.any():
        scores = lambda_param * relevance - (1.0 - lambda_param) * nearest_chosen
        scores = np.where(available, scores, -np.inf)
        # argmax returns the lowest index on ties, i.e. input order wins.
        pick = int(np.argmax(scores))
        chosen.append(pick)
        available[pick] = False
        nearest_chosen = np.maximum(nearest_chosen, units @ units[pick])

    return [usable[i][0] for i in chosen]
app/services/student_service.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+
3
+
4
def fetch_student_info(
    student_performance_url_template: str,
    student_id: int,
    timeout: int = 20,
) -> dict:
    """Fetch one student's performance record from the backend.

    Args:
        student_performance_url_template: URL containing the literal
            ``{student_id}`` placeholder.
        student_id: Value substituted into the template.
        timeout: Request timeout in seconds.

    Returns:
        The decoded JSON body of the response.

    Raises:
        ValueError: if the template lacks the ``{student_id}`` placeholder.
        RuntimeError: if the backend responds with a non-success status.
    """
    has_placeholder = "{student_id}" in student_performance_url_template
    if not has_placeholder:
        raise ValueError(
            "STUDENT_PERFORMANCE_URL_TEMPLATE must include '{student_id}'."
        )

    student_url = student_performance_url_template.format(student_id=student_id)
    get_resp = requests.get(student_url, timeout=timeout)

    if get_resp.ok:
        return get_resp.json()

    raise RuntimeError(
        f"Failed to fetch student info from {student_url}; "
        f"status {get_resp.status_code}."
    )
app/vector_store.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from typing import List
4
+
5
+ import faiss
6
+ import numpy as np
7
+
8
+
9
class LocalVectorStore:
    """File-backed vector store with one FAISS index per semester.

    For each semester two files live under ``base_dir``:
    ``semester_<n>.meta.json`` (list of records holding course code, chunk
    text and embedding) and ``semester_<n>.faiss`` (an inner-product index
    over the L2-normalised embeddings, in the same order as the records).
    """

    def __init__(self, base_dir: str) -> None:
        """Remember ``base_dir`` and create it if it does not exist."""
        self.base_dir = base_dir
        os.makedirs(self.base_dir, exist_ok=True)

    def upsert_documents(
        self,
        semester: int,
        course_code: str,
        chunks: List[str],
        embeddings: List[List[float]],
    ) -> None:
        """Replace all stored chunks of ``course_code`` in a semester.

        Existing records for the course are dropped, the new chunk/embedding
        pairs are appended, and the semester's FAISS index is rebuilt.
        ``chunks`` and ``embeddings`` are paired positionally; extra items in
        the longer list are ignored.
        """
        meta_path = self._semester_meta_path(semester)
        records = [
            r for r in self._load(meta_path) if r.get("course_code") != course_code
        ]
        records.extend(
            {"course_code": course_code, "chunk": chunk, "embedding": vector}
            for chunk, vector in zip(chunks, embeddings)
        )

        self._save(meta_path, records)
        self._rebuild_faiss_index(semester, records)

    def search(self, semester: int, query_embedding: List[float], top_k: int = 6) -> List[dict]:
        """Return the ``top_k`` most similar chunks of a semester.

        Each hit carries ``course_code``, ``chunk``, the stored ``embedding``
        (so callers can re-rank without re-embedding) and the ``score``
        reported by the index. Returns ``[]`` when ``top_k`` is not positive,
        the semester has no records, or no index can be built.
        """
        if top_k <= 0:
            return []

        records = self._load(self._semester_meta_path(semester))
        if not records:
            return []

        index_path = self._semester_index_path(semester)
        if not os.path.exists(index_path):
            # Metadata exists but the index file is missing: rebuild lazily.
            self._rebuild_faiss_index(semester, records)
        if not os.path.exists(index_path):
            return []

        index = faiss.read_index(index_path)

        q = np.array(query_embedding, dtype=np.float32).reshape(1, -1)
        faiss.normalize_L2(q)

        k = min(top_k, len(records))
        scores, indices = index.search(q, k)

        hits = []
        for score, idx in zip(scores[0].tolist(), indices[0].tolist()):
            # FAISS pads with -1 when fewer than k results exist; also guard
            # against an index that is out of step with the metadata.
            if not 0 <= idx < len(records):
                continue
            record = records[idx]
            hits.append(
                {
                    "course_code": record.get("course_code", ""),
                    "chunk": record.get("chunk", ""),
                    "embedding": record.get("embedding"),
                    "score": float(score),
                }
            )

        return hits

    def _semester_meta_path(self, semester: int) -> str:
        """Path of the JSON metadata file for ``semester``."""
        return os.path.join(self.base_dir, f"semester_{semester}.meta.json")

    def _semester_index_path(self, semester: int) -> str:
        """Path of the FAISS index file for ``semester``."""
        return os.path.join(self.base_dir, f"semester_{semester}.faiss")

    def _rebuild_faiss_index(self, semester: int, records: List[dict]) -> None:
        """Rewrite the semester's index from ``records``.

        With no records the index file is removed. If the embeddings do not
        form a 2-D array, nothing is written.
        """
        index_path = self._semester_index_path(semester)

        if not records:
            if os.path.exists(index_path):
                os.remove(index_path)
            return

        vectors = np.array([r["embedding"] for r in records], dtype=np.float32)
        if vectors.ndim != 2 or vectors.shape[0] == 0:
            return

        # Inner product on unit vectors equals cosine similarity.
        faiss.normalize_L2(vectors)
        index = faiss.IndexFlatIP(vectors.shape[1])
        index.add(vectors)
        faiss.write_index(index, index_path)

    @staticmethod
    def _load(path: str) -> List[dict]:
        """Read the record list at ``path``; ``[]`` if the file is absent."""
        if not os.path.exists(path):
            return []
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    @staticmethod
    def _save(path: str, data: List[dict]) -> None:
        """Write ``data`` as JSON to ``path``.

        The content goes to a sibling temporary file first and is then moved
        into place, so an interrupted write cannot leave a truncated file.
        """
        tmp_path = f"{path}.tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False)
        os.replace(tmp_path, path)
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi>=0.115.0
2
+ uvicorn[standard]>=0.30.0
3
+ requests>=2.32.0
4
+ pydantic>=2.8.0
5
+ python-dotenv>=1.0.1
6
+ pypdf>=5.0.0
7
+ numpy>=2.0.0
8
+ google-generativeai>=0.7.2
9
+ psycopg[binary]>=3.2.0
10
+ sentence-transformers>=3.0.1
11
+ faiss-cpu>=1.8.0.post1