Commit ·
fdb66ba
0
Parent(s):
initial commit
Browse filesCo-authored-by: Copilot <copilot@github.com>
- .dockerignore +22 -0
- .env.example +9 -0
- .github/workflows/deploy-hf-space.yml +39 -0
- .gitignore +46 -0
- Dockerfile +18 -0
- README.md +90 -0
- app/__init__.py +0 -0
- app/config.py +33 -0
- app/main.py +166 -0
- app/models.py +50 -0
- app/services/__init__.py +0 -0
- app/services/gemini_service.py +109 -0
- app/services/pdf_service.py +55 -0
- app/services/rag_service.py +130 -0
- app/services/student_service.py +22 -0
- app/vector_store.py +107 -0
- requirements.txt +11 -0
.dockerignore
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.py[cod]
|
| 3 |
+
*.pyo
|
| 4 |
+
*.pyd
|
| 5 |
+
*.so
|
| 6 |
+
*.egg-info/
|
| 7 |
+
.pytest_cache/
|
| 8 |
+
.mypy_cache/
|
| 9 |
+
.ruff_cache/
|
| 10 |
+
|
| 11 |
+
.venv/
|
| 12 |
+
venv/
|
| 13 |
+
env/
|
| 14 |
+
|
| 15 |
+
.env
|
| 16 |
+
app/.env
|
| 17 |
+
.vscode/
|
| 18 |
+
.git/
|
| 19 |
+
.gitignore
|
| 20 |
+
|
| 21 |
+
data/
|
| 22 |
+
*.log
|
.env.example
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
GEMINI_API_KEY=your_gemini_api_key
|
| 2 |
+
GEMINI_MODEL=gemini-2.5-flash
|
| 3 |
+
EMBEDDING_MODEL_NAME=sentence-transformers/all-MiniLM-L6-v2
|
| 4 |
+
STUDENT_PERFORMANCE_URL_TEMPLATE=https://git-connect-backend-v2.vercel.app/api/student/{student_id}/performance
|
| 5 |
+
VECTOR_DATA_DIR=data/vector_index
|
| 6 |
+
RAW_TEXT_DIR=data/raw_text
|
| 7 |
+
PDF_TIMEOUT_SEC=60
|
| 8 |
+
PDF_MAX_RETRIES=3
|
| 9 |
+
PDF_RETRY_BACKOFF_SEC=1.5
|
.github/workflows/deploy-hf-space.yml
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Deploy To Hugging Face Space

on:
  push:
    branches:
      - main
  workflow_dispatch:

# This job only needs to read the repository; the push to the Space
# authenticates with HF_TOKEN, not with the GitHub token.
permissions:
  contents: read

jobs:
  deploy:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # Full history is needed so the push to the Space carries every commit.
          fetch-depth: 0

      - name: Validate required secrets
        # Secrets are passed through the environment rather than interpolated
        # into the script text, so their contents are never parsed as shell code.
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          HF_SPACE_REPO_ID: ${{ secrets.HF_SPACE_REPO_ID }}
        shell: bash
        run: |
          test -n "${HF_TOKEN}" || { echo "HF_TOKEN is missing"; exit 1; }
          test -n "${HF_SPACE_REPO_ID}" || { echo "HF_SPACE_REPO_ID is missing"; exit 1; }

      - name: Configure git user
        run: |
          git config --global user.name "github-actions[bot]"
          git config --global user.email "github-actions[bot]@users.noreply.github.com"

      - name: Push repository to Hugging Face Space
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          HF_SPACE_REPO_ID: ${{ secrets.HF_SPACE_REPO_ID }}
        shell: bash
        run: |
          HF_URL="https://oauth2:${HF_TOKEN}@huggingface.co/spaces/${HF_SPACE_REPO_ID}"
          git remote remove hf 2>/dev/null || true
          git remote add hf "${HF_URL}"
          # Force push: the Space's main branch is overwritten with this repo's HEAD.
          git push hf HEAD:main --force
|
.gitignore
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*.pyo
|
| 5 |
+
*.pyd
|
| 6 |
+
*.so
|
| 7 |
+
*.egg-info/
|
| 8 |
+
.pytest_cache/
|
| 9 |
+
.mypy_cache/
|
| 10 |
+
.ruff_cache/
|
| 11 |
+
|
| 12 |
+
# Virtual environments
|
| 13 |
+
.venv/
|
| 14 |
+
venv/
|
| 15 |
+
env/
|
| 16 |
+
|
| 17 |
+
# Environment files and secrets
|
| 18 |
+
.env
|
| 19 |
+
app/.env
|
| 20 |
+
*.env.local
|
| 21 |
+
|
| 22 |
+
# IDE/editor
|
| 23 |
+
.vscode/
|
| 24 |
+
.idea/
|
| 25 |
+
|
| 26 |
+
# OS files
|
| 27 |
+
.DS_Store
|
| 28 |
+
Thumbs.db
|
| 29 |
+
|
| 30 |
+
# Runtime and local data
|
| 31 |
+
data/raw_text/
|
| 32 |
+
data/vector_index/
|
| 33 |
+
*.faiss
|
| 34 |
+
*.meta.json
|
| 35 |
+
*.log
|
| 36 |
+
|
| 37 |
+
# Hugging Face and model cache
|
| 38 |
+
.cache/
|
| 39 |
+
.huggingface/
|
| 40 |
+
|
| 41 |
+
# Build artifacts
|
| 42 |
+
dist/
|
| 43 |
+
build/
|
| 44 |
+
prompt.md
|
| 45 |
+
stud_info.md
|
| 46 |
+
test_db_to_api.py
|
Dockerfile
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.12-slim

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1

# Hugging Face Docker Spaces run the container as UID 1000. Create that user
# and give it a writable home, because the app downloads the embedding model
# and writes data/raw_text and data/vector_index at runtime.
RUN useradd --create-home --uid 1000 user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR /app

# Install dependencies first so this layer is cached across source-only changes.
COPY requirements.txt /app/requirements.txt
RUN pip install --upgrade pip && pip install -r /app/requirements.txt

COPY --chown=user app /app/app
COPY --chown=user .env.example /app/.env.example
COPY --chown=user README.md /app/README.md

# The default VECTOR_DATA_DIR / RAW_TEXT_DIR are relative to /app, so the
# working directory itself must be writable by the runtime user.
RUN mkdir -p /app/data && chown -R user /app

USER user

EXPOSE 7860

CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# GitConnect FastAPI Service
|
| 2 |
+
|
| 3 |
+
FastAPI backend with two endpoints:
|
| 4 |
+
- Syllabus processing from PDF URLs with FAISS vector indexing and multilingual course summaries.
|
| 5 |
+
- Chatbot endpoint using student performance data + semester-scoped syllabus RAG with MMR.
|
| 6 |
+
|
| 7 |
+
Embedding setup:
|
| 8 |
+
- Uses local Hugging Face sentence embeddings (default: `sentence-transformers/all-MiniLM-L6-v2`).
|
| 9 |
+
- Uses FAISS (`IndexFlatIP` with L2-normalized vectors) for similarity search.
|
| 10 |
+
- Gemini is used for summarization and chatbot generation.
|
| 11 |
+
|
| 12 |
+
## Setup
|
| 13 |
+
|
| 14 |
+
1. Create a virtual environment and activate it.
|
| 15 |
+
2. Install dependencies:
|
| 16 |
+
|
| 17 |
+
```bash
|
| 18 |
+
pip install -r requirements.txt
|
| 19 |
+
```
|
| 20 |
+
|
| 21 |
+
3. Create `.env` from `.env.example` and fill `GEMINI_API_KEY`.
|
| 22 |
+
4. Run server:
|
| 23 |
+
|
| 24 |
+
```bash
|
| 25 |
+
uvicorn app.main:app --reload
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
## Endpoints
|
| 29 |
+
|
| 30 |
+
- `GET /health`
|
| 31 |
+
- `POST /api/syllabus/process`
|
| 32 |
+
- `POST /api/chat`
|
| 33 |
+
|
| 34 |
+
## Deploy to Hugging Face Spaces
|
| 35 |
+
|
| 36 |
+
This repo is configured for Docker Spaces deployment.
|
| 37 |
+
|
| 38 |
+
Files used for deployment:
|
| 39 |
+
- `Dockerfile`
|
| 40 |
+
- `requirements.txt`
|
| 41 |
+
- `app/`
|
| 42 |
+
- `.github/workflows/deploy-hf-space.yml`
|
| 43 |
+
|
| 44 |
+
Files intentionally not pushed from local machine:
|
| 45 |
+
- `.env` and `app/.env`
|
| 46 |
+
- `data/` generated files
|
| 47 |
+
- local caches and `__pycache__/`
|
| 48 |
+
|
| 49 |
+
GitHub Action deployment:
|
| 50 |
+
- Trigger: push to `main` or manual run
|
| 51 |
+
- Workflow file: `.github/workflows/deploy-hf-space.yml`
|
| 52 |
+
|
| 53 |
+
Required GitHub repository secrets:
|
| 54 |
+
- `HF_TOKEN`: Hugging Face write token
|
| 55 |
+
- `HF_SPACE_REPO_ID`: in format `username/space-name`
|
| 56 |
+
|
| 57 |
+
After setting secrets, pushing to `main` will sync this repo to your Hugging Face Space `main` branch.
|
| 58 |
+
|
| 59 |
+
Student performance is fetched from:
|
| 60 |
+
- `STUDENT_PERFORMANCE_URL_TEMPLATE` (environment variable; its default value is shown below)
|
| 61 |
+
- `https://git-connect-backend-v2.vercel.app/api/student/{student_id}/performance`
|
| 62 |
+
|
| 63 |
+
## Sample syllabus request
|
| 64 |
+
|
| 65 |
+
```json
|
| 66 |
+
[
|
| 67 |
+
{
|
| 68 |
+
"course_code": "22CS501",
|
| 69 |
+
"name": "Database Management Systems",
|
| 70 |
+
"course_type": "theory",
|
| 71 |
+
"syllabus_url": "https://example.com/dbms.pdf",
|
| 72 |
+
"semester": 5
|
| 73 |
+
}
|
| 74 |
+
]
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
## Sample chat request
|
| 78 |
+
|
| 79 |
+
```json
|
| 80 |
+
{
|
| 81 |
+
"query": "How can I improve attendance this semester?",
|
| 82 |
+
"history": [
|
| 83 |
+
{"role": "user", "content": "Hi"},
|
| 84 |
+
{"role": "assistant", "content": "Hello"}
|
| 85 |
+
],
|
| 86 |
+
"student_id": 123,
|
| 87 |
+
"lang_code": "en",
|
| 88 |
+
"semester": 5
|
| 89 |
+
}
|
| 90 |
+
```
|
app/__init__.py
ADDED
|
File without changes
|
app/config.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
from dotenv import load_dotenv
|
| 5 |
+
|
| 6 |
+
# Load both potential env file locations.
|
| 7 |
+
_APP_DIR = Path(__file__).resolve().parent
|
| 8 |
+
_ROOT_ENV = _APP_DIR.parent / ".env"
|
| 9 |
+
_APP_ENV = _APP_DIR / ".env"
|
| 10 |
+
|
| 11 |
+
load_dotenv(dotenv_path=_ROOT_ENV, override=False)
|
| 12 |
+
load_dotenv(dotenv_path=_APP_ENV, override=True)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class Settings:
    """Service configuration, read from environment variables once at import time.

    Each attribute falls back to the default given here when its variable is
    unset. The numeric settings are converted eagerly, so a malformed value
    raises ``ValueError`` while this class body is being executed.
    """

    # --- Gemini (summaries and chat) ---
    gemini_api_key: str = os.environ.get("GEMINI_API_KEY", "")
    gemini_model: str = os.environ.get("GEMINI_MODEL", "gemini-2.5-flash")

    # --- Local sentence-embedding model ---
    embedding_model_name: str = os.environ.get(
        "EMBEDDING_MODEL_NAME", "sentence-transformers/all-MiniLM-L6-v2"
    )

    # --- Student performance API; the template must contain "{student_id}" ---
    student_performance_url_template: str = os.environ.get(
        "STUDENT_PERFORMANCE_URL_TEMPLATE",
        "https://git-connect-backend-v2.vercel.app/api/student/{student_id}/performance",
    )

    # --- On-disk storage locations ---
    vector_data_dir: str = os.environ.get("VECTOR_DATA_DIR", "data/vector_index")
    raw_text_dir: str = os.environ.get("RAW_TEXT_DIR", "data/raw_text")

    # --- PDF download behaviour ---
    pdf_timeout_sec: int = int(os.environ.get("PDF_TIMEOUT_SEC", "60"))
    pdf_max_retries: int = int(os.environ.get("PDF_MAX_RETRIES", "3"))
    pdf_retry_backoff_sec: float = float(os.environ.get("PDF_RETRY_BACKOFF_SEC", "1.5"))
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
settings = Settings()
|
app/main.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import List
|
| 3 |
+
|
| 4 |
+
from fastapi import FastAPI, HTTPException
|
| 5 |
+
|
| 6 |
+
from app.config import settings
|
| 7 |
+
from app.models import (
|
| 8 |
+
ChatRequest,
|
| 9 |
+
ChatResponse,
|
| 10 |
+
CourseInput,
|
| 11 |
+
CourseProcessError,
|
| 12 |
+
CourseSummary,
|
| 13 |
+
SyllabusProcessResponse,
|
| 14 |
+
)
|
| 15 |
+
from app.services.gemini_service import GeminiService
|
| 16 |
+
from app.services.pdf_service import chunk_text, fetch_pdf_text
|
| 17 |
+
from app.services.rag_service import build_student_documents, mmr_select
|
| 18 |
+
from app.services.student_service import fetch_student_info
|
| 19 |
+
from app.vector_store import LocalVectorStore
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
app = FastAPI(title="GitConnect Chatbot Service", version="0.1.0")
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# Liveness probe. Documented with a comment rather than a docstring because
# FastAPI publishes endpoint docstrings in the generated OpenAPI schema.
@app.get("/health")
def health() -> dict:
    payload = {"status": "ok"}
    return payload
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _raw_text_filename(course_code: str) -> str:
    """Return a ``.txt`` file name for *course_code* that stays inside one directory.

    ``course_code`` comes straight from the request body, so path separators
    and any other character outside ``[alnum-_.]`` are replaced with ``_``.
    Distinct codes that differ only in replaced characters map to the same file.
    """
    safe = "".join(ch if ch.isalnum() or ch in "-_." else "_" for ch in course_code)
    return f"{safe or 'course'}.txt"


# Process a batch of course syllabi: download each PDF, store its raw text,
# index its chunks in the vector store and generate a multilingual summary.
#
# Failures are isolated per course: a course that raises at any step is
# reported in ``failed`` and the remaining courses are still processed.
# Returns HTTP 500 only when the Gemini service cannot be constructed.
#
# NOTE: documented with comments rather than a docstring because FastAPI
# publishes endpoint docstrings in the generated OpenAPI schema.
@app.post("/api/syllabus/process", response_model=SyllabusProcessResponse)
def process_syllabus(courses: List[CourseInput]) -> SyllabusProcessResponse:
    if not courses:
        return SyllabusProcessResponse(
            results=[],
            failed=[],
            total_received=0,
            total_processed=0,
            total_failed=0,
        )

    try:
        gemini = GeminiService(
            settings.gemini_api_key,
            settings.gemini_model,
            settings.embedding_model_name,
        )
    except ValueError as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc

    vector_store = LocalVectorStore(settings.vector_data_dir)
    os.makedirs(settings.raw_text_dir, exist_ok=True)

    results: List[CourseSummary] = []
    failed: List[CourseProcessError] = []

    for course in courses:
        try:
            syllabus_text = fetch_pdf_text(
                str(course.syllabus_url),
                timeout=settings.pdf_timeout_sec,
                max_retries=settings.pdf_max_retries,
                backoff_sec=settings.pdf_retry_backoff_sec,
            )
            if not syllabus_text:
                raise RuntimeError("No text extracted from PDF.")

            # Sanitise the client-supplied code so it cannot escape raw_text_dir.
            raw_path = os.path.join(
                settings.raw_text_dir, _raw_text_filename(course.course_code)
            )
            with open(raw_path, "w", encoding="utf-8") as f:
                f.write(syllabus_text)

            chunks = chunk_text(syllabus_text)
            if not chunks:
                raise RuntimeError("Unable to create text chunks from syllabus content.")

            embeddings = [
                gemini.embed_text(chunk, task_type="retrieval_document")
                for chunk in chunks
            ]
            vector_store.upsert_documents(course.semester, course.course_code, chunks, embeddings)

            ai_summary = gemini.summarize_multilingual(course.name, syllabus_text)
            results.append(CourseSummary(course_code=course.course_code, ai_summary=ai_summary))
        except Exception as exc:
            # Deliberate per-course boundary: record the failure and keep going.
            failed.append(CourseProcessError(course_code=course.course_code, error=str(exc)))

    return SyllabusProcessResponse(
        results=results,
        failed=failed,
        total_received=len(courses),
        total_processed=len(results),
        total_failed=len(failed),
    )
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
# Answer a chat query using the student's performance data plus
# semester-scoped syllabus chunks selected with MMR.
#
# Error mapping:
#   500 - Gemini service cannot be constructed, retrieval fails, or the LLM call fails
#   502 - the student performance API cannot be fetched
#
# NOTE: documented with comments rather than a docstring because FastAPI
# publishes endpoint docstrings in the generated OpenAPI schema.
@app.post("/api/chat", response_model=ChatResponse)
def chat(req: ChatRequest) -> ChatResponse:
    try:
        gemini = GeminiService(
            settings.gemini_api_key,
            settings.gemini_model,
            settings.embedding_model_name,
        )
    except ValueError as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc

    vector_store = LocalVectorStore(settings.vector_data_dir)

    try:
        student_info = fetch_student_info(
            settings.student_performance_url_template,
            req.student_id,
        )
    except Exception as exc:
        raise HTTPException(status_code=502, detail=f"Student info fetch failed: {exc}") from exc

    try:
        query_embedding = gemini.embed_text(req.query, task_type="retrieval_query")

        # Over-fetch syllabus candidates; MMR narrows the combined pool below.
        syllabus_hits = vector_store.search(req.semester, query_embedding, top_k=20)
        for hit in syllabus_hits:
            hit["source"] = "syllabus"

        # Student facts are embedded per request so they compete with syllabus
        # chunks in the same similarity space.
        student_docs = build_student_documents(student_info)
        for doc in student_docs:
            doc["embedding"] = gemini.embed_text(
                doc["chunk"],
                task_type="retrieval_document",
            )

        hits = mmr_select(
            query_embedding=query_embedding,
            candidates=syllabus_hits + student_docs,
            top_k=8,
            lambda_param=0.7,
        )
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"RAG retrieval failed: {exc}") from exc

    rag_chunks = [f"[{h.get('source', 'unknown')}] {h['chunk']}" for h in hits]

    # Normalise codes to stripped strings before de-duplicating and sorting, so
    # mixed or non-string values cannot break the sort or the List[str] response.
    normalised_codes = (str(h.get("course_code", "")).strip() for h in hits)
    retrieved_course_codes = sorted({code for code in normalised_codes if code})

    try:
        reply = gemini.chat_with_context(
            query=req.query,
            lang_code=req.lang_code,
            history=[msg.model_dump() for msg in req.history],
            student_info=student_info,
            rag_chunks=rag_chunks,
        )
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"LLM response failed: {exc}") from exc

    return ChatResponse(
        reply_markdown=reply,
        retrieved_course_codes=retrieved_course_codes,
        student_info=student_info,
    )
|
app/models.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, List, Literal, Optional
|
| 2 |
+
from pydantic import BaseModel, Field, HttpUrl
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
LangCode = Literal["en", "hn", "mr", "kn"]
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class CourseInput(BaseModel):
    # One course submitted for syllabus processing.
    # All text fields are required and must be non-empty.
    course_code: str = Field(min_length=1)
    name: str = Field(min_length=1)
    course_type: str = Field(min_length=1)
    # Validated as an http(s) URL.
    syllabus_url: HttpUrl
    # Semester number, 1 through 12 inclusive.
    semester: int = Field(ge=1, le=12)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class CourseSummary(BaseModel):
    # Result for one successfully processed course.
    course_code: str
    # Summary text keyed by language code.
    ai_summary: dict[LangCode, str]
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class CourseProcessError(BaseModel):
    # Failure record for one course that could not be processed.
    course_code: str
    # Error message describing why processing failed.
    error: str
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class SyllabusProcessResponse(BaseModel):
    # Batch outcome of the syllabus processing endpoint.
    results: list[CourseSummary]
    # Defaults to an empty list when omitted.
    failed: list[CourseProcessError] = Field(default_factory=list)
    # Counters reported alongside the two lists.
    total_received: int
    total_processed: int
    total_failed: int
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class ChatMessage(BaseModel):
    # A single prior turn of the conversation.
    # The role is restricted to the three literals below.
    role: Literal["user", "assistant", "system"]
    content: str
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class ChatRequest(BaseModel):
    # Request body of the chat endpoint.
    query: str
    # Optional prior turns; at most five entries are accepted.
    history: list[ChatMessage] = Field(default_factory=list, max_length=5)
    # Positive integer identifier of the student.
    student_id: int = Field(ge=1)
    lang_code: LangCode
    # Semester number, 1 through 12 inclusive.
    semester: int = Field(ge=1, le=12)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class ChatResponse(BaseModel):
    # Response body of the chat endpoint.
    reply_markdown: str
    retrieved_course_codes: list[str]
    # Optional; defaults to None when not provided.
    student_info: Optional[dict] = None
|
app/services/__init__.py
ADDED
|
File without changes
|
app/services/gemini_service.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from typing import Dict, List
|
| 3 |
+
|
| 4 |
+
import google.generativeai as genai
|
| 5 |
+
from sentence_transformers import SentenceTransformer
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class GeminiService:
    """Gemini text generation plus local sentence-transformer embeddings.

    Gemini produces the multilingual summaries and chat replies. Embeddings are
    computed locally with a ``SentenceTransformer`` model, which is cached per
    model name on the class so it is loaded once per process.
    """

    # Annotations are quoted so they are not evaluated when the class is created.
    _embedding_model_cache: "dict[str, SentenceTransformer]" = {}

    def __init__(self, api_key: str, model_name: str, embedding_model_name: str) -> None:
        """Configure the Gemini client and load (or reuse) the embedding model.

        Raises:
            ValueError: if ``api_key`` is empty.
        """
        if not api_key:
            raise ValueError("GEMINI_API_KEY is not set.")
        genai.configure(api_key=api_key)
        self._model = genai.GenerativeModel(model_name)
        self._embedding_model = self._get_embedding_model(embedding_model_name)

    def embed_text(self, text: str, task_type: str) -> List[float]:
        """Return the (un-normalised) embedding of ``text`` as a list of floats.

        ``task_type`` is accepted for API compatibility only; the local
        embedding model does not use it.
        """
        _ = task_type
        emb = self._embedding_model.encode(text, normalize_embeddings=False)
        return emb.tolist()

    def summarize_multilingual(self, course_name: str, syllabus_text: str) -> Dict[str, str]:
        """Ask Gemini for en/mr/kn/hn summaries of a syllabus.

        Only the first 12000 characters of ``syllabus_text`` are sent.

        Raises:
            ValueError: if the model response contains no parseable JSON object.
        """
        prompt = f"""
You are an academic assistant.
Summarize the course syllabus content for course: {course_name}.

Return STRICT JSON with this exact schema and keys only:
{{
"en": "English summary",
"mr": "Marathi summary",
"kn": "Kannada summary",
"hn": "Hindi summary"
}}

Rules:
- Keep each summary clear for students and parents.
- 80-140 words per language.
- Use only the syllabus context below.

Syllabus context:
{syllabus_text[:12000]}
"""
        raw = self._model.generate_content(prompt).text
        return self._safe_parse_summary_json(raw)

    def chat_with_context(
        self,
        query: str,
        lang_code: str,
        history: List[dict],
        student_info: dict,
        rag_chunks: List[str],
    ) -> str:
        """Generate a markdown reply grounded in student data and retrieved chunks.

        At most the first eight ``rag_chunks`` are included in the prompt.
        """
        history_text = "\n".join(
            [f"{msg.get('role', 'user')}: {msg.get('content', '')}" for msg in history]
        )
        syllabus_context = "\n\n---\n\n".join(rag_chunks[:8])

        prompt = f"""
You are a helpful college assistant chatbot for students and parents.

Respond in language code: {lang_code}
Supported codes: en, hn, mr, kn.
Return the final answer in markdown.

Student data (attendance, result etc.):
{json.dumps(student_info, ensure_ascii=False)}

Recent chat history:
{history_text}

Relevant syllabus context:
{syllabus_context}

User query:
{query}

Answer guidelines:
- Be accurate and grounded in provided info.
- If data is missing, state what is missing.
- Keep response practical and concise.
- Use markdown with bullets or short headings when useful.
"""
        return self._model.generate_content(prompt).text

    def _safe_parse_summary_json(self, raw: str) -> Dict[str, str]:
        """Extract the summary object from a model response.

        The outermost ``{...}`` span is parsed, so Markdown code fences (with
        any language tag) and prose before or after the JSON are tolerated.
        Missing languages map to an empty string; values are coerced to ``str``.

        Raises:
            ValueError: if no JSON object is present or it cannot be decoded
                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        """
        text = raw.strip()
        start = text.find("{")
        end = text.rfind("}")
        if start == -1 or end < start:
            raise ValueError("Gemini summary response did not contain a JSON object.")

        parsed = json.loads(text[start : end + 1])
        return {lang: str(parsed.get(lang, "")) for lang in ("en", "mr", "kn", "hn")}

    @classmethod
    def _get_embedding_model(cls, embedding_model_name: str) -> "SentenceTransformer":
        """Return the cached embedding model for this name, loading it on first use."""
        if embedding_model_name not in cls._embedding_model_cache:
            cls._embedding_model_cache[embedding_model_name] = SentenceTransformer(
                embedding_model_name
            )
        return cls._embedding_model_cache[embedding_model_name]
|
app/services/pdf_service.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import io
|
| 2 |
+
import time
|
| 3 |
+
|
| 4 |
+
import requests
|
| 5 |
+
from pypdf import PdfReader
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def fetch_pdf_text(
    pdf_url: str,
    timeout: int = 60,
    max_retries: int = 3,
    backoff_sec: float = 1.5,
) -> str:
    """Download a PDF and return the text of its non-empty pages.

    Pages are joined with a blank line. A download or parse failure is retried
    with exponential backoff (``backoff_sec * 2 ** attempt``).

    Args:
        pdf_url: URL of the PDF to download.
        timeout: Timeout in seconds passed to ``requests.get``.
        max_retries: Total number of attempts; values below 1 are treated as 1
            so the download is always tried at least once.
        backoff_sec: Base delay in seconds between attempts.

    Returns:
        The extracted text, or an empty string when no page yields text.

    Raises:
        RuntimeError: when every attempt fails; the last error is chained as
            ``__cause__``.
    """
    attempts = max(1, max_retries)
    last_exc: Exception | None = None

    for attempt in range(attempts):
        try:
            response = requests.get(pdf_url, timeout=timeout)
            response.raise_for_status()
            reader = PdfReader(io.BytesIO(response.content))

            page_texts = (page.extract_text() or "" for page in reader.pages)
            extracted = [text for text in page_texts if text.strip()]
            return "\n\n".join(extracted).strip()
        except Exception as exc:
            # Best-effort retry boundary: any failure counts as one attempt.
            last_exc = exc
            if attempt < attempts - 1:
                time.sleep(backoff_sec * (2 ** attempt))

    raise RuntimeError(
        f"Failed to fetch PDF after {attempts} attempts: {last_exc}"
    ) from last_exc
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 150) -> list[str]:
    """Split ``text`` into overlapping fixed-width character chunks.

    Whitespace runs are collapsed to single spaces first. Consecutive chunks
    start ``chunk_size - overlap`` characters apart; when ``overlap`` is at
    least ``chunk_size`` the stride is clamped to 1 so the loop still advances.

    Args:
        text: Input text; blank input yields an empty list.
        chunk_size: Maximum characters per chunk; must be at least 1.
        overlap: Characters shared by neighbouring chunks; must not be negative.

    Returns:
        The chunks in document order. The last chunk may be shorter.

    Raises:
        ValueError: if ``chunk_size`` < 1 or ``overlap`` < 0. Such values
            previously produced empty chunks or silently skipped text.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1.")
    if overlap < 0:
        raise ValueError("overlap must not be negative.")

    if not text.strip():
        return []

    clean_text = " ".join(text.split())
    total = len(clean_text)
    step = max(chunk_size - overlap, 1)

    chunks: list[str] = []
    for start in range(0, total, step):
        chunks.append(clean_text[start : start + chunk_size])
        # Stop once a chunk reaches the end, so no trailing chunk lies
        # entirely inside the previous chunk's overlap.
        if start + chunk_size >= total:
            break

    return chunks
|
app/services/rag_service.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, List
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def build_student_documents(student_info: Dict) -> List[Dict]:
    """Turn a student performance payload into short retrievable text documents.

    Produces, in order: one profile overview, one document per attendance
    subject, and for each semester a summary followed by one document per
    subject result. Each document has ``source``, ``course_code`` and ``chunk``.

    Sections that are missing or JSON ``null`` (``attendance``, ``results``,
    ``subjects``, ``semesters``) are treated as empty rather than raising.

    Args:
        student_info: Mapping with optional ``attendance`` and ``results`` keys.

    Returns:
        The documents; always contains at least the profile overview.
    """
    # ``or {}`` / ``or []`` also covers keys that are present but null.
    attendance = student_info.get("attendance") or {}
    results = student_info.get("results") or {}

    overall_attendance = attendance.get("overall_pct")
    overall_status = attendance.get("overall_status")
    current_cgpa = results.get("current_cgpa")

    docs: List[Dict] = [
        {
            "source": "student_profile",
            "course_code": "",
            "chunk": (
                f"Student profile overview: current CGPA is {current_cgpa}. "
                f"Overall attendance is {overall_attendance}% with status {overall_status}."
            ),
        }
    ]

    docs.extend(
        {
            "source": "attendance",
            "course_code": str(subject.get("course_code", "")),
            "chunk": (
                f"Attendance for {subject.get('subject', '')} ({subject.get('course_code', '')}): "
                f"{subject.get('attended_classes', 0)}/{subject.get('total_classes', 0)} classes, "
                f"{subject.get('attendance_pct', 0)}%, status {subject.get('status', 'unknown')}."
            ),
        }
        for subject in attendance.get("subjects") or []
    )

    for sem in results.get("semesters") or []:
        sem_no = sem.get("semester")
        docs.append(
            {
                "source": "results_semester",
                "course_code": "",
                "chunk": (
                    f"Semester {sem_no} performance summary: SGPA {sem.get('sgpa', 'NA')}, "
                    f"CGPA {sem.get('cgpa', 'NA')}."
                ),
            }
        )
        docs.extend(
            {
                "source": "results_subject",
                "course_code": str(subject.get("course_code", "")),
                "chunk": (
                    f"Result for {subject.get('subject', '')} ({subject.get('course_code', '')}) "
                    f"in semester {sem_no}: total score {subject.get('total', 'NA')}"
                ),
            }
            for subject in sem.get("subjects") or []
        )

    return docs
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def mmr_select(
    query_embedding: List[float],
    candidates: List[Dict],
    top_k: int = 8,
    lambda_param: float = 0.7,
) -> List[Dict]:
    """Pick up to ``top_k`` candidates by Maximal Marginal Relevance.

    The candidate most similar to the query is taken first. Each later pick
    maximises ``lambda_param * sim(query, c) - (1 - lambda_param) *
    max(sim(c, s) for s already selected)``, using cosine similarity.

    Args:
        query_embedding: Query vector.
        candidates: Dicts carrying an ``"embedding"`` entry; candidates with a
            missing, empty or all-zero embedding are ignored.
        top_k: Maximum number of results; values <= 0 yield an empty list.
        lambda_param: Relevance/diversity trade-off (1.0 = relevance only).

    Returns:
        The selected candidate dicts in selection order. If the query vector
        is all zeros, the first ``top_k`` valid candidates are returned as-is.
    """
    if not candidates or top_k <= 0:
        return []

    vectors = []
    valid_candidates = []
    for cand in candidates:
        emb = cand.get("embedding")
        # Explicit None/length checks: truth-testing an array would raise.
        if emb is None or len(emb) == 0:
            continue
        vec = np.asarray(emb, dtype=np.float32)
        if np.linalg.norm(vec) == 0:
            continue
        vectors.append(vec)
        valid_candidates.append(cand)

    if not valid_candidates:
        return []

    q = np.asarray(query_embedding, dtype=np.float32)
    q_norm = np.linalg.norm(q)
    if q_norm == 0:
        return valid_candidates[:top_k]

    # Unit-normalise once so every cosine similarity is a plain dot product.
    vectors_np = np.stack(vectors)
    unit = vectors_np / np.linalg.norm(vectors_np, axis=1, keepdims=True)
    query_sims = unit @ (q / q_norm)

    first_idx = int(np.argmax(query_sims))
    selected_idx: List[int] = [first_idx]
    remaining = np.ones(len(valid_candidates), dtype=bool)
    remaining[first_idx] = False

    # redundancy[i] = highest similarity between candidate i and any selected item.
    redundancy = unit @ unit[first_idx]

    while remaining.any() and len(selected_idx) < top_k:
        scores = lambda_param * query_sims - (1.0 - lambda_param) * redundancy
        scores[~remaining] = -np.inf  # never re-pick an already selected item
        best_idx = int(np.argmax(scores))  # ties resolve to the lowest index

        selected_idx.append(best_idx)
        remaining[best_idx] = False
        redundancy = np.maximum(redundancy, unit @ unit[best_idx])

    return [valid_candidates[i] for i in selected_idx]
|
app/services/student_service.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def fetch_student_info(
    student_performance_url_template: str,
    student_id: int,
    timeout: int = 20,
) -> dict:
    """Fetch a student's performance payload over HTTP.

    Args:
        student_performance_url_template: URL containing the literal token
            ``{student_id}``, which is replaced by ``student_id``.
        student_id: Identifier substituted into the URL.
        timeout: Timeout forwarded to ``requests.get``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        ValueError: If the template does not contain ``{student_id}``.
        RuntimeError: If the response status is an error
            (``Response.ok`` is false).
    """
    placeholder = "{student_id}"
    if placeholder not in student_performance_url_template:
        raise ValueError(
            "STUDENT_PERFORMANCE_URL_TEMPLATE must include '{student_id}'."
        )

    # Substitute the token literally instead of using str.format(): a URL may
    # legitimately contain other braces (e.g. in a query string), which
    # str.format() would try to parse as fields and fail with
    # KeyError/IndexError.
    student_url = student_performance_url_template.replace(
        placeholder, str(student_id)
    )

    get_resp = requests.get(student_url, timeout=timeout)
    if not get_resp.ok:
        raise RuntimeError(
            f"Failed to fetch student info from {student_url}; "
            f"status {get_resp.status_code}."
        )

    return get_resp.json()
|
app/vector_store.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
from typing import List
|
| 4 |
+
|
| 5 |
+
import faiss
|
| 6 |
+
import numpy as np
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class LocalVectorStore:
    """File-backed vector store keeping one FAISS index per semester.

    Two files are kept under ``base_dir`` for every semester:

    * ``semester_<n>.meta.json`` - a JSON list of records, each holding
      ``course_code``, ``chunk`` and the raw ``embedding``;
    * ``semester_<n>.faiss`` - an inner-product index over the
      L2-normalised embeddings, added in record order, so index position
      ``i`` refers to ``records[i]``.
    """

    def __init__(self, base_dir: str) -> None:
        """Remember ``base_dir`` and create it if it does not exist."""
        self.base_dir = base_dir
        os.makedirs(self.base_dir, exist_ok=True)

    def upsert_documents(
        self,
        semester: int,
        course_code: str,
        chunks: List[str],
        embeddings: List[List[float]],
    ) -> None:
        """Replace all stored chunks of ``course_code`` in ``semester``.

        Existing records of the course are dropped, the new chunk/embedding
        pairs are appended, the metadata file is rewritten and the FAISS
        index is rebuilt from the full record list.

        Raises:
            ValueError: If ``chunks`` and ``embeddings`` differ in length.
        """
        # zip() would silently drop the surplus items; refuse instead so a
        # chunk is never stored without its vector (or vice versa).
        if len(chunks) != len(embeddings):
            raise ValueError(
                f"chunks ({len(chunks)}) and embeddings ({len(embeddings)}) "
                "must have the same length."
            )

        meta_path = self._semester_meta_path(semester)
        records = [
            r for r in self._load(meta_path)
            if r.get("course_code") != course_code
        ]
        records.extend(
            {
                "course_code": course_code,
                "chunk": chunk,
                "embedding": vector,
            }
            for chunk, vector in zip(chunks, embeddings)
        )

        self._save(meta_path, records)
        self._rebuild_faiss_index(semester, records)

    def search(self, semester: int, query_embedding: List[float], top_k: int = 6) -> List[dict]:
        """Return up to ``top_k`` stored chunks most similar to the query.

        Each hit is a dict with ``course_code`` and ``chunk``. An empty list
        is returned when the semester has no records, when ``top_k`` is not
        positive, or when no index can be built.
        """
        meta_path = self._semester_meta_path(semester)
        index_path = self._semester_index_path(semester)
        records = self._load(meta_path)
        if not records:
            return []

        k = min(top_k, len(records))
        if k <= 0:
            # Nothing requested; avoid handing a non-positive k to FAISS.
            return []

        if not os.path.exists(index_path):
            self._rebuild_faiss_index(semester, records)

        if not os.path.exists(index_path):
            return []

        index = faiss.read_index(index_path)
        if index.ntotal != len(records):
            # The index on disk does not match the metadata (e.g. an earlier
            # rebuild failed after the metadata was saved). Positions would
            # map to the wrong records, so rebuild before searching.
            self._rebuild_faiss_index(semester, records)
            if not os.path.exists(index_path):
                return []
            index = faiss.read_index(index_path)

        q = np.array(query_embedding, dtype=np.float32).reshape(1, -1)
        faiss.normalize_L2(q)

        _, indices = index.search(q, k)

        hits = []
        for idx in indices[0].tolist():
            # Skip FAISS's -1 filler and anything outside the record list.
            if not 0 <= idx < len(records):
                continue
            record = records[idx]
            hits.append(
                {
                    "course_code": record.get("course_code", ""),
                    "chunk": record.get("chunk", ""),
                }
            )

        return hits

    def _semester_meta_path(self, semester: int) -> str:
        """Return the path of the semester's JSON metadata file."""
        return os.path.join(self.base_dir, f"semester_{semester}.meta.json")

    def _semester_index_path(self, semester: int) -> str:
        """Return the path of the semester's FAISS index file."""
        return os.path.join(self.base_dir, f"semester_{semester}.faiss")

    def _rebuild_faiss_index(self, semester: int, records: List[dict]) -> None:
        """Rewrite the semester's FAISS index from ``records``.

        When there are no records, or their embeddings do not form a 2-D
        matrix, any existing index file is removed so that a stale index can
        never be searched against newer metadata.
        """
        index_path = self._semester_index_path(semester)

        vectors = None
        if records:
            vectors = np.array([r["embedding"] for r in records], dtype=np.float32)
            if vectors.ndim != 2 or vectors.shape[0] == 0:
                vectors = None

        if vectors is None:
            if os.path.exists(index_path):
                os.remove(index_path)
            return

        # Normalised vectors + inner product == cosine similarity.
        faiss.normalize_L2(vectors)
        dim = vectors.shape[1]
        index = faiss.IndexFlatIP(dim)
        index.add(vectors)
        faiss.write_index(index, index_path)

    @staticmethod
    def _load(path: str) -> List[dict]:
        """Return the JSON content of ``path``, or ``[]`` if it is missing."""
        if not os.path.exists(path):
            return []
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    @staticmethod
    def _save(path: str, data: List[dict]) -> None:
        """Write ``data`` as JSON to ``path``.

        The data is written to a sibling temporary file and then moved over
        ``path`` with ``os.replace``, so an interrupted write does not leave
        a truncated metadata file behind.
        """
        tmp_path = f"{path}.tmp"
        try:
            with open(tmp_path, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False)
            os.replace(tmp_path, path)
        finally:
            # Only present here if the write or the replace failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.115.0
|
| 2 |
+
uvicorn[standard]>=0.30.0
|
| 3 |
+
requests>=2.32.0
|
| 4 |
+
pydantic>=2.8.0
|
| 5 |
+
python-dotenv>=1.0.1
|
| 6 |
+
pypdf>=5.0.0
|
| 7 |
+
numpy>=2.0.0
|
| 8 |
+
google-generativeai>=0.7.2
|
| 9 |
+
psycopg[binary]>=3.2.0
|
| 10 |
+
sentence-transformers>=3.0.1
|
| 11 |
+
faiss-cpu>=1.8.0.post1
|