chatbot-gitconnect / app /services /rag_service.py
quantumbit's picture
initial commit
fdb66ba
from typing import Dict, List
import numpy as np
def build_student_documents(student_info: Dict) -> List[Dict]:
    """Flatten a student record into a list of short text documents.

    Each returned document is a dict with three keys:

    * ``"source"`` -- one of ``"student_profile"``, ``"attendance"``,
      ``"results_semester"`` or ``"results_subject"``.
    * ``"course_code"`` -- the subject's course code as a string, or ``""``
      for documents that are not tied to a single subject.
    * ``"chunk"`` -- a one-sentence natural-language summary.

    Documents are emitted in this order: one profile overview, one entry per
    attendance subject, then for every semester a summary followed by one
    entry per subject in that semester.

    Args:
        student_info: Mapping read via the ``"attendance"`` and ``"results"``
            keys. Missing keys *and* keys explicitly set to ``None`` are both
            treated as empty, so partially populated records do not raise.

    Returns:
        The list of documents; it always contains at least the profile
        overview.
    """
    # `.get(key, default)` only applies the default when the key is absent.
    # `or` also covers keys that are present but null.
    info = student_info or {}
    attendance = info.get("attendance") or {}
    results = info.get("results") or {}

    overall_attendance = attendance.get("overall_pct")
    overall_status = attendance.get("overall_status")
    current_cgpa = results.get("current_cgpa")

    docs: List[Dict] = [
        {
            "source": "student_profile",
            "course_code": "",
            "chunk": (
                f"Student profile overview: current CGPA is {current_cgpa}. "
                f"Overall attendance is {overall_attendance}% with status {overall_status}."
            ),
        }
    ]

    for subject in attendance.get("subjects") or []:
        docs.append(
            {
                "source": "attendance",
                "course_code": str(subject.get("course_code", "")),
                "chunk": (
                    f"Attendance for {subject.get('subject', '')} ({subject.get('course_code', '')}): "
                    f"{subject.get('attended_classes', 0)}/{subject.get('total_classes', 0)} classes, "
                    f"{subject.get('attendance_pct', 0)}%, status {subject.get('status', 'unknown')}."
                ),
            }
        )

    for sem in results.get("semesters") or []:
        sem_no = sem.get("semester")
        docs.append(
            {
                "source": "results_semester",
                "course_code": "",
                "chunk": (
                    f"Semester {sem_no} performance summary: SGPA {sem.get('sgpa', 'NA')}, "
                    f"CGPA {sem.get('cgpa', 'NA')}."
                ),
            }
        )
        for subject in sem.get("subjects") or []:
            docs.append(
                {
                    "source": "results_subject",
                    "course_code": str(subject.get("course_code", "")),
                    "chunk": (
                        f"Result for {subject.get('subject', '')} ({subject.get('course_code', '')}) "
                        f"in semester {sem_no}: total score {subject.get('total', 'NA')}"
                    ),
                }
            )

    return docs
def mmr_select(
    query_embedding: List[float],
    candidates: List[Dict],
    top_k: int = 8,
    lambda_param: float = 0.7,
) -> List[Dict]:
    """Select up to ``top_k`` candidates by Maximal Marginal Relevance (MMR).

    The first pick is the candidate with the highest cosine similarity to the
    query. Every later pick maximises::

        lambda_param * cos(candidate, query)
            - (1 - lambda_param) * max(cos(candidate, s) for s in selected)

    so a higher ``lambda_param`` favours relevance and a lower one favours
    diversity. Ties go to the candidate that appears earliest in
    ``candidates``.

    Args:
        query_embedding: Query vector.
        candidates: Dicts read via their ``"embedding"`` key. Candidates whose
            embedding is missing, empty, all-zero, or has a non-finite norm
            (NaN/inf) are ignored.
        top_k: Maximum number of candidates to return. Values ``<= 0`` yield
            an empty list.
        lambda_param: Relevance/diversity trade-off weight.

    Returns:
        The selected candidate dicts (the same objects that were passed in),
        in selection order. If the query vector has a zero or non-finite norm
        no similarity can be computed, so the first ``top_k`` usable
        candidates are returned in their input order.

    Raises:
        ValueError: Propagated from NumPy when the usable embeddings do not
            all share one length, or that length differs from the query's.
    """
    # Guard first: the initial pick below is unconditional, so without this
    # a request for zero results would still return one.
    if top_k <= 0 or not candidates:
        return []

    vectors = []
    valid_candidates = []
    for candidate in candidates:
        embedding = candidate.get("embedding")
        if not embedding:
            continue
        vec = np.array(embedding, dtype=np.float32)
        norm = np.linalg.norm(vec)
        # A zero norm has no direction; a NaN/inf norm would poison every
        # similarity computed against this vector.
        if norm == 0 or not np.isfinite(norm):
            continue
        vectors.append(vec)
        valid_candidates.append(candidate)
    if not valid_candidates:
        return []

    q = np.array(query_embedding, dtype=np.float32)
    q_norm = np.linalg.norm(q)
    if q_norm == 0 or not np.isfinite(q_norm):
        return valid_candidates[:top_k]

    # Normalise once so every cosine similarity below is a plain dot product.
    vectors_np = np.stack(vectors)
    vec_norms = np.linalg.norm(vectors_np, axis=1)
    unit_vectors = vectors_np / vec_norms[:, np.newaxis]
    query_sims = unit_vectors @ (q / q_norm)

    first_idx = int(np.argmax(query_sims))
    selected_idx: List[int] = [first_idx]
    available = np.ones(len(valid_candidates), dtype=bool)
    available[first_idx] = False
    # redundancy[i] = highest similarity between candidate i and anything
    # selected so far. Updating it per pick avoids recomputing all pairs.
    redundancy = unit_vectors @ unit_vectors[first_idx]

    while len(selected_idx) < top_k and available.any():
        mmr_scores = lambda_param * query_sims - (1.0 - lambda_param) * redundancy
        # Mask already-selected items; argmax returns the first maximum, which
        # preserves the "earliest candidate wins ties" rule.
        best_idx = int(np.argmax(np.where(available, mmr_scores, -np.inf)))
        selected_idx.append(best_idx)
        available[best_idx] = False
        redundancy = np.maximum(redundancy, unit_vectors @ unit_vectors[best_idx])

    return [valid_candidates[i] for i in selected_idx]