Fixed the preprocessing endpoint and updated the chat endpoint to return semester-specific results.
d0220ae | import json | |
| from typing import Dict, List | |
| import google.generativeai as genai | |
| from sentence_transformers import SentenceTransformer | |
class GeminiService:
    """Gemini-backed text generation with local sentence-transformers embeddings.

    Responsibilities:
      - Configure the Gemini client and expose generation helpers
        (summaries, markdown, grounded chat).
      - Provide RAG embeddings via a small local HuggingFace model; the loaded
        model is cached at class level so all instances (and any pre-loading
        done before instantiation) share one model per model name.
    """

    # Class-level cache: one loaded SentenceTransformer per embedding model name,
    # so repeated instantiation does not reload weights from disk.
    _embedding_model_cache: dict[str, SentenceTransformer] = {}

    def __init__(self, api_key: str, model_name: str, embedding_model_name: str) -> None:
        """Configure the Gemini SDK and load (or reuse) the local embedding model.

        Args:
            api_key: Gemini API key; must be non-empty.
            model_name: Gemini generative model identifier.
            embedding_model_name: sentence-transformers model name for embeddings.

        Raises:
            ValueError: if ``api_key`` is empty or None.
        """
        if not api_key:
            raise ValueError("GEMINI_API_KEY is not set.")
        genai.configure(api_key=api_key)
        self._model = genai.GenerativeModel(model_name)
        self._embedding_model = self._get_embedding_model(embedding_model_name)

    def embed_text(self, text: str, task_type: str) -> List[float]:
        """Return an embedding vector for ``text`` using the local HF model.

        ``task_type`` is accepted only for API compatibility with the Gemini
        embedding interface and is ignored.
        """
        _ = task_type
        emb = self._embedding_model.encode(text, normalize_embeddings=False)
        return emb.tolist()

    def summarize_multilingual(self, course_name: str, syllabus_text: str) -> Dict[str, str]:
        """Summarize a syllabus in English, Marathi, Kannada, and Hindi.

        The syllabus context is truncated to 12,000 characters to bound prompt
        size. Returns a dict keyed by language code (``en``/``mr``/``kn``/``hn``).

        Raises:
            json.JSONDecodeError: if the model response cannot be parsed as JSON.
        """
        prompt = f"""
You are an academic assistant.
Summarize the course syllabus content for course: {course_name}.
Return STRICT JSON with this exact schema and keys only:
{{
"en": "English summary",
"mr": "Marathi summary",
"kn": "Kannada summary",
"hn": "Hindi summary"
}}
Rules:
- Keep each summary clear for students and parents.
- 80-140 words per language.
- Use only the syllabus context below.
Syllabus context:
{syllabus_text[:12000]}
"""
        raw = self._model.generate_content(prompt).text
        return self._safe_parse_summary_json(raw)

    def generate_markdown(self, prompt: str) -> str:
        """Generate a free-form markdown response for ``prompt``."""
        return self._model.generate_content(prompt).text

    def chat_with_context(
        self,
        query: str,
        lang_code: str,
        history: List[dict],
        student_info: dict,
        rag_chunks: List[str],
        *,
        max_context_chunks: int = 8,
    ) -> str:
        """Answer a student/parent query grounded in RAG context and student data.

        Args:
            query: The user's question.
            lang_code: Response language code (en, hn, mr, kn).
            history: Prior messages as dicts with ``role`` and ``content`` keys;
                missing keys default to "user" / "".
            student_info: Attendance/result data serialized into the prompt.
            rag_chunks: Retrieved syllabus chunks, most relevant first.
            max_context_chunks: Upper bound on chunks included in the prompt
                (default 8, matching previous hard-coded behavior).

        Returns:
            The model's markdown answer.
        """
        history_text = "\n".join(
            [f"{msg.get('role', 'user')}: {msg.get('content', '')}" for msg in history]
        )
        syllabus_context = "\n\n---\n\n".join(rag_chunks[:max_context_chunks])
        prompt = f"""
You are a helpful college assistant chatbot for students and parents.
Respond in language code: {lang_code}
Supported codes: en, hn, mr, kn.
Return the final answer in markdown.
Grounding rules:
- Prioritize facts from "Relevant syllabus context" for syllabus/unit/module questions.
- If user asks for units/modules/topics of a course and context includes them, list them clearly.
- Do not say data is missing unless the relevant syllabus context truly does not contain it.
Relevant syllabus context:
{syllabus_context}
Student data (attendance, result etc.):
{json.dumps(student_info, ensure_ascii=False)}
Recent chat history:
{history_text}
User query:
{query}
Answer guidelines:
- Be accurate and grounded in provided info.
- If data is missing, state what is missing.
- Keep response practical and concise.
- Use markdown with bullets or short headings when useful.
"""
        return self._model.generate_content(prompt).text

    def _safe_parse_summary_json(self, raw: str) -> Dict[str, str]:
        """Parse the model's summary response, tolerating a ```json code fence.

        Always returns all four language keys; missing values become "".

        Raises:
            json.JSONDecodeError: if the (de-fenced) text is not valid JSON.
        """
        text = raw.strip()
        if text.startswith("```"):
            # Model often wraps JSON in a fenced block: strip backticks, then
            # the optional "json" language tag.
            text = text.strip("`")
            if text.startswith("json"):
                text = text[4:].strip()
        parsed = json.loads(text)
        return {
            "en": str(parsed.get("en", "")),
            "mr": str(parsed.get("mr", "")),
            "kn": str(parsed.get("kn", "")),
            "hn": str(parsed.get("hn", "")),
        }

    @classmethod
    def _get_embedding_model(cls, embedding_model_name: str) -> SentenceTransformer:
        """Return the cached SentenceTransformer for ``embedding_model_name``, loading it once.

        BUG FIX: this method takes ``cls`` and mutates the class-level cache but
        was missing @classmethod, so it only worked when invoked through an
        instance (``self`` happened to be passed as ``cls``).
        """
        if embedding_model_name not in cls._embedding_model_cache:
            cls._embedding_model_cache[embedding_model_name] = SentenceTransformer(
                embedding_model_name
            )
        return cls._embedding_model_cache[embedding_model_name]

    @classmethod
    def preload_embedding_model(cls, embedding_model_name: str) -> None:
        """Eagerly load and warm up the embedding model (e.g. at app startup).

        BUG FIX: missing @classmethod meant ``GeminiService.preload_embedding_model(name)``
        would bind ``name`` to ``cls`` and fail; it is now callable on the class
        as the ``cls`` parameter intends.
        """
        model = cls._get_embedding_model(embedding_model_name)
        # Warm up once so the first real query does not pay model initialization cost.
        model.encode("embedding warmup", normalize_embeddings=False)