# chatbot-gitconnect / app/services/gemini_service.py
# Last change (commit d0220ae, quantumbit): preprocessing endpoint fixed and
# chat endpoint updated to semester-specific results.
import json
from typing import Dict, List
import google.generativeai as genai
from sentence_transformers import SentenceTransformer
class GeminiService:
    """Gemini-backed text generation with local sentence-transformer embeddings.

    Text generation (multilingual course summaries, markdown, grounded chat)
    is delegated to the Gemini API. Embeddings for RAG are computed locally
    with a SentenceTransformer that is cached per model name so every service
    instance in the process shares one loaded model.
    """

    # Process-wide cache: one loaded SentenceTransformer per model name.
    _embedding_model_cache: "dict[str, SentenceTransformer]" = {}

    def __init__(self, api_key: str, model_name: str, embedding_model_name: str) -> None:
        """Configure the Gemini client and load (or reuse) the embedding model.

        Args:
            api_key: Gemini API key; must be non-empty.
            model_name: Gemini generative model identifier.
            embedding_model_name: Hugging Face sentence-transformer model name.

        Raises:
            ValueError: if ``api_key`` is falsy.
        """
        if not api_key:
            raise ValueError("GEMINI_API_KEY is not set.")
        genai.configure(api_key=api_key)
        self._model = genai.GenerativeModel(model_name)
        self._embedding_model = self._get_embedding_model(embedding_model_name)

    def embed_text(self, text: str, task_type: str) -> list[float]:
        """Return the local HF embedding of ``text`` as a plain list of floats.

        ``task_type`` is accepted only for signature compatibility with the
        Gemini embedding API; the local model ignores it.
        """
        _ = task_type
        emb = self._embedding_model.encode(text, normalize_embeddings=False)
        return emb.tolist()

    def summarize_multilingual(self, course_name: str, syllabus_text: str) -> dict[str, str]:
        """Summarize a syllabus in English, Marathi, Kannada and Hindi.

        The syllabus context is truncated to 12k characters to stay within
        prompt limits. Returns a dict keyed by ``en``/``mr``/``kn``/``hn``.

        Raises:
            json.JSONDecodeError: if the model reply contains no parseable JSON.
        """
        prompt = f"""
You are an academic assistant.
Summarize the course syllabus content for course: {course_name}.
Return STRICT JSON with this exact schema and keys only:
{{
"en": "English summary",
"mr": "Marathi summary",
"kn": "Kannada summary",
"hn": "Hindi summary"
}}
Rules:
- Keep each summary clear for students and parents.
- 80-140 words per language.
- Use only the syllabus context below.
Syllabus context:
{syllabus_text[:12000]}
"""
        raw = self._model.generate_content(prompt).text
        return self._safe_parse_summary_json(raw)

    def generate_markdown(self, prompt: str) -> str:
        """Run a raw prompt through Gemini and return the markdown text reply."""
        return self._model.generate_content(prompt).text

    def chat_with_context(
        self,
        query: str,
        lang_code: str,
        history: list[dict],
        student_info: dict,
        rag_chunks: list[str],
    ) -> str:
        """Answer a student/parent query grounded in RAG chunks and student data.

        Args:
            query: The user's question.
            lang_code: Target response language (``en``/``hn``/``mr``/``kn``).
            history: Prior chat turns as dicts with ``role``/``content`` keys.
            student_info: Structured student data (attendance, results, ...).
            rag_chunks: Retrieved syllabus snippets; only the first 8 are used.

        Returns:
            The model's markdown-formatted answer.
        """
        history_text = "\n".join(
            [f"{msg.get('role', 'user')}: {msg.get('content', '')}" for msg in history]
        )
        # Cap the grounding context at 8 chunks to keep the prompt bounded.
        syllabus_context = "\n\n---\n\n".join(rag_chunks[:8])
        prompt = f"""
You are a helpful college assistant chatbot for students and parents.
Respond in language code: {lang_code}
Supported codes: en, hn, mr, kn.
Return the final answer in markdown.
Grounding rules:
- Prioritize facts from "Relevant syllabus context" for syllabus/unit/module questions.
- If user asks for units/modules/topics of a course and context includes them, list them clearly.
- Do not say data is missing unless the relevant syllabus context truly does not contain it.
Relevant syllabus context:
{syllabus_context}
Student data (attendance, result etc.):
{json.dumps(student_info, ensure_ascii=False)}
Recent chat history:
{history_text}
User query:
{query}
Answer guidelines:
- Be accurate and grounded in provided info.
- If data is missing, state what is missing.
- Keep response practical and concise.
- Use markdown with bullets or short headings when useful.
"""
        return self._model.generate_content(prompt).text

    def _safe_parse_summary_json(self, raw: str) -> dict[str, str]:
        """Parse the model's reply into the four-language summary dict.

        Tolerates markdown code fences and stray prose around the JSON object
        by stripping a leading/trailing fence and then extracting the
        outermost ``{...}`` span before parsing. Missing keys default to "".

        Raises:
            json.JSONDecodeError: if no valid JSON object can be parsed.
        """
        text = raw.strip()
        # Strip a markdown fence like ```json ... ``` if present.
        if text.startswith("```"):
            text = text.removeprefix("```").removesuffix("```").strip()
            if text.lower().startswith("json"):
                text = text[4:].strip()
        # Fall back to the outermost brace span so prose before/after the
        # JSON (common in LLM replies) does not break json.loads.
        start, end = text.find("{"), text.rfind("}")
        if start != -1 and end > start:
            text = text[start : end + 1]
        parsed = json.loads(text)
        return {
            "en": str(parsed.get("en", "")),
            "mr": str(parsed.get("mr", "")),
            "kn": str(parsed.get("kn", "")),
            "hn": str(parsed.get("hn", "")),
        }

    @classmethod
    def _get_embedding_model(cls, embedding_model_name: str) -> "SentenceTransformer":
        """Return the cached SentenceTransformer for this name, loading it once."""
        if embedding_model_name not in cls._embedding_model_cache:
            cls._embedding_model_cache[embedding_model_name] = SentenceTransformer(
                embedding_model_name
            )
        return cls._embedding_model_cache[embedding_model_name]

    @classmethod
    def preload_embedding_model(cls, embedding_model_name: str) -> None:
        """Eagerly load and warm up the embedding model (e.g. at app startup)."""
        model = cls._get_embedding_model(embedding_model_name)
        # Warm up once so the first real query does not pay model
        # initialization cost.
        model.encode("embedding warmup", normalize_embeddings=False)