# VEXA_ChatBot_V2 / sahayak_utils.py
# NOTE(review): the following lines are Hugging Face Hub page residue that was
# pasted into the file ("Chaitanya895's picture", "Update sahayak_utils.py",
# commit "6c0aee9 verified"); kept as comments so the module stays importable.
import os
# Cache locations must be set BEFORE importing transformers so the library
# reads them at import time (/data is the writable volume on HF Spaces).
os.environ["TRANSFORMERS_CACHE"] = "/data/models"
os.environ["HF_HOME"] = "/data/models"
os.environ["HF_HUB_CACHE"] = "/data/models"
import logging
import nltk
import numpy as np
import faiss
import re
import json
import time
import torch
from functools import lru_cache
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, AutoModel, pipeline
import difflib
import tempfile
import io
# Optional dependency: gTTS for text-to-speech; absence is tolerated and
# recorded in a flag checked by text_to_speech_file().
try:
    from gtts import gTTS
    _gtts_available = True
except Exception:
    _gtts_available = False
import hashlib
# Try whisper first (optional, may not be installed), else fallback to SpeechRecognition
try:
    import whisper
    _whisper_available = True
except Exception:
    _whisper_available = False
try:
    import speech_recognition as sr
    _sr_available = True
except Exception:
    _sr_available = False
# Optional dependency: pydub converts arbitrary audio to WAV before transcription.
try:
    from pydub import AudioSegment
    _pydub_available = True
except Exception:
    _pydub_available = False
# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(name)s: %(message)s'
)
logger = logging.getLogger(__name__)
# Debug: Log environment variables
logger.info(f"TRANSFORMERS_CACHE: {os.environ.get('TRANSFORMERS_CACHE')}")
logger.info(f"HF_HOME: {os.environ.get('HF_HOME')}")
logger.info(f"HF_HUB_CACHE: {os.environ.get('HF_HUB_CACHE')}")
# Import libraries with error handling
# PDF backend selection: prefer PyMuPDF (fitz), fall back to PyPDF2,
# otherwise record "none" and let load_pdf_text() report the problem.
try:
    import fitz  # PyMuPDF
    PDF_BACKEND = "pymupdf"
except ImportError:
    logger.warning("PyMuPDF (fitz) not installed. Trying PyPDF2 as fallback.")
    try:
        import PyPDF2
        PDF_BACKEND = "pypdf2"
    except ImportError:
        logger.error("No PDF processing library available.")
        PDF_BACKEND = "none"
# Translation setup with graceful fallback
try:
    from deep_translator import GoogleTranslator
    from deep_translator.exceptions import LanguageNotSupportedException

    class DeepTranslatorWrapper:
        """Wrapper exposing a googletrans-like detect()/translate() API."""

        def detect(self, text):
            """Heuristically detect language from Unicode script ranges.

            Only the first 100 characters are inspected. Returns an object
            with a `.lang` attribute (ISO 639-1 code).
            """
            class LangResult:
                def __init__(self, lang):
                    self.lang = lang

            sample = text[:100]
            # Ordered script checks; the first matching range wins.
            # NOTE: Marathi also uses Devanagari, so it cannot be told apart
            # from Hindi by script alone — such text is reported as "hi".
            # (The previous version had an unreachable duplicate Devanagari
            # check for "mr" after the "hi" check; it has been removed.)
            script_checks = [
                (('\u0900', '\u097F'), "hi"),   # Devanagari (Hindi/Marathi)
                (('\u0980', '\u09FF'), "bn"),   # Bengali
                (('\u0B80', '\u0BFF'), "ta"),   # Tamil
                (('\u0C00', '\u0C7F'), "te"),   # Telugu
                (('\u0A80', '\u0AFF'), "gu"),   # Gujarati
                (('\u4E00', '\u9FFF'), "zh"),   # CJK
                (('\u0600', '\u06FF'), "ar"),   # Arabic
            ]
            for (low, high), code in script_checks:
                if any(low <= c <= high for c in sample):
                    return LangResult(code)
            # Accented Latin characters -> assume Spanish (best effort).
            if any(c in 'áéíóúñüÁÉÍÓÚÑÜ' for c in sample):
                return LangResult("es")
            return LangResult("en")

        def translate(self, text, src=None, dest=None):
            """Translate `text` from `src` to `dest` via deep-translator.

            Returns an object with a `.text` attribute; on any failure the
            original text is returned unchanged.
            """
            class TranslationResult:
                def __init__(self, translated_text):
                    self.text = translated_text

            try:
                if src == dest or dest is None:
                    return TranslationResult(text)
                # Map language codes; 'auto' lets the backend detect source.
                src_lang = src if src and src != 'auto' else 'auto'
                dest_lang = dest if dest else 'en'
                translated = GoogleTranslator(source=src_lang, target=dest_lang).translate(text)
                return TranslationResult(translated if translated else text)
            except Exception as e:
                logger.warning(f"Translation failed: {e}")
                return TranslationResult(text)

    translator = DeepTranslatorWrapper()
    logger.info("deep-translator initialized successfully")
except Exception as e:
    logger.warning(f"deep-translator not available ({e}); using simple no-op translator")

    class SimpleTranslator:
        """No-op fallback: everything is reported as English, untranslated."""

        def detect(self, text):
            class LangResult:
                def __init__(self):
                    self.lang = "en"
            return LangResult()

        def translate(self, text, src=None, dest=None):
            class TranslationResult:
                def __init__(self, text):
                    self.text = text
            return TranslationResult(text)

    translator = SimpleTranslator()
# Ensure NLTK punkt is downloaded to a writable path
# (/tmp is writable even when the app directory is read-only).
nltk.data.path.append('/tmp/nltk_data')
try:
    # quiet=True avoids progress spam in the logs; failure is non-fatal
    # because split_into_chunks() only needs punkt for long paragraphs.
    nltk.download('punkt', download_dir='/tmp/nltk_data', quiet=True)
except Exception as e:
    logger.warning(f"Failed to download NLTK punkt: {e}. Text chunking may be affected.")
# Models configuration
# MODEL_DIR: persistent cache location for all downloaded models and caches.
# EMBEDDING_MODEL_NAME: sentence-embedding model id (overridable via env).
MODEL_DIR = os.environ.get("MODEL_DIR", "/data/models")
EMBEDDING_MODEL_NAME = os.environ.get("EMBEDDING_MODEL", "sentence-transformers/all-mpnet-base-v2")
os.makedirs(MODEL_DIR, exist_ok=True)
# Setup transformers-based embedding model
# Loaded eagerly at import time; failure here is fatal (re-raised) because
# every retrieval path depends on these two objects.
logger.info(f"Loading embedding model from: {EMBEDDING_MODEL_NAME}")
try:
    # TRANSFORMERS_OFFLINE=1 forces loading from the local cache only.
    tokenizer = AutoTokenizer.from_pretrained(
        EMBEDDING_MODEL_NAME,
        cache_dir=MODEL_DIR,
        local_files_only=os.environ.get("TRANSFORMERS_OFFLINE", "0") == "1"
    )
    model = AutoModel.from_pretrained(
        EMBEDDING_MODEL_NAME,
        cache_dir=MODEL_DIR,
        local_files_only=os.environ.get("TRANSFORMERS_OFFLINE", "0") == "1"
    )
except Exception as e:
    logger.error(f"Failed to load embedding model: {e}")
    raise
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over real (non-padding) positions.

    `model_output[0]` is the last hidden state (batch, seq, dim);
    `attention_mask` is (batch, seq) with 1 for real tokens, 0 for padding.
    Returns a (batch, dim) tensor.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    summed = torch.sum(hidden_states * mask, 1)
    # Clamp avoids division by zero for all-padding rows.
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / token_counts
def transformers_encode(texts, batch_size=8):
    """Encode text(s) into mean-pooled embeddings with the module-level
    `tokenizer` and `model`.

    Args:
        texts: a string or non-empty list/tuple of strings.
        batch_size: texts encoded per forward pass. Previously accepted but
            ignored (the whole list was encoded at once, risking OOM on
            large inputs); it is now honored.

    Returns:
        np.ndarray of shape (len(texts), hidden_dim). On invalid input or
        model failure, random (*, 768) vectors are returned so the pipeline
        degrades instead of crashing (matching the original contract).
    """
    if isinstance(texts, str):
        texts = [texts]
    if not isinstance(texts, (list, tuple)) or not texts:
        logger.error(f"Invalid input to transformers_encode: {texts}")
        return np.random.randn(1, 768)
    if not all(isinstance(t, str) for t in texts):
        logger.error(f"Non-string elements in texts: {texts}")
        texts = [str(t) for t in texts]
    try:
        batch_outputs = []
        # Process in batches to bound peak memory usage.
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]
            encoded_input = tokenizer(chunk, padding=True, truncation=True, return_tensors='pt')
            with torch.no_grad():
                model_output = model(**encoded_input)
            pooled = mean_pooling(model_output, encoded_input['attention_mask'])
            batch_outputs.append(pooled.numpy())
        return np.vstack(batch_outputs)
    except Exception as e:
        logger.error(f"Error in transformers_encode: {str(e)}")
        return np.random.randn(len(texts), 768)
# Model caching configuration
# Marker file recording whether heavyweight models were already initialized.
CACHE_FILE = os.path.join(MODEL_DIR, "model_cache_status.json")
def check_model_cache_status():
    """Return True when the marker file says models were already initialized.

    Creates MODEL_DIR on first run; unreadable/corrupt marker files are
    logged and treated as "not cached".
    """
    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR, exist_ok=True)
        return False
    if os.path.exists(CACHE_FILE):
        try:
            with open(CACHE_FILE, 'r') as f:
                status = json.load(f)
        except Exception as e:
            logger.warning(f"Error reading cache file: {e}")
        else:
            if status.get('initialized', False):
                logger.info("Using cached models")
                return True
    return False
def mark_models_as_cached():
    """Write the marker file recording successful model initialization."""
    payload = {'initialized': True, 'timestamp': time.time()}
    try:
        with open(CACHE_FILE, 'w') as f:
            json.dump(payload, f)
    except Exception as e:
        # Non-fatal: next startup simply re-initializes.
        logger.warning(f"Error writing cache file: {e}")
    else:
        logger.info("Models marked as cached")
def initialize_models():
    """Warm up heavyweight models once, recording success in the cache marker."""
    already_cached = check_model_cache_status()
    if already_cached:
        logger.info("Models already cached, skipping initialization")
        return
    logger.info("Preloading QA model...")
    get_qa_model()
    mark_models_as_cached()
@lru_cache(maxsize=1)
def get_embedder():
    """Return a singleton object exposing `.encode(texts, batch_size=8)`.

    Normally wraps `transformers_encode`; if anything goes wrong, a fallback
    producing random 768-dim vectors is returned so callers never crash.
    """
    local_dir = os.path.join(MODEL_DIR, "embedding_model")
    try:
        class Embedder:
            def encode(self, texts, batch_size=8):
                return transformers_encode(texts, batch_size)

        # The two branches only differ in what gets logged.
        if os.path.exists(local_dir):
            logger.info(f"Loading embedding model from: {local_dir}")
        else:
            logger.info(f"Using transformers-based embedding model: {EMBEDDING_MODEL_NAME}")
        return Embedder()
    except Exception as e:
        logger.error(f"Error loading embedding model: {str(e)}. Using random embeddings as fallback.")

        class SimpleEmbedder:
            def encode(self, texts, batch_size=8):
                if isinstance(texts, str):
                    return np.random.randn(768)
                return np.random.randn(len(texts), 768)
        return SimpleEmbedder()
@lru_cache(maxsize=1)
def get_qa_model():
    """Return a cached question-answering pipeline.

    Tries distilbert-base-uncased-distilled-squad first, then
    deepset/roberta-base-squad2; if both fail, returns a stub callable with
    the same (question, context) signature so the chatbot stays responsive.
    """
    try:
        logger.info(f"Model directory contents: {os.listdir(MODEL_DIR)}")
    except Exception as e:
        logger.error(f"Failed to list model directory: {e}")

    def _load(model_name):
        # local_files_only=True: models must already be in MODEL_DIR.
        return pipeline(
            "question-answering",
            model=model_name,
            tokenizer=model_name,
            local_files_only=True,
            cache_dir=MODEL_DIR
        )

    try:
        logger.info("Loading QA model: distilbert-base-uncased-distilled-squad")
        qa_model = _load("distilbert-base-uncased-distilled-squad")
        logger.info("Successfully loaded distilbert-base-uncased-distilled-squad")
        return qa_model
    except Exception as e:
        logger.warning(f"Failed to load distilbert-base-uncased-distilled-squad: {e}. Falling back to roberta-base-squad2.")
    try:
        logger.info("Loading fallback QA model: deepset/roberta-base-squad2")
        qa_model = _load("deepset/roberta-base-squad2")
        logger.info("Successfully loaded deepset/roberta-base-squad2")
        return qa_model
    except Exception as e:
        logger.error(f"Error loading fallback QA model: {e}")

        def simple_qa(question, context):
            # Stub answer keeps the caller's dict-shaped contract intact.
            return {
                "answer": "I'm sorry, the QA model couldn't be loaded. Please try again later.",
                "score": 0.0
            }
        return simple_qa
def load_pdf_text(pdf_path):
    """Extract whitespace-normalized text from a PDF, with JSON caching.

    Results are cached in MODEL_DIR keyed by the PDF's basename. Returns a
    human-readable error string (never raises) on failure, matching the
    original best-effort contract.

    Fixes: page text extraction can return None (image-only pages in
    PyPDF2/PyMuPDF), which previously made "\\n".join(...) raise TypeError
    and abort the whole extraction; both backends now coerce None to "".
    """
    if not os.path.exists(pdf_path):
        logger.error(f"PDF file not found: {pdf_path}")
        return "PDF file not found. Please check the file path."
    cache_path = os.path.join(MODEL_DIR, f"{os.path.basename(pdf_path)}.cache.json")
    if os.path.exists(cache_path):
        try:
            logger.info(f"Loading PDF content from cache: {cache_path}")
            with open(cache_path, 'r', encoding='utf-8') as f:
                cache_data = json.load(f)
            return cache_data.get("text", "")
        except Exception as e:
            logger.warning(f"Error loading cache: {str(e)}")
    try:
        if PDF_BACKEND == "pymupdf":
            logger.info(f"Loading PDF with PyMuPDF: {pdf_path}")
            doc = fitz.open(pdf_path)
            try:
                raw_text = "\n".join((page.get_text() or "") for page in doc)
            finally:
                # Ensure the document handle is released even on error.
                doc.close()
        elif PDF_BACKEND == "pypdf2":
            logger.info(f"Loading PDF with PyPDF2: {pdf_path}")
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                raw_text = "\n".join((page.extract_text() or "") for page in reader.pages)
        else:
            logger.error("No PDF backend available")
            return "No PDF processing library is installed."
        # Collapse all whitespace runs into single spaces.
        clean_text = " ".join(raw_text.split())
        if not clean_text:
            logger.warning(f"Extracted empty text from PDF: {pdf_path}")
            return "No readable text found in the PDF."
        try:
            with open(cache_path, 'w', encoding='utf-8') as f:
                json.dump({"text": clean_text, "timestamp": time.time()}, f)
        except Exception as e:
            # Cache write failure is non-fatal; text is still returned.
            logger.warning(f"Could not write PDF cache: {str(e)}")
        return clean_text
    except Exception as e:
        logger.error(f"Error loading PDF: {str(e)}")
        return f"Error loading PDF: {str(e)}"
def load_passages_from_path(path_setting):
    """Load and chunk PDF content from all available sources.
    Sources checked (in order):
    1. Root PDF: Sahayak_Organisation_Expanded_Info.pdf
    2. Directory: data/pdfs (all PDFs inside)
    3. Custom path from path_setting

    Returns a list of passage strings; if no source yields anything, a
    single hard-coded fallback passage is returned so retrieval still has
    something to index.
    """
    all_passages = []
    sources_loaded = []  # Human-readable "<file> (<n> passages)" entries for logging.
    # Always try to load root PDF first
    root_pdf = "Sahayak_Organisation_Expanded_Info.pdf"
    if os.path.isfile(root_pdf):
        text = load_pdf_text(root_pdf)
        chunks = split_into_chunks(text)
        all_passages.extend(chunks)
        sources_loaded.append(f"{root_pdf} ({len(chunks)} passages)")
        logger.info(f"Loaded {len(chunks)} passages from root PDF: {root_pdf}")
    # Load from data/pdfs directory
    pdf_dir = "data/pdfs"
    if os.path.isdir(pdf_dir):
        # sorted() gives deterministic passage ordering across runs.
        pdf_files = [
            os.path.join(pdf_dir, f) for f in sorted(os.listdir(pdf_dir))
            if f.lower().endswith(".pdf")
        ]
        for pdf_file in pdf_files:
            # Skip if same as root PDF
            if os.path.basename(pdf_file) == root_pdf:
                continue
            text = load_pdf_text(pdf_file)
            chunks = split_into_chunks(text)
            all_passages.extend(chunks)
            sources_loaded.append(f"{os.path.basename(pdf_file)} ({len(chunks)} passages)")
        logger.info(f"Loaded PDFs from directory: {pdf_dir}")
    # Also check custom path if different from defaults
    target = path_setting or ""
    if target and target not in [root_pdf, pdf_dir]:
        if os.path.isdir(target):
            pdf_files = [
                os.path.join(target, f) for f in sorted(os.listdir(target))
                if f.lower().endswith(".pdf")
            ]
            for pdf_file in pdf_files:
                text = load_pdf_text(pdf_file)
                chunks = split_into_chunks(text)
                all_passages.extend(chunks)
                sources_loaded.append(f"{os.path.basename(pdf_file)} ({len(chunks)} passages)")
        elif os.path.isfile(target):
            text = load_pdf_text(target)
            chunks = split_into_chunks(text)
            all_passages.extend(chunks)
            sources_loaded.append(f"{target} ({len(chunks)} passages)")
    if all_passages:
        logger.info(f"Total knowledge base: {len(all_passages)} passages from {len(sources_loaded)} sources")
        logger.info(f"Sources: {sources_loaded}")
        return all_passages
    # No source produced anything: seed the index with one fallback passage.
    logger.error("No PDF sources found. Using fallback.")
    return ["Sahayak is a non-profit organization dedicated to providing support and community development."]
def split_into_chunks(text, max_length=200, min_length=50):
    """Split text into retrieval-sized chunks.

    Splits first on "Title:"-style section headings, then on blank-line
    paragraphs; paragraphs longer than `max_length` are re-packed from NLTK
    sentences. Paragraphs and trailing buffers shorter than `min_length`
    are dropped. Returns placeholder lists for invalid input or errors.
    """
    if not isinstance(text, str):
        logger.error(f"Invalid input to split_into_chunks: {text}")
        return ["Invalid input"]
    try:
        chunks = []
        for section in re.split(r'(?=\b[A-Z][a-zA-Z\s]+:)', text):
            section = section.strip()
            if not section:
                continue
            for paragraph in section.split('\n\n'):
                paragraph = paragraph.strip()
                # Empty or too-short paragraphs contribute nothing.
                if not paragraph or len(paragraph) < min_length:
                    continue
                if len(paragraph) <= max_length:
                    chunks.append(paragraph)
                    continue
                # Over-long paragraph: re-pack sentence by sentence.
                buffer = ""
                for sentence in nltk.sent_tokenize(paragraph):
                    if len(buffer) + len(sentence) <= max_length:
                        buffer += " " + sentence
                    else:
                        if len(buffer) >= min_length:
                            chunks.append(buffer.strip())
                        buffer = sentence
                if buffer and len(buffer) >= min_length:
                    chunks.append(buffer.strip())
        if not chunks:
            logger.warning("No chunks created from text")
            chunks = ["No content available"]
        return chunks
    except Exception as e:
        logger.error(f"Error splitting text into chunks: {str(e)}")
        return ["Error processing text content"]
def build_faiss_index(passages):
    """Build a flat-L2 FAISS index over passage embeddings.

    Returns (index, embeddings). On any failure a random-embedding index of
    dimension 768 is returned so downstream retrieval still runs.
    """
    try:
        logger.info(f"Building FAISS index with passages: {passages[:2]}... (total: {len(passages)})")
        vectors = get_embedder().encode(passages)
        index = faiss.IndexFlatL2(vectors.shape[1])
        index.add(np.array(vectors).astype('float32'))
        return index, vectors
    except Exception as e:
        logger.error(f"Error building FAISS index: {str(e)}")
        fallback_dim = 768
        fallback_vectors = np.random.randn(len(passages), fallback_dim).astype('float32')
        fallback_index = faiss.IndexFlatL2(fallback_dim)
        fallback_index.add(fallback_vectors)
        return fallback_index, fallback_vectors
def retrieve_relevant_passages(query, passages, vector_index, embeddings, top_k=10):
    """Return the top_k passages most semantically similar to `query`.

    Searches the FAISS index for up to 3*top_k candidates, converts L2
    distances into similarity scores, and returns passages best-first.
    Falls back to a random sample of up to 3 passages on error.
    """
    try:
        query_vec = get_embedder().encode([query])[0].reshape(1, -1).astype('float32')
        # Over-fetch candidates, then keep only the best top_k.
        candidate_count = min(top_k * 3, len(passages))
        distances, indices = vector_index.search(query_vec, candidate_count)
        scored = [
            # Smaller distance -> larger similarity.
            (1.0 / (1.0 + dist), passages[idx])
            for idx, dist in zip(indices[0], distances[0])
            if idx < len(passages)
        ]
        scored.sort(key=lambda item: item[0], reverse=True)
        return [passage for _, passage in scored[:top_k]]
    except Exception as e:
        logger.error(f"Error retrieving passages: {str(e)}")
        import random
        return random.sample(passages, min(3, len(passages)))
def speech_to_text_file(file_path):
    """Transcribe an audio file to text.

    Tries whisper if available, otherwise SpeechRecognition + Google Web
    Speech. Returns the transcribed text, or "" on failure.

    Fixes: previously the function fell off the end of the `try` (implicitly
    returning None, not the documented empty string) when neither backend
    was installed, and the temporary WAV created by the pydub conversion was
    never deleted.
    """
    tmp_wav_path = None
    try:
        if _whisper_available:
            model = whisper.load_model("small")
            result = model.transcribe(file_path)
            return result.get("text", "").strip()
        if _sr_available:
            recognizer = sr.Recognizer()
            wav_path = file_path
            # If pydub is available, convert to WAV for reliability —
            # sr.AudioFile only handles WAV/AIFF/FLAC dependably.
            if _pydub_available:
                try:
                    audio = AudioSegment.from_file(file_path)
                    wav_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
                    audio.export(wav_tmp.name, format="wav")
                    tmp_wav_path = wav_tmp.name
                    wav_path = tmp_wav_path
                except Exception:
                    wav_path = file_path
            with sr.AudioFile(wav_path) as source:
                audio_data = recognizer.record(source)
            try:
                # Let Google's recognizer auto-detect language where possible
                return recognizer.recognize_google(audio_data).strip()
            except Exception as e:
                logger.warning(f"SpeechRecognition transcription failed: {e}")
                return ""
        # No transcription backend installed: honor the "" contract.
        return ""
    except Exception as e:
        logger.error(f"speech_to_text_file error: {e}")
        return ""
    finally:
        # Remove the temporary WAV conversion (previously leaked).
        if tmp_wav_path:
            try:
                os.remove(tmp_wav_path)
            except OSError:
                pass
def text_to_speech_file(text, lang='en'):
    """Synthesize `text` to an MP3 via gTTS.

    Files live under MODEL_DIR/tts with a content-derived name, so an
    identical (text, lang) request reuses the existing file. Returns the
    MP3 *filename* (relative to that directory), or None on failure.
    """
    try:
        if not text:
            return None
        if not _gtts_available:
            logger.error("gTTS not available; cannot synthesize speech")
            return None
        # Ensure tts directory exists inside MODEL_DIR for stable storage.
        tts_dir = os.path.join(MODEL_DIR, "tts")
        os.makedirs(tts_dir, exist_ok=True)
        # Deterministic name: identical requests map to the same file.
        digest = hashlib.sha256((text + '||' + lang).encode('utf-8')).hexdigest()[:24]
        filename = f"tts-{digest}.mp3"
        full_path = os.path.join(tts_dir, filename)
        if not os.path.exists(full_path):
            gTTS(text=text, lang=lang).save(full_path)
        return filename
    except Exception as e:
        logger.error(f"text_to_speech_file error: {e}")
        return None
def detect_and_translate(query):
    """Detect the query language and translate it to English.

    Returns (english_query, detected_lang); on any error the original
    query is returned with lang "en".
    """
    try:
        detected_lang = translator.detect(query).lang
        if detected_lang == "en":
            return query, detected_lang
        english_query = translator.translate(query, src=detected_lang, dest="en").text
        return english_query, detected_lang
    except Exception as e:
        logger.warning(f"Translation error: {str(e)}. Using original query.")
        return query, "en"
def translate_text(text, target_lang):
    """Translate English `text` into `target_lang`.

    Empty text, a missing target, or an English target returns `text`
    unchanged; translation errors also fall back to the English text.
    """
    needs_translation = bool(text) and bool(target_lang) and target_lang != "en"
    if not needs_translation:
        return text
    try:
        result = translator.translate(text, src="en", dest=target_lang)
        return result.text
    except Exception as e:
        logger.warning(f"Answer translation failed: {e}. Returning English text.")
        return text
def moderate_query(query):
    """Lightweight safety filter for harmful or off-policy content.

    Returns {"status": "ok"} when the query passes, or a dict with
    status "blocked" and a user-facing refusal message otherwise.
    """
    blocked_terms = (
        "violence", "weapon", "harm myself", "self-harm", "suicide", "attack",
        "explosive", "bomb", "terror", "hate", "racist", "sex", "explicit",
        "drugs", "narcotic", "illegal"
    )
    lowered = query.lower()
    for term in blocked_terms:
        if term in lowered:
            return {
                "status": "blocked",
                "message": ("I'm here to provide supportive, lawful information. "
                            "I can't help with that topic. If you need wellbeing support, consider contacting a local helpline.")
            }
    return {"status": "ok"}
def is_in_scope(query):
    """Return True when the query belongs to Sahayak / community-service scope.

    Covers Sahayak topics, government welfare schemes, social services,
    education/health programs, disaster relief, and citizens' rights.
    Queries of five words or fewer are always allowed; longer queries must
    contain at least one scope keyword (substring match, case-insensitive).
    """
    scope_keywords = (
        # Sahayak specific
        "sahayak", "vexa", "ngo", "non profit", "nonprofit",
        # Community and social
        "community", "benefit", "volunteer", "social", "help", "support", "welfare",
        "charity", "donation", "outreach", "initiative", "campaign",
        # Education
        "education", "school", "scholarship", "student", "learning", "training",
        "skill", "literacy", "academic", "mentorship",
        # Health
        "health", "medical", "hospital", "treatment", "medicine", "healthcare",
        "disability", "disabled", "blind", "specially abled",
        # Government schemes
        "scheme", "yojana", "government", "subsidy", "pension", "ration",
        "aadhar", "aadhaar", "pan", "voter", "certificate", "document",
        # Services
        "service", "programme", "program", "project", "event", "activity",
        # Locations
        "belgaum", "bangalore", "karnataka", "india",
        # Disaster and emergency
        "disaster", "relief", "emergency", "flood", "earthquake",
        # Rights and legal
        "rights", "entitlement", "eligibility", "apply", "registration",
        # General queries that should be answered
        "who", "what", "when", "where", "how", "why", "tell", "explain",
        "founder", "team", "member", "leader", "president", "secretary",
        "mission", "vision", "goal", "objective", "aim", "purpose"
    )
    # Short queries are likely on-topic; let them through unconditionally.
    if len(query.split()) <= 5:
        return True
    lowered = query.lower()
    return any(keyword in lowered for keyword in scope_keywords)
def scope_redirect_message(lang):
    """Return the out-of-scope redirect message, localized to `lang`."""
    english_msg = (
        "I focus on Sahayak and community-benefit topics. "
        "Please ask about our programmes, volunteering, events, or social impact."
    )
    return translate_text(english_msg, lang)
# =============================================================================
# SEMANTIC UNDERSTANDING ENGINE - Expert-Level Context Analysis
# =============================================================================
# Intent templates for semantic matching (these will be encoded for similarity)
# Each intent maps to example phrasings; get_intent_embeddings() encodes them
# once, and semantic_intent_classification() matches incoming queries against
# those encodings by cosine similarity.
INTENT_TEMPLATES = {
    "about_organization": [
        "What is Sahayak?",
        "Tell me about Sahayak organization",
        "Describe Sahayak NGO",
        "What does Sahayak do?",
        "Explain Sahayak organization"
    ],
    "founder_leadership": [
        "Who founded Sahayak?",
        "Who is the founder of Sahayak?",
        "Who started Sahayak?",
        "Tell me about the founder",
        "Who is Verril Vaz?"
    ],
    "president_leadership": [
        "Who is the president?",
        "Who leads Sahayak?",
        "Who is the current president?",
        "Tell me about the leadership team"
    ],
    "team_members": [
        "Who are the team members?",
        "How many members does Sahayak have?",
        "Tell me about the team",
        "Who works at Sahayak?"
    ],
    "mission_vision": [
        "What is the mission of Sahayak?",
        "What is the vision?",
        "What are the goals?",
        "What are the objectives?",
        "What does Sahayak aim to achieve?",
        "What is the purpose of Sahayak?"
    ],
    "initiatives_programs": [
        "What programs does Sahayak run?",
        "What are Sahayak's initiatives?",
        "Tell me about the activities",
        "What projects has Sahayak done?",
        "What kind of work does Sahayak do?"
    ],
    "recent_events": [
        "What are the recent events?",
        "What activities have been conducted recently?",
        "What has Sahayak done recently?",
        "Tell me about recent visits"
    ],
    "location_branches": [
        "Where is Sahayak located?",
        "How many branches does Sahayak have?",
        "Where does Sahayak operate?",
        "What cities does Sahayak work in?"
    ],
    "join_volunteer": [
        "How can I join Sahayak?",
        "How to volunteer?",
        "How to become a member?",
        "How can I help?",
        "How to get involved?"
    ],
    "donate_support": [
        "How to donate?",
        "How can I support Sahayak?",
        "How to contribute?",
        "Where can I donate?"
    ],
    "contact_info": [
        "How to contact Sahayak?",
        "What is the phone number?",
        "What is the email?",
        "How to reach Sahayak?"
    ],
    "founding_date": [
        "When was Sahayak founded?",
        "When did Sahayak start?",
        "What is the founding date?",
        "How old is Sahayak?"
    ],
    # Conversational intents (not knowledge-base questions):
    "greeting": [
        "Hello",
        "Hi",
        "Hey",
        "Good morning",
        "Namaste"
    ],
    "thanks": [
        "Thank you",
        "Thanks",
        "Thanks a lot",
        "Thank you so much"
    ],
    "affirmative": [
        "Yes",
        "Sure",
        "Ok",
        "Okay",
        "Tell me more",
        "Continue"
    ]
}
# Cache for intent embeddings
# Populated lazily by get_intent_embeddings():
# intent name -> {"templates", "embeddings", "centroid"}.
_intent_embeddings_cache = {}
def get_intent_embeddings():
    """Compute (once) and return per-intent template embeddings.

    Fills the module-level cache mapping intent ->
    {templates, embeddings, centroid}. On embedding failure the
    (possibly empty) cache is returned as-is.
    """
    global _intent_embeddings_cache
    if _intent_embeddings_cache:
        return _intent_embeddings_cache
    try:
        embedder = get_embedder()
        for intent_name, template_list in INTENT_TEMPLATES.items():
            template_vectors = embedder.encode(template_list)
            _intent_embeddings_cache[intent_name] = {
                "templates": template_list,
                "embeddings": template_vectors,
                # Centroid = mean template vector, a whole-intent summary.
                "centroid": np.mean(template_vectors, axis=0)
            }
        logger.info(f"Intent embeddings computed for {len(_intent_embeddings_cache)} intents")
    except Exception as e:
        logger.error(f"Error computing intent embeddings: {e}")
    return _intent_embeddings_cache
def semantic_intent_classification(query):
    """Classify user intent via cosine similarity against intent templates.

    Returns (intent_name, best_similarity); ("general", 0.0) when no intent
    embeddings exist or on error.

    Note: the previous implementation also computed a per-intent centroid
    similarity that was never used anywhere; that dead per-query work has
    been removed. Ranking is by best individual template match, exactly as
    before.
    """
    try:
        embedder = get_embedder()
        query_embedding = embedder.encode([query])[0]
        intent_embeddings = get_intent_embeddings()
        if not intent_embeddings:
            return "general", 0.0
        # Hoisted: the query norm is invariant across all comparisons.
        query_norm = np.linalg.norm(query_embedding)
        best_intent = "general"
        best_score = 0.0
        best_template = ""
        for intent, data in intent_embeddings.items():
            for i, template_emb in enumerate(data["embeddings"]):
                template_similarity = np.dot(query_embedding, template_emb) / (
                    query_norm * np.linalg.norm(template_emb)
                )
                if template_similarity > best_score:
                    best_score = template_similarity
                    best_intent = intent
                    best_template = data["templates"][i]
        logger.info(f"Semantic intent: {best_intent} (score: {best_score:.3f}, matched: '{best_template}')")
        return best_intent, best_score
    except Exception as e:
        logger.error(f"Intent classification error: {e}")
        return "general", 0.0
def semantic_passage_ranking(query, passages, top_k=5):
    """Rank `passages` by cosine similarity to `query`.

    Returns up to top_k (passage, score) pairs, best first; on error the
    first top_k passages are returned with a neutral 0.5 score.
    """
    try:
        embedder = get_embedder()
        query_vec = embedder.encode([query])[0]
        passage_vecs = embedder.encode(passages)
        query_norm = np.linalg.norm(query_vec)
        scored = []
        for position, passage_vec in enumerate(passage_vecs):
            cosine = np.dot(query_vec, passage_vec) / (
                query_norm * np.linalg.norm(passage_vec)
            )
            scored.append((position, cosine, passages[position]))
        scored.sort(key=lambda entry: entry[1], reverse=True)
        return [(passage_text, score) for _, score, passage_text in scored[:top_k]]
    except Exception as e:
        logger.error(f"Passage ranking error: {e}")
        return [(p, 0.5) for p in passages[:top_k]]
def analyze_query_complexity(query):
    """Analyze a query and classify it as simple / moderate / complex.

    Returns a dict of boolean signals plus a numeric `score` and a `level`
    string ("simple" | "moderate" | "complex") used to pick a response
    strategy.
    """
    words = query.split()
    lowered = query.lower()
    complexity = {
        "word_count": len(words),
        # ASCII '?' and full-width '？'. The original listed the ASCII '?'
        # twice, making the second check dead; full-width was clearly meant.
        "is_question": any(query.strip().endswith(c) for c in ['?', '？']),
        "has_multiple_parts": any(c in query for c in [',', 'and', 'or', 'also']),
        "is_comparison": any(w in lowered for w in ['compare', 'difference', 'between', 'vs', 'versus']),
        "is_list_request": any(w in lowered for w in ['list', 'all', 'every', 'each', 'various']),
        "is_explanation": any(w in lowered for w in ['why', 'how', 'explain', 'describe', 'elaborate']),
        "is_specific": any(w in lowered for w in ['specific', 'exactly', 'particular', 'precise'])
    }
    # Weighted complexity score; comparisons count double.
    score = 0
    if complexity["word_count"] > 10:
        score += 1
    if complexity["has_multiple_parts"]:
        score += 1
    if complexity["is_comparison"]:
        score += 2
    if complexity["is_list_request"]:
        score += 1
    if complexity["is_explanation"]:
        score += 1
    complexity["score"] = score
    complexity["level"] = "simple" if score < 2 else ("moderate" if score < 4 else "complex")
    return complexity
# =============================================================================
# EXPERT REASONING ENGINE - Chain of Thought for Better Responses
# =============================================================================
def extract_key_entities(text):
    """Extract dates, years, quantities, places, and organization names
    from `text` using regex patterns tailored to the Sahayak knowledge base.

    Returns a dict of category -> list of matched strings.
    """
    month = (r'(?:January|February|March|April|May|June|July|August'
             r'|September|October|November|December)')
    patterns = {
        "dates": r'\b\d{1,2}(?:st|nd|rd|th)?\s+' + month + r'\s+\d{4}\b',
        "years": r'\b(?:19|20)\d{2}\b',
        "numbers": r'\b\d+(?:\.\d+)?\s*(?:members|people|students|children|rupees|rs|lakh|crore|percent|%)\b',
        "places": r'\b(?:Belgaum|Bangalore|Karnataka|India|Delhi|Mumbai)\b',
        "organizations": r'\b(?:Sahayak|Sparsh Foundation|Maheshwari School|Anand Yatri|Balika Adarsh Vidyalaya)\b'
    }
    # IGNORECASE matches the original behavior (it is a no-op for "years").
    return {
        category: re.findall(pattern, text, re.IGNORECASE)
        for category, pattern in patterns.items()
    }
def calculate_passage_relevance(passage, query):
    """Return the fraction of distinct query words that appear in the passage
    (0.0 when the query has no words)."""
    query_words = set(query.lower().split())
    if not query_words:
        return 0.0
    passage_words = set(passage.lower().split())
    shared = query_words & passage_words
    return len(shared) / len(query_words)
def expert_reasoning_chain(query, passages, qa_result):
    """
    Expert system reasoning chain that:
    1. Uses semantic intent classification (not keyword matching)
    2. Evaluates evidence from multiple passages with semantic similarity
    3. Synthesizes a well-reasoned response
    4. Provides confidence assessment based on multiple factors

    Returns a dict with "intent", "evidence", "confidence",
    "confidence_level", and "reasoning_steps" (a human-readable audit trail).
    """
    reasoning_steps = []
    # Step 1: Semantic Intent Analysis (using embeddings)
    intent, intent_confidence = semantic_intent_classification(query)
    # Map semantic intents to response intents — collapses the fine-grained
    # intents into response-strategy buckets; unknowns become "information".
    intent_mapping = {
        "about_organization": "definitional",
        "founder_leadership": "person",
        "president_leadership": "person",
        "team_members": "person",
        "mission_vision": "definitional",
        "initiatives_programs": "information",
        "recent_events": "temporal",
        "location_branches": "location",
        "join_volunteer": "procedural",
        "donate_support": "procedural",
        "contact_info": "information",
        "founding_date": "temporal",
        "greeting": "greeting",
        "thanks": "greeting",
        "affirmative": "confirmation",
        "general": "information"
    }
    response_intent = intent_mapping.get(intent, "information")
    reasoning_steps.append(f"Semantic Intent: {intent} -> {response_intent} (confidence: {intent_confidence:.2f})")
    # Step 2: Semantic Evidence Gathering
    evidence_pieces = []
    try:
        # Use semantic ranking for evidence (embedding cosine similarity).
        ranked = semantic_passage_ranking(query, passages[:10], top_k=5)
        for passage, score in ranked:
            if score > 0.25:  # Semantic similarity threshold
                entities = extract_key_entities(passage)
                evidence_pieces.append({
                    "passage": passage[:300],  # truncate to keep payload small
                    "semantic_score": score,
                    "entities": entities
                })
    except Exception as e:
        # Fallback to simple relevance (word-overlap ratio) if embedding
        # ranking fails for any reason.
        for i, passage in enumerate(passages[:5]):
            relevance = calculate_passage_relevance(passage, query)
            if relevance > 0.1:
                evidence_pieces.append({
                    "passage": passage[:300],
                    "semantic_score": relevance,
                    "entities": extract_key_entities(passage)
                })
    reasoning_steps.append(f"Semantic evidence pieces: {len(evidence_pieces)}")
    # Step 3: Multi-factor Confidence Assessment
    base_confidence = qa_result.get("score", 0.0)
    intent_boost = intent_confidence * 0.2  # Intent confidence contributes
    # Each evidence piece adds 0.1, capped at 0.3 total.
    evidence_boost = min(len(evidence_pieces) * 0.1, 0.3)
    # Calculate final confidence (clamped to 1.0)
    final_confidence = min(base_confidence + intent_boost + evidence_boost, 1.0)
    confidence_level = "low"
    if final_confidence > 0.7:
        confidence_level = "high"
    elif final_confidence > 0.4:
        confidence_level = "medium"
    reasoning_steps.append(f"Confidence: {confidence_level} ({final_confidence:.2f})")
    return {
        "intent": intent,
        "evidence": evidence_pieces,
        "confidence": final_confidence,
        "confidence_level": confidence_level,
        "reasoning_steps": reasoning_steps
    }
def clean_answer_fragment(answer, context_passages):
    """
    Expand a fragmentary QA-model answer into the full sentence it came from.

    The extractive QA model sometimes returns a partial phrase (e.g. "the
    founder"). When the fragment can be located inside one of the first
    three context passages, the whole containing sentence is returned
    instead; otherwise the answer is returned unchanged.
    Returns None for an empty/falsy answer.
    """
    if not answer:
        return None
    answer = answer.strip()

    # Phrases that signal the answer was clipped mid-sentence.
    fragment_prefixes = ("the ", "a ", "an ", "is ", "are ", "was ", "were ",
                         "has ", "have ", "it ", "its ", "their ", "this ",
                         "that ", "these ", "those ", "and ", "or ", "but ",
                         "with ", "for ", "to ", "in ", "on ", "at ")

    def _sentence_with(needle, text, fold=False):
        # First sentence of *text* containing *needle* (case-folded if asked).
        probe = needle.lower() if fold else needle
        for sentence in re.split(r'(?<=[.!?])\s+', text):
            target = sentence.lower() if fold else sentence
            if probe in target:
                return sentence.strip()
        return None

    # Lowercase start or a clipped-phrase prefix: recover the full sentence.
    if answer and (answer[0].islower() or answer.lower().startswith(fragment_prefixes)):
        for passage in context_passages[:3]:
            if answer in passage:
                recovered = _sentence_with(answer, passage)
                if recovered:
                    return recovered

    # Very short answer without terminal punctuation also looks clipped;
    # retry the lookup case-insensitively.
    if len(answer.split()) < 4 and not answer.endswith(('.', '!', '?')):
        for passage in context_passages[:3]:
            if answer.lower() in passage.lower():
                recovered = _sentence_with(answer, passage, fold=True)
                if recovered:
                    return recovered
    return answer
def _collect_intent_sentences(passages, passage_terms, sentence_test):
    """Collect stripped sentences from the first five passages.

    A passage is scanned only if its lowercased text mentions any of
    *passage_terms*; each of its sentences is kept when *sentence_test*
    returns True for it. Factored out of the per-intent branches below,
    which all followed this exact scan-and-filter shape.
    """
    collected = []
    for passage in passages[:5]:
        lowered = passage.lower()
        if any(term in lowered for term in passage_terms):
            for sent in re.split(r'(?<=[.!?])\s+', passage):
                if sentence_test(sent):
                    collected.append(sent.strip())
    return collected


def synthesize_answer_from_passages(query, passages, qa_answer, intent):
    """
    Synthesize a comprehensive, well-structured answer from passages.

    Picks sentences matching the classified *intent* (person, definitional,
    location, procedural); falls back to cleaning the raw QA answer, then
    to the first passage's opening sentences, then to *qa_answer* itself.

    Fixes vs. previous revision: removed the dead local `query_lower`
    (computed but never read) and deduplicated the four scanning branches
    into `_collect_intent_sentences`. Behavior is unchanged.
    """
    if intent == "person":
        # Names, roles, titles of team members.
        info = _collect_intent_sentences(
            passages,
            ["founder", "president", "leader", "team", "member", "secretary"],
            lambda s: any(t in s.lower() for t in
                          ["founder", "president", "verril", "vaz", "leader", "team"]),
        )
        if info:
            return " ".join(info[:3])
    elif intent == "definitional":
        # Definitions/descriptions of the organization itself.
        info = _collect_intent_sentences(
            passages,
            ["sahayak"],
            lambda s: "sahayak" in s.lower() and len(s) > 30,
        )
        if info:
            return " ".join(info[:3])
    elif intent == "location":
        # Place-related information.
        info = _collect_intent_sentences(
            passages,
            ["belgaum", "bangalore", "karnataka", "india", "location", "branch"],
            lambda s: any(p in s.lower() for p in
                          ["belgaum", "bangalore", "operates", "location", "branch"]),
        )
        if info:
            return " ".join(info[:3])
    elif intent == "procedural":
        # How-to / process information; any non-trivial sentence qualifies.
        info = _collect_intent_sentences(
            passages,
            ["how", "step", "process", "register", "join", "volunteer", "donate", "contact"],
            lambda s: len(s) > 20,
        )
        if info:
            return " ".join(info[:4])

    # Default: return cleaned QA answer or first relevant passage.
    cleaned = clean_answer_fragment(qa_answer, passages)
    if cleaned and len(cleaned) > 20:
        return cleaned
    # Fallback to first passage sentences.
    if passages:
        sentences = re.split(r'(?<=[.!?])\s+', passages[0])
        return " ".join(sentences[:3])
    return qa_answer
def build_expert_response(qa_answer, reasoning, query, passages):
    """
    Assemble the final, fully formatted expert response.

    Sections, in order: an intent-specific header, the main answer as a
    complete paragraph, up to five bulleted key details drawn from the
    reasoning evidence, optional contact information, an intent-aware
    follow-up prompt, and a standing disclaimer.
    """
    intent = reasoning["intent"]
    confidence = reasoning["confidence_level"]
    evidence = reasoning["evidence"]

    # Turn fragmentary QA output into a complete paragraph first.
    main_answer = synthesize_answer_from_passages(query, passages, qa_answer, intent)
    if main_answer:
        main_answer = clean_answer_fragment(main_answer, passages) or main_answer

    # Intent-specific section headers.
    intent_headers = {
        "person": "👤 **About the Person/Team**",
        "temporal": "📅 **Timeline & Dates**",
        "location": "📍 **Location Information**",
        "quantitative": "📊 **Facts & Figures**",
        "procedural": "📋 **How It Works**",
        "explanatory": "💡 **Explanation**",
        "definitional": "📖 **Overview**",
        "information": "ℹ️ **Information**"
    }
    parts = [f"{intent_headers.get(intent, intent_headers['information'])}\n\n"]

    # Main answer: capitalize the opening letter and close with punctuation.
    if main_answer and len(main_answer) > 10:
        text = main_answer.strip()
        if text and text[0].islower():
            text = text[0].upper() + text[1:]
        if not text.endswith(('.', '!', '?')):
            text += "."
        parts.append(f"{text}\n\n")
    else:
        parts.append(f"{qa_answer}\n\n")

    # Key details: up to five unique, sufficiently long sentences.
    if evidence and len(evidence) > 0:
        parts.append("**Key Details:**\n")
        seen = set()
        for item in evidence:
            for raw in re.split(r'(?<=[.!?])\s+', item["passage"]):
                raw = raw.strip()
                # NOTE: membership is checked on the raw sentence but the
                # normalized form is stored — mirrors the original logic.
                if len(raw) > 30 and raw not in seen:
                    if raw[0].islower():
                        raw = raw[0].upper() + raw[1:]
                    if not raw.endswith(('.', '!', '?')):
                        raw += "."
                    parts.append(f"- {raw}\n")
                    seen.add(raw)
                if len(seen) >= 5:
                    break
            if len(seen) >= 5:
                break
        parts.append("\n")

    # Contact info: only for procedural queries or action-oriented wording.
    wants_contact = intent == "procedural" or any(
        word in query.lower() for word in ["join", "contact", "call", "volunteer", "donate"])
    if wants_contact:
        for passage in passages[:5]:
            lowered = passage.lower()
            if "phone" in lowered or "contact" in lowered or "website" in lowered:
                phone = re.search(r'\+?\d{2,3}[-\s]?\d{3}[-\s]?\d{3}[-\s]?\d{4}', passage)
                site = re.search(r'https?://[\w\./]+', passage)
                if phone or site:
                    parts.append("**📞 Contact Information:**\n")
                    if phone:
                        parts.append(f"- Phone: {phone.group()}\n")
                    if site:
                        parts.append(f"- Website: {site.group()}\n")
                    parts.append("\n")
                # Only the first candidate passage is ever inspected.
                break

    # Intent-aware follow-up suggestion.
    follow_ups = {
        "person": "\n💬 *Would you like to know about specific team members, their roles, or how to contact them?*",
        "temporal": "\n💬 *Would you like to know about upcoming events, past milestones, or Sahayak's history?*",
        "location": "\n💬 *Would you like directions, branch details, or information about activities at a specific location?*",
        "quantitative": "\n💬 *Would you like more details about Sahayak's impact, achievements, or statistics?*",
        "procedural": "\n💬 *Need help with the next steps? I can provide more detailed guidance or walk you through the process.*",
        "explanatory": "\n💬 *Would you like me to explain any specific aspect in more detail, or clarify something further?*",
        "definitional": "\n💬 *Would you like to learn about specific programs, events, or initiatives?*",
        "information": "\n💬 *What else would you like to know about Sahayak Organization, its team, or its work?*"
    }
    parts.append(follow_ups.get(intent, follow_ups["information"]))

    # Closing rule and disclaimer.
    parts.append("\n---\n")
    parts.append("_This response is for community awareness only and not legal, medical, or financial advice. Program details may change; please verify with official Sahayak contacts or local authorities._")
    return "".join(parts)
def append_context_to_answer(answer, query, passages):
    """Run the expert reasoning chain over the retrieved passages and
    return a fully formatted response built from the raw QA answer.

    With no passages there is nothing to reason over, so the raw answer
    is returned untouched.
    """
    if not passages:
        return answer
    # No real model score is available at this point; seed with a neutral 0.5.
    seed_result = {"answer": answer, "score": 0.5}
    reasoning = expert_reasoning_chain(query, passages, seed_result)
    logger.info(f"Expert reasoning: {reasoning['reasoning_steps']}")
    return build_expert_response(answer, reasoning, query, passages)
# =============================================================================
# MAIN ANSWER FUNCTION
# =============================================================================
def answer_question(query, passages, vector_index, embeddings):
"""Answer a question with expert-level reasoning, retrieval, and detailed responses"""
try:
meta_responses = {
"hi": "Hello! I'm Vexa, your assistant for Sahayak Organization.\n\n"
"Sahayak is a non-profit dedicated to making a positive impact in the community by supporting underprivileged groups.\n"
"I can help you learn more about our mission, goals, leadership, and activities.\n"
"What would you like to know about Sahayak Organization today?",
"hello": "Hi there! I'm Vexa, here to assist you with information about Sahayak Organization.\n\n"
"We are a non-profit focused on addressing social issues and empowering communities through education and support.\n"
"I can provide details about our vision, projects, or team members.\n"
"How can I help you today?",
"hey": "Hey! I'm Vexa, your guide to Sahayak Organization.\n\n"
"Sahayak works to support those in need, focusing on education, inclusion, and community development.\n"
"I'm here to answer any questions you have about our work, leadership, or goals.\n"
"What would you like to explore?",
"explain me about sahayak": "Sahayak is a non-profit organization committed to providing support and assistance to those in need.\n\n"
"Our primary focus is on addressing various social issues, such as education, health, and community development.\n"
"We aim to make a lasting positive impact by empowering underprivileged communities through dedicated programs.\n"
"Additionally, Sahayak collaborates with local leaders and volunteers to ensure our initiatives are effective.\n"
"Would you like to know more about our specific projects or leadership team?",
"tell me about sahayak": "Sahayak Organization is a non-profit dedicated to helping those in need across various communities.\n\n"
"We focus on tackling social issues like lack of access to education, healthcare challenges, and economic disparities.\n"
"Our mission is to empower underprivileged groups by providing resources, support, and opportunities for growth.\n"
"Sahayak operates in multiple locations, including Belgaum and Bangalore, to maximize our impact.\n"
"If you'd like, I can share more about our goals or the team behind our efforts!",
"brief me about sahayak": "Sahayak is a non-profit organization devoted to supporting underprivileged communities.\n\n"
"Our work centers on addressing key social issues, such as education, health, and social inclusion.\n"
"We strive to create a positive impact by offering resources and programs that empower individuals and families.\n"
"With operations in places like Belgaum and Bangalore, we aim to reach as many people as possible.\n"
"Let me know if you'd like to dive deeper into our mission or activities!",
"goals of sahayak": "The goals of Sahayak Organization are centered on creating a better future for underprivileged communities.\n\n"
"We aim to provide support by addressing key challenges like access to education and healthcare.\n"
"Another goal is to raise awareness about social issues that affect marginalized groups.\n"
"Additionally, we promote initiatives that improve health and education outcomes for children and families.\n"
"Would you like to learn more about how we achieve these goals through our programs?",
"vision": "Sahayak Organisation envisions a society where education is accessible to every child, regardless of their socio-economic background.\n\n"
"We believe that education is a fundamental right that can transform lives and uplift entire communities.\n"
"Our vision drives us to work tirelessly to remove barriers and create opportunities for learning.\n"
"By doing so, we hope to build a more equitable and inclusive society for future generations.\n"
"Would you like to know more about our specific initiatives in education?",
"mission of sahayak": "Sahayak's mission is to empower underprivileged communities through education, inclusion, and dedicated care.\n\n"
"We strive to provide resources and support to those who need it most, ensuring they have access to opportunities.\n"
"Our focus is on creating sustainable change by addressing systemic issues like poverty and lack of education.\n"
"Through our programs, we aim to foster a sense of community and hope for a better future.\n"
"Let me know if you'd like more details about our mission-driven projects!",
"vision of sahayak": "Sahayak Organisation envisions a society where education is accessible to every child, regardless of their socio-economic background.\n\n"
"We believe that education is a fundamental right that can transform lives and uplift entire communities.\n"
"Our vision drives us to work tirelessly to remove barriers and create opportunities for learning.\n"
"By doing so, we hope to build a more equitable and inclusive society for future generations.\n"
"Would you like to know more about our specific initiatives in education?",
"who created you": "I was created by B Chaitanya Reddy, a dedicated developer passionate about using technology for social good.\n\n"
"Chaitanya built me to assist users in learning more about Sahayak Organization and its impactful work.\n"
"My purpose is to provide accurate and detailed information to help you understand Sahayak's mission and activities.\n"
"Thanks to Chaitanya's efforts, I can answer your questions and guide you through Sahayak's initiatives.\n"
"What else would you like to know about me or Sahayak?",
"who developed you": "I was developed by B Chaitanya Reddy, a skilled developer who wanted to support Sahayak Organization's mission.\n\n"
"Chaitanya designed me to be a helpful tool for anyone seeking information about Sahayak's work and goals.\n"
"My role is to provide detailed answers and insights into the organization's efforts to make a difference.\n"
"I'm here to assist you with any questions you have about Sahayak or its initiatives.\n"
"What would you like to explore next?",
"who is the founder of sahayak": "The founder of Sahayak Organisation is Verril Vaz.\n\n"
"Verril Vaz is a visionary leader who established Sahayak with the mission to uplift underprivileged communities through education and social support.\n"
"Under his leadership, Sahayak has grown to operate in multiple locations, including Belgaum and Bangalore.\n",
"founder of sahayak": "The founder of Sahayak Organisation is Verril Vaz.\n\n"
"Verril Vaz is a visionary leader who established Sahayak with the mission to uplift underprivileged communities through education and social support.\n"
"Under his leadership, Sahayak has grown to operate in multiple locations, including Belgaum and Bangalore.\n",
"who founded sahayak": "The founder of Sahayak Organisation is Verril Vaz.\n\n"
"Verril Vaz is a visionary leader who established Sahayak with the mission to uplift underprivileged communities through education and social support.\n"
"Under his leadership, Sahayak has grown to operate in multiple locations, including Belgaum and Bangalore.\n",
"who is verril vaz": "**👤 About Verril Vaz**\n\n"
"Verril Vaz is the **Founder and President** of Sahayak Organisation.\n\n"
"**Key Facts:**\n"
"• Visionary leader who believes in grassroots change\n"
"• Established Sahayak on May 21, 2024\n"
"• Leads a passionate team of 45 young members\n"
"• Focuses on education and community empowerment\n\n"
"**His Vision:**\n"
"Under his leadership, Sahayak has grown to operate in Belgaum and Bangalore, helping underprivileged communities through education and social support.\n\n"
"Would you like to know more about the leadership team or Sahayak's activities?",
"verril vaz": "**👤 About Verril Vaz**\n\n"
"Verril Vaz is the **Founder and President** of Sahayak Organisation.\n\n"
"**Key Facts:**\n"
"• Visionary leader who believes in grassroots change\n"
"• Established Sahayak on May 21, 2024\n"
"• Leads a passionate team of 45 young members\n\n"
"Under his leadership, Sahayak operates in Belgaum and Bangalore, helping communities through education and support.\n\n"
"Would you like to know more about the leadership team?",
"how does sahayak organization works": "Sahayak Organisation operates as a non-profit dedicated to uplifting underprivileged communities through education, social support, and community development.\nIt functions by Implementing Educational Initiatives:\nSahayak focuses heavily on education, providing academic mentorship and resources to ensure access to learning opportunities, particularly for children from marginalized backgrounds.\nProviding Social and Emotional Support: The organization acts as a 'helper' (as its name suggests in Hindi), offering emotional and social assistance to those in need, fostering inclusion and empowerment.\nOperating in Key Locations: Sahayak runs its programs primarily in Belgaum and Bangalore, collaborating with local special schools and healthcare providers to maximize impact.\nLeveraging a Youthful Team: Powered by a team of 45 core members, Sahayak is led by young, passionate leaders like founder Verril Vaz.\nThis team brings empathy, innovation, and administrative skills to execute initiatives efficiently.\nHolistic Approach: Sahayak bridges the gap between potential and opportunity by addressing systemic issues like poverty and lack of education, ensuring sustainable change through targeted programs.",
"what are the recent activities conducted by the sahayak": "1)Sahayak had a great visit to Anand Yatri Old Age Home on August 3 2024, where all our members had the best experience and tears of happiness in our eyes. Making all the members happy of Anand Yatri was our main motive.They cheris to remember us each and everyday.\n"
"2)We visited the Maheshwari School for the Blind on October 25 2024 with a great motive to make all the students realize that being blind they can do immense miracles on this world. A heartfelt thanks to the Principal of Maheshwari School for providing us this wonderful opportunity.\n"
"3)Team Sahayak visited Sparsh Foundation on december 19 2024 which is a school for all the specially disabled children and provides Education for completely free. Our team had a very great experience meeting the students and spending time with them.\n"
"4) Team Sahayak visited Balika Adarsh Vidyalaya on 4th february 2025 and conducted various sessions such Technical Awareness, Social Media Awareness, Career Guidance and Nutrition which were the best topics decided by our team. The sessions were really helpful to all the students and everyone enjoyed it to the fullest.\n",
"what are the initiatives took by sahayak": "1)Sahayak had a great visit to Anand Yatri Old Age Home on August 3 2024, where all our members had the best experience and tears of happiness in our eyes. Making all the members happy of Anand Yatri was our main motive.They cheris to remember us each and everyday.\n"
"2)We visited the Maheshwari School for the Blind on October 25 2024 with a great motive to make all the students realize that being blind they can do immense miracles on this world. A heartfelt thanks to the Principal of Maheshwari School for providing us this wonderful opportunity.\n"
"3)Team Sahayak visited Sparsh Foundation on december 19 2024 which is a school for all the specially disabled children and provides Education for completely free. Our team had a very great experience meeting the students and spending time with them.\n"
"4) Team Sahayak visited Balika Adarsh Vidyalaya on 4th february 2025 and conducted various sessions such Technical Awareness, Social Media Awareness, Career Guidance and Nutrition which were the best topics decided by our team. The sessions were really helpful to all the students and everyone enjoyed it to the fullest.\n",
"what are the recent events conducted by the sahayak": "1)Sahayak had a great visit to Anand Yatri Old Age Home on August 3 2024, where all our members had the best experience and tears of happiness in our eyes. Making all the members happy of Anand Yatri was our main motive.They cheris to remember us each and everyday.\n"
"2)We visited the Maheshwari School for the Blind on October 25 2024 with a great motive to make all the students realize that being blind they can do immense miracles on this world. A heartfelt thanks to the Principal of Maheshwari School for providing us this wonderful opportunity.\n"
"3)Team Sahayak visited Sparsh Foundation on december 19 2024 which is a school for all the specially disabled children and provides Education for completely free. Our team had a very great experience meeting the students and spending time with them.\n"
"4) Team Sahayak visited Balika Adarsh Vidyalaya on 4th february 2025 and conducted various sessions such Technical Awareness, Social Media Awareness, Career Guidance and Nutrition which were the best topics decided by our team. The sessions were really helpful to all the students and everyone enjoyed it to the fullest.\n",
"when was the sahayak started": "Sahayak Organisation was started on 21 May 2024.\n\n"
"Since its inception, Sahayak has been dedicated to uplifting underprivileged communities through education and social support.\n"
"Would you like to know more about our founding story or current initiatives?",
# Handle common affirmative follow-ups
"yes": "Great! Here are some topics I can help you explore:\n\n"
"📋 **Programs & Initiatives**\n"
"• Educational programs for underprivileged children\n"
"• Visits to old age homes and special schools\n"
"• Awareness sessions on health, career, and technology\n\n"
"👥 **Team & Leadership**\n"
"• Founder: Verril Vaz\n"
"• Team of 45 passionate young members\n\n"
"📍 **Locations & Contact**\n"
"• Operating in Belgaum and Bangalore\n"
"• Phone: +91-123-456-7890\n"
"• Website: https://www.sahayak.org\n\n"
"What specific topic would you like to know more about?",
"sure": "Great! Here are some topics I can help you explore:\n\n"
"📋 **Programs & Initiatives**\n"
"• Educational programs for underprivileged children\n"
"• Visits to old age homes and special schools\n"
"• Awareness sessions on health, career, and technology\n\n"
"👥 **Team & Leadership**\n"
"• Founder: Verril Vaz\n"
"• Team of 45 passionate young members\n\n"
"📍 **Locations & Contact**\n"
"• Operating in Belgaum and Bangalore\n\n"
"What specific topic would you like to know more about?",
"ok": "I'm here to help! You can ask me about:\n\n"
"• Sahayak's mission and vision\n"
"• Our educational programs and initiatives\n"
"• Recent activities and events\n"
"• Team members and leadership\n"
"• How to volunteer or donate\n"
"• Contact information and locations\n\n"
"What would you like to know?",
"tell me more": "Here's more about Sahayak Organization:\n\n"
"**🎯 Our Mission**\n"
"Sahayak is dedicated to empowering underprivileged communities through education, inclusion, and dedicated care.\n\n"
"**📚 Key Programs**\n"
"1. Educational mentorship for underserved students\n"
"2. Support for visually impaired and specially-abled children\n"
"3. Assistance to senior citizens at old age homes\n"
"4. Awareness sessions on technology, career, and health\n\n"
"**🏆 Recent Achievements**\n"
"• Visited Anand Yatri Old Age Home (August 2024)\n"
"• Conducted sessions at Maheshwari School for the Blind (October 2024)\n"
"• Partnered with Sparsh Foundation (December 2024)\n"
"• Organized awareness programs at Balika Adarsh Vidyalaya (February 2025)\n\n"
"Which aspect would you like to explore further?",
"what is sahayak": "**📖 What is Sahayak?**\n\n"
"Sahayak (meaning 'helper' in Hindi) is a non-governmental organization (NGO) founded with the noble vision of uplifting society.\n\n"
"**🎯 Core Focus Areas:**\n"
"• Providing quality education to underserved students\n"
"• Supporting senior citizens with care and companionship\n"
"• Aiding visually impaired and differently-abled children\n\n"
"**📍 Where We Operate:**\n"
"Sahayak primarily operates in Belgaum and Bangalore, Karnataka, India.\n\n"
"**👥 Our Team:**\n"
"Led by founder Verril Vaz, Sahayak has a passionate team of 45 young leaders dedicated to making a difference.\n\n"
"Would you like to know about our specific programs or how to get involved?",
"initiatives": "**📋 Sahayak's Key Initiatives**\n\n"
"**1. Educational Programs**\n"
"• Academic mentorship for underprivileged children\n"
"• Scholarship guidance and study resources\n"
"• Career counseling and skill development\n\n"
"**2. Community Outreach**\n"
"• Regular visits to old age homes\n"
"• Support programs for specially-abled children\n"
"• Health and nutrition awareness campaigns\n\n"
"**3. Recent Activities**\n"
"• Anand Yatri Old Age Home visit (Aug 2024)\n"
"• Maheshwari School for the Blind (Oct 2024)\n"
"• Sparsh Foundation collaboration (Dec 2024)\n"
"• Balika Adarsh Vidyalaya sessions (Feb 2025)\n\n"
"Would you like details about any specific initiative?",
"what are the initiatives of sahayak": "**📋 Sahayak's Key Initiatives**\n\n"
"**1. Educational Programs**\n"
"• Academic mentorship for underprivileged children\n"
"• Scholarship guidance and study resources\n"
"• Career counseling and skill development\n\n"
"**2. Community Outreach**\n"
"• Regular visits to old age homes\n"
"• Support programs for specially-abled children\n"
"• Health and nutrition awareness campaigns\n\n"
"**3. Recent Activities (2024-2025)**\n"
"• Anand Yatri Old Age Home visit (Aug 3, 2024)\n"
"• Maheshwari School for the Blind (Oct 25, 2024)\n"
"• Sparsh Foundation collaboration (Dec 19, 2024)\n"
"• Balika Adarsh Vidyalaya sessions (Feb 4, 2025)\n\n"
"Would you like details about any specific initiative?",
"who is the president of sahayak": "**👤 President of Sahayak**\n\n"
"The founder and president of Sahayak Organisation is **Verril Vaz**.\n\n"
"**About Verril Vaz:**\n"
"• Visionary leader who believes in grassroots change\n"
"• Established Sahayak with the mission to uplift underprivileged communities\n"
"• Leads a passionate team of 45 young members\n\n"
"Under his leadership, Sahayak has grown to operate in multiple locations including Belgaum and Bangalore.\n\n"
"Would you like to know more about the leadership team or Sahayak's structure?",
"president of sahayak": "**👤 President of Sahayak**\n\n"
"The founder and president of Sahayak Organisation is **Verril Vaz**.\n\n"
"**About Verril Vaz:**\n"
"• Visionary leader who believes in grassroots change\n"
"• Established Sahayak with the mission to uplift underprivileged communities\n"
"• Leads a passionate team of 45 young members\n\n"
"Would you like to know more about the leadership team?",
"how to join sahayak": "**🤝 How to Join Sahayak**\n\n"
"**Option 1: Register Online**\n"
"• Visit: https://www.sahayak.org/volunteer\n"
"• Fill out the volunteer registration form\n\n"
"**Option 2: Contact Us Directly**\n"
"• Phone: +91-123-456-7890\n"
"• Email through the website contact form\n\n"
"**What We Look For:**\n"
"• Passion for community service\n"
"• Willingness to contribute time and skills\n"
"• Empathy for underprivileged communities\n\n"
"Would you like more information about volunteer opportunities?",
"how many branches sahayak has": "**📍 Sahayak Locations**\n\n"
"Sahayak currently operates in **2 main locations**:\n\n"
"**1. Belgaum (Belagavi)**\n"
"• Primary operational hub\n"
"• Multiple community outreach programs\n\n"
"**2. Bangalore**\n"
"• Extended operations and programs\n"
"• Partnership with local institutions\n\n"
"Both locations serve as centers for educational initiatives, community support, and volunteer activities.\n\n"
"Would you like to know about activities at a specific location?",
# Additional variations for better matching
"vice president": "**👤 Sahayak Leadership Team**\n\n"
"Sahayak is led by a passionate team of young leaders:\n\n"
"**President & Founder:** Verril Vaz\n"
"• Visionary leader who believes in grassroots change\n\n"
"**Core Team:**\n"
"• 45 dedicated young members\n"
"• Mix of empathy, administrative skills, and innovation\n"
"• Committed to Sahayak's core values\n\n"
"The team works together to ensure Sahayak runs efficiently while making a positive impact.\n\n"
"Would you like to know about specific team roles or how to join?",
"leadership": "**👥 Sahayak Leadership Team**\n\n"
"Sahayak is led by a passionate team of young leaders:\n\n"
"**President & Founder:** Verril Vaz\n"
"• Visionary leader who believes in grassroots change\n"
"• Established Sahayak on May 21, 2024\n\n"
"**Core Team:**\n"
"• 45 dedicated young members\n"
"• Mix of empathy, administrative skills, and innovation\n"
"• Committed to uplifting underprivileged communities\n\n"
"The team brings collective commitment to ensure Sahayak runs efficiently.\n\n"
"Would you like to know more about joining the team?",
"objectives": "**🎯 Sahayak's Objectives**\n\n"
"Sahayak Organisation works towards the following key objectives:\n\n"
"**1. Educational Empowerment**\n"
"• Provide quality education to underserved students\n"
"• Bridge the gap between potential and opportunity\n\n"
"**2. Community Support**\n"
"• Support senior citizens with care and companionship\n"
"• Aid visually impaired and differently-abled children\n\n"
"**3. Social Awareness**\n"
"• Raise awareness about social issues affecting marginalized groups\n"
"• Promote health, career, and technology awareness\n\n"
"**4. Sustainable Impact**\n"
"• Create lasting positive change in communities\n"
"• Empower individuals and families through dedicated programs\n\n"
"Would you like to learn about specific programs achieving these objectives?",
"what are the objectives": "**🎯 Sahayak's Objectives**\n\n"
"Sahayak Organisation works towards the following key objectives:\n\n"
"**1. Educational Empowerment**\n"
"• Provide quality education to underserved students\n"
"• Bridge the gap between potential and opportunity\n\n"
"**2. Community Support**\n"
"• Support senior citizens with care and companionship\n"
"• Aid visually impaired and differently-abled children\n\n"
"**3. Social Awareness**\n"
"• Raise awareness about social issues\n"
"• Promote health, career, and technology awareness\n\n"
"Would you like to learn about specific programs?",
"donate": "**💝 How to Donate to Sahayak**\n\n"
"Your donations help Sahayak continue its mission to support underprivileged communities.\n\n"
"**How to Donate:**\n"
"• Visit: https://www.sahayak.org\n"
"• Contact: +91-123-456-7890\n\n"
"**Your Donation Supports:**\n"
"• Educational programs for children\n"
"• Care for senior citizens\n"
"• Support for specially-abled individuals\n\n"
"Every contribution makes a difference!\n\n"
"Would you like more information about our programs?",
"help": "**ℹ️ How Can I Help You?**\n\n"
"I can provide information about:\n\n"
"📋 **Programs & Initiatives**\n"
"• Educational programs, community outreach, recent events\n\n"
"👥 **Team & Leadership**\n"
"• Founder, president, team members\n\n"
"📍 **Locations & Contact**\n"
"• Branches, phone numbers, website\n\n"
"🤝 **Getting Involved**\n"
"• How to volunteer, donate, or join\n\n"
"🎯 **Mission & Vision**\n"
"• Goals, objectives, values\n\n"
"What would you like to know about?"
}
translated_query, original_lang = detect_and_translate(query)
moderation = moderate_query(translated_query)
if moderation.get("status") == "blocked":
return translate_text(moderation["message"], original_lang)
# =====================================================================
# SEMANTIC INTENT UNDERSTANDING (Expert-Level)
# Uses embeddings to understand query meaning, not just keywords
# =====================================================================
normalized_query = re.sub(r'[^\w\s]', '', translated_query.lower().strip())
# NOTE(review): normalized_query is not referenced anywhere below in this
# section — confirm it is consumed later, otherwise it is dead work.
# Step 1: Semantic Intent Classification
intent, intent_confidence = semantic_intent_classification(translated_query)
logger.info(f"Semantic intent: {intent} (confidence: {intent_confidence:.3f})")
# Step 2: Query Complexity Analysis
# complexity is later used (Step 6) to scale retrieval depth (top_k).
complexity = analyze_query_complexity(translated_query)
logger.info(f"Query complexity: {complexity['level']} (score: {complexity['score']})")
# Step 3: Intent-based response routing with semantic understanding
# Map semantic intents to curated responses
# Keys are intent labels from semantic_intent_classification; values are
# keys into the meta_responses dict defined above.
intent_to_response = {
"about_organization": "what is sahayak",
"founder_leadership": "who is the founder of sahayak",
"president_leadership": "who is the president of sahayak",
# NOTE(review): "team_members" reuses the president response — confirm
# this aliasing is intentional rather than a missing entry.
"team_members": "who is the president of sahayak",
"mission_vision": "mission of sahayak",
"initiatives_programs": "initiatives",
"recent_events": "what are the recent activities conducted by the sahayak",
"location_branches": "how many branches sahayak has",
"join_volunteer": "how to join sahayak",
"donate_support": "donate",
# NOTE(review): "contact_info" also maps to the join response — verify.
"contact_info": "how to join sahayak",
"founding_date": "when was the sahayak started",
"greeting": "hi",
"thanks": "ok",
"affirmative": "yes"
}
# High confidence semantic match -> use curated response
# 0.65 is an empirically chosen confidence threshold — TODO confirm tuning.
if intent_confidence > 0.65 and intent in intent_to_response:
response_key = intent_to_response[intent]
if response_key in meta_responses:
logger.info(f"Semantic match: intent={intent}, confidence={intent_confidence:.3f}, response_key={response_key}")
return translate_text(meta_responses[response_key], original_lang)
# Step 4: Semantic similarity search in meta_responses
# Use embeddings to find best matching pre-defined response
try:
embedder = get_embedder()
query_embedding = embedder.encode([translated_query])[0]
best_match_key = None
best_match_score = 0.0
# Encode all meta_response keys and find best semantic match
# NOTE(review): the meta_responses keys are static per process, yet their
# embeddings are recomputed on every call — a candidate for caching.
meta_keys = list(meta_responses.keys())
meta_embeddings = embedder.encode(meta_keys)
# NOTE(review): the enumerate index `i` is unused in the loop body.
for i, (key, key_embedding) in enumerate(zip(meta_keys, meta_embeddings)):
# Cosine similarity
similarity = np.dot(query_embedding, key_embedding) / (
np.linalg.norm(query_embedding) * np.linalg.norm(key_embedding)
)
if similarity > best_match_score:
best_match_score = similarity
best_match_key = key
# If we have a strong semantic match, use it
# 0.70 similarity cutoff — empirically chosen; TODO confirm tuning.
if best_match_score > 0.70 and best_match_key:
logger.info(f"Semantic meta-match: '{best_match_key}' (score: {best_match_score:.3f})")
return translate_text(meta_responses[best_match_key], original_lang)
# Deliberate best-effort: any failure here (model load, encoding) is
# logged and we fall through to scope check + retrieval below.
except Exception as e:
logger.warning(f"Semantic matching failed: {e}")
# Step 5: Check scope
# Off-topic queries get a polite redirect instead of a QA attempt.
if not is_in_scope(translated_query):
return scope_redirect_message(original_lang)
# =====================================================================
# STEP 6: SEMANTIC PASSAGE RETRIEVAL & QA
# Use embeddings to find most relevant passages
# =====================================================================
# Determine retrieval depth based on query complexity
# (complexity was computed in Step 2 above).
top_k = 10
if complexity["level"] == "complex":
top_k = 15
elif complexity["level"] == "simple":
top_k = 8
relevance_passages = retrieve_relevant_passages(
translated_query, passages, vector_index, embeddings, top_k=top_k
)
logger.info(f"Retrieved {len(relevance_passages)} passages for query '{translated_query}'")
# Re-rank passages using semantic similarity for better context
ranked_passages = semantic_passage_ranking(translated_query, relevance_passages, top_k=5)
# Build context from best semantically-matched passages
# 0.3 relevance floor — TODO confirm tuning; if nothing clears it,
# fall back to the top-3 retrieved passages unfiltered.
context_passages = [p for p, score in ranked_passages if score > 0.3]
if not context_passages:
context_passages = relevance_passages[:3]
# 2000-char cap keeps the context within the QA model's input budget
# (presumably — verify against the model's max sequence length).
context = " ".join(context_passages)[:2000]  # More context for better answers
logger.info(f"Context length for QA model: {len(context)} chars from {len(context_passages)} passages")
qa_model = get_qa_model()
# Extractive QA: result carries "answer" (span text) and "score".
result = qa_model(question=translated_query, context=context)
english_answer = result["answer"]
confidence_score = result.get("score", 0.0)
logger.info(f"QA model result: answer='{english_answer}', score={confidence_score}")
# Apply expert reasoning chain
reasoning = expert_reasoning_chain(translated_query, relevance_passages, result)
logger.info(f"Expert reasoning steps: {reasoning['reasoning_steps']}")
# Short purely-numeric answers (e.g. a year or a count) are acceptable
# even below the 5-char length floor applied to text answers.
is_numeric = english_answer.strip().replace(".", "").isdigit()
if not english_answer or (len(english_answer) < 5 and not is_numeric and confidence_score < 0.3) or confidence_score < 0.05:
logger.warning(f"QA model failed to provide a good answer. Answer: '{english_answer}', Score: {confidence_score}")
# Provide a more helpful fallback using expert reasoning
fallback_response = "I couldn't find a specific answer to your question.\n\n"
if relevance_passages:
fallback_response += "**However, here's what I found in our documentation:**\n"
# Surface short snippets of the top-2 retrieved passages as evidence.
for i, passage in enumerate(relevance_passages[:2]):
snippet = passage[:200] + "..." if len(passage) > 200 else passage
fallback_response += f"\n- {snippet}\n"
fallback_response += "\nWould you like me to help you with a more specific question about Sahayak?"
return translate_text(fallback_response, original_lang)
# Build expert response with evidence and reasoning
detailed_answer = build_expert_response(english_answer, reasoning, translated_query, relevance_passages)
# Disclaimer appended to every successful answer before translation.
legal_footer = (
"\n\n---\n_This response is for community awareness only and not legal, medical, or financial advice. "
"Program details may change; please verify with official Sahayak contacts or local authorities._"
)
final_answer = translate_text(detailed_answer + legal_footer, original_lang)
return final_answer
# Top-level boundary handler for the whole answer pipeline: any uncaught
# failure is logged and the user receives a static English apology.
# NOTE(review): consider logger.exception(...) here to capture the
# traceback; the message is also not translated to original_lang — confirm
# whether that is acceptable (original_lang may be undefined if
# detect_and_translate itself raised).
except Exception as e:
logger.error(f"Error answering question: {str(e)}")
return ("I'm sorry, I encountered an error while processing your question.\n\n"
"However, I can still tell you that Sahayak is a non-profit organization focused on helping those in need.\n"
"We work on various social issues, aiming to make a positive impact in the community.\n"
"Please try again with a different query, and I'll do my best to assist you!")