Spaces:
Sleeping
Sleeping
Upload 15 files
Browse files- .gitattributes +4 -0
- Dockerfile +72 -0
- README.md +15 -0
- Sahayak_Organisation_Expanded_Info.pdf +0 -0
- app.py +184 -0
- data/pdfs/Sahayak_Organisation_Expanded_Info.pdf +0 -0
- data/pdfs/sahayak_disaster_relief.pdf +3 -0
- data/pdfs/sahayak_educational_programs.pdf +3 -0
- data/pdfs/sahayak_general_info.pdf +3 -0
- data/pdfs/sahayak_ngo_resources.pdf +3 -0
- requirements.txt +31 -0
- sahayak_utils.py +1623 -0
- static/logo.png +0 -0
- static/style.css +299 -0
- templates/index.html +129 -0
.gitattributes
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
data/pdfs/sahayak_disaster_relief.pdf filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
data/pdfs/sahayak_educational_programs.pdf filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
data/pdfs/sahayak_general_info.pdf filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
data/pdfs/sahayak_ngo_resources.pdf filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#Docker_file
|
| 2 |
+
# Use Python 3.10 slim as the base image
|
| 3 |
+
FROM python:3.10-slim
|
| 4 |
+
|
| 5 |
+
# Set working directory
|
| 6 |
+
WORKDIR /app
|
| 7 |
+
|
| 8 |
+
# Create necessary directories with proper permissions
|
| 9 |
+
RUN mkdir -p /app /data/models /tmp/nltk_data && \
|
| 10 |
+
chmod -R 777 /app /data /tmp/nltk_data
|
| 11 |
+
|
| 12 |
+
# Install system dependencies for faiss, sentence-transformers, and PDF processing
|
| 13 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 14 |
+
git \
|
| 15 |
+
git-lfs \
|
| 16 |
+
build-essential \
|
| 17 |
+
libopenblas-dev \
|
| 18 |
+
libgomp1 \
|
| 19 |
+
g++ \
|
| 20 |
+
libgcc-s1 \
|
| 21 |
+
curl \
|
| 22 |
+
&& apt-get clean \
|
| 23 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 24 |
+
|
| 25 |
+
# Copy requirements file first (for better Docker caching)
|
| 26 |
+
COPY requirements.txt .
|
| 27 |
+
|
| 28 |
+
# Set environment variables for Hugging Face
|
| 29 |
+
ENV TRANSFORMERS_CACHE=/data/models \
|
| 30 |
+
HF_HOME=/data/models \
|
| 31 |
+
HF_HUB_CACHE=/data/models \
|
| 32 |
+
NLTK_DATA=/tmp/nltk_data \
|
| 33 |
+
MODEL_DIR=/data/models \
|
| 34 |
+
PYTHONUNBUFFERED=1
|
| 35 |
+
|
| 36 |
+
# Install Python dependencies
|
| 37 |
+
RUN pip install --no-cache-dir -U pip setuptools wheel && \
|
| 38 |
+
pip install --no-cache-dir -r requirements.txt
|
| 39 |
+
|
| 40 |
+
# Pre-download NLTK data
|
| 41 |
+
RUN python -c "import nltk; nltk.download('punkt', download_dir='/tmp/nltk_data', quiet=True); nltk.download('punkt_tab', download_dir='/tmp/nltk_data', quiet=True)" || true
|
| 42 |
+
|
| 43 |
+
# Pre-download ML models during build
|
| 44 |
+
RUN python -c "\
|
| 45 |
+
from sentence_transformers import SentenceTransformer; \
|
| 46 |
+
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', cache_folder='/data/models'); \
|
| 47 |
+
print('Embedding model downloaded successfully')" || echo "Will download at runtime"
|
| 48 |
+
|
| 49 |
+
RUN python -c "\
|
| 50 |
+
from transformers import pipeline; \
|
| 51 |
+
qa = pipeline('question-answering', model='distilbert-base-uncased-distilled-squad', cache_dir='/data/models'); \
|
| 52 |
+
print('QA model downloaded successfully')" || echo "Will download at runtime"
|
| 53 |
+
|
| 54 |
+
# Copy application code
|
| 55 |
+
COPY . .
|
| 56 |
+
|
| 57 |
+
# Ensure all directories have proper permissions
|
| 58 |
+
RUN chmod -R 777 /app /data /tmp/nltk_data
|
| 59 |
+
|
| 60 |
+
# Set runtime environment variables
|
| 61 |
+
ENV PDF_PATH=data/pdfs \
|
| 62 |
+
PORT=7860
|
| 63 |
+
|
| 64 |
+
# Expose the port
|
| 65 |
+
EXPOSE 7860
|
| 66 |
+
|
| 67 |
+
# Health check
|
| 68 |
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
|
| 69 |
+
CMD curl -f http://localhost:7860/ || exit 1
|
| 70 |
+
|
| 71 |
+
# Run the application with gunicorn
|
| 72 |
+
CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--timeout", "300", "--log-level", "info", "--workers", "1", "--threads", "2", "app:app"]
|
README.md
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Vexa Chatbot
|
| 3 |
+
emoji: 🤖
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
pinned: false
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# Vexa Chatbot
|
| 12 |
+
|
| 13 |
+
A chatbot application for the Sahayak NGO, built with Flask and Hugging Face models.
|
| 14 |
+
|
| 15 |
+
<!-- Force rebuild: 2025-05-07 14:10 -->
|
Sahayak_Organisation_Expanded_Info.pdf
ADDED
|
Binary file (4.9 kB). View file
|
|
|
app.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import logging
|
| 3 |
+
import time
|
| 4 |
+
from flask import Flask, render_template, request, jsonify
|
| 5 |
+
from flask_cors import CORS
|
| 6 |
+
from flask_limiter import Limiter
|
| 7 |
+
from flask_limiter.util import get_remote_address
|
| 8 |
+
import re
|
| 9 |
+
|
| 10 |
+
# Setup logging
|
| 11 |
+
logging.basicConfig(
|
| 12 |
+
level=logging.INFO,
|
| 13 |
+
format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
|
| 14 |
+
handlers=[logging.StreamHandler()]
|
| 15 |
+
)
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
# Import utils after logging setup
|
| 19 |
+
from sahayak_utils import (
|
| 20 |
+
load_pdf_text, load_passages_from_path, split_into_chunks, build_faiss_index,
|
| 21 |
+
get_qa_model, initialize_models, retrieve_relevant_passages,
|
| 22 |
+
answer_question, check_model_cache_status
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
app = Flask(__name__)
|
| 26 |
+
CORS(app) # Add this line to allow WebView requests
|
| 27 |
+
# Add rate limiting with stable initialization
|
| 28 |
+
limiter = Limiter(
|
| 29 |
+
key_func=get_remote_address,
|
| 30 |
+
default_limits=["200 per day", "50 per hour"]
|
| 31 |
+
)
|
| 32 |
+
limiter.init_app(app)
|
| 33 |
+
|
| 34 |
+
# Global variables
|
| 35 |
+
passages = []
|
| 36 |
+
faiss_index = None
|
| 37 |
+
embeddings = None
|
| 38 |
+
initialized = False
|
| 39 |
+
last_pdf_load_time = 0
|
| 40 |
+
response_cache = {} # In-memory cache for query responses
|
| 41 |
+
|
| 42 |
+
def validate_user_input(user_input):
    """Sanitize raw user text before it reaches the QA pipeline.

    Strips <script>...</script> blocks case-insensitively and truncates
    to 500 characters so oversized payloads cannot blow up downstream
    embedding/model calls.

    Args:
        user_input: Raw string taken from the request form.

    Returns:
        The sanitized string (possibly empty).
    """
    # The original pattern was case-sensitive, so an upper-case <SCRIPT>
    # tag slipped straight through; re.IGNORECASE closes that hole.
    sanitized = re.sub(r'<script.*?>.*?</script>', '', user_input,
                       flags=re.DOTALL | re.IGNORECASE)
    # Hard cap on length to bound tokenizer/model work per request.
    if len(sanitized) > 500:
        sanitized = sanitized[:500]
    return sanitized
|
| 48 |
+
|
| 49 |
+
def setup():
    """Initialize the QA system: models, passages, and the FAISS index.

    Results live in module globals; a successful build less than an hour
    old is reused instead of rebuilding. If initialization fails, a
    single stub passage is indexed so the routes keep working.
    """
    global passages, faiss_index, embeddings, initialized, last_pdf_load_time

    # Reuse the existing index if it was built within the last hour.
    if initialized and time.time() - last_pdf_load_time < 3600:
        logger.info("Using cached data")
        return

    try:
        logger.info("Initializing models...")
        initialize_models()

        logger.info("Preloading QA model...")
        get_qa_model()

        # Load and chunk the organization PDFs into passages.
        pdf_root = os.environ.get("PDF_PATH", "data/pdfs")
        passages = load_passages_from_path(pdf_root)
        logger.info(f"Passages ready: {len(passages)}")

        logger.info("Building search index...")
        faiss_index, embeddings = build_faiss_index(passages)

        initialized = True
        last_pdf_load_time = time.time()
        logger.info("Initialization complete")
    except Exception as e:
        # Degrade gracefully: index a minimal fallback passage so the
        # app can still answer something rather than crash on startup.
        logger.error(f"Error during initialization: {str(e)}")
        passages = ["Sahayak is a non-profit organization dedicated to providing support."]
        faiss_index, embeddings = build_faiss_index(passages)
        initialized = True
|
| 79 |
+
|
| 80 |
+
@app.route("/", methods=["GET"])
def home():
    """Render the chat UI, lazily running setup() on the first request."""
    try:
        # Lazy initialization: the first hit pays the model-load cost.
        if not initialized:
            setup()
        return render_template("index.html")
    except Exception as e:
        # Keep the error page generic; details go to the log only.
        logger.error(f"Error in home route: {str(e)}")
        return "An error occurred. Please check the logs.", 500
|
| 90 |
+
|
| 91 |
+
@app.route("/get", methods=["POST"])
@limiter.limit("10 per minute")
def get_response():
    """Answer a user question.

    Rate-limited to 10 requests/minute per client, with an in-process
    FIFO memo cache (max 100 entries) keyed by the sanitized question.
    Returns JSON: {"response": str, "process_time": float}.
    """
    try:
        if not initialized:
            setup()

        question = validate_user_input(request.form.get("user_input", ""))

        # Guard: blank or whitespace-only input gets a canned prompt.
        if not question.strip():
            return jsonify({
                "response": "Please enter a valid question.",
                "process_time": 0.0
            })

        # Serve from the memo cache when possible.
        cached = response_cache.get(question)
        if cached is not None:
            logger.info(f"Cache hit for query: {question[:50]}...")
            return jsonify({
                "response": cached["response"],
                "process_time": cached["process_time"]
            })

        # Cache miss: run retrieval + QA and time it.
        started = time.time()
        answer = answer_question(question, passages, faiss_index, embeddings)
        process_time = time.time() - started

        logger.info(f"Processed question in {process_time:.2f}s: {question[:50]}...")

        # Memoize; dicts preserve insertion order, so next(iter(...))
        # is the oldest entry — evict it once we exceed 100 items.
        response_cache[question] = {
            "response": answer,
            "process_time": round(process_time, 2)
        }
        if len(response_cache) > 100:
            response_cache.pop(next(iter(response_cache)))

        return jsonify({
            "response": answer,
            "process_time": round(process_time, 2)
        })
    except Exception as e:
        logger.error(f"Error in get_response: {str(e)}")
        return jsonify({
            "response": "Sorry, an error occurred while processing your request. Please try again.",
            "process_time": 0.0
        }), 500
|
| 142 |
+
|
| 143 |
+
@app.route("/health", methods=["GET"])
def health_check():
    """Readiness probe for external monitors (matches the Dockerfile
    HEALTHCHECK). Always answers 200; the payload carries the state."""
    return jsonify({
        "status": "healthy" if initialized else "initializing",
        # More than the single fallback passage means PDFs really loaded.
        "passages_loaded": len(passages) > 1,
        "uptime": time.time() - last_pdf_load_time if initialized else 0
    }), 200
|
| 152 |
+
|
| 153 |
+
@app.route("/reset", methods=["POST"])
def reset_models():
    """Reset model/query caches and re-run setup (requires admin key).

    Security fixes over the original:
    - Refuses outright when ADMIN_KEY is not configured. Previously the
      sentinel default "not-set" was itself a working password: anyone
      POSTing admin_key=not-set was authorized on an unconfigured host.
    - Uses hmac.compare_digest for a constant-time comparison instead of
      a timing-leaky `!=`.

    Returns JSON {"status": ...}; 401 on bad/missing credentials.
    """
    import hmac  # local import: only needed on this admin-only path

    global initialized, last_pdf_load_time, response_cache

    admin_key = request.form.get("admin_key", "")
    expected_key = os.environ.get("ADMIN_KEY")

    # Reject when no key is configured, or on mismatch (constant time).
    if not expected_key or not hmac.compare_digest(admin_key, expected_key):
        return jsonify({"status": "unauthorized"}), 401

    initialized = False
    last_pdf_load_time = 0
    response_cache.clear()  # Drop memoized answers

    setup()

    return jsonify({"status": "reset_complete"})
|
| 171 |
+
|
| 172 |
+
if __name__ == "__main__":
    # Parse the port up-front. Fix: the original converted the env var
    # *before* its try/except (port = int(os.environ.get(...))), so a
    # malformed PORT crashed at startup and the fallback was dead code.
    raw_port = os.environ.get("PORT", "7860")
    try:
        port = int(raw_port)
    except (ValueError, TypeError):
        logger.warning(f"Invalid PORT value: {raw_port}, using default 7860")
        port = 7860

    logger.info(f"Starting app on port: {port}")

    # Build indexes/models before accepting traffic (dev-server path;
    # under gunicorn, setup() runs lazily on the first request).
    setup()

    app.run(host="0.0.0.0", port=port)
|
data/pdfs/Sahayak_Organisation_Expanded_Info.pdf
ADDED
|
Binary file (4.9 kB). View file
|
|
|
data/pdfs/sahayak_disaster_relief.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:73d39f1dd72602cce01036b96952334f3faaf47d1cdf45f84723feb6453e54b7
|
| 3 |
+
size 144362
|
data/pdfs/sahayak_educational_programs.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d7c9933c39024d5ee88615d13946a1144bf84d3519059dac6d612f87e47ef15a
|
| 3 |
+
size 188177
|
data/pdfs/sahayak_general_info.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7d4f52d77108ca4096b258b5930ed5fe892161980fbc96b2ecccd9000fdc33be
|
| 3 |
+
size 129981
|
data/pdfs/sahayak_ngo_resources.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ca61a9fe949c469c120d80aeeb2ac0d75fba6794e0c13b3df1c6e7718d444a66
|
| 3 |
+
size 212873
|
requirements.txt
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Flask and web dependencies
|
| 3 |
+
flask==2.2.5
|
| 4 |
+
flask-limiter==3.5.0
|
| 5 |
+
flask-cors==4.0.0
|
| 6 |
+
werkzeug==2.3.8
|
| 7 |
+
gunicorn==21.2.0
|
| 8 |
+
|
| 9 |
+
# NLP and ML dependencies
|
| 10 |
+
nltk==3.8.1
|
| 11 |
+
numpy>=1.24.0,<2.0.0
|
| 12 |
+
faiss-cpu==1.7.4
|
| 13 |
+
torch>=2.0.0
|
| 14 |
+
transformers>=4.35.0
|
| 15 |
+
sentence-transformers>=2.2.0
|
| 16 |
+
scikit-learn>=1.3.0
|
| 17 |
+
scipy>=1.11.0
|
| 18 |
+
|
| 19 |
+
# PDF processing
|
| 20 |
+
PyMuPDF>=1.23.0
|
| 21 |
+
PyPDF2>=3.0.0
|
| 22 |
+
|
| 23 |
+
# Translation
|
| 24 |
+
deep-translator>=1.11.0
|
| 25 |
+
beautifulsoup4>=4.12.0
|
| 26 |
+
|
| 27 |
+
# Utilities
|
| 28 |
+
requests>=2.31.0
|
| 29 |
+
setuptools>=65.5.1
|
| 30 |
+
wheel>=0.38.4
|
| 31 |
+
huggingface_hub>=0.19.0
|
sahayak_utils.py
ADDED
|
@@ -0,0 +1,1623 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
os.environ["TRANSFORMERS_CACHE"] = "/data/models"
|
| 3 |
+
os.environ["HF_HOME"] = "/data/models"
|
| 4 |
+
os.environ["HF_HUB_CACHE"] = "/data/models"
|
| 5 |
+
import logging
|
| 6 |
+
import nltk
|
| 7 |
+
import numpy as np
|
| 8 |
+
import faiss
|
| 9 |
+
import re
|
| 10 |
+
import json
|
| 11 |
+
import time
|
| 12 |
+
import torch
|
| 13 |
+
from functools import lru_cache
|
| 14 |
+
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, AutoModel, pipeline
|
| 15 |
+
import difflib
|
| 16 |
+
|
| 17 |
+
# Setup logging
|
| 18 |
+
logging.basicConfig(
|
| 19 |
+
level=logging.INFO,
|
| 20 |
+
format='%(asctime)s [%(levelname)s] %(name)s: %(message)s'
|
| 21 |
+
)
|
| 22 |
+
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
# Debug: Log environment variables
|
| 25 |
+
logger.info(f"TRANSFORMERS_CACHE: {os.environ.get('TRANSFORMERS_CACHE')}")
|
| 26 |
+
logger.info(f"HF_HOME: {os.environ.get('HF_HOME')}")
|
| 27 |
+
logger.info(f"HF_HUB_CACHE: {os.environ.get('HF_HUB_CACHE')}")
|
| 28 |
+
|
| 29 |
+
# Import libraries with error handling
|
| 30 |
+
try:
|
| 31 |
+
import fitz # PyMuPDF
|
| 32 |
+
PDF_BACKEND = "pymupdf"
|
| 33 |
+
except ImportError:
|
| 34 |
+
logger.warning("PyMuPDF (fitz) not installed. Trying PyPDF2 as fallback.")
|
| 35 |
+
try:
|
| 36 |
+
import PyPDF2
|
| 37 |
+
PDF_BACKEND = "pypdf2"
|
| 38 |
+
except ImportError:
|
| 39 |
+
logger.error("No PDF processing library available.")
|
| 40 |
+
PDF_BACKEND = "none"
|
| 41 |
+
|
| 42 |
+
# Translation setup with graceful fallback
|
| 43 |
+
try:
|
| 44 |
+
from deep_translator import GoogleTranslator
|
| 45 |
+
from deep_translator.exceptions import LanguageNotSupportedException
|
| 46 |
+
|
| 47 |
+
class DeepTranslatorWrapper:
    """Adapter exposing a googletrans-like detect()/translate() surface
    on top of deep-translator's GoogleTranslator."""

    def detect(self, text):
        """Guess the language of `text` from Unicode script ranges.

        Heuristic only: inspects the first 100 characters and maps
        script blocks to a language code, defaulting to English.
        Note: Marathi shares the Devanagari block with Hindi, so it is
        reported as "hi" — the original had a second, *unreachable*
        Devanagari `elif` branch for "mr" (identical range as the Hindi
        check above it); that dead branch has been removed.
        """
        class LangResult:
            def __init__(self, lang):
                self.lang = lang

        sample = text[:100]

        def has_range(lo, hi):
            # True if any sampled character falls in [lo, hi].
            return any(lo <= c <= hi for c in sample)

        if has_range('\u0900', '\u097F'):      # Devanagari (Hindi/Marathi)
            return LangResult("hi")
        if has_range('\u0980', '\u09FF'):      # Bengali
            return LangResult("bn")
        if has_range('\u0B80', '\u0BFF'):      # Tamil
            return LangResult("ta")
        if has_range('\u0C00', '\u0C7F'):      # Telugu
            return LangResult("te")
        if has_range('\u0A80', '\u0AFF'):      # Gujarati
            return LangResult("gu")
        if has_range('\u4E00', '\u9FFF'):      # CJK ideographs
            return LangResult("zh")
        if has_range('\u0600', '\u06FF'):      # Arabic
            return LangResult("ar")
        if any(c in 'áéíóúñüÁÉÍÓÚÑÜ' for c in sample):  # accented Latin
            return LangResult("es")
        return LangResult("en")

    def translate(self, text, src=None, dest=None):
        """Translate `text` from `src` to `dest`.

        Best-effort: on any failure (network, unsupported language) the
        original text is returned unchanged so callers never crash.
        """
        class TranslationResult:
            def __init__(self, translated_text):
                self.text = translated_text

        try:
            # No-op when source equals destination or no target given.
            if src == dest or dest is None:
                return TranslationResult(text)

            src_lang = src if src and src != 'auto' else 'auto'
            dest_lang = dest if dest else 'en'

            translated = GoogleTranslator(source=src_lang, target=dest_lang).translate(text)
            return TranslationResult(translated if translated else text)
        except Exception as e:
            logger.warning(f"Translation failed: {e}")
            return TranslationResult(text)
|
| 107 |
+
|
| 108 |
+
translator = DeepTranslatorWrapper()
|
| 109 |
+
logger.info("deep-translator initialized successfully")
|
| 110 |
+
except Exception as e:
|
| 111 |
+
logger.warning(f"deep-translator not available ({e}); using simple no-op translator")
|
| 112 |
+
|
| 113 |
+
class SimpleTranslator:
    """Fallback used when deep-translator cannot be imported.

    Mimics the detect()/translate() surface of DeepTranslatorWrapper but
    does no real work: every text is reported as English and handed back
    untranslated.
    """

    def detect(self, text):
        """Always report English."""
        class _Lang:
            lang = "en"

        return _Lang()

    def translate(self, text, src=None, dest=None):
        """Echo the input back wrapped in the expected result shape."""
        class _Translated:
            pass

        result = _Translated()
        result.text = text
        return result
|
| 125 |
+
|
| 126 |
+
translator = SimpleTranslator()
|
| 127 |
+
|
| 128 |
+
# Ensure NLTK punkt is downloaded to a writable path
|
| 129 |
+
nltk.data.path.append('/tmp/nltk_data')
|
| 130 |
+
try:
|
| 131 |
+
nltk.download('punkt', download_dir='/tmp/nltk_data', quiet=True)
|
| 132 |
+
except Exception as e:
|
| 133 |
+
logger.warning(f"Failed to download NLTK punkt: {e}. Text chunking may be affected.")
|
| 134 |
+
|
| 135 |
+
# Models configuration
|
| 136 |
+
MODEL_DIR = os.environ.get("MODEL_DIR", "/data/models")
|
| 137 |
+
EMBEDDING_MODEL_NAME = os.environ.get("EMBEDDING_MODEL", "sentence-transformers/all-mpnet-base-v2")
|
| 138 |
+
os.makedirs(MODEL_DIR, exist_ok=True)
|
| 139 |
+
|
| 140 |
+
# Setup transformers-based embedding model
|
| 141 |
+
logger.info(f"Loading embedding model from: {EMBEDDING_MODEL_NAME}")
|
| 142 |
+
try:
|
| 143 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
| 144 |
+
EMBEDDING_MODEL_NAME,
|
| 145 |
+
cache_dir=MODEL_DIR,
|
| 146 |
+
local_files_only=os.environ.get("TRANSFORMERS_OFFLINE", "0") == "1"
|
| 147 |
+
)
|
| 148 |
+
model = AutoModel.from_pretrained(
|
| 149 |
+
EMBEDDING_MODEL_NAME,
|
| 150 |
+
cache_dir=MODEL_DIR,
|
| 151 |
+
local_files_only=os.environ.get("TRANSFORMERS_OFFLINE", "0") == "1"
|
| 152 |
+
)
|
| 153 |
+
except Exception as e:
|
| 154 |
+
logger.error(f"Failed to load embedding model: {e}")
|
| 155 |
+
raise
|
| 156 |
+
|
| 157 |
+
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over real (non-padding) tokens.

    Args:
        model_output: HF model output; index 0 holds per-token
            embeddings of shape (batch, seq_len, hidden).
        attention_mask: (batch, seq_len) mask, 1 for real tokens.

    Returns:
        (batch, hidden) tensor of masked mean embeddings.
    """
    tokens = model_output[0]
    # Broadcast the mask across the hidden dimension.
    mask = attention_mask.unsqueeze(-1).expand(tokens.size()).float()
    summed = torch.sum(tokens * mask, 1)
    # Clamp avoids division by zero for fully-padded rows.
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / counts
|
| 161 |
+
|
| 162 |
+
def transformers_encode(texts, batch_size=8):
    """Embed text(s) with the module-level transformer; returns an (n, 768) ndarray.

    Invalid input never raises: the function logs and returns random vectors so
    the retrieval pipeline stays alive. `batch_size` is accepted for interface
    compatibility but all texts are encoded in a single forward pass.
    """
    if isinstance(texts, str):
        texts = [texts]
    if not isinstance(texts, (list, tuple)) or not texts:
        logger.error(f"Invalid input to transformers_encode: {texts}")
        return np.random.randn(1, 768)
    if not all(isinstance(t, str) for t in texts):
        logger.error(f"Non-string elements in texts: {texts}")
        texts = [str(t) for t in texts]
    try:
        batch = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            outputs = model(**batch)
        # Mean-pool token embeddings into one vector per text.
        pooled = mean_pooling(outputs, batch['attention_mask'])
        return pooled.numpy()
    except Exception as e:
        logger.error(f"Error in transformers_encode: {str(e)}")
        return np.random.randn(len(texts), 768)
|
| 180 |
+
|
| 181 |
+
# Model caching configuration
# Marker file recording whether heavyweight models were already initialized
# on this volume (see check_model_cache_status / mark_models_as_cached).
CACHE_FILE = os.path.join(MODEL_DIR, "model_cache_status.json")
|
| 183 |
+
|
| 184 |
+
def check_model_cache_status():
    """Return True when a previous run has marked the models as cached on disk."""
    if not os.path.exists(MODEL_DIR):
        # First run on this volume: create the directory and report "not cached".
        os.makedirs(MODEL_DIR, exist_ok=True)
        return False

    if not os.path.exists(CACHE_FILE):
        return False

    try:
        with open(CACHE_FILE, 'r') as fh:
            status = json.load(fh)
    except Exception as e:
        logger.warning(f"Error reading cache file: {e}")
        return False

    if status.get('initialized', False):
        logger.info("Using cached models")
        return True
    return False
|
| 201 |
+
|
| 202 |
+
def mark_models_as_cached():
    """Persist a marker file so later runs can skip model initialization."""
    payload = {'initialized': True, 'timestamp': time.time()}
    try:
        with open(CACHE_FILE, 'w') as fh:
            json.dump(payload, fh)
        logger.info("Models marked as cached")
    except Exception as e:
        # Best-effort: a missing marker only means re-initialization next run.
        logger.warning(f"Error writing cache file: {e}")
|
| 210 |
+
|
| 211 |
+
def initialize_models():
    """Warm the QA model once and record completion; no-op when already cached."""
    if not check_model_cache_status():
        logger.info("Preloading QA model...")
        get_qa_model()
        mark_models_as_cached()
    else:
        logger.info("Models already cached, skipping initialization")
|
| 220 |
+
|
| 221 |
+
@lru_cache(maxsize=1)
def get_embedder():
    """Return an object exposing .encode(texts, batch_size); cached per process.

    Normally wraps transformers_encode; on failure a degraded embedder that
    returns random vectors is supplied so callers keep working.
    """
    embedding_model_dir = os.path.join(MODEL_DIR, "embedding_model")
    try:
        class Embedder:
            # Thin adapter over the module-level transformers pipeline.
            def encode(self, texts, batch_size=8):
                return transformers_encode(texts, batch_size)

        # Both branches return the same adapter; the check only affects logging.
        if os.path.exists(embedding_model_dir):
            logger.info(f"Loading embedding model from: {embedding_model_dir}")
        else:
            logger.info(f"Using transformers-based embedding model: {EMBEDDING_MODEL_NAME}")
        return Embedder()
    except Exception as e:
        logger.error(f"Error loading embedding model: {str(e)}. Using random embeddings as fallback.")

        class SimpleEmbedder:
            # Degraded mode: noise vectors with the expected 768-dim shape.
            def encode(self, texts, batch_size=8):
                if isinstance(texts, str):
                    return np.random.randn(768)
                return np.random.randn(len(texts), 768)
        return SimpleEmbedder()
|
| 242 |
+
|
| 243 |
+
@lru_cache(maxsize=1)
def get_qa_model():
    """Load the extractive QA pipeline, cached per process.

    Tries the DistilBERT SQuAD model first, then RoBERTa SQuAD2; when both
    fail, returns a stub callable with the same (question, context) signature
    that yields an apology answer with score 0.0.
    """
    try:
        model_dir_contents = os.listdir(MODEL_DIR)
        logger.info(f"Model directory contents: {model_dir_contents}")
    except Exception as e:
        logger.error(f"Failed to list model directory: {e}")

    def _load(name):
        # Tokenizer and weights both come from the local cache only.
        return pipeline(
            "question-answering",
            model=name,
            tokenizer=name,
            local_files_only=True,
            cache_dir=MODEL_DIR
        )

    try:
        logger.info("Loading QA model: distilbert-base-uncased-distilled-squad")
        qa = _load("distilbert-base-uncased-distilled-squad")
        logger.info("Successfully loaded distilbert-base-uncased-distilled-squad")
        return qa
    except Exception as e:
        logger.warning(f"Failed to load distilbert-base-uncased-distilled-squad: {e}. Falling back to roberta-base-squad2.")

    try:
        logger.info("Loading fallback QA model: deepset/roberta-base-squad2")
        qa = _load("deepset/roberta-base-squad2")
        logger.info("Successfully loaded deepset/roberta-base-squad2")
        return qa
    except Exception as e:
        logger.error(f"Error loading fallback QA model: {e}")

    def simple_qa(question, context):
        # Last-resort stub keeping the pipeline call signature.
        return {
            "answer": "I'm sorry, the QA model couldn't be loaded. Please try again later.",
            "score": 0.0
        }
    return simple_qa
|
| 284 |
+
|
| 285 |
+
def load_pdf_text(pdf_path):
    """Extract whitespace-normalized text from a PDF, with a JSON disk cache.

    Uses PyMuPDF or PyPDF2 depending on the module-level PDF_BACKEND.
    Returns the extracted text, or a human-readable error string on failure
    (callers treat the return value as plain text either way).
    """
    if not os.path.exists(pdf_path):
        logger.error(f"PDF file not found: {pdf_path}")
        return "PDF file not found. Please check the file path."

    # Cache keyed by basename only — assumes PDFs are immutable once deployed.
    cache_path = os.path.join(MODEL_DIR, f"{os.path.basename(pdf_path)}.cache.json")
    if os.path.exists(cache_path):
        try:
            logger.info(f"Loading PDF content from cache: {cache_path}")
            with open(cache_path, 'r', encoding='utf-8') as f:
                cache_data = json.load(f)
            return cache_data.get("text", "")
        except Exception as e:
            # Corrupt cache is non-fatal; fall through to re-extract.
            logger.warning(f"Error loading cache: {str(e)}")

    try:
        if PDF_BACKEND == "pymupdf":
            logger.info(f"Loading PDF with PyMuPDF: {pdf_path}")
            doc = fitz.open(pdf_path)
            try:
                raw_text = "\n".join(page.get_text() for page in doc)
            finally:
                # Close even if extraction raises, to avoid leaking the handle.
                doc.close()
        elif PDF_BACKEND == "pypdf2":
            logger.info(f"Loading PDF with PyPDF2: {pdf_path}")
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # extract_text() may return None for pages without a text layer;
                # joining None would raise TypeError, so coerce to "".
                raw_text = "\n".join((page.extract_text() or "") for page in reader.pages)
        else:
            logger.error("No PDF backend available")
            return "No PDF processing library is installed."

        # Collapse all runs of whitespace (including newlines) to single spaces.
        clean_text = " ".join(raw_text.split())
        if not clean_text:
            logger.warning(f"Extracted empty text from PDF: {pdf_path}")
            return "No readable text found in the PDF."

        try:
            with open(cache_path, 'w', encoding='utf-8') as f:
                json.dump({"text": clean_text, "timestamp": time.time()}, f)
        except Exception as e:
            # Cache write failure is non-fatal; extraction already succeeded.
            logger.warning(f"Could not write PDF cache: {str(e)}")

        return clean_text
    except Exception as e:
        logger.error(f"Error loading PDF: {str(e)}")
        return f"Error loading PDF: {str(e)}"
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
def load_passages_from_path(path_setting):
    """Load and chunk PDF content from all available sources.

    Sources checked (in order):
    1. Root PDF: Sahayak_Organisation_Expanded_Info.pdf
    2. Directory: data/pdfs (all PDFs inside)
    3. Custom path from path_setting

    Returns the combined list of passage chunks, or a one-passage fallback
    when no PDF could be loaded.
    """
    all_passages = []
    sources_loaded = []

    def _ingest(pdf_path, label):
        # Extract, chunk, and record one PDF; returns the new chunks.
        chunks = split_into_chunks(load_pdf_text(pdf_path))
        all_passages.extend(chunks)
        sources_loaded.append(f"{label} ({len(chunks)} passages)")
        return chunks

    # Always try to load the root PDF first.
    root_pdf = "Sahayak_Organisation_Expanded_Info.pdf"
    if os.path.isfile(root_pdf):
        chunks = _ingest(root_pdf, root_pdf)
        logger.info(f"Loaded {len(chunks)} passages from root PDF: {root_pdf}")

    # Load everything in data/pdfs, skipping a duplicate of the root PDF.
    pdf_dir = "data/pdfs"
    if os.path.isdir(pdf_dir):
        for name in sorted(os.listdir(pdf_dir)):
            if not name.lower().endswith(".pdf"):
                continue
            if name == root_pdf:
                continue
            _ingest(os.path.join(pdf_dir, name), name)
        logger.info(f"Loaded PDFs from directory: {pdf_dir}")

    # Also honour a custom path when it differs from the defaults.
    target = path_setting or ""
    if target and target not in [root_pdf, pdf_dir]:
        if os.path.isdir(target):
            for name in sorted(os.listdir(target)):
                if name.lower().endswith(".pdf"):
                    _ingest(os.path.join(target, name), name)
        elif os.path.isfile(target):
            _ingest(target, target)

    if all_passages:
        logger.info(f"Total knowledge base: {len(all_passages)} passages from {len(sources_loaded)} sources")
        logger.info(f"Sources: {sources_loaded}")
        return all_passages

    logger.error("No PDF sources found. Using fallback.")
    return ["Sahayak is a non-profit organization dedicated to providing support and community development."]
|
| 396 |
+
|
| 397 |
+
def split_into_chunks(text, max_length=200, min_length=50):
    """Split text into chunks based on thematic sections and sentence boundaries.

    Sections are detected by "Title:" headings, then split on blank lines;
    over-long paragraphs are re-packed sentence by sentence. Chunks shorter
    than min_length are discarded. Always returns a non-empty list of strings.
    """
    if not isinstance(text, str):
        logger.error(f"Invalid input to split_into_chunks: {text}")
        return ["Invalid input"]
    try:
        # Lookahead split keeps each "Capitalized Heading:" with its section.
        sections = re.split(r'(?=\b[A-Z][a-zA-Z\s]+:)', text)
        chunks = []

        for raw_section in sections:
            section = raw_section.strip()
            if not section:
                continue
            for raw_paragraph in section.split('\n\n'):
                paragraph = raw_paragraph.strip()
                if not paragraph:
                    continue
                size = len(paragraph)
                if size < min_length:
                    # Too small to be a useful passage on its own.
                    continue
                if size <= max_length:
                    chunks.append(paragraph)
                    continue
                # Over-long paragraph: greedily pack sentences up to max_length.
                sentences = nltk.sent_tokenize(paragraph)
                buffer = ""
                for sentence in sentences:
                    if len(buffer) + len(sentence) <= max_length:
                        buffer += " " + sentence
                    else:
                        if len(buffer) >= min_length:
                            chunks.append(buffer.strip())
                        buffer = sentence
                if buffer and len(buffer) >= min_length:
                    chunks.append(buffer.strip())

        if not chunks:
            logger.warning("No chunks created from text")
            chunks = ["No content available"]

        return chunks
    except Exception as e:
        logger.error(f"Error splitting text into chunks: {str(e)}")
        return ["Error processing text content"]
|
| 440 |
+
|
| 441 |
+
def build_faiss_index(passages):
    """Embed the passages and index them in a flat L2 FAISS index.

    Returns (index, embeddings). On any failure a random-vector index of the
    same size is returned so retrieval keeps functioning in degraded mode.
    """
    try:
        logger.info(f"Building FAISS index with passages: {passages[:2]}... (total: {len(passages)})")
        vectors = get_embedder().encode(passages)

        index = faiss.IndexFlatL2(vectors.shape[1])
        index.add(np.array(vectors).astype('float32'))
        return index, vectors
    except Exception as e:
        logger.error(f"Error building FAISS index: {str(e)}")
        # Degraded mode: 768-dim noise vectors keep the app responsive.
        fallback = np.random.randn(len(passages), 768).astype('float32')
        index = faiss.IndexFlatL2(768)
        index.add(fallback)
        return index, fallback
|
| 460 |
+
|
| 461 |
+
def retrieve_relevant_passages(query, passages, vector_index, embeddings, top_k=10):
    """Retrieve the most relevant passages using pure semantic similarity.

    Over-fetches candidates from the FAISS index, converts L2 distance into a
    similarity score, and returns the top_k passages ranked by that score.
    Falls back to a random sample of up to 3 passages on error.
    """
    try:
        qvec = get_embedder().encode([query])[0].reshape(1, -1).astype('float32')

        # Fetch more candidates than needed, then re-rank.
        num_candidates = min(top_k * 3, len(passages))
        distances, indices = vector_index.search(qvec, num_candidates)

        # smaller L2 distance -> larger similarity
        scored = [
            (1.0 / (1.0 + dist), passages[idx])
            for idx, dist in zip(indices[0], distances[0])
            if idx < len(passages)
        ]
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return [text for _, text in scored[:top_k]]
    except Exception as e:
        logger.error(f"Error retrieving passages: {str(e)}")
        import random
        return random.sample(passages, min(3, len(passages)))
|
| 492 |
+
|
| 493 |
+
def detect_and_translate(query):
    """Detect the query language and translate it to English when needed.

    Returns (english_query, detected_lang); on any translator error the
    original query is returned tagged as English.
    """
    try:
        lang = translator.detect(query).lang
        if lang == "en":
            english = query
        else:
            english = translator.translate(query, src=lang, dest="en").text
        return english, lang
    except Exception as e:
        logger.warning(f"Translation error: {str(e)}. Using original query.")
        return query, "en"
|
| 505 |
+
|
| 506 |
+
|
| 507 |
+
def translate_text(text, target_lang):
    """Translate English text into target_lang; returns the input unchanged
    for empty text, missing language, English, or translator failure."""
    needs_translation = bool(text) and bool(target_lang) and target_lang != "en"
    if needs_translation:
        try:
            return translator.translate(text, src="en", dest=target_lang).text
        except Exception as e:
            logger.warning(f"Answer translation failed: {e}. Returning English text.")
    return text
|
| 516 |
+
|
| 517 |
+
|
| 518 |
+
def moderate_query(query):
    """Lightweight safety filter to avoid harmful or off-policy content.

    Case-insensitive substring screen; returns {"status": "blocked", "message": ...}
    on a hit, otherwise {"status": "ok"}.
    """
    blocked_topics = [
        "violence", "weapon", "harm myself", "self-harm", "suicide", "attack",
        "explosive", "bomb", "terror", "hate", "racist", "sex", "explicit",
        "drugs", "narcotic", "illegal"
    ]
    q = query.lower()
    for term in blocked_topics:
        if term in q:
            return {
                "status": "blocked",
                "message": ("I'm here to provide supportive, lawful information. "
                            "I can't help with that topic. If you need wellbeing support, consider contacting a local helpline.")
            }
    return {"status": "ok"}
|
| 533 |
+
|
| 534 |
+
|
| 535 |
+
def is_in_scope(query):
    """Keep answers aligned to community service, government schemes, and Sahayak context.

    Covers Sahayak topics, welfare schemes, social services, education and
    health programs, disaster relief, and citizens' rights. Short queries
    (five words or fewer) are always considered in scope; longer queries must
    contain at least one scope keyword (case-insensitive substring match).
    """
    # Short queries are likely on-topic.
    if len(query.split()) <= 5:
        return True

    scope_keywords = (
        # Sahayak specific
        "sahayak", "vexa", "ngo", "non profit", "nonprofit",
        # Community and social
        "community", "benefit", "volunteer", "social", "help", "support", "welfare",
        "charity", "donation", "outreach", "initiative", "campaign",
        # Education
        "education", "school", "scholarship", "student", "learning", "training",
        "skill", "literacy", "academic", "mentorship",
        # Health
        "health", "medical", "hospital", "treatment", "medicine", "healthcare",
        "disability", "disabled", "blind", "specially abled",
        # Government schemes
        "scheme", "yojana", "government", "subsidy", "pension", "ration",
        "aadhar", "aadhaar", "pan", "voter", "certificate", "document",
        # Services
        "service", "programme", "program", "project", "event", "activity",
        # Locations
        "belgaum", "bangalore", "karnataka", "india",
        # Disaster and emergency
        "disaster", "relief", "emergency", "flood", "earthquake",
        # Rights and legal
        "rights", "entitlement", "eligibility", "apply", "registration",
        # General queries that should be answered
        "who", "what", "when", "where", "how", "why", "tell", "explain",
        "founder", "team", "member", "leader", "president", "secretary",
        "mission", "vision", "goal", "objective", "aim", "purpose",
    )
    lowered = query.lower()
    for keyword in scope_keywords:
        if keyword in lowered:
            return True
    return False
|
| 579 |
+
|
| 580 |
+
|
| 581 |
+
def scope_redirect_message(lang):
    """Return a polite off-topic redirect, localized to `lang` when possible."""
    return translate_text(
        "I focus on Sahayak and community-benefit topics. "
        "Please ask about our programmes, volunteering, events, or social impact.",
        lang,
    )
|
| 587 |
+
|
| 588 |
+
|
| 589 |
+
# =============================================================================
# SEMANTIC UNDERSTANDING ENGINE - Expert-Level Context Analysis
# =============================================================================

# Intent templates for semantic matching (these will be encoded for similarity)
# Each key is an intent name; its example phrasings are embedded once (see
# get_intent_embeddings) and queries are matched by cosine similarity.
INTENT_TEMPLATES = {
    "about_organization": [
        "What is Sahayak?",
        "Tell me about Sahayak organization",
        "Describe Sahayak NGO",
        "What does Sahayak do?",
        "Explain Sahayak organization"
    ],
    "founder_leadership": [
        "Who founded Sahayak?",
        "Who is the founder of Sahayak?",
        "Who started Sahayak?",
        "Tell me about the founder",
        "Who is Verril Vaz?"
    ],
    "president_leadership": [
        "Who is the president?",
        "Who leads Sahayak?",
        "Who is the current president?",
        "Tell me about the leadership team"
    ],
    "team_members": [
        "Who are the team members?",
        "How many members does Sahayak have?",
        "Tell me about the team",
        "Who works at Sahayak?"
    ],
    "mission_vision": [
        "What is the mission of Sahayak?",
        "What is the vision?",
        "What are the goals?",
        "What are the objectives?",
        "What does Sahayak aim to achieve?",
        "What is the purpose of Sahayak?"
    ],
    "initiatives_programs": [
        "What programs does Sahayak run?",
        "What are Sahayak's initiatives?",
        "Tell me about the activities",
        "What projects has Sahayak done?",
        "What kind of work does Sahayak do?"
    ],
    "recent_events": [
        "What are the recent events?",
        "What activities have been conducted recently?",
        "What has Sahayak done recently?",
        "Tell me about recent visits"
    ],
    "location_branches": [
        "Where is Sahayak located?",
        "How many branches does Sahayak have?",
        "Where does Sahayak operate?",
        "What cities does Sahayak work in?"
    ],
    "join_volunteer": [
        "How can I join Sahayak?",
        "How to volunteer?",
        "How to become a member?",
        "How can I help?",
        "How to get involved?"
    ],
    "donate_support": [
        "How to donate?",
        "How can I support Sahayak?",
        "How to contribute?",
        "Where can I donate?"
    ],
    "contact_info": [
        "How to contact Sahayak?",
        "What is the phone number?",
        "What is the email?",
        "How to reach Sahayak?"
    ],
    "founding_date": [
        "When was Sahayak founded?",
        "When did Sahayak start?",
        "What is the founding date?",
        "How old is Sahayak?"
    ],
    "greeting": [
        "Hello",
        "Hi",
        "Hey",
        "Good morning",
        "Namaste"
    ],
    "thanks": [
        "Thank you",
        "Thanks",
        "Thanks a lot",
        "Thank you so much"
    ],
    "affirmative": [
        "Yes",
        "Sure",
        "Ok",
        "Okay",
        "Tell me more",
        "Continue"
    ]
}

# Cache for intent embeddings
# Populated lazily by get_intent_embeddings(); maps intent name ->
# {"templates", "embeddings", "centroid"}.
_intent_embeddings_cache = {}
|
| 698 |
+
|
| 699 |
+
def get_intent_embeddings():
    """Compute (once) and return per-intent template embeddings.

    Lazily fills the module-level _intent_embeddings_cache with, per intent:
    the template strings, their embedding matrix, and the mean (centroid)
    vector. Returns the cache dict, possibly empty on failure.
    """
    global _intent_embeddings_cache

    if _intent_embeddings_cache:
        return _intent_embeddings_cache

    try:
        embedder = get_embedder()
        for intent, templates in INTENT_TEMPLATES.items():
            vecs = embedder.encode(templates)
            _intent_embeddings_cache[intent] = {
                "templates": templates,
                "embeddings": vecs,
                # Mean template vector summarizing the whole intent.
                "centroid": np.mean(vecs, axis=0)
            }
        logger.info(f"Intent embeddings computed for {len(_intent_embeddings_cache)} intents")
    except Exception as e:
        logger.error(f"Error computing intent embeddings: {e}")

    return _intent_embeddings_cache
|
| 721 |
+
|
| 722 |
+
def semantic_intent_classification(query):
    """Classify user intent via embedding cosine similarity.

    Compares the query embedding against every intent template embedding and
    returns (best_intent, best_score). Returns ("general", 0.0) when intent
    embeddings are unavailable or on any error.
    """
    try:
        embedder = get_embedder()
        query_embedding = embedder.encode([query])[0]

        intent_embeddings = get_intent_embeddings()
        if not intent_embeddings:
            return "general", 0.0

        # Hoisted: the query norm is loop-invariant.
        query_norm = np.linalg.norm(query_embedding)

        best_intent = "general"
        best_score = 0.0
        best_template = ""

        for intent, data in intent_embeddings.items():
            # NOTE: a per-intent centroid similarity used to be computed here
            # but its value was never read; the dead computation was removed.
            # Only the best individual template match decides the intent.
            for i, template_emb in enumerate(data["embeddings"]):
                template_similarity = np.dot(query_embedding, template_emb) / (
                    query_norm * np.linalg.norm(template_emb)
                )
                if template_similarity > best_score:
                    best_score = template_similarity
                    best_intent = intent
                    best_template = data["templates"][i]

        logger.info(f"Semantic intent: {best_intent} (score: {best_score:.3f}, matched: '{best_template}')")
        return best_intent, best_score

    except Exception as e:
        logger.error(f"Intent classification error: {e}")
        return "general", 0.0
|
| 762 |
+
|
| 763 |
+
|
| 764 |
+
def semantic_passage_ranking(query, passages, top_k=5):
    """Rank passages by cosine similarity to the query.

    Returns a list of (passage, score) pairs, highest similarity first.
    On error returns the first top_k passages with a neutral 0.5 score.
    """
    try:
        embedder = get_embedder()
        qvec = embedder.encode([query])[0]
        pvecs = embedder.encode(passages)
        qnorm = np.linalg.norm(qvec)

        scored = []
        for passage, pvec in zip(passages, pvecs):
            cosine = np.dot(qvec, pvec) / (qnorm * np.linalg.norm(pvec))
            scored.append((cosine, passage))

        # Highest similarity first; stable sort keeps original order on ties.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return [(passage, score) for score, passage in scored[:top_k]]

    except Exception as e:
        logger.error(f"Passage ranking error: {e}")
        return [(p, 0.5) for p in passages[:top_k]]
|
| 790 |
+
|
| 791 |
+
|
| 792 |
+
def analyze_query_complexity(query):
    """Analyze query complexity to determine response strategy.

    Returns a dict of signals (word_count, is_question, has_multiple_parts,
    is_comparison, is_list_request, is_explanation, is_specific) plus:
      - "score": accumulated complexity points
      - "level": "simple" (score < 2), "moderate" (< 4) or "complex"
    """
    words = query.split()
    lowered = query.lower()
    lowered_words = set(lowered.split())
    complexity = {
        "word_count": len(words),
        # Fixed: the original checked ['?', '?'] — a duplicated element.
        "is_question": query.strip().endswith('?'),
        # Fixed: conjunctions are now matched as whole words; the previous
        # substring test fired on "or" inside "for"/"organization" and on
        # "and" inside "brand".
        "has_multiple_parts": ',' in query or bool(lowered_words & {'and', 'or', 'also'}),
        "is_comparison": any(w in lowered for w in ['compare', 'difference', 'between', 'vs', 'versus']),
        "is_list_request": any(w in lowered for w in ['list', 'all', 'every', 'each', 'various']),
        "is_explanation": any(w in lowered for w in ['why', 'how', 'explain', 'describe', 'elaborate']),
        "is_specific": any(w in lowered for w in ['specific', 'exactly', 'particular', 'precise'])
    }

    # Accumulate complexity points from the individual signals.
    score = 0
    if complexity["word_count"] > 10:
        score += 1
    if complexity["has_multiple_parts"]:
        score += 1
    if complexity["is_comparison"]:
        score += 2
    if complexity["is_list_request"]:
        score += 1
    if complexity["is_explanation"]:
        score += 1

    complexity["score"] = score
    complexity["level"] = "simple" if score < 2 else ("moderate" if score < 4 else "complex")

    return complexity
|
| 824 |
+
|
| 825 |
+
|
| 826 |
+
# =============================================================================
|
| 827 |
+
# EXPERT REASONING ENGINE - Chain of Thought for Better Responses
|
| 828 |
+
# =============================================================================
|
| 829 |
+
|
| 830 |
+
def extract_key_entities(text):
    """Extract key entities like names, dates, places, and numbers from text.

    Returns a dict of lists keyed by entity category; categories with no
    matches map to empty lists. Matching is regex-based and (except for
    years) case-insensitive.
    """
    entities = {}
    entities["dates"] = re.findall(
        r'\b\d{1,2}(?:st|nd|rd|th)?\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b',
        text, re.IGNORECASE)
    # Four-digit years 1900-2099 (case has no effect on digits).
    entities["years"] = re.findall(r'\b(?:19|20)\d{2}\b', text)
    entities["numbers"] = re.findall(
        r'\b\d+(?:\.\d+)?\s*(?:members|people|students|children|rupees|rs|lakh|crore|percent|%)\b',
        text, re.IGNORECASE)
    entities["places"] = re.findall(
        r'\b(?:Belgaum|Bangalore|Karnataka|India|Delhi|Mumbai)\b', text, re.IGNORECASE)
    entities["organizations"] = re.findall(
        r'\b(?:Sahayak|Sparsh Foundation|Maheshwari School|Anand Yatri|Balika Adarsh Vidyalaya)\b',
        text, re.IGNORECASE)
    return entities
|
| 840 |
+
|
| 841 |
+
|
| 842 |
+
def calculate_passage_relevance(passage, query):
    """Fraction of the query's (lowercased) words that also occur in the passage.

    Returns 0.0 for an empty query.
    """
    query_words = set(query.lower().split())
    if not query_words:
        return 0.0
    passage_words = set(passage.lower().split())
    return len(query_words & passage_words) / len(query_words)
|
| 850 |
+
|
| 851 |
+
|
| 852 |
+
def expert_reasoning_chain(query, passages, qa_result):
    """Build a reasoning summary for a query.

    Combines semantic intent classification, evidence gathered from the
    passages (semantic ranking with a lexical-overlap fallback), and a
    blended confidence estimate.

    Returns a dict with keys: intent, evidence, confidence,
    confidence_level, reasoning_steps.
    """
    reasoning_steps = []

    # --- Step 1: semantic intent (embedding-based, not keyword matching) ---
    intent, intent_confidence = semantic_intent_classification(query)

    # Map fine-grained semantic intents onto coarse response intents.
    intent_mapping = {
        "about_organization": "definitional",
        "founder_leadership": "person",
        "president_leadership": "person",
        "team_members": "person",
        "mission_vision": "definitional",
        "initiatives_programs": "information",
        "recent_events": "temporal",
        "location_branches": "location",
        "join_volunteer": "procedural",
        "donate_support": "procedural",
        "contact_info": "information",
        "founding_date": "temporal",
        "greeting": "greeting",
        "thanks": "greeting",
        "affirmative": "confirmation",
        "general": "information"
    }

    response_intent = intent_mapping.get(intent, "information")
    reasoning_steps.append(f"Semantic Intent: {intent} -> {response_intent} (confidence: {intent_confidence:.2f})")

    # --- Step 2: evidence gathering ---
    evidence_pieces = []
    try:
        for passage, score in semantic_passage_ranking(query, passages[:10], top_k=5):
            if score <= 0.25:  # below the semantic similarity threshold
                continue
            evidence_pieces.append({
                "passage": passage[:300],
                "semantic_score": score,
                "entities": extract_key_entities(passage)
            })
    except Exception:
        # Fallback: crude lexical-overlap relevance over the first passages.
        for passage in passages[:5]:
            relevance = calculate_passage_relevance(passage, query)
            if relevance > 0.1:
                evidence_pieces.append({
                    "passage": passage[:300],
                    "semantic_score": relevance,
                    "entities": extract_key_entities(passage)
                })

    reasoning_steps.append(f"Semantic evidence pieces: {len(evidence_pieces)}")

    # --- Step 3: blended multi-factor confidence ---
    base_confidence = qa_result.get("score", 0.0)
    intent_boost = intent_confidence * 0.2          # intent certainty contributes a little
    evidence_boost = min(len(evidence_pieces) * 0.1, 0.3)  # capped evidence bonus
    final_confidence = min(base_confidence + intent_boost + evidence_boost, 1.0)

    if final_confidence > 0.7:
        confidence_level = "high"
    elif final_confidence > 0.4:
        confidence_level = "medium"
    else:
        confidence_level = "low"

    reasoning_steps.append(f"Confidence: {confidence_level} ({final_confidence:.2f})")

    return {
        "intent": intent,
        "evidence": evidence_pieces,
        "confidence": final_confidence,
        "confidence_level": confidence_level,
        "reasoning_steps": reasoning_steps
    }
|
| 937 |
+
|
| 938 |
+
|
| 939 |
+
def clean_answer_fragment(answer, context_passages):
    """
    Expand a fragmentary QA-model answer into the full sentence it came from.

    The extractive QA model often returns partial phrases (e.g. "the founder").
    When the answer looks incomplete — starts lowercase, starts with a common
    fragment prefix, or is very short without terminal punctuation — locate it
    inside the top context passages and return the whole containing sentence.

    Args:
        answer: Raw answer string from the QA model (may be None/empty).
        context_passages: Ranked context passages to search for the fragment.

    Returns:
        The full sentence containing the fragment when one is found, the
        stripped original answer otherwise, or None for an empty answer.
    """
    if not answer:
        return None

    answer = answer.strip()

    # Prefixes that signal the model cut a sentence mid-phrase.
    fragment_prefixes = (
        "the ", "a ", "an ", "is ", "are ", "was ", "were ", "has ", "have ",
        "it ", "its ", "their ", "this ", "that ", "these ", "those ",
        "and ", "or ", "but ", "with ", "for ", "to ", "in ", "on ", "at ",
    )

    def sentence_containing(needle, case_sensitive):
        # Scan the first three passages for a sentence containing `needle`.
        for passage in context_passages[:3]:
            haystack = passage if case_sensitive else passage.lower()
            probe = needle if case_sensitive else needle.lower()
            if probe not in haystack:
                continue
            for sent in re.split(r'(?<=[.!?])\s+', passage):
                body = sent if case_sensitive else sent.lower()
                if probe in body:
                    return sent.strip()
        return None

    looks_incomplete = bool(answer) and (
        answer[0].islower() or answer.lower().startswith(fragment_prefixes)
    )
    if looks_incomplete:
        expanded = sentence_containing(answer, case_sensitive=True)
        if expanded is not None:
            return expanded

    # Very short answers with no terminal punctuation are likely fragments too;
    # retry with a case-insensitive search.
    if len(answer.split()) < 4 and not answer.endswith(('.', '!', '?')):
        expanded = sentence_containing(answer, case_sensitive=False)
        if expanded is not None:
            return expanded

    return answer
|
| 974 |
+
|
| 975 |
+
|
| 976 |
+
def synthesize_answer_from_passages(query, passages, qa_answer, intent):
    """
    Synthesize a complete, coherent answer from retrieved passages.

    Routes by detected intent: each intent filters the top passages with
    intent-specific keywords and stitches the matching sentences together.
    Falls back to the cleaned QA answer, then to the opening sentences of
    the best passage.

    Args:
        query: The user's question (kept for interface compatibility).
        passages: Ranked context passages.
        qa_answer: Raw answer from the QA model, used as a fallback.
        intent: Classified query intent (e.g. "person", "location").

    Returns:
        A synthesized answer string.
    """

    def collect_sentences(passage_filter, sentence_filter, max_sents):
        # Gather sentences from the top-5 passages that pass both filters;
        # return the first `max_sents` joined, or None when nothing matched.
        picked = []
        for passage in passages[:5]:
            if not passage_filter(passage):
                continue
            for sent in re.split(r'(?<=[.!?])\s+', passage):
                if sentence_filter(sent):
                    picked.append(sent.strip())
        if picked:
            return " ".join(picked[:max_sents])
        return None

    if intent == "person":
        leadership_terms = ("founder", "president", "leader", "team", "member", "secretary")
        name_terms = ("founder", "president", "verril", "vaz", "leader", "team")
        result = collect_sentences(
            lambda p: any(t in p.lower() for t in leadership_terms),
            lambda s: any(t in s.lower() for t in name_terms),
            3,
        )
        if result:
            return result

    elif intent == "definitional":
        result = collect_sentences(
            lambda p: "sahayak" in p.lower(),
            lambda s: "sahayak" in s.lower() and len(s) > 30,
            3,
        )
        if result:
            return result

    elif intent == "location":
        broad_places = ("belgaum", "bangalore", "karnataka", "india", "location", "branch")
        narrow_places = ("belgaum", "bangalore", "operates", "location", "branch")
        result = collect_sentences(
            lambda p: any(t in p.lower() for t in broad_places),
            lambda s: any(t in s.lower() for t in narrow_places),
            3,
        )
        if result:
            return result

    elif intent == "procedural":
        action_terms = ("how", "step", "process", "register", "join",
                        "volunteer", "donate", "contact")
        result = collect_sentences(
            lambda p: any(t in p.lower() for t in action_terms),
            lambda s: len(s) > 20,
            4,
        )
        if result:
            return result

    # Default path: prefer the cleaned QA answer if it is substantial.
    cleaned = clean_answer_fragment(qa_answer, passages)
    if cleaned and len(cleaned) > 20:
        return cleaned

    # Last resort: the opening sentences of the best passage.
    if passages:
        opening = re.split(r'(?<=[.!?])\s+', passages[0])
        return " ".join(opening[:3])

    return qa_answer
|
| 1043 |
+
|
| 1044 |
+
|
| 1045 |
+
def build_expert_response(qa_answer, reasoning, query, passages):
    """
    Build a comprehensive, well-formatted expert response.

    Sections, in order:
      1. Intent-specific header plus the main (synthesized, normalized) answer.
      2. "Key Details" bullet list distilled from the reasoning evidence —
         emitted only when at least one qualifying detail exists (bug fix:
         previously the header could appear followed by an empty list).
      3. Contact information, when the query is action-oriented and a phone
         number or website is found in the passages.
      4. An intent-aware follow-up prompt.

    Args:
        qa_answer: Raw answer string from the QA model (may be a fragment).
        reasoning: Dict produced by the reasoning chain; reads "intent" and
            "evidence" (list of {"passage": ...} dicts, possibly empty/None).
        query: The user's original question.
        passages: Ranked context passages backing the answer.

    Returns:
        A markdown-formatted response string.
    """
    intent = reasoning["intent"]
    evidence = reasoning["evidence"]

    # Synthesize a complete answer from fragments, then run a final cleanup pass.
    synthesized_answer = synthesize_answer_from_passages(query, passages, qa_answer, intent)
    if synthesized_answer:
        synthesized_answer = clean_answer_fragment(synthesized_answer, passages) or synthesized_answer

    response_parts = []

    # --- Section 1: main answer under an intent-specific header ---
    intent_headers = {
        "person": "👤 **About the Person/Team**",
        "temporal": "📅 **Timeline & Dates**",
        "location": "📍 **Location Information**",
        "quantitative": "📊 **Facts & Figures**",
        "procedural": "📋 **How It Works**",
        "explanatory": "💡 **Explanation**",
        "definitional": "📖 **Overview**",
        "information": "ℹ️ **Information**"
    }
    header = intent_headers.get(intent, intent_headers["information"])
    response_parts.append(f"{header}\n\n")

    if synthesized_answer and len(synthesized_answer) > 10:
        # Normalize: capitalize the first letter and ensure terminal punctuation.
        answer_text = synthesized_answer.strip()
        if answer_text and answer_text[0].islower():
            answer_text = answer_text[0].upper() + answer_text[1:]
        if not answer_text.endswith(('.', '!', '?')):
            answer_text += "."
        response_parts.append(f"{answer_text}\n")
    else:
        response_parts.append(f"{qa_answer}\n")

    # --- Section 2: key details extracted from evidence ---
    # Collect qualifying bullets first so the header is only emitted when at
    # least one detail exists.
    detail_lines = []
    added_details = set()
    for ev in (evidence or [])[:3]:
        sentences = re.split(r'(?<=[.!?])\s+', ev["passage"])
        for sent in sentences[:2]:
            sent = sent.strip()
            # Only keep substantial, non-duplicate sentences.
            if len(sent) > 30 and sent not in added_details:
                if sent[0].islower():
                    sent = sent[0].upper() + sent[1:]
                if not sent.endswith(('.', '!', '?')):
                    sent += "."
                detail_lines.append(f"• {sent}\n")
                added_details.add(sent)
                if len(added_details) >= 3:
                    break
        if len(added_details) >= 3:
            break
    if detail_lines:
        response_parts.append("\n**Key Details:**\n")
        response_parts.extend(detail_lines)

    # --- Section 3: contact/action info when relevant ---
    if intent == "procedural" or any(word in query.lower() for word in ["join", "contact", "call", "volunteer", "donate"]):
        for passage in passages[:5]:
            lowered = passage.lower()
            if "phone" in lowered or "contact" in lowered or "website" in lowered:
                phone_match = re.search(r'\+?\d{2,3}[-\s]?\d{3}[-\s]?\d{3}[-\s]?\d{4}', passage)
                website_match = re.search(r'https?://[\w\./]+', passage)
                if phone_match or website_match:
                    response_parts.append("\n**📞 Contact Information:**\n")
                    if phone_match:
                        response_parts.append(f"• Phone: {phone_match.group()}\n")
                    if website_match:
                        response_parts.append(f"• Website: {website_match.group()}\n")
                    break

    # --- Section 4: context-aware follow-up suggestion ---
    follow_ups = {
        "person": "\n💬 *Would you like to know about specific team members, their roles, or how to contact them?*",
        "temporal": "\n💬 *Would you like to know about upcoming events or past milestones?*",
        "location": "\n💬 *Would you like directions or information about activities at a specific location?*",
        "quantitative": "\n💬 *Would you like more details about Sahayak's impact and achievements?*",
        "procedural": "\n💬 *Need help with the next steps? I can provide more detailed guidance.*",
        "explanatory": "\n💬 *Would you like me to explain any specific aspect in more detail?*",
        "definitional": "\n💬 *Would you like to learn about specific programs, events, or initiatives?*",
        "information": "\n💬 *What else would you like to know about Sahayak Organization?*"
    }
    response_parts.append(follow_ups.get(intent, follow_ups["information"]))

    return "".join(response_parts)
|
| 1149 |
+
|
| 1150 |
+
|
| 1151 |
+
def append_context_to_answer(answer, query, passages):
    """
    Upgrade a raw QA answer into a fully formatted expert response.

    Runs the expert reasoning chain over the retrieved passages and delegates
    formatting to build_expert_response. Returns the answer unchanged when no
    passages are available.
    """
    if not passages:
        return answer

    # Seed the reasoning chain with a neutral default confidence score.
    stub_result = {"answer": answer, "score": 0.5}
    analysis = expert_reasoning_chain(query, passages, stub_result)

    logger.info(f"Expert reasoning: {analysis['reasoning_steps']}")

    return build_expert_response(answer, analysis, query, passages)
|
| 1163 |
+
|
| 1164 |
+
|
| 1165 |
+
# =============================================================================
|
| 1166 |
+
# MAIN ANSWER FUNCTION
|
| 1167 |
+
# =============================================================================
|
| 1168 |
+
|
| 1169 |
+
def answer_question(query, passages, vector_index, embeddings):
    """Answer a question with expert-level reasoning, retrieval, and detailed responses"""
    try:
        # Curated, hand-written responses for common queries. Matched below in
        # two ways: (a) via semantic intent classification routed through
        # `intent_to_response`, and (b) via embedding cosine similarity between
        # the user's query and these dict keys.
        meta_responses = {
            "hi": "Hello! I'm Vexa, your assistant for Sahayak Organization.\n\n"
                  "Sahayak is a non-profit dedicated to making a positive impact in the community by supporting underprivileged groups.\n"
                  "I can help you learn more about our mission, goals, leadership, and activities.\n"
                  "What would you like to know about Sahayak Organization today?",
            "hello": "Hi there! I'm Vexa, here to assist you with information about Sahayak Organization.\n\n"
                     "We are a non-profit focused on addressing social issues and empowering communities through education and support.\n"
                     "I can provide details about our vision, projects, or team members.\n"
                     "How can I help you today?",
            "hey": "Hey! I'm Vexa, your guide to Sahayak Organization.\n\n"
                   "Sahayak works to support those in need, focusing on education, inclusion, and community development.\n"
                   "I'm here to answer any questions you have about our work, leadership, or goals.\n"
                   "What would you like to explore?",
            "explain me about sahayak": "Sahayak is a non-profit organization committed to providing support and assistance to those in need.\n\n"
                                        "Our primary focus is on addressing various social issues, such as education, health, and community development.\n"
                                        "We aim to make a lasting positive impact by empowering underprivileged communities through dedicated programs.\n"
                                        "Additionally, Sahayak collaborates with local leaders and volunteers to ensure our initiatives are effective.\n"
                                        "Would you like to know more about our specific projects or leadership team?",
            "tell me about sahayak": "Sahayak Organization is a non-profit dedicated to helping those in need across various communities.\n\n"
                                     "We focus on tackling social issues like lack of access to education, healthcare challenges, and economic disparities.\n"
                                     "Our mission is to empower underprivileged groups by providing resources, support, and opportunities for growth.\n"
                                     "Sahayak operates in multiple locations, including Belgaum and Bangalore, to maximize our impact.\n"
                                     "If you'd like, I can share more about our goals or the team behind our efforts!",
            "brief me about sahayak": "Sahayak is a non-profit organization devoted to supporting underprivileged communities.\n\n"
                                      "Our work centers on addressing key social issues, such as education, health, and social inclusion.\n"
                                      "We strive to create a positive impact by offering resources and programs that empower individuals and families.\n"
                                      "With operations in places like Belgaum and Bangalore, we aim to reach as many people as possible.\n"
                                      "Let me know if you'd like to dive deeper into our mission or activities!",
            "goals of sahayak": "The goals of Sahayak Organization are centered on creating a better future for underprivileged communities.\n\n"
                                "We aim to provide support by addressing key challenges like access to education and healthcare.\n"
                                "Another goal is to raise awareness about social issues that affect marginalized groups.\n"
                                "Additionally, we promote initiatives that improve health and education outcomes for children and families.\n"
                                "Would you like to learn more about how we achieve these goals through our programs?",
            "vision": "Sahayak Organisation envisions a society where education is accessible to every child, regardless of their socio-economic background.\n\n"
                      "We believe that education is a fundamental right that can transform lives and uplift entire communities.\n"
                      "Our vision drives us to work tirelessly to remove barriers and create opportunities for learning.\n"
                      "By doing so, we hope to build a more equitable and inclusive society for future generations.\n"
                      "Would you like to know more about our specific initiatives in education?",
            "mission of sahayak": "Sahayak's mission is to empower underprivileged communities through education, inclusion, and dedicated care.\n\n"
                                  "We strive to provide resources and support to those who need it most, ensuring they have access to opportunities.\n"
                                  "Our focus is on creating sustainable change by addressing systemic issues like poverty and lack of education.\n"
                                  "Through our programs, we aim to foster a sense of community and hope for a better future.\n"
                                  "Let me know if you'd like more details about our mission-driven projects!",
            # Duplicate of "vision" above so both phrasings match directly.
            "vision of sahayak": "Sahayak Organisation envisions a society where education is accessible to every child, regardless of their socio-economic background.\n\n"
                                 "We believe that education is a fundamental right that can transform lives and uplift entire communities.\n"
                                 "Our vision drives us to work tirelessly to remove barriers and create opportunities for learning.\n"
                                 "By doing so, we hope to build a more equitable and inclusive society for future generations.\n"
                                 "Would you like to know more about our specific initiatives in education?",
            "who created you": "I was created by B Chaitanya Reddy, a dedicated developer passionate about using technology for social good.\n\n"
                               "Chaitanya built me to assist users in learning more about Sahayak Organization and its impactful work.\n"
                               "My purpose is to provide accurate and detailed information to help you understand Sahayak's mission and activities.\n"
                               "Thanks to Chaitanya's efforts, I can answer your questions and guide you through Sahayak's initiatives.\n"
                               "What else would you like to know about me or Sahayak?",
            "who developed you": "I was developed by B Chaitanya Reddy, a skilled developer who wanted to support Sahayak Organization's mission.\n\n"
                                 "Chaitanya designed me to be a helpful tool for anyone seeking information about Sahayak's work and goals.\n"
                                 "My role is to provide detailed answers and insights into the organization's efforts to make a difference.\n"
                                 "I'm here to assist you with any questions you have about Sahayak or its initiatives.\n"
                                 "What would you like to explore next?",
            "who is the founder of sahayak": "The founder of Sahayak Organisation is Verril Vaz.\n\n"
                                             "Verril Vaz is a visionary leader who established Sahayak with the mission to uplift underprivileged communities through education and social support.\n"
                                             "Under his leadership, Sahayak has grown to operate in multiple locations, including Belgaum and Bangalore.\n",
            # Duplicate phrasings of the founder answer for direct key matches.
            "founder of sahayak": "The founder of Sahayak Organisation is Verril Vaz.\n\n"
                                  "Verril Vaz is a visionary leader who established Sahayak with the mission to uplift underprivileged communities through education and social support.\n"
                                  "Under his leadership, Sahayak has grown to operate in multiple locations, including Belgaum and Bangalore.\n",
            "who founded sahayak": "The founder of Sahayak Organisation is Verril Vaz.\n\n"
                                   "Verril Vaz is a visionary leader who established Sahayak with the mission to uplift underprivileged communities through education and social support.\n"
                                   "Under his leadership, Sahayak has grown to operate in multiple locations, including Belgaum and Bangalore.\n",
            "who is verril vaz": "**👤 About Verril Vaz**\n\n"
                                 "Verril Vaz is the **Founder and President** of Sahayak Organisation.\n\n"
                                 "**Key Facts:**\n"
                                 "• Visionary leader who believes in grassroots change\n"
                                 "• Established Sahayak on May 21, 2024\n"
                                 "• Leads a passionate team of 45 young members\n"
                                 "• Focuses on education and community empowerment\n\n"
                                 "**His Vision:**\n"
                                 "Under his leadership, Sahayak has grown to operate in Belgaum and Bangalore, helping underprivileged communities through education and social support.\n\n"
                                 "Would you like to know more about the leadership team or Sahayak's activities?",
            "verril vaz": "**👤 About Verril Vaz**\n\n"
                          "Verril Vaz is the **Founder and President** of Sahayak Organisation.\n\n"
                          "**Key Facts:**\n"
                          "• Visionary leader who believes in grassroots change\n"
                          "• Established Sahayak on May 21, 2024\n"
                          "• Leads a passionate team of 45 young members\n\n"
                          "Under his leadership, Sahayak operates in Belgaum and Bangalore, helping communities through education and support.\n\n"
                          "Would you like to know more about the leadership team?",
            "how does sahayak organization works": "Sahayak Organisation operates as a non-profit dedicated to uplifting underprivileged communities through education, social support, and community development.\nIt functions by Implementing Educational Initiatives:\nSahayak focuses heavily on education, providing academic mentorship and resources to ensure access to learning opportunities, particularly for children from marginalized backgrounds.\nProviding Social and Emotional Support: The organization acts as a 'helper' (as its name suggests in Hindi), offering emotional and social assistance to those in need, fostering inclusion and empowerment.\nOperating in Key Locations: Sahayak runs its programs primarily in Belgaum and Bangalore, collaborating with local special schools and healthcare providers to maximize impact.\nLeveraging a Youthful Team: Powered by a team of 45 core members, Sahayak is led by young, passionate leaders like founder Verril Vaz.\nThis team brings empathy, innovation, and administrative skills to execute initiatives efficiently.\nHolistic Approach: Sahayak bridges the gap between potential and opportunity by addressing systemic issues like poverty and lack of education, ensuring sustainable change through targeted programs.",
            # Three phrasings of the "recent activities" question share the same body.
            "what are the recent activities conducted by the sahayak": "1)Sahayak had a great visit to Anand Yatri Old Age Home on August 3 2024, where all our members had the best experience and tears of happiness in our eyes. Making all the members happy of Anand Yatri was our main motive.They cheris to remember us each and everyday.\n"
                                                                      "2)We visited the Maheshwari School for the Blind on October 25 2024 with a great motive to make all the students realize that being blind they can do immense miracles on this world. A heartfelt thanks to the Principal of Maheshwari School for providing us this wonderful opportunity.\n"
                                                                      "3)Team Sahayak visited Sparsh Foundation on december 19 2024 which is a school for all the specially disabled children and provides Education for completely free. Our team had a very great experience meeting the students and spending time with them.\n"
                                                                      "4) Team Sahayak visited Balika Adarsh Vidyalaya on 4th february 2025 and conducted various sessions such Technical Awareness, Social Media Awareness, Career Guidance and Nutrition which were the best topics decided by our team. The sessions were really helpful to all the students and everyone enjoyed it to the fullest.\n",
            "what are the initiatives took by sahayak": "1)Sahayak had a great visit to Anand Yatri Old Age Home on August 3 2024, where all our members had the best experience and tears of happiness in our eyes. Making all the members happy of Anand Yatri was our main motive.They cheris to remember us each and everyday.\n"
                                                        "2)We visited the Maheshwari School for the Blind on October 25 2024 with a great motive to make all the students realize that being blind they can do immense miracles on this world. A heartfelt thanks to the Principal of Maheshwari School for providing us this wonderful opportunity.\n"
                                                        "3)Team Sahayak visited Sparsh Foundation on december 19 2024 which is a school for all the specially disabled children and provides Education for completely free. Our team had a very great experience meeting the students and spending time with them.\n"
                                                        "4) Team Sahayak visited Balika Adarsh Vidyalaya on 4th february 2025 and conducted various sessions such Technical Awareness, Social Media Awareness, Career Guidance and Nutrition which were the best topics decided by our team. The sessions were really helpful to all the students and everyone enjoyed it to the fullest.\n",
            "what are the recent events conducted by the sahayak": "1)Sahayak had a great visit to Anand Yatri Old Age Home on August 3 2024, where all our members had the best experience and tears of happiness in our eyes. Making all the members happy of Anand Yatri was our main motive.They cheris to remember us each and everyday.\n"
                                                                   "2)We visited the Maheshwari School for the Blind on October 25 2024 with a great motive to make all the students realize that being blind they can do immense miracles on this world. A heartfelt thanks to the Principal of Maheshwari School for providing us this wonderful opportunity.\n"
                                                                   "3)Team Sahayak visited Sparsh Foundation on december 19 2024 which is a school for all the specially disabled children and provides Education for completely free. Our team had a very great experience meeting the students and spending time with them.\n"
                                                                   "4) Team Sahayak visited Balika Adarsh Vidyalaya on 4th february 2025 and conducted various sessions such Technical Awareness, Social Media Awareness, Career Guidance and Nutrition which were the best topics decided by our team. The sessions were really helpful to all the students and everyone enjoyed it to the fullest.\n",
            "when was the sahayak started": "Sahayak Organisation was started on 21 May 2024.\n\n"
                                            "Since its inception, Sahayak has been dedicated to uplifting underprivileged communities through education and social support.\n"
                                            "Would you like to know more about our founding story or current initiatives?",
            # Handle common affirmative follow-ups
            "yes": "Great! Here are some topics I can help you explore:\n\n"
                   "📋 **Programs & Initiatives**\n"
                   "• Educational programs for underprivileged children\n"
                   "• Visits to old age homes and special schools\n"
                   "• Awareness sessions on health, career, and technology\n\n"
                   "👥 **Team & Leadership**\n"
                   "• Founder: Verril Vaz\n"
                   "• Team of 45 passionate young members\n\n"
                   "📍 **Locations & Contact**\n"
                   "• Operating in Belgaum and Bangalore\n"
                   "• Phone: +91-123-456-7890\n"
                   "• Website: https://www.sahayak.org\n\n"
                   "What specific topic would you like to know more about?",
            "sure": "Great! Here are some topics I can help you explore:\n\n"
                    "📋 **Programs & Initiatives**\n"
                    "• Educational programs for underprivileged children\n"
                    "• Visits to old age homes and special schools\n"
                    "• Awareness sessions on health, career, and technology\n\n"
                    "👥 **Team & Leadership**\n"
                    "• Founder: Verril Vaz\n"
                    "• Team of 45 passionate young members\n\n"
                    "📍 **Locations & Contact**\n"
                    "• Operating in Belgaum and Bangalore\n\n"
                    "What specific topic would you like to know more about?",
            "ok": "I'm here to help! You can ask me about:\n\n"
                  "• Sahayak's mission and vision\n"
                  "• Our educational programs and initiatives\n"
                  "• Recent activities and events\n"
                  "• Team members and leadership\n"
                  "• How to volunteer or donate\n"
                  "• Contact information and locations\n\n"
                  "What would you like to know?",
            "tell me more": "Here's more about Sahayak Organization:\n\n"
                            "**🎯 Our Mission**\n"
                            "Sahayak is dedicated to empowering underprivileged communities through education, inclusion, and dedicated care.\n\n"
                            "**📚 Key Programs**\n"
                            "1. Educational mentorship for underserved students\n"
                            "2. Support for visually impaired and specially-abled children\n"
                            "3. Assistance to senior citizens at old age homes\n"
                            "4. Awareness sessions on technology, career, and health\n\n"
                            "**🏆 Recent Achievements**\n"
                            "• Visited Anand Yatri Old Age Home (August 2024)\n"
                            "• Conducted sessions at Maheshwari School for the Blind (October 2024)\n"
                            "• Partnered with Sparsh Foundation (December 2024)\n"
                            "• Organized awareness programs at Balika Adarsh Vidyalaya (February 2025)\n\n"
                            "Which aspect would you like to explore further?",
            "what is sahayak": "**📖 What is Sahayak?**\n\n"
                               "Sahayak (meaning 'helper' in Hindi) is a non-governmental organization (NGO) founded with the noble vision of uplifting society.\n\n"
                               "**🎯 Core Focus Areas:**\n"
                               "• Providing quality education to underserved students\n"
                               "• Supporting senior citizens with care and companionship\n"
                               "• Aiding visually impaired and differently-abled children\n\n"
                               "**📍 Where We Operate:**\n"
                               "Sahayak primarily operates in Belgaum and Bangalore, Karnataka, India.\n\n"
                               "**👥 Our Team:**\n"
                               "Led by founder Verril Vaz, Sahayak has a passionate team of 45 young leaders dedicated to making a difference.\n\n"
                               "Would you like to know about our specific programs or how to get involved?",
            "initiatives": "**📋 Sahayak's Key Initiatives**\n\n"
                           "**1. Educational Programs**\n"
                           "• Academic mentorship for underprivileged children\n"
                           "• Scholarship guidance and study resources\n"
                           "• Career counseling and skill development\n\n"
                           "**2. Community Outreach**\n"
                           "• Regular visits to old age homes\n"
                           "• Support programs for specially-abled children\n"
                           "• Health and nutrition awareness campaigns\n\n"
                           "**3. Recent Activities**\n"
                           "• Anand Yatri Old Age Home visit (Aug 2024)\n"
                           "• Maheshwari School for the Blind (Oct 2024)\n"
                           "• Sparsh Foundation collaboration (Dec 2024)\n"
                           "• Balika Adarsh Vidyalaya sessions (Feb 2025)\n\n"
                           "Would you like details about any specific initiative?",
            "what are the initiatives of sahayak": "**📋 Sahayak's Key Initiatives**\n\n"
                                                   "**1. Educational Programs**\n"
                                                   "• Academic mentorship for underprivileged children\n"
                                                   "• Scholarship guidance and study resources\n"
                                                   "• Career counseling and skill development\n\n"
                                                   "**2. Community Outreach**\n"
                                                   "• Regular visits to old age homes\n"
                                                   "• Support programs for specially-abled children\n"
                                                   "• Health and nutrition awareness campaigns\n\n"
                                                   "**3. Recent Activities (2024-2025)**\n"
                                                   "• Anand Yatri Old Age Home visit (Aug 3, 2024)\n"
                                                   "• Maheshwari School for the Blind (Oct 25, 2024)\n"
                                                   "• Sparsh Foundation collaboration (Dec 19, 2024)\n"
                                                   "• Balika Adarsh Vidyalaya sessions (Feb 4, 2025)\n\n"
                                                   "Would you like details about any specific initiative?",
            "who is the president of sahayak": "**👤 President of Sahayak**\n\n"
                                               "The founder and president of Sahayak Organisation is **Verril Vaz**.\n\n"
                                               "**About Verril Vaz:**\n"
                                               "• Visionary leader who believes in grassroots change\n"
                                               "• Established Sahayak with the mission to uplift underprivileged communities\n"
                                               "• Leads a passionate team of 45 young members\n\n"
                                               "Under his leadership, Sahayak has grown to operate in multiple locations including Belgaum and Bangalore.\n\n"
                                               "Would you like to know more about the leadership team or Sahayak's structure?",
            "president of sahayak": "**👤 President of Sahayak**\n\n"
                                    "The founder and president of Sahayak Organisation is **Verril Vaz**.\n\n"
                                    "**About Verril Vaz:**\n"
                                    "• Visionary leader who believes in grassroots change\n"
                                    "• Established Sahayak with the mission to uplift underprivileged communities\n"
                                    "• Leads a passionate team of 45 young members\n\n"
                                    "Would you like to know more about the leadership team?",
            "how to join sahayak": "**🤝 How to Join Sahayak**\n\n"
                                   "**Option 1: Register Online**\n"
                                   "• Visit: https://www.sahayak.org/volunteer\n"
                                   "• Fill out the volunteer registration form\n\n"
                                   "**Option 2: Contact Us Directly**\n"
                                   "• Phone: +91-123-456-7890\n"
                                   "• Email through the website contact form\n\n"
                                   "**What We Look For:**\n"
                                   "• Passion for community service\n"
                                   "• Willingness to contribute time and skills\n"
                                   "• Empathy for underprivileged communities\n\n"
                                   "Would you like more information about volunteer opportunities?",
            "how many branches sahayak has": "**📍 Sahayak Locations**\n\n"
                                             "Sahayak currently operates in **2 main locations**:\n\n"
                                             "**1. Belgaum (Belagavi)**\n"
                                             "• Primary operational hub\n"
                                             "• Multiple community outreach programs\n\n"
                                             "**2. Bangalore**\n"
                                             "• Extended operations and programs\n"
                                             "• Partnership with local institutions\n\n"
                                             "Both locations serve as centers for educational initiatives, community support, and volunteer activities.\n\n"
                                             "Would you like to know about activities at a specific location?",
            # Additional variations for better matching
            "vice president": "**👤 Sahayak Leadership Team**\n\n"
                              "Sahayak is led by a passionate team of young leaders:\n\n"
                              "**President & Founder:** Verril Vaz\n"
                              "• Visionary leader who believes in grassroots change\n\n"
                              "**Core Team:**\n"
                              "• 45 dedicated young members\n"
                              "• Mix of empathy, administrative skills, and innovation\n"
                              "• Committed to Sahayak's core values\n\n"
                              "The team works together to ensure Sahayak runs efficiently while making a positive impact.\n\n"
                              "Would you like to know about specific team roles or how to join?",
            "leadership": "**👥 Sahayak Leadership Team**\n\n"
                          "Sahayak is led by a passionate team of young leaders:\n\n"
                          "**President & Founder:** Verril Vaz\n"
                          "• Visionary leader who believes in grassroots change\n"
                          "• Established Sahayak on May 21, 2024\n\n"
                          "**Core Team:**\n"
                          "• 45 dedicated young members\n"
                          "• Mix of empathy, administrative skills, and innovation\n"
                          "• Committed to uplifting underprivileged communities\n\n"
                          "The team brings collective commitment to ensure Sahayak runs efficiently.\n\n"
                          "Would you like to know more about joining the team?",
            "objectives": "**🎯 Sahayak's Objectives**\n\n"
                          "Sahayak Organisation works towards the following key objectives:\n\n"
                          "**1. Educational Empowerment**\n"
                          "• Provide quality education to underserved students\n"
                          "• Bridge the gap between potential and opportunity\n\n"
                          "**2. Community Support**\n"
                          "• Support senior citizens with care and companionship\n"
                          "• Aid visually impaired and differently-abled children\n\n"
                          "**3. Social Awareness**\n"
                          "• Raise awareness about social issues affecting marginalized groups\n"
                          "• Promote health, career, and technology awareness\n\n"
                          "**4. Sustainable Impact**\n"
                          "• Create lasting positive change in communities\n"
                          "• Empower individuals and families through dedicated programs\n\n"
                          "Would you like to learn about specific programs achieving these objectives?",
            "what are the objectives": "**🎯 Sahayak's Objectives**\n\n"
                                       "Sahayak Organisation works towards the following key objectives:\n\n"
                                       "**1. Educational Empowerment**\n"
                                       "• Provide quality education to underserved students\n"
                                       "• Bridge the gap between potential and opportunity\n\n"
                                       "**2. Community Support**\n"
                                       "• Support senior citizens with care and companionship\n"
                                       "• Aid visually impaired and differently-abled children\n\n"
                                       "**3. Social Awareness**\n"
                                       "• Raise awareness about social issues\n"
                                       "• Promote health, career, and technology awareness\n\n"
                                       "Would you like to learn about specific programs?",
            "donate": "**💝 How to Donate to Sahayak**\n\n"
                      "Your donations help Sahayak continue its mission to support underprivileged communities.\n\n"
                      "**How to Donate:**\n"
                      "• Visit: https://www.sahayak.org\n"
                      "• Contact: +91-123-456-7890\n\n"
                      "**Your Donation Supports:**\n"
                      "• Educational programs for children\n"
                      "• Care for senior citizens\n"
                      "• Support for specially-abled individuals\n\n"
                      "Every contribution makes a difference!\n\n"
                      "Would you like more information about our programs?",
            "help": "**ℹ️ How Can I Help You?**\n\n"
                    "I can provide information about:\n\n"
                    "📋 **Programs & Initiatives**\n"
                    "• Educational programs, community outreach, recent events\n\n"
                    "👥 **Team & Leadership**\n"
                    "• Founder, president, team members\n\n"
                    "📍 **Locations & Contact**\n"
                    "• Branches, phone numbers, website\n\n"
                    "🤝 **Getting Involved**\n"
                    "• How to volunteer, donate, or join\n\n"
                    "🎯 **Mission & Vision**\n"
                    "• Goals, objectives, values\n\n"
                    "What would you like to know about?"
        }

        # Normalize language first; everything downstream works on English text
        # and the final answer is translated back to `original_lang`.
        translated_query, original_lang = detect_and_translate(query)

        # Content moderation gate: blocked queries short-circuit immediately.
        moderation = moderate_query(translated_query)
        if moderation.get("status") == "blocked":
            return translate_text(moderation["message"], original_lang)

        # =====================================================================
        # SEMANTIC INTENT UNDERSTANDING (Expert-Level)
        # Uses embeddings to understand query meaning, not just keywords
        # =====================================================================

        # NOTE(review): `normalized_query` is computed but never read below —
        # confirm whether it is dead code or intended for a future matcher.
        normalized_query = re.sub(r'[^\w\s]', '', translated_query.lower().strip())

        # Step 1: Semantic Intent Classification
        intent, intent_confidence = semantic_intent_classification(translated_query)
        logger.info(f"Semantic intent: {intent} (confidence: {intent_confidence:.3f})")

        # Step 2: Query Complexity Analysis
        complexity = analyze_query_complexity(translated_query)
        logger.info(f"Query complexity: {complexity['level']} (score: {complexity['score']})")

        # Step 3: Intent-based response routing with semantic understanding
        # Map semantic intents to curated responses
        intent_to_response = {
            "about_organization": "what is sahayak",
            "founder_leadership": "who is the founder of sahayak",
            "president_leadership": "who is the president of sahayak",
            "team_members": "who is the president of sahayak",
            "mission_vision": "mission of sahayak",
            "initiatives_programs": "initiatives",
            "recent_events": "what are the recent activities conducted by the sahayak",
            "location_branches": "how many branches sahayak has",
            "join_volunteer": "how to join sahayak",
            "donate_support": "donate",
            "contact_info": "how to join sahayak",
            "founding_date": "when was the sahayak started",
            "greeting": "hi",
            "thanks": "ok",
            "affirmative": "yes"
        }

        # High confidence semantic match -> use curated response
        if intent_confidence > 0.65 and intent in intent_to_response:
            response_key = intent_to_response[intent]
            if response_key in meta_responses:
                logger.info(f"Semantic match: intent={intent}, confidence={intent_confidence:.3f}, response_key={response_key}")
                return translate_text(meta_responses[response_key], original_lang)

        # Step 4: Semantic similarity search in meta_responses
        # Use embeddings to find best matching pre-defined response
        # NOTE(review): the meta_response keys are re-encoded on every call —
        # caching `meta_embeddings` at module level would avoid repeated work.
        try:
            embedder = get_embedder()
            query_embedding = embedder.encode([translated_query])[0]

            best_match_key = None
            best_match_score = 0.0

            # Encode all meta_response keys and find best semantic match
            meta_keys = list(meta_responses.keys())
            meta_embeddings = embedder.encode(meta_keys)

            for i, (key, key_embedding) in enumerate(zip(meta_keys, meta_embeddings)):
                # Cosine similarity
                similarity = np.dot(query_embedding, key_embedding) / (
                    np.linalg.norm(query_embedding) * np.linalg.norm(key_embedding)
                )
                if similarity > best_match_score:
                    best_match_score = similarity
                    best_match_key = key

            # If we have a strong semantic match, use it
            if best_match_score > 0.70 and best_match_key:
                logger.info(f"Semantic meta-match: '{best_match_key}' (score: {best_match_score:.3f})")
                return translate_text(meta_responses[best_match_key], original_lang)

        except Exception as e:
            # Semantic matching is best-effort; fall through to retrieval on failure.
            logger.warning(f"Semantic matching failed: {e}")

        # Step 5: Check scope
        if not is_in_scope(translated_query):
            return scope_redirect_message(original_lang)

        # =====================================================================
        # STEP 6: SEMANTIC PASSAGE RETRIEVAL & QA
        # Use embeddings to find most relevant passages
        # =====================================================================

        # Determine retrieval depth based on query complexity
        top_k = 10
        if complexity["level"] == "complex":
            top_k = 15
        elif complexity["level"] == "simple":
            top_k = 8

        relevance_passages = retrieve_relevant_passages(
            translated_query, passages, vector_index, embeddings, top_k=top_k
        )
        logger.info(f"Retrieved {len(relevance_passages)} passages for query '{translated_query}'")

        # Re-rank passages using semantic similarity for better context
        ranked_passages = semantic_passage_ranking(translated_query, relevance_passages, top_k=5)

        # Build context from best semantically-matched passages; fall back to the
        # top raw-retrieval hits when nothing clears the 0.3 relevance cutoff.
        context_passages = [p for p, score in ranked_passages if score > 0.3]
        if not context_passages:
            context_passages = relevance_passages[:3]

        context = " ".join(context_passages)[:2000]  # More context for better answers
        logger.info(f"Context length for QA model: {len(context)} chars from {len(context_passages)} passages")

        qa_model = get_qa_model()

        result = qa_model(question=translated_query, context=context)

        english_answer = result["answer"]
        confidence_score = result.get("score", 0.0)
        logger.info(f"QA model result: answer='{english_answer}', score={confidence_score}")

        # Apply expert reasoning chain
        reasoning = expert_reasoning_chain(translated_query, relevance_passages, result)
        logger.info(f"Expert reasoning steps: {reasoning['reasoning_steps']}")

        # Short numeric answers (e.g. "45") are legitimate despite being < 5 chars,
        # so they are exempted from the short-answer rejection below.
        is_numeric = english_answer.strip().replace(".", "").isdigit()
        if not english_answer or (len(english_answer) < 5 and not is_numeric and confidence_score < 0.3) or confidence_score < 0.05:
            logger.warning(f"QA model failed to provide a good answer. Answer: '{english_answer}', Score: {confidence_score}")
            # Provide a more helpful fallback using expert reasoning
            fallback_response = "I couldn't find a specific answer to your question.\n\n"
            if relevance_passages:
                fallback_response += "**However, here's what I found in our documentation:**\n"
                for i, passage in enumerate(relevance_passages[:2]):
                    snippet = passage[:200] + "..." if len(passage) > 200 else passage
                    fallback_response += f"\n- {snippet}\n"
                fallback_response += "\nWould you like me to help you with a more specific question about Sahayak?"
            return translate_text(fallback_response, original_lang)

        # Build expert response with evidence and reasoning
        detailed_answer = build_expert_response(english_answer, reasoning, translated_query, relevance_passages)

        legal_footer = (
            "\n\n---\n_This response is for community awareness only and not legal, medical, or financial advice. "
            "Program details may change; please verify with official Sahayak contacts or local authorities._"
        )
        final_answer = translate_text(detailed_answer + legal_footer, original_lang)
        return final_answer

    except Exception as e:
        # Last-resort handler: never surface a traceback to the user.
        logger.error(f"Error answering question: {str(e)}")
        return ("I'm sorry, I encountered an error while processing your question.\n\n"
                "However, I can still tell you that Sahayak is a non-profit organization focused on helping those in need.\n"
                "We work on various social issues, aiming to make a positive impact in the community.\n"
                "Please try again with a different query, and I'll do my best to assist you!")
|
static/logo.png
ADDED
|
static/style.css
ADDED
|
@@ -0,0 +1,299 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/* style.css — dark-themed chat UI for the Vexa assistant.
   Layout: a scrolling branding row, a centered flex chat column,
   and a fixed input bar pinned to the viewport bottom. */

/* Global reset: border-box sizing everywhere. */
* {
  box-sizing: border-box;
  margin: 0;
  padding: 0;
}

/* Page background gradient and base typography. */
html, body {
  height: 100%;
  background: linear-gradient(135deg, #23272f 0%, #343541 100%);
  color: #fff;
  font-family: 'Inter', 'Poppins', sans-serif;
  overflow-x: hidden;
  overflow-y: auto;
}

body {
  min-height: 100vh;
  display: flex;
  flex-direction: column;
}

/* Branding row at the top, scrolls with page, NO navbar effect */
.branding-row {
  display: flex;
  align-items: center;
  gap: 1rem;
  padding: 1.6rem 0 0.6rem 1.6rem;
  width: auto;
  background: transparent;
  box-shadow: none;
  border-bottom: none;
}

.logo {
  width: 72px;
  height: 72px;
  border-radius: 16px;
  object-fit: cover;
  box-shadow: 0 4px 24px rgba(76, 110, 245, 0.12);
}

/* Gradient text effect: background clipped to the glyphs,
   fill made transparent so the gradient shows through. */
.brand-name {
  font-family: 'Poppins', sans-serif;
  font-weight: 700;
  font-size: 2.7rem;
  background: linear-gradient(135deg, #4f9fff, #9c7aff);
  -webkit-background-clip: text;
  background-clip: text;
  -webkit-text-fill-color: transparent;
  letter-spacing: 1.5px;
  user-select: none;
  line-height: 1;
}

/* Chat container and rest of your styles remain unchanged */
.chat-outer {
  display: flex;
  flex-direction: column;
  justify-content: flex-end;
  align-items: center;
  height: 100vh;
  width: 100vw;
  padding-top: 0;
  padding-bottom: 0;
  position: relative;
}

/* Ensure chat-container fills available space above input */
.chat-container {
  width: 100%;
  max-width: 720px;
  flex: 1 1 auto;
  background: rgba(44, 46, 56, 0.75);
  border-radius: 1.5rem;
  margin: 0 auto;
  padding: 2rem 1.2rem 1rem 1.2rem;
  box-shadow: 0 8px 32px rgba(20, 20, 40, 0.14);
  overflow-y: auto;
  display: flex;
  flex-direction: column;
  gap: 1.2rem;
  scroll-behavior: smooth;
  min-height: 60vh;
  max-height: none;
  backdrop-filter: blur(8px);
}

/* Spacer to ensure input bar never overlaps chat on any device */
.input-area-spacer {
  height: 100px;
  flex-shrink: 0;
}

/* Mobile adjustments: tighter branding, full-width chat,
   taller spacer so the fixed input bar clears the last message. */
@media (max-width: 600px) {
  .branding-row {
    gap: 0.7rem;
    padding: 1.2rem 0 0.5rem 1.1rem;
  }
  .logo {
    width: 56px;
    height: 56px;
    border-radius: 12px;
  }
  .brand-name {
    font-size: 2.1rem;
    letter-spacing: 1px;
  }
  .chat-container, .input-area {
    max-width: 100vw;
    border-radius: 0.5rem;
    padding-left: 0.3rem;
    padding-right: 0.3rem;
  }
  .input-area-spacer {
    height: 120px;
  }
}

/* Input bar at the bottom, always above chat */
.input-area {
  width: 100%;
  max-width: 720px;
  background: none;
  position: fixed;
  left: 50%;
  bottom: 0;
  transform: translateX(-50%);
  z-index: 20;
  padding: 0 1rem 1.2rem 1rem;
}

.input-container {
  display: flex;
  align-items: center;
  background: rgba(44, 46, 56, 0.90);
  border-radius: 1.2rem;
  box-shadow: 0 2px 12px rgba(0,0,0,0.12);
  padding: 0.25rem 0.5rem;
}

/* Text input blends into the container; visible focus tint below. */
input[type="text"] {
  flex: 1;
  padding: 1rem 1.2rem;
  font-size: 1.1rem;
  background: transparent;
  border: none;
  color: #fff;
  outline: none;
  border-radius: 1.2rem;
}

input[type="text"]:focus {
  background: rgba(44, 46, 56, 0.98);
}

/* Send button (gradient pill with hover grow effect). */
button {
  background: linear-gradient(135deg, #4f9fff 60%, #9c7aff 100%);
  color: white;
  border: none;
  padding: 0.7rem 0.9rem;
  border-radius: 0.9rem;
  cursor: pointer;
  margin-left: 0.5rem;
  transition: background 0.2s, transform 0.2s;
  display: flex;
  align-items: center;
  justify-content: center;
}

button:hover {
  background: linear-gradient(135deg, #2563eb 70%, #7a5cff 100%);
  transform: scale(1.07);
}

/* Welcome banner, collapsed via inline display:none from JS. */
.welcome-message {
  text-align: center;
  margin-bottom: 1.5rem;
  opacity: 0.92;
  transition: opacity 0.3s, max-height 0.3s;
}

.welcome-message[style*="display: none"] {
  opacity: 0;
  max-height: 0;
  margin: 0;
  padding: 0;
}

.welcome-message h2 {
  font-size: 1.6rem;
  font-weight: 700;
  margin-bottom: 0.4rem;
  background: linear-gradient(135deg, #4f9fff, #9c7aff);
  -webkit-background-clip: text;
  background-clip: text;
  -webkit-text-fill-color: transparent;
}

.welcome-message p {
  font-size: 1.05rem;
  opacity: 0.8;
}

.disclaimer {
  font-size: 0.9rem;
  opacity: 0.65;
  margin-top: 0.3rem;
  line-height: 1.4;
}

/* Message rows: user bubbles right-aligned, bot bubbles left-aligned. */
.message-row {
  display: flex;
  width: 100%;
}

.message-row.user {
  justify-content: flex-end;
}

.message-row.bot {
  justify-content: flex-start;
}

/* Shared bubble styling; pre-wrap preserves newlines from the bot. */
.message {
  max-width: 85%;
  padding: 1rem 1.3rem;
  border-radius: 1.2rem;
  font-size: 1.05rem;
  line-height: 1.6;
  animation: fadeIn 0.3s ease-in;
  white-space: pre-wrap;
  word-wrap: break-word;
  background: rgba(60, 64, 90, 0.85);
  box-shadow: 0 2px 12px rgba(0,0,0,0.08);
  transition: background 0.2s;
}

.message.user {
  background: linear-gradient(135deg, #4f9fff 60%, #9c7aff 100%);
  color: #fff;
  border-bottom-right-radius: 0.3rem;
}

.message.bot {
  background: rgba(44, 46, 56, 0.94);
  color: #fff;
  border-bottom-left-radius: 0.3rem;
}

/* Small "Processing time" footnote under bot messages. */
.time-display {
  font-style: italic;
  color: #a0aec0;
  font-size: 0.8rem;
  margin-top: 0.3rem;
  margin-left: 0.7rem;
}

/* Three-dot typing indicator shown while awaiting a response. */
.typing-indicator {
  display: flex;
  align-items: center;
  background: rgba(44, 46, 56, 0.94);
  padding: 1rem 1.3rem;
  border-radius: 1.2rem;
  width: fit-content;
  gap: 3px;
}

.typing-indicator span {
  height: 8px;
  width: 8px;
  margin: 0 2px;
  background-color: #b5b7c2;
  border-radius: 50%;
  display: inline-block;
  animation: bounce 1.5s infinite ease-in-out;
}

/* Stagger the dots so they bounce in sequence. */
.typing-indicator span:nth-child(1) { animation-delay: 0s; }
.typing-indicator span:nth-child(2) { animation-delay: 0.2s; }
.typing-indicator span:nth-child(3) { animation-delay: 0.4s; }

@keyframes bounce {
  0%, 80%, 100% { transform: translateY(0); }
  40% { transform: translateY(-8px); }
}

@keyframes fadeIn {
  from { opacity: 0; transform: translateY(10px);}
  to { opacity: 1; transform: translateY(0);}
}

/* Scrollbar Styling */
.chat-container::-webkit-scrollbar {
  width: 7px;
}
.chat-container::-webkit-scrollbar-thumb {
  background: #555a6a;
  border-radius: 4px;
}
|
templates/index.html
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Vexa Chat</title>
  <link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}">
  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap" rel="stylesheet">
  <link href="https://fonts.googleapis.com/css2?family=Poppins:wght@500;700&display=swap" rel="stylesheet">
</head>
<body>
  <!-- Branding Row (Logo + VEXA) -->
  <div class="branding-row">
    <img class="logo" src="{{ url_for('static', filename='logo.png') }}" alt="Vexa Logo">
    <span class="brand-name">VEXA</span>
  </div>
  <main class="chat-outer">
    <div class="chat-container" id="chat">
      <div class="welcome-message" id="welcome-message">
        <h2>Welcome to Vexa AI</h2>
        <p>Your digital support assistant</p>
        <p class="disclaimer">Answers focus on community benefit and Sahayak services.</p>
      </div>
      <div class="message-row bot">
        <div class="message bot">Hello! I am Vexa. Ask me anything about Sahayak Organization.</div>
      </div>
    </div>
    <div class="input-area-spacer"></div> <!-- Spacer to push chat above input on all screens -->
    <form class="input-area" id="input-form" autocomplete="off">
      <div class="input-container">
        <input type="text" id="msg" placeholder="Type your message..." autocomplete="off">
        <button type="submit" aria-label="Send message">
          <svg xmlns="http://www.w3.org/2000/svg" width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
            <line x1="22" y1="2" x2="11" y2="13"></line>
            <polygon points="22 2 15 22 11 13 2 9 22 2"></polygon>
          </svg>
        </button>
      </div>
    </form>
  </main>
  <script>
    // Guards: block concurrent requests and remember whether the
    // welcome banner has already been dismissed.
    let isProcessing = false;
    let hasSentMessage = false;

    /** Hide the welcome banner the first time the user sends a message. */
    function hideWelcome() {
      if (!hasSentMessage) {
        const welcome = document.getElementById("welcome-message");
        if (welcome) welcome.style.display = "none";
        hasSentMessage = true;
      }
    }

    /**
     * Append a chat bubble to the log and scroll it into view.
     * @param {"user"|"bot"} role - Which side/style the bubble uses.
     * @param {string} text - Message text (rendered via textContent,
     *   so untrusted server output cannot inject HTML).
     * @returns {HTMLElement} The row element (caller may append extras).
     */
    function appendMessage(role, text) {
      const chat = document.getElementById("chat");
      const row = document.createElement("div");
      row.className = `message-row ${role}`;
      const bubble = document.createElement("div");
      bubble.className = `message ${role}`;
      bubble.textContent = text;
      row.appendChild(bubble);
      chat.appendChild(row);
      chat.scrollTop = chat.scrollHeight;
      return row;
    }

    /** Send the current input to the /get endpoint and render the reply. */
    function send() {
      if (isProcessing) return;
      const input = document.getElementById("msg");
      const userInput = input.value;
      if (!userInput.trim()) return;
      hideWelcome();
      isProcessing = true;
      const chat = document.getElementById("chat");
      appendMessage("user", userInput);
      input.value = "";

      // Typing indicator shown while waiting for the server.
      const typingRow = document.createElement("div");
      typingRow.className = "message-row bot typing-row";
      typingRow.innerHTML = '<div class="typing-indicator"><span></span><span></span><span></span></div>';
      chat.appendChild(typingRow);
      chat.scrollTop = chat.scrollHeight;

      fetch("/get", {
        method: "POST",
        headers: { "Content-Type": "application/x-www-form-urlencoded" },
        // URLSearchParams handles percent-encoding; avoids manual string concat.
        body: new URLSearchParams({ user_input: userInput }).toString()
      })
      .then(res => {
        // Surface HTTP errors (e.g. 500) instead of attempting to parse them.
        if (!res.ok) throw new Error(`Request failed with status ${res.status}`);
        return res.json();
      })
      .then(data => {
        chat.removeChild(typingRow);
        const botRow = appendMessage("bot", data.response);
        if (data.process_time !== undefined) {
          const timeDiv = document.createElement("div");
          timeDiv.className = "time-display";
          timeDiv.textContent = `Processing time: ${data.process_time} seconds`;
          botRow.appendChild(timeDiv);
        }
        chat.scrollTop = chat.scrollHeight;
      })
      .catch(() => {
        // Guard: the indicator may already have been removed before the error.
        if (typingRow.parentNode === chat) chat.removeChild(typingRow);
        appendMessage("bot", "An error occurred. Please try again.");
      })
      .finally(() => {
        isProcessing = false;
      });
    }

    document.getElementById("input-form").addEventListener("submit", function(e) {
      e.preventDefault();
      send();
    });

    // keydown instead of deprecated keypress; preventDefault stops the
    // native form submit so send() runs exactly once per Enter press.
    document.getElementById("msg").addEventListener("keydown", function(e) {
      if (e.key === "Enter") {
        e.preventDefault();
        send();
      }
    });
  </script>
</body>
</html>
|