Chaitanya895 committed on
Commit
9ce76ea
·
verified ·
1 Parent(s): 7cf7100

Upload 15 files

Browse files
.gitattributes ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ data/pdfs/sahayak_disaster_relief.pdf filter=lfs diff=lfs merge=lfs -text
2
+ data/pdfs/sahayak_educational_programs.pdf filter=lfs diff=lfs merge=lfs -text
3
+ data/pdfs/sahayak_general_info.pdf filter=lfs diff=lfs merge=lfs -text
4
+ data/pdfs/sahayak_ngo_resources.pdf filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#Docker_file
# Use Python 3.10 slim as the base image
FROM python:3.10-slim

# Set working directory
WORKDIR /app

# Create necessary directories with proper permissions
# (world-writable because the runtime user is not guaranteed to be root —
# presumably a Hugging Face Spaces constraint; confirm before tightening)
RUN mkdir -p /app /data/models /tmp/nltk_data && \
    chmod -R 777 /app /data /tmp/nltk_data

# Install system dependencies for faiss, sentence-transformers, and PDF processing
RUN apt-get update && apt-get install -y --no-install-recommends \
    git \
    git-lfs \
    build-essential \
    libopenblas-dev \
    libgomp1 \
    g++ \
    libgcc-s1 \
    curl \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements file first (for better Docker caching)
COPY requirements.txt .

# Set environment variables for Hugging Face caches.
# NOTE(review): TRANSFORMERS_CACHE is deprecated in recent transformers
# releases in favor of HF_HOME, which is also set here to the same path.
ENV TRANSFORMERS_CACHE=/data/models \
    HF_HOME=/data/models \
    HF_HUB_CACHE=/data/models \
    NLTK_DATA=/tmp/nltk_data \
    MODEL_DIR=/data/models \
    PYTHONUNBUFFERED=1

# Install Python dependencies
RUN pip install --no-cache-dir -U pip setuptools wheel && \
    pip install --no-cache-dir -r requirements.txt

# Pre-download NLTK data (best effort; the app re-downloads at runtime if needed)
RUN python -c "import nltk; nltk.download('punkt', download_dir='/tmp/nltk_data', quiet=True); nltk.download('punkt_tab', download_dir='/tmp/nltk_data', quiet=True)" || true

# Pre-download ML models during build (best effort: failures fall back to
# runtime downloads — see the "|| echo" guards)
RUN python -c "\
from sentence_transformers import SentenceTransformer; \
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', cache_folder='/data/models'); \
print('Embedding model downloaded successfully')" || echo "Will download at runtime"

RUN python -c "\
from transformers import pipeline; \
qa = pipeline('question-answering', model='distilbert-base-uncased-distilled-squad', cache_dir='/data/models'); \
print('QA model downloaded successfully')" || echo "Will download at runtime"

# Copy application code
COPY . .

# Ensure all directories have proper permissions
RUN chmod -R 777 /app /data /tmp/nltk_data

# Set runtime environment variables
ENV PDF_PATH=data/pdfs \
    PORT=7860

# Expose the port
EXPOSE 7860

# Health check — hits the home route; "/" lazily initializes the models on
# first request, hence the generous start-period
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:7860/ || exit 1

# Run the application with gunicorn
CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--timeout", "300", "--log-level", "info", "--workers", "1", "--threads", "2", "app:app"]
README.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Vexa Chatbot
3
+ emoji: 🤖
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ ---
10
+
11
+ # Vexa Chatbot
12
+
13
+ A chatbot application for the Sahayak NGO, built with Flask and Hugging Face models.
14
+
15
+ <!-- Force rebuild: 2025-05-07 14:10 -->
Sahayak_Organisation_Expanded_Info.pdf ADDED
Binary file (4.9 kB). View file
 
app.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import time
4
+ from flask import Flask, render_template, request, jsonify
5
+ from flask_cors import CORS
6
+ from flask_limiter import Limiter
7
+ from flask_limiter.util import get_remote_address
8
+ import re
9
+
10
+ # Setup logging
11
+ logging.basicConfig(
12
+ level=logging.INFO,
13
+ format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
14
+ handlers=[logging.StreamHandler()]
15
+ )
16
+ logger = logging.getLogger(__name__)
17
+
18
+ # Import utils after logging setup
19
+ from sahayak_utils import (
20
+ load_pdf_text, load_passages_from_path, split_into_chunks, build_faiss_index,
21
+ get_qa_model, initialize_models, retrieve_relevant_passages,
22
+ answer_question, check_model_cache_status
23
+ )
24
+
25
# Flask application object; gunicorn serves this as "app:app" (see Dockerfile).
app = Flask(__name__)
CORS(app)  # Allow cross-origin requests (e.g. mobile WebView embeds)
# Rate limiting keyed by client IP; default_limits apply to every route,
# with a tighter per-route limit added on /get below.
limiter = Limiter(
    key_func=get_remote_address,
    default_limits=["200 per day", "50 per hour"]
)
limiter.init_app(app)  # two-step init: construct first, bind to the app second
33
+
34
# Global QA-system state, populated by setup() and shared across requests.
passages = []            # text chunks extracted from the PDF knowledge base
faiss_index = None       # FAISS vector index over `passages`
embeddings = None        # passage embedding matrix backing the index
initialized = False      # True once setup() has completed at least once
last_pdf_load_time = 0   # epoch seconds of last successful setup(); drives 1h refresh
response_cache = {}  # In-memory cache for query responses (capped at 100 in get_response)
41
+
42
def validate_user_input(user_input):
    """Sanitize a chat query before it reaches the QA pipeline.

    Strips ``<script>...</script>`` blocks case-insensitively (the original
    pattern was case-sensitive, so ``<SCRIPT>`` bypassed the filter) and
    truncates the result to 500 characters to bound downstream work.

    Args:
        user_input: Raw string taken from the request form.

    Returns:
        The sanitized string, at most 500 characters long.
    """
    # IGNORECASE closes the mixed-case bypass; DOTALL keeps multi-line removal.
    sanitized = re.sub(r'<script.*?>.*?</script>', '', user_input,
                       flags=re.DOTALL | re.IGNORECASE)
    return sanitized[:500]
48
+
49
def setup():
    """Initialize the QA system: models, passages, and the FAISS index.

    Populates the module globals and is safe to call repeatedly: a
    successful result is reused for one hour before being rebuilt. On any
    failure a one-passage fallback corpus is indexed so the routes stay
    functional rather than erroring out.
    """
    global passages, faiss_index, embeddings, initialized, last_pdf_load_time

    # Reuse the existing index for up to an hour after the last load.
    if initialized and time.time() - last_pdf_load_time < 3600:
        logger.info("Using cached data")
        return

    try:
        logger.info("Initializing models...")
        initialize_models()

        logger.info("Preloading QA model...")
        get_qa_model()

        # PDF_PATH may point at a single file or a directory of PDFs.
        PDF_PATH = os.environ.get("PDF_PATH", "data/pdfs")
        passages = load_passages_from_path(PDF_PATH)
        logger.info(f"Passages ready: {len(passages)}")

        logger.info("Building search index...")
        faiss_index, embeddings = build_faiss_index(passages)

        initialized = True
        last_pdf_load_time = time.time()
        logger.info("Initialization complete")
    except Exception as e:
        logger.error(f"Error during initialization: {str(e)}")
        # Minimal fallback corpus keeps the endpoints responsive.
        passages = ["Sahayak is a non-profit organization dedicated to providing support."]
        faiss_index, embeddings = build_faiss_index(passages)
        initialized = True
79
+
80
+ @app.route("/", methods=["GET"])
81
+ def home():
82
+ """Serve the home page"""
83
+ try:
84
+ if not initialized:
85
+ setup()
86
+ return render_template("index.html")
87
+ except Exception as e:
88
+ logger.error(f"Error in home route: {str(e)}")
89
+ return "An error occurred. Please check the logs.", 500
90
+
91
+ @app.route("/get", methods=["POST"])
92
+ @limiter.limit("10 per minute")
93
+ def get_response():
94
+ """Handle question answering requests with rate limiting and caching"""
95
+ try:
96
+ if not initialized:
97
+ setup()
98
+
99
+ user_input = request.form.get("user_input", "")
100
+ validated_input = validate_user_input(user_input)
101
+
102
+ if validated_input.strip():
103
+ # Check cache
104
+ if validated_input in response_cache:
105
+ logger.info(f"Cache hit for query: {validated_input[:50]}...")
106
+ cached_response = response_cache[validated_input]
107
+ return jsonify({
108
+ "response": cached_response["response"],
109
+ "process_time": cached_response["process_time"]
110
+ })
111
+
112
+ # Process new query
113
+ start_time = time.time()
114
+ answer = answer_question(validated_input, passages, faiss_index, embeddings)
115
+ process_time = time.time() - start_time
116
+
117
+ logger.info(f"Processed question in {process_time:.2f}s: {validated_input[:50]}...")
118
+
119
+ # Store in cache (limit cache size to 100)
120
+ response_cache[validated_input] = {
121
+ "response": answer,
122
+ "process_time": round(process_time, 2)
123
+ }
124
+ if len(response_cache) > 100:
125
+ response_cache.pop(next(iter(response_cache))) # Remove oldest entry
126
+
127
+ return jsonify({
128
+ "response": answer,
129
+ "process_time": round(process_time, 2)
130
+ })
131
+
132
+ return jsonify({
133
+ "response": "Please enter a valid question.",
134
+ "process_time": 0.0
135
+ })
136
+ except Exception as e:
137
+ logger.error(f"Error in get_response: {str(e)}")
138
+ return jsonify({
139
+ "response": "Sorry, an error occurred while processing your request. Please try again.",
140
+ "process_time": 0.0
141
+ }), 500
142
+
143
+ @app.route("/health", methods=["GET"])
144
+ def health_check():
145
+ """Health check endpoint for monitoring"""
146
+ status = "healthy" if initialized else "initializing"
147
+ return jsonify({
148
+ "status": status,
149
+ "passages_loaded": len(passages) > 1,
150
+ "uptime": time.time() - last_pdf_load_time if initialized else 0
151
+ }), 200
152
+
153
+ @app.route("/reset", methods=["POST"])
154
+ def reset_models():
155
+ """Reset model cache (requires admin key)"""
156
+ global initialized, last_pdf_load_time, response_cache
157
+
158
+ admin_key = request.form.get("admin_key", "")
159
+ expected_key = os.environ.get("ADMIN_KEY", "not-set")
160
+
161
+ if admin_key != expected_key:
162
+ return jsonify({"status": "unauthorized"}), 401
163
+
164
+ initialized = False
165
+ last_pdf_load_time = 0
166
+ response_cache.clear() # Clear query cache
167
+
168
+ setup()
169
+
170
+ return jsonify({"status": "reset_complete"})
171
+
172
+ if __name__ == "__main__":
173
+ port = int(os.environ.get("PORT", 7860))
174
+ try:
175
+ port = int(port)
176
+ except (ValueError, TypeError):
177
+ logger.warning(f"Invalid PORT value: {port}, using default 7860")
178
+ port = 7860
179
+
180
+ logger.info(f"Starting app on port: {port}")
181
+
182
+ setup()
183
+
184
+ app.run(host="0.0.0.0", port=port)
data/pdfs/Sahayak_Organisation_Expanded_Info.pdf ADDED
Binary file (4.9 kB). View file
 
data/pdfs/sahayak_disaster_relief.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73d39f1dd72602cce01036b96952334f3faaf47d1cdf45f84723feb6453e54b7
3
+ size 144362
data/pdfs/sahayak_educational_programs.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7c9933c39024d5ee88615d13946a1144bf84d3519059dac6d612f87e47ef15a
3
+ size 188177
data/pdfs/sahayak_general_info.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d4f52d77108ca4096b258b5930ed5fe892161980fbc96b2ecccd9000fdc33be
3
+ size 129981
data/pdfs/sahayak_ngo_resources.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca61a9fe949c469c120d80aeeb2ac0d75fba6794e0c13b3df1c6e7718d444a66
3
+ size 212873
requirements.txt ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Flask and web dependencies
3
+ flask==2.2.5
4
+ flask-limiter==3.5.0
5
+ flask-cors==4.0.0
6
+ werkzeug==2.3.8
7
+ gunicorn==21.2.0
8
+
9
+ # NLP and ML dependencies
10
+ nltk==3.8.1
11
+ numpy>=1.24.0,<2.0.0
12
+ faiss-cpu==1.7.4
13
+ torch>=2.0.0
14
+ transformers>=4.35.0
15
+ sentence-transformers>=2.2.0
16
+ scikit-learn>=1.3.0
17
+ scipy>=1.11.0
18
+
19
+ # PDF processing
20
+ PyMuPDF>=1.23.0
21
+ PyPDF2>=3.0.0
22
+
23
+ # Translation
24
+ deep-translator>=1.11.0
25
+ beautifulsoup4>=4.12.0
26
+
27
+ # Utilities
28
+ requests>=2.31.0
29
+ setuptools>=65.5.1
30
+ wheel>=0.38.4
31
+ huggingface_hub>=0.19.0
sahayak_utils.py ADDED
@@ -0,0 +1,1623 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.environ["TRANSFORMERS_CACHE"] = "/data/models"
3
+ os.environ["HF_HOME"] = "/data/models"
4
+ os.environ["HF_HUB_CACHE"] = "/data/models"
5
+ import logging
6
+ import nltk
7
+ import numpy as np
8
+ import faiss
9
+ import re
10
+ import json
11
+ import time
12
+ import torch
13
+ from functools import lru_cache
14
+ from transformers import AutoModelForQuestionAnswering, AutoTokenizer, AutoModel, pipeline
15
+ import difflib
16
+
17
+ # Setup logging
18
+ logging.basicConfig(
19
+ level=logging.INFO,
20
+ format='%(asctime)s [%(levelname)s] %(name)s: %(message)s'
21
+ )
22
+ logger = logging.getLogger(__name__)
23
+
24
+ # Debug: Log environment variables
25
+ logger.info(f"TRANSFORMERS_CACHE: {os.environ.get('TRANSFORMERS_CACHE')}")
26
+ logger.info(f"HF_HOME: {os.environ.get('HF_HOME')}")
27
+ logger.info(f"HF_HUB_CACHE: {os.environ.get('HF_HUB_CACHE')}")
28
+
29
# Pick a PDF text-extraction backend at import time. PyMuPDF (fitz) is
# preferred; PyPDF2 is the fallback; "none" disables PDF loading entirely
# (load_pdf_text then returns an error string instead of raising).
try:
    import fitz  # PyMuPDF
    PDF_BACKEND = "pymupdf"
except ImportError:
    logger.warning("PyMuPDF (fitz) not installed. Trying PyPDF2 as fallback.")
    try:
        import PyPDF2
        PDF_BACKEND = "pypdf2"
    except ImportError:
        logger.error("No PDF processing library available.")
        PDF_BACKEND = "none"
41
+
42
# Translation setup with graceful fallback: prefer deep-translator (Google
# backend); otherwise install a no-op translator that reports everything as
# English, so the QA pipeline never hard-depends on translation.
try:
    from deep_translator import GoogleTranslator
    from deep_translator.exceptions import LanguageNotSupportedException

    class DeepTranslatorWrapper:
        """Wrapper to provide consistent API like googletrans"""

        def detect(self, text):
            """Detect language using simple heuristics.

            Only the first 100 characters are checked against Unicode
            script ranges; anything unmatched defaults to English.
            """
            class LangResult:
                # Minimal result object mimicking googletrans' `.lang` API.
                def __init__(self, lang):
                    self.lang = lang

            # Simple language detection based on character ranges
            text_sample = text[:100]

            # Hindi (Devanagari script)
            if any('\u0900' <= c <= '\u097F' for c in text_sample):
                return LangResult("hi")
            # Bengali
            elif any('\u0980' <= c <= '\u09FF' for c in text_sample):
                return LangResult("bn")
            # Tamil
            elif any('\u0B80' <= c <= '\u0BFF' for c in text_sample):
                return LangResult("ta")
            # Telugu
            elif any('\u0C00' <= c <= '\u0C7F' for c in text_sample):
                return LangResult("te")
            # Gujarati
            elif any('\u0A80' <= c <= '\u0AFF' for c in text_sample):
                return LangResult("gu")
            # Marathi (also Devanagari, but different patterns)
            # NOTE(review): unreachable — this range is identical to the
            # Hindi branch above, so Devanagari text always detects as "hi"
            # and "mr" is never returned.
            elif any('\u0900' <= c <= '\u097F' for c in text_sample):
                return LangResult("mr")
            # Chinese
            elif any('\u4E00' <= c <= '\u9FFF' for c in text_sample):
                return LangResult("zh")
            # Arabic
            elif any('\u0600' <= c <= '\u06FF' for c in text_sample):
                return LangResult("ar")
            # Spanish/French/German (accented Latin)
            # NOTE(review): the character set is Spanish-specific; most
            # French/German accented text falls through to "en".
            elif any(c in 'áéíóúñüÁÉÍÓÚÑÜ' for c in text_sample):
                return LangResult("es")
            else:
                return LangResult("en")

        def translate(self, text, src=None, dest=None):
            """Translate text using deep-translator.

            Returns an object with a `.text` attribute; on any failure the
            original text is returned unchanged (best-effort translation).
            """
            class TranslationResult:
                def __init__(self, translated_text):
                    self.text = translated_text

            try:
                # Same-language or missing destination: nothing to do.
                if src == dest or dest is None:
                    return TranslationResult(text)

                # Map language codes
                src_lang = src if src and src != 'auto' else 'auto'
                dest_lang = dest if dest else 'en'

                translated = GoogleTranslator(source=src_lang, target=dest_lang).translate(text)
                return TranslationResult(translated if translated else text)
            except Exception as e:
                logger.warning(f"Translation failed: {e}")
                return TranslationResult(text)

    translator = DeepTranslatorWrapper()
    logger.info("deep-translator initialized successfully")
except Exception as e:
    logger.warning(f"deep-translator not available ({e}); using simple no-op translator")

    class SimpleTranslator:
        """Fallback translator: detects everything as English, translates nothing."""

        def detect(self, text):
            class LangResult:
                def __init__(self):
                    self.lang = "en"
            return LangResult()

        def translate(self, text, src=None, dest=None):
            class TranslationResult:
                def __init__(self, text):
                    self.text = text
            return TranslationResult(text)

    translator = SimpleTranslator()
127
+
128
# Ensure NLTK punkt is downloaded to a writable path (/tmp stays writable
# even when the app directory is locked down; matches NLTK_DATA in the
# Dockerfile).
nltk.data.path.append('/tmp/nltk_data')
try:
    nltk.download('punkt', download_dir='/tmp/nltk_data', quiet=True)
except Exception as e:
    logger.warning(f"Failed to download NLTK punkt: {e}. Text chunking may be affected.")
134
+
135
# Models configuration
MODEL_DIR = os.environ.get("MODEL_DIR", "/data/models")  # HF cache root (see Dockerfile)
EMBEDDING_MODEL_NAME = os.environ.get("EMBEDDING_MODEL", "sentence-transformers/all-mpnet-base-v2")
os.makedirs(MODEL_DIR, exist_ok=True)

# Load the embedding tokenizer/model eagerly at import time; a failure here
# is fatal (re-raised) because every retrieval path depends on these objects.
logger.info(f"Loading embedding model from: {EMBEDDING_MODEL_NAME}")
try:
    tokenizer = AutoTokenizer.from_pretrained(
        EMBEDDING_MODEL_NAME,
        cache_dir=MODEL_DIR,
        # TRANSFORMERS_OFFLINE=1 forces cache-only loads (no network access).
        local_files_only=os.environ.get("TRANSFORMERS_OFFLINE", "0") == "1"
    )
    model = AutoModel.from_pretrained(
        EMBEDDING_MODEL_NAME,
        cache_dir=MODEL_DIR,
        local_files_only=os.environ.get("TRANSFORMERS_OFFLINE", "0") == "1"
    )
except Exception as e:
    logger.error(f"Failed to load embedding model: {e}")
    raise
156
+
157
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over real (non-padding) tokens.

    Args:
        model_output: Transformer output; element 0 is the token-embedding
            tensor of shape (batch, seq_len, hidden).
        attention_mask: (batch, seq_len) mask — 1 for real tokens, 0 for padding.

    Returns:
        (batch, hidden) tensor of masked mean embeddings.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    summed = torch.sum(hidden_states * mask, 1)
    counts = torch.clamp(mask.sum(1), min=1e-9)  # avoid divide-by-zero on all-pad rows
    return summed / counts
161
+
162
def transformers_encode(texts, batch_size=8):
    """Embed one or more texts with the module-level tokenizer/model.

    Fix: the original accepted ``batch_size`` but ignored it, pushing every
    text through the model in a single forward pass — a memory risk for
    large passage lists. Texts are now encoded in batches of ``batch_size``
    and the per-batch pooled embeddings are concatenated.

    Args:
        texts: A string or a list/tuple of strings.
        batch_size: Number of texts per forward pass.

    Returns:
        numpy array of shape (len(texts), hidden_dim). On failure, random
        vectors of the expected shape are returned so the FAISS pipeline
        downstream keeps working (matching the original fallback).
    """
    if isinstance(texts, str):
        texts = [texts]
    if not isinstance(texts, (list, tuple)) or not texts:
        logger.error(f"Invalid input to transformers_encode: {texts}")
        return np.random.randn(1, 768)
    if not all(isinstance(t, str) for t in texts):
        logger.error(f"Non-string elements in texts: {texts}")
        texts = [str(t) for t in texts]
    try:
        batch_outputs = []
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]
            encoded_input = tokenizer(chunk, padding=True, truncation=True, return_tensors='pt')
            with torch.no_grad():
                model_output = model(**encoded_input)
            pooled = mean_pooling(model_output, encoded_input['attention_mask'])
            batch_outputs.append(pooled.numpy())
        return np.concatenate(batch_outputs, axis=0)
    except Exception as e:
        logger.error(f"Error in transformers_encode: {str(e)}")
        return np.random.randn(len(texts), 768)
180
+
181
+ # Model caching configuration
182
+ CACHE_FILE = os.path.join(MODEL_DIR, "model_cache_status.json")
183
+
184
def check_model_cache_status():
    """Return True when a previous run recorded the models as cached.

    Creates MODEL_DIR when missing; a missing or unreadable marker file
    counts as "not cached".
    """
    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR, exist_ok=True)
        return False

    if not os.path.exists(CACHE_FILE):
        return False

    try:
        with open(CACHE_FILE, 'r') as f:
            status = json.load(f)
    except Exception as e:
        logger.warning(f"Error reading cache file: {e}")
        return False

    if status.get('initialized', False):
        logger.info("Using cached models")
        return True
    return False
201
+
202
def mark_models_as_cached():
    """Persist a marker file recording that model preloading succeeded.

    Failures to write are logged and swallowed — caching is best-effort.
    """
    payload = {'initialized': True, 'timestamp': time.time()}
    try:
        with open(CACHE_FILE, 'w') as f:
            json.dump(payload, f)
        logger.info("Models marked as cached")
    except Exception as e:
        logger.warning(f"Error writing cache file: {e}")
210
+
211
def initialize_models():
    """Warm up and cache the QA model unless a prior run already did."""
    if not check_model_cache_status():
        logger.info("Preloading QA model...")
        get_qa_model()
        mark_models_as_cached()
        return
    logger.info("Models already cached, skipping initialization")
220
+
221
@lru_cache(maxsize=1)
def get_embedder():
    """Return an object with an `encode(texts, batch_size)` method, cached.

    The working embedder delegates to transformers_encode; if anything goes
    wrong a random-vector embedder is returned so downstream indexing still
    runs (with meaningless but well-shaped vectors).
    """
    local_dir = os.path.join(MODEL_DIR, "embedding_model")
    try:
        class Embedder:
            def encode(self, texts, batch_size=8):
                return transformers_encode(texts, batch_size)

        # Both branches use the same transformers-backed embedder; the
        # difference is only which source gets logged.
        if os.path.exists(local_dir):
            logger.info(f"Loading embedding model from: {local_dir}")
        else:
            logger.info(f"Using transformers-based embedding model: {EMBEDDING_MODEL_NAME}")
        return Embedder()
    except Exception as e:
        logger.error(f"Error loading embedding model: {str(e)}. Using random embeddings as fallback.")

        class SimpleEmbedder:
            def encode(self, texts, batch_size=8):
                if isinstance(texts, str):
                    return np.random.randn(768)
                return np.random.randn(len(texts), 768)

        return SimpleEmbedder()
242
+
243
@lru_cache(maxsize=1)
def get_qa_model():
    """Load the question-answering pipeline, cached for the process lifetime.

    Fix: the original passed ``local_files_only=True`` unconditionally,
    which made both loads fail whenever the Docker build's best-effort
    model download had not succeeded (the build explicitly tolerates that
    with "Will download at runtime"). Offline mode is now driven by the
    TRANSFORMERS_OFFLINE env var, consistent with how the embedding model
    is loaded at module import.

    Returns:
        A transformers QA pipeline; falls back to deepset/roberta-base-squad2,
        and finally to a stub callable returning an apology answer so the
        /get endpoint never crashes on model-load failure.
    """
    offline_only = os.environ.get("TRANSFORMERS_OFFLINE", "0") == "1"

    try:
        model_dir_contents = os.listdir(MODEL_DIR)
        logger.info(f"Model directory contents: {model_dir_contents}")
    except Exception as e:
        logger.error(f"Failed to list model directory: {e}")

    try:
        logger.info("Loading QA model: distilbert-base-uncased-distilled-squad")
        qa_model = pipeline(
            "question-answering",
            model="distilbert-base-uncased-distilled-squad",
            tokenizer="distilbert-base-uncased-distilled-squad",
            local_files_only=offline_only,
            cache_dir=MODEL_DIR
        )
        logger.info("Successfully loaded distilbert-base-uncased-distilled-squad")
        return qa_model
    except Exception as e:
        logger.warning(f"Failed to load distilbert-base-uncased-distilled-squad: {e}. Falling back to roberta-base-squad2.")
        try:
            logger.info("Loading fallback QA model: deepset/roberta-base-squad2")
            qa_model = pipeline(
                "question-answering",
                model="deepset/roberta-base-squad2",
                tokenizer="deepset/roberta-base-squad2",
                local_files_only=offline_only,
                cache_dir=MODEL_DIR
            )
            logger.info("Successfully loaded deepset/roberta-base-squad2")
            return qa_model
        except Exception as e:
            logger.error(f"Error loading fallback QA model: {e}")

            def simple_qa(question, context):
                # Last-resort stub keeping the endpoint functional.
                return {
                    "answer": "I'm sorry, the QA model couldn't be loaded. Please try again later.",
                    "score": 0.0
                }
            return simple_qa
284
+
285
def load_pdf_text(pdf_path):
    """Extract whitespace-normalized text from one PDF, with JSON caching.

    Extraction results are cached under MODEL_DIR as
    "<basename>.cache.json", so repeated startups skip PDF parsing. Uses
    the backend selected at import time (PyMuPDF preferred, then PyPDF2).

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The extracted text, or a human-readable error string on failure —
        callers treat the return value as document text either way.
    """
    if not os.path.exists(pdf_path):
        logger.error(f"PDF file not found: {pdf_path}")
        return "PDF file not found. Please check the file path."

    # Serve from the per-file extraction cache when present.
    cache_path = os.path.join(MODEL_DIR, f"{os.path.basename(pdf_path)}.cache.json")
    if os.path.exists(cache_path):
        try:
            logger.info(f"Loading PDF content from cache: {cache_path}")
            with open(cache_path, 'r', encoding='utf-8') as f:
                cache_data = json.load(f)
            return cache_data.get("text", "")
        except Exception as e:
            # Fall through to re-extraction on a corrupt/unreadable cache.
            logger.warning(f"Error loading cache: {str(e)}")

    try:
        if PDF_BACKEND == "pymupdf":
            logger.info(f"Loading PDF with PyMuPDF: {pdf_path}")
            doc = fitz.open(pdf_path)
            raw_text = "\n".join(page.get_text() for page in doc)
            doc.close()
        elif PDF_BACKEND == "pypdf2":
            logger.info(f"Loading PDF with PyPDF2: {pdf_path}")
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                raw_text = "\n".join(page.extract_text() for page in reader.pages)
        else:
            logger.error("No PDF backend available")
            return "No PDF processing library is installed."

        # Collapse all whitespace runs (including newlines) to single spaces.
        clean_text = " ".join(raw_text.split())
        if not clean_text:
            logger.warning(f"Extracted empty text from PDF: {pdf_path}")
            return "No readable text found in the PDF."

        # Best-effort cache write; extraction still succeeds if this fails.
        try:
            with open(cache_path, 'w', encoding='utf-8') as f:
                json.dump({"text": clean_text, "timestamp": time.time()}, f)
        except Exception as e:
            logger.warning(f"Could not write PDF cache: {str(e)}")

        return clean_text
    except Exception as e:
        logger.error(f"Error loading PDF: {str(e)}")
        return f"Error loading PDF: {str(e)}"
331
+
332
+
333
def load_passages_from_path(path_setting):
    """Load and chunk PDF content from all available sources.

    Sources checked (in order):
      1. Root PDF: Sahayak_Organisation_Expanded_Info.pdf
      2. Directory: data/pdfs (all PDFs inside)
      3. Custom path from path_setting (file or directory)

    Refactor: the load -> chunk -> extend -> record sequence was repeated
    three times; it now lives in the _ingest_pdf helper.

    Args:
        path_setting: Extra file or directory path (e.g. the PDF_PATH env var).

    Returns:
        List of passage strings; a one-element fallback list when nothing
        could be loaded.
    """
    all_passages = []
    sources_loaded = []

    def _ingest_pdf(pdf_path, label=None):
        """Extract, chunk and record one PDF; returns the chunk count."""
        text = load_pdf_text(pdf_path)
        chunks = split_into_chunks(text)
        all_passages.extend(chunks)
        sources_loaded.append(f"{label or pdf_path} ({len(chunks)} passages)")
        return len(chunks)

    def _pdfs_in(directory):
        """Sorted full paths of *.pdf files directly inside `directory`."""
        return [
            os.path.join(directory, f) for f in sorted(os.listdir(directory))
            if f.lower().endswith(".pdf")
        ]

    # Always try to load the root PDF first.
    root_pdf = "Sahayak_Organisation_Expanded_Info.pdf"
    if os.path.isfile(root_pdf):
        count = _ingest_pdf(root_pdf)
        logger.info(f"Loaded {count} passages from root PDF: {root_pdf}")

    # Load from the default data/pdfs directory.
    pdf_dir = "data/pdfs"
    if os.path.isdir(pdf_dir):
        for pdf_file in _pdfs_in(pdf_dir):
            # Skip if same as root PDF.
            if os.path.basename(pdf_file) == root_pdf:
                continue
            _ingest_pdf(pdf_file, label=os.path.basename(pdf_file))
        logger.info(f"Loaded PDFs from directory: {pdf_dir}")

    # Also check the custom path when it differs from the defaults.
    target = path_setting or ""
    if target and target not in [root_pdf, pdf_dir]:
        if os.path.isdir(target):
            for pdf_file in _pdfs_in(target):
                _ingest_pdf(pdf_file, label=os.path.basename(pdf_file))
        elif os.path.isfile(target):
            _ingest_pdf(target)

    if all_passages:
        logger.info(f"Total knowledge base: {len(all_passages)} passages from {len(sources_loaded)} sources")
        logger.info(f"Sources: {sources_loaded}")
        return all_passages

    logger.error("No PDF sources found. Using fallback.")
    return ["Sahayak is a non-profit organization dedicated to providing support and community development."]
396
+
397
def split_into_chunks(text, max_length=200, min_length=50):
    """Split document text into retrieval-sized passages.

    The text is split at thematic headings ("Title Case Label:"), then by
    blank-line paragraphs; paragraphs longer than `max_length` are
    re-packed sentence by sentence. Fragments shorter than `min_length`
    are discarded as noise.

    Args:
        text: Source document text.
        max_length: Upper bound (chars) for a chunk.
        min_length: Lower bound (chars) below which fragments are dropped.

    Returns:
        Non-empty list of chunk strings (placeholder strings on bad input).
    """
    if not isinstance(text, str):
        logger.error(f"Invalid input to split_into_chunks: {text}")
        return ["Invalid input"]
    try:
        chunks = []
        for section in re.split(r'(?=\b[A-Z][a-zA-Z\s]+:)', text):
            section = section.strip()
            if not section:
                continue
            for paragraph in section.split('\n\n'):
                paragraph = paragraph.strip()
                if not paragraph:
                    continue
                size = len(paragraph)
                if min_length <= size <= max_length:
                    chunks.append(paragraph)
                elif size < min_length:
                    continue
                else:
                    # Re-pack an oversized paragraph sentence by sentence.
                    buffer = ""
                    for sentence in nltk.sent_tokenize(paragraph):
                        if len(buffer) + len(sentence) <= max_length:
                            buffer += " " + sentence
                        else:
                            if len(buffer) >= min_length:
                                chunks.append(buffer.strip())
                            buffer = sentence
                    if buffer and len(buffer) >= min_length:
                        chunks.append(buffer.strip())

        if not chunks:
            logger.warning("No chunks created from text")
            chunks = ["No content available"]

        return chunks
    except Exception as e:
        logger.error(f"Error splitting text into chunks: {str(e)}")
        return ["Error processing text content"]
440
+
441
def build_faiss_index(passages):
    """Embed `passages` and index them in a flat L2 FAISS index.

    Returns:
        (index, embeddings) tuple. On failure, an index over random
        embeddings of the same shape is returned so retrieval keeps
        running (with degraded relevance).
    """
    try:
        logger.info(f"Building FAISS index with passages: {passages[:2]}... (total: {len(passages)})")
        vectors = get_embedder().encode(passages)

        dim = vectors.shape[1]
        index = faiss.IndexFlatL2(dim)
        index.add(np.array(vectors).astype('float32'))

        return index, vectors
    except Exception as e:
        logger.error(f"Error building FAISS index: {str(e)}")
        dim = 768
        fallback_vectors = np.random.randn(len(passages), dim).astype('float32')
        fallback_index = faiss.IndexFlatL2(dim)
        fallback_index.add(fallback_vectors)
        return fallback_index, fallback_vectors
460
+
461
def retrieve_relevant_passages(query, passages, vector_index, embeddings, top_k=10):
    """Retrieve the most relevant passages using pure semantic similarity.

    Uses cosine-like similarity between query and passage embeddings for true
    semantic understanding, not just keyword matching.

    Args:
        query: user query string (assumed English).
        passages: list of candidate passage strings.
        vector_index: FAISS index built over the passage embeddings.
        embeddings: passage embeddings (unused here; kept for interface parity).
        top_k: maximum number of passages to return.

    Returns:
        Up to ``top_k`` passages ordered by decreasing similarity; on error,
        up to 3 randomly sampled passages as a best-effort fallback.
    """
    try:
        embedder = get_embedder()
        query_vector = embedder.encode([query])[0].reshape(1, -1).astype('float32')

        # Over-fetch candidates, then re-rank by converted similarity.
        num_candidates = min(top_k * 3, len(passages))
        D, I = vector_index.search(query_vector, num_candidates)

        results = []
        for idx, dist in zip(I[0], D[0]):
            # BUGFIX: FAISS pads results with -1 when fewer than num_candidates
            # vectors are available; the old `idx < len(passages)` check let -1
            # through and silently appended passages[-1]. Guard both bounds.
            if 0 <= idx < len(passages):
                # Convert L2 distance to a similarity score (closer = higher).
                similarity_score = 1.0 / (1.0 + dist)
                results.append((idx, similarity_score, passages[idx]))

        # Sort by similarity score descending.
        results.sort(key=lambda x: x[1], reverse=True)

        return [passage for _, _, passage in results[:top_k]]
    except Exception as e:
        logger.error(f"Error retrieving passages: {str(e)}")
        import random
        return random.sample(passages, min(3, len(passages)))
492
+
493
def detect_and_translate(query):
    """Detect the query's language and return ``(english_text, detected_lang)``.

    Non-English input is translated to English; on any detection or
    translation failure the original query is returned with lang "en".
    """
    try:
        detected = translator.detect(query).lang
        english = (
            query
            if detected == "en"
            else translator.translate(query, src=detected, dest="en").text
        )
        return english, detected
    except Exception as e:
        logger.warning(f"Translation error: {str(e)}. Using original query.")
        return query, "en"
505
+
506
+
507
def translate_text(text, target_lang):
    """Translate English ``text`` into ``target_lang``.

    Returns the input unchanged when no translation is needed (empty text,
    missing language, or target is English) or when translation fails.
    """
    skip_translation = not text or not target_lang or target_lang == "en"
    if skip_translation:
        return text
    try:
        return translator.translate(text, src="en", dest=target_lang).text
    except Exception as e:
        logger.warning(f"Answer translation failed: {e}. Returning English text.")
        return text
516
+
517
+
518
def moderate_query(query):
    """Lightweight safety filter to avoid harmful or off-policy content.

    Performs a case-insensitive substring scan against a fixed denylist and
    returns ``{"status": "blocked", "message": ...}`` on a hit, otherwise
    ``{"status": "ok"}``.
    """
    disallowed_terms = (
        "violence", "weapon", "harm myself", "self-harm", "suicide", "attack",
        "explosive", "bomb", "terror", "hate", "racist", "sex", "explicit",
        "drugs", "narcotic", "illegal",
    )
    text = query.lower()
    for term in disallowed_terms:
        if term in text:
            return {
                "status": "blocked",
                "message": ("I'm here to provide supportive, lawful information. "
                            "I can't help with that topic. If you need wellbeing support, consider contacting a local helpline.")
            }
    return {"status": "ok"}
533
+
534
+
535
def is_in_scope(query):
    """Keep answers aligned to community service, government schemes, and Sahayak context.

    Expanded to cover:
    - Sahayak organization topics
    - Government welfare schemes and benefits
    - Social services and community development
    - Education and health programs
    - Disaster relief and emergency support
    - Legal rights and entitlements for citizens
    """
    # Short queries are assumed on-topic (permissive by design).
    if len(query.split()) <= 5:
        return True

    scope_terms = (
        # Sahayak specific
        "sahayak", "vexa", "ngo", "non profit", "nonprofit",
        # Community and social
        "community", "benefit", "volunteer", "social", "help", "support", "welfare",
        "charity", "donation", "outreach", "initiative", "campaign",
        # Education
        "education", "school", "scholarship", "student", "learning", "training",
        "skill", "literacy", "academic", "mentorship",
        # Health
        "health", "medical", "hospital", "treatment", "medicine", "healthcare",
        "disability", "disabled", "blind", "specially abled",
        # Government schemes
        "scheme", "yojana", "government", "subsidy", "pension", "ration",
        "aadhar", "aadhaar", "pan", "voter", "certificate", "document",
        # Services
        "service", "programme", "program", "project", "event", "activity",
        # Locations
        "belgaum", "bangalore", "karnataka", "india",
        # Disaster and emergency
        "disaster", "relief", "emergency", "flood", "earthquake",
        # Rights and legal
        "rights", "entitlement", "eligibility", "apply", "registration",
        # General queries that should be answered
        "who", "what", "when", "where", "how", "why", "tell", "explain",
        "founder", "team", "member", "leader", "president", "secretary",
        "mission", "vision", "goal", "objective", "aim", "purpose",
    )
    lowered = query.lower()
    # Substring match (not word match) — intentionally broad.
    return any(term in lowered for term in scope_terms)
579
+
580
+
581
def scope_redirect_message(lang):
    """Localized redirect shown when a query falls outside Sahayak topics."""
    english_msg = (
        "I focus on Sahayak and community-benefit topics. "
        "Please ask about our programmes, volunteering, events, or social impact."
    )
    return translate_text(english_msg, lang)
587
+
588
+
589
+ # =============================================================================
590
+ # SEMANTIC UNDERSTANDING ENGINE - Expert-Level Context Analysis
591
+ # =============================================================================
592
+
593
# Intent templates for semantic matching. Each intent name maps to example
# user phrasings; get_intent_embeddings() encodes these once, and
# semantic_intent_classification() picks the intent whose template is most
# cosine-similar to the incoming query.
INTENT_TEMPLATES = {
    "about_organization": [
        "What is Sahayak?",
        "Tell me about Sahayak organization",
        "Describe Sahayak NGO",
        "What does Sahayak do?",
        "Explain Sahayak organization"
    ],
    "founder_leadership": [
        "Who founded Sahayak?",
        "Who is the founder of Sahayak?",
        "Who started Sahayak?",
        "Tell me about the founder",
        "Who is Verril Vaz?"
    ],
    "president_leadership": [
        "Who is the president?",
        "Who leads Sahayak?",
        "Who is the current president?",
        "Tell me about the leadership team"
    ],
    "team_members": [
        "Who are the team members?",
        "How many members does Sahayak have?",
        "Tell me about the team",
        "Who works at Sahayak?"
    ],
    "mission_vision": [
        "What is the mission of Sahayak?",
        "What is the vision?",
        "What are the goals?",
        "What are the objectives?",
        "What does Sahayak aim to achieve?",
        "What is the purpose of Sahayak?"
    ],
    "initiatives_programs": [
        "What programs does Sahayak run?",
        "What are Sahayak's initiatives?",
        "Tell me about the activities",
        "What projects has Sahayak done?",
        "What kind of work does Sahayak do?"
    ],
    "recent_events": [
        "What are the recent events?",
        "What activities have been conducted recently?",
        "What has Sahayak done recently?",
        "Tell me about recent visits"
    ],
    "location_branches": [
        "Where is Sahayak located?",
        "How many branches does Sahayak have?",
        "Where does Sahayak operate?",
        "What cities does Sahayak work in?"
    ],
    "join_volunteer": [
        "How can I join Sahayak?",
        "How to volunteer?",
        "How to become a member?",
        "How can I help?",
        "How to get involved?"
    ],
    "donate_support": [
        "How to donate?",
        "How can I support Sahayak?",
        "How to contribute?",
        "Where can I donate?"
    ],
    "contact_info": [
        "How to contact Sahayak?",
        "What is the phone number?",
        "What is the email?",
        "How to reach Sahayak?"
    ],
    "founding_date": [
        "When was Sahayak founded?",
        "When did Sahayak start?",
        "What is the founding date?",
        "How old is Sahayak?"
    ],
    # Conversational intents below map to canned/short-circuit handling.
    "greeting": [
        "Hello",
        "Hi",
        "Hey",
        "Good morning",
        "Namaste"
    ],
    "thanks": [
        "Thank you",
        "Thanks",
        "Thanks a lot",
        "Thank you so much"
    ],
    "affirmative": [
        "Yes",
        "Sure",
        "Ok",
        "Okay",
        "Tell me more",
        "Continue"
    ]
}
695
+
696
# Module-level cache: intent name -> {"templates", "embeddings", "centroid"}.
# Populated lazily by get_intent_embeddings() on first call; empty dict means
# "not yet computed".
_intent_embeddings_cache = {}
698
+
699
def get_intent_embeddings():
    """Return the intent-embedding cache, computing it lazily on first use.

    Each entry maps an intent name to its raw templates, their embeddings,
    and the mean (centroid) embedding. On failure the cache may be left
    partially filled or empty; callers must handle an empty dict.
    """
    global _intent_embeddings_cache

    # Fast path: already populated on a previous call.
    if _intent_embeddings_cache:
        return _intent_embeddings_cache

    try:
        model = get_embedder()
        for intent_name, phrasings in INTENT_TEMPLATES.items():
            vectors = model.encode(phrasings)
            _intent_embeddings_cache[intent_name] = {
                "templates": phrasings,
                "embeddings": vectors,
                "centroid": np.mean(vectors, axis=0)
            }
        logger.info(f"Intent embeddings computed for {len(_intent_embeddings_cache)} intents")
    except Exception as e:
        logger.error(f"Error computing intent embeddings: {e}")

    return _intent_embeddings_cache
721
+
722
def semantic_intent_classification(query):
    """
    Classify user intent using semantic similarity with embeddings.

    Compares the query embedding against every intent template embedding
    and returns ``(best_intent, best_score)`` where the score is the
    cosine similarity of the single best-matching template. Falls back to
    ``("general", 0.0)`` when embeddings are unavailable or on error.
    """
    try:
        embedder = get_embedder()
        query_embedding = embedder.encode([query])[0]

        intent_embeddings = get_intent_embeddings()
        if not intent_embeddings:
            return "general", 0.0

        best_intent = "general"
        best_score = 0.0
        best_template = ""
        # Hoist the query norm; it is constant across all comparisons.
        query_norm = np.linalg.norm(query_embedding)

        for intent, data in intent_embeddings.items():
            # NOTE: a centroid similarity was previously computed here but
            # never used in the decision (dead work per intent) — removed.
            # Only the best individual template match drives the result.
            for i, template_emb in enumerate(data["embeddings"]):
                template_similarity = np.dot(query_embedding, template_emb) / (
                    query_norm * np.linalg.norm(template_emb)
                )
                if template_similarity > best_score:
                    best_score = template_similarity
                    best_intent = intent
                    best_template = data["templates"][i]

        logger.info(f"Semantic intent: {best_intent} (score: {best_score:.3f}, matched: '{best_template}')")
        return best_intent, best_score

    except Exception as e:
        logger.error(f"Intent classification error: {e}")
        return "general", 0.0
762
+
763
+
764
def semantic_passage_ranking(query, passages, top_k=5):
    """
    Rank passages by cosine similarity to the query.

    Returns a list of ``(passage, score)`` pairs in decreasing similarity
    order; on error, the first ``top_k`` passages each with a neutral 0.5.
    """
    try:
        model = get_embedder()
        q_vec = model.encode([query])[0]
        p_vecs = model.encode(passages)

        q_norm = np.linalg.norm(q_vec)
        scored = []
        for position, p_vec in enumerate(p_vecs):
            cosine = np.dot(q_vec, p_vec) / (q_norm * np.linalg.norm(p_vec))
            scored.append((position, cosine, passages[position]))

        scored.sort(key=lambda item: item[1], reverse=True)

        return [(text, score) for _, score, text in scored[:top_k]]

    except Exception as e:
        logger.error(f"Passage ranking error: {e}")
        return [(p, 0.5) for p in passages[:top_k]]
790
+
791
+
792
def analyze_query_complexity(query):
    """
    Analyze query complexity to determine response strategy.

    Returns a dict of boolean features plus an integer "score" and a
    "level" of "simple" (<2), "moderate" (<4), or "complex".
    """
    lowered = query.lower()
    tokens = query.split()
    complexity = {
        "word_count": len(tokens),
        "is_question": query.strip().endswith('?'),
        # Substring checks by design (so "android" trips 'and', etc.).
        "has_multiple_parts": any(marker in query for marker in (',', 'and', 'or', 'also')),
        "is_comparison": any(w in lowered for w in ('compare', 'difference', 'between', 'vs', 'versus')),
        "is_list_request": any(w in lowered for w in ('list', 'all', 'every', 'each', 'various')),
        "is_explanation": any(w in lowered for w in ('why', 'how', 'explain', 'describe', 'elaborate')),
        "is_specific": any(w in lowered for w in ('specific', 'exactly', 'particular', 'precise')),
    }

    # Weighted score: comparisons count double; question-ness and
    # specificity are informational only and do not affect the score.
    weighted_flags = (
        (complexity["word_count"] > 10, 1),
        (complexity["has_multiple_parts"], 1),
        (complexity["is_comparison"], 2),
        (complexity["is_list_request"], 1),
        (complexity["is_explanation"], 1),
    )
    score = sum(points for flag, points in weighted_flags if flag)

    complexity["score"] = score
    if score < 2:
        complexity["level"] = "simple"
    elif score < 4:
        complexity["level"] = "moderate"
    else:
        complexity["level"] = "complex"

    return complexity
824
+
825
+
826
+ # =============================================================================
827
+ # EXPERT REASONING ENGINE - Chain of Thought for Better Responses
828
+ # =============================================================================
829
+
830
def extract_key_entities(text):
    """Extract key entities like names, dates, places, and numbers from text.

    Returns a dict of entity-category -> list of matched substrings, using
    fixed case-insensitive regex patterns (digit-only patterns are
    unaffected by the flag).
    """
    patterns = {
        "dates": r'\b\d{1,2}(?:st|nd|rd|th)?\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b',
        "years": r'\b(?:19|20)\d{2}\b',
        "numbers": r'\b\d+(?:\.\d+)?\s*(?:members|people|students|children|rupees|rs|lakh|crore|percent|%)\b',
        "places": r'\b(?:Belgaum|Bangalore|Karnataka|India|Delhi|Mumbai)\b',
        "organizations": r'\b(?:Sahayak|Sparsh Foundation|Maheshwari School|Anand Yatri|Balika Adarsh Vidyalaya)\b',
    }
    return {label: re.findall(rx, text, re.IGNORECASE) for label, rx in patterns.items()}
840
+
841
+
842
def calculate_passage_relevance(passage, query):
    """Fraction of distinct query words that also appear in the passage.

    Case-insensitive, whitespace-tokenized bag-of-words overlap in [0.0, 1.0];
    an empty query yields 0.0.
    """
    query_terms = set(query.lower().split())
    if not query_terms:
        return 0.0
    passage_terms = set(passage.lower().split())
    shared = query_terms & passage_terms
    return len(shared) / len(query_terms)
850
+
851
+
852
def expert_reasoning_chain(query, passages, qa_result):
    """
    Expert system reasoning chain that:
    1. Uses semantic intent classification (not keyword matching)
    2. Evaluates evidence from multiple passages with semantic similarity
    3. Synthesizes a well-reasoned response
    4. Provides confidence assessment based on multiple factors

    Args:
        query: user query string (English).
        passages: list of passage strings, most relevant first.
        qa_result: dict from the QA model; only "score" is read here.

    Returns:
        Dict with keys "intent", "evidence", "confidence",
        "confidence_level", and "reasoning_steps" (human-readable trace).
    """
    reasoning_steps = []

    # Step 1: Semantic Intent Analysis (using embeddings)
    intent, intent_confidence = semantic_intent_classification(query)

    # Map semantic intents to response intents.
    # NOTE(review): response_intent is only used in the log/trace line below;
    # the returned dict carries the raw semantic intent, and callers such as
    # build_expert_response key off that — confirm this is intentional.
    intent_mapping = {
        "about_organization": "definitional",
        "founder_leadership": "person",
        "president_leadership": "person",
        "team_members": "person",
        "mission_vision": "definitional",
        "initiatives_programs": "information",
        "recent_events": "temporal",
        "location_branches": "location",
        "join_volunteer": "procedural",
        "donate_support": "procedural",
        "contact_info": "information",
        "founding_date": "temporal",
        "greeting": "greeting",
        "thanks": "greeting",
        "affirmative": "confirmation",
        "general": "information"
    }

    response_intent = intent_mapping.get(intent, "information")
    reasoning_steps.append(f"Semantic Intent: {intent} -> {response_intent} (confidence: {intent_confidence:.2f})")

    # Step 2: Semantic Evidence Gathering — rank the top 10 passages and keep
    # those above a similarity threshold, truncated to 300 chars each.
    evidence_pieces = []
    try:
        # Use semantic ranking for evidence
        ranked = semantic_passage_ranking(query, passages[:10], top_k=5)
        for passage, score in ranked:
            if score > 0.25:  # Semantic similarity threshold
                entities = extract_key_entities(passage)
                evidence_pieces.append({
                    "passage": passage[:300],
                    "semantic_score": score,
                    "entities": entities
                })
    except Exception as e:
        # Fallback to simple word-overlap relevance when embedding ranking
        # fails (e is intentionally unused; any failure triggers fallback).
        for i, passage in enumerate(passages[:5]):
            relevance = calculate_passage_relevance(passage, query)
            if relevance > 0.1:
                evidence_pieces.append({
                    "passage": passage[:300],
                    "semantic_score": relevance,
                    "entities": extract_key_entities(passage)
                })

    reasoning_steps.append(f"Semantic evidence pieces: {len(evidence_pieces)}")

    # Step 3: Multi-factor Confidence Assessment — QA score plus bounded
    # boosts from intent confidence and evidence count, capped at 1.0.
    base_confidence = qa_result.get("score", 0.0)
    intent_boost = intent_confidence * 0.2  # Intent confidence contributes
    evidence_boost = min(len(evidence_pieces) * 0.1, 0.3)

    # Calculate final confidence
    final_confidence = min(base_confidence + intent_boost + evidence_boost, 1.0)

    confidence_level = "low"
    if final_confidence > 0.7:
        confidence_level = "high"
    elif final_confidence > 0.4:
        confidence_level = "medium"

    reasoning_steps.append(f"Confidence: {confidence_level} ({final_confidence:.2f})")

    return {
        "intent": intent,
        "evidence": evidence_pieces,
        "confidence": final_confidence,
        "confidence_level": confidence_level,
        "reasoning_steps": reasoning_steps
    }
937
+
938
+
939
def clean_answer_fragment(answer, context_passages):
    """
    Expand a fragmentary QA-model answer into the full sentence containing it.

    Searches the first three context passages: first a case-sensitive pass for
    answers that look truncated (lowercase start or a dangling-article prefix),
    then a case-insensitive pass for very short answers without terminal
    punctuation. Returns the answer unchanged if no enclosing sentence is
    found, or None for empty input.
    """
    if not answer:
        return None

    answer = answer.strip()

    # Prefixes that signal the model returned a mid-sentence fragment.
    fragment_prefixes = ("the ", "a ", "an ", "is ", "are ", "was ", "were ", "has ", "have ",
                         "it ", "its ", "their ", "this ", "that ", "these ", "those ",
                         "and ", "or ", "but ", "with ", "for ", "to ", "in ", "on ", "at ")

    def _containing_sentence(needle, fold_case):
        # First sentence of the first passage (of the top 3) holding the needle.
        for passage in context_passages[:3]:
            hay = passage.lower() if fold_case else passage
            probe = needle.lower() if fold_case else needle
            if probe in hay:
                for sent in re.split(r'(?<=[.!?])\s+', passage):
                    target = sent.lower() if fold_case else sent
                    if probe in target:
                        return sent.strip()
        return None

    looks_truncated = answer and (
        answer[0].islower() or answer.lower().startswith(fragment_prefixes)
    )
    if looks_truncated:
        found = _containing_sentence(answer, fold_case=False)
        if found:
            return found

    if len(answer.split()) < 4 and not answer.endswith(('.', '!', '?')):
        found = _containing_sentence(answer, fold_case=True)
        if found:
            return found

    return answer
974
+
975
+
976
def synthesize_answer_from_passages(query, passages, qa_answer, intent):
    """
    Synthesize a comprehensive, well-structured answer from passages.
    This creates complete, coherent responses instead of fragments.

    Args:
        query: user query string.
        passages: ranked passage strings; only the first few are scanned.
        qa_answer: raw (possibly fragmentary) QA-model answer used as fallback.
        intent: response intent ("person", "definitional", "location",
            "procedural", or anything else for the default path).

    Returns:
        A string built by joining the first few matching sentences for the
        given intent, or a cleaned/fallback answer when no branch matches.
    """
    # NOTE(review): query_lower is assigned but never used below — candidate
    # for removal; kept here since doc-only change.
    query_lower = query.lower()

    # Extract key information based on intent; each branch scans up to 5
    # passages, keeps sentences matching intent-specific cue words, and
    # returns the first 3 (4 for procedural) joined with spaces.
    if intent == "person":
        # Look for names, roles, titles
        person_info = []
        for passage in passages[:5]:
            if any(term in passage.lower() for term in ["founder", "president", "leader", "team", "member", "secretary"]):
                sentences = re.split(r'(?<=[.!?])\s+', passage)
                for sent in sentences:
                    if any(term in sent.lower() for term in ["founder", "president", "verril", "vaz", "leader", "team"]):
                        person_info.append(sent.strip())
        if person_info:
            return " ".join(person_info[:3])

    elif intent == "definitional":
        # Look for definitions, descriptions (sentences mentioning "sahayak"
        # longer than 30 chars, to skip trivial mentions)
        definitions = []
        for passage in passages[:5]:
            if "sahayak" in passage.lower():
                sentences = re.split(r'(?<=[.!?])\s+', passage)
                for sent in sentences:
                    if "sahayak" in sent.lower() and len(sent) > 30:
                        definitions.append(sent.strip())
        if definitions:
            return " ".join(definitions[:3])

    elif intent == "location":
        # Look for place-related information
        location_info = []
        for passage in passages[:5]:
            if any(place in passage.lower() for place in ["belgaum", "bangalore", "karnataka", "india", "location", "branch"]):
                sentences = re.split(r'(?<=[.!?])\s+', passage)
                for sent in sentences:
                    if any(place in sent.lower() for place in ["belgaum", "bangalore", "operates", "location", "branch"]):
                        location_info.append(sent.strip())
        if location_info:
            return " ".join(location_info[:3])

    elif intent == "procedural":
        # Look for how-to, process information; any sentence > 20 chars from a
        # passage that mentions a process cue word is kept
        process_info = []
        for passage in passages[:5]:
            if any(term in passage.lower() for term in ["how", "step", "process", "register", "join", "volunteer", "donate", "contact"]):
                sentences = re.split(r'(?<=[.!?])\s+', passage)
                for sent in sentences:
                    if len(sent) > 20:
                        process_info.append(sent.strip())
        if process_info:
            return " ".join(process_info[:4])

    # Default: return cleaned QA answer or first relevant passage
    cleaned = clean_answer_fragment(qa_answer, passages)
    if cleaned and len(cleaned) > 20:
        return cleaned

    # Fallback to first passage sentences
    if passages:
        sentences = re.split(r'(?<=[.!?])\s+', passages[0])
        return " ".join(sentences[:3])

    return qa_answer
1043
+
1044
+
1045
def build_expert_response(qa_answer, reasoning, query, passages):
    """
    Build a comprehensive, well-formatted expert response with:
    - Clear, complete answer (not fragments)
    - Organized sections with proper headers
    - Supporting details
    - Helpful follow-up suggestions

    Args:
        qa_answer: raw QA-model answer string (fallback text).
        reasoning: dict from expert_reasoning_chain ("intent", "evidence",
            "confidence_level" are read here).
        query: user query string.
        passages: ranked passage strings used for synthesis and contact scan.

    Returns:
        A markdown-formatted response string.
    """
    intent = reasoning["intent"]
    # NOTE(review): confidence is read but not rendered anywhere below.
    confidence = reasoning["confidence_level"]
    evidence = reasoning["evidence"]

    # First, synthesize a proper answer from fragments
    synthesized_answer = synthesize_answer_from_passages(query, passages, qa_answer, intent)

    # Clean up the answer to ensure it's a complete sentence
    if synthesized_answer:
        synthesized_answer = clean_answer_fragment(synthesized_answer, passages) or synthesized_answer

    response_parts = []

    # Section 1: Main Answer with clear header based on intent.
    # NOTE(review): keys here are response-intent names, but `intent` holds
    # the raw semantic intent (e.g. "founder_leadership"), so most queries
    # fall through to the "information" default header — confirm intended.
    intent_headers = {
        "person": "👤 **About the Person/Team**",
        "temporal": "📅 **Timeline & Dates**",
        "location": "📍 **Location Information**",
        "quantitative": "📊 **Facts & Figures**",
        "procedural": "📋 **How It Works**",
        "explanatory": "💡 **Explanation**",
        "definitional": "📖 **Overview**",
        "information": "ℹ️ **Information**"
    }

    header = intent_headers.get(intent, intent_headers["information"])
    response_parts.append(f"{header}\n\n")

    # Add the main answer
    if synthesized_answer and len(synthesized_answer) > 10:
        # Ensure answer starts with capital letter and ends properly
        answer_text = synthesized_answer.strip()
        if answer_text and answer_text[0].islower():
            answer_text = answer_text[0].upper() + answer_text[1:]
        if not answer_text.endswith(('.', '!', '?')):
            answer_text += "."
        response_parts.append(f"{answer_text}\n")
    else:
        response_parts.append(f"{qa_answer}\n")

    # Section 2: Key Details (up to 3 deduplicated sentences extracted from
    # the top evidence passages)
    if evidence and len(evidence) > 0:
        response_parts.append("\n**Key Details:**\n")

        added_details = set()
        for ev in evidence[:3]:
            passage = ev["passage"]
            sentences = re.split(r'(?<=[.!?])\s+', passage)

            for sent in sentences[:2]:
                sent = sent.strip()
                # Only add if it's substantial and not duplicate
                if len(sent) > 30 and sent not in added_details:
                    # Clean up the sentence
                    if sent[0].islower():
                        sent = sent[0].upper() + sent[1:]
                    if not sent.endswith(('.', '!', '?')):
                        sent += "."
                    response_parts.append(f"• {sent}\n")
                    added_details.add(sent)
                if len(added_details) >= 3:
                    break
            if len(added_details) >= 3:
                break

    # Section 3: Contact/Action info if relevant (first passage with a
    # phone-number or URL match wins)
    if intent == "procedural" or any(word in query.lower() for word in ["join", "contact", "call", "volunteer", "donate"]):
        contact_info = None
        for passage in passages[:5]:
            if "phone" in passage.lower() or "contact" in passage.lower() or "website" in passage.lower():
                # Extract contact details
                phone_match = re.search(r'\+?\d{2,3}[-\s]?\d{3}[-\s]?\d{3}[-\s]?\d{4}', passage)
                website_match = re.search(r'https?://[\w\./]+', passage)
                if phone_match or website_match:
                    response_parts.append("\n**📞 Contact Information:**\n")
                    if phone_match:
                        response_parts.append(f"• Phone: {phone_match.group()}\n")
                    if website_match:
                        response_parts.append(f"• Website: {website_match.group()}\n")
                    break

    # Section 4: Helpful follow-up (context-aware)
    follow_ups = {
        "person": "\n💬 *Would you like to know about specific team members, their roles, or how to contact them?*",
        "temporal": "\n💬 *Would you like to know about upcoming events or past milestones?*",
        "location": "\n💬 *Would you like directions or information about activities at a specific location?*",
        "quantitative": "\n💬 *Would you like more details about Sahayak's impact and achievements?*",
        "procedural": "\n💬 *Need help with the next steps? I can provide more detailed guidance.*",
        "explanatory": "\n💬 *Would you like me to explain any specific aspect in more detail?*",
        "definitional": "\n💬 *Would you like to learn about specific programs, events, or initiatives?*",
        "information": "\n💬 *What else would you like to know about Sahayak Organization?*"
    }

    response_parts.append(follow_ups.get(intent, follow_ups["information"]))

    return "".join(response_parts)
1149
+
1150
+
1151
def append_context_to_answer(answer, query, passages):
    """Enhanced context integration using the expert reasoning chain.

    Returns the raw answer untouched when no passages are available;
    otherwise runs the reasoning chain and formats an expert response.
    """
    if not passages:
        return answer

    # Neutral default score, since no real QA confidence is available here.
    default_qa = {"answer": answer, "score": 0.5}
    reasoning = expert_reasoning_chain(query, passages, default_qa)

    logger.info(f"Expert reasoning: {reasoning['reasoning_steps']}")

    return build_expert_response(answer, reasoning, query, passages)
1163
+
1164
+
1165
+ # =============================================================================
1166
+ # MAIN ANSWER FUNCTION
1167
+ # =============================================================================
1168
+
1169
+ def answer_question(query, passages, vector_index, embeddings):
1170
+ """Answer a question with expert-level reasoning, retrieval, and detailed responses"""
1171
+ try:
1172
+ meta_responses = {
1173
+ "hi": "Hello! I'm Vexa, your assistant for Sahayak Organization.\n\n"
1174
+ "Sahayak is a non-profit dedicated to making a positive impact in the community by supporting underprivileged groups.\n"
1175
+ "I can help you learn more about our mission, goals, leadership, and activities.\n"
1176
+ "What would you like to know about Sahayak Organization today?",
1177
+ "hello": "Hi there! I'm Vexa, here to assist you with information about Sahayak Organization.\n\n"
1178
+ "We are a non-profit focused on addressing social issues and empowering communities through education and support.\n"
1179
+ "I can provide details about our vision, projects, or team members.\n"
1180
+ "How can I help you today?",
1181
+ "hey": "Hey! I'm Vexa, your guide to Sahayak Organization.\n\n"
1182
+ "Sahayak works to support those in need, focusing on education, inclusion, and community development.\n"
1183
+ "I'm here to answer any questions you have about our work, leadership, or goals.\n"
1184
+ "What would you like to explore?",
1185
+ "explain me about sahayak": "Sahayak is a non-profit organization committed to providing support and assistance to those in need.\n\n"
1186
+ "Our primary focus is on addressing various social issues, such as education, health, and community development.\n"
1187
+ "We aim to make a lasting positive impact by empowering underprivileged communities through dedicated programs.\n"
1188
+ "Additionally, Sahayak collaborates with local leaders and volunteers to ensure our initiatives are effective.\n"
1189
+ "Would you like to know more about our specific projects or leadership team?",
1190
+ "tell me about sahayak": "Sahayak Organization is a non-profit dedicated to helping those in need across various communities.\n\n"
1191
+ "We focus on tackling social issues like lack of access to education, healthcare challenges, and economic disparities.\n"
1192
+ "Our mission is to empower underprivileged groups by providing resources, support, and opportunities for growth.\n"
1193
+ "Sahayak operates in multiple locations, including Belgaum and Bangalore, to maximize our impact.\n"
1194
+ "If you'd like, I can share more about our goals or the team behind our efforts!",
1195
+ "brief me about sahayak": "Sahayak is a non-profit organization devoted to supporting underprivileged communities.\n\n"
1196
+ "Our work centers on addressing key social issues, such as education, health, and social inclusion.\n"
1197
+ "We strive to create a positive impact by offering resources and programs that empower individuals and families.\n"
1198
+ "With operations in places like Belgaum and Bangalore, we aim to reach as many people as possible.\n"
1199
+ "Let me know if you'd like to dive deeper into our mission or activities!",
1200
+ "goals of sahayak": "The goals of Sahayak Organization are centered on creating a better future for underprivileged communities.\n\n"
1201
+ "We aim to provide support by addressing key challenges like access to education and healthcare.\n"
1202
+ "Another goal is to raise awareness about social issues that affect marginalized groups.\n"
1203
+ "Additionally, we promote initiatives that improve health and education outcomes for children and families.\n"
1204
+ "Would you like to learn more about how we achieve these goals through our programs?",
1205
+ "vision": "Sahayak Organisation envisions a society where education is accessible to every child, regardless of their socio-economic background.\n\n"
1206
+ "We believe that education is a fundamental right that can transform lives and uplift entire communities.\n"
1207
+ "Our vision drives us to work tirelessly to remove barriers and create opportunities for learning.\n"
1208
+ "By doing so, we hope to build a more equitable and inclusive society for future generations.\n"
1209
+ "Would you like to know more about our specific initiatives in education?",
1210
+ "mission of sahayak": "Sahayak's mission is to empower underprivileged communities through education, inclusion, and dedicated care.\n\n"
1211
+ "We strive to provide resources and support to those who need it most, ensuring they have access to opportunities.\n"
1212
+ "Our focus is on creating sustainable change by addressing systemic issues like poverty and lack of education.\n"
1213
+ "Through our programs, we aim to foster a sense of community and hope for a better future.\n"
1214
+ "Let me know if you'd like more details about our mission-driven projects!",
1215
+ "vision of sahayak": "Sahayak Organisation envisions a society where education is accessible to every child, regardless of their socio-economic background.\n\n"
1216
+ "We believe that education is a fundamental right that can transform lives and uplift entire communities.\n"
1217
+ "Our vision drives us to work tirelessly to remove barriers and create opportunities for learning.\n"
1218
+ "By doing so, we hope to build a more equitable and inclusive society for future generations.\n"
1219
+ "Would you like to know more about our specific initiatives in education?",
1220
+ "who created you": "I was created by B Chaitanya Reddy, a dedicated developer passionate about using technology for social good.\n\n"
1221
+ "Chaitanya built me to assist users in learning more about Sahayak Organization and its impactful work.\n"
1222
+ "My purpose is to provide accurate and detailed information to help you understand Sahayak's mission and activities.\n"
1223
+ "Thanks to Chaitanya's efforts, I can answer your questions and guide you through Sahayak's initiatives.\n"
1224
+ "What else would you like to know about me or Sahayak?",
1225
+ "who developed you": "I was developed by B Chaitanya Reddy, a skilled developer who wanted to support Sahayak Organization's mission.\n\n"
1226
+ "Chaitanya designed me to be a helpful tool for anyone seeking information about Sahayak's work and goals.\n"
1227
+ "My role is to provide detailed answers and insights into the organization's efforts to make a difference.\n"
1228
+ "I'm here to assist you with any questions you have about Sahayak or its initiatives.\n"
1229
+ "What would you like to explore next?",
1230
+ "who is the founder of sahayak": "The founder of Sahayak Organisation is Verril Vaz.\n\n"
1231
+ "Verril Vaz is a visionary leader who established Sahayak with the mission to uplift underprivileged communities through education and social support.\n"
1232
+ "Under his leadership, Sahayak has grown to operate in multiple locations, including Belgaum and Bangalore.\n",
1233
+ "founder of sahayak": "The founder of Sahayak Organisation is Verril Vaz.\n\n"
1234
+ "Verril Vaz is a visionary leader who established Sahayak with the mission to uplift underprivileged communities through education and social support.\n"
1235
+ "Under his leadership, Sahayak has grown to operate in multiple locations, including Belgaum and Bangalore.\n",
1236
+ "who founded sahayak": "The founder of Sahayak Organisation is Verril Vaz.\n\n"
1237
+ "Verril Vaz is a visionary leader who established Sahayak with the mission to uplift underprivileged communities through education and social support.\n"
1238
+ "Under his leadership, Sahayak has grown to operate in multiple locations, including Belgaum and Bangalore.\n",
1239
+ "who is verril vaz": "**👤 About Verril Vaz**\n\n"
1240
+ "Verril Vaz is the **Founder and President** of Sahayak Organisation.\n\n"
1241
+ "**Key Facts:**\n"
1242
+ "• Visionary leader who believes in grassroots change\n"
1243
+ "• Established Sahayak on May 21, 2024\n"
1244
+ "• Leads a passionate team of 45 young members\n"
1245
+ "• Focuses on education and community empowerment\n\n"
1246
+ "**His Vision:**\n"
1247
+ "Under his leadership, Sahayak has grown to operate in Belgaum and Bangalore, helping underprivileged communities through education and social support.\n\n"
1248
+ "Would you like to know more about the leadership team or Sahayak's activities?",
1249
+ "verril vaz": "**👤 About Verril Vaz**\n\n"
1250
+ "Verril Vaz is the **Founder and President** of Sahayak Organisation.\n\n"
1251
+ "**Key Facts:**\n"
1252
+ "• Visionary leader who believes in grassroots change\n"
1253
+ "• Established Sahayak on May 21, 2024\n"
1254
+ "• Leads a passionate team of 45 young members\n\n"
1255
+ "Under his leadership, Sahayak operates in Belgaum and Bangalore, helping communities through education and support.\n\n"
1256
+ "Would you like to know more about the leadership team?",
1257
+ "how does sahayak organization works": "Sahayak Organisation operates as a non-profit dedicated to uplifting underprivileged communities through education, social support, and community development.\nIt functions by Implementing Educational Initiatives:\nSahayak focuses heavily on education, providing academic mentorship and resources to ensure access to learning opportunities, particularly for children from marginalized backgrounds.\nProviding Social and Emotional Support: The organization acts as a 'helper' (as its name suggests in Hindi), offering emotional and social assistance to those in need, fostering inclusion and empowerment.\nOperating in Key Locations: Sahayak runs its programs primarily in Belgaum and Bangalore, collaborating with local special schools and healthcare providers to maximize impact.\nLeveraging a Youthful Team: Powered by a team of 45 core members, Sahayak is led by young, passionate leaders like founder Verril Vaz.\nThis team brings empathy, innovation, and administrative skills to execute initiatives efficiently.\nHolistic Approach: Sahayak bridges the gap between potential and opportunity by addressing systemic issues like poverty and lack of education, ensuring sustainable change through targeted programs.",
1258
+ "what are the recent activities conducted by the sahayak": "1)Sahayak had a great visit to Anand Yatri Old Age Home on August 3 2024, where all our members had the best experience and tears of happiness in our eyes. Making all the members happy of Anand Yatri was our main motive.They cheris to remember us each and everyday.\n"
1259
+ "2)We visited the Maheshwari School for the Blind on October 25 2024 with a great motive to make all the students realize that being blind they can do immense miracles on this world. A heartfelt thanks to the Principal of Maheshwari School for providing us this wonderful opportunity.\n"
1260
+ "3)Team Sahayak visited Sparsh Foundation on december 19 2024 which is a school for all the specially disabled children and provides Education for completely free. Our team had a very great experience meeting the students and spending time with them.\n"
1261
+ "4) Team Sahayak visited Balika Adarsh Vidyalaya on 4th february 2025 and conducted various sessions such Technical Awareness, Social Media Awareness, Career Guidance and Nutrition which were the best topics decided by our team. The sessions were really helpful to all the students and everyone enjoyed it to the fullest.\n",
1262
+ "what are the initiatives took by sahayak": "1)Sahayak had a great visit to Anand Yatri Old Age Home on August 3 2024, where all our members had the best experience and tears of happiness in our eyes. Making all the members happy of Anand Yatri was our main motive.They cheris to remember us each and everyday.\n"
1263
+ "2)We visited the Maheshwari School for the Blind on October 25 2024 with a great motive to make all the students realize that being blind they can do immense miracles on this world. A heartfelt thanks to the Principal of Maheshwari School for providing us this wonderful opportunity.\n"
1264
+ "3)Team Sahayak visited Sparsh Foundation on december 19 2024 which is a school for all the specially disabled children and provides Education for completely free. Our team had a very great experience meeting the students and spending time with them.\n"
1265
+ "4) Team Sahayak visited Balika Adarsh Vidyalaya on 4th february 2025 and conducted various sessions such Technical Awareness, Social Media Awareness, Career Guidance and Nutrition which were the best topics decided by our team. The sessions were really helpful to all the students and everyone enjoyed it to the fullest.\n",
1266
+ "what are the recent events conducted by the sahayak": "1)Sahayak had a great visit to Anand Yatri Old Age Home on August 3 2024, where all our members had the best experience and tears of happiness in our eyes. Making all the members happy of Anand Yatri was our main motive.They cheris to remember us each and everyday.\n"
1267
+ "2)We visited the Maheshwari School for the Blind on October 25 2024 with a great motive to make all the students realize that being blind they can do immense miracles on this world. A heartfelt thanks to the Principal of Maheshwari School for providing us this wonderful opportunity.\n"
1268
+ "3)Team Sahayak visited Sparsh Foundation on december 19 2024 which is a school for all the specially disabled children and provides Education for completely free. Our team had a very great experience meeting the students and spending time with them.\n"
1269
+ "4) Team Sahayak visited Balika Adarsh Vidyalaya on 4th february 2025 and conducted various sessions such Technical Awareness, Social Media Awareness, Career Guidance and Nutrition which were the best topics decided by our team. The sessions were really helpful to all the students and everyone enjoyed it to the fullest.\n",
1270
+ "when was the sahayak started": "Sahayak Organisation was started on 21 May 2024.\n\n"
1271
+ "Since its inception, Sahayak has been dedicated to uplifting underprivileged communities through education and social support.\n"
1272
+ "Would you like to know more about our founding story or current initiatives?",
1273
+ # Handle common affirmative follow-ups
1274
+ "yes": "Great! Here are some topics I can help you explore:\n\n"
1275
+ "📋 **Programs & Initiatives**\n"
1276
+ "• Educational programs for underprivileged children\n"
1277
+ "• Visits to old age homes and special schools\n"
1278
+ "• Awareness sessions on health, career, and technology\n\n"
1279
+ "👥 **Team & Leadership**\n"
1280
+ "• Founder: Verril Vaz\n"
1281
+ "• Team of 45 passionate young members\n\n"
1282
+ "📍 **Locations & Contact**\n"
1283
+ "• Operating in Belgaum and Bangalore\n"
1284
+ "• Phone: +91-123-456-7890\n"
1285
+ "• Website: https://www.sahayak.org\n\n"
1286
+ "What specific topic would you like to know more about?",
1287
+ "sure": "Great! Here are some topics I can help you explore:\n\n"
1288
+ "📋 **Programs & Initiatives**\n"
1289
+ "• Educational programs for underprivileged children\n"
1290
+ "• Visits to old age homes and special schools\n"
1291
+ "• Awareness sessions on health, career, and technology\n\n"
1292
+ "👥 **Team & Leadership**\n"
1293
+ "• Founder: Verril Vaz\n"
1294
+ "• Team of 45 passionate young members\n\n"
1295
+ "📍 **Locations & Contact**\n"
1296
+ "• Operating in Belgaum and Bangalore\n\n"
1297
+ "What specific topic would you like to know more about?",
1298
+ "ok": "I'm here to help! You can ask me about:\n\n"
1299
+ "• Sahayak's mission and vision\n"
1300
+ "• Our educational programs and initiatives\n"
1301
+ "• Recent activities and events\n"
1302
+ "• Team members and leadership\n"
1303
+ "• How to volunteer or donate\n"
1304
+ "• Contact information and locations\n\n"
1305
+ "What would you like to know?",
1306
+ "tell me more": "Here's more about Sahayak Organization:\n\n"
1307
+ "**🎯 Our Mission**\n"
1308
+ "Sahayak is dedicated to empowering underprivileged communities through education, inclusion, and dedicated care.\n\n"
1309
+ "**📚 Key Programs**\n"
1310
+ "1. Educational mentorship for underserved students\n"
1311
+ "2. Support for visually impaired and specially-abled children\n"
1312
+ "3. Assistance to senior citizens at old age homes\n"
1313
+ "4. Awareness sessions on technology, career, and health\n\n"
1314
+ "**🏆 Recent Achievements**\n"
1315
+ "• Visited Anand Yatri Old Age Home (August 2024)\n"
1316
+ "• Conducted sessions at Maheshwari School for the Blind (October 2024)\n"
1317
+ "• Partnered with Sparsh Foundation (December 2024)\n"
1318
+ "• Organized awareness programs at Balika Adarsh Vidyalaya (February 2025)\n\n"
1319
+ "Which aspect would you like to explore further?",
1320
+ "what is sahayak": "**📖 What is Sahayak?**\n\n"
1321
+ "Sahayak (meaning 'helper' in Hindi) is a non-governmental organization (NGO) founded with the noble vision of uplifting society.\n\n"
1322
+ "**🎯 Core Focus Areas:**\n"
1323
+ "• Providing quality education to underserved students\n"
1324
+ "• Supporting senior citizens with care and companionship\n"
1325
+ "• Aiding visually impaired and differently-abled children\n\n"
1326
+ "**📍 Where We Operate:**\n"
1327
+ "Sahayak primarily operates in Belgaum and Bangalore, Karnataka, India.\n\n"
1328
+ "**👥 Our Team:**\n"
1329
+ "Led by founder Verril Vaz, Sahayak has a passionate team of 45 young leaders dedicated to making a difference.\n\n"
1330
+ "Would you like to know about our specific programs or how to get involved?",
1331
+ "initiatives": "**📋 Sahayak's Key Initiatives**\n\n"
1332
+ "**1. Educational Programs**\n"
1333
+ "• Academic mentorship for underprivileged children\n"
1334
+ "• Scholarship guidance and study resources\n"
1335
+ "• Career counseling and skill development\n\n"
1336
+ "**2. Community Outreach**\n"
1337
+ "• Regular visits to old age homes\n"
1338
+ "• Support programs for specially-abled children\n"
1339
+ "• Health and nutrition awareness campaigns\n\n"
1340
+ "**3. Recent Activities**\n"
1341
+ "• Anand Yatri Old Age Home visit (Aug 2024)\n"
1342
+ "• Maheshwari School for the Blind (Oct 2024)\n"
1343
+ "• Sparsh Foundation collaboration (Dec 2024)\n"
1344
+ "• Balika Adarsh Vidyalaya sessions (Feb 2025)\n\n"
1345
+ "Would you like details about any specific initiative?",
1346
+ "what are the initiatives of sahayak": "**📋 Sahayak's Key Initiatives**\n\n"
1347
+ "**1. Educational Programs**\n"
1348
+ "• Academic mentorship for underprivileged children\n"
1349
+ "• Scholarship guidance and study resources\n"
1350
+ "• Career counseling and skill development\n\n"
1351
+ "**2. Community Outreach**\n"
1352
+ "• Regular visits to old age homes\n"
1353
+ "• Support programs for specially-abled children\n"
1354
+ "• Health and nutrition awareness campaigns\n\n"
1355
+ "**3. Recent Activities (2024-2025)**\n"
1356
+ "• Anand Yatri Old Age Home visit (Aug 3, 2024)\n"
1357
+ "• Maheshwari School for the Blind (Oct 25, 2024)\n"
1358
+ "• Sparsh Foundation collaboration (Dec 19, 2024)\n"
1359
+ "• Balika Adarsh Vidyalaya sessions (Feb 4, 2025)\n\n"
1360
+ "Would you like details about any specific initiative?",
1361
+ "who is the president of sahayak": "**👤 President of Sahayak**\n\n"
1362
+ "The founder and president of Sahayak Organisation is **Verril Vaz**.\n\n"
1363
+ "**About Verril Vaz:**\n"
1364
+ "• Visionary leader who believes in grassroots change\n"
1365
+ "• Established Sahayak with the mission to uplift underprivileged communities\n"
1366
+ "• Leads a passionate team of 45 young members\n\n"
1367
+ "Under his leadership, Sahayak has grown to operate in multiple locations including Belgaum and Bangalore.\n\n"
1368
+ "Would you like to know more about the leadership team or Sahayak's structure?",
1369
+ "president of sahayak": "**👤 President of Sahayak**\n\n"
1370
+ "The founder and president of Sahayak Organisation is **Verril Vaz**.\n\n"
1371
+ "**About Verril Vaz:**\n"
1372
+ "• Visionary leader who believes in grassroots change\n"
1373
+ "• Established Sahayak with the mission to uplift underprivileged communities\n"
1374
+ "• Leads a passionate team of 45 young members\n\n"
1375
+ "Would you like to know more about the leadership team?",
1376
+ "how to join sahayak": "**🤝 How to Join Sahayak**\n\n"
1377
+ "**Option 1: Register Online**\n"
1378
+ "• Visit: https://www.sahayak.org/volunteer\n"
1379
+ "• Fill out the volunteer registration form\n\n"
1380
+ "**Option 2: Contact Us Directly**\n"
1381
+ "• Phone: +91-123-456-7890\n"
1382
+ "• Email through the website contact form\n\n"
1383
+ "**What We Look For:**\n"
1384
+ "• Passion for community service\n"
1385
+ "• Willingness to contribute time and skills\n"
1386
+ "• Empathy for underprivileged communities\n\n"
1387
+ "Would you like more information about volunteer opportunities?",
1388
+ "how many branches sahayak has": "**📍 Sahayak Locations**\n\n"
1389
+ "Sahayak currently operates in **2 main locations**:\n\n"
1390
+ "**1. Belgaum (Belagavi)**\n"
1391
+ "• Primary operational hub\n"
1392
+ "• Multiple community outreach programs\n\n"
1393
+ "**2. Bangalore**\n"
1394
+ "• Extended operations and programs\n"
1395
+ "• Partnership with local institutions\n\n"
1396
+ "Both locations serve as centers for educational initiatives, community support, and volunteer activities.\n\n"
1397
+ "Would you like to know about activities at a specific location?",
1398
+ # Additional variations for better matching
1399
+ "vice president": "**👤 Sahayak Leadership Team**\n\n"
1400
+ "Sahayak is led by a passionate team of young leaders:\n\n"
1401
+ "**President & Founder:** Verril Vaz\n"
1402
+ "• Visionary leader who believes in grassroots change\n\n"
1403
+ "**Core Team:**\n"
1404
+ "• 45 dedicated young members\n"
1405
+ "• Mix of empathy, administrative skills, and innovation\n"
1406
+ "• Committed to Sahayak's core values\n\n"
1407
+ "The team works together to ensure Sahayak runs efficiently while making a positive impact.\n\n"
1408
+ "Would you like to know about specific team roles or how to join?",
1409
+ "leadership": "**👥 Sahayak Leadership Team**\n\n"
1410
+ "Sahayak is led by a passionate team of young leaders:\n\n"
1411
+ "**President & Founder:** Verril Vaz\n"
1412
+ "• Visionary leader who believes in grassroots change\n"
1413
+ "• Established Sahayak on May 21, 2024\n\n"
1414
+ "**Core Team:**\n"
1415
+ "• 45 dedicated young members\n"
1416
+ "• Mix of empathy, administrative skills, and innovation\n"
1417
+ "• Committed to uplifting underprivileged communities\n\n"
1418
+ "The team brings collective commitment to ensure Sahayak runs efficiently.\n\n"
1419
+ "Would you like to know more about joining the team?",
1420
+ "objectives": "**🎯 Sahayak's Objectives**\n\n"
1421
+ "Sahayak Organisation works towards the following key objectives:\n\n"
1422
+ "**1. Educational Empowerment**\n"
1423
+ "• Provide quality education to underserved students\n"
1424
+ "• Bridge the gap between potential and opportunity\n\n"
1425
+ "**2. Community Support**\n"
1426
+ "• Support senior citizens with care and companionship\n"
1427
+ "• Aid visually impaired and differently-abled children\n\n"
1428
+ "**3. Social Awareness**\n"
1429
+ "• Raise awareness about social issues affecting marginalized groups\n"
1430
+ "• Promote health, career, and technology awareness\n\n"
1431
+ "**4. Sustainable Impact**\n"
1432
+ "• Create lasting positive change in communities\n"
1433
+ "• Empower individuals and families through dedicated programs\n\n"
1434
+ "Would you like to learn about specific programs achieving these objectives?",
1435
+ "what are the objectives": "**🎯 Sahayak's Objectives**\n\n"
1436
+ "Sahayak Organisation works towards the following key objectives:\n\n"
1437
+ "**1. Educational Empowerment**\n"
1438
+ "• Provide quality education to underserved students\n"
1439
+ "• Bridge the gap between potential and opportunity\n\n"
1440
+ "**2. Community Support**\n"
1441
+ "• Support senior citizens with care and companionship\n"
1442
+ "• Aid visually impaired and differently-abled children\n\n"
1443
+ "**3. Social Awareness**\n"
1444
+ "• Raise awareness about social issues\n"
1445
+ "• Promote health, career, and technology awareness\n\n"
1446
+ "Would you like to learn about specific programs?",
1447
+ "donate": "**💝 How to Donate to Sahayak**\n\n"
1448
+ "Your donations help Sahayak continue its mission to support underprivileged communities.\n\n"
1449
+ "**How to Donate:**\n"
1450
+ "• Visit: https://www.sahayak.org\n"
1451
+ "• Contact: +91-123-456-7890\n\n"
1452
+ "**Your Donation Supports:**\n"
1453
+ "• Educational programs for children\n"
1454
+ "• Care for senior citizens\n"
1455
+ "• Support for specially-abled individuals\n\n"
1456
+ "Every contribution makes a difference!\n\n"
1457
+ "Would you like more information about our programs?",
1458
+ "help": "**ℹ️ How Can I Help You?**\n\n"
1459
+ "I can provide information about:\n\n"
1460
+ "📋 **Programs & Initiatives**\n"
1461
+ "• Educational programs, community outreach, recent events\n\n"
1462
+ "👥 **Team & Leadership**\n"
1463
+ "• Founder, president, team members\n\n"
1464
+ "📍 **Locations & Contact**\n"
1465
+ "• Branches, phone numbers, website\n\n"
1466
+ "🤝 **Getting Involved**\n"
1467
+ "• How to volunteer, donate, or join\n\n"
1468
+ "🎯 **Mission & Vision**\n"
1469
+ "• Goals, objectives, values\n\n"
1470
+ "What would you like to know about?"
1471
+ }
1472
+
1473
+ translated_query, original_lang = detect_and_translate(query)
1474
+
1475
+ moderation = moderate_query(translated_query)
1476
+ if moderation.get("status") == "blocked":
1477
+ return translate_text(moderation["message"], original_lang)
1478
+
1479
+ # =====================================================================
1480
+ # SEMANTIC INTENT UNDERSTANDING (Expert-Level)
1481
+ # Uses embeddings to understand query meaning, not just keywords
1482
+ # =====================================================================
1483
+
1484
+ normalized_query = re.sub(r'[^\w\s]', '', translated_query.lower().strip())
1485
+
1486
+ # Step 1: Semantic Intent Classification
1487
+ intent, intent_confidence = semantic_intent_classification(translated_query)
1488
+ logger.info(f"Semantic intent: {intent} (confidence: {intent_confidence:.3f})")
1489
+
1490
+ # Step 2: Query Complexity Analysis
1491
+ complexity = analyze_query_complexity(translated_query)
1492
+ logger.info(f"Query complexity: {complexity['level']} (score: {complexity['score']})")
1493
+
1494
+ # Step 3: Intent-based response routing with semantic understanding
1495
+ # Map semantic intents to curated responses
1496
+ intent_to_response = {
1497
+ "about_organization": "what is sahayak",
1498
+ "founder_leadership": "who is the founder of sahayak",
1499
+ "president_leadership": "who is the president of sahayak",
1500
+ "team_members": "who is the president of sahayak",
1501
+ "mission_vision": "mission of sahayak",
1502
+ "initiatives_programs": "initiatives",
1503
+ "recent_events": "what are the recent activities conducted by the sahayak",
1504
+ "location_branches": "how many branches sahayak has",
1505
+ "join_volunteer": "how to join sahayak",
1506
+ "donate_support": "donate",
1507
+ "contact_info": "how to join sahayak",
1508
+ "founding_date": "when was the sahayak started",
1509
+ "greeting": "hi",
1510
+ "thanks": "ok",
1511
+ "affirmative": "yes"
1512
+ }
1513
+
1514
+ # High confidence semantic match -> use curated response
1515
+ if intent_confidence > 0.65 and intent in intent_to_response:
1516
+ response_key = intent_to_response[intent]
1517
+ if response_key in meta_responses:
1518
+ logger.info(f"Semantic match: intent={intent}, confidence={intent_confidence:.3f}, response_key={response_key}")
1519
+ return translate_text(meta_responses[response_key], original_lang)
1520
+
1521
+ # Step 4: Semantic similarity search in meta_responses
1522
+ # Use embeddings to find best matching pre-defined response
1523
+ try:
1524
+ embedder = get_embedder()
1525
+ query_embedding = embedder.encode([translated_query])[0]
1526
+
1527
+ best_match_key = None
1528
+ best_match_score = 0.0
1529
+
1530
+ # Encode all meta_response keys and find best semantic match
1531
+ meta_keys = list(meta_responses.keys())
1532
+ meta_embeddings = embedder.encode(meta_keys)
1533
+
1534
+ for i, (key, key_embedding) in enumerate(zip(meta_keys, meta_embeddings)):
1535
+ # Cosine similarity
1536
+ similarity = np.dot(query_embedding, key_embedding) / (
1537
+ np.linalg.norm(query_embedding) * np.linalg.norm(key_embedding)
1538
+ )
1539
+ if similarity > best_match_score:
1540
+ best_match_score = similarity
1541
+ best_match_key = key
1542
+
1543
+ # If we have a strong semantic match, use it
1544
+ if best_match_score > 0.70 and best_match_key:
1545
+ logger.info(f"Semantic meta-match: '{best_match_key}' (score: {best_match_score:.3f})")
1546
+ return translate_text(meta_responses[best_match_key], original_lang)
1547
+
1548
+ except Exception as e:
1549
+ logger.warning(f"Semantic matching failed: {e}")
1550
+
1551
+ # Step 5: Check scope
1552
+ if not is_in_scope(translated_query):
1553
+ return scope_redirect_message(original_lang)
1554
+
1555
+ # =====================================================================
1556
+ # STEP 6: SEMANTIC PASSAGE RETRIEVAL & QA
1557
+ # Use embeddings to find most relevant passages
1558
+ # =====================================================================
1559
+
1560
+ # Determine retrieval depth based on query complexity
1561
+ top_k = 10
1562
+ if complexity["level"] == "complex":
1563
+ top_k = 15
1564
+ elif complexity["level"] == "simple":
1565
+ top_k = 8
1566
+
1567
+ relevance_passages = retrieve_relevant_passages(
1568
+ translated_query, passages, vector_index, embeddings, top_k=top_k
1569
+ )
1570
+ logger.info(f"Retrieved {len(relevance_passages)} passages for query '{translated_query}'")
1571
+
1572
+ # Re-rank passages using semantic similarity for better context
1573
+ ranked_passages = semantic_passage_ranking(translated_query, relevance_passages, top_k=5)
1574
+
1575
+ # Build context from best semantically-matched passages
1576
+ context_passages = [p for p, score in ranked_passages if score > 0.3]
1577
+ if not context_passages:
1578
+ context_passages = relevance_passages[:3]
1579
+
1580
+ context = " ".join(context_passages)[:2000] # More context for better answers
1581
+ logger.info(f"Context length for QA model: {len(context)} chars from {len(context_passages)} passages")
1582
+
1583
+ qa_model = get_qa_model()
1584
+
1585
+ result = qa_model(question=translated_query, context=context)
1586
+
1587
+ english_answer = result["answer"]
1588
+ confidence_score = result.get("score", 0.0)
1589
+ logger.info(f"QA model result: answer='{english_answer}', score={confidence_score}")
1590
+
1591
+ # Apply expert reasoning chain
1592
+ reasoning = expert_reasoning_chain(translated_query, relevance_passages, result)
1593
+ logger.info(f"Expert reasoning steps: {reasoning['reasoning_steps']}")
1594
+
1595
+ is_numeric = english_answer.strip().replace(".", "").isdigit()
1596
+ if not english_answer or (len(english_answer) < 5 and not is_numeric and confidence_score < 0.3) or confidence_score < 0.05:
1597
+ logger.warning(f"QA model failed to provide a good answer. Answer: '{english_answer}', Score: {confidence_score}")
1598
+ # Provide a more helpful fallback using expert reasoning
1599
+ fallback_response = "I couldn't find a specific answer to your question.\n\n"
1600
+ if relevance_passages:
1601
+ fallback_response += "**However, here's what I found in our documentation:**\n"
1602
+ for i, passage in enumerate(relevance_passages[:2]):
1603
+ snippet = passage[:200] + "..." if len(passage) > 200 else passage
1604
+ fallback_response += f"\n- {snippet}\n"
1605
+ fallback_response += "\nWould you like me to help you with a more specific question about Sahayak?"
1606
+ return translate_text(fallback_response, original_lang)
1607
+
1608
+ # Build expert response with evidence and reasoning
1609
+ detailed_answer = build_expert_response(english_answer, reasoning, translated_query, relevance_passages)
1610
+
1611
+ legal_footer = (
1612
+ "\n\n---\n_This response is for community awareness only and not legal, medical, or financial advice. "
1613
+ "Program details may change; please verify with official Sahayak contacts or local authorities._"
1614
+ )
1615
+ final_answer = translate_text(detailed_answer + legal_footer, original_lang)
1616
+ return final_answer
1617
+
1618
+ except Exception as e:
1619
+ logger.error(f"Error answering question: {str(e)}")
1620
+ return ("I'm sorry, I encountered an error while processing your question.\n\n"
1621
+ "However, I can still tell you that Sahayak is a non-profit organization focused on helping those in need.\n"
1622
+ "We work on various social issues, aiming to make a positive impact in the community.\n"
1623
+ "Please try again with a different query, and I'll do my best to assist you!")
static/logo.png ADDED
static/style.css ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ * {
2
+ box-sizing: border-box;
3
+ margin: 0;
4
+ padding: 0;
5
+ }
6
+
7
+ html, body {
8
+ height: 100%;
9
+ background: linear-gradient(135deg, #23272f 0%, #343541 100%);
10
+ color: #fff;
11
+ font-family: 'Inter', 'Poppins', sans-serif;
12
+ overflow-x: hidden;
13
+ overflow-y: auto;
14
+ }
15
+
16
+ body {
17
+ min-height: 100vh;
18
+ display: flex;
19
+ flex-direction: column;
20
+ }
21
+
22
+ /* Branding row at the top, scrolls with page, NO navbar effect */
23
+ .branding-row {
24
+ display: flex;
25
+ align-items: center;
26
+ gap: 1rem;
27
+ padding: 1.6rem 0 0.6rem 1.6rem;
28
+ width: auto;
29
+ background: transparent;
30
+ box-shadow: none;
31
+ border-bottom: none;
32
+ }
33
+
34
+ .logo {
35
+ width: 72px;
36
+ height: 72px;
37
+ border-radius: 16px;
38
+ object-fit: cover;
39
+ box-shadow: 0 4px 24px rgba(76, 110, 245, 0.12);
40
+ }
41
+
42
+ .brand-name {
43
+ font-family: 'Poppins', sans-serif;
44
+ font-weight: 700;
45
+ font-size: 2.7rem;
46
+ background: linear-gradient(135deg, #4f9fff, #9c7aff);
47
+ -webkit-background-clip: text;
48
+ background-clip: text;
49
+ -webkit-text-fill-color: transparent;
50
+ letter-spacing: 1.5px;
51
+ user-select: none;
52
+ line-height: 1;
53
+ }
54
+
55
+ /* Chat container and rest of your styles remain unchanged */
56
+ .chat-outer {
57
+ display: flex;
58
+ flex-direction: column;
59
+ justify-content: flex-end;
60
+ align-items: center;
61
+ height: 100vh;
62
+ width: 100vw;
63
+ padding-top: 0;
64
+ padding-bottom: 0;
65
+ position: relative;
66
+ }
67
+
68
+ /* Ensure chat-container fills available space above input */
69
+ .chat-container {
70
+ width: 100%;
71
+ max-width: 720px;
72
+ flex: 1 1 auto;
73
+ background: rgba(44, 46, 56, 0.75);
74
+ border-radius: 1.5rem;
75
+ margin: 0 auto;
76
+ padding: 2rem 1.2rem 1rem 1.2rem;
77
+ box-shadow: 0 8px 32px rgba(20, 20, 40, 0.14);
78
+ overflow-y: auto;
79
+ display: flex;
80
+ flex-direction: column;
81
+ gap: 1.2rem;
82
+ scroll-behavior: smooth;
83
+ min-height: 60vh;
84
+ max-height: none;
85
+ backdrop-filter: blur(8px);
86
+ }
87
+
88
+ /* Spacer to ensure input bar never overlaps chat on any device */
89
+ .input-area-spacer {
90
+ height: 100px;
91
+ flex-shrink: 0;
92
+ }
93
+
94
+ @media (max-width: 600px) {
95
+ .branding-row {
96
+ gap: 0.7rem;
97
+ padding: 1.2rem 0 0.5rem 1.1rem;
98
+ }
99
+ .logo {
100
+ width: 56px;
101
+ height: 56px;
102
+ border-radius: 12px;
103
+ }
104
+ .brand-name {
105
+ font-size: 2.1rem;
106
+ letter-spacing: 1px;
107
+ }
108
+ .chat-container, .input-area {
109
+ max-width: 100vw;
110
+ border-radius: 0.5rem;
111
+ padding-left: 0.3rem;
112
+ padding-right: 0.3rem;
113
+ }
114
+ .input-area-spacer {
115
+ height: 120px;
116
+ }
117
+ }
118
+
119
+ /* Input bar at the bottom, always above chat */
120
+ .input-area {
121
+ width: 100%;
122
+ max-width: 720px;
123
+ background: none;
124
+ position: fixed;
125
+ left: 50%;
126
+ bottom: 0;
127
+ transform: translateX(-50%);
128
+ z-index: 20;
129
+ padding: 0 1rem 1.2rem 1rem;
130
+ }
131
+
132
+ .input-container {
133
+ display: flex;
134
+ align-items: center;
135
+ background: rgba(44, 46, 56, 0.90);
136
+ border-radius: 1.2rem;
137
+ box-shadow: 0 2px 12px rgba(0,0,0,0.12);
138
+ padding: 0.25rem 0.5rem;
139
+ }
140
+
141
+ input[type="text"] {
142
+ flex: 1;
143
+ padding: 1rem 1.2rem;
144
+ font-size: 1.1rem;
145
+ background: transparent;
146
+ border: none;
147
+ color: #fff;
148
+ outline: none;
149
+ border-radius: 1.2rem;
150
+ }
151
+
152
+ input[type="text"]:focus {
153
+ background: rgba(44, 46, 56, 0.98);
154
+ }
155
+
156
+ button {
157
+ background: linear-gradient(135deg, #4f9fff 60%, #9c7aff 100%);
158
+ color: white;
159
+ border: none;
160
+ padding: 0.7rem 0.9rem;
161
+ border-radius: 0.9rem;
162
+ cursor: pointer;
163
+ margin-left: 0.5rem;
164
+ transition: background 0.2s, transform 0.2s;
165
+ display: flex;
166
+ align-items: center;
167
+ justify-content: center;
168
+ }
169
+
170
+ button:hover {
171
+ background: linear-gradient(135deg, #2563eb 70%, #7a5cff 100%);
172
+ transform: scale(1.07);
173
+ }
174
+
175
+ .welcome-message {
176
+ text-align: center;
177
+ margin-bottom: 1.5rem;
178
+ opacity: 0.92;
179
+ transition: opacity 0.3s, max-height 0.3s;
180
+ }
181
+
182
+ .welcome-message[style*="display: none"] {
183
+ opacity: 0;
184
+ max-height: 0;
185
+ margin: 0;
186
+ padding: 0;
187
+ }
188
+
189
+ .welcome-message h2 {
190
+ font-size: 1.6rem;
191
+ font-weight: 700;
192
+ margin-bottom: 0.4rem;
193
+ background: linear-gradient(135deg, #4f9fff, #9c7aff);
194
+ -webkit-background-clip: text;
195
+ background-clip: text;
196
+ -webkit-text-fill-color: transparent;
197
+ }
198
+
199
+ .welcome-message p {
200
+ font-size: 1.05rem;
201
+ opacity: 0.8;
202
+ }
203
+
204
+ .disclaimer {
205
+ font-size: 0.9rem;
206
+ opacity: 0.65;
207
+ margin-top: 0.3rem;
208
+ line-height: 1.4;
209
+ }
210
+
211
+ .message-row {
212
+ display: flex;
213
+ width: 100%;
214
+ }
215
+
216
+ .message-row.user {
217
+ justify-content: flex-end;
218
+ }
219
+
220
+ .message-row.bot {
221
+ justify-content: flex-start;
222
+ }
223
+
224
+ .message {
225
+ max-width: 85%;
226
+ padding: 1rem 1.3rem;
227
+ border-radius: 1.2rem;
228
+ font-size: 1.05rem;
229
+ line-height: 1.6;
230
+ animation: fadeIn 0.3s ease-in;
231
+ white-space: pre-wrap;
232
+ word-wrap: break-word;
233
+ background: rgba(60, 64, 90, 0.85);
234
+ box-shadow: 0 2px 12px rgba(0,0,0,0.08);
235
+ transition: background 0.2s;
236
+ }
237
+
238
+ .message.user {
239
+ background: linear-gradient(135deg, #4f9fff 60%, #9c7aff 100%);
240
+ color: #fff;
241
+ border-bottom-right-radius: 0.3rem;
242
+ }
243
+
244
+ .message.bot {
245
+ background: rgba(44, 46, 56, 0.94);
246
+ color: #fff;
247
+ border-bottom-left-radius: 0.3rem;
248
+ }
249
+
250
+ .time-display {
251
+ font-style: italic;
252
+ color: #a0aec0;
253
+ font-size: 0.8rem;
254
+ margin-top: 0.3rem;
255
+ margin-left: 0.7rem;
256
+ }
257
+
258
+ .typing-indicator {
259
+ display: flex;
260
+ align-items: center;
261
+ background: rgba(44, 46, 56, 0.94);
262
+ padding: 1rem 1.3rem;
263
+ border-radius: 1.2rem;
264
+ width: fit-content;
265
+ gap: 3px;
266
+ }
267
+
268
+ .typing-indicator span {
269
+ height: 8px;
270
+ width: 8px;
271
+ margin: 0 2px;
272
+ background-color: #b5b7c2;
273
+ border-radius: 50%;
274
+ display: inline-block;
275
+ animation: bounce 1.5s infinite ease-in-out;
276
+ }
277
+
278
+ .typing-indicator span:nth-child(1) { animation-delay: 0s; }
279
+ .typing-indicator span:nth-child(2) { animation-delay: 0.2s; }
280
+ .typing-indicator span:nth-child(3) { animation-delay: 0.4s; }
281
+
282
+ @keyframes bounce {
283
+ 0%, 80%, 100% { transform: translateY(0); }
284
+ 40% { transform: translateY(-8px); }
285
+ }
286
+
287
+ @keyframes fadeIn {
288
+ from { opacity: 0; transform: translateY(10px);}
289
+ to { opacity: 1; transform: translateY(0);}
290
+ }
291
+
292
+ /* Scrollbar Styling */
293
+ .chat-container::-webkit-scrollbar {
294
+ width: 7px;
295
+ }
296
+ .chat-container::-webkit-scrollbar-thumb {
297
+ background: #555a6a;
298
+ border-radius: 4px;
299
+ }
templates/index.html ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Vexa Chat</title>
  <!-- App stylesheet and logo are served by Flask's static folder. -->
  <link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}">
  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&display=swap" rel="stylesheet">
  <link href="https://fonts.googleapis.com/css2?family=Poppins:wght@500;700&display=swap" rel="stylesheet">
</head>
<body>
  <!-- Branding Row (Logo + VEXA) -->
  <div class="branding-row">
    <img class="logo" src="{{ url_for('static', filename='logo.png') }}" alt="Vexa Logo">
    <span class="brand-name">VEXA</span>
  </div>
  <main class="chat-outer">
    <div class="chat-container" id="chat">
      <div class="welcome-message" id="welcome-message">
        <h2>Welcome to Vexa AI</h2>
        <p>Your digital support assistant</p>
        <p class="disclaimer">Answers focus on community benefit and Sahayak services.</p>
      </div>
      <div class="message-row bot">
        <div class="message bot">Hello! I am Vexa. Ask me anything about Sahayak Organization.</div>
      </div>
    </div>
    <div class="input-area-spacer"></div> <!-- Spacer to push chat above input on all screens -->
    <form class="input-area" id="input-form" autocomplete="off">
      <div class="input-container">
        <!-- name="user_input" matches the field the /get endpoint reads;
             aria-label gives the unlabeled field an accessible name. -->
        <input type="text" id="msg" name="user_input" placeholder="Type your message..." autocomplete="off" aria-label="Type your message">
        <!-- Icon-only button: aria-label supplies its accessible name,
             and the decorative SVG is hidden from assistive tech. -->
        <button type="submit" aria-label="Send message">
          <svg xmlns="http://www.w3.org/2000/svg" width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true" focusable="false">
            <line x1="22" y1="2" x2="11" y2="13"></line>
            <polygon points="22 2 15 22 11 13 2 9 22 2"></polygon>
          </svg>
        </button>
      </div>
    </form>
  </main>
+ <script>
42
+ let isProcessing = false;
43
+ let hasSentMessage = false;
44
+
45
+ function hideWelcome() {
46
+ if (!hasSentMessage) {
47
+ const welcome = document.getElementById("welcome-message");
48
+ if (welcome) welcome.style.display = "none";
49
+ hasSentMessage = true;
50
+ }
51
+ }
52
+
53
+ function send() {
54
+ if (isProcessing) return;
55
+ const userInput = document.getElementById("msg").value;
56
+ if (!userInput.trim()) return;
57
+ hideWelcome();
58
+ isProcessing = true;
59
+ const chat = document.getElementById("chat");
60
+ const userRow = document.createElement("div");
61
+ userRow.className = "message-row user";
62
+ const userDiv = document.createElement("div");
63
+ userDiv.className = "message user";
64
+ userDiv.textContent = userInput;
65
+ userRow.appendChild(userDiv);
66
+ chat.appendChild(userRow);
67
+
68
+ // Typing indicator
69
+ const botTypingRow = document.createElement("div");
70
+ botTypingRow.className = "message-row bot typing-row";
71
+ botTypingRow.innerHTML = '<div class="typing-indicator"><span></span><span></span><span></span></div>';
72
+ chat.appendChild(botTypingRow);
73
+ chat.scrollTop = chat.scrollHeight;
74
+
75
+ fetch("/get", {
76
+ method: "POST",
77
+ headers: { "Content-Type": "application/x-www-form-urlencoded" },
78
+ body: "user_input=" + encodeURIComponent(userInput)
79
+ })
80
+ .then(res => res.json())
81
+ .then(data => {
82
+ chat.removeChild(botTypingRow);
83
+ const botRow = document.createElement("div");
84
+ botRow.className = "message-row bot";
85
+ const botDiv = document.createElement("div");
86
+ botDiv.className = "message bot";
87
+ botDiv.textContent = data.response;
88
+ botRow.appendChild(botDiv);
89
+ if (data.process_time !== undefined) {
90
+ const timeDiv = document.createElement("div");
91
+ timeDiv.className = "time-display";
92
+ timeDiv.textContent = `Processing time: ${data.process_time} seconds`;
93
+ botRow.appendChild(timeDiv);
94
+ }
95
+ chat.appendChild(botRow);
96
+ chat.scrollTop = chat.scrollHeight;
97
+ })
98
+ .catch(error => {
99
+ chat.removeChild(botTypingRow);
100
+ const botRow = document.createElement("div");
101
+ botRow.className = "message-row bot";
102
+ const botDiv = document.createElement("div");
103
+ botDiv.className = "message bot";
104
+ botDiv.textContent = "An error occurred. Please try again.";
105
+ botRow.appendChild(botDiv);
106
+ chat.appendChild(botRow);
107
+ chat.scrollTop = chat.scrollHeight;
108
+ })
109
+ .finally(() => {
110
+ isProcessing = false;
111
+ });
112
+ document.getElementById("msg").value = "";
113
+ chat.scrollTop = chat.scrollHeight;
114
+ }
115
+
116
+ document.getElementById("input-form").addEventListener("submit", function(e) {
117
+ e.preventDefault();
118
+ send();
119
+ });
120
+
121
+ document.getElementById("msg").addEventListener("keypress", function(e) {
122
+ if (e.key === "Enter") {
123
+ e.preventDefault();
124
+ send();
125
+ }
126
+ });
127
+ </script>
128
+ </body>
129
+ </html>