SakibAhmed commited on
Commit
cb72fba
·
verified ·
1 Parent(s): 68d3d8e

Upload 15 files

Browse files
Files changed (15) hide show
  1. .env +52 -0
  2. Dockerfile +51 -0
  3. app.py +246 -0
  4. assets/users.csv +12 -0
  5. chunker.py +114 -0
  6. config.py +71 -0
  7. note.txt +1 -0
  8. postman.json +153 -0
  9. rag_components.py +338 -0
  10. rag_system.py +70 -0
  11. requirements original.txt +35 -0
  12. requirements.txt +15 -0
  13. sources/context.txt +0 -0
  14. templates/chat-bot.html +108 -0
  15. utils.py +124 -0
.env ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# --- API Credentials (FOR N8N AUTH) ---
# Used for the /webhook/search endpoint (Basic Auth).
# SECURITY: this file contains real-looking credentials and is committed to the
# repository — keep .env out of version control and override these values in deployment.
3
+ API_USERNAME=admin
4
+ API_PASSWORD=12345
5
+
6
+ # --- Admin Dashboard Credentials (Fallback) ---
7
+ # Used if users.csv fails or is missing
8
+ FLASK_ADMIN_USERNAME=admin
9
+ FLASK_ADMIN_PASSWORD=1234
10
+
11
+ # --- RAG Settings ---
12
+ # Directory to store the vector index (automatically created)
13
+ RAG_STORAGE_DIR=faiss_storage
14
+ # Directory where your context.txt lives
15
+ SOURCES_DIR=sources
16
+
17
+ # --- RAG System Configuration ---
18
+
19
+ RAG_EMBEDDING_MODEL="BAAI/bge-large-en-v1.5" #BAAI/bge-small-en
20
+ RAG_EMBEDDING_GPU="false"
21
+ RAG_LOAD_INDEX="true"
22
+
23
+ # Step 1: Fetch this many documents from the vector database (FAISS).
24
+ RAG_INITIAL_FETCH_K=10
25
+
26
+ # Step 2: After reranking the initial docs, keep this many final documents for the LLM context.
27
+ RAG_RERANKER_K=5
28
+
29
+ RAG_MAX_FILES_FOR_INCREMENTAL=50
30
+
31
+ # Text chunking settings
32
+ RAG_CHUNK_SIZE=2000
33
+ RAG_CHUNK_OVERLAP=150
34
+
35
+ # --- Reranker & Retrieval Pipeline Settings ---
36
+ RAG_RERANKER_ENABLED=False
37
+ RAG_RERANKER_MODEL="jinaai/jina-reranker-v1-turbo-en"
38
+ # RAG_RERANKER_MODEL=jinaai/jina-reranker-v2-base-multilingual
39
+
40
+ # --- Google Drive Settings (Disabled) ---
41
+ GDRIVE_SOURCES_ENABLED=false
42
+ GDRIVE_FOLDER_URL="1xkBOzr8eN-lXRYNA62jbl3UHdmtZ4TJA"
43
+ GDRIVE_INDEX_ENABLED=false
44
+ GDRIVE_INDEX_URL="1wUsdasdsa7f8qENTR-lFmsZV"
45
+
46
+ GDRIVE_USERS_CSV_ENABLED=true
47
+ GDRIVE_USERS_CSV_URL="1yadsaHX2yy9MttYrLSE20"
48
+
49
+ # --- System ---
50
+ RAG_DETAILED_LOGGING=True
51
+ # PORT=5000
52
+ # FLASK_DEBUG=True
Dockerfile ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use Python 3.11 to support Pandas 3.x and newer libraries
2
+ FROM python:3.11-slim
3
+
4
+ # Set the working directory in the container
5
+ WORKDIR /app
6
+
7
+ # Install system dependencies
8
+ # libgl1 and libglib2.0-0 are often needed for CV/PDF libraries
9
+ RUN apt-get update && apt-get install -y --no-install-recommends \
10
+ libgl1 \
11
+ libglib2.0-0 \
12
+ build-essential \
13
+ && rm -rf /var/lib/apt/lists/*
14
+
15
+ # Copy the requirements file
16
+ COPY requirements.txt requirements.txt
17
+
18
+ # Install Python packages with timeout increase
19
+ RUN pip install --no-cache-dir --timeout=1000 -r requirements.txt
20
+
21
+ # Copy application code
22
+ COPY . /app
23
+
24
+ # Create a non-root user (Security Best Practice)
25
+ RUN useradd -m -u 1000 user
26
+
27
+ # Change ownership of the app directory and the temp directories
28
+ RUN chown -R user:user /app
29
+
30
+ # Create temp directories for HuggingFace/Torch cache and set permissions
31
+ RUN mkdir -p /tmp/transformers_cache /tmp/hf_home /tmp/torch_home && \
32
+ chown -R user:user /tmp/transformers_cache /tmp/hf_home /tmp/torch_home
33
+
34
+ # Switch to the non-root user
35
+ USER user
36
+
37
+ # Expose the port (Standard for HF Spaces)
38
+ EXPOSE 7860
39
+
40
+ # Set environment variables
41
+ ENV FLASK_HOST=0.0.0.0
42
+ ENV FLASK_PORT=7860
43
+ ENV FLASK_DEBUG=False
44
+
45
+ # CRITICAL: Set HF-specific env vars to writable directories
46
+ ENV TRANSFORMERS_CACHE=/tmp/transformers_cache
47
+ ENV HF_HOME=/tmp/hf_home
48
+ ENV TORCH_HOME=/tmp/torch_home
49
+
50
+ # Command to run the app
51
+ CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from flask import Flask, request, jsonify, Response, render_template
from flask_cors import CORS
import os
import logging
import functools
import pandas as pd
from dotenv import load_dotenv

# Load environment variables BEFORE importing config, which reads os.getenv at import time
load_dotenv()

# Custom Imports
from rag_system import initialize_and_get_rag_system
from config import (
    API_USERNAME, API_PASSWORD, RAG_SOURCES_DIR,
    GDRIVE_INDEX_ENABLED, GDRIVE_INDEX_ID_OR_URL,
    GDRIVE_USERS_CSV_ENABLED, GDRIVE_USERS_CSV_ID_OR_URL,
    ADMIN_USERNAME, ADMIN_PASSWORD,
    RAG_RERANKER_K
)
from utils import download_and_unzip_gdrive_file, download_gdrive_file

# Logging Setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Flask Init
app = Flask(__name__, static_folder='static', template_folder='templates')
CORS(app)  # allow cross-origin requests (browser UI / N8N host)

# Global State
rag_system = None  # set by run_startup_tasks(); remains None if init fails
user_df = None     # pandas DataFrame loaded from assets/users.csv, or None
_APP_BASE_DIR = os.path.dirname(os.path.abspath(__file__))
35
+
36
# --- Helper: Load Users ---
def load_users_from_csv():
    """Populate the global ``user_df`` from assets/users.csv.

    Sets ``user_df`` to None when the file is missing or unreadable, so
    callers can use a simple ``is None`` check for availability.
    """
    global user_df
    assets_dir = os.path.join(_APP_BASE_DIR, 'assets')
    os.makedirs(assets_dir, exist_ok=True)
    csv_path = os.path.join(assets_dir, 'users.csv')

    try:
        if not os.path.exists(csv_path):
            logger.warning("users.csv not found in assets folder.")
            user_df = None
            return
        user_df = pd.read_csv(csv_path)
        # Normalize email so lookups can be case/whitespace-insensitive
        if 'email' in user_df.columns:
            user_df['email'] = user_df['email'].str.lower().str.strip()
        logger.info(f"Loaded {len(user_df)} users from CSV.")
    except Exception as e:
        logger.error(f"Failed to load users.csv: {e}")
        user_df = None
56
+
57
# --- Helper: Auth Decorators ---
def require_api_auth(f):
    """Protects the N8N Webhook endpoint with HTTP Basic Auth.

    Credentials are checked against API_USERNAME / API_PASSWORD from .env.
    Responds 401 with a WWW-Authenticate challenge on failure.
    """
    @functools.wraps(f)
    def decorated(*args, **kwargs):
        # FIX: use a constant-time comparison for credentials to resist
        # timing attacks; plain != leaks match length/position timing.
        from hmac import compare_digest
        auth = request.authorization
        # auth.password can be None for non-Basic schemes; treat as failure
        # (compare_digest would raise TypeError on None).
        if (not auth or not auth.username or not auth.password
                or not compare_digest(auth.username, API_USERNAME)
                or not compare_digest(auth.password, API_PASSWORD)):
            return Response('Unauthorized', 401, {'WWW-Authenticate': 'Basic realm="API Login Required"'})
        return f(*args, **kwargs)
    return decorated
68
+
69
def require_admin_auth(f):
    """Protects Admin Rebuild/Update endpoints with HTTP Basic Auth.

    Check order:
      1. users.csv record matched by normalized email (role must be 'admin').
      2. .env fallback credentials (FLASK_ADMIN_USERNAME / FLASK_ADMIN_PASSWORD).

    FIX: the .env fallback was previously an ``elif`` on ``user_df is not None``,
    so once users.csv loaded successfully the fallback admin account could never
    authenticate (the shipped postman.json uses exactly those fallback
    credentials). The fallback now also applies when the CSV lookup fails.
    """
    @functools.wraps(f)
    def decorated(*args, **kwargs):
        auth = request.authorization
        if not auth:
            return Response('Admin auth failed.', 401, {'WWW-Authenticate': 'Basic realm="Admin Login Required"'})

        # 1. Check against loaded CSV
        if user_df is not None and auth.username:
            user_email = auth.username.lower().strip()
            user_record = user_df[user_df['email'] == user_email]
            if not user_record.empty:
                user_data = user_record.iloc[0]
                # Compare password as string — CSV parsing may yield numerics
                if str(user_data['password']) == auth.password and user_data['role'] == 'admin':
                    return f(*args, **kwargs)

        # 2. Fallback to .env Admin Credentials (now reachable even when the
        # CSV is loaded but did not authenticate the caller)
        if auth.username == ADMIN_USERNAME and auth.password == ADMIN_PASSWORD:
            return f(*args, **kwargs)

        return Response('Admin auth failed.', 401, {'WWW-Authenticate': 'Basic realm="Admin Login Required"'})
    return decorated
93
+
94
# --- Startup Logic (Fixed: No Decorator) ---
def run_startup_tasks():
    """Initializes RAG system and loads data. Called explicitly.

    Invoked once at import time (see the app_context block below) because
    Flask 3.x removed the before_first_request hook.
    """
    global rag_system
    logger.info("--- Executing Startup Tasks ---")

    # 1. Download Users CSV if configured
    if GDRIVE_USERS_CSV_ENABLED and GDRIVE_USERS_CSV_ID_OR_URL:
        target = os.path.join(_APP_BASE_DIR, 'assets', 'users.csv')
        download_gdrive_file(GDRIVE_USERS_CSV_ID_OR_URL, target)

    # 2. Load User Data (sets global user_df; tolerant of missing file)
    load_users_from_csv()

    # 3. Download FAISS Index if configured
    # NOTE(review): extracts into the *current working directory*, not
    # _APP_BASE_DIR — assumes the app is launched from its own folder. Confirm.
    if GDRIVE_INDEX_ENABLED and GDRIVE_INDEX_ID_OR_URL:
        download_and_unzip_gdrive_file(GDRIVE_INDEX_ID_OR_URL, os.getcwd())

    # 4. Initialize RAG (may load or build the index; see rag_system module)
    rag_system = initialize_and_get_rag_system()
    logger.info("--- Startup Tasks Complete ---")
115
+
116
# Execute startup tasks immediately when this module is loaded.
# This ensures it runs before the first request in Flask 3.x (which removed
# the before_first_request decorator). Runs under gunicorn import as well.
with app.app_context():
    run_startup_tasks()
120
+
121
# ===========================
# API ROUTES
# ===========================

# --- 1. N8N Webhook (The Core Function) ---
@app.route('/webhook/search', methods=['POST'])
@require_api_auth
def search_knowledgebase_api():
    """
    Main entry point for N8N.
    Expected JSON: { "query": "...", "use_reranker": bool, "final_k": int }
    Returns: { "results": [...], "count": int, "status": "success" }
    """
    if not rag_system:
        # Startup failed or still in progress
        return jsonify({"error": "RAG not initialized. Check server logs."}), 503

    # FIX: request.json raises 415 UnsupportedMediaType in Flask >= 2.1 when
    # the Content-Type is not application/json; get_json(silent=True)
    # degrades to None instead, preserving the old "or {}" behavior.
    data = request.get_json(silent=True) or {}
    query = data.get('query')
    if not query:
        return jsonify({"error": "Query field is required"}), 400

    # Configs (with defaults) — RAG_RERANKER_K from config, not hardcoded.
    # FIX: validate final_k up front instead of failing deep inside search.
    try:
        top_k = int(data.get('final_k', RAG_RERANKER_K))
    except (TypeError, ValueError):
        return jsonify({"error": "final_k must be an integer"}), 400
    use_reranker = data.get('use_reranker', True)

    # Dynamic Reranker Toggling: attach/detach the reranker on the shared retriever
    if rag_system.retriever:
        if not use_reranker:
            rag_system.retriever.reranker = None
        elif use_reranker and rag_system.reranker:
            rag_system.retriever.reranker = rag_system.reranker

    try:
        results = rag_system.search_knowledge_base(query, top_k=top_k)
        return jsonify({
            "results": results,
            "count": len(results),
            "status": "success"
        })
    except Exception as e:
        # FIX: log the full traceback (logger.error dropped it)
        logger.exception(f"Search API Error: {e}")
        return jsonify({"error": str(e)}), 500
164
+
165
# --- 2. User Login (RESTORED) ---
@app.route('/user-login', methods=['POST'])
def user_login():
    """
    Standard user login endpoint.
    Checks credentials against users.csv.
    Returns the matching user row (minus password) on success, 401 otherwise.
    """
    if user_df is None:
        return jsonify({"error": "User database not available."}), 503

    # FIX: request.json is None for empty bodies (and raises 415 on Flask >=
    # 2.1 for non-JSON Content-Type), which crashed data.get(); use
    # get_json(silent=True) and fall back to {}.
    data = request.get_json(silent=True) or {}
    email = (data.get('email') or '').lower().strip()
    password = data.get('password')

    if not email or not password:
        return jsonify({"error": "Email and password required"}), 400

    user_record = user_df[user_df['email'] == email]
    if not user_record.empty:
        u_data = user_record.iloc[0]
        if str(u_data['password']) == str(password):
            # Return user info (excluding password).
            # FIX: a pandas row's to_dict() yields numpy scalars (e.g. int64
            # from the 'sl' column) which jsonify cannot serialize — convert
            # them to native Python via .item().
            resp = {
                k: (v.item() if hasattr(v, 'item') else v)
                for k, v in u_data.to_dict().items()
                if k != 'password'
            }
            return jsonify(resp), 200

    return jsonify({"error": "Invalid credentials"}), 401
193
+
194
# --- 3. UI Route ---
@app.route('/')
def index_route():
    """Serve the HTML dashboard (templates/chat-bot.html)."""
    return render_template('chat-bot.html')
199
+
200
# --- 4. Admin Auth Check ---
@app.route('/admin/login', methods=['POST'])
@require_admin_auth
def admin_login():
    """Verifies Admin Credentials via Basic Auth Header.

    The decorator performs the actual check; reaching this body means the
    caller authenticated successfully.
    """
    return jsonify({"status": "success", "message": "Authenticated"}), 200
206
+
207
# --- 5. Admin RAG Controls ---
@app.route('/admin/update_faiss_index', methods=['POST'])
@require_admin_auth
def update_faiss_index():
    """Incrementally add new source files to the existing FAISS index.

    Optional JSON body: { "max_new_files": int } to cap the batch size
    (None lets rag_system apply its own default).
    """
    if not rag_system:
        return jsonify({"error": "RAG system not initialized"}), 503

    # FIX: tolerate missing/non-JSON bodies (request.json raises 415 on
    # Flask >= 2.1 for a non-JSON Content-Type)
    data = request.get_json(silent=True) or {}
    max_files = data.get('max_new_files')

    try:
        result = rag_system.update_index_with_new_files(RAG_SOURCES_DIR, max_files)
        return jsonify(result), 200
    except Exception as e:
        # FIX: record the traceback — failures were previously invisible in logs
        logger.exception(f"Incremental index update failed: {e}")
        return jsonify({"error": str(e)}), 500
222
+
223
@app.route('/admin/rebuild_index', methods=['POST'])
@require_admin_auth
def rebuild_index():
    """Delete and rebuild the FAISS index from the sources directory."""
    global rag_system
    try:
        # Force rebuild calls the initialization logic with force_rebuild=True
        rag_system = initialize_and_get_rag_system(force_rebuild=True)
        return jsonify({"status": "Index rebuilt successfully"}), 200
    except Exception as e:
        # FIX: record the traceback — rebuild failures were previously only
        # visible as the bare message returned to the client
        logger.exception(f"Index rebuild failed: {e}")
        return jsonify({"error": str(e)}), 500
233
+
234
# --- 6. Status Check ---
@app.route('/status', methods=['GET'])
def status_route():
    """Unauthenticated liveness probe reporting startup state."""
    payload = {
        "status": "online",
        "rag_initialized": rag_system is not None,
        "users_loaded": user_df is not None,
    }
    return jsonify(payload)
242
+
243
if __name__ == '__main__':
    # FIX: honor the FLASK_HOST / FLASK_PORT / FLASK_DEBUG variables the
    # Dockerfile sets — previously only PORT was read, so ENV FLASK_PORT=7860
    # was silently ignored. Defaults keep the old behavior:
    # port 7860 (Hugging Face Spaces), host 0.0.0.0, debug off.
    port = int(os.environ.get("FLASK_PORT", os.environ.get("PORT", 7860)))
    host = os.environ.get("FLASK_HOST", "0.0.0.0")
    debug = os.environ.get("FLASK_DEBUG", "False").lower() == "true"
    app.run(host=host, port=port, debug=debug)
assets/users.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ sl,name,email,password,role
2
+ 1,Sifat Hossain Fahim,fahim@ge-bd.com,WorldTour1234!,admin
3
+ 2,Sakib Ahmed,sakib.ahmed@ge-bd.com,12345678!,admin
4
+ 3,Rezwanul Islam,rezwanul@ge-bd.com,marstour1234!,admin
5
+ 4,Sarwar Jahan,sarwar.piel@ge-bd.com,password123,user
6
+ 5,Rezaul Kabir,rezaul.kabir@ge-bd.com,securepass,user
7
+ 6,Test,test@test.com,12345678!,user
8
+ 7,Sadiquzzaman,sadiquzzaman@ge-bd.com,wqeqw1234,user
9
+ 8,Sadman,sadman@ge-bd.com,1234fvb,user
10
+ 9,Pavel,pavel@ge-bd.com,12314rdf,user
11
+ 10,Sajib,sajib.hossain@ge-bd.com,1234rge,user
12
+ 11,Abdur Rahim,arahim@ge-bd.com,23ree4rt,user
chunker.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import logging
import json
import argparse
from typing import List, Dict, Optional

# --- UPDATED IMPORT ---
# LangChain moved the splitters into the standalone langchain_text_splitters package
from langchain_text_splitters import RecursiveCharacterTextSplitter
# ----------------------

from utils import extract_text_from_file, FAISS_RAG_SUPPORTED_EXTENSIONS

# --- Logging Setup ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
22
+
23
def process_sources_and_create_chunks(
    sources_dir: str,
    output_file: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 150,
    text_output_dir: Optional[str] = None
) -> None:
    """Extract text from every supported file in *sources_dir*, split it into
    chunks, and write all chunks to *output_file* as one JSON list.

    Args:
        sources_dir: Folder containing the source documents.
        output_file: Path of the JSON file to write the chunk list to.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters of overlap between consecutive chunks.
        text_output_dir: If given, also save each file's raw extracted text
            there as "<original filename>.txt".

    Raises:
        FileNotFoundError: If *sources_dir* does not exist.
    """
    if not os.path.isdir(sources_dir):
        logger.error(f"Source directory not found: '{sources_dir}'")
        raise FileNotFoundError(f"Source directory not found: '{sources_dir}'")

    logger.info(f"Starting chunking process. Sources: '{sources_dir}', Output: '{output_file}'")

    if text_output_dir:
        os.makedirs(text_output_dir, exist_ok=True)
        logger.info(f"Will save raw extracted text to: '{text_output_dir}'")

    all_chunks_for_json: List[Dict] = []
    processed_files_count = 0

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    for filename in os.listdir(sources_dir):
        file_path = os.path.join(sources_dir, filename)
        if not os.path.isfile(file_path):
            continue

        file_ext = filename.split('.')[-1].lower()
        if file_ext not in FAISS_RAG_SUPPORTED_EXTENSIONS:
            # FIX: messages contained a literal "(unknown)" placeholder
            # instead of interpolating the filename
            logger.debug(f"Skipping unsupported file: {filename}")
            continue

        logger.info(f"Processing source file: {filename}")
        # FAISS_RAG_SUPPORTED_EXTENSIONS maps extension -> extractor callable
        text_content = FAISS_RAG_SUPPORTED_EXTENSIONS[file_ext](file_path)

        if text_content:
            if text_output_dir:
                try:
                    # FIX: was the literal f"(unknown).txt" — every file
                    # overwrote the same output file
                    text_output_path = os.path.join(text_output_dir, f"{filename}.txt")
                    with open(text_output_path, 'w', encoding='utf-8') as f_text:
                        f_text.write(text_content)
                except Exception as e_text_save:
                    logger.error(f"Could not save extracted text for '{filename}': {e_text_save}")

            chunks = text_splitter.split_text(text_content)
            for i, chunk_text in enumerate(chunks):
                chunk_data = {
                    "page_content": chunk_text,
                    "metadata": {
                        "source_document_name": filename,
                        "chunk_index": i,
                        # FIX: interpolate the filename (was "(unknown), Chunk N")
                        "full_location": f"{filename}, Chunk {i+1}"
                    }
                }
                all_chunks_for_json.append(chunk_data)

            processed_files_count += 1

    if not all_chunks_for_json:
        logger.warning(f"No processable documents found in '{sources_dir}'.")

    # FIX: os.makedirs('') raises when output_file has no directory component;
    # only create the directory when there is one
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_chunks_for_json, f, indent=2)

    logger.info(f"Chunking complete. Processed {processed_files_count} files. Total chunks: {len(all_chunks_for_json)}")
91
+
92
def main():
    """CLI entry point: parse arguments and run the chunking pipeline."""
    parser = argparse.ArgumentParser(
        description="Chunk source documents into a JSON file for RAG indexing."
    )
    parser.add_argument('--sources-dir', type=str, required=True)
    parser.add_argument('--output-file', type=str, required=True)
    parser.add_argument('--text-output-dir', type=str, default=None)
    parser.add_argument('--chunk-size', type=int, default=1000)
    parser.add_argument('--chunk-overlap', type=int, default=150)
    args = parser.parse_args()

    try:
        process_sources_and_create_chunks(
            sources_dir=args.sources_dir,
            output_file=args.output_file,
            chunk_size=args.chunk_size,
            chunk_overlap=args.chunk_overlap,
            text_output_dir=args.text_output_dir
        )
    except Exception as e:
        logger.critical(f"Chunking failed: {e}", exc_info=True)
        # FIX: exit() is the interactive helper injected by the site module;
        # scripts should raise SystemExit (equivalent to sys.exit)
        raise SystemExit(1)

if __name__ == "__main__":
    main()
config.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import logging

# --- Logging Setup ---
# Configure the root logger only if this module's logger has no handlers yet,
# so importing config does not clobber a host application's logging setup.
logger = logging.getLogger(__name__)
if not logger.handlers:
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

# Base directory of this module — relative defaults work regardless of CWD
_MODULE_BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# API Authentication for n8n (Basic Auth)
API_USERNAME = os.getenv("API_USERNAME", "admin")
API_PASSWORD = os.getenv("API_PASSWORD", "password")

# Admin fallback for UI
ADMIN_USERNAME = os.getenv('FLASK_ADMIN_USERNAME', 'admin')
ADMIN_PASSWORD = os.getenv('FLASK_ADMIN_PASSWORD', '1234')

# Index / source layout on disk
RAG_FAISS_INDEX_SUBDIR_NAME = "faiss_index"
RAG_STORAGE_PARENT_DIR = os.getenv("RAG_STORAGE_DIR", os.path.join(_MODULE_BASE_DIR, "faiss_storage"))
RAG_SOURCES_DIR = os.getenv("SOURCES_DIR", os.path.join(_MODULE_BASE_DIR, "sources"))
RAG_CHUNKED_SOURCES_FILENAME = "pre_chunked_sources.json"

# Side effect at import time: ensure both directories exist
os.makedirs(RAG_SOURCES_DIR, exist_ok=True)
os.makedirs(RAG_STORAGE_PARENT_DIR, exist_ok=True)

# Embedding and model configuration
RAG_EMBEDDING_MODEL_NAME = os.getenv("RAG_EMBEDDING_MODEL", "BAAI/bge-small-en")
RAG_EMBEDDING_USE_GPU = os.getenv("RAG_EMBEDDING_GPU", "False").lower() == "true"
RAG_LOAD_INDEX_ON_STARTUP = os.getenv("RAG_LOAD_INDEX", "True").lower() == "true"

# Retrieval Settings
RAG_INITIAL_FETCH_K = int(os.getenv("RAG_INITIAL_FETCH_K", 20))  # docs fetched from FAISS
RAG_RERANKER_K = int(os.getenv("RAG_RERANKER_K", 5))             # docs kept after reranking
# RESTORED: Incremental update limit
RAG_MAX_FILES_FOR_INCREMENTAL = int(os.getenv("RAG_MAX_FILES_FOR_INCREMENTAL", "50"))

# Chunk configuration
RAG_CHUNK_SIZE = int(os.getenv("RAG_CHUNK_SIZE", 1000))
RAG_CHUNK_OVERLAP = int(os.getenv("RAG_CHUNK_OVERLAP", 150))

# Reranker configuration
# NOTE(review): these defaults (enabled=True, jina-v2 model) differ from the
# committed .env (enabled=False, jina-v1-turbo); the .env values win when loaded.
RAG_RERANKER_MODEL_NAME = os.getenv("RAG_RERANKER_MODEL", "jinaai/jina-reranker-v2-base-multilingual")
RAG_RERANKER_ENABLED = os.getenv("RAG_RERANKER_ENABLED", "True").lower() == "true"

# GDrive configuration for RAG sources
GDRIVE_SOURCES_ENABLED = os.getenv("GDRIVE_SOURCES_ENABLED", "False").lower() == "true"
GDRIVE_FOLDER_ID_OR_URL = os.getenv("GDRIVE_FOLDER_URL")

# GDrive configuration for downloading a pre-built FAISS index
GDRIVE_INDEX_ENABLED = os.getenv("GDRIVE_INDEX_ENABLED", "False").lower() == "true"
GDRIVE_INDEX_ID_OR_URL = os.getenv("GDRIVE_INDEX_URL")

# RESTORED: GDrive configuration for downloading users.csv
GDRIVE_USERS_CSV_ENABLED = os.getenv("GDRIVE_USERS_CSV_ENABLED", "False").lower() == "true"
GDRIVE_USERS_CSV_ID_OR_URL = os.getenv("GDRIVE_USERS_CSV_URL")

# Detailed logging configuration
RAG_DETAILED_LOGGING = os.getenv("RAG_DETAILED_LOGGING", "True").lower() == "true"

# Startup summary so the effective configuration is visible in the logs
logger.info(f"RAG Config Loaded - Chunk Size: {RAG_CHUNK_SIZE}, Chunk Overlap: {RAG_CHUNK_OVERLAP}")
logger.info(f"Embedding Model: {RAG_EMBEDDING_MODEL_NAME}")
logger.info(f"Reranker Model: {RAG_RERANKER_MODEL_NAME}")
logger.info(f"Retrieval Pipeline: Initial Fetch K={RAG_INITIAL_FETCH_K}, Reranker Final K={RAG_RERANKER_K}")
logger.info(f"Detailed Logging: {'ENABLED' if RAG_DETAILED_LOGGING else 'DISABLED'}")
logger.info(f"GDrive Sources Download: {'ENABLED' if GDRIVE_SOURCES_ENABLED else 'DISABLED'}")
logger.info(f"GDrive Pre-built Index Download: {'ENABLED' if GDRIVE_INDEX_ENABLED else 'DISABLED'}")
logger.info(f"GDrive users.csv Download: {'ENABLED' if GDRIVE_USERS_CSV_ENABLED else 'DISABLED'}")
note.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://homemademirpur-ed-cad-ref.hf.space
postman.json ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "info": {
3
+ "_postman_id": "b8f9e9a1-5c8e-4a8e-9b8e-1f8e9a1f8e9a",
4
+ "name": "edmond_cad_refund",
5
+ "schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json"
6
+ },
7
+ "item": [
8
+ {
9
+ "name": "N8N - Search Knowledgebase",
10
+ "request": {
11
+ "auth": {
12
+ "type": "basic",
13
+ "basic": [
14
+ {
15
+ "key": "password",
16
+ "value": "password",
17
+ "type": "string"
18
+ },
19
+ {
20
+ "key": "username",
21
+ "value": "admin",
22
+ "type": "string"
23
+ }
24
+ ]
25
+ },
26
+ "method": "POST",
27
+ "header": [],
28
+ "body": {
29
+ "mode": "raw",
30
+ "raw": "{\n \"query\": \"how to get a refund for electronics?\",\n \"use_reranker\": true,\n \"final_k\": 5,\n \"persona\": [\"standard\"],\n \"tier\": [\"gold\"]\n}",
31
+ "options": {
32
+ "raw": {
33
+ "language": "json"
34
+ }
35
+ }
36
+ },
37
+ "url": {
38
+ "raw": "{{base_url}}/webhook/search",
39
+ "host": [
40
+ "{{base_url}}"
41
+ ],
42
+ "path": [
43
+ "webhook",
44
+ "search"
45
+ ]
46
+ },
47
+ "description": "Main endpoint used by N8N to retrieve context chunks."
48
+ },
49
+ "response": []
50
+ },
51
+ {
52
+ "name": "Admin - Rebuild Index",
53
+ "request": {
54
+ "auth": {
55
+ "type": "basic",
56
+ "basic": [
57
+ {
58
+ "key": "password",
59
+ "value": "1234",
60
+ "type": "string"
61
+ },
62
+ {
63
+ "key": "username",
64
+ "value": "admin",
65
+ "type": "string"
66
+ }
67
+ ]
68
+ },
69
+ "method": "POST",
70
+ "header": [],
71
+ "url": {
72
+ "raw": "{{base_url}}/admin/rebuild_index",
73
+ "host": [
74
+ "{{base_url}}"
75
+ ],
76
+ "path": [
77
+ "admin",
78
+ "rebuild_index"
79
+ ]
80
+ },
81
+ "description": "Completely deletes and rebuilds the FAISS index from sources."
82
+ },
83
+ "response": []
84
+ },
85
+ {
86
+ "name": "Admin - Update Index (Incremental)",
87
+ "request": {
88
+ "auth": {
89
+ "type": "basic",
90
+ "basic": [
91
+ {
92
+ "key": "password",
93
+ "value": "1234",
94
+ "type": "string"
95
+ },
96
+ {
97
+ "key": "username",
98
+ "value": "admin",
99
+ "type": "string"
100
+ }
101
+ ]
102
+ },
103
+ "method": "POST",
104
+ "header": [],
105
+ "body": {
106
+ "mode": "raw",
107
+ "raw": "{\n \"max_new_files\": 50\n}",
108
+ "options": {
109
+ "raw": {
110
+ "language": "json"
111
+ }
112
+ }
113
+ },
114
+ "url": {
115
+ "raw": "{{base_url}}/admin/update_faiss_index",
116
+ "host": [
117
+ "{{base_url}}"
118
+ ],
119
+ "path": [
120
+ "admin",
121
+ "update_faiss_index"
122
+ ]
123
+ },
124
+ "description": "Adds only new files to the existing index."
125
+ },
126
+ "response": []
127
+ },
128
+ {
129
+ "name": "Public - Status",
130
+ "request": {
131
+ "method": "GET",
132
+ "header": [],
133
+ "url": {
134
+ "raw": "{{base_url}}/status",
135
+ "host": [
136
+ "{{base_url}}"
137
+ ],
138
+ "path": [
139
+ "status"
140
+ ]
141
+ }
142
+ },
143
+ "response": []
144
+ }
145
+ ],
146
+ "variable": [
147
+ {
148
+ "key": "base_url",
149
+ "value": "http://localhost:5000",
150
+ "type": "string"
151
+ }
152
+ ]
153
+ }
rag_components.py ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import logging
import json
import time
from typing import List, Dict, Optional, Any

import torch
from sentence_transformers import CrossEncoder

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_text_splitters import RecursiveCharacterTextSplitter

from config import (
    RAG_RERANKER_MODEL_NAME, RAG_DETAILED_LOGGING,
    RAG_CHUNK_SIZE, RAG_CHUNK_OVERLAP, RAG_CHUNKED_SOURCES_FILENAME,
    RAG_FAISS_INDEX_SUBDIR_NAME, RAG_INITIAL_FETCH_K, RAG_RERANKER_K,
    RAG_MAX_FILES_FOR_INCREMENTAL
)
from utils import FAISS_RAG_SUPPORTED_EXTENSIONS

# Module-level logger; classes below also create named child loggers
# (e.g. __name__ + ".KnowledgeRAG") for finer-grained filtering.
logger = logging.getLogger(__name__)
27
+
28
+
29
class DocumentReranker:
    """Cross-encoder wrapper that re-scores retrieved documents against a query."""

    def __init__(self, model_name: str = RAG_RERANKER_MODEL_NAME):
        self.logger = logging.getLogger(__name__ + ".DocumentReranker")
        self.model_name = model_name
        self.model = None

        try:
            self.logger.info(f"[RERANKER_INIT] Loading reranker model: {self.model_name}")
            t0 = time.time()
            self.model = CrossEncoder(model_name, trust_remote_code=True)
            elapsed = time.time() - t0
            self.logger.info(f"[RERANKER_INIT] Reranker model '{self.model_name}' loaded successfully in {elapsed:.2f}s")
        except Exception as e:
            self.logger.error(f"[RERANKER_INIT] Failed to load reranker model '{self.model_name}': {e}", exc_info=True)
            raise RuntimeError(f"Could not initialize reranker model: {e}") from e

    def rerank_documents(self, query: str, documents: List[Document], top_k: int) -> List[Document]:
        """Return the top_k documents reordered by cross-encoder relevance.

        Falls back to plain truncation when the model is absent or scoring fails.
        Each returned document gets a "reranker_score" entry in its metadata.
        """
        if not documents or not self.model:
            return documents[:top_k] if documents else []

        try:
            t0 = time.time()
            pairs = [[query, d.page_content] for d in documents]
            scores = self.model.predict(pairs)
            self.logger.info(f"[RERANKER] Computed relevance scores in {time.time() - t0:.3f}s")

            # Highest score first; stable sort keeps original order on ties
            ranked = sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)

            top_docs = []
            for d, s in ranked[:top_k]:
                d.metadata["reranker_score"] = float(s)
                top_docs.append(d)
            return top_docs
        except Exception as e:
            self.logger.error(f"[RERANKER] Error during reranking: {e}", exc_info=True)
            return documents[:top_k] if documents else []
68
+
69
+
70
class FAISSRetrieverWithScore(BaseRetriever):
    """Retriever over a FAISS store that records retrieval (and optional
    reranker) scores in each returned document's metadata."""

    # Pydantic-declared fields (BaseRetriever is a pydantic model)
    vectorstore: FAISS
    reranker: Optional[DocumentReranker] = None
    initial_fetch_k: int = RAG_INITIAL_FETCH_K  # candidate pool size when reranking
    final_k: int = RAG_RERANKER_K               # docs kept after reranking

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:

        start_time = time.time()
        # Fetch a larger candidate pool only when a reranker will prune it
        num_to_fetch = self.initial_fetch_k if self.reranker else self.final_k

        # --- FIX: Use global logger, not self.logger ---
        # (this pydantic model declares no "logger" field, so self.logger would fail)
        logger.info(f"[RETRIEVER] Fetching {num_to_fetch} docs (Rerank={self.reranker is not None})")

        docs_and_scores = self.vectorstore.similarity_search_with_score(query, k=num_to_fetch)

        relevant_docs = []
        for doc, score in docs_and_scores:
            # NOTE(review): FAISS similarity_search_with_score returns a raw
            # distance-style score (lower is typically better) — stored as-is.
            doc.metadata["retrieval_score"] = float(score)
            relevant_docs.append(doc)

        if self.reranker and relevant_docs:
            relevant_docs = self.reranker.rerank_documents(query, relevant_docs, top_k=self.final_k)

        total_time = time.time() - start_time
        # --- FIX: Use global logger ---
        logger.info(f"[RETRIEVER] Completed in {total_time:.3f}s. Returned {len(relevant_docs)} docs.")
        return relevant_docs
100
+
101
+
102
+ class KnowledgeRAG:
103
    def __init__(
        self,
        index_storage_dir: str,
        embedding_model_name: str,
        use_gpu_for_embeddings: bool,
        chunk_size: int = RAG_CHUNK_SIZE,
        chunk_overlap: int = RAG_CHUNK_OVERLAP,
        reranker_model_name: Optional[str] = None,
        enable_reranker: bool = True,
    ):
        """Set up the embedding model and (optionally) the reranker.

        Args:
            index_storage_dir: Directory where the FAISS index is persisted
                (created if missing).
            embedding_model_name: HuggingFace model id for embeddings.
            use_gpu_for_embeddings: Request CUDA; silently falls back to CPU
                when unavailable.
            chunk_size: Max characters per chunk when building the index.
            chunk_overlap: Overlap between consecutive chunks.
            reranker_model_name: Cross-encoder id; defaults to
                RAG_RERANKER_MODEL_NAME when None.
            enable_reranker: When True, attempt to load the reranker; a load
                failure is logged and tolerated (self.reranker stays None).
        """
        self.logger = logging.getLogger(__name__ + ".KnowledgeRAG")
        self.logger.info(f"[RAG_INIT] Initializing KnowledgeRAG system")

        self.index_storage_dir = index_storage_dir
        os.makedirs(self.index_storage_dir, exist_ok=True)

        self.embedding_model_name = embedding_model_name
        self.use_gpu_for_embeddings = use_gpu_for_embeddings
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.reranker_model_name = reranker_model_name or RAG_RERANKER_MODEL_NAME
        self.enable_reranker = enable_reranker
        self.reranker = None

        # Resolve embedding device: GPU only when requested AND available
        device = "cpu"
        if self.use_gpu_for_embeddings:
            if torch.cuda.is_available():
                self.logger.info(f"[RAG_INIT] CUDA available. Requesting GPU.")
                device = "cuda"
            else:
                self.logger.warning("[RAG_INIT] CUDA not available. Fallback to CPU.")

        # normalize_embeddings=True produces unit vectors (cosine-style similarity)
        self.embeddings = HuggingFaceEmbeddings(
            model_name=self.embedding_model_name,
            model_kwargs={"device": device},
            encode_kwargs={"normalize_embeddings": True}
        )

        # Reranker load failures are non-fatal; search degrades to raw FAISS order
        if self.enable_reranker:
            try:
                self.reranker = DocumentReranker(self.reranker_model_name)
            except Exception as e:
                self.logger.warning(f"[RAG_INIT] Reranker Init Failed: {e}")
                self.reranker = None

        # Populated later by index build/load methods
        self.vector_store: Optional[FAISS] = None
        self.retriever: Optional[FAISSRetrieverWithScore] = None
        self.processed_source_files: List[str] = []
151
+
152
+ def _save_chunk_config(self):
153
+ """Persist current chunk settings alongside the FAISS index for change detection."""
154
+ faiss_path = os.path.join(self.index_storage_dir, RAG_FAISS_INDEX_SUBDIR_NAME)
155
+ config_file = os.path.join(faiss_path, "chunk_config.json")
156
+ with open(config_file, 'w') as f:
157
+ json.dump({"chunk_size": self.chunk_size, "chunk_overlap": self.chunk_overlap}, f)
158
+
159
+ def _load_chunk_config(self) -> Optional[dict]:
160
+ """Load previously saved chunk config. Returns None if not found."""
161
+ faiss_path = os.path.join(self.index_storage_dir, RAG_FAISS_INDEX_SUBDIR_NAME)
162
+ config_file = os.path.join(faiss_path, "chunk_config.json")
163
+ if os.path.exists(config_file):
164
+ with open(config_file, 'r') as f:
165
+ return json.load(f)
166
+ return None
167
+
168
+ def chunk_config_has_changed(self) -> bool:
169
+ """Returns True if chunk_size or chunk_overlap differ from what the index was built with."""
170
+ saved = self._load_chunk_config()
171
+ if saved is None:
172
+ return False # No record yet — assume compatible
173
+ changed = saved.get("chunk_size") != self.chunk_size or saved.get("chunk_overlap") != self.chunk_overlap
174
+ if changed:
175
+ self.logger.warning(
176
+ f"[CONFIG_CHANGE] Chunk config mismatch! "
177
+ f"Saved=(size={saved.get('chunk_size')}, overlap={saved.get('chunk_overlap')}) "
178
+ f"Current=(size={self.chunk_size}, overlap={self.chunk_overlap}). "
179
+ f"Index will be rebuilt."
180
+ )
181
+ return changed
182
+
183
+ def build_index_from_source_files(self, source_folder_path: str):
184
+ self.logger.info(f"[INDEX_BUILD] Building from: {source_folder_path}")
185
+ if not os.path.isdir(source_folder_path):
186
+ raise FileNotFoundError(f"Source folder not found: '{source_folder_path}'.")
187
+
188
+ all_docs = []
189
+ processed_files = []
190
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
191
+
192
+ # 1. Pre-chunked JSON
193
+ pre_chunked_path = os.path.join(self.index_storage_dir, RAG_CHUNKED_SOURCES_FILENAME)
194
+ if os.path.exists(pre_chunked_path):
195
+ try:
196
+ with open(pre_chunked_path, 'r', encoding='utf-8') as f:
197
+ chunk_data_list = json.load(f)
198
+ for chunk in chunk_data_list:
199
+ doc = Document(page_content=chunk.get("page_content", ""), metadata=chunk.get("metadata", {}))
200
+ all_docs.append(doc)
201
+ if 'source_document_name' in doc.metadata:
202
+ processed_files.append(doc.metadata['source_document_name'])
203
+ processed_files = sorted(list(set(processed_files)))
204
+ except Exception as e:
205
+ self.logger.error(f"[INDEX_BUILD] JSON load failed: {e}")
206
+
207
+ # 2. Raw Files
208
+ if not all_docs:
209
+ for filename in os.listdir(source_folder_path):
210
+ file_path = os.path.join(source_folder_path, filename)
211
+ if not os.path.isfile(file_path): continue
212
+ file_ext = filename.split('.')[-1].lower()
213
+ if file_ext in FAISS_RAG_SUPPORTED_EXTENSIONS:
214
+ text_content = FAISS_RAG_SUPPORTED_EXTENSIONS[file_ext](file_path)
215
+ if text_content:
216
+ chunks = text_splitter.split_text(text_content)
217
+ for i, chunk_text in enumerate(chunks):
218
+ meta = {"source_document_name": filename, "chunk_index": i}
219
+ all_docs.append(Document(page_content=chunk_text, metadata=meta))
220
+ processed_files.append(filename)
221
+
222
+ if not all_docs:
223
+ raise ValueError("No documents to index.")
224
+
225
+ self.processed_source_files = processed_files
226
+ self.logger.info(f"[INDEX_BUILD] Creating FAISS index with {len(all_docs)} chunks.")
227
+
228
+ self.vector_store = FAISS.from_documents(all_docs, self.embeddings)
229
+ faiss_path = os.path.join(self.index_storage_dir, RAG_FAISS_INDEX_SUBDIR_NAME)
230
+ self.vector_store.save_local(faiss_path)
231
+ self._save_chunk_config()
232
+
233
+ self.retriever = FAISSRetrieverWithScore(
234
+ vectorstore=self.vector_store,
235
+ reranker=self.reranker,
236
+ initial_fetch_k=RAG_INITIAL_FETCH_K,
237
+ final_k=RAG_RERANKER_K
238
+ )
239
+
240
    def load_index_from_disk(self):
        """Load a persisted FAISS index (and its file manifest) from the storage dir.

        Raises:
            FileNotFoundError: If the index subdirectory does not exist.
        """
        faiss_path = os.path.join(self.index_storage_dir, RAG_FAISS_INDEX_SUBDIR_NAME)
        if not os.path.exists(faiss_path):
            raise FileNotFoundError("Index not found.")

        # allow_dangerous_deserialization is needed by LangChain's FAISS loader
        # (index metadata is pickled) — only load indexes this app wrote itself.
        self.vector_store = FAISS.load_local(
            folder_path=faiss_path,
            embeddings=self.embeddings,
            allow_dangerous_deserialization=True
        )
        self.retriever = FAISSRetrieverWithScore(
            vectorstore=self.vector_store,
            reranker=self.reranker,
            initial_fetch_k=RAG_INITIAL_FETCH_K,
            final_k=RAG_RERANKER_K
        )

        # Restore metadata: the manifest of processed source files, if recorded.
        meta_file = os.path.join(faiss_path, "processed_files.json")
        if os.path.exists(meta_file):
            with open(meta_file, 'r') as f:
                self.processed_source_files = json.load(f)
        else:
            # Index saved without a manifest — use a human-readable placeholder.
            self.processed_source_files = ["Loaded from disk (unknown sources)"]

        self.logger.info("[INDEX_LOAD] Success.")
266
+
267
+ def update_index_with_new_files(self, source_folder_path: str, max_files_to_process: Optional[int] = None) -> Dict[str, Any]:
268
+ self.logger.info(f"[INDEX_UPDATE] Checking for new files in: {source_folder_path}")
269
+
270
+ if not self.vector_store:
271
+ raise RuntimeError("Cannot update: no index loaded.")
272
+
273
+ processed_set = set(self.processed_source_files)
274
+ all_new_files = []
275
+ for filename in sorted(os.listdir(source_folder_path)):
276
+ if filename not in processed_set:
277
+ file_ext = filename.split('.')[-1].lower()
278
+ if file_ext in FAISS_RAG_SUPPORTED_EXTENSIONS:
279
+ all_new_files.append(filename)
280
+
281
+ if not all_new_files:
282
+ return {"status": "success", "message": "No new files found.", "files_added": []}
283
+
284
+ limit = max_files_to_process if max_files_to_process is not None else RAG_MAX_FILES_FOR_INCREMENTAL
285
+ files_to_process = all_new_files[:limit]
286
+
287
+ new_docs = []
288
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
289
+
290
+ for filename in files_to_process:
291
+ file_path = os.path.join(source_folder_path, filename)
292
+ text_content = FAISS_RAG_SUPPORTED_EXTENSIONS[filename.split('.')[-1].lower()](file_path)
293
+
294
+ if text_content:
295
+ chunks = text_splitter.split_text(text_content)
296
+ for i, chunk_text in enumerate(chunks):
297
+ meta = {"source_document_name": filename, "chunk_index": i}
298
+ new_docs.append(Document(page_content=chunk_text, metadata=meta))
299
+
300
+ if not new_docs:
301
+ return {"status": "warning", "message": "New files found but no text extracted.", "files_added": []}
302
+
303
+ self.vector_store.add_documents(new_docs)
304
+
305
+ faiss_path = os.path.join(self.index_storage_dir, RAG_FAISS_INDEX_SUBDIR_NAME)
306
+ self.vector_store.save_local(faiss_path)
307
+
308
+ self.processed_source_files.extend(files_to_process)
309
+ with open(os.path.join(faiss_path, "processed_files.json"), 'w') as f:
310
+ json.dump(sorted(self.processed_source_files), f)
311
+
312
+ return {
313
+ "status": "success",
314
+ "message": f"Added {len(files_to_process)} files.",
315
+ "files_added": files_to_process,
316
+ "remaining": len(all_new_files) - len(files_to_process)
317
+ }
318
+
319
+ def search_knowledge_base(self, query: str, top_k: Optional[int] = None) -> List[Dict[str, Any]]:
320
+ if not self.retriever:
321
+ raise RuntimeError("Retriever not initialized.")
322
+
323
+ original_k = self.retriever.final_k
324
+ if top_k:
325
+ self.retriever.final_k = top_k
326
+
327
+ try:
328
+ docs = self.retriever.invoke(query)
329
+ results = []
330
+ for doc in docs:
331
+ results.append({
332
+ "content": doc.page_content,
333
+ "metadata": doc.metadata,
334
+ "score": doc.metadata.get("reranker_score") or doc.metadata.get("retrieval_score")
335
+ })
336
+ return results
337
+ finally:
338
+ self.retriever.final_k = original_k
rag_system.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import shutil
4
+ from typing import Optional
5
+
6
+ from rag_components import KnowledgeRAG
7
+ from utils import download_and_unzip_gdrive_folder
8
+ from config import (
9
+ GDRIVE_SOURCES_ENABLED, GDRIVE_FOLDER_ID_OR_URL, RAG_SOURCES_DIR,
10
+ RAG_STORAGE_PARENT_DIR, RAG_FAISS_INDEX_SUBDIR_NAME, RAG_LOAD_INDEX_ON_STARTUP,
11
+ RAG_EMBEDDING_MODEL_NAME, RAG_EMBEDDING_USE_GPU,
12
+ RAG_CHUNK_SIZE, RAG_CHUNK_OVERLAP,
13
+ RAG_RERANKER_MODEL_NAME, RAG_RERANKER_ENABLED
14
+ )
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
def initialize_and_get_rag_system(force_rebuild: bool = False, source_dir_override: Optional[str] = None) -> Optional[KnowledgeRAG]:
    """Create a KnowledgeRAG, loading the saved index or (re)building it.

    Args:
        force_rebuild: Delete any existing index and rebuild from sources.
        source_dir_override: Use this folder instead of RAG_SOURCES_DIR
            (also disables the GDrive download step).

    Returns:
        The ready system, or None when initialization fails fatally.
    """
    logger.info("[RAG_SYSTEM_INIT] Initializing...")
    source_dir = source_dir_override or RAG_SOURCES_DIR

    # Optionally refresh the local sources folder from Google Drive first.
    if GDRIVE_SOURCES_ENABLED and not source_dir_override and GDRIVE_FOLDER_ID_OR_URL:
        logger.info("[RAG_SYSTEM_INIT] Downloading sources from GDrive...")
        if os.path.exists(RAG_SOURCES_DIR):
            shutil.rmtree(RAG_SOURCES_DIR)
        download_and_unzip_gdrive_folder(GDRIVE_FOLDER_ID_OR_URL, RAG_SOURCES_DIR)

    index_path = os.path.join(RAG_STORAGE_PARENT_DIR, RAG_FAISS_INDEX_SUBDIR_NAME)
    if force_rebuild and os.path.exists(index_path):
        logger.info("[RAG_SYSTEM_INIT] Force rebuild: deleting old index.")
        shutil.rmtree(index_path)

    try:
        rag = KnowledgeRAG(
            index_storage_dir=RAG_STORAGE_PARENT_DIR,
            embedding_model_name=RAG_EMBEDDING_MODEL_NAME,
            use_gpu_for_embeddings=RAG_EMBEDDING_USE_GPU,
            chunk_size=RAG_CHUNK_SIZE,
            chunk_overlap=RAG_CHUNK_OVERLAP,
            reranker_model_name=RAG_RERANKER_MODEL_NAME,
            enable_reranker=RAG_RERANKER_ENABLED,
        )

        index_ready = False
        if RAG_LOAD_INDEX_ON_STARTUP and not force_rebuild:
            if rag.chunk_config_has_changed():
                # A changed chunking config invalidates the stored chunks.
                logger.warning("[RAG_SYSTEM_INIT] Chunk config changed — forcing index rebuild.")
            else:
                try:
                    rag.load_index_from_disk()
                    index_ready = True
                except Exception as e:
                    logger.warning(f"[RAG_SYSTEM_INIT] Load failed ({e}). Building new.")

        if not index_ready:
            if os.path.exists(source_dir) and os.listdir(source_dir):
                rag.build_index_from_source_files(source_dir)
            else:
                logger.warning("[RAG_SYSTEM_INIT] No sources found. System empty.")

        logger.info("[RAG_SYSTEM_INIT] Complete.")
        return rag

    except Exception as e:
        logger.critical(f"[RAG_SYSTEM_INIT] FATAL: {e}", exc_info=True)
        return None
requirements original.txt ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Flask==3.0.3
2
+ Flask_Cors==5.0.0
3
+ flask_session
4
+ numpy
5
+ pandas==2.2.3
6
+ # rapidfuzz==3.10.1
7
+ Requests==2.32.3
8
+ # scikit_learn==1.4.1.post1
9
+ # scikit_learn==1.5.2
10
+ psycopg2-binary==2.9.10
11
+ python-dotenv==1.0.1
12
+ apscheduler==3.11.0
13
+ redis==3.5.3
14
+ faiss-cpu==1.10.0
15
+ groq==0.15.0
16
+ llama_index==0.12.13
17
+ llama_index.llms.groq==0.3.1
18
+ # langchain_groq==0.2.4
19
+ # langchain_core==0.3.39
20
+ sentence_transformers==3.4.0
21
+ gunicorn
22
+ llama-index-embeddings-huggingface==0.5.4
23
+ onnxruntime==1.22.0
24
+ langchain-groq==0.3.2
25
+ python-docx==1.1.2
26
+ langchain==0.3.24
27
+ langchain_community==0.3.23
28
+ gdown==5.2.0
29
+ # torch
30
+ pymupdf==1.25.5
31
+ pypdf==5.4.0
32
+ hf_xet==1.1.10
33
+ # protobuf==3.20.3
34
+
35
+ # Note: on Windows, first install the Microsoft Visual C++ Redistributable: https://aka.ms/vs/17/release/vc_redist.x64.exe
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Flask==3.1.2
2
+ Flask_Cors==5.0.0
3
+ gdown==5.2.1
4
+ langchain==1.2.10
5
+ langchain_community==0.4.1
6
+ langchain_huggingface==1.2.0
7
+ pandas==3.0.0
8
+ pypdf==6.7.0
9
+ python-dotenv==1.2.1
10
+ python_docx==1.1.2
11
+ sentence_transformers==3.4.0
12
+ torch==2.9.0
13
+ langchain_core
14
+ langchain_text_splitters
15
+ faiss-cpu
sources/context.txt ADDED
The diff for this file is too large to render. See raw diff
 
templates/chat-bot.html ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <title>RAG API Dashboard</title>
6
+ <link href="https://fonts.googleapis.com/css?family=Roboto:400,500" rel="stylesheet">
7
+ <style>
8
+ body { font-family: 'Roboto', sans-serif; background: #f4f7f9; color: #333; display: flex; justify-content: center; align-items: center; height: 100vh; margin: 0; }
9
+ .container { background: white; padding: 40px; border-radius: 12px; box-shadow: 0 4px 15px rgba(0,0,0,0.1); width: 400px; text-align: center; }
10
+ h2 { margin-bottom: 20px; color: #2c3e50; }
11
+ .status-badge { display: inline-block; padding: 8px 15px; border-radius: 20px; background: #2ecc71; color: white; font-weight: bold; margin-bottom: 30px; }
12
+ .status-badge.error { background: #e74c3c; }
13
+ .btn { display: block; width: 100%; padding: 12px; margin: 10px 0; border: none; border-radius: 6px; cursor: pointer; font-size: 16px; transition: 0.3s; }
14
+ .btn-primary { background: #3498db; color: white; }
15
+ .btn-primary:hover { background: #2980b9; }
16
+ .btn-danger { background: #e74c3c; color: white; }
17
+ .btn-danger:hover { background: #c0392b; }
18
+ input { width: calc(100% - 22px); padding: 10px; margin: 5px 0 15px; border: 1px solid #ddd; border-radius: 6px; }
19
+ .admin-panel { display: none; text-align: left; border-top: 1px solid #eee; padding-top: 20px; margin-top: 20px; }
20
+ .log-box { background: #2c3e50; color: #2ecc71; padding: 10px; border-radius: 6px; font-family: monospace; font-size: 12px; height: 100px; overflow-y: auto; margin-top: 15px; display:none;}
21
+ </style>
22
+ </head>
23
+ <body>
24
+
25
+ <div class="container">
26
+ <h2>RAG API System</h2>
27
+ <div id="status-badge" class="status-badge">Checking Status...</div>
28
+
29
+ <!-- Login Form -->
30
+ <div id="login-form">
31
+ <input type="text" id="username" placeholder="Admin Username">
32
+ <input type="password" id="password" placeholder="Password">
33
+ <button class="btn btn-primary" onclick="login()">Admin Login</button>
34
+ </div>
35
+
36
+ <!-- Admin Panel (Hidden until logged in) -->
37
+ <div id="admin-panel" class="admin-panel">
38
+ <h3>Admin Controls</h3>
39
+ <p>Manage Vector Index</p>
40
+
41
+ <input type="number" id="max-files" placeholder="Max files (e.g. 50) for Update" value="50">
42
+ <button class="btn btn-primary" onclick="performAction('/admin/update_faiss_index')">Update Index (New Files)</button>
43
+ <button class="btn btn-danger" onclick="performAction('/admin/rebuild_index')">Rebuild Full Index</button>
44
+
45
+ <div id="log-box" class="log-box"></div>
46
+ </div>
47
+ </div>
48
+
49
+ <script src="https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js"></script>
50
+ <script>
51
+ let authHeader = null;
52
+
53
+ // Check Server Status on Load
54
+ async function checkStatus() {
55
+ try {
56
+ const res = await axios.get('/status');
57
+ const badge = document.getElementById('status-badge');
58
+ if (res.data.rag_initialized) {
59
+ badge.textContent = "System Online";
60
+ badge.className = "status-badge";
61
+ } else {
62
+ badge.textContent = "Not Initialized";
63
+ badge.className = "status-badge error";
64
+ }
65
+ } catch (e) {
66
+ document.getElementById('status-badge').textContent = "Server Offline";
67
+ document.getElementById('status-badge').className = "status-badge error";
68
+ }
69
+ }
70
+ checkStatus();
71
+
72
+ // Login Logic
73
+ async function login() {
74
+ const u = document.getElementById('username').value;
75
+ const p = document.getElementById('password').value;
76
+ const auth = { username: u, password: p };
77
+
78
+ try {
79
+ await axios.post('/admin/login', {}, { auth });
80
+ authHeader = auth;
81
+ document.getElementById('login-form').style.display = 'none';
82
+ document.getElementById('admin-panel').style.display = 'block';
83
+ } catch (e) {
84
+ alert("Login Failed");
85
+ }
86
+ }
87
+
88
+ // Admin Actions
89
+ async function performAction(url) {
90
+ const logBox = document.getElementById('log-box');
91
+ logBox.style.display = 'block';
92
+ logBox.textContent = "Processing...";
93
+
94
+ const payload = {};
95
+ if(url.includes('update')) {
96
+ payload.max_new_files = document.getElementById('max-files').value;
97
+ }
98
+
99
+ try {
100
+ const res = await axios.post(url, payload, { auth: authHeader });
101
+ logBox.textContent = JSON.stringify(res.data, null, 2);
102
+ } catch (e) {
103
+ logBox.textContent = "Error: " + (e.response ? JSON.stringify(e.response.data) : e.message);
104
+ }
105
+ }
106
+ </script>
107
+ </body>
108
+ </html>
utils.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import re
4
+ import shutil
5
+ import tempfile
6
+ import time
7
+ from typing import Optional
8
+ import zipfile
9
+
10
+ import gdown
11
+ from pypdf import PdfReader
12
+ import docx as python_docx
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
def extract_text_from_file(file_path: str, file_type: str) -> Optional[str]:
    """Extract plain text from a pdf/docx/txt file.

    Args:
        file_path: Path of the file to read.
        file_type: One of 'pdf', 'docx', 'txt' (lowercase extension).

    Returns:
        Stripped text content, or None when the type is unsupported, no text
        was found, or extraction raised.
    """
    logger.info(f"[TEXT_EXTRACTION] Starting extraction from {file_type.upper()} file: {file_path}")
    text_content = None
    try:
        if file_type == 'pdf':
            reader = PdfReader(file_path)
            # FIX: call extract_text() once per page — the original called it
            # twice (once for the truthiness test, once to join), doubling the
            # extraction work for every page.
            page_texts = (page.extract_text() for page in reader.pages)
            text_content = "".join(text + "\n" for text in page_texts if text)
        elif file_type == 'docx':
            doc = python_docx.Document(file_path)
            text_content = "\n".join(para.text for para in doc.paragraphs if para.text)
        elif file_type == 'txt':
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                text_content = f.read()
        else:
            logger.warning(f"[TEXT_EXTRACTION] Unsupported file type: {file_type}")
            return None

        if not text_content or not text_content.strip():
            logger.warning(f"[TEXT_EXTRACTION] No text content extracted from {file_path}")
            return None

        return text_content.strip()
    except Exception as e:
        logger.error(f"[TEXT_EXTRACTION] Error extracting text: {e}", exc_info=True)
        return None
41
+
42
# Maps each supported file extension to an extractor callable that takes a
# file path and returns its text (or None on failure) via extract_text_from_file.
FAISS_RAG_SUPPORTED_EXTENSIONS = {
    'pdf': lambda path: extract_text_from_file(path, 'pdf'),
    'docx': lambda path: extract_text_from_file(path, 'docx'),
    'txt': lambda path: extract_text_from_file(path, 'txt'),
}
47
+
48
def get_id_from_gdrive_input(url_or_id: str) -> Optional[str]:
    """Pull a Drive file/folder id out of a share URL, or pass a bare id through.

    Returns None for empty input or strings too short to be a plausible id.
    """
    if not url_or_id:
        return None
    # Try the known URL shapes in order: folder link, file link, uc?id= link.
    for pattern in (r"/folders/([a-zA-Z0-9_-]+)",
                    r"/d/([a-zA-Z0-9_-]+)",
                    r"id=([a-zA-Z0-9_-]+)"):
        match = re.search(pattern, url_or_id)
        if match:
            return match.group(1)
    # Otherwise assume the caller passed a bare id (real ids are long).
    return url_or_id if len(url_or_id) > 10 else None
57
+
58
def download_gdrive_file(file_id_or_url: str, target_path: str) -> bool:
    """
    Downloads a single file (like users.csv) from GDrive to a specific path.
    Returns True only when a non-empty file exists at ``target_path`` afterwards.
    """
    logger.info(f"[GDRIVE_SINGLE] Downloading file. Input: {file_id_or_url}")
    file_id = get_id_from_gdrive_input(file_id_or_url)
    if not file_id:
        logger.error("[GDRIVE_SINGLE] Invalid ID")
        return False

    try:
        # Make sure the destination directory exists before writing.
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
        # fuzzy=True lets gdown resolve the various share-link formats.
        gdown.download(id=file_id, output=target_path, quiet=False, fuzzy=True)

        downloaded_ok = os.path.exists(target_path) and os.path.getsize(target_path) > 0
        if downloaded_ok:
            logger.info("[GDRIVE_SINGLE] Success.")
        else:
            logger.error("[GDRIVE_SINGLE] Downloaded file is empty or missing.")
        return downloaded_ok
    except Exception as e:
        logger.error(f"[GDRIVE_SINGLE] Error: {e}", exc_info=True)
        return False
83
+
84
def download_and_unzip_gdrive_folder(folder_id_or_url: str, target_dir_for_contents: str) -> bool:
    """Download a GDrive folder and move its contents into ``target_dir_for_contents``.

    Downloads into a temp dir first, unwraps a single top-level wrapper
    directory if gdown created one, then moves everything over. The temp dir
    is always cleaned up. Returns True on success.
    """
    logger.info(f"[GDRIVE] Downloading folder. Input: {folder_id_or_url}")
    folder_id = get_id_from_gdrive_input(folder_id_or_url)
    if not folder_id:
        return False

    temp_dir = tempfile.mkdtemp()
    try:
        gdown.download_folder(id=folder_id, output=temp_dir, quiet=False, use_cookies=False)

        os.makedirs(target_dir_for_contents, exist_ok=True)

        # gdown may nest everything inside one wrapper directory — unwrap it.
        entries = os.listdir(temp_dir)
        src_root = temp_dir
        if len(entries) == 1 and os.path.isdir(os.path.join(temp_dir, entries[0])):
            src_root = os.path.join(temp_dir, entries[0])

        for item in os.listdir(src_root):
            shutil.move(os.path.join(src_root, item), os.path.join(target_dir_for_contents, item))

        logger.info(f"[GDRIVE] Download complete.")
        return True
    except Exception as e:
        logger.error(f"[GDRIVE] Error: {e}", exc_info=True)
        return False
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)
110
+
111
def download_and_unzip_gdrive_file(file_id_or_url: str, target_extraction_dir: str) -> bool:
    """Download a ZIP archive from Google Drive and extract it.

    Args:
        file_id_or_url: Drive share URL or bare file id of the ZIP.
        target_extraction_dir: Directory the archive is extracted into.

    Returns:
        True on successful download and extraction, False otherwise.
    """
    logger.info(f"[GDRIVE_ZIP] Downloading ZIP. Input: {file_id_or_url}")
    file_id = get_id_from_gdrive_input(file_id_or_url)
    if not file_id:
        return False

    # FIX: use a unique temp file — the fixed name "temp_download.zip" could
    # collide across concurrent calls — and always delete it afterwards
    # (it was previously leaked in the temp directory).
    fd, temp_zip = tempfile.mkstemp(suffix=".zip")
    os.close(fd)
    try:
        gdown.download(id=file_id, output=temp_zip, quiet=False)
        with zipfile.ZipFile(temp_zip, 'r') as zip_ref:
            zip_ref.extractall(target_extraction_dir)
        return True
    except Exception as e:
        logger.error(f"[GDRIVE_ZIP] Error: {e}", exc_info=True)
        return False
    finally:
        try:
            os.remove(temp_zip)
        except OSError:
            pass