Upload 15 files
Browse files- .env +52 -0
- Dockerfile +51 -0
- app.py +246 -0
- assets/users.csv +12 -0
- chunker.py +114 -0
- config.py +71 -0
- note.txt +1 -0
- postman.json +153 -0
- rag_components.py +338 -0
- rag_system.py +70 -0
- requirements original.txt +35 -0
- requirements.txt +15 -0
- sources/context.txt +0 -0
- templates/chat-bot.html +108 -0
- utils.py +124 -0
.env
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# --- API Credentials (FOR N8N AUTH) ---
|
| 2 |
+
# Used for the /webhook/search endpoint (Basic Auth)
|
| 3 |
+
API_USERNAME=admin
|
| 4 |
+
API_PASSWORD=12345
|
| 5 |
+
|
| 6 |
+
# --- Admin Dashboard Credentials (Fallback) ---
|
| 7 |
+
# Used if users.csv fails or is missing
|
| 8 |
+
FLASK_ADMIN_USERNAME=admin
|
| 9 |
+
FLASK_ADMIN_PASSWORD=1234
|
| 10 |
+
|
| 11 |
+
# --- RAG Settings ---
|
| 12 |
+
# Directory to store the vector index (automatically created)
|
| 13 |
+
RAG_STORAGE_DIR=faiss_storage
|
| 14 |
+
# Directory where your context.txt lives
|
| 15 |
+
SOURCES_DIR=sources
|
| 16 |
+
|
| 17 |
+
# --- RAG System Configuration ---
|
| 18 |
+
|
| 19 |
+
RAG_EMBEDDING_MODEL="BAAI/bge-large-en-v1.5" #BAAI/bge-small-en
|
| 20 |
+
RAG_EMBEDDING_GPU="false"
|
| 21 |
+
RAG_LOAD_INDEX="true"
|
| 22 |
+
|
| 23 |
+
# Step 1: Fetch this many documents from the vector database (FAISS).
|
| 24 |
+
RAG_INITIAL_FETCH_K=10
|
| 25 |
+
|
| 26 |
+
# Step 2: After reranking the initial docs, keep this many final documents for the LLM context.
|
| 27 |
+
RAG_RERANKER_K=5
|
| 28 |
+
|
| 29 |
+
RAG_MAX_FILES_FOR_INCREMENTAL=50
|
| 30 |
+
|
| 31 |
+
# Text chunking settings
|
| 32 |
+
RAG_CHUNK_SIZE=2000
|
| 33 |
+
RAG_CHUNK_OVERLAP=150
|
| 34 |
+
|
| 35 |
+
# --- Reranker & Retrieval Pipeline Settings ---
|
| 36 |
+
RAG_RERANKER_ENABLED=False
|
| 37 |
+
RAG_RERANKER_MODEL="jinaai/jina-reranker-v1-turbo-en"
|
| 38 |
+
# RAG_RERANKER_MODEL=jinaai/jina-reranker-v2-base-multilingual
|
| 39 |
+
|
| 40 |
+
# --- Google Drive Settings (Disabled) ---
|
| 41 |
+
GDRIVE_SOURCES_ENABLED=false
|
| 42 |
+
GDRIVE_FOLDER_URL="1xkBOzr8eN-lXRYNA62jbl3UHdmtZ4TJA"
|
| 43 |
+
GDRIVE_INDEX_ENABLED=false
|
| 44 |
+
GDRIVE_INDEX_URL="1wUsdasdsa7f8qENTR-lFmsZV"
|
| 45 |
+
|
| 46 |
+
GDRIVE_USERS_CSV_ENABLED=true
|
| 47 |
+
GDRIVE_USERS_CSV_URL="1yadsaHX2yy9MttYrLSE20"
|
| 48 |
+
|
| 49 |
+
# --- System ---
|
| 50 |
+
RAG_DETAILED_LOGGING=True
|
| 51 |
+
# PORT=5000
|
| 52 |
+
# FLASK_DEBUG=True
|
Dockerfile
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use Python 3.11 to support Pandas 3.x and newer libraries
|
| 2 |
+
FROM python:3.11-slim
|
| 3 |
+
|
| 4 |
+
# Set the working directory in the container
|
| 5 |
+
WORKDIR /app
|
| 6 |
+
|
| 7 |
+
# Install system dependencies
|
| 8 |
+
# libgl1 and libglib2.0-0 are often needed for CV/PDF libraries
|
| 9 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 10 |
+
libgl1 \
|
| 11 |
+
libglib2.0-0 \
|
| 12 |
+
build-essential \
|
| 13 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 14 |
+
|
| 15 |
+
# Copy the requirements file
|
| 16 |
+
COPY requirements.txt requirements.txt
|
| 17 |
+
|
| 18 |
+
# Install Python packages with timeout increase
|
| 19 |
+
RUN pip install --no-cache-dir --timeout=1000 -r requirements.txt
|
| 20 |
+
|
| 21 |
+
# Copy application code
|
| 22 |
+
COPY . /app
|
| 23 |
+
|
| 24 |
+
# Create a non-root user (Security Best Practice)
|
| 25 |
+
RUN useradd -m -u 1000 user
|
| 26 |
+
|
| 27 |
+
# Change ownership of the app directory and the temp directories
|
| 28 |
+
RUN chown -R user:user /app
|
| 29 |
+
|
| 30 |
+
# Create temp directories for HuggingFace/Torch cache and set permissions
|
| 31 |
+
RUN mkdir -p /tmp/transformers_cache /tmp/hf_home /tmp/torch_home && \
|
| 32 |
+
chown -R user:user /tmp/transformers_cache /tmp/hf_home /tmp/torch_home
|
| 33 |
+
|
| 34 |
+
# Switch to the non-root user
|
| 35 |
+
USER user
|
| 36 |
+
|
| 37 |
+
# Expose the port (Standard for HF Spaces)
|
| 38 |
+
EXPOSE 7860
|
| 39 |
+
|
| 40 |
+
# Set environment variables
|
| 41 |
+
ENV FLASK_HOST=0.0.0.0
|
| 42 |
+
ENV FLASK_PORT=7860
|
| 43 |
+
ENV FLASK_DEBUG=False
|
| 44 |
+
|
| 45 |
+
# CRITICAL: Set HF-specific env vars to writable directories
|
| 46 |
+
ENV TRANSFORMERS_CACHE=/tmp/transformers_cache
|
| 47 |
+
ENV HF_HOME=/tmp/hf_home
|
| 48 |
+
ENV TORCH_HOME=/tmp/torch_home
|
| 49 |
+
|
| 50 |
+
# Command to run the app
|
| 51 |
+
CMD ["python", "app.py"]
|
app.py
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from flask import Flask, request, jsonify, Response, render_template
|
| 2 |
+
from flask_cors import CORS
|
| 3 |
+
import os
|
| 4 |
+
import logging
|
| 5 |
+
import functools
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
|
| 9 |
+
# Load environment variables
|
| 10 |
+
load_dotenv()
|
| 11 |
+
|
| 12 |
+
# Custom Imports
|
| 13 |
+
from rag_system import initialize_and_get_rag_system
|
| 14 |
+
from config import (
|
| 15 |
+
API_USERNAME, API_PASSWORD, RAG_SOURCES_DIR,
|
| 16 |
+
GDRIVE_INDEX_ENABLED, GDRIVE_INDEX_ID_OR_URL,
|
| 17 |
+
GDRIVE_USERS_CSV_ENABLED, GDRIVE_USERS_CSV_ID_OR_URL,
|
| 18 |
+
ADMIN_USERNAME, ADMIN_PASSWORD,
|
| 19 |
+
RAG_RERANKER_K
|
| 20 |
+
)
|
| 21 |
+
from utils import download_and_unzip_gdrive_file, download_gdrive_file
|
| 22 |
+
|
| 23 |
+
# Logging Setup
|
| 24 |
+
logging.basicConfig(level=logging.INFO)
|
| 25 |
+
logger = logging.getLogger(__name__)
|
| 26 |
+
|
| 27 |
+
# Flask Init
|
| 28 |
+
app = Flask(__name__, static_folder='static', template_folder='templates')
|
| 29 |
+
CORS(app)
|
| 30 |
+
|
| 31 |
+
# Global State
|
| 32 |
+
rag_system = None
|
| 33 |
+
user_df = None
|
| 34 |
+
_APP_BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 35 |
+
|
| 36 |
+
# --- Helper: Load Users ---
|
| 37 |
+
def load_users_from_csv():
    """Load the user database from assets/users.csv into the global user_df.

    Emails are normalized (lowercased, stripped) for case-insensitive lookup
    at login time. On a missing or unreadable file, user_df is set to None so
    callers can fall back to the .env admin credentials.
    """
    global user_df
    assets_folder = os.path.join(_APP_BASE_DIR, 'assets')
    os.makedirs(assets_folder, exist_ok=True)
    users_csv_path = os.path.join(assets_folder, 'users.csv')

    try:
        if os.path.exists(users_csv_path):
            # FIX: dtype=str keeps passwords exactly as written in the CSV.
            # Without it, pandas coerces numeric-looking values, dropping
            # leading zeros and turning "1234" rows into floats ("1234.0").
            user_df = pd.read_csv(users_csv_path, dtype=str)
            # Normalize email for case-insensitive matching
            if 'email' in user_df.columns:
                user_df['email'] = user_df['email'].str.lower().str.strip()
            logger.info(f"Loaded {len(user_df)} users from CSV.")
        else:
            logger.warning("users.csv not found in assets folder.")
            user_df = None
    except Exception as e:
        logger.error(f"Failed to load users.csv: {e}")
        user_df = None
|
| 56 |
+
|
| 57 |
+
# --- Helper: Auth Decorators ---
|
| 58 |
+
def require_api_auth(f):
    """Decorator protecting the N8N webhook endpoint with HTTP Basic Auth.

    Credentials are checked against API_USERNAME / API_PASSWORD from .env.
    Responds 401 with a WWW-Authenticate challenge on failure.
    """
    @functools.wraps(f)
    def decorated(*args, **kwargs):
        # Local stdlib import: used only for the constant-time comparison.
        import hmac

        auth = request.authorization
        # FIX: compare_digest avoids leaking credential content via timing
        # differences, unlike the short-circuiting != comparison.
        if (not auth
                or not hmac.compare_digest(auth.username or '', API_USERNAME)
                or not hmac.compare_digest(auth.password or '', API_PASSWORD)):
            return Response('Unauthorized', 401, {'WWW-Authenticate': 'Basic realm="API Login Required"'})
        return f(*args, **kwargs)
    return decorated
|
| 68 |
+
|
| 69 |
+
def require_admin_auth(f):
    """Decorator protecting Admin rebuild/update endpoints via Basic Auth.

    Auth order (matches the .env documentation):
      1. If users.csv is loaded, the caller must match a CSV row with
         role == 'admin' (email compared case-insensitively).
      2. Only when users.csv is unavailable, fall back to the
         FLASK_ADMIN_USERNAME / FLASK_ADMIN_PASSWORD credentials from .env.
    """
    @functools.wraps(f)
    def decorated(*args, **kwargs):
        auth = request.authorization
        if not auth:
            return Response('Admin auth failed.', 401, {'WWW-Authenticate': 'Basic realm="Admin Login Required"'})

        # 1. Check against loaded CSV
        if user_df is not None:
            user_email = (auth.username or '').lower().strip()
            user_record = user_df[user_df['email'] == user_email]
            if not user_record.empty:
                user_data = user_record.iloc[0]
                # FIX: guard against a malformed CSV missing the expected
                # columns, which previously raised KeyError (HTTP 500).
                has_cols = 'password' in user_data and 'role' in user_data
                # Compare password as string
                if has_cols and str(user_data['password']) == auth.password and user_data['role'] == 'admin':
                    return f(*args, **kwargs)

        # 2. Fallback to .env Admin Credentials (only when no CSV is loaded)
        elif auth.username == ADMIN_USERNAME and auth.password == ADMIN_PASSWORD:
            return f(*args, **kwargs)

        return Response('Admin auth failed.', 401, {'WWW-Authenticate': 'Basic realm="Admin Login Required"'})
    return decorated
|
| 93 |
+
|
| 94 |
+
# --- Startup Logic (Fixed: No Decorator) ---
|
| 95 |
+
def run_startup_tasks():
    """Initializes RAG system and loads data. Called explicitly at import."""
    global rag_system
    logger.info("--- Executing Startup Tasks ---")

    # Optionally pull the user database from Google Drive first.
    if GDRIVE_USERS_CSV_ENABLED and GDRIVE_USERS_CSV_ID_OR_URL:
        csv_target = os.path.join(_APP_BASE_DIR, 'assets', 'users.csv')
        download_gdrive_file(GDRIVE_USERS_CSV_ID_OR_URL, csv_target)

    # Populate the in-memory user table from whatever CSV is now on disk.
    load_users_from_csv()

    # Optionally fetch a pre-built FAISS index archive into the cwd.
    if GDRIVE_INDEX_ENABLED and GDRIVE_INDEX_ID_OR_URL:
        download_and_unzip_gdrive_file(GDRIVE_INDEX_ID_OR_URL, os.getcwd())

    # Finally bring up the RAG system itself.
    rag_system = initialize_and_get_rag_system()
    logger.info("--- Startup Tasks Complete ---")
|
| 115 |
+
|
| 116 |
+
# Execute startup tasks immediately when this module is loaded
|
| 117 |
+
# This ensures it runs before the first request in Flask 3.x
|
| 118 |
+
with app.app_context():
|
| 119 |
+
run_startup_tasks()
|
| 120 |
+
|
| 121 |
+
# ===========================
|
| 122 |
+
# API ROUTES
|
| 123 |
+
# ===========================
|
| 124 |
+
|
| 125 |
+
# --- 1. N8N Webhook (The Core Function) ---
|
| 126 |
+
@app.route('/webhook/search', methods=['POST'])
@require_api_auth
def search_knowledgebase_api():
    """
    Main entry point for N8N.
    Expected JSON: { "query": "...", "use_reranker": bool, "final_k": int }
    """
    if not rag_system:
        # Try to recover if somehow not initialized
        return jsonify({"error": "RAG not initialized. Check server logs."}), 503

    payload = request.json or {}
    query = payload.get('query')
    if not query:
        return jsonify({"error": "Query field is required"}), 400

    # Per-request overrides; defaults come from config (RAG_RERANKER_K).
    top_k = payload.get('final_k', RAG_RERANKER_K)
    use_reranker = payload.get('use_reranker', True)

    # Dynamic reranker toggling.
    # NOTE(review): this mutates shared state on the global rag_system, so
    # concurrent requests with different use_reranker flags can interfere —
    # confirm whether the retriever API supports a per-call flag instead.
    if rag_system.retriever:
        if not use_reranker:
            rag_system.retriever.reranker = None
        elif use_reranker and rag_system.reranker:
            rag_system.retriever.reranker = rag_system.reranker

    try:
        hits = rag_system.search_knowledge_base(query, top_k=top_k)
        return jsonify({
            "results": hits,
            "count": len(hits),
            "status": "success"
        })
    except Exception as e:
        logger.error(f"Search API Error: {e}")
        return jsonify({"error": str(e)}), 500
|
| 164 |
+
|
| 165 |
+
# --- 2. User Login (RESTORED) ---
|
| 166 |
+
@app.route('/user-login', methods=['POST'])
def user_login():
    """
    Standard user login endpoint.
    Checks credentials against users.csv.

    Returns the matching user record (minus the password) on success,
    401 on bad credentials, 503 when no user database is loaded.
    """
    if user_df is None:
        return jsonify({"error": "User database not available."}), 503

    # FIX: request.json is None for non-JSON bodies; default to {} so a
    # malformed request yields a clean 400 instead of an AttributeError/500.
    data = request.json or {}
    email = (data.get('email') or '').lower().strip()
    password = data.get('password')

    if not email or not password:
        return jsonify({"error": "Email and password required"}), 400

    user_record = user_df[user_df['email'] == email]
    if not user_record.empty:
        u_data = user_record.iloc[0]
        # Compare both sides as strings (CSV values may load as numbers).
        if str(u_data['password']) == str(password):
            # Return user info (excluding password)
            resp = u_data.to_dict()
            resp.pop('password', None)
            return jsonify(resp), 200

    return jsonify({"error": "Invalid credentials"}), 401
|
| 193 |
+
|
| 194 |
+
# --- 3. UI Route ---
|
| 195 |
+
@app.route('/')
def index_route():
    """Serve the HTML chat dashboard."""
    return render_template('chat-bot.html')
|
| 199 |
+
|
| 200 |
+
# --- 4. Admin Auth Check ---
|
| 201 |
+
@app.route('/admin/login', methods=['POST'])
@require_admin_auth
def admin_login():
    """Verify admin credentials supplied via the Basic Auth header.

    The decorator does all the work; reaching the body means auth passed.
    """
    return jsonify({"status": "success", "message": "Authenticated"}), 200
|
| 206 |
+
|
| 207 |
+
# --- 5. Admin RAG Controls ---
|
| 208 |
+
@app.route('/admin/update_faiss_index', methods=['POST'])
@require_admin_auth
def update_faiss_index():
    """Incrementally add new source files to the existing FAISS index."""
    if not rag_system:
        return jsonify({"error": "RAG system not initialized"}), 503

    body = request.json or {}
    file_limit = body.get('max_new_files')

    try:
        outcome = rag_system.update_index_with_new_files(RAG_SOURCES_DIR, file_limit)
        return jsonify(outcome), 200
    except Exception as exc:
        return jsonify({"error": str(exc)}), 500
|
| 222 |
+
|
| 223 |
+
@app.route('/admin/rebuild_index', methods=['POST'])
@require_admin_auth
def rebuild_index():
    """Drop the current FAISS index and rebuild it from the sources."""
    global rag_system
    try:
        # force_rebuild=True makes initialization discard any existing index.
        rag_system = initialize_and_get_rag_system(force_rebuild=True)
        return jsonify({"status": "Index rebuilt successfully"}), 200
    except Exception as exc:
        return jsonify({"error": str(exc)}), 500
|
| 233 |
+
|
| 234 |
+
# --- 6. Status Check ---
|
| 235 |
+
@app.route('/status', methods=['GET'])
def status_route():
    """Lightweight health check reporting component readiness."""
    payload = {
        "status": "online",
        "rag_initialized": rag_system is not None,
        "users_loaded": user_df is not None,
    }
    return jsonify(payload)
|
| 242 |
+
|
| 243 |
+
if __name__ == '__main__':
    # Default to 7860 for Hugging Face Spaces
    port = int(os.environ.get("PORT", 7860))
    # FIX: honor FLASK_HOST / FLASK_DEBUG (set by the Dockerfile) instead of
    # hardcoding; defaults reproduce the previous behavior exactly.
    host = os.environ.get("FLASK_HOST", "0.0.0.0")
    debug = os.environ.get("FLASK_DEBUG", "False").lower() == "true"
    app.run(host=host, port=port, debug=debug)
|
assets/users.csv
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
sl,name,email,password,role
|
| 2 |
+
1,Sifat Hossain Fahim,fahim@ge-bd.com,WorldTour1234!,admin
|
| 3 |
+
2,Sakib Ahmed,sakib.ahmed@ge-bd.com,12345678!,admin
|
| 4 |
+
3,Rezwanul Islam,rezwanul@ge-bd.com,marstour1234!,admin
|
| 5 |
+
4,Sarwar Jahan,sarwar.piel@ge-bd.com,password123,user
|
| 6 |
+
5,Rezaul Kabir,rezaul.kabir@ge-bd.com,securepass,user
|
| 7 |
+
6,Test,test@test.com,12345678!,user
|
| 8 |
+
7,Sadiquzzaman,sadiquzzaman@ge-bd.com,wqeqw1234,user
|
| 9 |
+
8,Sadman,sadman@ge-bd.com,1234fvb,user
|
| 10 |
+
9,Pavel,pavel@ge-bd.com,12314rdf,user
|
| 11 |
+
10,Sajib,sajib.hossain@ge-bd.com,1234rge,user
|
| 12 |
+
11,Abdur Rahim,arahim@ge-bd.com,23ree4rt,user
|
chunker.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import logging
|
| 3 |
+
import json
|
| 4 |
+
import argparse
|
| 5 |
+
from typing import List, Dict, Optional
|
| 6 |
+
|
| 7 |
+
# --- UPDATED IMPORT ---
|
| 8 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 9 |
+
# ----------------------
|
| 10 |
+
|
| 11 |
+
from utils import extract_text_from_file, FAISS_RAG_SUPPORTED_EXTENSIONS
|
| 12 |
+
|
| 13 |
+
# --- Logging Setup ---
|
| 14 |
+
logging.basicConfig(
|
| 15 |
+
level=logging.INFO,
|
| 16 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
| 17 |
+
handlers=[
|
| 18 |
+
logging.StreamHandler()
|
| 19 |
+
]
|
| 20 |
+
)
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
def process_sources_and_create_chunks(
    sources_dir: str,
    output_file: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 150,
    text_output_dir: Optional[str] = None
) -> None:
    """Extract text from every supported file in *sources_dir*, split it into
    overlapping chunks, and dump the chunk list as JSON to *output_file*.

    Args:
        sources_dir: Directory containing the raw source documents.
        output_file: Path of the JSON file that receives the chunk list.
        chunk_size: Target characters per chunk.
        chunk_overlap: Characters shared between consecutive chunks.
        text_output_dir: If given, raw extracted text is also saved here.

    Raises:
        FileNotFoundError: If *sources_dir* does not exist.
    """
    if not os.path.isdir(sources_dir):
        logger.error(f"Source directory not found: '{sources_dir}'")
        raise FileNotFoundError(f"Source directory not found: '{sources_dir}'")

    logger.info(f"Starting chunking process. Sources: '{sources_dir}', Output: '{output_file}'")

    if text_output_dir:
        os.makedirs(text_output_dir, exist_ok=True)
        logger.info(f"Will save raw extracted text to: '{text_output_dir}'")

    all_chunks_for_json: List[Dict] = []
    processed_files_count = 0

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    for filename in os.listdir(sources_dir):
        file_path = os.path.join(sources_dir, filename)
        if not os.path.isfile(file_path):
            continue

        file_ext = filename.split('.')[-1].lower()
        if file_ext not in FAISS_RAG_SUPPORTED_EXTENSIONS:
            # FIX: log messages contained a literal "(unknown)" placeholder
            # where the filename was meant to be interpolated.
            logger.debug(f"Skipping unsupported file: {filename}")
            continue

        logger.info(f"Processing source file: {filename}")
        # Dispatch to the extractor registered for this extension.
        text_content = FAISS_RAG_SUPPORTED_EXTENSIONS[file_ext](file_path)

        if text_content:
            if text_output_dir:
                try:
                    text_output_path = os.path.join(text_output_dir, f"{filename}.txt")
                    with open(text_output_path, 'w', encoding='utf-8') as f_text:
                        f_text.write(text_content)
                except Exception as e_text_save:
                    logger.error(f"Could not save extracted text for '{filename}': {e_text_save}")

            chunks = text_splitter.split_text(text_content)
            for i, chunk_text in enumerate(chunks):
                chunk_data = {
                    "page_content": chunk_text,
                    "metadata": {
                        "source_document_name": filename,
                        "chunk_index": i,
                        "full_location": f"{filename}, Chunk {i+1}"
                    }
                }
                all_chunks_for_json.append(chunk_data)

        processed_files_count += 1

    if not all_chunks_for_json:
        logger.warning(f"No processable documents found in '{sources_dir}'.")

    # FIX: dirname is '' when output_file is a bare filename, and
    # os.makedirs('') raises FileNotFoundError — only create when non-empty.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_chunks_for_json, f, indent=2)

    logger.info(f"Chunking complete. Processed {processed_files_count} files. Total chunks: {len(all_chunks_for_json)}")
|
| 91 |
+
|
| 92 |
+
def main():
    """CLI entry point: parse arguments and run the chunking pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--sources-dir', type=str, required=True)
    parser.add_argument('--output-file', type=str, required=True)
    parser.add_argument('--text-output-dir', type=str, default=None)
    parser.add_argument('--chunk-size', type=int, default=1000)
    parser.add_argument('--chunk-overlap', type=int, default=150)
    cli_args = parser.parse_args()

    try:
        process_sources_and_create_chunks(
            sources_dir=cli_args.sources_dir,
            output_file=cli_args.output_file,
            chunk_size=cli_args.chunk_size,
            chunk_overlap=cli_args.chunk_overlap,
            text_output_dir=cli_args.text_output_dir
        )
    except Exception as e:
        # Surface the full traceback before signalling failure to the shell.
        logger.critical(f"Chunking failed: {e}", exc_info=True)
        exit(1)

if __name__ == "__main__":
    main()
|
config.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import logging

# --- Logging Setup ---
# Configure basicConfig only if no handler is attached yet, so importing this
# module does not clobber an application-level logging setup.
logger = logging.getLogger(__name__)
if not logger.handlers:
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

# Base directory of this module; anchors relative storage/source paths.
_MODULE_BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# API Authentication for n8n (Basic Auth)
API_USERNAME = os.getenv("API_USERNAME", "admin")
API_PASSWORD = os.getenv("API_PASSWORD", "password")

# Admin fallback for UI
ADMIN_USERNAME = os.getenv('FLASK_ADMIN_USERNAME', 'admin')
ADMIN_PASSWORD = os.getenv('FLASK_ADMIN_PASSWORD', '1234')

# Index and source locations
RAG_FAISS_INDEX_SUBDIR_NAME = "faiss_index"
RAG_STORAGE_PARENT_DIR = os.getenv("RAG_STORAGE_DIR", os.path.join(_MODULE_BASE_DIR, "faiss_storage"))
RAG_SOURCES_DIR = os.getenv("SOURCES_DIR", os.path.join(_MODULE_BASE_DIR, "sources"))
RAG_CHUNKED_SOURCES_FILENAME = "pre_chunked_sources.json"

# Ensure both directories exist at import time (import-time side effect).
os.makedirs(RAG_SOURCES_DIR, exist_ok=True)
os.makedirs(RAG_STORAGE_PARENT_DIR, exist_ok=True)

# Embedding and model configuration
RAG_EMBEDDING_MODEL_NAME = os.getenv("RAG_EMBEDDING_MODEL", "BAAI/bge-small-en")
RAG_EMBEDDING_USE_GPU = os.getenv("RAG_EMBEDDING_GPU", "False").lower() == "true"
RAG_LOAD_INDEX_ON_STARTUP = os.getenv("RAG_LOAD_INDEX", "True").lower() == "true"

# Retrieval Settings
# FIX: getenv defaults are now consistently strings (some were bare ints,
# some quoted); int() parses either, but the mix was inconsistent.
RAG_INITIAL_FETCH_K = int(os.getenv("RAG_INITIAL_FETCH_K", "20"))
RAG_RERANKER_K = int(os.getenv("RAG_RERANKER_K", "5"))
# Incremental update limit
RAG_MAX_FILES_FOR_INCREMENTAL = int(os.getenv("RAG_MAX_FILES_FOR_INCREMENTAL", "50"))

# Chunk configuration
RAG_CHUNK_SIZE = int(os.getenv("RAG_CHUNK_SIZE", "1000"))
RAG_CHUNK_OVERLAP = int(os.getenv("RAG_CHUNK_OVERLAP", "150"))

# Reranker configuration
RAG_RERANKER_MODEL_NAME = os.getenv("RAG_RERANKER_MODEL", "jinaai/jina-reranker-v2-base-multilingual")
RAG_RERANKER_ENABLED = os.getenv("RAG_RERANKER_ENABLED", "True").lower() == "true"

# GDrive configuration for RAG sources
GDRIVE_SOURCES_ENABLED = os.getenv("GDRIVE_SOURCES_ENABLED", "False").lower() == "true"
GDRIVE_FOLDER_ID_OR_URL = os.getenv("GDRIVE_FOLDER_URL")

# GDrive configuration for downloading a pre-built FAISS index
GDRIVE_INDEX_ENABLED = os.getenv("GDRIVE_INDEX_ENABLED", "False").lower() == "true"
GDRIVE_INDEX_ID_OR_URL = os.getenv("GDRIVE_INDEX_URL")

# GDrive configuration for downloading users.csv
GDRIVE_USERS_CSV_ENABLED = os.getenv("GDRIVE_USERS_CSV_ENABLED", "False").lower() == "true"
GDRIVE_USERS_CSV_ID_OR_URL = os.getenv("GDRIVE_USERS_CSV_URL")

# Detailed logging configuration
RAG_DETAILED_LOGGING = os.getenv("RAG_DETAILED_LOGGING", "True").lower() == "true"

logger.info(f"RAG Config Loaded - Chunk Size: {RAG_CHUNK_SIZE}, Chunk Overlap: {RAG_CHUNK_OVERLAP}")
logger.info(f"Embedding Model: {RAG_EMBEDDING_MODEL_NAME}")
logger.info(f"Reranker Model: {RAG_RERANKER_MODEL_NAME}")
logger.info(f"Retrieval Pipeline: Initial Fetch K={RAG_INITIAL_FETCH_K}, Reranker Final K={RAG_RERANKER_K}")
logger.info(f"Detailed Logging: {'ENABLED' if RAG_DETAILED_LOGGING else 'DISABLED'}")
logger.info(f"GDrive Sources Download: {'ENABLED' if GDRIVE_SOURCES_ENABLED else 'DISABLED'}")
logger.info(f"GDrive Pre-built Index Download: {'ENABLED' if GDRIVE_INDEX_ENABLED else 'DISABLED'}")
logger.info(f"GDrive users.csv Download: {'ENABLED' if GDRIVE_USERS_CSV_ENABLED else 'DISABLED'}")
|
note.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://homemademirpur-ed-cad-ref.hf.space
|
postman.json
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"info": {
|
| 3 |
+
"_postman_id": "b8f9e9a1-5c8e-4a8e-9b8e-1f8e9a1f8e9a",
|
| 4 |
+
"name": "edmond_cad_refund",
|
| 5 |
+
"schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json"
|
| 6 |
+
},
|
| 7 |
+
"item": [
|
| 8 |
+
{
|
| 9 |
+
"name": "N8N - Search Knowledgebase",
|
| 10 |
+
"request": {
|
| 11 |
+
"auth": {
|
| 12 |
+
"type": "basic",
|
| 13 |
+
"basic": [
|
| 14 |
+
{
|
| 15 |
+
"key": "password",
|
| 16 |
+
"value": "password",
|
| 17 |
+
"type": "string"
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"key": "username",
|
| 21 |
+
"value": "admin",
|
| 22 |
+
"type": "string"
|
| 23 |
+
}
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
+
"method": "POST",
|
| 27 |
+
"header": [],
|
| 28 |
+
"body": {
|
| 29 |
+
"mode": "raw",
|
| 30 |
+
"raw": "{\n \"query\": \"how to get a refund for electronics?\",\n \"use_reranker\": true,\n \"final_k\": 5,\n \"persona\": [\"standard\"],\n \"tier\": [\"gold\"]\n}",
|
| 31 |
+
"options": {
|
| 32 |
+
"raw": {
|
| 33 |
+
"language": "json"
|
| 34 |
+
}
|
| 35 |
+
}
|
| 36 |
+
},
|
| 37 |
+
"url": {
|
| 38 |
+
"raw": "{{base_url}}/webhook/search",
|
| 39 |
+
"host": [
|
| 40 |
+
"{{base_url}}"
|
| 41 |
+
],
|
| 42 |
+
"path": [
|
| 43 |
+
"webhook",
|
| 44 |
+
"search"
|
| 45 |
+
]
|
| 46 |
+
},
|
| 47 |
+
"description": "Main endpoint used by N8N to retrieve context chunks."
|
| 48 |
+
},
|
| 49 |
+
"response": []
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"name": "Admin - Rebuild Index",
|
| 53 |
+
"request": {
|
| 54 |
+
"auth": {
|
| 55 |
+
"type": "basic",
|
| 56 |
+
"basic": [
|
| 57 |
+
{
|
| 58 |
+
"key": "password",
|
| 59 |
+
"value": "1234",
|
| 60 |
+
"type": "string"
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"key": "username",
|
| 64 |
+
"value": "admin",
|
| 65 |
+
"type": "string"
|
| 66 |
+
}
|
| 67 |
+
]
|
| 68 |
+
},
|
| 69 |
+
"method": "POST",
|
| 70 |
+
"header": [],
|
| 71 |
+
"url": {
|
| 72 |
+
"raw": "{{base_url}}/admin/rebuild_index",
|
| 73 |
+
"host": [
|
| 74 |
+
"{{base_url}}"
|
| 75 |
+
],
|
| 76 |
+
"path": [
|
| 77 |
+
"admin",
|
| 78 |
+
"rebuild_index"
|
| 79 |
+
]
|
| 80 |
+
},
|
| 81 |
+
"description": "Completely deletes and rebuilds the FAISS index from sources."
|
| 82 |
+
},
|
| 83 |
+
"response": []
|
| 84 |
+
},
|
| 85 |
+
{
|
| 86 |
+
"name": "Admin - Update Index (Incremental)",
|
| 87 |
+
"request": {
|
| 88 |
+
"auth": {
|
| 89 |
+
"type": "basic",
|
| 90 |
+
"basic": [
|
| 91 |
+
{
|
| 92 |
+
"key": "password",
|
| 93 |
+
"value": "1234",
|
| 94 |
+
"type": "string"
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"key": "username",
|
| 98 |
+
"value": "admin",
|
| 99 |
+
"type": "string"
|
| 100 |
+
}
|
| 101 |
+
]
|
| 102 |
+
},
|
| 103 |
+
"method": "POST",
|
| 104 |
+
"header": [],
|
| 105 |
+
"body": {
|
| 106 |
+
"mode": "raw",
|
| 107 |
+
"raw": "{\n \"max_new_files\": 50\n}",
|
| 108 |
+
"options": {
|
| 109 |
+
"raw": {
|
| 110 |
+
"language": "json"
|
| 111 |
+
}
|
| 112 |
+
}
|
| 113 |
+
},
|
| 114 |
+
"url": {
|
| 115 |
+
"raw": "{{base_url}}/admin/update_faiss_index",
|
| 116 |
+
"host": [
|
| 117 |
+
"{{base_url}}"
|
| 118 |
+
],
|
| 119 |
+
"path": [
|
| 120 |
+
"admin",
|
| 121 |
+
"update_faiss_index"
|
| 122 |
+
]
|
| 123 |
+
},
|
| 124 |
+
"description": "Adds only new files to the existing index."
|
| 125 |
+
},
|
| 126 |
+
"response": []
|
| 127 |
+
},
|
| 128 |
+
{
|
| 129 |
+
"name": "Public - Status",
|
| 130 |
+
"request": {
|
| 131 |
+
"method": "GET",
|
| 132 |
+
"header": [],
|
| 133 |
+
"url": {
|
| 134 |
+
"raw": "{{base_url}}/status",
|
| 135 |
+
"host": [
|
| 136 |
+
"{{base_url}}"
|
| 137 |
+
],
|
| 138 |
+
"path": [
|
| 139 |
+
"status"
|
| 140 |
+
]
|
| 141 |
+
}
|
| 142 |
+
},
|
| 143 |
+
"response": []
|
| 144 |
+
}
|
| 145 |
+
],
|
| 146 |
+
"variable": [
|
| 147 |
+
{
|
| 148 |
+
"key": "base_url",
|
| 149 |
+
"value": "http://localhost:5000",
|
| 150 |
+
"type": "string"
|
| 151 |
+
}
|
| 152 |
+
]
|
| 153 |
+
}
|
rag_components.py
ADDED
|
@@ -0,0 +1,338 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import logging
|
| 3 |
+
import json
|
| 4 |
+
import time
|
| 5 |
+
from typing import List, Dict, Optional, Any
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
from sentence_transformers import CrossEncoder
|
| 9 |
+
|
| 10 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 11 |
+
from langchain_community.vectorstores import FAISS
|
| 12 |
+
|
| 13 |
+
from langchain_core.documents import Document
|
| 14 |
+
from langchain_core.retrievers import BaseRetriever
|
| 15 |
+
from langchain_core.callbacks import CallbackManagerForRetrieverRun
|
| 16 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 17 |
+
|
| 18 |
+
from config import (
|
| 19 |
+
RAG_RERANKER_MODEL_NAME, RAG_DETAILED_LOGGING,
|
| 20 |
+
RAG_CHUNK_SIZE, RAG_CHUNK_OVERLAP, RAG_CHUNKED_SOURCES_FILENAME,
|
| 21 |
+
RAG_FAISS_INDEX_SUBDIR_NAME, RAG_INITIAL_FETCH_K, RAG_RERANKER_K,
|
| 22 |
+
RAG_MAX_FILES_FOR_INCREMENTAL
|
| 23 |
+
)
|
| 24 |
+
from utils import FAISS_RAG_SUPPORTED_EXTENSIONS
|
| 25 |
+
|
| 26 |
+
logger = logging.getLogger(__name__)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class DocumentReranker:
    """Second-stage ranker backed by a sentence-transformers CrossEncoder.

    Scores (query, passage) pairs and keeps the highest-scoring passages.
    Degrades to simple truncation whenever the model is missing or scoring
    raises, so retrieval never hard-fails because of the reranker.
    """

    def __init__(self, model_name: str = RAG_RERANKER_MODEL_NAME):
        """Load the cross-encoder model, raising RuntimeError on failure."""
        self.logger = logging.getLogger(__name__ + ".DocumentReranker")
        self.model_name = model_name
        self.model = None

        try:
            self.logger.info(f"[RERANKER_INIT] Loading reranker model: {self.model_name}")
            t0 = time.time()
            # trust_remote_code is required by some cross-encoder repos.
            self.model = CrossEncoder(model_name, trust_remote_code=True)
            load_time = time.time() - t0
            self.logger.info(f"[RERANKER_INIT] Reranker model '{self.model_name}' loaded successfully in {load_time:.2f}s")
        except Exception as e:
            self.logger.error(f"[RERANKER_INIT] Failed to load reranker model '{self.model_name}': {e}", exc_info=True)
            raise RuntimeError(f"Could not initialize reranker model: {e}") from e

    def rerank_documents(self, query: str, documents: List[Document], top_k: int) -> List[Document]:
        """Return up to ``top_k`` documents ordered by cross-encoder relevance.

        Each kept document gets a ``reranker_score`` float in its metadata.
        When no model is loaded, or scoring fails, the first ``top_k`` input
        documents are returned unchanged.
        """
        if not documents or not self.model:
            return documents[:top_k] if documents else []

        try:
            t0 = time.time()
            pairs = [[query, d.page_content] for d in documents]
            scores = self.model.predict(pairs)
            rerank_time = time.time() - t0
            self.logger.info(f"[RERANKER] Computed relevance scores in {rerank_time:.3f}s")

            # Highest relevance first.
            ranked = sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)

            kept = []
            for doc, score in ranked[:top_k]:
                doc.metadata["reranker_score"] = float(score)
                kept.append(doc)
            return kept
        except Exception as e:
            self.logger.error(f"[RERANKER] Error during reranking: {e}", exc_info=True)
            return documents[:top_k] if documents else []
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class FAISSRetrieverWithScore(BaseRetriever):
    """Retriever that records raw FAISS similarity scores in document
    metadata and optionally applies a cross-encoder rerank pass.

    Fields (pydantic-declared, part of the public interface):
        vectorstore: backing FAISS store.
        reranker: optional second-stage ranker; when present,
            ``initial_fetch_k`` candidates are fetched and cut to ``final_k``.
        initial_fetch_k: candidate pool size before reranking.
        final_k: number of documents ultimately returned.
    """
    vectorstore: FAISS
    reranker: Optional[DocumentReranker] = None
    initial_fetch_k: int = RAG_INITIAL_FETCH_K
    final_k: int = RAG_RERANKER_K

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        t0 = time.time()
        # Over-fetch only when a reranker exists to pick the best final_k.
        num_to_fetch = self.initial_fetch_k if self.reranker else self.final_k

        # Module-level logger: pydantic BaseRetriever fields do not include
        # a per-instance logger attribute.
        logger.info(f"[RETRIEVER] Fetching {num_to_fetch} docs (Rerank={self.reranker is not None})")

        hits = self.vectorstore.similarity_search_with_score(query, k=num_to_fetch)

        relevant_docs = []
        for doc, raw_score in hits:
            doc.metadata["retrieval_score"] = float(raw_score)
            relevant_docs.append(doc)

        if self.reranker and relevant_docs:
            relevant_docs = self.reranker.rerank_documents(query, relevant_docs, top_k=self.final_k)

        total_time = time.time() - t0
        logger.info(f"[RETRIEVER] Completed in {total_time:.3f}s. Returned {len(relevant_docs)} docs.")
        return relevant_docs
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
class KnowledgeRAG:
    """FAISS-backed knowledge base with optional cross-encoder reranking.

    Responsibilities:
      * build / load / incrementally update a FAISS index under
        ``index_storage_dir``
      * track which source files are already indexed (processed_files.json)
      * persist the chunking config used to build the index so config
        changes can trigger a rebuild
      * answer searches through a score-annotating retriever
    """

    def __init__(
        self,
        index_storage_dir: str,
        embedding_model_name: str,
        use_gpu_for_embeddings: bool,
        chunk_size: int = RAG_CHUNK_SIZE,
        chunk_overlap: int = RAG_CHUNK_OVERLAP,
        reranker_model_name: Optional[str] = None,
        enable_reranker: bool = True,
    ):
        """
        Args:
            index_storage_dir: Parent directory for the FAISS index and metadata.
            embedding_model_name: HuggingFace model id for embeddings.
            use_gpu_for_embeddings: Request CUDA; silently falls back to CPU.
            chunk_size: Character length of each text chunk.
            chunk_overlap: Character overlap between consecutive chunks.
            reranker_model_name: Cross-encoder id; defaults to RAG_RERANKER_MODEL_NAME.
            enable_reranker: Whether to attempt loading the reranker at all.
        """
        self.logger = logging.getLogger(__name__ + ".KnowledgeRAG")
        self.logger.info("[RAG_INIT] Initializing KnowledgeRAG system")

        self.index_storage_dir = index_storage_dir
        os.makedirs(self.index_storage_dir, exist_ok=True)

        self.embedding_model_name = embedding_model_name
        self.use_gpu_for_embeddings = use_gpu_for_embeddings
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.reranker_model_name = reranker_model_name or RAG_RERANKER_MODEL_NAME
        self.enable_reranker = enable_reranker
        self.reranker = None

        # Resolve embedding device; CUDA request degrades to CPU with a warning.
        device = "cpu"
        if self.use_gpu_for_embeddings:
            if torch.cuda.is_available():
                self.logger.info("[RAG_INIT] CUDA available. Requesting GPU.")
                device = "cuda"
            else:
                self.logger.warning("[RAG_INIT] CUDA not available. Fallback to CPU.")

        # normalize_embeddings=True so FAISS scores behave like cosine similarity.
        self.embeddings = HuggingFaceEmbeddings(
            model_name=self.embedding_model_name,
            model_kwargs={"device": device},
            encode_kwargs={"normalize_embeddings": True}
        )

        if self.enable_reranker:
            try:
                self.reranker = DocumentReranker(self.reranker_model_name)
            except Exception as e:
                # Reranking is an optional quality boost; degrade gracefully.
                self.logger.warning(f"[RAG_INIT] Reranker Init Failed: {e}")
                self.reranker = None

        self.vector_store: Optional[FAISS] = None
        self.retriever: Optional[FAISSRetrieverWithScore] = None
        self.processed_source_files: List[str] = []

    # ------------------------------------------------------------------ #
    # Internal helpers
    # ------------------------------------------------------------------ #

    def _faiss_path(self) -> str:
        """Absolute path of the FAISS index sub-directory."""
        return os.path.join(self.index_storage_dir, RAG_FAISS_INDEX_SUBDIR_NAME)

    def _make_retriever(self) -> FAISSRetrieverWithScore:
        """Build a retriever bound to the current vector store and reranker."""
        return FAISSRetrieverWithScore(
            vectorstore=self.vector_store,
            reranker=self.reranker,
            initial_fetch_k=RAG_INITIAL_FETCH_K,
            final_k=RAG_RERANKER_K
        )

    def _save_processed_files(self):
        """Persist the sorted list of indexed source files next to the index."""
        with open(os.path.join(self._faiss_path(), "processed_files.json"), 'w') as f:
            json.dump(sorted(self.processed_source_files), f)

    def _save_chunk_config(self):
        """Persist current chunk settings alongside the FAISS index for change detection."""
        config_file = os.path.join(self._faiss_path(), "chunk_config.json")
        with open(config_file, 'w') as f:
            json.dump({"chunk_size": self.chunk_size, "chunk_overlap": self.chunk_overlap}, f)

    def _load_chunk_config(self) -> Optional[dict]:
        """Load previously saved chunk config. Returns None if not found."""
        config_file = os.path.join(self._faiss_path(), "chunk_config.json")
        if os.path.exists(config_file):
            with open(config_file, 'r') as f:
                return json.load(f)
        return None

    def chunk_config_has_changed(self) -> bool:
        """Returns True if chunk_size or chunk_overlap differ from what the index was built with."""
        saved = self._load_chunk_config()
        if saved is None:
            return False  # No record yet — assume compatible
        changed = saved.get("chunk_size") != self.chunk_size or saved.get("chunk_overlap") != self.chunk_overlap
        if changed:
            self.logger.warning(
                f"[CONFIG_CHANGE] Chunk config mismatch! "
                f"Saved=(size={saved.get('chunk_size')}, overlap={saved.get('chunk_overlap')}) "
                f"Current=(size={self.chunk_size}, overlap={self.chunk_overlap}). "
                f"Index will be rebuilt."
            )
        return changed

    # ------------------------------------------------------------------ #
    # Index lifecycle
    # ------------------------------------------------------------------ #

    def build_index_from_source_files(self, source_folder_path: str):
        """Build a fresh FAISS index from pre-chunked JSON (if present) or raw files.

        Raises:
            FileNotFoundError: the source folder does not exist.
            ValueError: nothing indexable was found.
        """
        self.logger.info(f"[INDEX_BUILD] Building from: {source_folder_path}")
        if not os.path.isdir(source_folder_path):
            raise FileNotFoundError(f"Source folder not found: '{source_folder_path}'.")

        all_docs = []
        processed_files = []
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)

        # 1. Pre-chunked JSON takes precedence over re-chunking raw files.
        pre_chunked_path = os.path.join(self.index_storage_dir, RAG_CHUNKED_SOURCES_FILENAME)
        if os.path.exists(pre_chunked_path):
            try:
                with open(pre_chunked_path, 'r', encoding='utf-8') as f:
                    chunk_data_list = json.load(f)
                for chunk in chunk_data_list:
                    doc = Document(page_content=chunk.get("page_content", ""), metadata=chunk.get("metadata", {}))
                    all_docs.append(doc)
                    if 'source_document_name' in doc.metadata:
                        processed_files.append(doc.metadata['source_document_name'])
                processed_files = sorted(set(processed_files))
            except Exception as e:
                # Fall through to raw-file indexing below.
                self.logger.error(f"[INDEX_BUILD] JSON load failed: {e}")

        # 2. Raw files (only when no pre-chunked docs were loaded).
        if not all_docs:
            for filename in os.listdir(source_folder_path):
                file_path = os.path.join(source_folder_path, filename)
                if not os.path.isfile(file_path):
                    continue
                file_ext = filename.split('.')[-1].lower()
                if file_ext in FAISS_RAG_SUPPORTED_EXTENSIONS:
                    text_content = FAISS_RAG_SUPPORTED_EXTENSIONS[file_ext](file_path)
                    if text_content:
                        chunks = text_splitter.split_text(text_content)
                        for i, chunk_text in enumerate(chunks):
                            meta = {"source_document_name": filename, "chunk_index": i}
                            all_docs.append(Document(page_content=chunk_text, metadata=meta))
                        processed_files.append(filename)

        if not all_docs:
            raise ValueError("No documents to index.")

        self.processed_source_files = processed_files
        self.logger.info(f"[INDEX_BUILD] Creating FAISS index with {len(all_docs)} chunks.")

        self.vector_store = FAISS.from_documents(all_docs, self.embeddings)
        self.vector_store.save_local(self._faiss_path())
        self._save_chunk_config()
        # BUG FIX: persist the processed-file list. Previously only
        # update_index_with_new_files() wrote processed_files.json, so a
        # rebuilt index reloaded from disk lost track of its sources and a
        # later incremental update re-added (duplicated) every file.
        self._save_processed_files()

        self.retriever = self._make_retriever()

    def load_index_from_disk(self):
        """Load an existing FAISS index plus its processed-file metadata.

        Raises:
            FileNotFoundError: no index directory exists on disk.
        """
        faiss_path = self._faiss_path()
        if not os.path.exists(faiss_path):
            raise FileNotFoundError("Index not found.")

        # allow_dangerous_deserialization: the pickle was produced by this
        # app itself, not untrusted input.
        self.vector_store = FAISS.load_local(
            folder_path=faiss_path,
            embeddings=self.embeddings,
            allow_dangerous_deserialization=True
        )
        self.retriever = self._make_retriever()

        # Restore metadata about which sources are already indexed.
        meta_file = os.path.join(faiss_path, "processed_files.json")
        if os.path.exists(meta_file):
            with open(meta_file, 'r') as f:
                self.processed_source_files = json.load(f)
        else:
            self.processed_source_files = ["Loaded from disk (unknown sources)"]

        self.logger.info("[INDEX_LOAD] Success.")

    def update_index_with_new_files(self, source_folder_path: str, max_files_to_process: Optional[int] = None) -> Dict[str, Any]:
        """Incrementally add not-yet-indexed supported files to the index.

        Args:
            source_folder_path: Folder scanned for new files.
            max_files_to_process: Per-call cap; defaults to
                RAG_MAX_FILES_FOR_INCREMENTAL when None.

        Returns:
            Status dict: ``status``, ``message``, ``files_added`` (plus
            ``remaining`` when files were added).

        Raises:
            RuntimeError: when no index is loaded yet.
        """
        self.logger.info(f"[INDEX_UPDATE] Checking for new files in: {source_folder_path}")

        if not self.vector_store:
            raise RuntimeError("Cannot update: no index loaded.")

        processed_set = set(self.processed_source_files)
        all_new_files = []
        for filename in sorted(os.listdir(source_folder_path)):
            if filename not in processed_set:
                file_ext = filename.split('.')[-1].lower()
                if file_ext in FAISS_RAG_SUPPORTED_EXTENSIONS:
                    all_new_files.append(filename)

        if not all_new_files:
            return {"status": "success", "message": "No new files found.", "files_added": []}

        limit = max_files_to_process if max_files_to_process is not None else RAG_MAX_FILES_FOR_INCREMENTAL
        files_to_process = all_new_files[:limit]

        new_docs = []
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)

        for filename in files_to_process:
            file_path = os.path.join(source_folder_path, filename)
            # Look up the extractor once per file instead of re-splitting.
            extractor = FAISS_RAG_SUPPORTED_EXTENSIONS[filename.split('.')[-1].lower()]
            text_content = extractor(file_path)

            if text_content:
                chunks = text_splitter.split_text(text_content)
                for i, chunk_text in enumerate(chunks):
                    meta = {"source_document_name": filename, "chunk_index": i}
                    new_docs.append(Document(page_content=chunk_text, metadata=meta))

        if not new_docs:
            return {"status": "warning", "message": "New files found but no text extracted.", "files_added": []}

        self.vector_store.add_documents(new_docs)
        self.vector_store.save_local(self._faiss_path())

        self.processed_source_files.extend(files_to_process)
        self._save_processed_files()

        return {
            "status": "success",
            "message": f"Added {len(files_to_process)} files.",
            "files_added": files_to_process,
            "remaining": len(all_new_files) - len(files_to_process)
        }

    # ------------------------------------------------------------------ #
    # Search
    # ------------------------------------------------------------------ #

    def search_knowledge_base(self, query: str, top_k: Optional[int] = None) -> List[Dict[str, Any]]:
        """Search the index; returns dicts with content / metadata / score.

        Args:
            query: Natural-language query.
            top_k: Optional override of how many results to return.

        Raises:
            RuntimeError: when the retriever is not initialized.
        """
        if not self.retriever:
            raise RuntimeError("Retriever not initialized.")

        original_k = self.retriever.final_k
        # BUG FIX: use `is not None` so an explicit top_k=0 is honored
        # (the old truthiness test silently ignored it).
        if top_k is not None:
            self.retriever.final_k = top_k

        try:
            docs = self.retriever.invoke(query)
            return [
                {
                    "content": doc.page_content,
                    "metadata": doc.metadata,
                    # Prefer the reranker score when reranking actually ran.
                    "score": doc.metadata.get("reranker_score") or doc.metadata.get("retrieval_score"),
                }
                for doc in docs
            ]
        finally:
            # Always restore the configured k, even if retrieval raised.
            self.retriever.final_k = original_k
|
rag_system.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import logging
|
| 3 |
+
import shutil
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
from rag_components import KnowledgeRAG
|
| 7 |
+
from utils import download_and_unzip_gdrive_folder
|
| 8 |
+
from config import (
|
| 9 |
+
GDRIVE_SOURCES_ENABLED, GDRIVE_FOLDER_ID_OR_URL, RAG_SOURCES_DIR,
|
| 10 |
+
RAG_STORAGE_PARENT_DIR, RAG_FAISS_INDEX_SUBDIR_NAME, RAG_LOAD_INDEX_ON_STARTUP,
|
| 11 |
+
RAG_EMBEDDING_MODEL_NAME, RAG_EMBEDDING_USE_GPU,
|
| 12 |
+
RAG_CHUNK_SIZE, RAG_CHUNK_OVERLAP,
|
| 13 |
+
RAG_RERANKER_MODEL_NAME, RAG_RERANKER_ENABLED
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
def initialize_and_get_rag_system(force_rebuild: bool = False, source_dir_override: Optional[str] = None) -> Optional[KnowledgeRAG]:
    """Create the KnowledgeRAG instance and get it ready to serve queries.

    Optionally syncs sources from Google Drive first, then either loads the
    persisted FAISS index or builds a fresh one from the source folder.

    Args:
        force_rebuild: Delete any on-disk index and rebuild from sources.
        source_dir_override: Use this folder instead of RAG_SOURCES_DIR
            (also disables the GDrive sync).

    Returns:
        A ready KnowledgeRAG, or None when initialization fails fatally.
    """
    logger.info("[RAG_SYSTEM_INIT] Initializing...")
    active_source_dir = source_dir_override if source_dir_override else RAG_SOURCES_DIR

    # Optional Google Drive sync (skipped when a source dir was overridden).
    if GDRIVE_SOURCES_ENABLED and not source_dir_override and GDRIVE_FOLDER_ID_OR_URL:
        logger.info("[RAG_SYSTEM_INIT] Downloading sources from GDrive...")
        if os.path.exists(RAG_SOURCES_DIR):
            shutil.rmtree(RAG_SOURCES_DIR)
        download_and_unzip_gdrive_folder(GDRIVE_FOLDER_ID_OR_URL, RAG_SOURCES_DIR)

    index_dir = os.path.join(RAG_STORAGE_PARENT_DIR, RAG_FAISS_INDEX_SUBDIR_NAME)

    if force_rebuild and os.path.exists(index_dir):
        logger.info("[RAG_SYSTEM_INIT] Force rebuild: deleting old index.")
        shutil.rmtree(index_dir)

    try:
        rag = KnowledgeRAG(
            index_storage_dir=RAG_STORAGE_PARENT_DIR,
            embedding_model_name=RAG_EMBEDDING_MODEL_NAME,
            use_gpu_for_embeddings=RAG_EMBEDDING_USE_GPU,
            chunk_size=RAG_CHUNK_SIZE,
            chunk_overlap=RAG_CHUNK_OVERLAP,
            reranker_model_name=RAG_RERANKER_MODEL_NAME,
            enable_reranker=RAG_RERANKER_ENABLED,
        )

        index_ready = False
        if RAG_LOAD_INDEX_ON_STARTUP and not force_rebuild:
            if rag.chunk_config_has_changed():
                # Chunk settings drifted from what the index was built with.
                logger.warning("[RAG_SYSTEM_INIT] Chunk config changed — forcing index rebuild.")
            else:
                try:
                    rag.load_index_from_disk()
                    index_ready = True
                except Exception as e:
                    logger.warning(f"[RAG_SYSTEM_INIT] Load failed ({e}). Building new.")

        if not index_ready:
            if not os.path.exists(active_source_dir) or not os.listdir(active_source_dir):
                logger.warning("[RAG_SYSTEM_INIT] No sources found. System empty.")
            else:
                rag.build_index_from_source_files(active_source_dir)

        logger.info("[RAG_SYSTEM_INIT] Complete.")
        return rag

    except Exception as e:
        logger.critical(f"[RAG_SYSTEM_INIT] FATAL: {e}", exc_info=True)
        return None
|
requirements original.txt
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Flask==3.0.3
|
| 2 |
+
Flask_Cors==5.0.0
|
| 3 |
+
flask_session
|
| 4 |
+
numpy
|
| 5 |
+
pandas==2.2.3
|
| 6 |
+
# rapidfuzz==3.10.1
|
| 7 |
+
Requests==2.32.3
|
| 8 |
+
# scikit_learn==1.4.1.post1
|
| 9 |
+
# scikit_learn==1.5.2
|
| 10 |
+
psycopg2-binary==2.9.10
|
| 11 |
+
python-dotenv==1.0.1
|
| 12 |
+
apscheduler==3.11.0
|
| 13 |
+
redis==3.5.3
|
| 14 |
+
faiss-cpu==1.10.0
|
| 15 |
+
groq==0.15.0
|
| 16 |
+
llama_index==0.12.13
|
| 17 |
+
llama_index.llms.groq==0.3.1
|
| 18 |
+
# langchain_groq==0.2.4
|
| 19 |
+
# langchain_core==0.3.39
|
| 20 |
+
sentence_transformers==3.4.0
|
| 21 |
+
gunicorn
|
| 22 |
+
llama-index-embeddings-huggingface==0.5.4
|
| 23 |
+
onnxruntime==1.22.0
|
| 24 |
+
langchain-groq==0.3.2
|
| 25 |
+
python-docx==1.1.2
|
| 26 |
+
langchain==0.3.24
|
| 27 |
+
langchain_community==0.3.23
|
| 28 |
+
gdown==5.2.0
|
| 29 |
+
# torch
|
| 30 |
+
pymupdf==1.25.5
|
| 31 |
+
pypdf==5.4.0
|
| 32 |
+
hf_xet==1.1.10
|
| 33 |
+
# protobuf==3.20.3
|
| 34 |
+
|
| 35 |
+
# must install https://aka.ms/vs/17/release/vc_redist.x64.exe
|
requirements.txt
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Flask==3.1.2
|
| 2 |
+
Flask_Cors==5.0.0
|
| 3 |
+
gdown==5.2.1
|
| 4 |
+
langchain==1.2.10
|
| 5 |
+
langchain_community==0.4.1
|
| 6 |
+
langchain_huggingface==1.2.0
|
| 7 |
+
pandas==3.0.0
|
| 8 |
+
pypdf==6.7.0
|
| 9 |
+
python-dotenv==1.2.1
|
| 10 |
+
python_docx==1.1.2
|
| 11 |
+
sentence_transformers==3.4.0
|
| 12 |
+
torch==2.9.0
|
| 13 |
+
langchain_core
|
| 14 |
+
langchain_text_splitters
|
| 15 |
+
faiss-cpu
|
sources/context.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
templates/chat-bot.html
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
<!-- Minimal admin dashboard for the RAG API: shows server status and,
     after basic-auth login, exposes index update/rebuild controls. -->
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>RAG API Dashboard</title>
    <link href="https://fonts.googleapis.com/css?family=Roboto:400,500" rel="stylesheet">
    <style>
        body { font-family: 'Roboto', sans-serif; background: #f4f7f9; color: #333; display: flex; justify-content: center; align-items: center; height: 100vh; margin: 0; }
        .container { background: white; padding: 40px; border-radius: 12px; box-shadow: 0 4px 15px rgba(0,0,0,0.1); width: 400px; text-align: center; }
        h2 { margin-bottom: 20px; color: #2c3e50; }
        .status-badge { display: inline-block; padding: 8px 15px; border-radius: 20px; background: #2ecc71; color: white; font-weight: bold; margin-bottom: 30px; }
        .status-badge.error { background: #e74c3c; }
        .btn { display: block; width: 100%; padding: 12px; margin: 10px 0; border: none; border-radius: 6px; cursor: pointer; font-size: 16px; transition: 0.3s; }
        .btn-primary { background: #3498db; color: white; }
        .btn-primary:hover { background: #2980b9; }
        .btn-danger { background: #e74c3c; color: white; }
        .btn-danger:hover { background: #c0392b; }
        input { width: calc(100% - 22px); padding: 10px; margin: 5px 0 15px; border: 1px solid #ddd; border-radius: 6px; }
        .admin-panel { display: none; text-align: left; border-top: 1px solid #eee; padding-top: 20px; margin-top: 20px; }
        .log-box { background: #2c3e50; color: #2ecc71; padding: 10px; border-radius: 6px; font-family: monospace; font-size: 12px; height: 100px; overflow-y: auto; margin-top: 15px; display:none;}
    </style>
</head>
<body>

    <div class="container">
        <h2>RAG API System</h2>
        <!-- Updated by checkStatus() on page load -->
        <div id="status-badge" class="status-badge">Checking Status...</div>

        <!-- Login Form -->
        <div id="login-form">
            <input type="text" id="username" placeholder="Admin Username">
            <input type="password" id="password" placeholder="Password">
            <button class="btn btn-primary" onclick="login()">Admin Login</button>
        </div>

        <!-- Admin Panel (Hidden until logged in) -->
        <div id="admin-panel" class="admin-panel">
            <h3>Admin Controls</h3>
            <p>Manage Vector Index</p>

            <!-- Cap on how many new files one incremental update may add -->
            <input type="number" id="max-files" placeholder="Max files (e.g. 50) for Update" value="50">
            <button class="btn btn-primary" onclick="performAction('/admin/update_faiss_index')">Update Index (New Files)</button>
            <button class="btn btn-danger" onclick="performAction('/admin/rebuild_index')">Rebuild Full Index</button>

            <!-- Raw JSON response / error output from admin actions -->
            <div id="log-box" class="log-box"></div>
        </div>
    </div>

    <script src="https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js"></script>
    <script>
        // Basic-auth credentials captured at login and reused for admin calls.
        let authHeader = null;

        // Check Server Status on Load
        async function checkStatus() {
            try {
                const res = await axios.get('/status');
                const badge = document.getElementById('status-badge');
                if (res.data.rag_initialized) {
                    badge.textContent = "System Online";
                    badge.className = "status-badge";
                } else {
                    badge.textContent = "Not Initialized";
                    badge.className = "status-badge error";
                }
            } catch (e) {
                // Any network/HTTP failure is treated as "offline".
                document.getElementById('status-badge').textContent = "Server Offline";
                document.getElementById('status-badge').className = "status-badge error";
            }
        }
        checkStatus();

        // Login Logic
        async function login() {
            const u = document.getElementById('username').value;
            const p = document.getElementById('password').value;
            const auth = { username: u, password: p };

            try {
                // An empty POST just validates the credentials server-side.
                await axios.post('/admin/login', {}, { auth });
                authHeader = auth;
                document.getElementById('login-form').style.display = 'none';
                document.getElementById('admin-panel').style.display = 'block';
            } catch (e) {
                alert("Login Failed");
            }
        }

        // Admin Actions
        async function performAction(url) {
            const logBox = document.getElementById('log-box');
            logBox.style.display = 'block';
            logBox.textContent = "Processing...";

            const payload = {};
            // Only the incremental-update endpoint takes a file cap.
            if(url.includes('update')) {
                payload.max_new_files = document.getElementById('max-files').value;
            }

            try {
                const res = await axios.post(url, payload, { auth: authHeader });
                logBox.textContent = JSON.stringify(res.data, null, 2);
            } catch (e) {
                logBox.textContent = "Error: " + (e.response ? JSON.stringify(e.response.data) : e.message);
            }
        }
    </script>
</body>
</html>
|
utils.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import logging
|
| 3 |
+
import re
|
| 4 |
+
import shutil
|
| 5 |
+
import tempfile
|
| 6 |
+
import time
|
| 7 |
+
from typing import Optional
|
| 8 |
+
import zipfile
|
| 9 |
+
|
| 10 |
+
import gdown
|
| 11 |
+
from pypdf import PdfReader
|
| 12 |
+
import docx as python_docx
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
def extract_text_from_file(file_path: str, file_type: str) -> Optional[str]:
    """Extract plain text from a pdf / docx / txt file.

    Args:
        file_path: Path of the file to read.
        file_type: Lowercase extension without the dot ('pdf', 'docx', 'txt').

    Returns:
        The stripped text content, or None when the type is unsupported,
        the file yields no text, or extraction raises.
    """
    # Function-local lookup of the same module logger keeps this function
    # self-contained; logging output is unchanged.
    log = logging.getLogger(__name__)
    log.info(f"[TEXT_EXTRACTION] Starting extraction from {file_type.upper()} file: {file_path}")
    text_content = None
    try:
        if file_type == 'pdf':
            reader = PdfReader(file_path)
            # BUG FIX: extract each page exactly once. The old code called
            # page.extract_text() twice per page (once in the filter, once in
            # the join), doubling the most expensive step.
            page_texts = (page.extract_text() for page in reader.pages)
            text_content = "".join(t + "\n" for t in page_texts if t)
        elif file_type == 'docx':
            doc = python_docx.Document(file_path)
            text_content = "\n".join(para.text for para in doc.paragraphs if para.text)
        elif file_type == 'txt':
            # errors='ignore': tolerate mixed/unknown encodings in sources.
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                text_content = f.read()
        else:
            log.warning(f"[TEXT_EXTRACTION] Unsupported file type: {file_type}")
            return None

        if not text_content or not text_content.strip():
            log.warning(f"[TEXT_EXTRACTION] No text content extracted from {file_path}")
            return None

        return text_content.strip()
    except Exception as e:
        # Extraction failures are logged and reported as "no text" so one bad
        # file never aborts a whole indexing run.
        log.error(f"[TEXT_EXTRACTION] Error extracting text: {e}", exc_info=True)
        return None
|
| 41 |
+
|
| 42 |
+
# Maps each supported lowercase file extension to a callable that extracts
# text from a file of that type. Doubles as the "is this file supported?"
# membership check used by the indexer.
FAISS_RAG_SUPPORTED_EXTENSIONS = {
    ext: (lambda path, _ftype=ext: extract_text_from_file(path, _ftype))
    for ext in ('pdf', 'docx', 'txt')
}
|
| 47 |
+
|
| 48 |
+
def get_id_from_gdrive_input(url_or_id: str) -> Optional[str]:
    """Extract a Google Drive file/folder ID from a share URL or raw ID.

    Tries folder URLs (``/folders/<id>``), file URLs (``/d/<id>``), and
    ``uc?id=<id>`` URLs in that order; otherwise treats the input itself as
    an ID when it is long enough. Returns None when nothing usable is found.
    """
    if not url_or_id:
        return None
    for pattern in (r"/folders/([a-zA-Z0-9_-]+)",
                    r"/d/([a-zA-Z0-9_-]+)",
                    r"id=([a-zA-Z0-9_-]+)"):
        found = re.search(pattern, url_or_id)
        if found:
            return found.group(1)
    # Heuristic: real Drive IDs are well over 10 characters long.
    return url_or_id if len(url_or_id) > 10 else None
|
| 57 |
+
|
| 58 |
+
def download_gdrive_file(file_id_or_url: str, target_path: str) -> bool:
    """
    Downloads a single file (like users.csv) from GDrive to a specific path.
    """
    logger.info(f"[GDRIVE_SINGLE] Downloading file. Input: {file_id_or_url}")
    file_id = get_id_from_gdrive_input(file_id_or_url)
    if not file_id:
        logger.error("[GDRIVE_SINGLE] Invalid ID")
        return False

    try:
        # Make sure the destination directory exists before gdown writes to it.
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
        # fuzzy=True allows gdown to handle permissions more gracefully
        gdown.download(id=file_id, output=target_path, quiet=False, fuzzy=True)

        # Treat an empty or missing output file as a failed download.
        downloaded_ok = os.path.exists(target_path) and os.path.getsize(target_path) > 0
        if downloaded_ok:
            logger.info("[GDRIVE_SINGLE] Success.")
        else:
            logger.error("[GDRIVE_SINGLE] Downloaded file is empty or missing.")
        return downloaded_ok
    except Exception as err:
        logger.error(f"[GDRIVE_SINGLE] Error: {err}", exc_info=True)
        return False
|
| 83 |
+
|
| 84 |
+
def download_and_unzip_gdrive_folder(folder_id_or_url: str, target_dir_for_contents: str) -> bool:
    """Download a whole Google Drive folder into target_dir_for_contents.

    Accepts a raw folder ID or a share URL. The folder is first fetched into
    a temp directory; if gdown produced a single wrapping directory, only its
    contents are moved (not the wrapper itself). The temp directory is always
    cleaned up. Returns True on success, False on any error.
    """
    logger.info(f"[GDRIVE] Downloading folder. Input: {folder_id_or_url}")
    folder_id = get_id_from_gdrive_input(folder_id_or_url)
    if not folder_id:
        return False

    temp_dir = tempfile.mkdtemp()
    try:
        gdown.download_folder(id=folder_id, output=temp_dir, quiet=False, use_cookies=False)

        os.makedirs(target_dir_for_contents, exist_ok=True)

        # gdown sometimes nests everything under one wrapper directory named
        # after the Drive folder; unwrap it so only the contents are moved.
        # Snapshot the listing once (the old code re-listed the dir 3 times).
        entries = os.listdir(temp_dir)
        src_root = temp_dir
        if len(entries) == 1 and os.path.isdir(os.path.join(temp_dir, entries[0])):
            src_root = os.path.join(temp_dir, entries[0])

        for item in os.listdir(src_root):
            shutil.move(os.path.join(src_root, item), os.path.join(target_dir_for_contents, item))

        logger.info("[GDRIVE] Download complete.")
        return True
    except Exception as e:
        logger.error(f"[GDRIVE] Error: {e}", exc_info=True)
        return False
    finally:
        # Always remove the scratch area, even on failure.
        shutil.rmtree(temp_dir, ignore_errors=True)
|
| 110 |
+
|
| 111 |
+
def download_and_unzip_gdrive_file(file_id_or_url: str, target_extraction_dir: str) -> bool:
    """Download a ZIP archive from Google Drive and extract it.

    Accepts a raw file ID or a share URL; extracts into
    target_extraction_dir. The archive is downloaded to a uniquely named
    temp file and always removed afterwards — the previous fixed name
    ("temp_download.zip") leaked the file on every call and could collide
    between concurrent runs. Returns True when extraction succeeds.
    """
    logger.info(f"[GDRIVE_ZIP] Downloading ZIP. Input: {file_id_or_url}")
    file_id = get_id_from_gdrive_input(file_id_or_url)
    if not file_id:
        return False

    # Unique name avoids clashes between concurrent downloads.
    fd, temp_zip = tempfile.mkstemp(suffix=".zip")
    os.close(fd)  # gdown reopens the path itself
    try:
        gdown.download(id=file_id, output=temp_zip, quiet=False)
        with zipfile.ZipFile(temp_zip, 'r') as zip_ref:
            zip_ref.extractall(target_extraction_dir)
        return True
    except Exception as e:
        logger.error(f"[GDRIVE_ZIP] Error: {e}", exc_info=True)
        return False
    finally:
        # Clean up the archive whether or not extraction succeeded.
        try:
            os.remove(temp_zip)
        except OSError:
            pass
|