Upload 15 files
Browse files- .env +52 -0
- Dockerfile +51 -0
- app.py +246 -0
- assets/users.csv +12 -0
- chunker.py +114 -0
- config.py +71 -0
- note.txt +1 -0
- postman.json +153 -0
- rag_components.py +338 -0
- rag_system.py +70 -0
- requirements original.txt +35 -0
- requirements.txt +15 -0
- sources/context.txt +0 -0
- templates/chat-bot.html +108 -0
- utils.py +124 -0
.env
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# --- API Credentials (FOR N8N AUTH) ---
|
| 2 |
+
# Used for the /webhook/search endpoint (Basic Auth)
|
| 3 |
+
API_USERNAME=admin
|
| 4 |
+
API_PASSWORD=12345
|
| 5 |
+
|
| 6 |
+
# --- Admin Dashboard Credentials (Fallback) ---
|
| 7 |
+
# Used if users.csv fails or is missing
|
| 8 |
+
FLASK_ADMIN_USERNAME=admin
|
| 9 |
+
FLASK_ADMIN_PASSWORD=1234
|
| 10 |
+
|
| 11 |
+
# --- RAG Settings ---
|
| 12 |
+
# Directory to store the vector index (automatically created)
|
| 13 |
+
RAG_STORAGE_DIR=faiss_storage
|
| 14 |
+
# Directory where your context.txt lives
|
| 15 |
+
SOURCES_DIR=sources
|
| 16 |
+
|
| 17 |
+
# --- RAG System Configuration ---
|
| 18 |
+
|
| 19 |
+
RAG_EMBEDDING_MODEL="BAAI/bge-large-en-v1.5" #BAAI/bge-small-en
|
| 20 |
+
RAG_EMBEDDING_GPU="false"
|
| 21 |
+
RAG_LOAD_INDEX="true"
|
| 22 |
+
|
| 23 |
+
# Step 1: Fetch this many documents from the vector database (FAISS).
|
| 24 |
+
RAG_INITIAL_FETCH_K=10
|
| 25 |
+
|
| 26 |
+
# Step 2: After reranking the initial docs, keep this many final documents for the LLM context.
|
| 27 |
+
RAG_RERANKER_K=5
|
| 28 |
+
|
| 29 |
+
RAG_MAX_FILES_FOR_INCREMENTAL=50
|
| 30 |
+
|
| 31 |
+
# Text chunking settings
|
| 32 |
+
RAG_CHUNK_SIZE=2000
|
| 33 |
+
RAG_CHUNK_OVERLAP=150
|
| 34 |
+
|
| 35 |
+
# --- Reranker & Retrieval Pipeline Settings ---
|
| 36 |
+
RAG_RERANKER_ENABLED=False
|
| 37 |
+
RAG_RERANKER_MODEL="jinaai/jina-reranker-v1-turbo-en"
|
| 38 |
+
# RAG_RERANKER_MODEL=jinaai/jina-reranker-v2-base-multilingual
|
| 39 |
+
|
| 40 |
+
# --- Google Drive Settings (Disabled) ---
|
| 41 |
+
GDRIVE_SOURCES_ENABLED=false
|
| 42 |
+
GDRIVE_FOLDER_URL="1xkBOzr8eN-lXRYNA62jbl3UHdmtZ4TJA"
|
| 43 |
+
GDRIVE_INDEX_ENABLED=false
|
| 44 |
+
GDRIVE_INDEX_URL="1wUsdasdsa7f8qENTR-lFmsZV"
|
| 45 |
+
|
| 46 |
+
GDRIVE_USERS_CSV_ENABLED=true
|
| 47 |
+
GDRIVE_USERS_CSV_URL="1yadsaHX2yy9MttYrLSE20"
|
| 48 |
+
|
| 49 |
+
# --- System ---
|
| 50 |
+
RAG_DETAILED_LOGGING=True
|
| 51 |
+
# PORT=5000
|
| 52 |
+
# FLASK_DEBUG=True
|
Dockerfile
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use Python 3.11 to support Pandas 3.x and newer libraries
|
| 2 |
+
FROM python:3.11-slim
|
| 3 |
+
|
| 4 |
+
# Set the working directory in the container
|
| 5 |
+
WORKDIR /app
|
| 6 |
+
|
| 7 |
+
# Install system dependencies
|
| 8 |
+
# libgl1 and libglib2.0-0 are often needed for CV/PDF libraries
|
| 9 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 10 |
+
libgl1 \
|
| 11 |
+
libglib2.0-0 \
|
| 12 |
+
build-essential \
|
| 13 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 14 |
+
|
| 15 |
+
# Copy the requirements file
|
| 16 |
+
COPY requirements.txt requirements.txt
|
| 17 |
+
|
| 18 |
+
# Install Python packages with timeout increase
|
| 19 |
+
RUN pip install --no-cache-dir --timeout=1000 -r requirements.txt
|
| 20 |
+
|
| 21 |
+
# Copy application code
|
| 22 |
+
COPY . /app
|
| 23 |
+
|
| 24 |
+
# Create a non-root user (Security Best Practice)
|
| 25 |
+
RUN useradd -m -u 1000 user
|
| 26 |
+
|
| 27 |
+
# Change ownership of the app directory and the temp directories
|
| 28 |
+
RUN chown -R user:user /app
|
| 29 |
+
|
| 30 |
+
# Create temp directories for HuggingFace/Torch cache and set permissions
|
| 31 |
+
RUN mkdir -p /tmp/transformers_cache /tmp/hf_home /tmp/torch_home && \
|
| 32 |
+
chown -R user:user /tmp/transformers_cache /tmp/hf_home /tmp/torch_home
|
| 33 |
+
|
| 34 |
+
# Switch to the non-root user
|
| 35 |
+
USER user
|
| 36 |
+
|
| 37 |
+
# Expose the port (Standard for HF Spaces)
|
| 38 |
+
EXPOSE 7860
|
| 39 |
+
|
| 40 |
+
# Set environment variables
|
| 41 |
+
ENV FLASK_HOST=0.0.0.0
|
| 42 |
+
ENV FLASK_PORT=7860
|
| 43 |
+
ENV FLASK_DEBUG=False
|
| 44 |
+
|
| 45 |
+
# CRITICAL: Set HF-specific env vars to writable directories
|
| 46 |
+
ENV TRANSFORMERS_CACHE=/tmp/transformers_cache
|
| 47 |
+
ENV HF_HOME=/tmp/hf_home
|
| 48 |
+
ENV TORCH_HOME=/tmp/torch_home
|
| 49 |
+
|
| 50 |
+
# Command to run the app
|
| 51 |
+
CMD ["python", "app.py"]
|
app.py
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from flask import Flask, request, jsonify, Response, render_template
|
| 2 |
+
from flask_cors import CORS
|
| 3 |
+
import os
|
| 4 |
+
import logging
|
| 5 |
+
import functools
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
|
| 9 |
+
# Load environment variables
|
| 10 |
+
load_dotenv()
|
| 11 |
+
|
| 12 |
+
# Custom Imports
|
| 13 |
+
from rag_system import initialize_and_get_rag_system
|
| 14 |
+
from config import (
|
| 15 |
+
API_USERNAME, API_PASSWORD, RAG_SOURCES_DIR,
|
| 16 |
+
GDRIVE_INDEX_ENABLED, GDRIVE_INDEX_ID_OR_URL,
|
| 17 |
+
GDRIVE_USERS_CSV_ENABLED, GDRIVE_USERS_CSV_ID_OR_URL,
|
| 18 |
+
ADMIN_USERNAME, ADMIN_PASSWORD,
|
| 19 |
+
RAG_RERANKER_K
|
| 20 |
+
)
|
| 21 |
+
from utils import download_and_unzip_gdrive_file, download_gdrive_file
|
| 22 |
+
|
| 23 |
+
# Logging Setup
|
| 24 |
+
logging.basicConfig(level=logging.INFO)
|
| 25 |
+
logger = logging.getLogger(__name__)
|
| 26 |
+
|
| 27 |
+
# Flask Init
|
| 28 |
+
app = Flask(__name__, static_folder='static', template_folder='templates')
|
| 29 |
+
CORS(app)
|
| 30 |
+
|
| 31 |
+
# Global State
|
| 32 |
+
rag_system = None
|
| 33 |
+
user_df = None
|
| 34 |
+
_APP_BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 35 |
+
|
| 36 |
+
# --- Helper: Load Users ---
|
| 37 |
+
def load_users_from_csv():
    """Load the user database from assets/users.csv into the global user_df.

    Emails are normalized (lowercased, stripped) for case-insensitive lookup
    at login time. On a missing or unreadable file, user_df is set to None so
    callers can fall back to the .env admin credentials.
    """
    global user_df
    assets_folder = os.path.join(_APP_BASE_DIR, 'assets')
    os.makedirs(assets_folder, exist_ok=True)
    users_csv_path = os.path.join(assets_folder, 'users.csv')

    try:
        if os.path.exists(users_csv_path):
            # FIX: dtype=str keeps passwords exactly as written in the CSV.
            # Without it, pandas coerces numeric-looking values, dropping
            # leading zeros and turning "1234" rows into floats ("1234.0").
            user_df = pd.read_csv(users_csv_path, dtype=str)
            # Normalize email for case-insensitive matching
            if 'email' in user_df.columns:
                user_df['email'] = user_df['email'].str.lower().str.strip()
            logger.info(f"Loaded {len(user_df)} users from CSV.")
        else:
            logger.warning("users.csv not found in assets folder.")
            user_df = None
    except Exception as e:
        logger.error(f"Failed to load users.csv: {e}")
        user_df = None
|
| 56 |
+
|
| 57 |
+
# --- Helper: Auth Decorators ---
|
| 58 |
+
def require_api_auth(f):
    """Decorator protecting the N8N webhook endpoint with HTTP Basic Auth.

    Credentials are checked against API_USERNAME / API_PASSWORD from .env.
    Responds 401 with a WWW-Authenticate challenge on failure.
    """
    @functools.wraps(f)
    def decorated(*args, **kwargs):
        # Local stdlib import: used only for the constant-time comparison.
        import hmac

        auth = request.authorization
        # FIX: compare_digest avoids leaking credential content via timing
        # differences, unlike the short-circuiting != comparison.
        if (not auth
                or not hmac.compare_digest(auth.username or '', API_USERNAME)
                or not hmac.compare_digest(auth.password or '', API_PASSWORD)):
            return Response('Unauthorized', 401, {'WWW-Authenticate': 'Basic realm="API Login Required"'})
        return f(*args, **kwargs)
    return decorated
|
| 68 |
+
|
| 69 |
+
def require_admin_auth(f):
    """Decorator protecting Admin rebuild/update endpoints via Basic Auth.

    Auth order (matches the .env documentation):
      1. If users.csv is loaded, the caller must match a CSV row with
         role == 'admin' (email compared case-insensitively).
      2. Only when users.csv is unavailable, fall back to the
         FLASK_ADMIN_USERNAME / FLASK_ADMIN_PASSWORD credentials from .env.
    """
    @functools.wraps(f)
    def decorated(*args, **kwargs):
        auth = request.authorization
        if not auth:
            return Response('Admin auth failed.', 401, {'WWW-Authenticate': 'Basic realm="Admin Login Required"'})

        # 1. Check against loaded CSV
        if user_df is not None:
            user_email = (auth.username or '').lower().strip()
            user_record = user_df[user_df['email'] == user_email]
            if not user_record.empty:
                user_data = user_record.iloc[0]
                # FIX: guard against a malformed CSV missing the expected
                # columns, which previously raised KeyError (HTTP 500).
                has_cols = 'password' in user_data and 'role' in user_data
                # Compare password as string
                if has_cols and str(user_data['password']) == auth.password and user_data['role'] == 'admin':
                    return f(*args, **kwargs)

        # 2. Fallback to .env Admin Credentials (only when no CSV is loaded)
        elif auth.username == ADMIN_USERNAME and auth.password == ADMIN_PASSWORD:
            return f(*args, **kwargs)

        return Response('Admin auth failed.', 401, {'WWW-Authenticate': 'Basic realm="Admin Login Required"'})
    return decorated
|
| 93 |
+
|
| 94 |
+
# --- Startup Logic (Fixed: No Decorator) ---
|
| 95 |
+
def run_startup_tasks():
    """Initializes RAG system and loads data. Called explicitly at import."""
    global rag_system
    logger.info("--- Executing Startup Tasks ---")

    # Optionally pull the user database from Google Drive first.
    if GDRIVE_USERS_CSV_ENABLED and GDRIVE_USERS_CSV_ID_OR_URL:
        csv_target = os.path.join(_APP_BASE_DIR, 'assets', 'users.csv')
        download_gdrive_file(GDRIVE_USERS_CSV_ID_OR_URL, csv_target)

    # Populate the in-memory user table from whatever CSV is now on disk.
    load_users_from_csv()

    # Optionally fetch a pre-built FAISS index archive into the cwd.
    if GDRIVE_INDEX_ENABLED and GDRIVE_INDEX_ID_OR_URL:
        download_and_unzip_gdrive_file(GDRIVE_INDEX_ID_OR_URL, os.getcwd())

    # Finally bring up the RAG system itself.
    rag_system = initialize_and_get_rag_system()
    logger.info("--- Startup Tasks Complete ---")
|
| 115 |
+
|
| 116 |
+
# Execute startup tasks immediately when this module is loaded
|
| 117 |
+
# This ensures it runs before the first request in Flask 3.x
|
| 118 |
+
with app.app_context():
|
| 119 |
+
run_startup_tasks()
|
| 120 |
+
|
| 121 |
+
# ===========================
|
| 122 |
+
# API ROUTES
|
| 123 |
+
# ===========================
|
| 124 |
+
|
| 125 |
+
# --- 1. N8N Webhook (The Core Function) ---
|
| 126 |
+
@app.route('/webhook/search', methods=['POST'])
@require_api_auth
def search_knowledgebase_api():
    """
    Main entry point for N8N.
    Expected JSON: { "query": "...", "use_reranker": bool, "final_k": int }
    """
    if not rag_system:
        # Try to recover if somehow not initialized
        return jsonify({"error": "RAG not initialized. Check server logs."}), 503

    payload = request.json or {}
    query = payload.get('query')
    if not query:
        return jsonify({"error": "Query field is required"}), 400

    # Per-request overrides; defaults come from config (RAG_RERANKER_K).
    top_k = payload.get('final_k', RAG_RERANKER_K)
    use_reranker = payload.get('use_reranker', True)

    # Dynamic reranker toggling.
    # NOTE(review): this mutates shared state on the global rag_system, so
    # concurrent requests with different use_reranker flags can interfere —
    # confirm whether the retriever API supports a per-call flag instead.
    if rag_system.retriever:
        if not use_reranker:
            rag_system.retriever.reranker = None
        elif use_reranker and rag_system.reranker:
            rag_system.retriever.reranker = rag_system.reranker

    try:
        hits = rag_system.search_knowledge_base(query, top_k=top_k)
        return jsonify({
            "results": hits,
            "count": len(hits),
            "status": "success"
        })
    except Exception as e:
        logger.error(f"Search API Error: {e}")
        return jsonify({"error": str(e)}), 500
|
| 164 |
+
|
| 165 |
+
# --- 2. User Login (RESTORED) ---
|
| 166 |
+
@app.route('/user-login', methods=['POST'])
def user_login():
    """
    Standard user login endpoint.
    Checks credentials against users.csv.

    Returns the matching user record (minus the password) on success,
    401 on bad credentials, 503 when no user database is loaded.
    """
    if user_df is None:
        return jsonify({"error": "User database not available."}), 503

    # FIX: request.json is None for non-JSON bodies; default to {} so a
    # malformed request yields a clean 400 instead of an AttributeError/500.
    data = request.json or {}
    email = (data.get('email') or '').lower().strip()
    password = data.get('password')

    if not email or not password:
        return jsonify({"error": "Email and password required"}), 400

    user_record = user_df[user_df['email'] == email]
    if not user_record.empty:
        u_data = user_record.iloc[0]
        # Compare both sides as strings (CSV values may load as numbers).
        if str(u_data['password']) == str(password):
            # Return user info (excluding password)
            resp = u_data.to_dict()
            resp.pop('password', None)
            return jsonify(resp), 200

    return jsonify({"error": "Invalid credentials"}), 401
|
| 193 |
+
|
| 194 |
+
# --- 3. UI Route ---
|
| 195 |
+
@app.route('/')
def index_route():
    """Serve the HTML chat dashboard."""
    return render_template('chat-bot.html')
|
| 199 |
+
|
| 200 |
+
# --- 4. Admin Auth Check ---
|
| 201 |
+
@app.route('/admin/login', methods=['POST'])
@require_admin_auth
def admin_login():
    """Verify admin credentials supplied via the Basic Auth header.

    The decorator does all the work; reaching the body means auth passed.
    """
    return jsonify({"status": "success", "message": "Authenticated"}), 200
|
| 206 |
+
|
| 207 |
+
# --- 5. Admin RAG Controls ---
|
| 208 |
+
@app.route('/admin/update_faiss_index', methods=['POST'])
@require_admin_auth
def update_faiss_index():
    """Incrementally add new source files to the existing FAISS index."""
    if not rag_system:
        return jsonify({"error": "RAG system not initialized"}), 503

    body = request.json or {}
    file_limit = body.get('max_new_files')

    try:
        outcome = rag_system.update_index_with_new_files(RAG_SOURCES_DIR, file_limit)
        return jsonify(outcome), 200
    except Exception as exc:
        return jsonify({"error": str(exc)}), 500
|
| 222 |
+
|
| 223 |
+
@app.route('/admin/rebuild_index', methods=['POST'])
@require_admin_auth
def rebuild_index():
    """Drop the current FAISS index and rebuild it from the sources."""
    global rag_system
    try:
        # force_rebuild=True makes initialization discard any existing index.
        rag_system = initialize_and_get_rag_system(force_rebuild=True)
        return jsonify({"status": "Index rebuilt successfully"}), 200
    except Exception as exc:
        return jsonify({"error": str(exc)}), 500
|
| 233 |
+
|
| 234 |
+
# --- 6. Status Check ---
|
| 235 |
+
@app.route('/status', methods=['GET'])
def status_route():
    """Lightweight health check reporting component readiness."""
    payload = {
        "status": "online",
        "rag_initialized": rag_system is not None,
        "users_loaded": user_df is not None,
    }
    return jsonify(payload)
|
| 242 |
+
|
| 243 |
+
if __name__ == '__main__':
    # Default to 7860 for Hugging Face Spaces
    port = int(os.environ.get("PORT", 7860))
    # FIX: honor FLASK_HOST / FLASK_DEBUG (set by the Dockerfile) instead of
    # hardcoding; defaults reproduce the previous behavior exactly.
    host = os.environ.get("FLASK_HOST", "0.0.0.0")
    debug = os.environ.get("FLASK_DEBUG", "False").lower() == "true"
    app.run(host=host, port=port, debug=debug)
|
assets/users.csv
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
sl,name,email,password,role
|
| 2 |
+
1,Sifat Hossain Fahim,fahim@ge-bd.com,WorldTour1234!,admin
|
| 3 |
+
2,Sakib Ahmed,sakib.ahmed@ge-bd.com,12345678!,admin
|
| 4 |
+
3,Rezwanul Islam,rezwanul@ge-bd.com,marstour1234!,admin
|
| 5 |
+
4,Sarwar Jahan,sarwar.piel@ge-bd.com,password123,user
|
| 6 |
+
5,Rezaul Kabir,rezaul.kabir@ge-bd.com,securepass,user
|
| 7 |
+
6,Test,test@test.com,12345678!,user
|
| 8 |
+
7,Sadiquzzaman,sadiquzzaman@ge-bd.com,wqeqw1234,user
|
| 9 |
+
8,Sadman,sadman@ge-bd.com,1234fvb,user
|
| 10 |
+
9,Pavel,pavel@ge-bd.com,12314rdf,user
|
| 11 |
+
10,Sajib,sajib.hossain@ge-bd.com,1234rge,user
|
| 12 |
+
11,Abdur Rahim,arahim@ge-bd.com,23ree4rt,user
|
chunker.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import logging
|
| 3 |
+
import json
|
| 4 |
+
import argparse
|
| 5 |
+
from typing import List, Dict, Optional
|
| 6 |
+
|
| 7 |
+
# --- UPDATED IMPORT ---
|
| 8 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 9 |
+
# ----------------------
|
| 10 |
+
|
| 11 |
+
from utils import extract_text_from_file, FAISS_RAG_SUPPORTED_EXTENSIONS
|
| 12 |
+
|
| 13 |
+
# --- Logging Setup ---
|
| 14 |
+
logging.basicConfig(
|
| 15 |
+
level=logging.INFO,
|
| 16 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
| 17 |
+
handlers=[
|
| 18 |
+
logging.StreamHandler()
|
| 19 |
+
]
|
| 20 |
+
)
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
def process_sources_and_create_chunks(
    sources_dir: str,
    output_file: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 150,
    text_output_dir: Optional[str] = None
) -> None:
    """Extract text from every supported file in *sources_dir*, split it into
    overlapping chunks, and dump the chunk list as JSON to *output_file*.

    Args:
        sources_dir: Directory containing the raw source documents.
        output_file: Path of the JSON file that receives the chunk list.
        chunk_size: Target characters per chunk.
        chunk_overlap: Characters shared between consecutive chunks.
        text_output_dir: If given, raw extracted text is also saved here.

    Raises:
        FileNotFoundError: If *sources_dir* does not exist.
    """
    if not os.path.isdir(sources_dir):
        logger.error(f"Source directory not found: '{sources_dir}'")
        raise FileNotFoundError(f"Source directory not found: '{sources_dir}'")

    logger.info(f"Starting chunking process. Sources: '{sources_dir}', Output: '{output_file}'")

    if text_output_dir:
        os.makedirs(text_output_dir, exist_ok=True)
        logger.info(f"Will save raw extracted text to: '{text_output_dir}'")

    all_chunks_for_json: List[Dict] = []
    processed_files_count = 0

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    for filename in os.listdir(sources_dir):
        file_path = os.path.join(sources_dir, filename)
        if not os.path.isfile(file_path):
            continue

        file_ext = filename.split('.')[-1].lower()
        if file_ext not in FAISS_RAG_SUPPORTED_EXTENSIONS:
            # FIX: log messages contained a literal "(unknown)" placeholder
            # where the filename was meant to be interpolated.
            logger.debug(f"Skipping unsupported file: {filename}")
            continue

        logger.info(f"Processing source file: {filename}")
        # Dispatch to the extractor registered for this extension.
        text_content = FAISS_RAG_SUPPORTED_EXTENSIONS[file_ext](file_path)

        if text_content:
            if text_output_dir:
                try:
                    text_output_path = os.path.join(text_output_dir, f"{filename}.txt")
                    with open(text_output_path, 'w', encoding='utf-8') as f_text:
                        f_text.write(text_content)
                except Exception as e_text_save:
                    logger.error(f"Could not save extracted text for '{filename}': {e_text_save}")

            chunks = text_splitter.split_text(text_content)
            for i, chunk_text in enumerate(chunks):
                chunk_data = {
                    "page_content": chunk_text,
                    "metadata": {
                        "source_document_name": filename,
                        "chunk_index": i,
                        "full_location": f"{filename}, Chunk {i+1}"
                    }
                }
                all_chunks_for_json.append(chunk_data)

        processed_files_count += 1

    if not all_chunks_for_json:
        logger.warning(f"No processable documents found in '{sources_dir}'.")

    # FIX: dirname is '' when output_file is a bare filename, and
    # os.makedirs('') raises FileNotFoundError — only create when non-empty.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_chunks_for_json, f, indent=2)

    logger.info(f"Chunking complete. Processed {processed_files_count} files. Total chunks: {len(all_chunks_for_json)}")
|
| 91 |
+
|
| 92 |
+
def main():
    """CLI entry point: parse arguments and run the chunking pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--sources-dir', type=str, required=True)
    parser.add_argument('--output-file', type=str, required=True)
    parser.add_argument('--text-output-dir', type=str, default=None)
    parser.add_argument('--chunk-size', type=int, default=1000)
    parser.add_argument('--chunk-overlap', type=int, default=150)
    cli_args = parser.parse_args()

    try:
        process_sources_and_create_chunks(
            sources_dir=cli_args.sources_dir,
            output_file=cli_args.output_file,
            chunk_size=cli_args.chunk_size,
            chunk_overlap=cli_args.chunk_overlap,
            text_output_dir=cli_args.text_output_dir
        )
    except Exception as e:
        # Surface the full traceback before signalling failure to the shell.
        logger.critical(f"Chunking failed: {e}", exc_info=True)
        exit(1)

if __name__ == "__main__":
    main()
|
config.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import logging

# --- Logging Setup ---
# Configure basicConfig only if no handler is attached yet, so importing this
# module does not clobber an application-level logging setup.
logger = logging.getLogger(__name__)
if not logger.handlers:
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

# Base directory of this module; anchors relative storage/source paths.
_MODULE_BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# API Authentication for n8n (Basic Auth)
API_USERNAME = os.getenv("API_USERNAME", "admin")
API_PASSWORD = os.getenv("API_PASSWORD", "password")

# Admin fallback for UI
ADMIN_USERNAME = os.getenv('FLASK_ADMIN_USERNAME', 'admin')
ADMIN_PASSWORD = os.getenv('FLASK_ADMIN_PASSWORD', '1234')

# Index and source locations
RAG_FAISS_INDEX_SUBDIR_NAME = "faiss_index"
RAG_STORAGE_PARENT_DIR = os.getenv("RAG_STORAGE_DIR", os.path.join(_MODULE_BASE_DIR, "faiss_storage"))
RAG_SOURCES_DIR = os.getenv("SOURCES_DIR", os.path.join(_MODULE_BASE_DIR, "sources"))
RAG_CHUNKED_SOURCES_FILENAME = "pre_chunked_sources.json"

# Ensure both directories exist at import time (import-time side effect).
os.makedirs(RAG_SOURCES_DIR, exist_ok=True)
os.makedirs(RAG_STORAGE_PARENT_DIR, exist_ok=True)

# Embedding and model configuration
RAG_EMBEDDING_MODEL_NAME = os.getenv("RAG_EMBEDDING_MODEL", "BAAI/bge-small-en")
RAG_EMBEDDING_USE_GPU = os.getenv("RAG_EMBEDDING_GPU", "False").lower() == "true"
RAG_LOAD_INDEX_ON_STARTUP = os.getenv("RAG_LOAD_INDEX", "True").lower() == "true"

# Retrieval Settings
# FIX: getenv defaults are now consistently strings (some were bare ints,
# some quoted); int() parses either, but the mix was inconsistent.
RAG_INITIAL_FETCH_K = int(os.getenv("RAG_INITIAL_FETCH_K", "20"))
RAG_RERANKER_K = int(os.getenv("RAG_RERANKER_K", "5"))
# Incremental update limit
RAG_MAX_FILES_FOR_INCREMENTAL = int(os.getenv("RAG_MAX_FILES_FOR_INCREMENTAL", "50"))

# Chunk configuration
RAG_CHUNK_SIZE = int(os.getenv("RAG_CHUNK_SIZE", "1000"))
RAG_CHUNK_OVERLAP = int(os.getenv("RAG_CHUNK_OVERLAP", "150"))

# Reranker configuration
RAG_RERANKER_MODEL_NAME = os.getenv("RAG_RERANKER_MODEL", "jinaai/jina-reranker-v2-base-multilingual")
RAG_RERANKER_ENABLED = os.getenv("RAG_RERANKER_ENABLED", "True").lower() == "true"

# GDrive configuration for RAG sources
GDRIVE_SOURCES_ENABLED = os.getenv("GDRIVE_SOURCES_ENABLED", "False").lower() == "true"
GDRIVE_FOLDER_ID_OR_URL = os.getenv("GDRIVE_FOLDER_URL")

# GDrive configuration for downloading a pre-built FAISS index
GDRIVE_INDEX_ENABLED = os.getenv("GDRIVE_INDEX_ENABLED", "False").lower() == "true"
GDRIVE_INDEX_ID_OR_URL = os.getenv("GDRIVE_INDEX_URL")

# GDrive configuration for downloading users.csv
GDRIVE_USERS_CSV_ENABLED = os.getenv("GDRIVE_USERS_CSV_ENABLED", "False").lower() == "true"
GDRIVE_USERS_CSV_ID_OR_URL = os.getenv("GDRIVE_USERS_CSV_URL")

# Detailed logging configuration
RAG_DETAILED_LOGGING = os.getenv("RAG_DETAILED_LOGGING", "True").lower() == "true"

logger.info(f"RAG Config Loaded - Chunk Size: {RAG_CHUNK_SIZE}, Chunk Overlap: {RAG_CHUNK_OVERLAP}")
logger.info(f"Embedding Model: {RAG_EMBEDDING_MODEL_NAME}")
logger.info(f"Reranker Model: {RAG_RERANKER_MODEL_NAME}")
logger.info(f"Retrieval Pipeline: Initial Fetch K={RAG_INITIAL_FETCH_K}, Reranker Final K={RAG_RERANKER_K}")
logger.info(f"Detailed Logging: {'ENABLED' if RAG_DETAILED_LOGGING else 'DISABLED'}")
logger.info(f"GDrive Sources Download: {'ENABLED' if GDRIVE_SOURCES_ENABLED else 'DISABLED'}")
logger.info(f"GDrive Pre-built Index Download: {'ENABLED' if GDRIVE_INDEX_ENABLED else 'DISABLED'}")
logger.info(f"GDrive users.csv Download: {'ENABLED' if GDRIVE_USERS_CSV_ENABLED else 'DISABLED'}")
|
note.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://homemademirpur-ed-cad-ref.hf.space
|
postman.json
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"info": {
|
| 3 |
+
"_postman_id": "b8f9e9a1-5c8e-4a8e-9b8e-1f8e9a1f8e9a",
|
| 4 |
+
"name": "edmond_cad_refund",
|
| 5 |
+
"schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json"
|
| 6 |
+
},
|
| 7 |
+
"item": [
|
| 8 |
+
{
|
| 9 |
+
"name": "N8N - Search Knowledgebase",
|
| 10 |
+
"request": {
|
| 11 |
+
"auth": {
|
| 12 |
+
"type": "basic",
|
| 13 |
+
"basic": [
|
| 14 |
+
{
|
| 15 |
+
"key": "password",
|
| 16 |
+
"value": "password",
|
| 17 |
+
"type": "string"
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"key": "username",
|
| 21 |
+
"value": "admin",
|
| 22 |
+
"type": "string"
|
| 23 |
+
}
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
+
"method": "POST",
|
| 27 |
+
"header": [],
|
| 28 |
+
"body": {
|
| 29 |
+
"mode": "raw",
|
| 30 |
+
"raw": "{\n \"query\": \"how to get a refund for electronics?\",\n \"use_reranker\": true,\n \"final_k\": 5,\n \"persona\": [\"standard\"],\n \"tier\": [\"gold\"]\n}",
|
| 31 |
+
"options": {
|
| 32 |
+
"raw": {
|
| 33 |
+
"language": "json"
|
| 34 |
+
}
|
| 35 |
+
}
|
| 36 |
+
},
|
| 37 |
+
"url": {
|
| 38 |
+
"raw": "{{base_url}}/webhook/search",
|
| 39 |
+
"host": [
|
| 40 |
+
"{{base_url}}"
|
| 41 |
+
],
|
| 42 |
+
"path": [
|
| 43 |
+
"webhook",
|
| 44 |
+
"search"
|
| 45 |
+
]
|
| 46 |
+
},
|
| 47 |
+
"description": "Main endpoint used by N8N to retrieve context chunks."
|
| 48 |
+
},
|
| 49 |
+
"response": []
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"name": "Admin - Rebuild Index",
|
| 53 |
+
"request": {
|
| 54 |
+
"auth": {
|
| 55 |
+
"type": "basic",
|
| 56 |
+
"basic": [
|
| 57 |
+
{
|
| 58 |
+
"key": "password",
|
| 59 |
+
"value": "1234",
|
| 60 |
+
"type": "string"
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"key": "username",
|
| 64 |
+
"value": "admin",
|
| 65 |
+
"type": "string"
|
| 66 |
+
}
|
| 67 |
+
]
|
| 68 |
+
},
|
| 69 |
+
"method": "POST",
|
| 70 |
+
"header": [],
|
| 71 |
+
"url": {
|
| 72 |
+
"raw": "{{base_url}}/admin/rebuild_index",
|
| 73 |
+
"host": [
|
| 74 |
+
"{{base_url}}"
|
| 75 |
+
],
|
| 76 |
+
"path": [
|
| 77 |
+
"admin",
|
| 78 |
+
"rebuild_index"
|
| 79 |
+
]
|
| 80 |
+
},
|
| 81 |
+
"description": "Completely deletes and rebuilds the FAISS index from sources."
|
| 82 |
+
},
|
| 83 |
+
"response": []
|
| 84 |
+
},
|
| 85 |
+
{
|
| 86 |
+
"name": "Admin - Update Index (Incremental)",
|
| 87 |
+
"request": {
|
| 88 |
+
"auth": {
|
| 89 |
+
"type": "basic",
|
| 90 |
+
"basic": [
|
| 91 |
+
{
|
| 92 |
+
"key": "password",
|
| 93 |
+
"value": "1234",
|
| 94 |
+
"type": "string"
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"key": "username",
|
| 98 |
+
"value": "admin",
|
| 99 |
+
"type": "string"
|
| 100 |
+
}
|
| 101 |
+
]
|
| 102 |
+
},
|
| 103 |
+
"method": "POST",
|
| 104 |
+
"header": [],
|
| 105 |
+
"body": {
|
| 106 |
+
"mode": "raw",
|
| 107 |
+
"raw": "{\n \"max_new_files\": 50\n}",
|
| 108 |
+
"options": {
|
| 109 |
+
"raw": {
|
| 110 |
+
"language": "json"
|
| 111 |
+
}
|
| 112 |
+
}
|
| 113 |
+
},
|
| 114 |
+
"url": {
|
| 115 |
+
"raw": "{{base_url}}/admin/update_faiss_index",
|
| 116 |
+
"host": [
|
| 117 |
+
"{{base_url}}"
|
| 118 |
+
],
|
| 119 |
+
"path": [
|
| 120 |
+
"admin",
|
| 121 |
+
"update_faiss_index"
|
| 122 |
+
]
|
| 123 |
+
},
|
| 124 |
+
"description": "Adds only new files to the existing index."
|
| 125 |
+
},
|
| 126 |
+
"response": []
|
| 127 |
+
},
|
| 128 |
+
{
|
| 129 |
+
"name": "Public - Status",
|
| 130 |
+
"request": {
|
| 131 |
+
"method": "GET",
|
| 132 |
+
"header": [],
|
| 133 |
+
"url": {
|
| 134 |
+
"raw": "{{base_url}}/status",
|
| 135 |
+
"host": [
|
| 136 |
+
"{{base_url}}"
|
| 137 |
+
],
|
| 138 |
+
"path": [
|
| 139 |
+
"status"
|
| 140 |
+
]
|
| 141 |
+
}
|
| 142 |
+
},
|
| 143 |
+
"response": []
|
| 144 |
+
}
|
| 145 |
+
],
|
| 146 |
+
"variable": [
|
| 147 |
+
{
|
| 148 |
+
"key": "base_url",
|
| 149 |
+
"value": "http://localhost:5000",
|
| 150 |
+
"type": "string"
|
| 151 |
+
}
|
| 152 |
+
]
|
| 153 |
+
}
|
rag_components.py
ADDED
|
@@ -0,0 +1,338 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import logging
|
| 3 |
+
import json
|
| 4 |
+
import time
|
| 5 |
+
from typing import List, Dict, Optional, Any
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
from sentence_transformers import CrossEncoder
|
| 9 |
+
|
| 10 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 11 |
+
from langchain_community.vectorstores import FAISS
|
| 12 |
+
|
| 13 |
+
from langchain_core.documents import Document
|
| 14 |
+
from langchain_core.retrievers import BaseRetriever
|
| 15 |
+
from langchain_core.callbacks import CallbackManagerForRetrieverRun
|
| 16 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 17 |
+
|
| 18 |
+
from config import (
|
| 19 |
+
RAG_RERANKER_MODEL_NAME, RAG_DETAILED_LOGGING,
|
| 20 |
+
RAG_CHUNK_SIZE, RAG_CHUNK_OVERLAP, RAG_CHUNKED_SOURCES_FILENAME,
|
| 21 |
+
RAG_FAISS_INDEX_SUBDIR_NAME, RAG_INITIAL_FETCH_K, RAG_RERANKER_K,
|
| 22 |
+
RAG_MAX_FILES_FOR_INCREMENTAL
|
| 23 |
+
)
|
| 24 |
+
from utils import FAISS_RAG_SUPPORTED_EXTENSIONS
|
| 25 |
+
|
| 26 |
+
logger = logging.getLogger(__name__)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class DocumentReranker:
    """Second-stage ranker backed by a sentence-transformers CrossEncoder.

    Scores (query, passage) pairs and keeps the highest-scoring passages.
    Degrades to simple truncation whenever the model is missing or scoring
    raises, so retrieval never hard-fails because of the reranker.
    """

    def __init__(self, model_name: str = RAG_RERANKER_MODEL_NAME):
        """Load the cross-encoder model, raising RuntimeError on failure."""
        self.logger = logging.getLogger(__name__ + ".DocumentReranker")
        self.model_name = model_name
        self.model = None

        try:
            self.logger.info(f"[RERANKER_INIT] Loading reranker model: {self.model_name}")
            t0 = time.time()
            # trust_remote_code is required by some cross-encoder repos.
            self.model = CrossEncoder(model_name, trust_remote_code=True)
            load_time = time.time() - t0
            self.logger.info(f"[RERANKER_INIT] Reranker model '{self.model_name}' loaded successfully in {load_time:.2f}s")
        except Exception as e:
            self.logger.error(f"[RERANKER_INIT] Failed to load reranker model '{self.model_name}': {e}", exc_info=True)
            raise RuntimeError(f"Could not initialize reranker model: {e}") from e

    def rerank_documents(self, query: str, documents: List[Document], top_k: int) -> List[Document]:
        """Return up to ``top_k`` documents ordered by cross-encoder relevance.

        Each kept document gets a ``reranker_score`` float in its metadata.
        When no model is loaded, or scoring fails, the first ``top_k`` input
        documents are returned unchanged.
        """
        if not documents or not self.model:
            return documents[:top_k] if documents else []

        try:
            t0 = time.time()
            pairs = [[query, d.page_content] for d in documents]
            scores = self.model.predict(pairs)
            rerank_time = time.time() - t0
            self.logger.info(f"[RERANKER] Computed relevance scores in {rerank_time:.3f}s")

            # Highest relevance first.
            ranked = sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)

            kept = []
            for doc, score in ranked[:top_k]:
                doc.metadata["reranker_score"] = float(score)
                kept.append(doc)
            return kept
        except Exception as e:
            self.logger.error(f"[RERANKER] Error during reranking: {e}", exc_info=True)
            return documents[:top_k] if documents else []
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class FAISSRetrieverWithScore(BaseRetriever):
    """Retriever that records raw FAISS similarity scores in document
    metadata and optionally applies a cross-encoder rerank pass.

    Fields (pydantic-declared, part of the public interface):
        vectorstore: backing FAISS store.
        reranker: optional second-stage ranker; when present,
            ``initial_fetch_k`` candidates are fetched and cut to ``final_k``.
        initial_fetch_k: candidate pool size before reranking.
        final_k: number of documents ultimately returned.
    """
    vectorstore: FAISS
    reranker: Optional[DocumentReranker] = None
    initial_fetch_k: int = RAG_INITIAL_FETCH_K
    final_k: int = RAG_RERANKER_K

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        t0 = time.time()
        # Over-fetch only when a reranker exists to pick the best final_k.
        num_to_fetch = self.initial_fetch_k if self.reranker else self.final_k

        # Module-level logger: pydantic BaseRetriever fields do not include
        # a per-instance logger attribute.
        logger.info(f"[RETRIEVER] Fetching {num_to_fetch} docs (Rerank={self.reranker is not None})")

        hits = self.vectorstore.similarity_search_with_score(query, k=num_to_fetch)

        relevant_docs = []
        for doc, raw_score in hits:
            doc.metadata["retrieval_score"] = float(raw_score)
            relevant_docs.append(doc)

        if self.reranker and relevant_docs:
            relevant_docs = self.reranker.rerank_documents(query, relevant_docs, top_k=self.final_k)

        total_time = time.time() - t0
        logger.info(f"[RETRIEVER] Completed in {total_time:.3f}s. Returned {len(relevant_docs)} docs.")
        return relevant_docs
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
class KnowledgeRAG:
    """FAISS-backed knowledge base with optional cross-encoder reranking.

    Responsibilities:
      * build / load / incrementally update a FAISS index under
        ``index_storage_dir``
      * track which source files are already indexed (processed_files.json)
      * persist the chunking config used to build the index so config
        changes can trigger a rebuild
      * answer searches through a score-annotating retriever
    """

    def __init__(
        self,
        index_storage_dir: str,
        embedding_model_name: str,
        use_gpu_for_embeddings: bool,
        chunk_size: int = RAG_CHUNK_SIZE,
        chunk_overlap: int = RAG_CHUNK_OVERLAP,
        reranker_model_name: Optional[str] = None,
        enable_reranker: bool = True,
    ):
        """
        Args:
            index_storage_dir: Parent directory for the FAISS index and metadata.
            embedding_model_name: HuggingFace model id for embeddings.
            use_gpu_for_embeddings: Request CUDA; silently falls back to CPU.
            chunk_size: Character length of each text chunk.
            chunk_overlap: Character overlap between consecutive chunks.
            reranker_model_name: Cross-encoder id; defaults to RAG_RERANKER_MODEL_NAME.
            enable_reranker: Whether to attempt loading the reranker at all.
        """
        self.logger = logging.getLogger(__name__ + ".KnowledgeRAG")
        self.logger.info("[RAG_INIT] Initializing KnowledgeRAG system")

        self.index_storage_dir = index_storage_dir
        os.makedirs(self.index_storage_dir, exist_ok=True)

        self.embedding_model_name = embedding_model_name
        self.use_gpu_for_embeddings = use_gpu_for_embeddings
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.reranker_model_name = reranker_model_name or RAG_RERANKER_MODEL_NAME
        self.enable_reranker = enable_reranker
        self.reranker = None

        # Resolve embedding device; CUDA request degrades to CPU with a warning.
        device = "cpu"
        if self.use_gpu_for_embeddings:
            if torch.cuda.is_available():
                self.logger.info("[RAG_INIT] CUDA available. Requesting GPU.")
                device = "cuda"
            else:
                self.logger.warning("[RAG_INIT] CUDA not available. Fallback to CPU.")

        # normalize_embeddings=True so FAISS scores behave like cosine similarity.
        self.embeddings = HuggingFaceEmbeddings(
            model_name=self.embedding_model_name,
            model_kwargs={"device": device},
            encode_kwargs={"normalize_embeddings": True}
        )

        if self.enable_reranker:
            try:
                self.reranker = DocumentReranker(self.reranker_model_name)
            except Exception as e:
                # Reranking is an optional quality boost; degrade gracefully.
                self.logger.warning(f"[RAG_INIT] Reranker Init Failed: {e}")
                self.reranker = None

        self.vector_store: Optional[FAISS] = None
        self.retriever: Optional[FAISSRetrieverWithScore] = None
        self.processed_source_files: List[str] = []

    # ------------------------------------------------------------------ #
    # Internal helpers
    # ------------------------------------------------------------------ #

    def _faiss_path(self) -> str:
        """Absolute path of the FAISS index sub-directory."""
        return os.path.join(self.index_storage_dir, RAG_FAISS_INDEX_SUBDIR_NAME)

    def _make_retriever(self) -> FAISSRetrieverWithScore:
        """Build a retriever bound to the current vector store and reranker."""
        return FAISSRetrieverWithScore(
            vectorstore=self.vector_store,
            reranker=self.reranker,
            initial_fetch_k=RAG_INITIAL_FETCH_K,
            final_k=RAG_RERANKER_K
        )

    def _save_processed_files(self):
        """Persist the sorted list of indexed source files next to the index."""
        with open(os.path.join(self._faiss_path(), "processed_files.json"), 'w') as f:
            json.dump(sorted(self.processed_source_files), f)

    def _save_chunk_config(self):
        """Persist current chunk settings alongside the FAISS index for change detection."""
        config_file = os.path.join(self._faiss_path(), "chunk_config.json")
        with open(config_file, 'w') as f:
            json.dump({"chunk_size": self.chunk_size, "chunk_overlap": self.chunk_overlap}, f)

    def _load_chunk_config(self) -> Optional[dict]:
        """Load previously saved chunk config. Returns None if not found."""
        config_file = os.path.join(self._faiss_path(), "chunk_config.json")
        if os.path.exists(config_file):
            with open(config_file, 'r') as f:
                return json.load(f)
        return None

    def chunk_config_has_changed(self) -> bool:
        """Returns True if chunk_size or chunk_overlap differ from what the index was built with."""
        saved = self._load_chunk_config()
        if saved is None:
            return False  # No record yet — assume compatible
        changed = saved.get("chunk_size") != self.chunk_size or saved.get("chunk_overlap") != self.chunk_overlap
        if changed:
            self.logger.warning(
                f"[CONFIG_CHANGE] Chunk config mismatch! "
                f"Saved=(size={saved.get('chunk_size')}, overlap={saved.get('chunk_overlap')}) "
                f"Current=(size={self.chunk_size}, overlap={self.chunk_overlap}). "
                f"Index will be rebuilt."
            )
        return changed

    # ------------------------------------------------------------------ #
    # Index lifecycle
    # ------------------------------------------------------------------ #

    def build_index_from_source_files(self, source_folder_path: str):
        """Build a fresh FAISS index from pre-chunked JSON (if present) or raw files.

        Raises:
            FileNotFoundError: the source folder does not exist.
            ValueError: nothing indexable was found.
        """
        self.logger.info(f"[INDEX_BUILD] Building from: {source_folder_path}")
        if not os.path.isdir(source_folder_path):
            raise FileNotFoundError(f"Source folder not found: '{source_folder_path}'.")

        all_docs = []
        processed_files = []
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)

        # 1. Pre-chunked JSON takes precedence over re-chunking raw files.
        pre_chunked_path = os.path.join(self.index_storage_dir, RAG_CHUNKED_SOURCES_FILENAME)
        if os.path.exists(pre_chunked_path):
            try:
                with open(pre_chunked_path, 'r', encoding='utf-8') as f:
                    chunk_data_list = json.load(f)
                for chunk in chunk_data_list:
                    doc = Document(page_content=chunk.get("page_content", ""), metadata=chunk.get("metadata", {}))
                    all_docs.append(doc)
                    if 'source_document_name' in doc.metadata:
                        processed_files.append(doc.metadata['source_document_name'])
                processed_files = sorted(set(processed_files))
            except Exception as e:
                # Fall through to raw-file indexing below.
                self.logger.error(f"[INDEX_BUILD] JSON load failed: {e}")

        # 2. Raw files (only when no pre-chunked docs were loaded).
        if not all_docs:
            for filename in os.listdir(source_folder_path):
                file_path = os.path.join(source_folder_path, filename)
                if not os.path.isfile(file_path):
                    continue
                file_ext = filename.split('.')[-1].lower()
                if file_ext in FAISS_RAG_SUPPORTED_EXTENSIONS:
                    text_content = FAISS_RAG_SUPPORTED_EXTENSIONS[file_ext](file_path)
                    if text_content:
                        chunks = text_splitter.split_text(text_content)
                        for i, chunk_text in enumerate(chunks):
                            meta = {"source_document_name": filename, "chunk_index": i}
                            all_docs.append(Document(page_content=chunk_text, metadata=meta))
                        processed_files.append(filename)

        if not all_docs:
            raise ValueError("No documents to index.")

        self.processed_source_files = processed_files
        self.logger.info(f"[INDEX_BUILD] Creating FAISS index with {len(all_docs)} chunks.")

        self.vector_store = FAISS.from_documents(all_docs, self.embeddings)
        self.vector_store.save_local(self._faiss_path())
        self._save_chunk_config()
        # BUG FIX: persist the processed-file list. Previously only
        # update_index_with_new_files() wrote processed_files.json, so a
        # rebuilt index reloaded from disk lost track of its sources and a
        # later incremental update re-added (duplicated) every file.
        self._save_processed_files()

        self.retriever = self._make_retriever()

    def load_index_from_disk(self):
        """Load an existing FAISS index plus its processed-file metadata.

        Raises:
            FileNotFoundError: no index directory exists on disk.
        """
        faiss_path = self._faiss_path()
        if not os.path.exists(faiss_path):
            raise FileNotFoundError("Index not found.")

        # allow_dangerous_deserialization: the pickle was produced by this
        # app itself, not untrusted input.
        self.vector_store = FAISS.load_local(
            folder_path=faiss_path,
            embeddings=self.embeddings,
            allow_dangerous_deserialization=True
        )
        self.retriever = self._make_retriever()

        # Restore metadata about which sources are already indexed.
        meta_file = os.path.join(faiss_path, "processed_files.json")
        if os.path.exists(meta_file):
            with open(meta_file, 'r') as f:
                self.processed_source_files = json.load(f)
        else:
            self.processed_source_files = ["Loaded from disk (unknown sources)"]

        self.logger.info("[INDEX_LOAD] Success.")

    def update_index_with_new_files(self, source_folder_path: str, max_files_to_process: Optional[int] = None) -> Dict[str, Any]:
        """Incrementally add not-yet-indexed supported files to the index.

        Args:
            source_folder_path: Folder scanned for new files.
            max_files_to_process: Per-call cap; defaults to
                RAG_MAX_FILES_FOR_INCREMENTAL when None.

        Returns:
            Status dict: ``status``, ``message``, ``files_added`` (plus
            ``remaining`` when files were added).

        Raises:
            RuntimeError: when no index is loaded yet.
        """
        self.logger.info(f"[INDEX_UPDATE] Checking for new files in: {source_folder_path}")

        if not self.vector_store:
            raise RuntimeError("Cannot update: no index loaded.")

        processed_set = set(self.processed_source_files)
        all_new_files = []
        for filename in sorted(os.listdir(source_folder_path)):
            if filename not in processed_set:
                file_ext = filename.split('.')[-1].lower()
                if file_ext in FAISS_RAG_SUPPORTED_EXTENSIONS:
                    all_new_files.append(filename)

        if not all_new_files:
            return {"status": "success", "message": "No new files found.", "files_added": []}

        limit = max_files_to_process if max_files_to_process is not None else RAG_MAX_FILES_FOR_INCREMENTAL
        files_to_process = all_new_files[:limit]

        new_docs = []
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)

        for filename in files_to_process:
            file_path = os.path.join(source_folder_path, filename)
            # Look up the extractor once per file instead of re-splitting.
            extractor = FAISS_RAG_SUPPORTED_EXTENSIONS[filename.split('.')[-1].lower()]
            text_content = extractor(file_path)

            if text_content:
                chunks = text_splitter.split_text(text_content)
                for i, chunk_text in enumerate(chunks):
                    meta = {"source_document_name": filename, "chunk_index": i}
                    new_docs.append(Document(page_content=chunk_text, metadata=meta))

        if not new_docs:
            return {"status": "warning", "message": "New files found but no text extracted.", "files_added": []}

        self.vector_store.add_documents(new_docs)
        self.vector_store.save_local(self._faiss_path())

        self.processed_source_files.extend(files_to_process)
        self._save_processed_files()

        return {
            "status": "success",
            "message": f"Added {len(files_to_process)} files.",
            "files_added": files_to_process,
            "remaining": len(all_new_files) - len(files_to_process)
        }

    # ------------------------------------------------------------------ #
    # Search
    # ------------------------------------------------------------------ #

    def search_knowledge_base(self, query: str, top_k: Optional[int] = None) -> List[Dict[str, Any]]:
        """Search the index; returns dicts with content / metadata / score.

        Args:
            query: Natural-language query.
            top_k: Optional override of how many results to return.

        Raises:
            RuntimeError: when the retriever is not initialized.
        """
        if not self.retriever:
            raise RuntimeError("Retriever not initialized.")

        original_k = self.retriever.final_k
        # BUG FIX: use `is not None` so an explicit top_k=0 is honored
        # (the old truthiness test silently ignored it).
        if top_k is not None:
            self.retriever.final_k = top_k

        try:
            docs = self.retriever.invoke(query)
            return [
                {
                    "content": doc.page_content,
                    "metadata": doc.metadata,
                    # Prefer the reranker score when reranking actually ran.
                    "score": doc.metadata.get("reranker_score") or doc.metadata.get("retrieval_score"),
                }
                for doc in docs
            ]
        finally:
            # Always restore the configured k, even if retrieval raised.
            self.retriever.final_k = original_k
|
rag_system.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import logging
|
| 3 |
+
import shutil
|
| 4 |
+
from typing import Optional
|
| 5 |
+
|
| 6 |
+
from rag_components import KnowledgeRAG
|
| 7 |
+
from utils import download_and_unzip_gdrive_folder
|
| 8 |
+
from config import (
|
| 9 |
+
GDRIVE_SOURCES_ENABLED, GDRIVE_FOLDER_ID_OR_URL, RAG_SOURCES_DIR,
|
| 10 |
+
RAG_STORAGE_PARENT_DIR, RAG_FAISS_INDEX_SUBDIR_NAME, RAG_LOAD_INDEX_ON_STARTUP,
|
| 11 |
+
RAG_EMBEDDING_MODEL_NAME, RAG_EMBEDDING_USE_GPU,
|
| 12 |
+
RAG_CHUNK_SIZE, RAG_CHUNK_OVERLAP,
|
| 13 |
+
RAG_RERANKER_MODEL_NAME, RAG_RERANKER_ENABLED
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
def initialize_and_get_rag_system(force_rebuild: bool = False, source_dir_override: Optional[str] = None) -> Optional[KnowledgeRAG]:
    """Create the KnowledgeRAG instance and get it ready to serve queries.

    Optionally syncs sources from Google Drive first, then either loads the
    persisted FAISS index or builds a fresh one from the source folder.

    Args:
        force_rebuild: Delete any on-disk index and rebuild from sources.
        source_dir_override: Use this folder instead of RAG_SOURCES_DIR
            (also disables the GDrive sync).

    Returns:
        A ready KnowledgeRAG, or None when initialization fails fatally.
    """
    logger.info("[RAG_SYSTEM_INIT] Initializing...")
    active_source_dir = source_dir_override if source_dir_override else RAG_SOURCES_DIR

    # Optional Google Drive sync (skipped when a source dir was overridden).
    if GDRIVE_SOURCES_ENABLED and not source_dir_override and GDRIVE_FOLDER_ID_OR_URL:
        logger.info("[RAG_SYSTEM_INIT] Downloading sources from GDrive...")
        if os.path.exists(RAG_SOURCES_DIR):
            shutil.rmtree(RAG_SOURCES_DIR)
        download_and_unzip_gdrive_folder(GDRIVE_FOLDER_ID_OR_URL, RAG_SOURCES_DIR)

    index_dir = os.path.join(RAG_STORAGE_PARENT_DIR, RAG_FAISS_INDEX_SUBDIR_NAME)

    if force_rebuild and os.path.exists(index_dir):
        logger.info("[RAG_SYSTEM_INIT] Force rebuild: deleting old index.")
        shutil.rmtree(index_dir)

    try:
        rag = KnowledgeRAG(
            index_storage_dir=RAG_STORAGE_PARENT_DIR,
            embedding_model_name=RAG_EMBEDDING_MODEL_NAME,
            use_gpu_for_embeddings=RAG_EMBEDDING_USE_GPU,
            chunk_size=RAG_CHUNK_SIZE,
            chunk_overlap=RAG_CHUNK_OVERLAP,
            reranker_model_name=RAG_RERANKER_MODEL_NAME,
            enable_reranker=RAG_RERANKER_ENABLED,
        )

        index_ready = False
        if RAG_LOAD_INDEX_ON_STARTUP and not force_rebuild:
            if rag.chunk_config_has_changed():
                # Chunk settings drifted from what the index was built with.
                logger.warning("[RAG_SYSTEM_INIT] Chunk config changed — forcing index rebuild.")
            else:
                try:
                    rag.load_index_from_disk()
                    index_ready = True
                except Exception as e:
                    logger.warning(f"[RAG_SYSTEM_INIT] Load failed ({e}). Building new.")

        if not index_ready:
            if not os.path.exists(active_source_dir) or not os.listdir(active_source_dir):
                logger.warning("[RAG_SYSTEM_INIT] No sources found. System empty.")
            else:
                rag.build_index_from_source_files(active_source_dir)

        logger.info("[RAG_SYSTEM_INIT] Complete.")
        return rag

    except Exception as e:
        logger.critical(f"[RAG_SYSTEM_INIT] FATAL: {e}", exc_info=True)
        return None
|
requirements original.txt
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Flask==3.0.3
|
| 2 |
+
Flask_Cors==5.0.0
|
| 3 |
+
flask_session
|
| 4 |
+
numpy
|
| 5 |
+
pandas==2.2.3
|
| 6 |
+
# rapidfuzz==3.10.1
|
| 7 |
+
Requests==2.32.3
|
| 8 |
+
# scikit_learn==1.4.1.post1
|
| 9 |
+
# scikit_learn==1.5.2
|
| 10 |
+
psycopg2-binary==2.9.10
|
| 11 |
+
python-dotenv==1.0.1
|
| 12 |
+
apscheduler==3.11.0
|
| 13 |
+
redis==3.5.3
|
| 14 |
+
faiss-cpu==1.10.0
|
| 15 |
+
groq==0.15.0
|
| 16 |
+
llama_index==0.12.13
|
| 17 |
+
llama_index.llms.groq==0.3.1
|
| 18 |
+
# langchain_groq==0.2.4
|
| 19 |
+
# langchain_core==0.3.39
|
| 20 |
+
sentence_transformers==3.4.0
|
| 21 |
+
gunicorn
|
| 22 |
+
llama-index-embeddings-huggingface==0.5.4
|
| 23 |
+
onnxruntime==1.22.0
|
| 24 |
+
langchain-groq==0.3.2
|
| 25 |
+
python-docx==1.1.2
|
| 26 |
+
langchain==0.3.24
|
| 27 |
+
langchain_community==0.3.23
|
| 28 |
+
gdown==5.2.0
|
| 29 |
+
# torch
|
| 30 |
+
pymupdf==1.25.5
|
| 31 |
+
pypdf==5.4.0
|
| 32 |
+
hf_xet==1.1.10
|
| 33 |
+
# protobuf==3.20.3
|
| 34 |
+
|
| 35 |
+
# must install https://aka.ms/vs/17/release/vc_redist.x64.exe
|
requirements.txt
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Flask==3.1.2
|
| 2 |
+
Flask_Cors==5.0.0
|
| 3 |
+
gdown==5.2.1
|
| 4 |
+
langchain==1.2.10
|
| 5 |
+
langchain_community==0.4.1
|
| 6 |
+
langchain_huggingface==1.2.0
|
| 7 |
+
pandas==3.0.0
|
| 8 |
+
pypdf==6.7.0
|
| 9 |
+
python-dotenv==1.2.1
|
| 10 |
+
python_docx==1.1.2
|
| 11 |
+
sentence_transformers==3.4.0
|
| 12 |
+
torch==2.9.0
|
| 13 |
+
langchain_core
|
| 14 |
+
langchain_text_splitters
|
| 15 |
+
faiss-cpu
|
sources/context.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
templates/chat-bot.html
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
<!-- Minimal admin dashboard for the RAG API: shows server status and,
     after basic-auth login, exposes index update/rebuild controls. -->
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>RAG API Dashboard</title>
    <link href="https://fonts.googleapis.com/css?family=Roboto:400,500" rel="stylesheet">
    <style>
        body { font-family: 'Roboto', sans-serif; background: #f4f7f9; color: #333; display: flex; justify-content: center; align-items: center; height: 100vh; margin: 0; }
        .container { background: white; padding: 40px; border-radius: 12px; box-shadow: 0 4px 15px rgba(0,0,0,0.1); width: 400px; text-align: center; }
        h2 { margin-bottom: 20px; color: #2c3e50; }
        .status-badge { display: inline-block; padding: 8px 15px; border-radius: 20px; background: #2ecc71; color: white; font-weight: bold; margin-bottom: 30px; }
        .status-badge.error { background: #e74c3c; }
        .btn { display: block; width: 100%; padding: 12px; margin: 10px 0; border: none; border-radius: 6px; cursor: pointer; font-size: 16px; transition: 0.3s; }
        .btn-primary { background: #3498db; color: white; }
        .btn-primary:hover { background: #2980b9; }
        .btn-danger { background: #e74c3c; color: white; }
        .btn-danger:hover { background: #c0392b; }
        input { width: calc(100% - 22px); padding: 10px; margin: 5px 0 15px; border: 1px solid #ddd; border-radius: 6px; }
        .admin-panel { display: none; text-align: left; border-top: 1px solid #eee; padding-top: 20px; margin-top: 20px; }
        .log-box { background: #2c3e50; color: #2ecc71; padding: 10px; border-radius: 6px; font-family: monospace; font-size: 12px; height: 100px; overflow-y: auto; margin-top: 15px; display:none;}
    </style>
</head>
<body>

    <div class="container">
        <h2>RAG API System</h2>
        <!-- Updated by checkStatus() on page load -->
        <div id="status-badge" class="status-badge">Checking Status...</div>

        <!-- Login Form -->
        <div id="login-form">
            <input type="text" id="username" placeholder="Admin Username">
            <input type="password" id="password" placeholder="Password">
            <button class="btn btn-primary" onclick="login()">Admin Login</button>
        </div>

        <!-- Admin Panel (Hidden until logged in) -->
        <div id="admin-panel" class="admin-panel">
            <h3>Admin Controls</h3>
            <p>Manage Vector Index</p>

            <!-- Cap on how many new files one incremental update may add -->
            <input type="number" id="max-files" placeholder="Max files (e.g. 50) for Update" value="50">
            <button class="btn btn-primary" onclick="performAction('/admin/update_faiss_index')">Update Index (New Files)</button>
            <button class="btn btn-danger" onclick="performAction('/admin/rebuild_index')">Rebuild Full Index</button>

            <!-- Raw JSON response / error output from admin actions -->
            <div id="log-box" class="log-box"></div>
        </div>
    </div>

    <script src="https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js"></script>
    <script>
        // Basic-auth credentials captured at login and reused for admin calls.
        let authHeader = null;

        // Check Server Status on Load
        async function checkStatus() {
            try {
                const res = await axios.get('/status');
                const badge = document.getElementById('status-badge');
                if (res.data.rag_initialized) {
                    badge.textContent = "System Online";
                    badge.className = "status-badge";
                } else {
                    badge.textContent = "Not Initialized";
                    badge.className = "status-badge error";
                }
            } catch (e) {
                // Any network/HTTP failure is treated as "offline".
                document.getElementById('status-badge').textContent = "Server Offline";
                document.getElementById('status-badge').className = "status-badge error";
            }
        }
        checkStatus();

        // Login Logic
        async function login() {
            const u = document.getElementById('username').value;
            const p = document.getElementById('password').value;
            const auth = { username: u, password: p };

            try {
                // An empty POST just validates the credentials server-side.
                await axios.post('/admin/login', {}, { auth });
                authHeader = auth;
                document.getElementById('login-form').style.display = 'none';
                document.getElementById('admin-panel').style.display = 'block';
            } catch (e) {
                alert("Login Failed");
            }
        }

        // Admin Actions
        async function performAction(url) {
            const logBox = document.getElementById('log-box');
            logBox.style.display = 'block';
            logBox.textContent = "Processing...";

            const payload = {};
            // Only the incremental-update endpoint takes a file cap.
            if(url.includes('update')) {
                payload.max_new_files = document.getElementById('max-files').value;
            }

            try {
                const res = await axios.post(url, payload, { auth: authHeader });
                logBox.textContent = JSON.stringify(res.data, null, 2);
            } catch (e) {
                logBox.textContent = "Error: " + (e.response ? JSON.stringify(e.response.data) : e.message);
            }
        }
    </script>
</body>
</html>
|
utils.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import logging
|
| 3 |
+
import re
|
| 4 |
+
import shutil
|
| 5 |
+
import tempfile
|
| 6 |
+
import time
|
| 7 |
+
from typing import Optional
|
| 8 |
+
import zipfile
|
| 9 |
+
|
| 10 |
+
import gdown
|
| 11 |
+
from pypdf import PdfReader
|
| 12 |
+
import docx as python_docx
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
def extract_text_from_file(file_path: str, file_type: str) -> Optional[str]:
    """Extract plain text from a pdf / docx / txt file.

    Args:
        file_path: Path of the file to read.
        file_type: Lowercase extension without the dot ('pdf', 'docx', 'txt').

    Returns:
        The stripped text content, or None when the type is unsupported,
        the file yields no text, or extraction raises.
    """
    # Function-local lookup of the same module logger keeps this function
    # self-contained; logging output is unchanged.
    log = logging.getLogger(__name__)
    log.info(f"[TEXT_EXTRACTION] Starting extraction from {file_type.upper()} file: {file_path}")
    text_content = None
    try:
        if file_type == 'pdf':
            reader = PdfReader(file_path)
            # BUG FIX: extract each page exactly once. The old code called
            # page.extract_text() twice per page (once in the filter, once in
            # the join), doubling the most expensive step.
            page_texts = (page.extract_text() for page in reader.pages)
            text_content = "".join(t + "\n" for t in page_texts if t)
        elif file_type == 'docx':
            doc = python_docx.Document(file_path)
            text_content = "\n".join(para.text for para in doc.paragraphs if para.text)
        elif file_type == 'txt':
            # errors='ignore': tolerate mixed/unknown encodings in sources.
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                text_content = f.read()
        else:
            log.warning(f"[TEXT_EXTRACTION] Unsupported file type: {file_type}")
            return None

        if not text_content or not text_content.strip():
            log.warning(f"[TEXT_EXTRACTION] No text content extracted from {file_path}")
            return None

        return text_content.strip()
    except Exception as e:
        # Extraction failures are logged and reported as "no text" so one bad
        # file never aborts a whole indexing run.
        log.error(f"[TEXT_EXTRACTION] Error extracting text: {e}", exc_info=True)
        return None
|
| 41 |
+
|
| 42 |
+
# Maps each supported lowercase file extension to a callable that extracts
# text from a file of that type. Doubles as the "is this file supported?"
# membership check used by the indexer.
FAISS_RAG_SUPPORTED_EXTENSIONS = {
    ext: (lambda path, _ftype=ext: extract_text_from_file(path, _ftype))
    for ext in ('pdf', 'docx', 'txt')
}
|
| 47 |
+
|
| 48 |
+
def get_id_from_gdrive_input(url_or_id: str) -> Optional[str]:
    """Extract a Google Drive file/folder ID from a share URL or raw ID.

    Tries folder URLs (``/folders/<id>``), file URLs (``/d/<id>``), and
    ``uc?id=<id>`` URLs in that order; otherwise treats the input itself as
    an ID when it is long enough. Returns None when nothing usable is found.
    """
    if not url_or_id:
        return None
    for pattern in (r"/folders/([a-zA-Z0-9_-]+)",
                    r"/d/([a-zA-Z0-9_-]+)",
                    r"id=([a-zA-Z0-9_-]+)"):
        found = re.search(pattern, url_or_id)
        if found:
            return found.group(1)
    # Heuristic: real Drive IDs are well over 10 characters long.
    return url_or_id if len(url_or_id) > 10 else None
|
| 57 |
+
|
| 58 |
+
def download_gdrive_file(file_id_or_url: str, target_path: str) -> bool:
    """
    Downloads a single file (like users.csv) from GDrive to a specific path.
    """
    logger.info(f"[GDRIVE_SINGLE] Downloading file. Input: {file_id_or_url}")
    file_id = get_id_from_gdrive_input(file_id_or_url)
    if not file_id:
        logger.error("[GDRIVE_SINGLE] Invalid ID")
        return False

    try:
        # Make sure the destination directory exists before gdown writes to it.
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
        # fuzzy=True allows gdown to handle permissions more gracefully
        gdown.download(id=file_id, output=target_path, quiet=False, fuzzy=True)

        # Treat an empty or missing output file as a failed download.
        downloaded_ok = os.path.exists(target_path) and os.path.getsize(target_path) > 0
        if downloaded_ok:
            logger.info("[GDRIVE_SINGLE] Success.")
        else:
            logger.error("[GDRIVE_SINGLE] Downloaded file is empty or missing.")
        return downloaded_ok
    except Exception as err:
        logger.error(f"[GDRIVE_SINGLE] Error: {err}", exc_info=True)
        return False
|
| 83 |
+
|
| 84 |
+
def download_and_unzip_gdrive_folder(folder_id_or_url: str, target_dir_for_contents: str) -> bool:
    """Download a whole Google Drive folder into target_dir_for_contents.

    Accepts a raw folder ID or a share URL. The folder is first fetched into
    a temp directory; if gdown produced a single wrapping directory, only its
    contents are moved (not the wrapper itself). The temp directory is always
    cleaned up. Returns True on success, False on any error.
    """
    logger.info(f"[GDRIVE] Downloading folder. Input: {folder_id_or_url}")
    folder_id = get_id_from_gdrive_input(folder_id_or_url)
    if not folder_id:
        return False

    temp_dir = tempfile.mkdtemp()
    try:
        gdown.download_folder(id=folder_id, output=temp_dir, quiet=False, use_cookies=False)

        os.makedirs(target_dir_for_contents, exist_ok=True)

        # gdown sometimes nests everything under one wrapper directory named
        # after the Drive folder; unwrap it so only the contents are moved.
        # Snapshot the listing once (the old code re-listed the dir 3 times).
        entries = os.listdir(temp_dir)
        src_root = temp_dir
        if len(entries) == 1 and os.path.isdir(os.path.join(temp_dir, entries[0])):
            src_root = os.path.join(temp_dir, entries[0])

        for item in os.listdir(src_root):
            shutil.move(os.path.join(src_root, item), os.path.join(target_dir_for_contents, item))

        logger.info("[GDRIVE] Download complete.")
        return True
    except Exception as e:
        logger.error(f"[GDRIVE] Error: {e}", exc_info=True)
        return False
    finally:
        # Always remove the scratch area, even on failure.
        shutil.rmtree(temp_dir, ignore_errors=True)
|
| 110 |
+
|
| 111 |
+
def download_and_unzip_gdrive_file(file_id_or_url: str, target_extraction_dir: str) -> bool:
    """Download a ZIP archive from Google Drive and extract it.

    Accepts a raw file ID or a share URL; extracts into
    target_extraction_dir. The archive is downloaded to a uniquely named
    temp file and always removed afterwards — the previous fixed name
    ("temp_download.zip") leaked the file on every call and could collide
    between concurrent runs. Returns True when extraction succeeds.
    """
    logger.info(f"[GDRIVE_ZIP] Downloading ZIP. Input: {file_id_or_url}")
    file_id = get_id_from_gdrive_input(file_id_or_url)
    if not file_id:
        return False

    # Unique name avoids clashes between concurrent downloads.
    fd, temp_zip = tempfile.mkstemp(suffix=".zip")
    os.close(fd)  # gdown reopens the path itself
    try:
        gdown.download(id=file_id, output=temp_zip, quiet=False)
        with zipfile.ZipFile(temp_zip, 'r') as zip_ref:
            zip_ref.extractall(target_extraction_dir)
        return True
    except Exception as e:
        logger.error(f"[GDRIVE_ZIP] Error: {e}", exc_info=True)
        return False
    finally:
        # Clean up the archive whether or not extraction succeeded.
        try:
            os.remove(temp_zip)
        except OSError:
            pass
|