BeRU Deployer
Deploy BeRU Streamlit RAG System - Add app, models logic, configs, and optimizations for HF Spaces
dec533d | import glob | |
| import os | |
| import gc | |
| import time | |
| import re | |
| import hashlib | |
| from pathlib import Path | |
| from typing import List, Dict, Tuple, Optional | |
| import fitz # PyMuPDF | |
| import torch | |
| import numpy as np | |
| from PIL import Image | |
| from transformers import AutoModel, AutoProcessor, AutoTokenizer # Changed from AutoModelForCausalLM | |
| from langchain_core.documents import Document | |
| import pickle | |
| from numpy.linalg import norm | |
| import camelot | |
| import base64 | |
| import pytesseract | |
| from pdf2image import convert_from_path | |
| import faiss | |
| from rank_bm25 import BM25Okapi | |
| # ======================================== | |
| # π CONFIGURATION | |
| # ======================================== | |
| PDF_DIR = r"D:\BeRU\testing" | |
| FAISS_INDEX_PATH = "VLM2Vec-V2rag2" | |
| MODEL_CACHE_DIR = ".cache" | |
| IMAGE_OUTPUT_DIR = "extracted_images2" | |
| # Chunking configuration | |
| CHUNK_SIZE = 450 # words | |
| OVERLAP = 100 # words | |
| MIN_CHUNK_SIZE = 50 | |
| MAX_CHUNK_SIZE = 800 | |
| # Instruction prefixes for better embeddings | |
| DOCUMENT_INSTRUCTION = "Represent this technical document for semantic search: " | |
| QUERY_INSTRUCTION = "Represent this question for finding relevant technical information: " | |
| # Hybrid search weights | |
| DENSE_WEIGHT = 0.4 # Weight for semantic search | |
| SPARSE_WEIGHT = 0.6 # Weight for keyword search | |
| # Create directories | |
| os.makedirs(PDF_DIR, exist_ok=True) | |
| os.makedirs(FAISS_INDEX_PATH, exist_ok=True) | |
| os.makedirs(MODEL_CACHE_DIR, exist_ok=True) | |
| os.makedirs(IMAGE_OUTPUT_DIR, exist_ok=True) | |
| # ======================================== | |
| # π€ VLM2Vec-V2 WRAPPER (ENHANCED) | |
| # ======================================== | |
class VLM2VecEmbeddings:
    """VLM2Vec-V2 embedding wrapper with instruction prefixes.

    Loads a Qwen2-VL based embedding checkpoint and exposes three entry
    points: embed_documents / embed_query for text and embed_image for
    figures. All embeddings are attention-mask-weighted mean pools over
    the model's last hidden layer.
    """

    def __init__(self, model_name: str = "TIGER-Lab/VLM2Vec-Qwen2VL-2B", cache_dir: str = None):
        print(f"π€ Loading VLM2Vec-V2 model: {model_name}")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f" Device: {self.device}")
        try:
            # trust_remote_code is required because the checkpoint ships
            # custom modeling code. fp16 on GPU halves memory; fp32 on CPU.
            self.model = AutoModel.from_pretrained(
                model_name,
                cache_dir=cache_dir,
                trust_remote_code=True,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
            ).to(self.device)
            self.processor = AutoProcessor.from_pretrained(
                model_name,
                cache_dir=cache_dir,
                trust_remote_code=True
            )
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                cache_dir=cache_dir,
                trust_remote_code=True
            )
            self.model.eval()
            # Probe the model once with a dummy input to discover the hidden
            # size; this is the embedding dimension the FAISS index will use.
            test_input = self.tokenizer("test", return_tensors="pt").to(self.device)
            with torch.no_grad():
                test_output = self.model(**test_input, output_hidden_states=True)
            self.embedding_dim = test_output.hidden_states[-1].shape[-1]
            print(f" Embedding dimension: {self.embedding_dim}")
            print("β VLM2Vec-V2 loaded successfully\n")
        except Exception as e:
            print(f"β Error loading VLM2Vec-V2: {e}")
            raise

    def normalize_text(self, text: str) -> str:
        """Normalize text for better embeddings."""
        # Collapse all runs of whitespace to single spaces.
        text = re.sub(r'\s+', ' ', text)
        # Strip "Page N" artifacts left over from PDF extraction.
        text = re.sub(r'Page \d+', '', text, flags=re.IGNORECASE)
        text = text.strip()
        return text

    def embed_documents(self, texts: List[str], add_instruction: bool = True) -> List[List[float]]:
        """Embed documents with instruction prefix and weighted mean pooling.

        Raises RuntimeError on the first text that fails to embed, so a
        partially-built index is never silently produced.
        """
        embeddings = []
        with torch.no_grad():
            for text in texts:
                try:
                    clean_text = self.normalize_text(text)
                    if add_instruction:
                        prefixed_text = DOCUMENT_INSTRUCTION + clean_text
                    else:
                        prefixed_text = clean_text
                    inputs = self.tokenizer(
                        prefixed_text,
                        return_tensors="pt",
                        padding=True,
                        truncation=True,
                        # Cap at 2048 tokens; model_max_length can be a huge
                        # sentinel value on some tokenizers.
                        max_length=min(self.tokenizer.model_max_length or 512, 2048)
                    ).to(self.device)
                    outputs = self.model(**inputs, output_hidden_states=True)
                    if hasattr(outputs, 'hidden_states') and outputs.hidden_states is not None:
                        # Weighted mean pooling: padding positions are zeroed
                        # out via the attention mask before averaging.
                        hidden_states = outputs.hidden_states[-1]
                        attention_mask = inputs['attention_mask'].unsqueeze(-1).float()
                        weighted_hidden_states = hidden_states * attention_mask
                        sum_embeddings = weighted_hidden_states.sum(dim=1)
                        sum_mask = torch.clamp(attention_mask.sum(dim=1), min=1e-9)
                        embedding = (sum_embeddings / sum_mask).squeeze()
                    else:
                        # Fallback: pool the logits the same way when the
                        # model does not expose hidden states.
                        attention_mask = inputs['attention_mask'].unsqueeze(-1).float()
                        weighted_logits = outputs.logits * attention_mask
                        sum_embeddings = weighted_logits.sum(dim=1)
                        sum_mask = torch.clamp(attention_mask.sum(dim=1), min=1e-9)
                        embedding = (sum_embeddings / sum_mask).squeeze()
                    embeddings.append(embedding.cpu().numpy().tolist())
                except Exception as e:
                    print(f" β CRITICAL: Failed to embed text: {e}")
                    print(f" Text preview: {text[:100]}")
                    raise RuntimeError(f"Embedding failed for text: {text[:50]}...") from e
        return embeddings

    def embed_query(self, text: str) -> List[float]:
        """Embed a query with the query-specific instruction prefix."""
        clean_text = self.normalize_text(text)
        prefixed_text = QUERY_INSTRUCTION + clean_text
        # add_instruction=False: the query prefix is already applied, so
        # embed_documents must not prepend the document instruction too.
        return self.embed_documents([prefixed_text], add_instruction=False)[0]

    def embed_image(self, image_path: str, prompt: str = "Technical diagram") -> Optional[List[float]]:
        """Embed an image; returns None when embedding fails.

        Uses the Qwen2-VL chat-message format so the processor pairs the
        image with the text prompt correctly.
        """
        try:
            with torch.no_grad():
                image = Image.open(image_path).convert('RGB')
                messages = [
                    {
                        "role": "user",
                        "content": [
                            {"type": "image", "image": image},
                            {"type": "text", "text": prompt}
                        ]
                    }
                ]
                # Render the messages into the model's chat prompt format.
                text = self.processor.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=True
                )
                inputs = self.processor(
                    text=[text],
                    images=[image],
                    return_tensors="pt",
                    padding=True
                ).to(self.device)
                outputs = self.model(**inputs, output_hidden_states=True)
                if hasattr(outputs, 'hidden_states') and outputs.hidden_states is not None:
                    hidden_states = outputs.hidden_states[-1]
                    if 'attention_mask' in inputs:
                        # Same weighted mean pooling as for text.
                        attention_mask = inputs['attention_mask'].unsqueeze(-1).float()
                        weighted_hidden_states = hidden_states * attention_mask
                        sum_embeddings = weighted_hidden_states.sum(dim=1)
                        sum_mask = torch.clamp(attention_mask.sum(dim=1), min=1e-9)
                        embedding = (sum_embeddings / sum_mask).squeeze()
                    else:
                        embedding = hidden_states.mean(dim=1).squeeze()
                else:
                    # Last resort: use the pooler output, if present.
                    if hasattr(outputs, 'pooler_output') and outputs.pooler_output is not None:
                        embedding = outputs.pooler_output.squeeze()
                    else:
                        return None
                return embedding.cpu().numpy().tolist()
        except Exception as e:
            print(f" β οΈ Failed to embed image {Path(image_path).name}: {str(e)[:100]}")
            return None
| # ======================================== | |
| # π QUERY PREPROCESSING | |
| # ======================================== | |
def preprocess_query(query: str) -> str:
    """Lowercase a query, expand known safety abbreviations, and tidy it.

    Trailing question/exclamation marks are stripped and whitespace is
    collapsed so BM25 tokenization sees clean terms.
    """
    expansions = {
        r'\bh2s\b': 'hydrogen sulfide',
        r'\bppm\b': 'parts per million',
        r'\bppe\b': 'personal protective equipment',
        r'\bscba\b': 'self contained breathing apparatus',
        r'\blel\b': 'lower explosive limit',
        r'\bhel\b': 'higher explosive limit',
        r'\buel\b': 'upper explosive limit'
    }
    result = query.lower()
    for pattern, replacement in expansions.items():
        result = re.sub(pattern, replacement, result)
    # Drop trailing question/exclamation marks, then normalize whitespace.
    result = re.sub(r'[?!]+$', '', result)
    return re.sub(r'\s+', ' ', result).strip()
| # ======================================== | |
| # π TABLE EXTRACTION | |
| # ======================================== | |
def is_table_of_contents_header(df, page_num):
    """Return True when a table's first row looks like a TOC header.

    Only tables within the first 15 pages are considered; the header
    qualifies when at least two known TOC keywords appear in it.
    """
    if page_num > 15 or len(df) == 0:
        return False
    # Join the first (header) row into one lowercase string.
    header_text = ' '.join(df.iloc[0].astype(str)).lower()
    toc_keywords = ['section', 'subsection', 'description', 'page no', 'page number', 'contents']
    matches = sum(kw in header_text for kw in toc_keywords)
    return matches >= 2
def looks_like_toc_data(df):
    """Heuristically detect a TOC continuation table.

    A table counts as TOC data when more than 70% of its last column
    (header row excluded) are page numbers in the 50-300 range AND more
    than 50% of its first column look like section numbers ("10.1").
    """
    if len(df) < 2 or len(df.columns) < 2:
        return False
    # Last column, skipping the header row: expect page numbers.
    page_values = df.iloc[1:, -1].astype(str)
    page_like = sum(
        1 for v in page_values
        if v.strip().isdigit() and 50 < int(v.strip()) < 300
    )
    if not (len(page_values) > 0 and page_like / len(page_values) > 0.7):
        return False
    # First column: expect dotted section numbers.
    section_values = df.iloc[1:, 0].astype(str)
    section_like = sum(
        1 for v in section_values if re.match(r'^\d+\.?\d*$', v.strip())
    )
    return section_like / len(section_values) > 0.5
def extract_tables_from_pdf(pdf_path: str) -> List[Document]:
    """Extract bordered tables with smart TOC detection.

    Uses camelot's lattice flavor (bordered tables only), de-duplicates,
    skips the first 5 pages, filters out table-of-contents tables, and
    converts each surviving table into a natural-language Document.
    Returns [] on any extraction failure.
    """
    chunks = []
    try:
        lattice_tables = camelot.read_pdf(
            pdf_path,
            pages='all',
            flavor='lattice',  # Only bordered tables
            suppress_stdout=True
        )
        all_tables = list(lattice_tables)
        seen_tables = set()
        # Track whether we are currently inside a (possibly multi-page) TOC.
        in_toc_section = False
        toc_start_page = None
        print(f" π Found {len(all_tables)} bordered tables")
        for table in all_tables:
            df = table.df
            current_page = table.page
            # De-dupe by (page, header row) — camelot can report the same
            # table more than once.
            table_id = (current_page, tuple(df.iloc[0].tolist()) if len(df) > 0 else ())
            if table_id in seen_tables:
                continue
            seen_tables.add(table_id)
            # Skip first 5 pages (title pages)
            if current_page <= 5:
                continue
            # Basic validation: need at least 2 columns, 3 rows, and a
            # decent camelot accuracy score.
            if len(df.columns) < 2 or len(df) < 3 or table.accuracy < 80:
                continue
            # Detect TOC start (a page carrying a TOC-style header row).
            if not in_toc_section and is_table_of_contents_header(df, current_page):
                in_toc_section = True
                toc_start_page = current_page
                print(f" π TOC detected at page {current_page}")
                continue
            # While inside the TOC, skip continuation pages until a table
            # stops looking like TOC data.
            if in_toc_section:
                if looks_like_toc_data(df):
                    print(f" βοΈ Skipping TOC continuation on page {current_page}")
                    continue
                else:
                    # TOC ended; resume normal extraction with this table.
                    print(f" β TOC ended, found real table on page {current_page}")
                    in_toc_section = False
            # Convert the surviving table to natural-language text.
            table_text = table_to_natural_language_enhanced(table)
            if table_text.strip():
                chunks.append(Document(
                    page_content=table_text,
                    metadata={
                        "source": os.path.basename(pdf_path),
                        "page": current_page,
                        "heading": "Table Data",
                        "type": "table",
                        "table_accuracy": table.accuracy
                    }
                ))
        print(f" β Extracted {len(chunks)} valid tables (after TOC filtering)")
    except Exception as e:
        print(f"β οΈ Table extraction failed: {e}")
    finally:
        # Best-effort cleanup: the names may be unbound if read_pdf raised,
        # which is why the bare except is tolerated here.
        try:
            del lattice_tables
            del all_tables
            gc.collect()
            time.sleep(0.1)
        except:
            pass
    return chunks
def table_to_natural_language_enhanced(table) -> str:
    """Convert a camelot table into one descriptive sentence per data row.

    The first row supplies column headers (blank headers become
    "Column_<i>"); each subsequent row with a non-empty first cell becomes
    "<first cell> has <header>: <value>, ...". Rows that are entirely
    blank or lack a leading label are dropped.
    """
    df = table.df
    if len(df) < 2:
        return ""

    def _is_blank(cell: str) -> bool:
        # Treat empty strings and pandas NaN artifacts as blanks.
        return not cell or cell.lower() in ['', 'nan', 'none']

    raw_headers = [str(h).strip() for h in df.iloc[0].astype(str).tolist()]
    headers = [
        h if not _is_blank(h) else f"Column_{i}"
        for i, h in enumerate(raw_headers)
    ]

    sentences = []
    for row_idx in range(1, len(df)):
        cells = [str(c).strip() for c in df.iloc[row_idx].astype(str).tolist()]
        if all(_is_blank(c) for c in cells):
            continue  # entirely empty row
        if not cells or _is_blank(cells[0]):
            continue  # rows without a leading label are dropped
        pairs = [
            f"{headers[col]}: {cells[col]}"
            for col in range(1, min(len(cells), len(headers)))
            if not _is_blank(cells[col])
        ]
        if pairs:
            sentences.append(f"{cells[0]} has {', '.join(pairs)}.")
        else:
            sentences.append(f"{cells[0]}.")
    return "\n".join(sentences)
def extract_tables_with_ocr(pdf_path: str, page_num: int) -> List[Dict]:
    """OCR fallback for image-based PDFs.

    Renders the given page to an image, OCRs it with Tesseract, and keeps
    only lines that look tabular (wide gaps or tabs). Returns at most one
    result dict, or an empty list when nothing table-like was found or
    OCR failed.
    """
    try:
        images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
        if not images:
            return []
        ocr_text = pytesseract.image_to_string(images[0])
        lines = ocr_text.split('\n')
        table_lines = []
        for line in lines:
            # Heuristic: table columns appear as runs of 2+ spaces or tabs.
            if re.search(r'\s{2,}', line) or '\t' in line:
                table_lines.append(line)
        # Require at least 3 tabular-looking lines to call it a table.
        if len(table_lines) > 2:
            return [{
                "text": "\n".join(table_lines),
                "page": page_num,
                "method": "ocr"
            }]
        return []
    except Exception as e:
        # OCR failure is non-fatal; caller treats the page as table-free.
        return []
def get_table_regions(pdf_path: str) -> Dict[int, List[tuple]]:
    """Get table bounding boxes using BOTH lattice and stream methods.

    Returns {page_number: [bbox, ...]} so that text extraction can skip
    blocks inside table regions. TOC tables are excluded. Any camelot
    failure silently yields an empty mapping (tables treated as absent).
    """
    table_regions = {}
    try:
        lattice_tables = camelot.read_pdf(pdf_path, pages='all', flavor='lattice', suppress_stdout=True)
        stream_tables = camelot.read_pdf(pdf_path, pages='all', flavor='stream', suppress_stdout=True)
        all_tables = list(lattice_tables) + list(stream_tables)
        for table in all_tables:
            page = table.page
            if is_table_of_contents_header(table.df, page):
                continue
            # NOTE(review): _bbox is a private camelot attribute — verify
            # it still exists when upgrading camelot.
            bbox = table._bbox
            if page not in table_regions:
                table_regions[page] = []
            if bbox not in table_regions[page]:
                table_regions[page].append(bbox)
    except Exception as e:
        pass
    return table_regions
| # ======================================== | |
| # πΌοΈ IMAGE EXTRACTION | |
| # ======================================== | |
def extract_images_from_pdf(pdf_path: str, output_dir: str) -> List[Dict]:
    """Extract embedded images from a PDF into output_dir.

    Images smaller than 10 KB are skipped (icons, bullets, artifacts).
    Returns one metadata dict per saved image: path, 1-based page number,
    source filename, and type="image".
    """
    doc = fitz.open(pdf_path)
    image_data = []
    for page_num in range(len(doc)):
        page = doc[page_num]
        images = page.get_images()
        for img_index, img in enumerate(images):
            try:
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                # Filter out tiny decorative images (< 10 KB).
                if len(image_bytes) < 10000:
                    continue
                image_filename = f"{Path(pdf_path).stem}_p{page_num+1}_img{img_index+1}.png"
                image_path = os.path.join(output_dir, image_filename)
                with open(image_path, "wb") as img_file:
                    img_file.write(image_bytes)
                image_data.append({
                    "path": image_path,
                    "page": page_num + 1,
                    "source": os.path.basename(pdf_path),
                    "type": "image"
                })
            except Exception as e:
                # One bad image must not abort the whole PDF.
                continue
    doc.close()
    return image_data
| # ======================================== | |
| # π TEXT EXTRACTION WITH OVERLAPPING CHUNKS | |
| # ======================================== | |
def is_bold_text(span):
    """Return True when a PyMuPDF text span is rendered bold.

    Checks both the font name (e.g. "Arial-Bold") and bit 4 of the span
    flags. Fixed to return an actual bool instead of leaking the raw flag
    bits (the original returned the int 16 when only the flag matched);
    truthiness is unchanged, so all callers behave identically.
    """
    return "bold" in span['font'].lower() or bool(span['flags'] & 2**4)
def is_likely_heading(text, font_size, is_bold, avg_font_size):
    """Decide whether a line of text is probably a section heading.

    Bold text of sensible length (3-100 chars) qualifies when it is
    noticeably larger than the document average, fully uppercase, or
    numbered like "3.1 Foo".
    """
    if not is_bold:
        return False
    stripped = text.strip()
    if not (3 <= len(stripped) <= 100):
        return False
    large_font = font_size > avg_font_size * 1.1
    numbered = bool(re.match(r'^\d+\.?\d*\s+[A-Z]', stripped))
    return large_font or stripped.isupper() or numbered
def is_inside_table(block_bbox, table_bboxes):
    """Return True when a text block's bbox overlaps any table bbox.

    Boxes are (x1, y1, x2, y2) tuples; touching edges count as overlap.
    """
    bx1, by1, bx2, by2 = block_bbox
    for tx1, ty1, tx2, ty2 in table_bboxes:
        # Standard axis-aligned separation test, inverted.
        separated = bx2 < tx1 or bx1 > tx2 or by2 < ty1 or by1 > ty2
        if not separated:
            return True
    return False
def split_text_with_overlap(text: str, heading: str, source: str, page: int,
                            chunk_size: int = CHUNK_SIZE, overlap: int = OVERLAP) -> List[Document]:
    """Split a section's text into overlapping word-window chunks.

    Every chunk is prefixed with "Section: <heading>" so heading context
    travels with the embedding; the full section text is kept in metadata
    as parent_text for later display. A trailing fragment shorter than
    MIN_CHUNK_SIZE words is dropped.
    """
    words = text.split()

    def _make_doc(body: str, extra_meta: dict) -> Document:
        # Shared constructor: prepend heading context + base metadata.
        meta = {
            "source": source,
            "page": page,
            "heading": heading,
            "type": "text",
            "parent_text": text,
        }
        meta.update(extra_meta)
        return Document(page_content=f"Section: {heading}\n\n{body}", metadata=meta)

    # Short sections fit into a single chunk.
    if len(words) <= chunk_size:
        return [_make_doc(text, {"chunk_index": 0, "total_chunks": 1})]

    docs = []
    step = chunk_size - overlap
    for start in range(0, len(words), step):
        window = words[start:start + chunk_size]
        # Drop a trailing fragment too small to stand alone.
        if len(window) < MIN_CHUNK_SIZE and docs:
            break
        docs.append(_make_doc(" ".join(window), {
            "chunk_index": len(docs),
            "start_word": start,
            "end_word": start + len(window),
        }))
    # Backfill the final chunk count on every chunk.
    for doc in docs:
        doc.metadata["total_chunks"] = len(docs)
    return docs
def extract_text_chunks_with_overlap(pdf_path: str, table_regions: Dict[int, List[tuple]]) -> List[Document]:
    """Extract text as heading-delimited sections, then chunk with overlap.

    Pass 1 computes the average font size (used to spot headings);
    pass 2 walks every line, starts a new section at each detected
    heading, and skips blocks that fall inside known table regions.
    """
    doc = fitz.open(pdf_path)
    # Pass 1: collect all span font sizes to establish a document average.
    all_font_sizes = []
    for page_num in range(len(doc)):
        page = doc[page_num]
        blocks = page.get_text("dict")["blocks"]
        for block in blocks:
            if "lines" in block:
                for line in block["lines"]:
                    for span in line["spans"]:
                        all_font_sizes.append(span["size"])
    avg_font_size = sum(all_font_sizes) / len(all_font_sizes) if all_font_sizes else 12
    # Pass 2: accumulate line text into sections delimited by headings.
    sections = []
    current_section = ""
    current_heading = "Introduction"  # fallback heading before the first real one
    current_page = 1
    for page_num in range(len(doc)):
        page = doc[page_num]
        blocks = page.get_text("dict")["blocks"]
        # table_regions is keyed by 1-based page numbers.
        page_tables = table_regions.get(page_num + 1, [])
        for block in blocks:
            if "lines" not in block:
                continue
            block_bbox = block.get("bbox", (0, 0, 0, 0))
            # Table text is handled by the table extractor; skip it here.
            if is_inside_table(block_bbox, page_tables):
                continue
            for line in block["lines"]:
                # Merge all spans of the line; a line is "bold" if any span
                # is, and its size is the max span size.
                line_text = ""
                line_is_bold = False
                line_font_size = 0
                for span in line["spans"]:
                    line_text += span["text"]
                    if is_bold_text(span):
                        line_is_bold = True
                    line_font_size = max(line_font_size, span["size"])
                line_text = line_text.strip()
                if not line_text:
                    continue
                if is_likely_heading(line_text, line_font_size, line_is_bold, avg_font_size):
                    # Close the running section before starting a new one.
                    if current_section.strip():
                        sections.append({
                            "text": current_section.strip(),
                            "heading": current_heading,
                            "page": current_page,
                            "source": os.path.basename(pdf_path)
                        })
                    current_heading = line_text
                    current_section = ""
                    current_page = page_num + 1
                else:
                    current_section += line_text + " "
    # Flush the final section.
    if current_section.strip():
        sections.append({
            "text": current_section.strip(),
            "heading": current_heading,
            "page": current_page,
            "source": os.path.basename(pdf_path)
        })
    doc.close()
    # Chunk each section with overlap, carrying heading context along.
    all_chunks = []
    for section in sections:
        chunks = split_text_with_overlap(
            text=section['text'],
            heading=section['heading'],
            source=section['source'],
            page=section['page'],
            chunk_size=CHUNK_SIZE,
            overlap=OVERLAP
        )
        all_chunks.extend(chunks)
    return all_chunks
| # ======================================== | |
| # π COMBINED EXTRACTION | |
| # ======================================== | |
def extract_all_content_from_pdf(pdf_path: str) -> Tuple[List[Document], List[Dict]]:
    """Extract text, tables, and images from one PDF.

    Tables are extracted first because their bounding boxes are needed so
    text extraction can skip blocks living inside a table region.
    Returns (text+table chunks, image metadata dicts).
    """
    print(f" π Extracting tables...")
    table_regions = get_table_regions(pdf_path)
    table_chunks = extract_tables_from_pdf(pdf_path)
    print(f" β {len(table_chunks)} table chunks")
    print(f" π Extracting text...")
    text_chunks = extract_text_chunks_with_overlap(pdf_path, table_regions)
    print(f" β {len(text_chunks)} text chunks")
    print(f" πΌοΈ Extracting images...")
    images = extract_images_from_pdf(pdf_path, IMAGE_OUTPUT_DIR)
    print(f" β {len(images)} images")
    # Single combined list: text chunks followed by table chunks.
    all_chunks = text_chunks + table_chunks
    return all_chunks, images
| # ======================================== | |
| # ποΈ BUILD FAISS INDEX WITH STREAMING | |
| # ======================================== | |
| # Replace the HybridRetriever class and related functions with this optimized version: | |
| # ======================================== | |
| # ποΈ BUILD FAISS INDEX WITH BM25 | |
| # ======================================== | |
def build_multimodal_faiss_streaming(pdf_files: List[str], embedding_model: VLM2VecEmbeddings):
    """Build FAISS (text + image) and BM25 indexes for the given PDFs.

    Skips rebuilding when an index for the exact same PDF set already
    exists (MD5 over the sorted file-name list — contents are NOT hashed),
    unless the user confirms interactively. Returns (text_index, docs);
    (None, []) when the build is skipped or nothing was extracted.
    """
    index_hash_file = f"{FAISS_INDEX_PATH}/index_hash.txt"
    # The hash identifies the PDF *set by name*, not file contents.
    current_hash = hashlib.md5("".join(sorted(pdf_files)).encode()).hexdigest()
    if os.path.exists(index_hash_file):
        with open(index_hash_file, 'r') as f:
            existing_hash = f.read().strip()
        if existing_hash == current_hash:
            print("β οΈ Index already exists for these PDFs!")
            response = input(" Rebuild anyway? (yes/no): ").strip().lower()
            if response != 'yes':
                return None, []
    all_texts = []
    all_image_paths = []
    print("\nπ Processing PDFs...\n")
    for pdf_file in pdf_files:
        print(f"π Processing: {Path(pdf_file).name}")
        try:
            text_chunks, images = extract_all_content_from_pdf(pdf_file)
            all_texts.extend(text_chunks)
            all_image_paths.extend(images)
        except Exception as e:
            # One broken PDF must not abort the whole build.
            print(f" β Error: {e}")
            continue
        print()
    print(f"β Total chunks: {len(all_texts)}")
    print(f"β Total images: {len(all_image_paths)}\n")
    if len(all_texts) == 0:
        print("β No content extracted!")
        return None, []
    # Build the text index incrementally in small batches to bound memory.
    print("π Generating text embeddings...\n")
    text_index = None
    batch_size = 10
    for i in range(0, len(all_texts), batch_size):
        batch = all_texts[i:i+batch_size]
        batch_contents = [doc.page_content for doc in batch]
        try:
            batch_embeddings = embedding_model.embed_documents(batch_contents, add_instruction=True)
            batch_embeddings_np = np.array(batch_embeddings).astype('float32')
            if text_index is None:
                # Create the index lazily, once the dimension is known.
                dimension = batch_embeddings_np.shape[1]
                text_index = faiss.IndexFlatIP(dimension)
                print(f" Text embedding dimension: {dimension}")
            # L2-normalize so inner product behaves as cosine similarity.
            faiss.normalize_L2(batch_embeddings_np)
            text_index.add(batch_embeddings_np)
            if (i // batch_size + 1) % 5 == 0:
                print(f" Progress: {i + len(batch)}/{len(all_texts)}")
        except Exception as e:
            print(f" β Error: {e}")
            raise
    print(f" β Complete")
    # Persist the FAISS index and the document list (aligned by position).
    faiss.write_index(text_index, f"{FAISS_INDEX_PATH}/text_index.faiss")
    with open(f"{FAISS_INDEX_PATH}/text_documents.pkl", "wb") as f:
        pickle.dump(all_texts, f)
    # Build and save the BM25 keyword index over the same documents.
    print("\nπ Building BM25 index for keyword search...")
    tokenized_docs = [doc.page_content.lower().split() for doc in all_texts]
    bm25_index = BM25Okapi(tokenized_docs, k1=1.3, b=0.65)
    with open(f"{FAISS_INDEX_PATH}/bm25_index.pkl", "wb") as f:
        pickle.dump(bm25_index, f)
    print(" β BM25 index saved")
    # Build the image index; images that fail to embed are silently dropped,
    # so image_documents.pkl stays aligned with image_index.faiss.
    if len(all_image_paths) > 0:
        print(f"\nπΌοΈ Embedding images...")
        image_index = None
        successful_images = []
        for idx, img_data in enumerate(all_image_paths):
            img_embedding = embedding_model.embed_image(img_data["path"])
            if img_embedding is None:
                continue
            img_embedding_np = np.array([img_embedding]).astype('float32')
            if image_index is None:
                dimension = img_embedding_np.shape[1]
                image_index = faiss.IndexFlatIP(dimension)
                print(f" Image dimension: {dimension}")
            faiss.normalize_L2(img_embedding_np)
            image_index.add(img_embedding_np)
            successful_images.append(img_data)
            if (len(successful_images)) % 10 == 0:
                print(f" Progress: {len(successful_images)}/{len(all_image_paths)}")
        print(f" β {len(successful_images)} images embedded")
        if image_index is not None and len(successful_images) > 0:
            faiss.write_index(image_index, f"{FAISS_INDEX_PATH}/image_index.faiss")
            with open(f"{FAISS_INDEX_PATH}/image_documents.pkl", "wb") as f:
                pickle.dump(successful_images, f)
    # Record the hash last, so a crashed build is not mistaken for complete.
    with open(index_hash_file, 'w') as f:
        f.write(current_hash)
    print(f"\nβ Index saved: {FAISS_INDEX_PATH}\n")
    return text_index, all_texts
| # ======================================== | |
| # π OPTIMIZED HYBRID SEARCH | |
| # ======================================== | |
| # ======================================== | |
| # π QUERY WITH BM25 ONLY | |
| # ======================================== | |
def query_with_bm25(query: str, k_text: int = 5, k_images: int = 3):
    """Query using BM25 keyword search only.

    Returns a dict with ranked text_results, up to k_images images drawn
    from the same pages as the top text hits (page co-location, not
    semantic matching), and both the raw and preprocessed query strings.
    """
    processed_query = preprocess_query(query)
    print(f" π Processed: {processed_query}")
    # Load the document list saved at index-build time.
    with open(f"{FAISS_INDEX_PATH}/text_documents.pkl", "rb") as f:
        text_docs = pickle.load(f)
    # Load the persisted BM25 index; rebuild in memory when missing.
    try:
        with open(f"{FAISS_INDEX_PATH}/bm25_index.pkl", "rb") as f:
            bm25_index = pickle.load(f)
    except FileNotFoundError:
        print(" β οΈ BM25 index not found, building on-the-fly...")
        tokenized_docs = [doc.page_content.lower().split() for doc in text_docs]
        bm25_index = BM25Okapi(tokenized_docs)
    # Score every document against the tokenized query.
    tokenized_query = processed_query.lower().split()
    bm25_scores = bm25_index.get_scores(tokenized_query)
    # Top-k indices by descending BM25 score.
    top_indices = np.argsort(bm25_scores)[::-1][:k_text]
    text_results = []
    relevant_pages = set()
    for rank, idx in enumerate(top_indices, 1):
        doc = text_docs[idx]
        score = float(bm25_scores[idx])
        text_results.append({
            "document": doc,
            "score": score,
            "rank": rank,
            "type": doc.metadata.get('type', 'text')
        })
        relevant_pages.add((doc.metadata.get('source'), doc.metadata.get('page')))
    # Attach images by page co-location rather than semantic similarity.
    relevant_images = []
    try:
        image_docs_path = f"{FAISS_INDEX_PATH}/image_documents.pkl"
        if os.path.exists(image_docs_path):
            with open(image_docs_path, "rb") as f:
                image_docs = pickle.load(f)
            for img_doc in image_docs:
                img_page = (img_doc['source'], img_doc['page'])
                if img_page in relevant_pages and len(relevant_images) < k_images:
                    relevant_images.append({
                        "path": img_doc['path'],
                        "source": img_doc['source'],
                        "page": img_doc['page'],
                        "type": "image",
                        "score": 0.0,  # no similarity score for page-matched images
                        "rank": len(relevant_images) + 1,
                        "from_page": True
                    })
    except Exception as e:
        # Image attachment is best-effort; text results are still returned.
        pass
    return {
        "text_results": text_results,
        "images": relevant_images,
        "query": query,
        "processed_query": processed_query
    }
| # ======================================== | |
| # π DISPLAY RESULTS (BM25 ONLY) | |
| # ======================================== | |
def display_results_bm25(results: Dict):
    """Pretty-print the output of query_with_bm25 to stdout.

    Expects the dict shape produced by query_with_bm25: "text_results"
    (each with document/score/rank) and "images" (each with
    rank/source/page/path).
    """
    print("\nπ TOP RESULTS (BM25 Keyword Search):\n")
    for result in results['text_results']:
        doc = result["document"]
        print(f"[{result['rank']}] BM25 Score: {result['score']:.4f} | {doc.metadata.get('type', 'N/A')}")
        print(f" π {doc.metadata.get('source')} - Page {doc.metadata.get('page')}")
        print(f" π {doc.metadata.get('heading', 'N/A')[:60]}")
        # Show chunk position only for multi-chunk sections.
        if 'total_chunks' in doc.metadata and doc.metadata.get('total_chunks', 1) > 1:
            print(f" π Chunk {doc.metadata.get('chunk_index', 0)+1}/{doc.metadata.get('total_chunks')}")
        # Preview only the first 200 characters of the chunk.
        print(f" π {doc.page_content[:200]}...")
        print()
    print("\nπΌοΈ IMAGES:\n")
    if results['images']:
        for img in results['images']:
            print(f"[{img['rank']}] {img['source']} - Page {img['page']}")
            print(f" {img['path']}\n")
    else:
        print(" No images found\n")
| # ======================================== | |
| # π HYBRID SEARCH IMPLEMENTATION | |
| # ======================================== | |
def normalize_scores(scores: np.ndarray) -> np.ndarray:
    """Min-max scale scores into the [0, 1] range.

    An empty array is returned unchanged; a constant array maps to all
    ones (identical scores are treated as equally relevant).
    """
    if len(scores) == 0:
        return scores
    lo, hi = np.min(scores), np.max(scores)
    if hi == lo:
        return np.ones_like(scores)
    return (scores - lo) / (hi - lo)
def query_with_hybrid(query: str, embedding_model: VLM2VecEmbeddings,
                      k_text: int = 5, k_images: int = 3,
                      dense_weight: float = DENSE_WEIGHT,
                      sparse_weight: float = SPARSE_WEIGHT):
    """
    Hybrid search combining semantic (FAISS) and keyword (BM25) retrieval.

    Args:
        query: Raw user query.
        embedding_model: Model used to embed the (preprocessed) query.
        k_text: Number of text chunks to return after score fusion.
        k_images: Maximum number of page images to attach to the results.
        dense_weight: Fusion weight applied to normalized semantic scores.
        sparse_weight: Fusion weight applied to normalized BM25 scores.

    Returns:
        Dict with keys: text_results, images, query, processed_query, method.
    """
    processed_query = preprocess_query(query)
    print(f" π Processed: {processed_query}")
    with open(f"{FAISS_INDEX_PATH}/text_documents.pkl", "rb") as f:
        text_docs = pickle.load(f)
    # SEMANTIC SEARCH — best-effort: on failure we fall back to BM25 only.
    print(f" π§ Running semantic search...")
    semantic_lookup = {}  # doc index -> normalized semantic score (filled after fusion)
    try:
        text_index = faiss.read_index(f"{FAISS_INDEX_PATH}/text_index.faiss")
        query_embedding = embedding_model.embed_query(processed_query)
        query_np = np.array([query_embedding]).astype('float32')
        faiss.normalize_L2(query_np)
        # Over-retrieve 3x so fusion has candidates beyond the final top-k.
        k_retrieve = min(k_text * 3, len(text_docs))
        distances, indices = text_index.search(query_np, k_retrieve)
        semantic_scores = distances[0]
        semantic_indices = indices[0]
        print(f" β Retrieved {len(semantic_indices)} semantic results")
    except Exception as e:
        print(f" β οΈ Semantic search failed: {e}")
        semantic_scores = np.array([])
        semantic_indices = np.array([])
    # BM25 SEARCH
    print(f" π€ Running BM25 keyword search...")
    bm25_index = _load_or_build_bm25(text_docs)
    tokenized_query = processed_query.lower().split()
    bm25_scores_all = bm25_index.get_scores(tokenized_query)
    print(f" β Scored {len(bm25_scores_all)} documents")
    # SCORE FUSION — weighted sum of min-max-normalized score lists.
    print(f" βοΈ Fusing scores (semantic: {dense_weight}, BM25: {sparse_weight})...")
    combined_scores = {}
    if len(semantic_scores) > 0:
        semantic_scores_norm = normalize_scores(semantic_scores)
        for idx, score in zip(semantic_indices, semantic_scores_norm):
            if idx < len(text_docs):
                # Keep an O(1) lookup table instead of np.where per result.
                semantic_lookup[int(idx)] = float(score)
                combined_scores[int(idx)] = dense_weight * float(score)
    bm25_scores_norm = normalize_scores(bm25_scores_all)
    for idx, score in enumerate(bm25_scores_norm):
        combined_scores[idx] = combined_scores.get(idx, 0.0) + sparse_weight * float(score)
    top_indices = sorted(combined_scores,
                         key=lambda x: combined_scores[x],
                         reverse=True)[:k_text]
    print(f" β Top {len(top_indices)} results selected")
    # PREPARE RESULTS
    text_results = []
    relevant_pages = set()
    for rank, idx in enumerate(top_indices, 1):
        doc = text_docs[idx]
        text_results.append({
            "document": doc,
            "score": combined_scores[idx],
            # 0.0 when the chunk was outside the semantic top-k (or semantic failed).
            "semantic_score": semantic_lookup.get(idx, 0.0),
            "bm25_score": float(bm25_scores_norm[idx]),
            "rank": rank,
            "type": doc.metadata.get('type', 'text')
        })
        relevant_pages.add((doc.metadata.get('source'), doc.metadata.get('page')))
    relevant_images = _collect_page_images(relevant_pages, k_images)
    return {
        "text_results": text_results,
        "images": relevant_images,
        "query": query,
        "processed_query": processed_query,
        "method": "hybrid"
    }

def _load_or_build_bm25(text_docs):
    """Load the cached BM25 index; rebuild AND persist it if the cache is missing.

    Bug fix: the original rebuilt the index on every query after a cache miss
    but never saved it — persisting mirrors the __main__ block's behavior.
    """
    bm25_path = f"{FAISS_INDEX_PATH}/bm25_index.pkl"
    try:
        with open(bm25_path, "rb") as f:
            return pickle.load(f)
    except FileNotFoundError:
        tokenized_docs = [doc.page_content.lower().split() for doc in text_docs]
        bm25_index = BM25Okapi(tokenized_docs, k1=1.3, b=0.65)
        with open(bm25_path, "wb") as f:
            pickle.dump(bm25_index, f)
        return bm25_index

def _collect_page_images(relevant_pages: set, k_images: int):
    """Return up to k_images extracted images whose (source, page) matches a retrieved chunk.

    Best-effort: any failure is reported (not silently swallowed as before)
    and an empty/partial list is returned.
    """
    relevant_images = []
    image_docs_path = f"{FAISS_INDEX_PATH}/image_documents.pkl"
    try:
        if os.path.exists(image_docs_path):
            with open(image_docs_path, "rb") as f:
                image_docs = pickle.load(f)
            for img_doc in image_docs:
                if len(relevant_images) >= k_images:
                    break
                if (img_doc['source'], img_doc['page']) in relevant_pages:
                    relevant_images.append({
                        "path": img_doc['path'],
                        "source": img_doc['source'],
                        "page": img_doc['page'],
                        "type": "image",
                        "score": 0.0,
                        "rank": len(relevant_images) + 1,
                        "from_page": True
                    })
    except Exception as e:
        print(f" β οΈ Image lookup failed: {e}")
    return relevant_images
def display_results_hybrid(results: Dict):
    """Pretty-print the text hits and attached images of a hybrid search run."""
    print("\nπ TOP RESULTS (Hybrid Search: Semantic + BM25):\n")
    for hit in results['text_results']:
        doc = hit["document"]
        meta = doc.metadata
        print(f"[{hit['rank']}] Combined: {hit['score']:.4f} "
              f"(Semantic: {hit['semantic_score']:.4f}, BM25: {hit['bm25_score']:.4f}) "
              f"| {meta.get('type', 'N/A')}")
        print(f" π {meta.get('source')} - Page {meta.get('page')}")
        print(f" π {meta.get('heading', 'N/A')[:60]}")
        # Only multi-chunk sections get a chunk position line.
        if 'total_chunks' in meta and meta.get('total_chunks', 1) > 1:
            print(f" π Chunk {meta.get('chunk_index', 0)+1}/{meta.get('total_chunks')}")
        print(f" π {doc.page_content[:200]}...")
        print()
    print("\nπΌοΈ IMAGES:\n")
    if not results['images']:
        print(" No images found\n")
    else:
        for img in results['images']:
            print(f"[{img['rank']}] {img['source']} - Page {img['page']}")
            print(f" {img['path']}\n")
| # ======================================== | |
| # π GET CONTEXT WITH PARENTS | |
| # ======================================== | |
def get_context_with_parents(results: Dict) -> List[Dict]:
    """Extract deduplicated contexts for answer generation.

    Chunks carrying a 'parent_text' are expanded to their full parent section
    (emitted once per distinct parent); chunks without one fall back to their
    own page_content.

    Args:
        results: Output of a query_* function; must contain 'text_results'
            entries each holding a 'document'.

    Returns:
        List of dicts with text/source/page/heading/type/is_parent keys.
    """
    seen_parents = set()
    contexts = []
    for result in results['text_results']:
        doc = result['document']
        parent = doc.metadata.get('parent_text')
        if parent and parent in seen_parents:
            # Another chunk of an already-collected section: skip duplicate.
            continue
        if parent:
            seen_parents.add(parent)
        contexts.append({
            "text": parent or doc.page_content,
            # .get() instead of [] so sparse metadata cannot crash context
            # assembly (consistent with display_results_hybrid).
            "source": doc.metadata.get('source'),
            "page": doc.metadata.get('page'),
            "heading": doc.metadata.get('heading', 'N/A'),
            "type": doc.metadata.get('type', 'text'),
            "is_parent": bool(parent)
        })
    return contexts
| # ======================================== | |
| # π MAIN EXECUTION (UPDATED FOR HYBRID) | |
| # ======================================== | |
if __name__ == "__main__":
    print("="*70)
    print("π RAG with HYBRID SEARCH (Semantic + BM25)")
    print("="*70 + "\n")
    pdf_files = glob.glob(f"{PDF_DIR}/*.pdf")
    print(f"π Found {len(pdf_files)} PDF files\n")
    if len(pdf_files) == 0:
        print("β No PDFs found!")
        raise SystemExit(1)
    # Load the embedding model exactly once. (Bug fix: the original
    # re-instantiated VLM2VecEmbeddings inside the index-building branch,
    # loading the 2B model twice.)
    print("\nπ€ Loading VLM2Vec model...")
    embedding_model = VLM2VecEmbeddings(
        model_name="TIGER-Lab/VLM2Vec-Qwen2VL-2B",
        cache_dir=MODEL_CACHE_DIR
    )
    # Load or build index
    if os.path.exists(f"{FAISS_INDEX_PATH}/text_index.faiss"):
        print(f"β Loading existing index\n")
        # Older indexes may predate hybrid search; backfill the BM25 side.
        if not os.path.exists(f"{FAISS_INDEX_PATH}/bm25_index.pkl"):
            print("β οΈ BM25 index missing, building now...")
            with open(f"{FAISS_INDEX_PATH}/text_documents.pkl", "rb") as f:
                all_texts = pickle.load(f)
            print(" Building BM25 index...")
            tokenized_docs = [doc.page_content.lower().split() for doc in all_texts]
            bm25_index = BM25Okapi(tokenized_docs, k1=1.3, b=0.65)
            with open(f"{FAISS_INDEX_PATH}/bm25_index.pkl", "wb") as f:
                pickle.dump(bm25_index, f)
            print(" β BM25 index saved\n")
    else:
        print("π¨ Building new index...\n")
        index, documents = build_multimodal_faiss_streaming(pdf_files, embedding_model)
        if index is None:
            raise SystemExit(0)
    # Interactive testing
    print("="*70)
    print("π§ͺ TESTING MODE - HYBRID SEARCH")
    print(f" Weights: Semantic {DENSE_WEIGHT} | BM25 {SPARSE_WEIGHT}")
    print("="*70 + "\n")
    test_queries = [
        "What is the higher and lower explosive limit of butane?",
        "What are the precautions taken while handling H2S?",
        "What are the Personal Protection used for Sulfolane?",
        "What is the Composition of Platforming Feed and Product?",
        "Explain Dual function platforming catalyst chemistry.",
        "Steps to be followed in Amine Regeneration Unit for normal shutdown process.",
        "Could you tell me what De-greasing of Amine System in pre startup wash",
    ]
    print("π SUGGESTED QUERIES:")
    for i, q in enumerate(test_queries, 1):
        print(f" {i}. {q}")
    print()
    print("π‘ Type 'mode' to switch between hybrid/bm25/semantic")
    print()
    current_mode = "hybrid"
    while True:
        # Bug fix: prompt previously said "1-5" while 7 queries are offered.
        user_query = input(f"π¬ Query [{current_mode}] (or 1-{len(test_queries)}, 'mode', or 'exit'): ").strip()
        if user_query.lower() == 'exit':
            print("\nβ Done!")
            break
        if user_query.lower() == 'mode':
            print("\nπ Select mode:")
            print(" 1. Hybrid (Semantic + BM25)")
            print(" 2. BM25 only")
            print(" 3. Semantic only")
            mode_choice = input(" Choice (1-3): ").strip()
            if mode_choice == '1':
                current_mode = "hybrid"
            elif mode_choice == '2':
                current_mode = "bm25"
            elif mode_choice == '3':
                current_mode = "semantic"
            print(f" β Mode set to: {current_mode}\n")
            continue
        # A bare number selects one of the suggested queries.
        if user_query.isdigit() and 1 <= int(user_query) <= len(test_queries):
            user_query = test_queries[int(user_query) - 1]
        if not user_query:
            continue
        print(f"\n{'='*60}")
        print(f"π Query: {user_query}")
        print(f"π§ Mode: {current_mode.upper()}")
        print(f"{'='*60}\n")
        try:
            if current_mode == "hybrid":
                results = query_with_hybrid(user_query, embedding_model, k_text=5, k_images=3)
                display_results_hybrid(results)
            elif current_mode == "bm25":
                results = query_with_bm25(user_query, k_text=5, k_images=3)
                display_results_bm25(results)
            else:  # semantic only: hybrid pipeline with the BM25 weight zeroed
                results = query_with_hybrid(user_query, embedding_model, k_text=5, k_images=3,
                                            dense_weight=1.0, sparse_weight=0.0)
                display_results_hybrid(results)
            print("\nπ FULL CONTEXT:\n")
            contexts = get_context_with_parents(results)
            for i, ctx in enumerate(contexts[:3], 1):
                print(f"[{i}] {ctx['heading'][:50]}")
                if ctx['is_parent']:
                    print(f" β Full section")
                print(f" {ctx['text'][:300]}...\n")
            print("="*60 + "\n")
        except Exception as e:
            print(f"\nβ Error: {e}\n")
            import traceback
            traceback.print_exc()