"""Index PDF pages by their visual appearance.

Walks ``DATA_DIR`` for ``.pdf`` files, renders each page to a small RGB image
with PyMuPDF, embeds the image with a CLIP model, and appends one row per
page to the ``visuals`` LanceDB table.

Importing this module loads the CLIP model and opens (or creates) the table.
Rows are only ever appended, so re-running over the same files adds the same
pages again.
"""

import os
import warnings

import fitz  # PyMuPDF (The replacement for Poppler)
import lancedb
from lancedb.pydantic import LanceModel, Vector
from PIL import Image
from sentence_transformers import SentenceTransformer

# Suppress warnings
warnings.filterwarnings("ignore")

# --- CONFIGURATION ---
DATA_DIR = "data"
VECTOR_DB_DIR = "data/lancedb"
# Zoom factor applied when rasterising a page. PyMuPDF renders at 72 DPI for
# an identity matrix, so 0.5 yields roughly 36 DPI: small, fast thumbnails.
RENDER_SCALE = 0.5
# Number of page rows buffered in memory before they are written to the table.
BATCH_SIZE = 10

print("Loading CLIP Model (Visual Intelligence)...")
model = SentenceTransformer('clip-ViT-B-32')

# Connect to DB
ldb = lancedb.connect(VECTOR_DB_DIR)


class VisualSchema(LanceModel):
    """Row layout of the ``visuals`` table: one row per rendered PDF page."""

    # Image embedding produced by ``model.encode``; the table stores 512 dims.
    vector: Vector(512)
    # Base name of the source PDF.
    filename: str
    # 1-based page number within the PDF.
    page: int
    # Path to the PDF exactly as it was passed to ``process_pdf_visuals``.
    filepath: str


# Create or Open the table.
# ``except Exception`` rather than a bare ``except`` so Ctrl-C / SystemExit
# during start-up are not mistaken for "table does not exist".
try:
    tbl = ldb.open_table("visuals")
except Exception:
    tbl = ldb.create_table("visuals", schema=VisualSchema)


def process_pdf_visuals(filepath):
    """Embed every page of one PDF and append the rows to ``tbl``.

    Pages that cannot be rendered or embedded are reported and skipped; the
    remaining pages are still indexed. Any other failure (the file cannot be
    opened, a table write fails) is reported and ends processing of this file
    without raising, so a caller looping over many files keeps going. Rows
    already written before such a failure stay in the table.

    Args:
        filepath: Path to the PDF file to index.
    """
    filename = os.path.basename(filepath)
    print(f"👀 Scanning visuals: {filename}...")

    try:
        # OPEN PDF WITH PYMUPDF (No Poppler needed). The context manager
        # closes the document even when a later step raises.
        with fitz.open(filepath) as doc:
            pending = []

            for i, page in enumerate(doc):
                try:
                    # Render page to a down-scaled image (RGB, no alpha).
                    pix = page.get_pixmap(
                        matrix=fitz.Matrix(RENDER_SCALE, RENDER_SCALE)
                    )

                    # Convert to PIL Image
                    img = Image.frombytes(
                        "RGB", (pix.width, pix.height), pix.samples
                    )

                    # VISUAL EMBEDDING
                    vector = model.encode(img)
                except Exception as e:
                    # Best effort: one bad page must not drop the whole file,
                    # but say which page was lost and why.
                    print(f"  Skipping page {i + 1} of {filename}: {e}")
                    continue

                pending.append({
                    "vector": vector,
                    "filename": filename,
                    "page": i + 1,
                    "filepath": filepath,
                })

                # Flush outside the per-page ``try`` so a database write error
                # is reported by the outer handler instead of being mistaken
                # for a page rendering failure.
                if len(pending) >= BATCH_SIZE:
                    tbl.add(pending)
                    pending = []

            # Write whatever is left over from the last partial batch.
            if pending:
                tbl.add(pending)

    except Exception as e:
        print(f"Skipping {filename}: {e}")


def main():
    """Index every PDF found anywhere under ``DATA_DIR``."""
    print("Starting Visual Ingestion...")
    for root, _dirs, files in os.walk(DATA_DIR):
        for file in files:
            if file.lower().endswith('.pdf'):
                process_pdf_visuals(os.path.join(root, file))
    print("Visual Indexing Complete!")


if __name__ == "__main__":
    main()