Spaces:
Sleeping
Sleeping
| import os | |
| import lancedb | |
| import fitz # PyMuPDF (The replacement for Poppler) | |
| from sentence_transformers import SentenceTransformer | |
| from lancedb.pydantic import LanceModel, Vector | |
| from PIL import Image | |
| import warnings | |
# Suppress warnings: installs a blanket "ignore" filter for the whole process.
warnings.filterwarnings("ignore")
# --- CONFIGURATION ---
# Root folder that main() walks recursively looking for PDF files.
DATA_DIR = "data"
# Location passed to lancedb.connect() below.
VECTOR_DB_DIR = "data/lancedb"
# The model is loaded once at import time and reused for every page image.
print("Loading CLIP Model (Visual Intelligence)...")
model = SentenceTransformer('clip-ViT-B-32')
# Connect to DB
ldb = lancedb.connect(VECTOR_DB_DIR)
class VisualSchema(LanceModel):
    """Row layout of the "visuals" table: one embedded PDF page per row."""

    # Embedding of the rendered page image, declared with 512 dimensions.
    vector: Vector(512)
    # Base name of the source PDF file.
    filename: str
    # 1-based page number within the PDF.
    page: int
    # Path of the PDF exactly as it was passed to process_pdf_visuals().
    filepath: str
# Create or open the table.
# open_table() raises when "visuals" does not exist yet (the concrete
# exception type differs between LanceDB releases), in which case the table
# is created from VisualSchema. Catching Exception instead of using a bare
# `except:` keeps KeyboardInterrupt / SystemExit from being swallowed here.
try:
    tbl = ldb.open_table("visuals")
except Exception:
    tbl = ldb.create_table("visuals", schema=VisualSchema)
def process_pdf_visuals(filepath, *, batch_size=10, scale=0.5):
    """Embed every page of one PDF and append the rows to the ``tbl`` table.

    Each page is rendered to an image, encoded with the module-level
    ``model`` and queued as a row (``vector``, ``filename``, ``page``,
    ``filepath``). Rows are written to ``tbl`` in batches.

    Args:
        filepath: Path of the PDF to ingest.
        batch_size: Number of queued rows that triggers a write to ``tbl``.
            Defaults to 10, the value that used to be hard-coded.
        scale: Zoom factor applied on both axes when rendering a page.
            Defaults to 0.5, the value that used to be hard-coded.

    Returns:
        None. Failures are reported with ``print`` rather than raised: a
        failing page is skipped, and any other ``Exception`` abandons the
        rest of the file.
    """
    filename = os.path.basename(filepath)
    print(f"👀 Scanning visuals: {filename}...")
    try:
        # OPEN PDF WITH PYMUPDF (No Poppler needed)
        doc = fitz.open(filepath)
        try:
            pending_rows = []
            # PyMuPDF renders at 72 DPI for an identity matrix, so the default
            # scale of 0.5 yields roughly 36 DPI - small, but fast to encode.
            zoom = fitz.Matrix(scale, scale)
            for page_number, page in enumerate(doc, start=1):
                try:
                    # Render page to image. NOTE(review): relies on
                    # get_pixmap() defaulting to RGB without an alpha channel,
                    # which is what the "RGB" mode below expects - confirm
                    # against the installed PyMuPDF version.
                    pix = page.get_pixmap(matrix=zoom)
                    # Convert to PIL Image
                    img = Image.frombytes(
                        "RGB", (pix.width, pix.height), pix.samples
                    )
                    # VISUAL EMBEDDING
                    vector = model.encode(img)
                    pending_rows.append({
                        "vector": vector,
                        "filename": filename,
                        "page": page_number,
                        "filepath": filepath,
                    })
                    if len(pending_rows) >= batch_size:
                        tbl.add(pending_rows)
                        pending_rows = []
                except Exception as e:
                    # Best effort: one bad page must not abort the whole file,
                    # but say which page was dropped instead of hiding it.
                    print(f"Skipping page {page_number} of {filename}: {e}")
                    continue
            # Flush whatever is left over after the last full batch.
            if pending_rows:
                tbl.add(pending_rows)
        finally:
            # Release the document even when the loop or the final write fails.
            doc.close()
    except Exception as e:
        print(f"Skipping {filename}: {e}")
def main():
    """Walk ``DATA_DIR`` recursively and ingest every PDF found.

    A file counts as a PDF when its name ends with ``.pdf``, compared
    case-insensitively. Each match is handed to ``process_pdf_visuals``
    in the order ``os.walk`` yields it.
    """
    print("Starting Visual Ingestion...")
    for folder, _subdirs, names in os.walk(DATA_DIR):
        pdf_names = (name for name in names if name.lower().endswith('.pdf'))
        for pdf_name in pdf_names:
            pdf_path = os.path.join(folder, pdf_name)
            process_pdf_visuals(pdf_path)
    print("Visual Indexing Complete!")


# Run the ingestion only when executed as a script, not on import.
if __name__ == "__main__":
    main()