# archive-explorer / ingest_visual.py
# AKMESSI's picture
# initial commit
# d0a567e
import os
import lancedb
import fitz # PyMuPDF (The replacement for Poppler)
from sentence_transformers import SentenceTransformer
from lancedb.pydantic import LanceModel, Vector
from PIL import Image
import warnings
# Suppress warnings: this installs a blanket "ignore" filter, so every Python
# warning raised after this point in the process is hidden.
warnings.filterwarnings("ignore")
# --- CONFIGURATION ---
# Root directory that main() walks recursively looking for PDF files.
DATA_DIR = "data"
# Location handed to lancedb.connect(); note that it sits inside DATA_DIR.
VECTOR_DB_DIR = "data/lancedb"
# The model is created at module level, so it is loaded as soon as this file
# is imported or run, before main() is called.
print("Loading CLIP Model (Visual Intelligence)...")
model = SentenceTransformer('clip-ViT-B-32')
# Connect to DB (also happens at module level, right after the model load).
ldb = lancedb.connect(VECTOR_DB_DIR)
class VisualSchema(LanceModel):
    """Row schema for the "visuals" table: one record per rendered PDF page."""

    # Image embedding of the rendered page; fixed at 512 dimensions.
    vector: Vector(512)
    # Base name of the source PDF (os.path.basename of the path).
    filename: str
    # 1-based page number within the PDF.
    page: int
    # Path of the source PDF exactly as it was passed to the ingester.
    filepath: str
# Create or open the table.
# open_table() raises when the "visuals" table does not exist yet; the exact
# exception type differs between LanceDB releases, so catch Exception rather
# than a specific class. A bare `except:` is avoided on purpose: it would also
# trap KeyboardInterrupt / SystemExit and turn Ctrl-C into a table creation.
try:
    tbl = ldb.open_table("visuals")
except Exception:
    tbl = ldb.create_table("visuals", schema=VisualSchema)
def process_pdf_visuals(filepath, batch_size=10):
    """Embed every page of one PDF as an image and store the vectors.

    Each page is rendered with PyMuPDF, converted to a PIL image, encoded
    with the module-level ``model`` and appended to the module-level ``tbl``
    in batches.

    Args:
        filepath: Path of the PDF to ingest. It is stored verbatim in the
            ``filepath`` column; its base name goes into ``filename``.
        batch_size: Number of page records to accumulate before writing
            them to the table. Defaults to 10.

    Returns:
        None. Failures are reported on stdout instead of being raised:
        a page that cannot be rendered or encoded is skipped, and any other
        error (unreadable PDF, failed table write) abandons the whole file.
    """
    filename = os.path.basename(filepath)
    print(f"👀 Scanning visuals: {filename}...")
    try:
        # OPEN PDF WITH PYMUPDF (No Poppler needed).
        # The context manager closes the document even if a page or a table
        # write fails part-way through.
        with fitz.open(filepath) as doc:
            pending = []
            for page_number, page in enumerate(doc, start=1):
                try:
                    # Render the page to an RGB pixmap at half of PyMuPDF's
                    # default 72 DPI (about 36 DPI) to keep encoding fast.
                    pix = page.get_pixmap(matrix=fitz.Matrix(0.5, 0.5))
                    # Convert to PIL Image
                    img = Image.frombytes(
                        "RGB", (pix.width, pix.height), pix.samples
                    )
                    # VISUAL EMBEDDING
                    vector = model.encode(img)
                except Exception as e:
                    # Only rendering/encoding problems are treated as
                    # per-page failures; report them instead of dropping
                    # the page silently.
                    print(f"Skipping page {page_number} of {filename}: {e}")
                    continue

                pending.append({
                    "vector": vector,
                    "filename": filename,
                    "page": page_number,
                    "filepath": filepath,
                })
                # Table writes sit outside the per-page try block so that a
                # database error is not mistaken for a bad page and retried
                # with an ever-growing batch.
                if len(pending) >= batch_size:
                    tbl.add(pending)
                    pending = []

            # Flush whatever is left over from the last partial batch.
            if pending:
                tbl.add(pending)
    except Exception as e:
        print(f"Skipping {filename}: {e}")
def main():
    """Walk DATA_DIR recursively and ingest the visuals of every PDF found.

    A file counts as a PDF when its name ends with ".pdf", compared
    case-insensitively.
    """
    print("Starting Visual Ingestion...")
    for folder, _subfolders, names in os.walk(DATA_DIR):
        # Collect this folder's PDF paths first, then process them in the
        # order os.walk listed them.
        pdf_paths = [
            os.path.join(folder, name)
            for name in names
            if name.lower().endswith('.pdf')
        ]
        for pdf_path in pdf_paths:
            process_pdf_visuals(pdf_path)
    print("Visual Indexing Complete!")
# Run the ingestion only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()