File size: 2,470 Bytes
d0a567e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
import lancedb
import fitz # PyMuPDF (The replacement for Poppler)
from sentence_transformers import SentenceTransformer
from lancedb.pydantic import LanceModel, Vector
from PIL import Image
import warnings

# Silence every warning category for the whole process (blanket filter).
warnings.filterwarnings("ignore")

# --- CONFIGURATION ---
# Root directory that is walked recursively for PDF files.
DATA_DIR = "data"
# Directory handed to lancedb.connect(); note that it sits inside DATA_DIR.
VECTOR_DB_DIR = "data/lancedb"

# The model is loaded at import time, so merely importing this module
# already pays the load cost.
print("Loading CLIP Model (Visual Intelligence)...")
model = SentenceTransformer('clip-ViT-B-32')

# Module-level database handle for the store at VECTOR_DB_DIR.
ldb = lancedb.connect(VECTOR_DB_DIR)

class VisualSchema(LanceModel):
    """Row layout of the "visuals" table: one embedded PDF page per row."""

    # Embedding of the rendered page; the schema fixes it at 512 dimensions.
    vector: Vector(512)
    # Base name of the source PDF file.
    filename: str
    # 1-based page number within the PDF.
    page: int
    # Path of the PDF exactly as it was passed to the ingestion function.
    filepath: str

# Open the "visuals" table, creating it from VisualSchema on first run.
try:
    tbl = ldb.open_table("visuals")
except Exception:
    # Any failure to open is treated as "the table does not exist yet".
    # Catching `Exception` instead of using a bare `except` lets
    # KeyboardInterrupt and SystemExit propagate rather than triggering a
    # table creation.
    # NOTE(review): the exception type raised for a missing table is not
    # pinned here - narrow further once confirmed for the installed lancedb.
    tbl = ldb.create_table("visuals", schema=VisualSchema)

def process_pdf_visuals(filepath):
    """Embed every page of one PDF as an image and store the rows in ``tbl``.

    Each page is rendered with PyMuPDF, converted to a PIL image, encoded
    with the module-level ``model`` and written to the module-level ``tbl``
    in batches, one row per page.

    Args:
        filepath: Path to the PDF file. It is stored verbatim in the
            ``filepath`` column and its base name in ``filename``.

    Returns:
        None. Errors are reported on stdout instead of being raised: a page
        that cannot be rendered or encoded is skipped, while any other
        failure (opening the file, writing to the table) abandons the rest
        of this file.
    """
    filename = os.path.basename(filepath)
    print(f"👀 Scanning visuals: {filename}...")

    # Number of rows buffered before they are written to the table.
    batch_size = 10

    try:
        # OPEN PDF WITH PYMUPDF (No Poppler needed)
        doc = fitz.open(filepath)
        try:
            pending_rows = []

            for i, page in enumerate(doc):
                try:
                    # Matrix(0.5, 0.5) halves both dimensions relative to
                    # PyMuPDF's default rendering resolution (72 DPI per its
                    # documentation), trading detail for speed.
                    pix = page.get_pixmap(matrix=fitz.Matrix(0.5, 0.5))

                    # Convert the raw RGB samples to a PIL Image.
                    img = Image.frombytes(
                        "RGB", [pix.width, pix.height], pix.samples
                    )

                    # VISUAL EMBEDDING
                    vector = model.encode(img)
                except Exception as e:
                    # Best effort: one bad page must not lose the whole
                    # file, but say which page was dropped and why.
                    print(f"Skipping page {i + 1} of {filename}: {e}")
                    continue

                pending_rows.append({
                    "vector": vector,
                    "filename": filename,
                    "page": i + 1,
                    "filepath": filepath,
                })

                # Flush outside the per-page try so a database failure is
                # reported by the file-level handler instead of being
                # mistaken for a rendering problem.
                if len(pending_rows) >= batch_size:
                    tbl.add(pending_rows)
                    pending_rows = []

            if pending_rows:
                tbl.add(pending_rows)
        finally:
            # Release the document handle even when a page loop or table
            # write fails part-way through.
            doc.close()

    except Exception as e:
        print(f"Skipping {filename}: {e}")

def main():
    """Walk DATA_DIR recursively and ingest every PDF file found.

    A file counts as a PDF when its name ends in ``.pdf``, compared
    case-insensitively. Each match is passed to ``process_pdf_visuals``.
    """
    print("Starting Visual Ingestion...")
    for folder, _subdirs, names in os.walk(DATA_DIR):
        pdf_names = [name for name in names if name.lower().endswith('.pdf')]
        for pdf_name in pdf_names:
            pdf_path = os.path.join(folder, pdf_name)
            process_pdf_visuals(pdf_path)
    print("Visual Indexing Complete!")

# Run the ingestion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()