# archive-explorer / ingest.py
# Initial commit d0a567e (avatar caption from the hosting page: "AKMESSI's picture")
import os
import sqlite3
import lancedb
import PyPDF2
from sentence_transformers import SentenceTransformer
from lancedb.pydantic import LanceModel, Vector
import warnings
# Silence every warning for the duration of the run.
warnings.filterwarnings(action="ignore")

# --- Configuration ---
# Root folder that main() walks looking for PDF files.
DATA_DIR = "data"
# SQLite database file used for keyword (FTS5) search.
DB_NAME = "epstein.db"
# Directory where LanceDB stores the vector table.
VECTOR_DB_DIR = "data/lancedb"

print("Initializing models and databases...")
# 1. Keyword-search store: a SQLite database with an FTS5 index over page text.
conn = sqlite3.connect(DB_NAME)
cursor = conn.cursor()

# Schema statements, executed in order:
#   1. `pages`      - one row per extracted PDF page.
#   2. `pages_fts`  - FTS5 virtual table that uses `pages` as its external
#                     content table (rowid mapped to pages.id).
#   3. `pages_ai`   - AFTER INSERT trigger that mirrors each new `pages` row
#                     into `pages_fts`.
_SCHEMA_STATEMENTS = (
    """
CREATE TABLE IF NOT EXISTS pages (
id INTEGER PRIMARY KEY AUTOINCREMENT,
filename TEXT,
filepath TEXT,
page_number INTEGER,
text_content TEXT
)
""",
    """
CREATE VIRTUAL TABLE IF NOT EXISTS pages_fts USING fts5(
filename,
text_content,
content='pages',
content_rowid='id'
)
""",
    """
CREATE TRIGGER IF NOT EXISTS pages_ai AFTER INSERT ON pages BEGIN
INSERT INTO pages_fts(rowid, filename, text_content) VALUES (new.id, new.filename, new.text_content);
END;
""",
)
for _statement in _SCHEMA_STATEMENTS:
    cursor.execute(_statement)
conn.commit()
# 2. Vector-search store: the sentence-embedding model and the LanceDB connection.
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")
ldb = lancedb.connect(uri=VECTOR_DB_DIR)
# --- THE FIX: Use Pydantic to define the Schema ---
# Row layout of the LanceDB "pages" table, declared as a LanceModel class.
# process_pdf() writes one row per text chunk using exactly these field names.
class PageSchema(LanceModel):
    vector: Vector(384) # 384 is the dimension of all-MiniLM-L6-v2
    # The chunk text that `vector` was computed from.
    text: str
    # Base name of the source PDF.
    filename: str
    # 1-based page number within the source PDF.
    page_number: int
    # Path of the source PDF as it was passed to process_pdf().
    filepath: str
# Open the existing "pages" table, or create it from PageSchema on first run.
# Catch `Exception` rather than using a bare `except:` so that Ctrl-C
# (KeyboardInterrupt) and SystemExit are not mistaken for "table missing".
# NOTE(review): the exact exception type open_table raises for a missing table
# is not pinned here; narrow this further if the installed LanceDB documents one.
try:
    tbl = ldb.open_table("pages")
except Exception:
    tbl = ldb.create_table("pages", schema=PageSchema)
# --------------------------------------------------
def chunk_text(text, chunk_size=500):
    """Group the whitespace-separated words of *text* into chunks.

    Each word is charged ``len(word) + 1`` characters. A chunk is closed as
    soon as its running charge reaches ``chunk_size``, so a chunk can run
    slightly past the limit (the word that crosses it is kept in the chunk).
    Words within a chunk are re-joined with single spaces.

    Args:
        text: The page text to split.
        chunk_size: Approximate character budget per chunk.

    Returns:
        A list of chunk strings, in order; empty when *text* has no words.
    """
    pieces = []
    pending = []
    used = 0
    for token in text.split():
        pending.append(token)
        used += 1 + len(token)
        if used < chunk_size:
            continue
        # Budget reached: close the current chunk and start a fresh one.
        pieces.append(" ".join(pending))
        pending, used = [], 0
    # Whatever is left over forms the final, shorter chunk.
    if pending:
        pieces.append(" ".join(pending))
    return pieces
def process_pdf(filepath):
    """Extract the text of one PDF and index it in both search stores.

    For every page with at least 50 characters of usable text:
      * one row is inserted into the SQLite ``pages`` table (keyword search);
      * the text is split with ``chunk_text`` and embedded with ``model``, and
        one row per chunk is queued for the LanceDB table (vector search).

    The two stores are kept in step per file: vector rows are embedded before
    the SQLite insert (so a page whose embedding fails is skipped in both),
    written to LanceDB in a single batch once the file has been read, and the
    SQLite transaction is committed only after that batch succeeds. If the
    batch write fails, the SQLite rows for this file are rolled back.

    Errors are reported with ``print`` and never raised to the caller.

    Args:
        filepath: Path to the PDF file to ingest.

    Returns:
        None.
    """
    filename = os.path.basename(filepath)
    print(f"Processing: {filename}...")

    # Vector rows for the whole file; flushed with one tbl.add() call instead
    # of one small append per page.
    vector_rows = []
    try:
        with open(filepath, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            num_pages = len(reader.pages)
            for i in range(num_pages):
                page_number = i + 1
                try:
                    # Index-based access stays inside this try so that a page
                    # which fails to load is skipped instead of aborting the file.
                    text = reader.pages[i].extract_text()
                    clean_text = (text or "").replace('\x00', '')  # Remove null bytes

                    # Junk filter: measured after null bytes are removed, so a
                    # page made mostly of null bytes does not slip through.
                    if len(clean_text.strip()) < 50:
                        continue

                    # Embed first: if this raises, neither store has been touched
                    # for this page.
                    chunks = chunk_text(clean_text)
                    vectors = model.encode(chunks)
                    page_rows = [
                        {
                            "vector": vector,
                            "text": chunk,
                            "filename": filename,
                            "page_number": page_number,
                            "filepath": filepath,
                        }
                        for chunk, vector in zip(chunks, vectors)
                    ]

                    # A. Insert into SQLite (Keyword Search)
                    cursor.execute(
                        "INSERT INTO pages (filename, filepath, page_number, text_content) VALUES (?, ?, ?, ?)",
                        (filename, filepath, page_number, clean_text)
                    )
                    # B. Queue for LanceDB (Vector Search)
                    vector_rows.extend(page_rows)
                except Exception as e:
                    print(f" Error on page {page_number}: {e}")
    except Exception as e:
        # Drop any uncommitted rows from this file so they are not committed
        # together with the next file.
        conn.rollback()
        print(f"Failed to read {filename}: {e}")
        return

    try:
        if vector_rows:
            tbl.add(vector_rows)
        conn.commit()
    except Exception as e:
        # Keep the keyword index consistent with the vector index.
        conn.rollback()
        print(f"Failed to index {filename}: {e}")
def main():
    """Walk DATA_DIR recursively and run process_pdf on every ``.pdf`` file.

    The extension check is case-insensitive. Every PDF handed to process_pdf
    is counted, and the total is printed at the end.
    """
    print(f"Scanning directory: {DATA_DIR}")
    pdf_count = 0
    for folder, _subdirs, names in os.walk(DATA_DIR):
        pdf_paths = [
            os.path.join(folder, name)
            for name in names
            if name.lower().endswith('.pdf')
        ]
        for pdf_path in pdf_paths:
            process_pdf(pdf_path)
            pdf_count += 1
    print(f"Done! Processed {pdf_count} PDF files.")
    print("Run 'python app.py' next to start the server.")


# Run the ingestion only when this file is executed as a script.
if __name__ == "__main__":
    main()