Spaces:

AKMESSI
/

archive-explorer

Sleeping

App Files Files Community

archive-explorer / ingest.py

AKMESSI

initial commit

d0a567e about 2 months ago

raw

history blame contribute delete

4.74 kB

	import os
	import sqlite3
	import lancedb
	import PyPDF2
	from sentence_transformers import SentenceTransformer
	from lancedb.pydantic import LanceModel, Vector
	import warnings

	# Silence every Python warning for the rest of the run; the filter is not
	# narrowed by category or module, so deprecation notices are hidden as well.
	warnings.filterwarnings("ignore")

	# CONFIGURATION
	# DATA_DIR: root folder that main() walks recursively looking for PDFs.
	DATA_DIR = "data"
	# DB_NAME: SQLite database file (relative path) passed to sqlite3.connect().
	DB_NAME = "epstein.db"
	# VECTOR_DB_DIR: location handed to lancedb.connect(). It sits inside
	# DATA_DIR, so the directory walk in main() also passes through it; only
	# files ending in ".pdf" are picked up there.
	VECTOR_DB_DIR = "data/lancedb"

	# Printed before the module-level database/model setup that follows; that
	# setup runs at import time, before main() is called.
	print("Initializing models and databases...")

	# 1. Setup SQLite (For Keyword Search)
	# Module-level connection and cursor; process_pdf() inserts through `cursor`
	# and commits through `conn`.
	conn = sqlite3.connect(DB_NAME)
	cursor = conn.cursor()

	# Create main table and FTS (Full Text Search) virtual table
	# `pages` receives one row per PDF page that process_pdf() keeps.
	cursor.execute("""
	CREATE TABLE IF NOT EXISTS pages (
	id INTEGER PRIMARY KEY AUTOINCREMENT,
	filename TEXT,
	filepath TEXT,
	page_number INTEGER,
	text_content TEXT
	)
	""")
	# `pages_fts` is an FTS5 index over filename and text_content. It is declared
	# with content='pages' / content_rowid='id', i.e. it refers back to `pages`
	# rows by their id.
	cursor.execute("""
	CREATE VIRTUAL TABLE IF NOT EXISTS pages_fts USING fts5(
	filename,
	text_content,
	content='pages',
	content_rowid='id'
	)
	""")
	# Keep the FTS index in step with `pages`: every inserted row is mirrored into
	# `pages_fts`. Only this AFTER INSERT trigger is created here; no UPDATE or
	# DELETE trigger is defined in this script.
	cursor.execute("""
	CREATE TRIGGER IF NOT EXISTS pages_ai AFTER INSERT ON pages BEGIN
	INSERT INTO pages_fts(rowid, filename, text_content) VALUES (new.id, new.filename, new.text_content);
	END;
	""")
	# Persist the schema before any ingestion starts.
	conn.commit()

	# 2. Setup LanceDB (For Vector/AI Search)
	# Embedding model used by process_pdf() to encode text chunks into vectors.
	model = SentenceTransformer('all-MiniLM-L6-v2')
	# Connection to the LanceDB database located at VECTOR_DB_DIR.
	ldb = lancedb.connect(VECTOR_DB_DIR)

	# --- THE FIX: Use Pydantic to define the Schema ---
	class PageSchema(LanceModel):
	    """Row layout of the LanceDB "pages" table: one record per text chunk.

	    The field names match the dictionary keys that process_pdf() builds
	    before calling tbl.add().
	    """

	    # Embedding of `text` produced by the sentence-transformer model.
	    vector: Vector(384) # 384 is the dimension of all-MiniLM-L6-v2
	    # The chunk of page text that was embedded.
	    text: str
	    # Base name of the source PDF.
	    filename: str
	    # 1-based page number within the source PDF.
	    page_number: int
	    # Path of the source PDF as passed to process_pdf().
	    filepath: str

	# Create or Open the table using the Class Schema.
	# Try to open an existing "pages" table first so earlier ingests are kept;
	# fall back to creating it from PageSchema when opening fails.
	try:
	    tbl = ldb.open_table("pages")
	except Exception:
	    # Catch Exception rather than using a bare `except:` so Ctrl-C
	    # (KeyboardInterrupt) and SystemExit are not swallowed and mistaken
	    # for "table does not exist".
	    tbl = ldb.create_table("pages", schema=PageSchema)
	# --------------------------------------------------

	def chunk_text(text, chunk_size=500):
	    """Break page text into word-aligned chunks for vector search.

	    The text is split on whitespace and words are accumulated until their
	    running size (each word counted as its length plus one separator)
	    reaches ``chunk_size``; the accumulated words are then emitted as one
	    space-joined chunk. Words are never split, so a chunk can run slightly
	    past ``chunk_size``.

	    Args:
	        text: The text to split.
	        chunk_size: Approximate target size of each chunk, in characters.

	    Returns:
	        A list of chunk strings; empty when ``text`` contains no words.
	    """
	    pieces = []
	    pending = []
	    used = 0

	    for token in text.split():
	        pending.append(token)
	        used += len(token) + 1
	        if used < chunk_size:
	            continue
	        # Budget reached: flush the pending words and start a new chunk.
	        pieces.append(" ".join(pending))
	        pending, used = [], 0

	    # Whatever is left over forms the final, shorter chunk.
	    if pending:
	        pieces.append(" ".join(pending))
	    return pieces

	def process_pdf(filepath):
	    """Ingest one PDF into both the SQLite table and the LanceDB table.

	    For each page whose extracted text has at least 50 characters after
	    stripping, the page text (with null bytes removed) is inserted into the
	    SQLite ``pages`` table, then split with chunk_text(), embedded with the
	    module-level ``model`` and added to the LanceDB table ``tbl``.

	    A failure on a single page is printed and that page is skipped. The
	    SQLite work is committed once after the last page. A failure while
	    opening or reading the file is printed and the function returns.

	    Args:
	        filepath: Path of the PDF file to ingest.
	    """
	    filename = os.path.basename(filepath)
	    print(f"Processing: {filename}...")

	    try:
	        with open(filepath, 'rb') as pdf_file:
	            reader = PyPDF2.PdfReader(pdf_file)
	            total_pages = len(reader.pages)

	            # page_no is the 1-based number stored in both databases.
	            for page_no in range(1, total_pages + 1):
	                try:
	                    # Page lookup stays inside the try so that one broken
	                    # page does not abort the rest of the file.
	                    raw_text = reader.pages[page_no - 1].extract_text()

	                    # Junk Filter: Skip pages with too little text
	                    if not raw_text or len(raw_text.strip()) < 50:
	                        continue

	                    # Remove null bytes
	                    clean_text = raw_text.replace('\x00', '')

	                    # A. Insert into SQLite (Keyword Search)
	                    cursor.execute(
	                        "INSERT INTO pages (filename, filepath, page_number, text_content) VALUES (?, ?, ?, ?)",
	                        (filename, filepath, page_no, clean_text),
	                    )

	                    # B. Insert into LanceDB (Vector Search)
	                    chunks = chunk_text(clean_text)
	                    embeddings = model.encode(chunks)

	                    records = [
	                        {
	                            "vector": embedding,
	                            "text": chunk,
	                            "filename": filename,
	                            "page_number": page_no,
	                            "filepath": filepath,
	                        }
	                        for chunk, embedding in zip(chunks, embeddings)
	                    ]

	                    if records:
	                        tbl.add(records)

	                except Exception as e:
	                    print(f" Error on page {page_no}: {e}")

	            # One commit per file, after every page has been attempted.
	            conn.commit()

	    except Exception as e:
	        print(f"Failed to read {filename}: {e}")

	def main():
	    """Walk DATA_DIR recursively and ingest every PDF found.

	    Each file whose name ends in ".pdf" (case-insensitive) is passed to
	    process_pdf(). The number of files handed over is reported at the end,
	    whether or not process_pdf() managed to read them.
	    """
	    print(f"Scanning directory: {DATA_DIR}")
	    handled = 0

	    for folder, _subdirs, names in os.walk(DATA_DIR):
	        pdf_names = [name for name in names if name.lower().endswith('.pdf')]
	        for name in pdf_names:
	            process_pdf(os.path.join(folder, name))
	            handled += 1

	    print(f"Done! Processed {handled} PDF files.")
	    print("Run 'python app.py' next to start the server.")

	# Start the ingest only when this file is executed as a script. Importing the
	# module does not call main(), although the module-level database and model
	# setup still runs on import.
	if __name__ == "__main__":
	    main()