# vedam_ai / ocr_loader.py
# vikramvasudevan's picture
# Upload folder using huggingface_hub
# 72cf8cd verified
import logging
from config import VedamConfig
from file_handler import page_text_generator
from db import VedamDatabase
from embeddings import get_embedding
# Module-level logger named after this module; its level is set to INFO here,
# so DEBUG records from this module are dropped unless the level is lowered.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class OcrLoader:
    """Load the OCR page text of one configured scripture into the database.

    The scripture is looked up by name in ``VedamConfig.scriptures``.
    ``load`` then streams its pages from ``page_text_generator``, embeds each
    page with ``get_embedding`` and writes the pages, batch by batch, through
    ``VedamDatabase.load``.
    """

    # Number of pages embedded and written per ``VedamDatabase.load`` call.
    BATCH_SIZE = 100

    def __init__(self, scripture_name: str) -> None:
        """Select the configuration entry whose ``"name"`` is *scripture_name*.

        Args:
            scripture_name: Value compared with the ``"name"`` key of every
                entry in ``VedamConfig.scriptures``; the first match is used.

        Raises:
            IndexError: If no configured scripture has that name.
        """
        matches = [
            scripture
            for scripture in VedamConfig.scriptures
            if scripture["name"] == scripture_name
        ]
        if not matches:
            # IndexError (not ValueError) on purpose: indexing an empty match
            # list raises IndexError, so existing handlers keep working while
            # the message now names the missing scripture.
            raise IndexError(
                f"No scripture named {scripture_name!r} found in "
                "VedamConfig.scriptures"
            )
        self.scripture_config = matches[0]
        self.db = VedamDatabase()

    def load(self) -> None:
        """Embed and store every OCR page of the scripture, in batches.

        Does nothing (apart from logging) when the database reports that the
        target collection already holds data. Otherwise pages are read from
        ``page_text_generator(output_dir=...)`` and written in groups of
        ``BATCH_SIZE``; a trailing partial group is written as the final batch.
        """
        # Read once into a local: avoids repeated lookups and keeps the log
        # calls free of quotes nested inside f-strings (invalid before 3.12).
        collection_name = self.scripture_config["collection_name"]

        if self.db.does_data_exist(collection_name=collection_name):
            logger.info("🚀 Data already exists in ChromaDB... Not loading now")
            return

        logger.info("🚀 Starting OCR file ingestion into ChromaDB...")
        pages = page_text_generator(output_dir=self.scripture_config["output_dir"])
        total_loaded = 0

        for batch in self._batched(pages, self.BATCH_SIZE):
            for page in batch:
                logger.debug(
                    "📄 Queued page %s for embedding and storage",
                    page["metadata"]["page"],
                )

            # Only the last batch can be short, so a short batch is the final one.
            is_final = len(batch) < self.BATCH_SIZE
            if is_final:
                logger.info(
                    "📦 Loading final batch of %d pages to %s in ChromaDB...",
                    len(batch),
                    collection_name,
                )
            else:
                logger.info(
                    "📦 Loading batch of %d pages to %s in ChromaDB...",
                    len(batch),
                    collection_name,
                )

            self._store_batch(collection_name, batch)
            total_loaded += len(batch)

            if is_final:
                logger.info("✅ Final total pages loaded: %d", total_loaded)
            else:
                logger.info("✅ Total loaded so far: %d", total_loaded)

        logger.info("🏁 OCR ingestion complete.")

    def _store_batch(self, collection_name: str, batch: list) -> None:
        """Embed each page in *batch* and write the batch to *collection_name*.

        Every element of *batch* must provide the keys ``"document"``,
        ``"id"`` and ``"metadata"``; the embedding is computed from
        ``"document"``.
        """
        self.db.load(
            collection_name=collection_name,
            documents=[page["document"] for page in batch],
            ids=[page["id"] for page in batch],
            metadatas=[page["metadata"] for page in batch],
            embeddings=[get_embedding(page["document"]) for page in batch],
        )

    @staticmethod
    def _batched(items, size: int):
        """Yield consecutive lists of up to *size* elements taken from *items*.

        *items* may be any iterable and is consumed lazily. Every yielded list
        holds exactly *size* elements except possibly the last one; nothing is
        yielded for an empty iterable. *size* is expected to be at least 1.
        """
        chunk = []
        for item in items:
            chunk.append(item)
            if len(chunk) >= size:
                yield chunk
                # Start a fresh list so the one just yielded is never mutated.
                chunk = []
        if chunk:
            yield chunk