Spaces:
Runtime error
Runtime error
| import logging | |
| from config import VedamConfig | |
| from file_handler import page_text_generator | |
| from db import VedamDatabase | |
| from embeddings import get_embedding | |
# Module-level logger named after this module. Its threshold is set to INFO,
# so the info-level progress messages emitted below pass this logger, while
# debug-level messages are filtered out by it.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class OcrLoader:
    """Ingest OCR page text for one configured scripture into the database.

    The scripture's configuration entry is looked up by name in
    ``VedamConfig.scriptures``. ``load`` then streams pages from
    ``page_text_generator``, embeds each page's document text with
    ``get_embedding`` and writes the pages to the configured collection in
    batches of ``BATCH_SIZE``.
    """

    def __init__(self, scripture_name: str) -> None:
        """Select the configuration entry for ``scripture_name``.

        Args:
            scripture_name: Value matched against the ``"name"`` key of each
                entry in ``VedamConfig.scriptures``. If several entries
                match, the first one is used.

        Raises:
            IndexError: If no entry matches. The exception type is the same
                one the previous ``[...][0]`` lookup raised, so existing
                callers keep working; only the message is more descriptive.
        """
        matches = [
            scripture
            for scripture in VedamConfig.scriptures
            if scripture["name"] == scripture_name
        ]
        if not matches:
            raise IndexError(
                f"No scripture named {scripture_name!r} found in VedamConfig.scriptures"
            )
        self.scripture_config = matches[0]
        # Number of pages sent to the database per load call.
        self.BATCH_SIZE = 100
        self.db = VedamDatabase()

    def load(self) -> None:
        """Embed and store all OCR pages unless the collection already has data.

        Returns immediately (after logging) when
        ``self.db.does_data_exist`` reports existing data for the configured
        collection. Otherwise pages are accumulated and flushed every
        ``BATCH_SIZE`` items, followed by one final flush for any remainder.
        """
        # Hoisted into a local: avoids repeated dict lookups and, more
        # importantly, avoids reusing double quotes inside a double-quoted
        # f-string, which is a SyntaxError before Python 3.12.
        collection_name = self.scripture_config["collection_name"]

        # NOTE(review): the leading "π", "π¦" and "β" in the log messages look
        # like mis-decoded emoji; they are kept unchanged here - confirm the
        # intended characters against the original file.
        if self.db.does_data_exist(collection_name=collection_name):
            logger.info("π Data already exists in ChromaDB... Not loading now")
            return

        logger.info("π Starting OCR file ingestion into ChromaDB...")
        batch = []
        total_loaded = 0
        for item in page_text_generator(output_dir=self.scripture_config["output_dir"]):
            logger.debug(
                f"π Queued page {item['metadata']['page']} for embedding and storage"
            )
            batch.append(item)
            if len(batch) == self.BATCH_SIZE:
                logger.info(
                    f"π¦ Loading batch of {self.BATCH_SIZE} pages to {collection_name} in ChromaDB..."
                )
                self._store_batch(collection_name, batch)
                total_loaded += len(batch)
                logger.info(f"β Total loaded so far: {total_loaded}")
                batch = []

        # Flush whatever is left over after the last full batch.
        if batch:
            logger.info(
                f"π¦ Loading final batch of {len(batch)} pages to {collection_name} in ChromaDB..."
            )
            self._store_batch(collection_name, batch)
            total_loaded += len(batch)
            logger.info(f"β Final total pages loaded: {total_loaded}")

        logger.info("π OCR ingestion complete.")

    def _store_batch(self, collection_name: str, batch: list) -> None:
        """Embed each item's document text and write the batch to the database.

        Each item in ``batch`` is indexed by the keys ``"document"``,
        ``"id"`` and ``"metadata"``.
        """
        self.db.load(
            collection_name=collection_name,
            documents=[d["document"] for d in batch],
            ids=[d["id"] for d in batch],
            metadatas=[d["metadata"] for d in batch],
            embeddings=[get_embedding(d["document"]) for d in batch],
        )