import logging
import os

from src.configs.config import LOG_DIR
from src.utils.search_docs_utils import (
    preprocess_and_save_documents,
    preprocess_and_save_parlement,
)

# Log file used to track the preprocessing steps.
LOG_FILE = os.path.join(LOG_DIR, "preprocess_titlecat_titledate_embeddings.log")

# basicConfig(filename=...) opens the log file right away, so the target
# directory has to exist before logging is configured; otherwise importing
# this module fails with FileNotFoundError on a fresh checkout.
_log_parent = os.path.dirname(LOG_FILE)
if _log_parent:  # an empty parent means "current directory": nothing to create
    os.makedirs(_log_parent, exist_ok=True)

logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def run() -> None:
    """Run the preprocessing pipeline for documents and parliament transcripts.

    The two steps run sequentially:

    1. ``preprocess_and_save_documents()`` -- preprocessing and embedding of
       the documents (title+category).
    2. ``preprocess_and_save_parlement()`` -- preprocessing and embedding of
       the parliament transcripts (title+date).

    The start of each step and the overall completion are written to
    ``LOG_FILE``.

    Raises:
        Exception: Whatever a step raises. The failure is logged with its
            traceback and then re-raised unchanged, so the second step does
            not run if the first one fails.
    """
    logging.info("1/2 Starting preprocessing and embedding for documents (title+category).")
    try:
        preprocess_and_save_documents()
    except Exception:
        # Keep the traceback in the log file, not only on stderr.
        logging.exception("Preprocessing and embedding for documents (title+category) failed.")
        raise

    logging.info("2/2 Starting preprocessing and embedding for parliament transcripts (title+date).")
    try:
        preprocess_and_save_parlement()
    except Exception:
        logging.exception(
            "Preprocessing and embedding for parliament transcripts (title+date) failed."
        )
        raise

    logging.info("Preprocessing and embedding completed.")


if __name__ == "__main__":
    # Entry point: launch the full preprocessing pipeline.
    run()