Spaces:
Sleeping
Sleeping
| import logging | |
| import os | |
| from src.configs.config import LOG_DIR | |
| from src.utils.search_docs_utils import preprocess_and_save_documents, preprocess_and_save_parlement | |
# Send INFO-and-above records to a dedicated log file so each preprocessing
# step leaves a timestamped trace.
LOG_FILE = os.path.join(LOG_DIR, "preprocess_titlecat_titledate_embeddings.log")

# basicConfig(filename=...) opens the file right away and raises
# FileNotFoundError when the parent directory is missing, so make sure it
# exists first. An empty LOG_DIR means "current directory": nothing to create.
if LOG_DIR:
    os.makedirs(LOG_DIR, exist_ok=True)

logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
def run():
    """
    Execute the two embedding preprocessing stages in order.

    Stage 1 calls ``preprocess_and_save_documents`` (title+category of
    documents); stage 2 calls ``preprocess_and_save_parlement`` (title+date of
    parliament transcripts). An INFO record is logged before each stage and
    once more after both stages have returned.
    """
    stages = (
        (
            "1/2 Starting preprocessing and embedding for documents (title+category).",
            preprocess_and_save_documents,
        ),
        (
            "2/2 Starting preprocessing and embedding for parliament transcripts (title+date).",
            preprocess_and_save_parlement,
        ),
    )
    for announcement, stage in stages:
        logging.info(announcement)
        stage()
    logging.info("Preprocessing and embedding completed.")
# Script entry point: run the whole pipeline only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    run()