# RAG_APP/src/scripts/3_preprocess_titlecat_titledate_embeddings.py
# sxid003's picture
# Upload 83 files
# 3107242 verified
import logging
import os
from src.configs.config import LOG_DIR
from src.utils.search_docs_utils import preprocess_and_save_documents, preprocess_and_save_parlement
# Set up logging to a file for tracking the preprocessing steps.
LOG_FILE = os.path.join(LOG_DIR, "preprocess_titlecat_titledate_embeddings.log")

# logging.basicConfig(filename=...) opens the log file right away and raises
# FileNotFoundError when its parent directory is missing, so make sure the
# directory exists first. The `or "."` covers an empty LOG_DIR, for which
# os.makedirs("") would itself fail.
os.makedirs(os.path.dirname(LOG_FILE) or ".", exist_ok=True)

logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
def run():
    """
    Run the preprocessing pipeline for both document and parliament transcript embeddings.

    Steps, executed in order:
    - First, preprocess and embed the title+category of legal documents
      (``preprocess_and_save_documents``).
    - Then, preprocess and embed the title+date of parliament transcripts
      (``preprocess_and_save_parlement``).

    The start of each step, the final completion, and any failure (with its
    traceback) are logged through the root logger for traceability.

    Raises:
        Exception: Any exception raised by a step is logged and then re-raised
            unchanged; the remaining steps are not executed.
    """
    steps = (
        (
            "1/2 Starting preprocessing and embedding for documents (title+category).",
            "documents (title+category)",
            preprocess_and_save_documents,
        ),
        (
            "2/2 Starting preprocessing and embedding for parliament transcripts (title+date).",
            "parliament transcripts (title+date)",
            preprocess_and_save_parlement,
        ),
    )

    for start_message, label, step in steps:
        logging.info(start_message)
        try:
            step()
        except Exception:
            # Record the failure and traceback in the log before propagating,
            # otherwise the log would simply stop after the "Starting" line.
            logging.exception("Preprocessing and embedding failed for %s.", label)
            raise

    logging.info("Preprocessing and embedding completed.")
if __name__ == "__main__":
    # Executed as a script (not imported): run the whole preprocessing pipeline.
    run()