Spaces:

HOLOKIATEAM
/

RAG_APP

Sleeping

File size: 1,206 Bytes
import logging
import os
from src.configs.config import LOG_DIR
from src.utils.search_docs_utils import preprocess_and_save_documents, preprocess_and_save_parlement

# Set up logging to a file for tracking the preprocessing steps.
# The log directory is created first: logging.basicConfig(filename=...) opens
# the file immediately and raises FileNotFoundError when the directory is
# missing, which would abort the script at import time.
os.makedirs(LOG_DIR, exist_ok=True)
LOG_FILE = os.path.join(LOG_DIR, "preprocess_titlecat_titledate_embeddings.log")
# NOTE: basicConfig does nothing if the root logger already has handlers
# (e.g. when another module configured logging before this one was imported).
logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

def run() -> None:
    """
    Run the preprocessing pipeline for both document and parliament transcript embeddings.

    Steps, executed in order:
    1. ``preprocess_and_save_documents()`` -- per the log message, the
       title+category embeddings for documents.
    2. ``preprocess_and_save_parlement()`` -- per the log message, the
       title+date embeddings for parliament transcripts.

    Each step is announced through the root logger, and a final message is
    logged once both steps have finished.

    Raises:
        Exception: whatever either step raises. The traceback is written to
            the log first so the failure is traceable, then the original
            exception is re-raised unchanged.
    """
    try:
        logging.info("1/2 Starting preprocessing and embedding for documents (title+category).")
        preprocess_and_save_documents()
        logging.info("2/2 Starting preprocessing and embedding for parliament transcripts (title+date).")
        preprocess_and_save_parlement()
    except Exception:
        # Without this, a failing run leaves the log ending on a "Starting..."
        # line with no hint of what went wrong. Log the traceback, then
        # propagate so callers and the process exit status still see the error.
        logging.exception("Preprocessing and embedding failed.")
        raise
    logging.info("Preprocessing and embedding completed.")

if __name__ == "__main__":
    # Entry point: launch the full preprocessing pipeline only when this file
    # is executed as a script; merely importing the module does not call run().
    run()