File size: 1,206 Bytes
3107242
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import logging
import os
from src.configs.config import LOG_DIR
from src.utils.search_docs_utils import preprocess_and_save_documents, preprocess_and_save_parlement

# Set up logging to a file for tracking the preprocessing steps.
# basicConfig(filename=...) opens the log file right away and fails with
# FileNotFoundError when the parent directory is missing, so create LOG_DIR
# first. The truthiness guard keeps an empty LOG_DIR (log file in the current
# working directory) working, since os.makedirs("") would raise.
if LOG_DIR:
    os.makedirs(LOG_DIR, exist_ok=True)
LOG_FILE = os.path.join(LOG_DIR, "preprocess_titlecat_titledate_embeddings.log")
logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

def run():
    """
    Run the two preprocessing steps in order, logging progress to the log file.

    Steps (descriptions taken from the log messages emitted below):
    1. ``preprocess_and_save_documents`` - documents (title+category).
    2. ``preprocess_and_save_parlement`` - parliament transcripts (title+date).

    A start message is logged before each step and a completion message once
    both have finished. If a step raises, the failure is written to the log
    with its traceback and the exception is re-raised unchanged, so the second
    step is not attempted after a failure in the first.

    Raises:
        Exception: whatever the failing preprocessing step raised.
    """
    steps = (
        (
            "1/2 Starting preprocessing and embedding for documents (title+category).",
            "documents (title+category)",
            preprocess_and_save_documents,
        ),
        (
            "2/2 Starting preprocessing and embedding for parliament transcripts (title+date).",
            "parliament transcripts (title+date)",
            preprocess_and_save_parlement,
        ),
    )
    for start_message, label, step in steps:
        logging.info(start_message)
        try:
            step()
        except Exception:
            # Without this the log file would simply stop after the start
            # message; record the traceback there, then let the error propagate.
            logging.exception("Preprocessing and embedding failed for %s.", label)
            raise
    logging.info("Preprocessing and embedding completed.")

if __name__ == "__main__":
    # Entry point: launch the full preprocessing pipeline only when this file
    # is executed as a script, not when it is imported as a module.
    run()