Spaces:

HOLOKIATEAM
/

RAG_APP

Sleeping

RAG_APP / src /scripts /3_preprocess_titlecat_titledate_embeddings.py

Upload 83 files

3107242 verified 6 months ago

1.21 kB

	import logging
	import os
	from src.configs.config import LOG_DIR
	from src.utils.search_docs_utils import preprocess_and_save_documents, preprocess_and_save_parlement

	# Set up file logging so each preprocessing step leaves a timestamped record.
	# logging.basicConfig(filename=...) opens the file immediately and fails with
	# FileNotFoundError when the parent directory is missing, so create it first.
	# The guard keeps an empty LOG_DIR (i.e. "current directory") working as before.
	if LOG_DIR:
	    os.makedirs(LOG_DIR, exist_ok=True)
	LOG_FILE = os.path.join(LOG_DIR, "preprocess_titlecat_titledate_embeddings.log")
	logging.basicConfig(
	    filename=LOG_FILE,
	    level=logging.INFO,
	    format="%(asctime)s - %(levelname)s - %(message)s",
	    datefmt="%Y-%m-%d %H:%M:%S",
	)

	def run():
	    """
	    Run the two preprocessing steps in order, logging progress to the log file.

	    Steps (as described by their log messages):
	    1. ``preprocess_and_save_documents`` - documents (title+category).
	    2. ``preprocess_and_save_parlement`` - parliament transcripts (title+date).

	    An INFO line is written before each step and once more after both have
	    finished. If a step raises, the traceback is written to the log and the
	    exception is re-raised unchanged, so the failure is visible both in the
	    log file and to the caller.

	    Raises:
	        Exception: whatever the failing preprocessing step raised.
	    """
	    steps = (
	        (
	            "1/2 Starting preprocessing and embedding for documents (title+category).",
	            "documents (title+category)",
	            preprocess_and_save_documents,
	        ),
	        (
	            "2/2 Starting preprocessing and embedding for parliament transcripts (title+date).",
	            "parliament transcripts (title+date)",
	            preprocess_and_save_parlement,
	        ),
	    )
	    for start_message, label, step in steps:
	        logging.info(start_message)
	        try:
	            step()
	        except Exception:
	            # Record the traceback in the log file before propagating; without
	            # this a failed run leaves no trace of what went wrong.
	            logging.exception("Preprocessing failed for %s.", label)
	            raise
	    logging.info("Preprocessing and embedding completed.")

	if __name__ == "__main__":
	    # Executed as a script (not imported): run both preprocessing steps.
	    run()