from typing import List, Any
from pydantic import BaseModel
from pathlib import Path
from backend.classes.pdf_extractor import PyMuPDFExtractor, PyMuPDFExtractorConfig
from backend.utils.utils import create_pdf_extractor
from backend.classes.pdf_extractor import BasePDFExtractorConfig


class DataPreparerConfig(BaseModel):
    """Settings consumed by ``DataPreparer``."""

    # Root folder that is searched recursively for ``*.pdf`` files.
    input_data_path: str
    # Folder that the generated JSONL files are written into.
    output_data_path: str
    # Output file name template; it is formatted with ``file_name=<pdf stem>``.
    output_file: str
    # NOTE(review): DataPreparer does not read this field in the code visible
    # here; it always builds a default PyMuPDFExtractorConfig - confirm intent.
    pdf_extractor: BasePDFExtractorConfig


class DataPreparer:
    """Extract data from every PDF under a folder and store it as JSONL files."""

    def __init__(self, config: DataPreparerConfig):
        """Copy the path settings from *config* and build the PDF extractor.

        Args:
            config: Paths and output file name template for the run.
        """
        self.config = config
        self.input_data_path = self.config.input_data_path
        self.output_data_path = self.config.output_data_path
        self.output_file = self.config.output_file
        # NOTE(review): ``config.pdf_extractor`` is ignored; a default
        # PyMuPDF configuration is always used. Left unchanged because the
        # contract of ``create_pdf_extractor`` is not visible from this file.
        self.pdf_extractor_config = PyMuPDFExtractorConfig()
        self.pdf_extractor = create_pdf_extractor(
            PyMuPDFExtractor, self.pdf_extractor_config
        )

    def get_pdf_files(self) -> List[Path]:
        """Return all ``*.pdf`` paths found recursively under the input folder.

        The result is sorted so that the processing order does not depend on
        the order in which the file system happens to list directory entries.
        """
        return sorted(Path(self.input_data_path).rglob("*.pdf"))

    def save_data_to_jsonl(self, data: List[Any], file_path: str) -> None:
        """Write *data* to *file_path*, one JSON document per line.

        Saving is best-effort: any error is printed and not re-raised, so a
        single failing file does not abort the caller's loop.

        Args:
            data: Entries to serialise; each one must provide
                ``model_dump_json()``.
            file_path: Destination file. Missing parent directories are
                created, and an existing file is overwritten.
        """
        try:
            # Create the destination folder first; without this every save
            # fails when the output directory has not been created yet.
            Path(file_path).parent.mkdir(parents=True, exist_ok=True)
            with open(file_path, "w", encoding="utf-8") as f:
                for entry in data:
                    f.write(entry.model_dump_json() + "\n")
        except Exception as e:
            print(f"Error saving data to file: {e}")

    def prepare_data(self) -> None:
        """Extract every PDF in the input folder and save one JSONL file each.

        The output name is ``self.output_file`` formatted with the PDF's stem
        (spaces replaced by underscores).
        """
        output_dir = Path(self.output_data_path)

        for pdf_file in self.get_pdf_files():
            # The extractor's return value is passed straight to
            # ``save_data_to_jsonl``, so it must be an iterable of entries
            # that provide ``model_dump_json()``.
            pdf_data = self.pdf_extractor.extract(pdf_file)

            # NOTE(review): PDFs with the same stem in different sub-folders
            # map to the same output name (assuming the template only uses
            # ``file_name``), so the later one overwrites the earlier one.
            file_name = pdf_file.stem.replace(" ", "_")
            output_file = self.output_file.format(file_name=file_name)

            self.save_data_to_jsonl(pdf_data, str(output_dir / output_file))