Spaces:
Runtime error
Runtime error
| from typing import List, Any | |
| from pydantic import BaseModel | |
| from pathlib import Path | |
| from backend.classes.pdf_extractor import PyMuPDFExtractor, PyMuPDFExtractorConfig | |
| from backend.utils.utils import create_pdf_extractor | |
| from backend.classes.pdf_extractor import BasePDFExtractorConfig | |
class DataPreparerConfig(BaseModel):
    """Settings consumed by ``DataPreparer``."""

    # Root folder that is searched recursively for ``*.pdf`` files.
    input_data_path: str

    # Folder the generated JSONL files are written into.
    output_data_path: str

    # Output file-name template; it is formatted with a ``file_name`` keyword
    # (the PDF stem with spaces replaced by underscores).
    output_file: str

    # Extractor settings.
    # NOTE(review): the DataPreparer class in this file does not read this
    # field; it always builds a default PyMuPDFExtractorConfig. Confirm intent.
    pdf_extractor: BasePDFExtractorConfig
class DataPreparer:
    """Extract every PDF found under an input folder into JSONL files.

    One JSONL file is written per PDF. The file name is derived from the
    ``output_file`` template and the PDF's stem.
    """

    def __init__(self, config: "DataPreparerConfig"):
        """Store the configuration and build the PDF extractor.

        Args:
            config: Input/output locations and the output file-name template.
        """
        self.config = config
        self.input_data_path = self.config.input_data_path
        self.output_data_path = self.config.output_data_path
        self.output_file = self.config.output_file
        # NOTE(review): ``config.pdf_extractor`` is not consulted here; a
        # default ``PyMuPDFExtractorConfig`` is always used. Confirm whether
        # the configured extractor settings should be honoured instead.
        self.pdf_extractor_config = PyMuPDFExtractorConfig()
        self.pdf_extractor = create_pdf_extractor(PyMuPDFExtractor, self.pdf_extractor_config)

    def get_pdf_files(self) -> List[Path]:
        """Return all ``*.pdf`` files below ``input_data_path``, recursively.

        Returns:
            The matching paths in sorted order, so repeated runs process the
            files in the same sequence. Directories whose name happens to end
            in ``.pdf`` are skipped.
        """
        candidates = Path(self.input_data_path).rglob("*.pdf")
        return sorted(path for path in candidates if path.is_file())

    def save_data_to_jsonl(self, data: List[Any], file_path: str):
        """Write ``data`` to ``file_path`` as JSON Lines, one entry per line.

        Saving is best-effort: any failure is reported on stdout and
        swallowed so that a batch run can carry on with the next file.

        Args:
            data: Entries exposing ``model_dump_json()``.
            file_path: Destination file; missing parent folders are created.
        """
        try:
            # Serialize before touching the destination so that a
            # serialization error cannot truncate an existing file.
            lines = [entry.model_dump_json() + "\n" for entry in data]
            target = Path(file_path)
            # The output folder may not exist yet on a first run.
            target.parent.mkdir(parents=True, exist_ok=True)
            with target.open("w", encoding="utf-8") as f:
                f.writelines(lines)
        except Exception as e:
            print(f"Error saving data to file: {e}")

    def prepare_data(self):
        """Extract each discovered PDF and save the result as a JSONL file."""
        output_dir = Path(self.output_data_path)
        for pdf_file in self.get_pdf_files():
            pdf_data = self.pdf_extractor.extract(pdf_file)
            # Spaces are replaced so the generated file name has no blanks.
            # NOTE(review): only the stem is used, so PDFs with the same name
            # in different sub-folders write to the same output file.
            file_name = pdf_file.stem.replace(" ", "_")
            output_file = self.output_file.format(file_name=file_name)
            self.save_data_to_jsonl(pdf_data, str(output_dir / output_file))