File size: 2,107 Bytes
e68d535
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from typing import List, Any

from pydantic import BaseModel
from pathlib import Path

from backend.classes.pdf_extractor import PyMuPDFExtractor, PyMuPDFExtractorConfig
from backend.utils.utils import create_pdf_extractor

from backend.classes.pdf_extractor import BasePDFExtractorConfig


class DataPreparerConfig(BaseModel):
    """Settings consumed by ``DataPreparer``."""

    # Folder that DataPreparer searches recursively for ``*.pdf`` files.
    input_data_path: str
    # Directory under which DataPreparer places the generated output files.
    output_data_path: str
    # Output file name template; DataPreparer calls
    # ``output_file.format(file_name=...)`` on it, with the PDF's stem
    # (spaces replaced by underscores) as ``file_name``.
    output_file: str
    # Extractor configuration.
    # NOTE(review): DataPreparer.__init__ builds its own PyMuPDFExtractorConfig
    # and does not appear to read this field - confirm whether that is intended.
    pdf_extractor: BasePDFExtractorConfig


class DataPreparer:
    """Extract every PDF found under an input folder into per-PDF JSONL files.

    The input folder, output folder and output file name template are taken
    from the supplied ``DataPreparerConfig``.
    """

    def __init__(self, config: "DataPreparerConfig"):
        """Store the configuration and build the PDF extractor.

        Args:
            config: Paths and file name template used by this instance.
        """
        self.config = config
        self.input_data_path = self.config.input_data_path
        self.output_data_path = self.config.output_data_path
        self.output_file = self.config.output_file

        # NOTE(review): ``config.pdf_extractor`` is not consulted here; a
        # default PyMuPDF configuration is always used. Confirm this is intended.
        self.pdf_extractor_config = PyMuPDFExtractorConfig()
        self.pdf_extractor = create_pdf_extractor(PyMuPDFExtractor, self.pdf_extractor_config)

    def get_pdf_files(self) -> List[Path]:
        """Return all ``*.pdf`` files below ``input_data_path``, recursively.

        Returns:
            A sorted list of ``Path`` objects. Sorting keeps the processing
            order stable across runs and file systems; directories whose name
            happens to end in ``.pdf`` are skipped.
        """
        return sorted(
            path
            for path in Path(self.input_data_path).rglob("*.pdf")
            if path.is_file()
        )

    def save_data_to_jsonl(self, data: List[Any], file_path: str):
        """Write ``data`` to ``file_path`` as JSON Lines, one entry per line.

        Saving is best-effort: any failure is reported on stdout and not
        re-raised.

        Args:
            data: Entries to write; each must provide ``model_dump_json()``.
            file_path: Destination file. Missing parent directories are
                created, and an existing file is overwritten.
        """
        try:
            # Serialise everything before touching the file so that a failing
            # entry cannot truncate an existing file or leave a partial one.
            lines = [entry.model_dump_json() + "\n" for entry in data]

            target = Path(file_path)
            target.parent.mkdir(parents=True, exist_ok=True)
            with target.open("w", encoding="utf-8") as f:
                f.writelines(lines)
        except Exception as e:
            print(f"Error saving data to file: {e}")

    def prepare_data(self):
        """Extract each discovered PDF and save its data as a JSONL file.

        The output file name is ``output_file`` formatted with the PDF's stem
        (spaces replaced by underscores) and is placed in ``output_data_path``.

        NOTE(review): only the stem is used, so PDFs with the same name in
        different sub-folders write to the same output file; the last one
        processed wins.
        """
        output_dir = Path(self.output_data_path)

        for pdf_file in self.get_pdf_files():
            pdf_data = self.pdf_extractor.extract(pdf_file)

            # Build the output file name from the PDF's own name
            file_name = pdf_file.stem.replace(" ", "_")
            output_file = self.output_file.format(file_name=file_name)

            self.save_data_to_jsonl(pdf_data, str(output_dir / output_file))