File size: 1,430 Bytes
7d217f1
 
 
 
 
6992a5d
7d217f1
 
 
 
 
 
 
 
 
6992a5d
7d217f1
 
6992a5d
7d217f1
6992a5d
7d217f1
 
 
6992a5d
 
 
 
7d217f1
 
 
 
 
 
 
 
 
 
 
 
6992a5d
 
7d217f1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import os
from pathlib import Path

from config import VedamConfig


def write_to_file_and_create_dir(file_path_str, content):
    """
    Save text to a file, making any missing parent folders first.

    Args:
        file_path_str (str): Destination path, file name included.
        content (str): Text to store; an existing file is overwritten.
    """
    target = Path(file_path_str)

    # Ensure the whole directory chain exists (no-op when already present).
    target.parent.mkdir(parents=True, exist_ok=True)

    with target.open(mode="w", encoding="utf-8") as handle:
        handle.write(content)


def _page_number_from_filename(filename):
    """Return the page number encoded in ``page<N>.txt`` / ``<N>.txt``, else None."""
    stem, ext = os.path.splitext(filename)
    if ext != ".txt":
        return None
    if stem.startswith("page"):
        stem = stem[len("page"):]
    # Only plain ASCII digits are accepted so that int() can never raise here.
    if not (stem.isascii() and stem.isdigit()):
        return None
    return int(stem)


def page_text_generator(output_dir: str):
    """
    Yield one record per non-empty page text file in a directory, in page order.

    Files are expected to be named ``page<N>.txt`` (a bare ``<N>.txt`` is also
    accepted). Directory entries that do not match this pattern are ignored
    instead of aborting the iteration.

    Args:
        output_dir (str): Directory containing the page text files.

    Yields:
        dict: A record with the keys
            ``"id"`` (page number as a string),
            ``"document"`` (file text with surrounding whitespace stripped), and
            ``"metadata"`` (``"page"``, ``"file"`` and ``"num_chars"``).
            Files whose stripped text is empty are skipped.

    Raises:
        OSError: If ``output_dir`` cannot be listed or a page file cannot be read.
    """
    # Filter BEFORE sorting: the sort key must never see unrelated file names.
    numbered_files = []
    for filename in os.listdir(output_dir):
        page_num = _page_number_from_filename(filename)
        if page_num is not None:
            numbered_files.append((page_num, filename))

    # Sorting the (page_num, filename) tuples orders numerically and breaks
    # ties by name, so the output does not depend on os.listdir() ordering.
    for page_num, filename in sorted(numbered_files):
        file_path = os.path.join(output_dir, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read().strip()

        # The file is closed before yielding so no handle stays open while the
        # consumer works on the record (or if it abandons the generator).
        if not text:
            continue
        yield {
            "id": str(page_num),
            "document": text,
            "metadata": {
                "page": page_num,
                "file": filename,
                "num_chars": len(text),
            },
        }