File size: 1,531 Bytes
7e820ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import re
from pathlib import Path
from typing import Dict, List

from pypdf import PdfReader

def load_documents(data_path: str) -> str:
    '''
    Read the LinkedIn PDF and the summary stored in the data folder.

    Parameters:
    - data_path (str): The path to the data folder. It must contain
      ``linkedin.pdf`` and ``summary.txt``.

    Returns:
    - output (str): The text extracted from every page of the PDF,
      followed by a newline and the contents of the summary file.
    '''
    # Build the paths with pathlib instead of a hard-coded backslash so the
    # function works on every OS and avoids the invalid "\l" / "\s" escapes.
    data_dir = Path(data_path)

    reader = PdfReader(data_dir / "linkedin.pdf")
    text_document = "".join(page.extract_text() for page in reader.pages)

    with open(data_dir / "summary.txt", "r") as f:
        summary = f.read()

    return f"{text_document}\n{summary}"

def sliding_window_chunk(text: str, overlap: int = 20, chunk_size: int = 200) -> List[str]:
    '''
    Split the text into overlapping chunks of whitespace-separated words.

    Parameters:
    - text (str): The text to split
    - overlap (int): Number of words shared by two consecutive chunks.
      Must satisfy 0 <= overlap < chunk_size.
    - chunk_size (int): Maximum number of words per chunk. Must be positive.

    Returns:
    - chunks (List[str]): A list of chunks of text, each made of at most
      chunk_size words joined by single spaces. Empty for blank input.

    Raises:
    - ValueError: If chunk_size is not positive or overlap is outside
      the range [0, chunk_size).
    '''
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    # str.split() with no argument splits on any Unicode whitespace run,
    # which already covers newlines and non-breaking spaces (\xa0) and
    # never yields empty strings.
    words = text.split()

    # Advance by the non-overlapping part of the window so that consecutive
    # chunks share exactly `overlap` words.
    step = chunk_size - overlap

    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the end of the text, any further window
        # would only repeat words already contained in this one.
        if start + chunk_size >= len(words):
            break
    return chunks


# if __name__ == "__main__":
#     # reader = PdfReader("Week_1\Data_w1\linkedin.pdf")
#     # linkedin = ""
#     # for page in reader.pages:
#     #     linkedin += page.extract_text()

#     # text_chunks = sliding_window_chunk(linkedin)
#     # print(len(text_chunks))