|
|
import re
from pathlib import Path
from typing import Dict, List

from pypdf import PdfReader
|
|
|
|
|
|
def load_documents(data_path: str) -> str:
    '''
    Read the LinkedIn PDF and the summary text file from the data folder.

    Parameters:
    - data_path (str): The path to the data folder. The folder must contain
      the files "linkedin.pdf" and "summary.txt".

    Returns:
    - output (str): The text extracted from every page of the PDF, followed
      by a newline and the full contents of the summary file.
    '''
    # Build the paths with pathlib instead of a hard-coded "\" separator:
    # the backslash form only resolved on Windows and relied on the invalid
    # escape sequences "\l" and "\s" inside the string literals.
    data_dir = Path(data_path)

    reader = PdfReader(str(data_dir / "linkedin.pdf"))
    # Join once instead of repeated "+="; `or ""` keeps the join safe should
    # a page yield no text object.
    text_document = "".join(page.extract_text() or "" for page in reader.pages)

    # Explicit encoding so the result does not depend on the platform locale.
    # NOTE(review): assumes summary.txt is stored as UTF-8 - confirm.
    with open(data_dir / "summary.txt", "r", encoding="utf-8") as f:
        summary = f.read()

    return f"{text_document}\n{summary}"
|
|
|
|
|
|
def sliding_window_chunk(text: str, overlap: int = 20, chunk_size: int = 200) -> List[str]:
    '''
    Split the text into overlapping chunks of whitespace-separated words.

    Parameters:
    - text (str): The text to split
    - overlap (int): The number of words shared by two consecutive chunks.
      Must satisfy 0 <= overlap < chunk_size.
    - chunk_size (int): The maximum number of words per chunk. Must be
      positive.

    Returns:
    - chunks (List[str]): A list of non-empty chunks whose words are joined
      by single spaces. The list is empty when the text contains no words.

    Raises:
    - ValueError: If chunk_size is not positive, or if overlap is negative
      or not smaller than chunk_size.
    '''
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    # str.split() with no argument splits on any run of Unicode whitespace,
    # which already covers newlines and non-breaking spaces (\xa0).
    words = text.split()

    # Advance by the non-overlapping part of the window so that consecutive
    # chunks share exactly `overlap` words. (Stepping by `overlap` itself
    # made chunks share chunk_size - overlap words and failed on overlap=0.)
    stride = chunk_size - overlap

    chunks = []
    for start in range(0, len(words), stride):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Stop once a window reaches the last word; any further window would
        # only repeat words already contained in this chunk.
        if start + chunk_size >= len(words):
            break

    return chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|