from pypdf import PdfReader from typing import Dict, List import re def load_documents(data_path: str) -> str: ''' Read the linkedin pdf and the summary in the data folder Parameters: - data_path (str): The path to the data folder Returns: - output (Dict[str, str]): A dictionary containing the text document and summary ''' reader = PdfReader(f"{data_path}\linkedin.pdf") text_document = "" for page in reader.pages: text_document += page.extract_text() with open(f"{data_path}\summary.txt", "r") as f: summary = f.read() output = f"{text_document}\n{summary}" return output def sliding_window_chunk(text: str, overlap: int = 20, chunk_size: int = 200) -> List[str]: ''' Split the text into chunks of non-empty substrings Parameters: - text (str): The text to split Returns: - chunks (List[str]): A list of chunks of text ''' # Remove unwanted characters text = re.sub(r'[\xa0\n]', " ", text) # Split the text into chunks of non-empty substrings words = text.split() chunks = [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), overlap)] return chunks # if __name__ == "__main__": # # reader = PdfReader("Week_1\Data_w1\linkedin.pdf") # # linkedin = "" # # for page in reader.pages: # # linkedin += page.extract_text() # # text_chunks = sliding_window_chunk(linkedin) # # print(len(text_chunks))