Week_1 / text_chunk.py
Pham23's picture
Upload folder using huggingface_hub
7e820ed verified
import re
from pathlib import Path
from typing import Dict, List

from pypdf import PdfReader
def load_documents(data_path: str) -> str:
    '''
    Read the linkedin pdf and the summary in the data folder

    Parameters:
    - data_path (str): The path to the data folder, which must contain
      "linkedin.pdf" and "summary.txt"
    Returns:
    - output (str): The text extracted from every page of the pdf, followed
      by a newline and the content of the summary file
    '''
    # Build the paths with pathlib instead of a hard-coded "\" separator:
    # the backslash form only resolves on Windows and "\l" / "\s" are
    # invalid escape sequences inside a normal string literal.
    data_dir = Path(data_path)

    reader = PdfReader(data_dir / "linkedin.pdf")
    # Join once instead of growing the string page by page
    text_document = "".join(page.extract_text() for page in reader.pages)

    with open(data_dir / "summary.txt", "r") as f:
        summary = f.read()

    return f"{text_document}\n{summary}"
def sliding_window_chunk(text: str, overlap: int = 20, chunk_size: int = 200) -> List[str]:
    '''
    Split the text into overlapping chunks of whitespace-separated words

    Parameters:
    - text (str): The text to split
    - overlap (int): The number of words shared by two consecutive chunks
    - chunk_size (int): The maximum number of words in a chunk
    Returns:
    - chunks (List[str]): A list of chunks of text, each one made of at most
      chunk_size words joined by a single space. Empty when the text holds
      no word.
    Raises:
    - ValueError: If chunk_size is not positive, or if overlap is negative
      or not smaller than chunk_size
    '''
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    # str.split() with no argument splits on any whitespace run, which
    # already covers the newlines and non-breaking spaces (\xa0) found in
    # text extracted from a pdf, and never yields empty strings.
    words = text.split()

    # Each window starts `step` words after the previous one so that
    # exactly `overlap` words are repeated between neighbours.
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Stop once a window reaches the end of the text: any later window
        # would only repeat words already present in this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks
# if __name__ == "__main__":
# # reader = PdfReader("Week_1\Data_w1\linkedin.pdf")
# # linkedin = ""
# # for page in reader.pages:
# # linkedin += page.extract_text()
# # text_chunks = sliding_window_chunk(linkedin)
# # print(len(text_chunks))