Spaces:

Pham23
/

Week_1

Runtime error

Week_1 / text_chunk.py

Upload folder using huggingface_hub

7e820ed verified 7 months ago

1.53 kB

	from pypdf import PdfReader
	from typing import Dict, List
	import re

	def load_documents(data_path: str) -> str:
	    '''
	    Read the LinkedIn PDF and the summary text file in the data folder

	    Parameters:
	    - data_path (str): The path to the data folder; it must contain
	      "linkedin.pdf" and "summary.txt"

	    Returns:
	    - output (str): The text extracted from every PDF page, followed by a
	      newline and the content of the summary file
	    '''
	    # Imported locally so this function does not rely on a module-level import.
	    import os

	    # Join path components with os.path.join: a hard-coded "\" separator only
	    # resolves on Windows, and "\l" / "\s" are invalid escape sequences.
	    pdf_path = os.path.join(data_path, "linkedin.pdf")
	    summary_path = os.path.join(data_path, "summary.txt")

	    reader = PdfReader(pdf_path)
	    # `or ""` keeps the join safe should a page yield no text.
	    text_document = "".join(page.extract_text() or "" for page in reader.pages)

	    # Explicit encoding so the result does not depend on the platform default.
	    with open(summary_path, "r", encoding="utf-8") as f:
	        summary = f.read()

	    return f"{text_document}\n{summary}"

	def sliding_window_chunk(text: str, overlap: int = 20, chunk_size: int = 200) -> List[str]:
	    '''
	    Split the text into overlapping chunks of whitespace-separated words

	    Parameters:
	    - text (str): The text to split
	    - overlap (int): The number of words shared by two consecutive chunks
	    - chunk_size (int): The maximum number of words in a chunk

	    Returns:
	    - chunks (List[str]): A list of non-empty chunks; the words of each chunk
	      are joined by single spaces. Empty or whitespace-only text gives []

	    Raises:
	    - ValueError: If chunk_size is not positive, or if overlap is negative
	      or not smaller than chunk_size
	    '''
	    if chunk_size <= 0:
	        raise ValueError("chunk_size must be a positive integer")
	    if not 0 <= overlap < chunk_size:
	        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

	    # split() without arguments splits on any run of whitespace (newlines and
	    # non-breaking spaces included) and never produces empty strings.
	    words = text.split()

	    # Advance by the non-overlapping part of the window so that consecutive
	    # chunks share exactly `overlap` words.
	    step = chunk_size - overlap

	    chunks = []
	    for start in range(0, len(words), step):
	        chunks.append(" ".join(words[start:start + chunk_size]))
	        # Stop once the window reaches the end of the text; any later window
	        # would only repeat words already in this chunk.
	        if start + chunk_size >= len(words):
	            break
	    return chunks


	# if __name__ == "__main__":
	# # reader = PdfReader("Week_1\Data_w1\linkedin.pdf")
	# # linkedin = ""
	# # for page in reader.pages:
	# # linkedin += page.extract_text()

	# # text_chunks = sliding_window_chunk(linkedin)
	# # print(len(text_chunks))