pdfsearch / text_utils.py
Hao2727's picture
Upload folder using huggingface_hub
dc41094 verified
raw
history blame contribute delete
895 Bytes
from nltk.tokenize import sent_tokenize
import json
def split_text(paragraphs, chunk_size=300, overlap_size=100, tokenize=None):
    """Split paragraphs into overlapping, sentence-aligned text chunks.

    Each chunk is grown sentence by sentence until appending the next
    sentence would exceed ``chunk_size`` characters, and is prefixed with
    up to ``overlap_size`` characters of trailing sentences from before
    the chunk's starting point, so adjacent chunks share context.

    Args:
        paragraphs: Iterable of paragraph strings.
        chunk_size: Approximate maximum characters per chunk (the
            starting sentence plus overlap prefix may already exceed it).
        overlap_size: Approximate maximum characters of overlap carried
            in from the preceding sentences (separator spaces count).
        tokenize: Optional callable mapping one paragraph string to a
            list of sentence strings. Defaults to NLTK's
            ``sent_tokenize``; injectable for testing or other languages.

    Returns:
        List of chunk strings covering every sentence in order.
    """
    if tokenize is None:
        tokenize = sent_tokenize  # default: NLTK sentence splitter
    sentences = [s.strip() for p in paragraphs for s in tokenize(p)]
    chunks = []
    i = 0
    while i < len(sentences):
        chunk = sentences[i]
        # Walk backwards, prepending earlier sentences until the
        # overlap budget is exhausted.
        overlap = ''
        prev = i - 1
        while prev >= 0 and len(sentences[prev]) + len(overlap) <= overlap_size:
            overlap = sentences[prev] + ' ' + overlap
            prev -= 1
        chunk = overlap + chunk
        # Walk forwards, appending sentences while the chunk (including
        # its overlap prefix) stays within the chunk budget.
        # ('j' replaces the original 'next', which shadowed the builtin.)
        j = i + 1
        while j < len(sentences) and len(sentences[j]) + len(chunk) <= chunk_size:
            chunk = chunk + ' ' + sentences[j]
            j += 1
        chunks.append(chunk)
        i = j  # resume after the last sentence consumed by this chunk
    return chunks