pdfsearch / text_utils.py
Hao2727's picture
Upload folder using huggingface_hub
dc41094 verified
raw
history blame contribute delete
895 Bytes
from nltk.tokenize import sent_tokenize
import json
def split_text(paragraphs, chunk_size=300, overlap_size=100, tokenize=None):
    """Split paragraphs into overlapping, sentence-aligned text chunks.

    Each chunk is grown sentence by sentence until appending the next
    sentence would exceed ``chunk_size`` characters, and is prefixed with
    up to ``overlap_size`` characters of trailing sentences from before
    the chunk's starting point, so adjacent chunks share context.

    Args:
        paragraphs: Iterable of paragraph strings.
        chunk_size: Approximate maximum characters per chunk (the
            starting sentence plus overlap prefix may already exceed it).
        overlap_size: Approximate maximum characters of overlap carried
            in from the preceding sentences (separator spaces count).
        tokenize: Optional callable mapping one paragraph string to a
            list of sentence strings. Defaults to NLTK's
            ``sent_tokenize``; injectable for testing or other languages.

    Returns:
        List of chunk strings covering every sentence in order.
    """
    if tokenize is None:
        tokenize = sent_tokenize  # default: NLTK sentence splitter
    sentences = [s.strip() for p in paragraphs for s in tokenize(p)]
    chunks = []
    i = 0
    while i < len(sentences):
        chunk = sentences[i]
        # Walk backwards, prepending earlier sentences until the
        # overlap budget is exhausted.
        overlap = ''
        prev = i - 1
        while prev >= 0 and len(sentences[prev]) + len(overlap) <= overlap_size:
            overlap = sentences[prev] + ' ' + overlap
            prev -= 1
        chunk = overlap + chunk
        # Walk forwards, appending sentences while the chunk (including
        # its overlap prefix) stays within the chunk budget.
        # ('j' replaces the original 'next', which shadowed the builtin.)
        j = i + 1
        while j < len(sentences) and len(sentences[j]) + len(chunk) <= chunk_size:
            chunk = chunk + ' ' + sentences[j]
            j += 1
        chunks.append(chunk)
        i = j  # resume after the last sentence consumed by this chunk
    return chunks