Spaces:

deeme
/

long

Paused

App Files Files Community

long / core /writer_utils.py

deeme

Upload 111 files

217acfe verified 11 months ago

raw

history blame contribute delete

8.01 kB

	import uuid

	# Data type yielded by Writer; it doubles as the "key point" message displayed by the front end.
	class KeyPointMsg(dict):
	    """Dict-backed "key point" message.

	    A message is created in exactly one of two forms:

	    * a heading: both ``title`` and ``subtitle`` set, ``prompt_name`` empty;
	    * a prompt: only ``prompt_name`` set.

	    The stored keys are ``id`` (a fresh UUID4 string), ``title``,
	    ``subtitle``, ``prompt_name`` and ``finished`` (initially ``False``).
	    """

	    # Longest prompt name that the ``prompt_name`` property returns unabridged.
	    _PROMPT_NAME_DISPLAY_LEN = 10

	    def __init__(self, title='', subtitle='', prompt_name=''):
	        """Build the message and validate the title/subtitle/prompt_name combination.

	        Raises:
	            ValueError: if the arguments are neither "title and subtitle
	                only" nor "prompt_name only".
	        """
	        super().__init__()
	        prompt_only = bool(prompt_name) and not title and not subtitle
	        heading_only = bool(title) and bool(subtitle) and not prompt_name
	        if not (prompt_only or heading_only):
	            raise ValueError('Either title and subtitle or prompt_name must be provided')

	        self.update({
	            'id': str(uuid.uuid4()),
	            'title': title,
	            'subtitle': subtitle,
	            'prompt_name': prompt_name,
	            'finished': False,
	        })

	    def set_finished(self):
	        """Mark the message as finished and return ``self`` so calls can be chained.

	        Raises:
	            AssertionError: if the message was already marked as finished.
	        """
	        # Raised explicitly (instead of using ``assert``) so the check is
	        # still enforced when Python runs with -O; the exception type and
	        # message are unchanged.
	        if self['finished']:
	            raise AssertionError('finished flag is already set')
	        self['finished'] = True
	        return self

	    def is_finished(self):
	        """Return the current value of the ``finished`` flag."""
	        return self['finished']

	    def is_prompt(self):
	        """Return True when the message carries a non-empty prompt name."""
	        return bool(self.prompt_name)

	    def is_title(self):
	        """Return True when the message carries a non-empty title."""
	        return bool(self.title)

	    @property
	    def id(self):
	        """The UUID4 string generated when the message was created."""
	        return self['id']

	    @property
	    def title(self):
	        """The stored title."""
	        return self['title']

	    @property
	    def subtitle(self):
	        """The stored subtitle."""
	        return self['subtitle']

	    @property
	    def prompt_name(self):
	        """The prompt name, abridged to 10 characters plus '...' when longer.

	        The full value stays available as ``self['prompt_name']``.
	        """
	        name = self['prompt_name']
	        limit = self._PROMPT_NAME_DISPLAY_LEN
	        # Strictly greater: a name of exactly ``limit`` characters loses
	        # nothing, so it must not get an ellipsis.
	        if len(name) > limit:
	            return name[:limit] + '...'
	        return name


	import re
	from difflib import Differ

	# TODO: consider switching to an off-the-shelf library; the current logic is overly convoluted and too slow.
	def detect_max_edit_span(a, b):
	    """Locate the span of ``a``/``b`` touched by edits, according to ``difflib.Differ``.

	    Args:
	        a: original sequence (e.g. a string, compared item by item).
	        b: edited sequence.

	    Returns:
	        A tuple ``(prefix_len, -suffix_len)``: the number of unchanged items
	        reported before the first edit, and the negated number of unchanged
	        items reported after the last edit. The second value is ``0`` when
	        no unchanged items follow the last edit (or when there is no edit).
	    """
	    prefix_len = 0
	    suffix_len = 0
	    edit_seen = False

	    for line in Differ().compare(a, b):
	        if not line.startswith(' '):
	            # Any non-"unchanged" line is an edit: whatever was counted as
	            # trailing context so far is no longer trailing.
	            edit_seen = True
	            suffix_len = 0
	        elif edit_seen:
	            suffix_len += 1
	        else:
	            prefix_len += 1

	    return prefix_len, -suffix_len

	def split_text_by_separators(text, separators, keep_separators=True):
	    """Split ``text`` into segments at runs of the given separators.

	    A segment is closed each time a piece of content containing
	    non-whitespace characters has been read. Separator runs and
	    whitespace-only pieces that precede such content are therefore
	    prepended to it, and the separator run that follows it is appended.
	    A whitespace-only remainder after the last separator run is not
	    returned.

	    Args:
	        text: the text to split.
	        separators: iterable of separator strings; empty strings are ignored.
	        keep_separators: whether separator runs are kept in the returned
	            segments (default True).

	    Returns:
	        List of segments, in order of appearance. With no usable separator,
	        the whole text is returned as a single segment (or an empty list if
	        it is blank).
	    """
	    separators = [sep for sep in separators if sep]
	    if not separators:
	        # An empty alternation would produce the invalid regex "(+)".
	        return [text] if text.strip() else []

	    # The alternation is built outside the f-string (no backslash or nested
	    # quotes inside the replacement field) and joined with a plain "|".
	    # The non-capturing inner group makes "+" repeat the whole alternation
	    # rather than only its last alternative; the single outer capturing
	    # group makes re.split return the separator runs as well.
	    alternation = '|'.join(map(re.escape, separators))
	    pattern = f'((?:{alternation})+)'
	    chunks = re.split(pattern, text)

	    paragraphs = []
	    current_para = []

	    # With one capturing group, re.split alternates content (even indexes)
	    # and separator runs (odd indexes).
	    for i in range(0, len(chunks), 2):
	        content = chunks[i]
	        separator = chunks[i + 1] if i + 1 < len(chunks) else ''

	        current_para.append(content)
	        if keep_separators and separator:
	            current_para.append(separator)

	        # Only real content closes a segment; blank pieces keep accumulating.
	        if content.strip():
	            paragraphs.append(''.join(current_para))
	            current_para = []

	    return paragraphs

	def split_text_into_paragraphs(text, keep_separators=True):
	    """Split ``text`` into paragraphs, using the newline character as the only separator.

	    ``keep_separators`` is forwarded unchanged to ``split_text_by_separators``.
	    """
	    newline_only = ['\n']
	    return split_text_by_separators(text, newline_only, keep_separators)

	def split_text_into_sentences(text, keep_separators=True):
	    """Split ``text`` into sentences at newlines and the full-width marks 。？！；.

	    ``keep_separators`` is forwarded unchanged to ``split_text_by_separators``.
	    """
	    sentence_terminators = ['\n', '。', '？', '！', '；']
	    return split_text_by_separators(text, sentence_terminators, keep_separators)

	def run_and_echo_yield_func(func, *args, **kwargs):
	    """Drive a generator function, echoing its yielded chat messages to stdout.

	    Each yielded value must be an iterable of mappings with ``'role'`` and
	    ``'content'`` keys. It is rendered as ``"<role>:\\n<content>"`` blocks
	    joined by newlines. When the new rendering extends what has already
	    been printed, only the new tail is printed; otherwise a dashed
	    separator line is printed and the rendering is printed in full.

	    Args:
	        func: callable returning an iterable/generator of message lists.
	        *args: positional arguments forwarded to ``func``.
	        **kwargs: keyword arguments forwarded to ``func``.

	    Returns:
	        List of every value yielded by ``func``, in order.
	    """
	    echo_text = ""
	    all_messages = []
	    for messages in func(*args, **kwargs):
	        all_messages.append(messages)
	        new_echo_text = "\n".join(f"{msg['role']}:\n{msg['content']}" for msg in messages)
	        if new_echo_text.startswith(echo_text):
	            delta_echo_text = new_echo_text[len(echo_text):]
	        else:
	            # The conversation diverged from what was printed: start over.
	            print('\n--------------------------------')
	            delta_echo_text = new_echo_text

	        print(delta_echo_text, end="")
	        # In both branches, everything in new_echo_text has now been shown.
	        echo_text = new_echo_text
	    return all_messages

	def run_yield_func(func, *args, **kwargs):
	    """Run a generator function to completion and return its ``return`` value.

	    Every yielded value is discarded.

	    Args:
	        func: callable returning a generator (or any iterator).
	        *args: positional arguments forwarded to ``func``.
	        **kwargs: keyword arguments forwarded to ``func``.

	    Returns:
	        The ``value`` carried by the final ``StopIteration``, i.e. what the
	        generator returned (``None`` if it returned nothing).
	    """
	    gen = func(*args, **kwargs)
	    try:
	        while True:
	            next(gen)
	    except StopIteration as e:
	        # A generator's return value travels on StopIteration.value.
	        return e.value

	def split_text_into_chunks(text, max_chunk_size, min_chunk_n, min_chunk_size=1, max_chunk_n=1000):
	    """Split ``text`` into chunks that satisfy size and count constraints.

	    The text is first split into paragraphs with
	    ``split_text_into_paragraphs``. Adjacent paragraphs are then merged
	    while there are more than ``max_chunk_n`` of them or any is shorter
	    than ``min_chunk_size``. Finally the longest chunk is repeatedly split
	    at a sentence end (one of 。？；) while there are fewer than
	    ``min_chunk_n`` chunks or any is longer than ``max_chunk_size``.

	    Args:
	        text: the text to split.
	        max_chunk_size: maximum length of a chunk, in characters.
	        min_chunk_n: minimum number of chunks.
	        min_chunk_size: minimum length of a chunk, in characters.
	        max_chunk_n: maximum number of chunks; must be at least 1.

	    Returns:
	        List of chunks, in order of appearance.

	    Raises:
	        AssertionError: if ``max_chunk_n`` is below 1 or the paragraphs are
	            shorter in total than ``min_chunk_size``.
	        Exception: if a chunk that has to be split has no usable split
	            point, or splitting would violate ``min_chunk_size`` or
	            ``max_chunk_n``.
	    """
	    def split_paragraph(para):
	        """Split ``para`` in two at the sentence end closest to its midpoint."""
	        mid = len(para) // 2
	        split_pattern = r'[。？；]'
	        split_points = [m.end() for m in re.finditer(split_pattern, para)]

	        if not split_points:
	            raise Exception("没有找到分割点!")

	        closest_point = min(split_points, key=lambda x: abs(x - mid))
	        head, tail = para[:closest_point], para[closest_point:]
	        # A split that leaves one side blank is not a real split.
	        if not head.strip() or not tail.strip():
	            raise Exception("没有找到分割点!")

	        return head, tail

	    paragraphs = split_text_into_paragraphs(text)

	    # Raised explicitly (instead of using ``assert``) so the checks are
	    # still enforced under ``python -O``; exception type and messages are
	    # unchanged.
	    if max_chunk_n < 1:
	        raise AssertionError("max_chunk_n必须大于等于1")
	    if sum(len(p) for p in paragraphs) < min_chunk_size:
	        raise AssertionError(f"分割时，输入的文本长度小于要求的min_chunk_size:{min_chunk_size}")

	    # Merge phase. No iteration cap is needed: every pass removes one
	    # paragraph, and with a single paragraph left the condition is false
	    # (1 <= max_chunk_n, and its length equals the validated total length).
	    while len(paragraphs) > max_chunk_n or min(len(p) for p in paragraphs) < min_chunk_size:
	        # Merge the adjacent pair with the smallest combined length
	        # (min() keeps the first pair on ties).
	        min_i = min(
	            range(len(paragraphs) - 1),
	            key=lambda i: len(paragraphs[i]) + len(paragraphs[i + 1]),
	        )
	        paragraphs[min_i:min_i + 2] = [''.join(paragraphs[min_i:min_i + 2])]

	    # Split phase. No iteration cap is needed: every pass either raises or
	    # adds one paragraph, and the count can never exceed max_chunk_n.
	    while len(paragraphs) < min_chunk_n or max(len(p) for p in paragraphs) > max_chunk_size:
	        longest_para_i = max(range(len(paragraphs)), key=lambda i: len(paragraphs[i]))
	        part1, part2 = split_paragraph(paragraphs[longest_para_i])
	        if len(part1) < min_chunk_size or len(part2) < min_chunk_size or len(paragraphs) + 1 > max_chunk_n:
	            raise Exception("没有找到合适的分割点!")
	        paragraphs[longest_para_i:longest_para_i + 1] = [part1, part2]

	    return paragraphs

	def test_split_text_into_chunks():
	    """Smoke-test ``split_text_into_chunks`` on three small samples, printing each result."""
	    # Case 1: a single line of three short sentences must give exactly three chunks.
	    three_sentences = "这是第一段。这是第二段。这是第三段。"
	    chunks = split_text_into_chunks(three_sentences, max_chunk_size=10, min_chunk_n=3)
	    print("Test 1 result:", chunks)
	    assert len(chunks) == 3, f"Expected 3 chunks, got {len(chunks)}"

	    # Case 2: one long paragraph must be broken into at least four chunks of at most 20 characters.
	    long_paragraph = "这是一个很长的段落，包含了很多句子。它应该被分割成多个小块。这里有一些标点符号，比如句号。还有问号？以及分号；这些都可以用来分割文本。"
	    chunks = split_text_into_chunks(long_paragraph, max_chunk_size=20, min_chunk_n=4)
	    print("Test 2 result:", chunks)
	    assert len(chunks) >= 4, f"Expected at least 4 chunks, got {len(chunks)}"
	    assert not any(len(chunk) > 20 for chunk in chunks), "Some chunks are longer than max_chunk_size"

	    # Case 3: text containing newlines must give at least five chunks of at most 15 characters.
	    with_newlines = "第一段。\n\n第二段。\n第三段。\n\n第四段很长，需要被分割。这是第四段的继续。"
	    chunks = split_text_into_chunks(with_newlines, max_chunk_size=15, min_chunk_n=5)
	    print("Test 3 result:", chunks)
	    assert len(chunks) >= 5, f"Expected at least 5 chunks, got {len(chunks)}"
	    assert not any(len(chunk) > 15 for chunk in chunks), "Some chunks are longer than max_chunk_size"

	    print("All tests passed!")

	if __name__ == "__main__":
	    # Demo: print the edit span between one original sentence and four
	    # edited variants, then run the chunk-splitting smoke test.
	    original = "我吃西红柿"
	    for edited in ("我不喜欢吃西红柿", "不喜欢吃西红柿", "我不喜欢吃", "你不喜欢吃西瓜"):
	        print(detect_max_edit_span(original, edited))

	    test_split_text_into_chunks()