diff --git a/crazy_functions/CodeInterpreter.py b/crazy_functions/CodeInterpreter.py
deleted file mode 100644
index 283dd87a93140c5621579e62c9d6d368537e4824..0000000000000000000000000000000000000000
--- a/crazy_functions/CodeInterpreter.py
+++ /dev/null
@@ -1,232 +0,0 @@
-from collections.abc import Callable, Iterable, Mapping
-from typing import Any
-from toolbox import CatchException, update_ui, gen_time_str, trimmed_format_exc
-from toolbox import promote_file_to_downloadzone, get_log_folder
-from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
-from .crazy_utils import input_clipping, try_install_deps
-from multiprocessing import Process, Pipe
-import os
-import time
-
-templete = """
-```python
-import ... # Put dependencies here, e.g. import numpy as np
-
-class TerminalFunction(object): # Do not change the name of the class, The name of the class must be `TerminalFunction`
-
- def run(self, path): # The name of the function must be `run`, it takes only a positional argument.
- # rewrite the function you have just written here
- ...
- return generated_file_path
-```
-"""
-
-def inspect_dependency(chatbot, history):
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return True
-
-def get_code_block(reply):
- import re
- pattern = r"```([\s\S]*?)```" # regex pattern to match code blocks
- matches = re.findall(pattern, reply) # find all code blocks in text
- if len(matches) == 1:
- return matches[0].strip('python') # code block
- for match in matches:
- if 'class TerminalFunction' in match:
- return match.strip('python') # code block
- raise RuntimeError("GPT is not generating proper code.")
-
-def gpt_interact_multi_step(txt, file_type, llm_kwargs, chatbot, history):
- # 输入
- prompt_compose = [
- f'Your job:\n'
- f'1. write a single Python function, which takes a path of a `{file_type}` file as the only argument and returns a `string` containing the result of analysis or the path of generated files. \n',
- f"2. You should write this function to perform following task: " + txt + "\n",
- f"3. Wrap the output python function with markdown codeblock."
- ]
- i_say = "".join(prompt_compose)
- demo = []
-
- # 第一步
- gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
- inputs=i_say, inputs_show_user=i_say,
- llm_kwargs=llm_kwargs, chatbot=chatbot, history=demo,
- sys_prompt= r"You are a programmer."
- )
- history.extend([i_say, gpt_say])
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 界面更新
-
- # 第二步
- prompt_compose = [
- "If previous stage is successful, rewrite the function you have just written to satisfy following templete: \n",
- templete
- ]
- i_say = "".join(prompt_compose); inputs_show_user = "If previous stage is successful, rewrite the function you have just written to satisfy executable templete. "
- gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
- inputs=i_say, inputs_show_user=inputs_show_user,
- llm_kwargs=llm_kwargs, chatbot=chatbot, history=history,
- sys_prompt= r"You are a programmer."
- )
- code_to_return = gpt_say
- history.extend([i_say, gpt_say])
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 界面更新
-
- # # 第三步
- # i_say = "Please list to packages to install to run the code above. Then show me how to use `try_install_deps` function to install them."
- # i_say += 'For instance. `try_install_deps(["opencv-python", "scipy", "numpy"])`'
- # installation_advance = yield from request_gpt_model_in_new_thread_with_ui_alive(
- # inputs=i_say, inputs_show_user=inputs_show_user,
- # llm_kwargs=llm_kwargs, chatbot=chatbot, history=history,
- # sys_prompt= r"You are a programmer."
- # )
- # # # 第三步
- # i_say = "Show me how to use `pip` to install packages to run the code above. "
- # i_say += 'For instance. `pip install -r opencv-python scipy numpy`'
- # installation_advance = yield from request_gpt_model_in_new_thread_with_ui_alive(
- # inputs=i_say, inputs_show_user=i_say,
- # llm_kwargs=llm_kwargs, chatbot=chatbot, history=history,
- # sys_prompt= r"You are a programmer."
- # )
- installation_advance = ""
-
- return code_to_return, installation_advance, txt, file_type, llm_kwargs, chatbot, history
-
-def make_module(code):
- module_file = 'gpt_fn_' + gen_time_str().replace('-','_')
- with open(f'{get_log_folder()}/{module_file}.py', 'w', encoding='utf8') as f:
- f.write(code)
-
- def get_class_name(class_string):
- import re
- # Use regex to extract the class name
- class_name = re.search(r'class (\w+)\(', class_string).group(1)
- return class_name
-
- class_name = get_class_name(code)
- return f"{get_log_folder().replace('/', '.')}.{module_file}->{class_name}"
-
-def init_module_instance(module):
- import importlib
- module_, class_ = module.split('->')
- init_f = getattr(importlib.import_module(module_), class_)
- return init_f()
-
-def for_immediate_show_off_when_possible(file_type, fp, chatbot):
- if file_type in ['png', 'jpg']:
- image_path = os.path.abspath(fp)
- chatbot.append(['这是一张图片, 展示如下:',
-            f'本地文件地址: <br/>`{image_path}`<br/>'+
-            f'本地文件预览: <br/><div align="center"><img src="file={image_path}"></div>'
- ])
- return chatbot
-
-def subprocess_worker(instance, file_path, return_dict):
- return_dict['result'] = instance.run(file_path)
-
-def have_any_recent_upload_files(chatbot):
- _5min = 5 * 60
- if not chatbot: return False # chatbot is None
- most_recent_uploaded = chatbot._cookies.get("most_recent_uploaded", None)
- if not most_recent_uploaded: return False # most_recent_uploaded is None
- if time.time() - most_recent_uploaded["time"] < _5min: return True # most_recent_uploaded is new
- else: return False # most_recent_uploaded is too old
-
-def get_recent_file_prompt_support(chatbot):
- most_recent_uploaded = chatbot._cookies.get("most_recent_uploaded", None)
- path = most_recent_uploaded['path']
- return path
-
-@CatchException
-def 虚空终端CodeInterpreter(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- """
- txt 输入栏用户输入的文本,例如需要翻译的一段话,再例如一个包含了待处理文件的路径
- llm_kwargs gpt模型参数,如温度和top_p等,一般原样传递下去就行
- plugin_kwargs 插件模型的参数,暂时没有用武之地
- chatbot 聊天显示框的句柄,用于显示给用户
- history 聊天历史,前情提要
- system_prompt 给gpt的静默提醒
- web_port 当前软件运行的端口号
- """
- raise NotImplementedError
-
- # 清空历史,以免输入溢出
- history = []; clear_file_downloadzone(chatbot)
-
- # 基本信息:功能、贡献者
- chatbot.append([
- "函数插件功能?",
- "CodeInterpreter开源版, 此插件处于开发阶段, 建议暂时不要使用, 插件初始化中 ..."
- ])
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-
- if have_any_recent_upload_files(chatbot):
- file_path = get_recent_file_prompt_support(chatbot)
- else:
- chatbot.append(["文件检索", "没有发现任何近期上传的文件。"])
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-
- # 读取文件
- if ("recently_uploaded_files" in plugin_kwargs) and (plugin_kwargs["recently_uploaded_files"] == ""): plugin_kwargs.pop("recently_uploaded_files")
- recently_uploaded_files = plugin_kwargs.get("recently_uploaded_files", None)
- file_path = recently_uploaded_files[-1]
- file_type = file_path.split('.')[-1]
-
- # 粗心检查
- if is_the_upload_folder(txt):
- chatbot.append([
- "...",
- f"请在输入框内填写需求,然后再次点击该插件(文件路径 {file_path} 已经被记忆)"
- ])
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
-
- # 开始干正事
- for j in range(5): # 最多重试5次
- try:
- code, installation_advance, txt, file_type, llm_kwargs, chatbot, history = \
- yield from gpt_interact_multi_step(txt, file_type, llm_kwargs, chatbot, history)
- code = get_code_block(code)
- res = make_module(code)
- instance = init_module_instance(res)
- break
- except Exception as e:
- chatbot.append([f"第{j}次代码生成尝试,失败了", f"错误追踪\n```\n{trimmed_format_exc()}\n```\n"])
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-
- # 代码生成结束, 开始执行
- try:
- import multiprocessing
- manager = multiprocessing.Manager()
- return_dict = manager.dict()
-
- p = multiprocessing.Process(target=subprocess_worker, args=(instance, file_path, return_dict))
- # only has 10 seconds to run
- p.start(); p.join(timeout=10)
- if p.is_alive(): p.terminate(); p.join()
- p.close()
- res = return_dict['result']
- # res = instance.run(file_path)
- except Exception as e:
- chatbot.append(["执行失败了", f"错误追踪\n```\n{trimmed_format_exc()}\n```\n"])
- # chatbot.append(["如果是缺乏依赖,请参考以下建议", installation_advance])
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
-
- # 顺利完成,收尾
- res = str(res)
- if os.path.exists(res):
- chatbot.append(["执行成功了,结果是一个有效文件", "结果:" + res])
- new_file_path = promote_file_to_downloadzone(res, chatbot=chatbot)
- chatbot = for_immediate_show_off_when_possible(file_type, new_file_path, chatbot)
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 界面更新
- else:
- chatbot.append(["执行成功了,结果是一个字符串", "结果:" + res])
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 界面更新
-
-"""
-测试:
- 裁剪图像,保留下半部分
- 交换图像的蓝色通道和红色通道
- 将图像转为灰度图像
- 将csv文件转excel表格
-"""
\ No newline at end of file
diff --git "a/crazy_functions/Langchain\347\237\245\350\257\206\345\272\223.py" "b/crazy_functions/Langchain\347\237\245\350\257\206\345\272\223.py"
deleted file mode 100644
index 8433895f538e826e4294b7d6503583aafc2b34c8..0000000000000000000000000000000000000000
--- "a/crazy_functions/Langchain\347\237\245\350\257\206\345\272\223.py"
+++ /dev/null
@@ -1,106 +0,0 @@
-from toolbox import CatchException, update_ui, ProxyNetworkActivate, update_ui_lastest_msg
-from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive, get_files_from_everything
-
-
-
-@CatchException
-def 知识库问答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- """
- txt 输入栏用户输入的文本,例如需要翻译的一段话,再例如一个包含了待处理文件的路径
- llm_kwargs gpt模型参数, 如温度和top_p等, 一般原样传递下去就行
- plugin_kwargs 插件模型的参数,暂时没有用武之地
- chatbot 聊天显示框的句柄,用于显示给用户
- history 聊天历史,前情提要
- system_prompt 给gpt的静默提醒
- web_port 当前软件运行的端口号
- """
- history = [] # 清空历史,以免输入溢出
-
- # < --------------------读取参数--------------- >
- if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg")
- kai_id = plugin_kwargs.get("advanced_arg", 'default')
-
- chatbot.append((f"向`{kai_id}`知识库中添加文件。", "[Local Message] 从一批文件(txt, md, tex)中读取数据构建知识库, 然后进行问答。"))
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-
- # resolve deps
- try:
- from zh_langchain import construct_vector_store
- from langchain.embeddings.huggingface import HuggingFaceEmbeddings
- from .crazy_utils import knowledge_archive_interface
- except Exception as e:
- chatbot.append(["依赖不足", "导入依赖失败。正在尝试自动安装,请查看终端的输出或耐心等待..."])
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- from .crazy_utils import try_install_deps
- try_install_deps(['zh_langchain==0.2.1', 'pypinyin'], reload_m=['pypinyin', 'zh_langchain'])
- yield from update_ui_lastest_msg("安装完成,您可以再次重试。", chatbot, history)
- return
-
- # < --------------------读取文件--------------- >
- file_manifest = []
- spl = ["txt", "doc", "docx", "email", "epub", "html", "json", "md", "msg", "pdf", "ppt", "pptx", "rtf"]
- for sp in spl:
- _, file_manifest_tmp, _ = get_files_from_everything(txt, type=f'.{sp}')
- file_manifest += file_manifest_tmp
-
- if len(file_manifest) == 0:
- chatbot.append(["没有找到任何可读取文件", "当前支持的格式包括: txt, md, docx, pptx, pdf, json等"])
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
-
- # < -------------------预热文本向量化模组--------------- >
-    chatbot.append(['<br/>'.join(file_manifest), "正在预热文本向量化模组, 如果是第一次运行, 将消耗较长时间下载中文向量化模型..."])
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- print('Checking Text2vec ...')
- from langchain.embeddings.huggingface import HuggingFaceEmbeddings
- with ProxyNetworkActivate('Download_LLM'): # 临时地激活代理网络
- HuggingFaceEmbeddings(model_name="GanymedeNil/text2vec-large-chinese")
-
- # < -------------------构建知识库--------------- >
-    chatbot.append(['<br/>'.join(file_manifest), "正在构建知识库..."])
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- print('Establishing knowledge archive ...')
- with ProxyNetworkActivate('Download_LLM'): # 临时地激活代理网络
- kai = knowledge_archive_interface()
- kai.feed_archive(file_manifest=file_manifest, id=kai_id)
- kai_files = kai.get_loaded_file()
-    kai_files = '<br/>'.join(kai_files)
- # chatbot.append(['知识库构建成功', "正在将知识库存储至cookie中"])
- # yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- # chatbot._cookies['langchain_plugin_embedding'] = kai.get_current_archive_id()
- # chatbot._cookies['lock_plugin'] = 'crazy_functions.Langchain知识库->读取知识库作答'
- # chatbot.append(['完成', "“根据知识库作答”函数插件已经接管问答系统, 提问吧! 但注意, 您接下来不能再使用其他插件了,刷新页面即可以退出知识库问答模式。"])
- chatbot.append(['构建完成', f"当前知识库内的有效文件:\n\n---\n\n{kai_files}\n\n---\n\n请切换至“知识库问答”插件进行知识库访问, 或者使用此插件继续上传更多文件。"])
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间,我们先及时地做一次界面更新
-
-@CatchException
-def 读取知识库作答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port=-1):
- # resolve deps
- try:
- from zh_langchain import construct_vector_store
- from langchain.embeddings.huggingface import HuggingFaceEmbeddings
- from .crazy_utils import knowledge_archive_interface
- except Exception as e:
- chatbot.append(["依赖不足", "导入依赖失败。正在尝试自动安装,请查看终端的输出或耐心等待..."])
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- from .crazy_utils import try_install_deps
- try_install_deps(['zh_langchain==0.2.1', 'pypinyin'], reload_m=['pypinyin', 'zh_langchain'])
- yield from update_ui_lastest_msg("安装完成,您可以再次重试。", chatbot, history)
- return
-
- # < ------------------- --------------- >
- kai = knowledge_archive_interface()
-
- if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg")
- kai_id = plugin_kwargs.get("advanced_arg", 'default')
- resp, prompt = kai.answer_with_archive_by_id(txt, kai_id)
-
- chatbot.append((txt, f'[知识库 {kai_id}] ' + prompt))
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间,我们先及时地做一次界面更新
- gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
- inputs=prompt, inputs_show_user=txt,
- llm_kwargs=llm_kwargs, chatbot=chatbot, history=[],
- sys_prompt=system_prompt
- )
- history.extend((prompt, gpt_say))
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间,我们先及时地做一次界面更新
diff --git "a/crazy_functions/Latex\345\205\250\346\226\207\346\266\246\350\211\262.py" "b/crazy_functions/Latex\345\205\250\346\226\207\346\266\246\350\211\262.py"
deleted file mode 100644
index b736fe896979cf3c8b08910c8bb21bfb4809c9a4..0000000000000000000000000000000000000000
--- "a/crazy_functions/Latex\345\205\250\346\226\207\346\266\246\350\211\262.py"
+++ /dev/null
@@ -1,245 +0,0 @@
-from toolbox import update_ui, trimmed_format_exc, promote_file_to_downloadzone, get_log_folder
-from toolbox import CatchException, report_exception, write_history_to_file, zip_folder
-
-
-class PaperFileGroup():
- def __init__(self):
- self.file_paths = []
- self.file_contents = []
- self.sp_file_contents = []
- self.sp_file_index = []
- self.sp_file_tag = []
-
- # count_token
- from request_llms.bridge_all import model_info
- enc = model_info["gpt-3.5-turbo"]['tokenizer']
- def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
- self.get_token_num = get_token_num
-
- def run_file_split(self, max_token_limit=1900):
- """
- 将长文本分离开来
- """
- for index, file_content in enumerate(self.file_contents):
- if self.get_token_num(file_content) < max_token_limit:
- self.sp_file_contents.append(file_content)
- self.sp_file_index.append(index)
- self.sp_file_tag.append(self.file_paths[index])
- else:
- from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
- segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit)
- for j, segment in enumerate(segments):
- self.sp_file_contents.append(segment)
- self.sp_file_index.append(index)
- self.sp_file_tag.append(self.file_paths[index] + f".part-{j}.tex")
-
- print('Segmentation: done')
- def merge_result(self):
- self.file_result = ["" for _ in range(len(self.file_paths))]
- for r, k in zip(self.sp_file_result, self.sp_file_index):
- self.file_result[k] += r
-
- def write_result(self):
- manifest = []
- for path, res in zip(self.file_paths, self.file_result):
- with open(path + '.polish.tex', 'w', encoding='utf8') as f:
- manifest.append(path + '.polish.tex')
- f.write(res)
- return manifest
-
- def zip_result(self):
- import os, time
- folder = os.path.dirname(self.file_paths[0])
- t = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
- zip_folder(folder, get_log_folder(), f'{t}-polished.zip')
-
-
-def 多文件润色(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en', mode='polish'):
- import time, os, re
- from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
-
-
- # <-------- 读取Latex文件,删除其中的所有注释 ---------->
- pfg = PaperFileGroup()
-
- for index, fp in enumerate(file_manifest):
- with open(fp, 'r', encoding='utf-8', errors='replace') as f:
- file_content = f.read()
- # 定义注释的正则表达式
-            comment_pattern = r'(?<!\\)%.*'
-            # 使用正则表达式查找注释,并替换为空字符串
-            clean_tex_content = re.sub(comment_pattern, '', file_content)
-            # 记录删除注释后的文本
-            pfg.file_paths.append(fp)
-            pfg.file_contents.append(clean_tex_content)
-
-    # <-------- 拆分过长的latex文件 ---------->
- pfg.run_file_split(max_token_limit=1024)
- n_split = len(pfg.sp_file_contents)
-
-
- # <-------- 多线程润色开始 ---------->
- if language == 'en':
- if mode == 'polish':
- inputs_array = ["Below is a section from an academic paper, polish this section to meet the academic standard, " +
- "improve the grammar, clarity and overall readability, do not modify any latex command such as \section, \cite and equations:" +
- f"\n\n{frag}" for frag in pfg.sp_file_contents]
- else:
- inputs_array = [r"Below is a section from an academic paper, proofread this section." +
- r"Do not modify any latex command such as \section, \cite, \begin, \item and equations. " +
- r"Answer me only with the revised text:" +
- f"\n\n{frag}" for frag in pfg.sp_file_contents]
- inputs_show_user_array = [f"Polish {f}" for f in pfg.sp_file_tag]
- sys_prompt_array = ["You are a professional academic paper writer." for _ in range(n_split)]
- elif language == 'zh':
- if mode == 'polish':
- inputs_array = [f"以下是一篇学术论文中的一段内容,请将此部分润色以满足学术标准,提高语法、清晰度和整体可读性,不要修改任何LaTeX命令,例如\section,\cite和方程式:" +
- f"\n\n{frag}" for frag in pfg.sp_file_contents]
- else:
- inputs_array = [f"以下是一篇学术论文中的一段内容,请对这部分内容进行语法矫正。不要修改任何LaTeX命令,例如\section,\cite和方程式:" +
- f"\n\n{frag}" for frag in pfg.sp_file_contents]
- inputs_show_user_array = [f"润色 {f}" for f in pfg.sp_file_tag]
- sys_prompt_array=["你是一位专业的中文学术论文作家。" for _ in range(n_split)]
-
-
- gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
- inputs_array=inputs_array,
- inputs_show_user_array=inputs_show_user_array,
- llm_kwargs=llm_kwargs,
- chatbot=chatbot,
- history_array=[[""] for _ in range(n_split)],
- sys_prompt_array=sys_prompt_array,
- # max_workers=5, # 并行任务数量限制,最多同时执行5个,其他的排队等待
- scroller_max_len = 80
- )
-
- # <-------- 文本碎片重组为完整的tex文件,整理结果为压缩包 ---------->
- try:
- pfg.sp_file_result = []
- for i_say, gpt_say in zip(gpt_response_collection[0::2], gpt_response_collection[1::2]):
- pfg.sp_file_result.append(gpt_say)
- pfg.merge_result()
- pfg.write_result()
- pfg.zip_result()
- except:
- print(trimmed_format_exc())
-
- # <-------- 整理结果,退出 ---------->
- create_report_file_name = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) + f"-chatgpt.polish.md"
- res = write_history_to_file(gpt_response_collection, file_basename=create_report_file_name)
- promote_file_to_downloadzone(res, chatbot=chatbot)
-
- history = gpt_response_collection
- chatbot.append((f"{fp}完成了吗?", res))
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-
-
-@CatchException
-def Latex英文润色(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- # 基本信息:功能、贡献者
- chatbot.append([
- "函数插件功能?",
- "对整个Latex项目进行润色。函数插件贡献者: Binary-Husky。(注意,此插件不调用Latex,如果有Latex环境,请使用“Latex英文纠错+高亮”插件)"])
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-
- # 尝试导入依赖,如果缺少依赖,则给出安装建议
- try:
- import tiktoken
- except:
- report_exception(chatbot, history,
- a=f"解析项目: {txt}",
- b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade tiktoken```。")
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
- history = [] # 清空历史,以免输入溢出
- import glob, os
- if os.path.exists(txt):
- project_folder = txt
- else:
- if txt == "": txt = '空空如也的输入栏'
- report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
- file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)]
- if len(file_manifest) == 0:
- report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}")
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
- yield from 多文件润色(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en')
-
-
-
-
-
-
-@CatchException
-def Latex中文润色(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- # 基本信息:功能、贡献者
- chatbot.append([
- "函数插件功能?",
- "对整个Latex项目进行润色。函数插件贡献者: Binary-Husky"])
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-
- # 尝试导入依赖,如果缺少依赖,则给出安装建议
- try:
- import tiktoken
- except:
- report_exception(chatbot, history,
- a=f"解析项目: {txt}",
- b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade tiktoken```。")
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
- history = [] # 清空历史,以免输入溢出
- import glob, os
- if os.path.exists(txt):
- project_folder = txt
- else:
- if txt == "": txt = '空空如也的输入栏'
- report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
- file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)]
- if len(file_manifest) == 0:
- report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}")
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
- yield from 多文件润色(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='zh')
-
-
-
-
-@CatchException
-def Latex英文纠错(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- # 基本信息:功能、贡献者
- chatbot.append([
- "函数插件功能?",
- "对整个Latex项目进行纠错。函数插件贡献者: Binary-Husky"])
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-
- # 尝试导入依赖,如果缺少依赖,则给出安装建议
- try:
- import tiktoken
- except:
- report_exception(chatbot, history,
- a=f"解析项目: {txt}",
- b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade tiktoken```。")
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
- history = [] # 清空历史,以免输入溢出
- import glob, os
- if os.path.exists(txt):
- project_folder = txt
- else:
- if txt == "": txt = '空空如也的输入栏'
- report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
- file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)]
- if len(file_manifest) == 0:
- report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}")
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
- yield from 多文件润色(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en', mode='proofread')
-
-
-
diff --git "a/crazy_functions/Latex\345\205\250\346\226\207\347\277\273\350\257\221.py" "b/crazy_functions/Latex\345\205\250\346\226\207\347\277\273\350\257\221.py"
deleted file mode 100644
index 49470c864e59b790b09789b97227e7b00768ccfd..0000000000000000000000000000000000000000
--- "a/crazy_functions/Latex\345\205\250\346\226\207\347\277\273\350\257\221.py"
+++ /dev/null
@@ -1,176 +0,0 @@
-from toolbox import update_ui, promote_file_to_downloadzone
-from toolbox import CatchException, report_exception, write_history_to_file
-fast_debug = False
-
-class PaperFileGroup():
- def __init__(self):
- self.file_paths = []
- self.file_contents = []
- self.sp_file_contents = []
- self.sp_file_index = []
- self.sp_file_tag = []
-
- # count_token
- from request_llms.bridge_all import model_info
- enc = model_info["gpt-3.5-turbo"]['tokenizer']
- def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
- self.get_token_num = get_token_num
-
- def run_file_split(self, max_token_limit=1900):
- """
- 将长文本分离开来
- """
- for index, file_content in enumerate(self.file_contents):
- if self.get_token_num(file_content) < max_token_limit:
- self.sp_file_contents.append(file_content)
- self.sp_file_index.append(index)
- self.sp_file_tag.append(self.file_paths[index])
- else:
- from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
- segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit)
- for j, segment in enumerate(segments):
- self.sp_file_contents.append(segment)
- self.sp_file_index.append(index)
- self.sp_file_tag.append(self.file_paths[index] + f".part-{j}.tex")
-
- print('Segmentation: done')
-
-def 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en'):
- import time, os, re
- from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
-
- # <-------- 读取Latex文件,删除其中的所有注释 ---------->
- pfg = PaperFileGroup()
-
- for index, fp in enumerate(file_manifest):
- with open(fp, 'r', encoding='utf-8', errors='replace') as f:
- file_content = f.read()
- # 定义注释的正则表达式
-            comment_pattern = r'(?<!\\)%.*'
-            # 使用正则表达式查找注释,并替换为空字符串
-            clean_tex_content = re.sub(comment_pattern, '', file_content)
-            # 记录删除注释后的文本
-            pfg.file_paths.append(fp)
-            pfg.file_contents.append(clean_tex_content)
-
-    # <-------- 拆分过长的latex文件 ---------->
- pfg.run_file_split(max_token_limit=1024)
- n_split = len(pfg.sp_file_contents)
-
- # <-------- 抽取摘要 ---------->
- # if language == 'en':
- # abs_extract_inputs = f"Please write an abstract for this paper"
-
- # # 单线,获取文章meta信息
- # paper_meta_info = yield from request_gpt_model_in_new_thread_with_ui_alive(
- # inputs=abs_extract_inputs,
- # inputs_show_user=f"正在抽取摘要信息。",
- # llm_kwargs=llm_kwargs,
- # chatbot=chatbot, history=[],
- # sys_prompt="Your job is to collect information from materials。",
- # )
-
- # <-------- 多线程润色开始 ---------->
- if language == 'en->zh':
- inputs_array = ["Below is a section from an English academic paper, translate it into Chinese, do not modify any latex command such as \section, \cite and equations:" +
- f"\n\n{frag}" for frag in pfg.sp_file_contents]
- inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag]
- sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)]
- elif language == 'zh->en':
- inputs_array = [f"Below is a section from a Chinese academic paper, translate it into English, do not modify any latex command such as \section, \cite and equations:" +
- f"\n\n{frag}" for frag in pfg.sp_file_contents]
- inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag]
- sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)]
-
- gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
- inputs_array=inputs_array,
- inputs_show_user_array=inputs_show_user_array,
- llm_kwargs=llm_kwargs,
- chatbot=chatbot,
- history_array=[[""] for _ in range(n_split)],
- sys_prompt_array=sys_prompt_array,
- # max_workers=5, # OpenAI所允许的最大并行过载
- scroller_max_len = 80
- )
-
- # <-------- 整理结果,退出 ---------->
- create_report_file_name = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) + f"-chatgpt.polish.md"
- res = write_history_to_file(gpt_response_collection, create_report_file_name)
- promote_file_to_downloadzone(res, chatbot=chatbot)
- history = gpt_response_collection
- chatbot.append((f"{fp}完成了吗?", res))
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-
-
-
-
-
-@CatchException
-def Latex英译中(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- # 基本信息:功能、贡献者
- chatbot.append([
- "函数插件功能?",
- "对整个Latex项目进行翻译。函数插件贡献者: Binary-Husky"])
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-
- # 尝试导入依赖,如果缺少依赖,则给出安装建议
- try:
- import tiktoken
- except:
- report_exception(chatbot, history,
- a=f"解析项目: {txt}",
- b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade tiktoken```。")
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
- history = [] # 清空历史,以免输入溢出
- import glob, os
- if os.path.exists(txt):
- project_folder = txt
- else:
- if txt == "": txt = '空空如也的输入栏'
- report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
- file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)]
- if len(file_manifest) == 0:
- report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}")
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
- yield from 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en->zh')
-
-
-
-
-
-@CatchException
-def Latex中译英(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- # 基本信息:功能、贡献者
- chatbot.append([
- "函数插件功能?",
- "对整个Latex项目进行翻译。函数插件贡献者: Binary-Husky"])
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-
- # 尝试导入依赖,如果缺少依赖,则给出安装建议
- try:
- import tiktoken
- except:
- report_exception(chatbot, history,
- a=f"解析项目: {txt}",
- b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade tiktoken```。")
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
- history = [] # 清空历史,以免输入溢出
- import glob, os
- if os.path.exists(txt):
- project_folder = txt
- else:
- if txt == "": txt = '空空如也的输入栏'
- report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
- file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)]
- if len(file_manifest) == 0:
- report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}")
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
- yield from 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='zh->en')
\ No newline at end of file
diff --git "a/crazy_functions/Latex\350\276\223\345\207\272PDF\347\273\223\346\236\234.py" "b/crazy_functions/Latex\350\276\223\345\207\272PDF\347\273\223\346\236\234.py"
deleted file mode 100644
index 18a8d1bab26af31e7ac0671b95c91660d6d7f02d..0000000000000000000000000000000000000000
--- "a/crazy_functions/Latex\350\276\223\345\207\272PDF\347\273\223\346\236\234.py"
+++ /dev/null
@@ -1,306 +0,0 @@
-from toolbox import update_ui, trimmed_format_exc, get_conf, get_log_folder, promote_file_to_downloadzone
-from toolbox import CatchException, report_exception, update_ui_lastest_msg, zip_result, gen_time_str
-from functools import partial
-import glob, os, requests, time
-pj = os.path.join
-ARXIV_CACHE_DIR = os.path.expanduser(f"~/arxiv_cache/")
-
-# =================================== 工具函数 ===============================================
-# 专业词汇声明 = 'If the term "agent" is used in this section, it should be translated to "智能体". '
-def switch_prompt(pfg, mode, more_requirement):
- """
- Generate prompts and system prompts based on the mode for proofreading or translating.
- Args:
- - pfg: Proofreader or Translator instance.
- - mode: A string specifying the mode, either 'proofread' or 'translate_zh'.
-
- Returns:
- - inputs_array: A list of strings containing prompts for users to respond to.
- - sys_prompt_array: A list of strings containing prompts for system prompts.
- """
- n_split = len(pfg.sp_file_contents)
- if mode == 'proofread_en':
- inputs_array = [r"Below is a section from an academic paper, proofread this section." +
- r"Do not modify any latex command such as \section, \cite, \begin, \item and equations. " + more_requirement +
- r"Answer me only with the revised text:" +
- f"\n\n{frag}" for frag in pfg.sp_file_contents]
- sys_prompt_array = ["You are a professional academic paper writer." for _ in range(n_split)]
- elif mode == 'translate_zh':
- inputs_array = [r"Below is a section from an English academic paper, translate it into Chinese. " + more_requirement +
- r"Do not modify any latex command such as \section, \cite, \begin, \item and equations. " +
- r"Answer me only with the translated text:" +
- f"\n\n{frag}" for frag in pfg.sp_file_contents]
- sys_prompt_array = ["You are a professional translator." for _ in range(n_split)]
- else:
- assert False, "未知指令"
- return inputs_array, sys_prompt_array
-
-def desend_to_extracted_folder_if_exist(project_folder):
- """
- Descend into the extracted folder if it exists, otherwise return the original folder.
-
- Args:
- - project_folder: A string specifying the folder path.
-
- Returns:
- - A string specifying the path to the extracted folder, or the original folder if there is no extracted folder.
- """
- maybe_dir = [f for f in glob.glob(f'{project_folder}/*') if os.path.isdir(f)]
- if len(maybe_dir) == 0: return project_folder
- if maybe_dir[0].endswith('.extract'): return maybe_dir[0]
- return project_folder
-
-def move_project(project_folder, arxiv_id=None):
- """
- Create a new work folder and copy the project folder to it.
-
- Args:
- - project_folder: A string specifying the folder path of the project.
-
- Returns:
- - A string specifying the path to the new work folder.
- """
- import shutil, time
- time.sleep(2) # avoid time string conflict
- if arxiv_id is not None:
- new_workfolder = pj(ARXIV_CACHE_DIR, arxiv_id, 'workfolder')
- else:
- new_workfolder = f'{get_log_folder()}/{gen_time_str()}'
- try:
- shutil.rmtree(new_workfolder)
- except:
- pass
-
- # align subfolder if there is a folder wrapper
- items = glob.glob(pj(project_folder,'*'))
- items = [item for item in items if os.path.basename(item)!='__MACOSX']
- if len(glob.glob(pj(project_folder,'*.tex'))) == 0 and len(items) == 1:
- if os.path.isdir(items[0]): project_folder = items[0]
-
- shutil.copytree(src=project_folder, dst=new_workfolder)
- return new_workfolder
-
-def arxiv_download(chatbot, history, txt, allow_cache=True):
- def check_cached_translation_pdf(arxiv_id):
- translation_dir = pj(ARXIV_CACHE_DIR, arxiv_id, 'translation')
- if not os.path.exists(translation_dir):
- os.makedirs(translation_dir)
- target_file = pj(translation_dir, 'translate_zh.pdf')
- if os.path.exists(target_file):
- promote_file_to_downloadzone(target_file, rename_file=None, chatbot=chatbot)
- target_file_compare = pj(translation_dir, 'comparison.pdf')
- if os.path.exists(target_file_compare):
- promote_file_to_downloadzone(target_file_compare, rename_file=None, chatbot=chatbot)
- return target_file
- return False
- def is_float(s):
- try:
- float(s)
- return True
- except ValueError:
- return False
- if ('.' in txt) and ('/' not in txt) and is_float(txt): # is arxiv ID
- txt = 'https://arxiv.org/abs/' + txt.strip()
- if ('.' in txt) and ('/' not in txt) and is_float(txt[:10]): # is arxiv ID
- txt = 'https://arxiv.org/abs/' + txt[:10]
- if not txt.startswith('https://arxiv.org'):
- return txt, None
-
- # <-------------- inspect format ------------->
- chatbot.append([f"检测到arxiv文档连接", '尝试下载 ...'])
- yield from update_ui(chatbot=chatbot, history=history)
- time.sleep(1) # 刷新界面
-
- url_ = txt # https://arxiv.org/abs/1707.06690
- if not txt.startswith('https://arxiv.org/abs/'):
- msg = f"解析arxiv网址失败, 期望格式例如: https://arxiv.org/abs/1707.06690。实际得到格式: {url_}。"
- yield from update_ui_lastest_msg(msg, chatbot=chatbot, history=history) # 刷新界面
- return msg, None
- # <-------------- set format ------------->
- arxiv_id = url_.split('/abs/')[-1]
- if 'v' in arxiv_id: arxiv_id = arxiv_id[:10]
- cached_translation_pdf = check_cached_translation_pdf(arxiv_id)
- if cached_translation_pdf and allow_cache: return cached_translation_pdf, arxiv_id
-
- url_tar = url_.replace('/abs/', '/e-print/')
- translation_dir = pj(ARXIV_CACHE_DIR, arxiv_id, 'e-print')
- extract_dst = pj(ARXIV_CACHE_DIR, arxiv_id, 'extract')
- os.makedirs(translation_dir, exist_ok=True)
-
- # <-------------- download arxiv source file ------------->
- dst = pj(translation_dir, arxiv_id+'.tar')
- if os.path.exists(dst):
- yield from update_ui_lastest_msg("调用缓存", chatbot=chatbot, history=history) # 刷新界面
- else:
- yield from update_ui_lastest_msg("开始下载", chatbot=chatbot, history=history) # 刷新界面
- proxies = get_conf('proxies')
- r = requests.get(url_tar, proxies=proxies)
- with open(dst, 'wb+') as f:
- f.write(r.content)
- # <-------------- extract file ------------->
- yield from update_ui_lastest_msg("下载完成", chatbot=chatbot, history=history) # 刷新界面
- from toolbox import extract_archive
- extract_archive(file_path=dst, dest_dir=extract_dst)
- return extract_dst, arxiv_id
-# ========================================= 插件主程序1 =====================================================
-
-
-@CatchException
-def Latex英文纠错加PDF对比(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- # <-------------- information about this plugin ------------->
- chatbot.append([ "函数插件功能?",
- "对整个Latex项目进行纠错, 用latex编译为PDF对修正处做高亮。函数插件贡献者: Binary-Husky。注意事项: 目前仅支持GPT3.5/GPT4,其他模型转化效果未知。目前对机器学习类文献转化效果最好,其他类型文献转化效果未知。仅在Windows系统进行了测试,其他操作系统表现未知。"])
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-
- # <-------------- more requirements ------------->
- if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg")
- more_req = plugin_kwargs.get("advanced_arg", "")
- _switch_prompt_ = partial(switch_prompt, more_requirement=more_req)
-
- # <-------------- check deps ------------->
- try:
- import glob, os, time, subprocess
- subprocess.Popen(['pdflatex', '-version'])
- from .latex_fns.latex_actions import Latex精细分解与转化, 编译Latex
- except Exception as e:
- chatbot.append([ f"解析项目: {txt}",
- f"尝试执行Latex指令失败。Latex没有安装, 或者不在环境变量PATH中。安装方法https://tug.org/texlive/。报错信息\n\n```\n\n{trimmed_format_exc()}\n\n```\n\n"])
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
-
-
- # <-------------- clear history and read input ------------->
- history = []
- if os.path.exists(txt):
- project_folder = txt
- else:
- if txt == "": txt = '空空如也的输入栏'
- report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
- file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)]
- if len(file_manifest) == 0:
- report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}")
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
-
-
- # <-------------- if is a zip/tar file ------------->
- project_folder = desend_to_extracted_folder_if_exist(project_folder)
-
-
- # <-------------- move latex project away from temp folder ------------->
- project_folder = move_project(project_folder, arxiv_id=None)
-
-
- # <-------------- if merge_translate_zh is already generated, skip gpt req ------------->
- if not os.path.exists(project_folder + '/merge_proofread_en.tex'):
- yield from Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs,
- chatbot, history, system_prompt, mode='proofread_en', switch_prompt=_switch_prompt_)
-
-
- # <-------------- compile PDF ------------->
- success = yield from 编译Latex(chatbot, history, main_file_original='merge', main_file_modified='merge_proofread_en',
- work_folder_original=project_folder, work_folder_modified=project_folder, work_folder=project_folder)
-
-
- # <-------------- zip PDF ------------->
- zip_res = zip_result(project_folder)
- if success:
- chatbot.append((f"成功啦", '请查收结果(压缩包)...'))
- yield from update_ui(chatbot=chatbot, history=history); time.sleep(1) # 刷新界面
- promote_file_to_downloadzone(file=zip_res, chatbot=chatbot)
- else:
- chatbot.append((f"失败了", '虽然PDF生成失败了, 但请查收结果(压缩包), 内含已经翻译的Tex文档, 也是可读的, 您可以到Github Issue区, 用该压缩包+对话历史存档进行反馈 ...'))
- yield from update_ui(chatbot=chatbot, history=history); time.sleep(1) # 刷新界面
- promote_file_to_downloadzone(file=zip_res, chatbot=chatbot)
-
- # <-------------- we are done ------------->
- return success
-
-# ========================================= 插件主程序2 =====================================================
-
-@CatchException
-def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- # <-------------- information about this plugin ------------->
- chatbot.append([
- "函数插件功能?",
- "对整个Latex项目进行翻译, 生成中文PDF。函数插件贡献者: Binary-Husky。注意事项: 此插件Windows支持最佳,Linux下必须使用Docker安装,详见项目主README.md。目前仅支持GPT3.5/GPT4,其他模型转化效果未知。目前对机器学习类文献转化效果最好,其他类型文献转化效果未知。"])
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-
- # <-------------- more requirements ------------->
- if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg")
- more_req = plugin_kwargs.get("advanced_arg", "")
- no_cache = more_req.startswith("--no-cache")
- if no_cache: more_req.lstrip("--no-cache")
- allow_cache = not no_cache
- _switch_prompt_ = partial(switch_prompt, more_requirement=more_req)
-
- # <-------------- check deps ------------->
- try:
- import glob, os, time, subprocess
- subprocess.Popen(['pdflatex', '-version'])
- from .latex_fns.latex_actions import Latex精细分解与转化, 编译Latex
- except Exception as e:
- chatbot.append([ f"解析项目: {txt}",
- f"尝试执行Latex指令失败。Latex没有安装, 或者不在环境变量PATH中。安装方法https://tug.org/texlive/。报错信息\n\n```\n\n{trimmed_format_exc()}\n\n```\n\n"])
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
-
-
- # <-------------- clear history and read input ------------->
- history = []
- txt, arxiv_id = yield from arxiv_download(chatbot, history, txt, allow_cache)
- if txt.endswith('.pdf'):
- report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"发现已经存在翻译好的PDF文档")
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
-
-
- if os.path.exists(txt):
- project_folder = txt
- else:
- if txt == "": txt = '空空如也的输入栏'
- report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无法处理: {txt}")
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
-
- file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)]
- if len(file_manifest) == 0:
- report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}")
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- return
-
-
- # <-------------- if is a zip/tar file ------------->
- project_folder = desend_to_extracted_folder_if_exist(project_folder)
-
-
- # <-------------- move latex project away from temp folder ------------->
- project_folder = move_project(project_folder, arxiv_id)
-
-
- # <-------------- if merge_translate_zh is already generated, skip gpt req ------------->
- if not os.path.exists(project_folder + '/merge_translate_zh.tex'):
- yield from Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs,
- chatbot, history, system_prompt, mode='translate_zh', switch_prompt=_switch_prompt_)
-
-
- # <-------------- compile PDF ------------->
- success = yield from 编译Latex(chatbot, history, main_file_original='merge', main_file_modified='merge_translate_zh', mode='translate_zh',
- work_folder_original=project_folder, work_folder_modified=project_folder, work_folder=project_folder)
-
- # <-------------- zip PDF ------------->
- zip_res = zip_result(project_folder)
- if success:
- chatbot.append((f"成功啦", '请查收结果(压缩包)...'))
- yield from update_ui(chatbot=chatbot, history=history); time.sleep(1) # 刷新界面
- promote_file_to_downloadzone(file=zip_res, chatbot=chatbot)
- else:
- chatbot.append((f"失败了", '虽然PDF生成失败了, 但请查收结果(压缩包), 内含已经翻译的Tex文档, 您可以到Github Issue区, 用该压缩包进行反馈。如系统是Linux,请检查系统字体(见Github wiki) ...'))
- yield from update_ui(chatbot=chatbot, history=history); time.sleep(1) # 刷新界面
- promote_file_to_downloadzone(file=zip_res, chatbot=chatbot)
-
-
- # <-------------- we are done ------------->
- return success
diff --git a/crazy_functions/__init__.py b/crazy_functions/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/crazy_functions/agent_fns/auto_agent.py b/crazy_functions/agent_fns/auto_agent.py
deleted file mode 100644
index 4f8fda9d5872db9c178321d43415b24dbea024bb..0000000000000000000000000000000000000000
--- a/crazy_functions/agent_fns/auto_agent.py
+++ /dev/null
@@ -1,23 +0,0 @@
-from toolbox import CatchException, update_ui, gen_time_str, trimmed_format_exc, ProxyNetworkActivate
-from toolbox import report_exception, get_log_folder, update_ui_lastest_msg, Singleton
-from crazy_functions.agent_fns.pipe import PluginMultiprocessManager, PipeCom
-from crazy_functions.agent_fns.general import AutoGenGeneral
-
-
-
-class AutoGenMath(AutoGenGeneral):
-
- def define_agents(self):
- from autogen import AssistantAgent, UserProxyAgent
- return [
- {
- "name": "assistant", # name of the agent.
- "cls": AssistantAgent, # class of the agent.
- },
- {
- "name": "user_proxy", # name of the agent.
- "cls": UserProxyAgent, # class of the agent.
- "human_input_mode": "ALWAYS", # always ask for human input.
- "llm_config": False, # disables llm-based auto reply.
- },
- ]
\ No newline at end of file
diff --git a/crazy_functions/agent_fns/echo_agent.py b/crazy_functions/agent_fns/echo_agent.py
deleted file mode 100644
index 52bf72debc7a56a89b277ced80078ea6b985e1fa..0000000000000000000000000000000000000000
--- a/crazy_functions/agent_fns/echo_agent.py
+++ /dev/null
@@ -1,19 +0,0 @@
-from crazy_functions.agent_fns.pipe import PluginMultiprocessManager, PipeCom
-
-class EchoDemo(PluginMultiprocessManager):
- def subprocess_worker(self, child_conn):
- # ⭐⭐ 子进程
- self.child_conn = child_conn
- while True:
- msg = self.child_conn.recv() # PipeCom
- if msg.cmd == "user_input":
- # wait futher user input
- self.child_conn.send(PipeCom("show", msg.content))
- wait_success = self.subprocess_worker_wait_user_feedback(wait_msg="我准备好处理下一个问题了.")
- if not wait_success:
- # wait timeout, terminate this subprocess_worker
- break
- elif msg.cmd == "terminate":
- self.child_conn.send(PipeCom("done", ""))
- break
- print('[debug] subprocess_worker terminated')
\ No newline at end of file
diff --git a/crazy_functions/agent_fns/general.py b/crazy_functions/agent_fns/general.py
deleted file mode 100644
index 49bc4dc89e9e1244891c15ff73bb0ae065d51821..0000000000000000000000000000000000000000
--- a/crazy_functions/agent_fns/general.py
+++ /dev/null
@@ -1,134 +0,0 @@
-from toolbox import trimmed_format_exc, get_conf, ProxyNetworkActivate
-from crazy_functions.agent_fns.pipe import PluginMultiprocessManager, PipeCom
-from request_llms.bridge_all import predict_no_ui_long_connection
-import time
-
-def gpt_academic_generate_oai_reply(
- self,
- messages,
- sender,
- config,
-):
- llm_config = self.llm_config if config is None else config
- if llm_config is False:
- return False, None
- if messages is None:
- messages = self._oai_messages[sender]
-
- inputs = messages[-1]['content']
- history = []
- for message in messages[:-1]:
- history.append(message['content'])
- context=messages[-1].pop("context", None)
- assert context is None, "预留参数 context 未实现"
-
- reply = predict_no_ui_long_connection(
- inputs=inputs,
- llm_kwargs=llm_config,
- history=history,
- sys_prompt=self._oai_system_message[0]['content'],
- console_slience=True
- )
- assumed_done = reply.endswith('\nTERMINATE')
- return True, reply
-
-class AutoGenGeneral(PluginMultiprocessManager):
- def gpt_academic_print_override(self, user_proxy, message, sender):
- # ⭐⭐ run in subprocess
- self.child_conn.send(PipeCom("show", sender.name + "\n\n---\n\n" + message["content"]))
-
- def gpt_academic_get_human_input(self, user_proxy, message):
- # ⭐⭐ run in subprocess
- patience = 300
- begin_waiting_time = time.time()
- self.child_conn.send(PipeCom("interact", message))
- while True:
- time.sleep(0.5)
- if self.child_conn.poll():
- wait_success = True
- break
- if time.time() - begin_waiting_time > patience:
- self.child_conn.send(PipeCom("done", ""))
- wait_success = False
- break
- if wait_success:
- return self.child_conn.recv().content
- else:
- raise TimeoutError("等待用户输入超时")
-
- def define_agents(self):
- raise NotImplementedError
-
- def exe_autogen(self, input):
- # ⭐⭐ run in subprocess
- input = input.content
- with ProxyNetworkActivate("AutoGen"):
- code_execution_config = {"work_dir": self.autogen_work_dir, "use_docker": self.use_docker}
- agents = self.define_agents()
- user_proxy = None
- assistant = None
- for agent_kwargs in agents:
- agent_cls = agent_kwargs.pop('cls')
- kwargs = {
- 'llm_config':self.llm_kwargs,
- 'code_execution_config':code_execution_config
- }
- kwargs.update(agent_kwargs)
- agent_handle = agent_cls(**kwargs)
- agent_handle._print_received_message = lambda a,b: self.gpt_academic_print_override(agent_kwargs, a, b)
- for d in agent_handle._reply_func_list:
- if hasattr(d['reply_func'],'__name__') and d['reply_func'].__name__ == 'generate_oai_reply':
- d['reply_func'] = gpt_academic_generate_oai_reply
- if agent_kwargs['name'] == 'user_proxy':
- agent_handle.get_human_input = lambda a: self.gpt_academic_get_human_input(user_proxy, a)
- user_proxy = agent_handle
- if agent_kwargs['name'] == 'assistant': assistant = agent_handle
- try:
- if user_proxy is None or assistant is None: raise Exception("用户代理或助理代理未定义")
- user_proxy.initiate_chat(assistant, message=input)
- except Exception as e:
- tb_str = '```\n' + trimmed_format_exc() + '```'
- self.child_conn.send(PipeCom("done", "AutoGen 执行失败: \n\n" + tb_str))
-
- def subprocess_worker(self, child_conn):
- # ⭐⭐ run in subprocess
- self.child_conn = child_conn
- while True:
- msg = self.child_conn.recv() # PipeCom
- self.exe_autogen(msg)
-
-
-class AutoGenGroupChat(AutoGenGeneral):
- def exe_autogen(self, input):
- # ⭐⭐ run in subprocess
- import autogen
-
- input = input.content
- with ProxyNetworkActivate("AutoGen"):
- code_execution_config = {"work_dir": self.autogen_work_dir, "use_docker": self.use_docker}
- agents = self.define_agents()
- agents_instances = []
- for agent_kwargs in agents:
- agent_cls = agent_kwargs.pop("cls")
- kwargs = {"code_execution_config": code_execution_config}
- kwargs.update(agent_kwargs)
- agent_handle = agent_cls(**kwargs)
- agent_handle._print_received_message = lambda a, b: self.gpt_academic_print_override(agent_kwargs, a, b)
- agents_instances.append(agent_handle)
- if agent_kwargs["name"] == "user_proxy":
- user_proxy = agent_handle
- user_proxy.get_human_input = lambda a: self.gpt_academic_get_human_input(user_proxy, a)
- try:
- groupchat = autogen.GroupChat(agents=agents_instances, messages=[], max_round=50)
- manager = autogen.GroupChatManager(groupchat=groupchat, **self.define_group_chat_manager_config())
- manager._print_received_message = lambda a, b: self.gpt_academic_print_override(agent_kwargs, a, b)
- manager.get_human_input = lambda a: self.gpt_academic_get_human_input(manager, a)
- if user_proxy is None:
- raise Exception("user_proxy is not defined")
- user_proxy.initiate_chat(manager, message=input)
- except Exception:
- tb_str = "```\n" + trimmed_format_exc() + "```"
- self.child_conn.send(PipeCom("done", "AutoGen exe failed: \n\n" + tb_str))
-
- def define_group_chat_manager_config(self):
- raise NotImplementedError
diff --git a/crazy_functions/agent_fns/persistent.py b/crazy_functions/agent_fns/persistent.py
deleted file mode 100644
index 82c869cb18ceba5c56e05d3d8b18bb968cf3b35e..0000000000000000000000000000000000000000
--- a/crazy_functions/agent_fns/persistent.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from toolbox import Singleton
-@Singleton
-class GradioMultiuserManagerForPersistentClasses():
- def __init__(self):
- self.mapping = {}
-
- def already_alive(self, key):
- return (key in self.mapping) and (self.mapping[key].is_alive())
-
- def set(self, key, x):
- self.mapping[key] = x
- return self.mapping[key]
-
- def get(self, key):
- return self.mapping[key]
-
diff --git a/crazy_functions/agent_fns/pipe.py b/crazy_functions/agent_fns/pipe.py
deleted file mode 100644
index bb3bc78520d50b0a7995d0390208f69867c5b7e1..0000000000000000000000000000000000000000
--- a/crazy_functions/agent_fns/pipe.py
+++ /dev/null
@@ -1,194 +0,0 @@
-from toolbox import get_log_folder, update_ui, gen_time_str, get_conf, promote_file_to_downloadzone
-from crazy_functions.agent_fns.watchdog import WatchDog
-import time, os
-
-class PipeCom:
- def __init__(self, cmd, content) -> None:
- self.cmd = cmd
- self.content = content
-
-
-class PluginMultiprocessManager:
- def __init__(self, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- # ⭐ run in main process
- self.autogen_work_dir = os.path.join(get_log_folder("autogen"), gen_time_str())
- self.previous_work_dir_files = {}
- self.llm_kwargs = llm_kwargs
- self.plugin_kwargs = plugin_kwargs
- self.chatbot = chatbot
- self.history = history
- self.system_prompt = system_prompt
- # self.web_port = web_port
- self.alive = True
- self.use_docker = get_conf("AUTOGEN_USE_DOCKER")
- self.last_user_input = ""
- # create a thread to monitor self.heartbeat, terminate the instance if no heartbeat for a long time
- timeout_seconds = 5 * 60
- self.heartbeat_watchdog = WatchDog(timeout=timeout_seconds, bark_fn=self.terminate, interval=5)
- self.heartbeat_watchdog.begin_watch()
-
- def feed_heartbeat_watchdog(self):
- # feed this `dog`, so the dog will not `bark` (bark_fn will terminate the instance)
- self.heartbeat_watchdog.feed()
-
- def is_alive(self):
- return self.alive
-
- def launch_subprocess_with_pipe(self):
- # ⭐ run in main process
- from multiprocessing import Process, Pipe
-
- parent_conn, child_conn = Pipe()
- self.p = Process(target=self.subprocess_worker, args=(child_conn,))
- self.p.daemon = True
- self.p.start()
- return parent_conn
-
- def terminate(self):
- self.p.terminate()
- self.alive = False
- print("[debug] instance terminated")
-
- def subprocess_worker(self, child_conn):
- # ⭐⭐ run in subprocess
- raise NotImplementedError
-
- def send_command(self, cmd):
- # ⭐ run in main process
- repeated = False
- if cmd == self.last_user_input:
- repeated = True
- cmd = ""
- else:
- self.last_user_input = cmd
- self.parent_conn.send(PipeCom("user_input", cmd))
- return repeated, cmd
-
- def immediate_showoff_when_possible(self, fp):
- # ⭐ 主进程
- # 获取fp的拓展名
- file_type = fp.split('.')[-1]
- # 如果是文本文件, 则直接显示文本内容
- if file_type.lower() in ['png', 'jpg']:
- image_path = os.path.abspath(fp)
- self.chatbot.append([
- '检测到新生图像:',
- f'本地文件预览:

'
- ])
- yield from update_ui(chatbot=self.chatbot, history=self.history)
-
- def overwatch_workdir_file_change(self):
- # ⭐ 主进程 Docker 外挂文件夹监控
- path_to_overwatch = self.autogen_work_dir
- change_list = []
- # 扫描路径下的所有文件, 并与self.previous_work_dir_files中所记录的文件进行对比,
- # 如果有新文件出现,或者文件的修改时间发生变化,则更新self.previous_work_dir_files中
- # 把新文件和发生变化的文件的路径记录到 change_list 中
- for root, dirs, files in os.walk(path_to_overwatch):
- for file in files:
- file_path = os.path.join(root, file)
- if file_path not in self.previous_work_dir_files.keys():
- last_modified_time = os.stat(file_path).st_mtime
- self.previous_work_dir_files.update({file_path: last_modified_time})
- change_list.append(file_path)
- else:
- last_modified_time = os.stat(file_path).st_mtime
- if last_modified_time != self.previous_work_dir_files[file_path]:
- self.previous_work_dir_files[file_path] = last_modified_time
- change_list.append(file_path)
- if len(change_list) > 0:
- file_links = ""
- for f in change_list:
- res = promote_file_to_downloadzone(f)
- file_links += f'
{res}'
- yield from self.immediate_showoff_when_possible(f)
-
- self.chatbot.append(['检测到新生文档.', f'文档清单如下: {file_links}'])
- yield from update_ui(chatbot=self.chatbot, history=self.history)
- return change_list
-
-
- def main_process_ui_control(self, txt, create_or_resume) -> str:
- # ⭐ 主进程
- if create_or_resume == 'create':
- self.cnt = 1
- self.parent_conn = self.launch_subprocess_with_pipe() # ⭐⭐⭐
- repeated, cmd_to_autogen = self.send_command(txt)
- if txt == 'exit':
- self.chatbot.append([f"结束", "结束信号已明确,终止AutoGen程序。"])
- yield from update_ui(chatbot=self.chatbot, history=self.history)
- self.terminate()
- return "terminate"
-
- # patience = 10
-
- while True:
- time.sleep(0.5)
- if not self.alive:
- # the heartbeat watchdog might have it killed
- self.terminate()
- return "terminate"
- if self.parent_conn.poll():
- self.feed_heartbeat_watchdog()
- if "[GPT-Academic] 等待中" in self.chatbot[-1][-1]:
- self.chatbot.pop(-1) # remove the last line
- if "等待您的进一步指令" in self.chatbot[-1][-1]:
- self.chatbot.pop(-1) # remove the last line
- if '[GPT-Academic] 等待中' in self.chatbot[-1][-1]:
- self.chatbot.pop(-1) # remove the last line
- msg = self.parent_conn.recv() # PipeCom
- if msg.cmd == "done":
- self.chatbot.append([f"结束", msg.content])
- self.cnt += 1
- yield from update_ui(chatbot=self.chatbot, history=self.history)
- self.terminate()
- break
- if msg.cmd == "show":
- yield from self.overwatch_workdir_file_change()
- notice = ""
- if repeated: notice = "(自动忽略重复的输入)"
- self.chatbot.append([f"运行阶段-{self.cnt}(上次用户反馈输入为: 「{cmd_to_autogen}」{notice}", msg.content])
- self.cnt += 1
- yield from update_ui(chatbot=self.chatbot, history=self.history)
- if msg.cmd == "interact":
- yield from self.overwatch_workdir_file_change()
- self.chatbot.append([f"程序抵达用户反馈节点.", msg.content +
- "\n\n等待您的进一步指令." +
- "\n\n(1) 一般情况下您不需要说什么, 清空输入区, 然后直接点击“提交”以继续. " +
- "\n\n(2) 如果您需要补充些什么, 输入要反馈的内容, 直接点击“提交”以继续. " +
- "\n\n(3) 如果您想终止程序, 输入exit, 直接点击“提交”以终止AutoGen并解锁. "
- ])
- yield from update_ui(chatbot=self.chatbot, history=self.history)
- # do not terminate here, leave the subprocess_worker instance alive
- return "wait_feedback"
- else:
- self.feed_heartbeat_watchdog()
- if '[GPT-Academic] 等待中' not in self.chatbot[-1][-1]:
- # begin_waiting_time = time.time()
- self.chatbot.append(["[GPT-Academic] 等待AutoGen执行结果 ...", "[GPT-Academic] 等待中"])
- self.chatbot[-1] = [self.chatbot[-1][0], self.chatbot[-1][1].replace("[GPT-Academic] 等待中", "[GPT-Academic] 等待中.")]
- yield from update_ui(chatbot=self.chatbot, history=self.history)
- # if time.time() - begin_waiting_time > patience:
- # self.chatbot.append([f"结束", "等待超时, 终止AutoGen程序。"])
- # yield from update_ui(chatbot=self.chatbot, history=self.history)
- # self.terminate()
- # return "terminate"
-
- self.terminate()
- return "terminate"
-
- def subprocess_worker_wait_user_feedback(self, wait_msg="wait user feedback"):
- # ⭐⭐ run in subprocess
- patience = 5 * 60
- begin_waiting_time = time.time()
- self.child_conn.send(PipeCom("interact", wait_msg))
- while True:
- time.sleep(0.5)
- if self.child_conn.poll():
- wait_success = True
- break
- if time.time() - begin_waiting_time > patience:
- self.child_conn.send(PipeCom("done", ""))
- wait_success = False
- break
- return wait_success
diff --git a/crazy_functions/agent_fns/watchdog.py b/crazy_functions/agent_fns/watchdog.py
deleted file mode 100644
index 2a2bdfab95097d6c4ad36329ab1fa02dd2ebe868..0000000000000000000000000000000000000000
--- a/crazy_functions/agent_fns/watchdog.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import threading, time
-
-class WatchDog():
- def __init__(self, timeout, bark_fn, interval=3, msg="") -> None:
- self.last_feed = None
- self.timeout = timeout
- self.bark_fn = bark_fn
- self.interval = interval
- self.msg = msg
- self.kill_dog = False
-
- def watch(self):
- while True:
- if self.kill_dog: break
- if time.time() - self.last_feed > self.timeout:
- if len(self.msg) > 0: print(self.msg)
- self.bark_fn()
- break
- time.sleep(self.interval)
-
- def begin_watch(self):
- self.last_feed = time.time()
- th = threading.Thread(target=self.watch)
- th.daemon = True
- th.start()
-
- def feed(self):
- self.last_feed = time.time()
diff --git "a/crazy_functions/chatglm\345\276\256\350\260\203\345\267\245\345\205\267.py" "b/crazy_functions/chatglm\345\276\256\350\260\203\345\267\245\345\205\267.py"
deleted file mode 100644
index 336d7cfc85ac159841758123fa057bd20a0bbbec..0000000000000000000000000000000000000000
--- "a/crazy_functions/chatglm\345\276\256\350\260\203\345\267\245\345\205\267.py"
+++ /dev/null
@@ -1,141 +0,0 @@
-from toolbox import CatchException, update_ui, promote_file_to_downloadzone
-from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
-import datetime, json
-
-def fetch_items(list_of_items, batch_size):
- for i in range(0, len(list_of_items), batch_size):
- yield list_of_items[i:i + batch_size]
-
-def string_to_options(arguments):
- import argparse
- import shlex
-
- # Create an argparse.ArgumentParser instance
- parser = argparse.ArgumentParser()
-
- # Add command-line arguments
- parser.add_argument("--llm_to_learn", type=str, help="LLM model to learn", default="gpt-3.5-turbo")
- parser.add_argument("--prompt_prefix", type=str, help="Prompt prefix", default='')
- parser.add_argument("--system_prompt", type=str, help="System prompt", default='')
- parser.add_argument("--batch", type=int, help="System prompt", default=50)
- parser.add_argument("--pre_seq_len", type=int, help="pre_seq_len", default=50)
- parser.add_argument("--learning_rate", type=float, help="learning_rate", default=2e-2)
- parser.add_argument("--num_gpus", type=int, help="num_gpus", default=1)
- parser.add_argument("--json_dataset", type=str, help="json_dataset", default="")
- parser.add_argument("--ptuning_directory", type=str, help="ptuning_directory", default="")
-
-
-
- # Parse the arguments
- args = parser.parse_args(shlex.split(arguments))
-
- return args
-
-@CatchException
-def 微调数据集生成(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- """
- txt 输入栏用户输入的文本,例如需要翻译的一段话,再例如一个包含了待处理文件的路径
- llm_kwargs gpt模型参数,如温度和top_p等,一般原样传递下去就行
- plugin_kwargs 插件模型的参数
- chatbot 聊天显示框的句柄,用于显示给用户
- history 聊天历史,前情提要
- system_prompt 给gpt的静默提醒
- web_port 当前软件运行的端口号
- """
- history = [] # 清空历史,以免输入溢出
- chatbot.append(("这是什么功能?", "[Local Message] 微调数据集生成"))
- if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg")
- args = plugin_kwargs.get("advanced_arg", None)
- if args is None:
- chatbot.append(("没给定指令", "退出"))
- yield from update_ui(chatbot=chatbot, history=history); return
- else:
- arguments = string_to_options(arguments=args)
-
- dat = []
- with open(txt, 'r', encoding='utf8') as f:
- for line in f.readlines():
- json_dat = json.loads(line)
- dat.append(json_dat["content"])
-
- llm_kwargs['llm_model'] = arguments.llm_to_learn
- for batch in fetch_items(dat, arguments.batch):
- res = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
- inputs_array=[f"{arguments.prompt_prefix}\n\n{b}" for b in (batch)],
- inputs_show_user_array=[f"Show Nothing" for _ in (batch)],
- llm_kwargs=llm_kwargs,
- chatbot=chatbot,
- history_array=[[] for _ in (batch)],
- sys_prompt_array=[arguments.system_prompt for _ in (batch)],
- max_workers=10 # OpenAI所允许的最大并行过载
- )
-
- with open(txt+'.generated.json', 'a+', encoding='utf8') as f:
- for b, r in zip(batch, res[1::2]):
- f.write(json.dumps({"content":b, "summary":r}, ensure_ascii=False)+'\n')
-
- promote_file_to_downloadzone(txt+'.generated.json', rename_file='generated.json', chatbot=chatbot)
- return
-
-
-
-@CatchException
-def 启动微调(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- """
- txt 输入栏用户输入的文本,例如需要翻译的一段话,再例如一个包含了待处理文件的路径
- llm_kwargs gpt模型参数,如温度和top_p等,一般原样传递下去就行
- plugin_kwargs 插件模型的参数
- chatbot 聊天显示框的句柄,用于显示给用户
- history 聊天历史,前情提要
- system_prompt 给gpt的静默提醒
- web_port 当前软件运行的端口号
- """
- import subprocess
- history = [] # 清空历史,以免输入溢出
- chatbot.append(("这是什么功能?", "[Local Message] 微调数据集生成"))
- if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg")
- args = plugin_kwargs.get("advanced_arg", None)
- if args is None:
- chatbot.append(("没给定指令", "退出"))
- yield from update_ui(chatbot=chatbot, history=history); return
- else:
- arguments = string_to_options(arguments=args)
-
-
-
- pre_seq_len = arguments.pre_seq_len # 128
- learning_rate = arguments.learning_rate # 2e-2
- num_gpus = arguments.num_gpus # 1
- json_dataset = arguments.json_dataset # 't_code.json'
- ptuning_directory = arguments.ptuning_directory # '/home/hmp/ChatGLM2-6B/ptuning'
-
- command = f"torchrun --standalone --nnodes=1 --nproc-per-node={num_gpus} main.py \
- --do_train \
- --train_file AdvertiseGen/{json_dataset} \
- --validation_file AdvertiseGen/{json_dataset} \
- --preprocessing_num_workers 20 \
- --prompt_column content \
- --response_column summary \
- --overwrite_cache \
- --model_name_or_path THUDM/chatglm2-6b \
- --output_dir output/clothgen-chatglm2-6b-pt-{pre_seq_len}-{learning_rate} \
- --overwrite_output_dir \
- --max_source_length 256 \
- --max_target_length 256 \
- --per_device_train_batch_size 1 \
- --per_device_eval_batch_size 1 \
- --gradient_accumulation_steps 16 \
- --predict_with_generate \
- --max_steps 100 \
- --logging_steps 10 \
- --save_steps 20 \
- --learning_rate {learning_rate} \
- --pre_seq_len {pre_seq_len} \
- --quantization_bit 4"
-
- process = subprocess.Popen(command, shell=True, cwd=ptuning_directory)
- try:
- process.communicate(timeout=3600*24)
- except subprocess.TimeoutExpired:
- process.kill()
- return
diff --git a/crazy_functions/crazy_functions_test.py b/crazy_functions/crazy_functions_test.py
deleted file mode 100644
index 0c623b8e027858b2579a021769bb304e34c4e373..0000000000000000000000000000000000000000
--- a/crazy_functions/crazy_functions_test.py
+++ /dev/null
@@ -1,231 +0,0 @@
-"""
-这是什么?
- 这个文件用于函数插件的单元测试
- 运行方法 python crazy_functions/crazy_functions_test.py
-"""
-
-# ==============================================================================================================================
-
-def validate_path():
- import os, sys
- dir_name = os.path.dirname(__file__)
- root_dir_assume = os.path.abspath(os.path.dirname(__file__) + '/..')
- os.chdir(root_dir_assume)
- sys.path.append(root_dir_assume)
-validate_path() # validate path so you can run from base directory
-
-# ==============================================================================================================================
-
-from colorful import *
-from toolbox import get_conf, ChatBotWithCookies
-import contextlib
-import os
-import sys
-from functools import wraps
-proxies, WEB_PORT, LLM_MODEL, CONCURRENT_COUNT, AUTHENTICATION, CHATBOT_HEIGHT, LAYOUT, API_KEY = \
- get_conf('proxies', 'WEB_PORT', 'LLM_MODEL', 'CONCURRENT_COUNT', 'AUTHENTICATION', 'CHATBOT_HEIGHT', 'LAYOUT', 'API_KEY')
-
-llm_kwargs = {
- 'api_key': API_KEY,
- 'llm_model': LLM_MODEL,
- 'top_p':1.0,
- 'max_length': None,
- 'temperature':1.0,
-}
-plugin_kwargs = { }
-chatbot = ChatBotWithCookies(llm_kwargs)
-history = []
-system_prompt = "Serve me as a writing and programming assistant."
-web_port = 1024
-
-# ==============================================================================================================================
-
-def silence_stdout(func):
- @wraps(func)
- def wrapper(*args, **kwargs):
- _original_stdout = sys.stdout
- sys.stdout = open(os.devnull, 'w')
- for q in func(*args, **kwargs):
- sys.stdout = _original_stdout
- yield q
- sys.stdout = open(os.devnull, 'w')
- sys.stdout.close()
- sys.stdout = _original_stdout
- return wrapper
-
-class CLI_Printer():
- def __init__(self) -> None:
- self.pre_buf = ""
-
- def print(self, buf):
- bufp = ""
- for index, chat in enumerate(buf):
- a, b = chat
- bufp += sprint亮靛('[Me]:' + a) + '\n'
- bufp += '[GPT]:' + b
- if index < len(buf)-1:
- bufp += '\n'
-
- if self.pre_buf!="" and bufp.startswith(self.pre_buf):
- print(bufp[len(self.pre_buf):], end='')
- else:
- print('\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n'+bufp, end='')
- self.pre_buf = bufp
- return
-
-cli_printer = CLI_Printer()
-# ==============================================================================================================================
-def test_解析一个Python项目():
- from crazy_functions.解析项目源代码 import 解析一个Python项目
- txt = "crazy_functions/test_project/python/dqn"
- for cookies, cb, hist, msg in 解析一个Python项目(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- print(cb)
-
-def test_解析一个Cpp项目():
- from crazy_functions.解析项目源代码 import 解析一个C项目
- txt = "crazy_functions/test_project/cpp/cppipc"
- for cookies, cb, hist, msg in 解析一个C项目(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- print(cb)
-
-def test_Latex英文润色():
- from crazy_functions.Latex全文润色 import Latex英文润色
- txt = "crazy_functions/test_project/latex/attention"
- for cookies, cb, hist, msg in Latex英文润色(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- print(cb)
-
-def test_Markdown中译英():
- from crazy_functions.批量Markdown翻译 import Markdown中译英
- txt = "README.md"
- for cookies, cb, hist, msg in Markdown中译英(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- print(cb)
-
-def test_批量翻译PDF文档():
- from crazy_functions.批量翻译PDF文档_多线程 import 批量翻译PDF文档
- txt = "crazy_functions/test_project/pdf_and_word"
- for cookies, cb, hist, msg in 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- print(cb)
-
-def test_谷歌检索小助手():
- from crazy_functions.谷歌检索小助手 import 谷歌检索小助手
- txt = "https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=auto+reinforcement+learning&btnG="
- for cookies, cb, hist, msg in 谷歌检索小助手(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- print(cb)
-
-def test_总结word文档():
- from crazy_functions.总结word文档 import 总结word文档
- txt = "crazy_functions/test_project/pdf_and_word"
- for cookies, cb, hist, msg in 总结word文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- print(cb)
-
-def test_下载arxiv论文并翻译摘要():
- from crazy_functions.下载arxiv论文翻译摘要 import 下载arxiv论文并翻译摘要
- txt = "1812.10695"
- for cookies, cb, hist, msg in 下载arxiv论文并翻译摘要(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- print(cb)
-
-def test_联网回答问题():
- from crazy_functions.联网的ChatGPT import 连接网络回答问题
- # txt = "谁是应急食品?"
- # >> '根据以上搜索结果可以得知,应急食品是“原神”游戏中的角色派蒙的外号。'
- # txt = "道路千万条,安全第一条。后面两句是?"
- # >> '行车不规范,亲人两行泪。'
- # txt = "You should have gone for the head. What does that mean?"
- # >> The phrase "You should have gone for the head" is a quote from the Marvel movies, Avengers: Infinity War and Avengers: Endgame. It was spoken by the character Thanos in Infinity War and by Thor in Endgame.
- txt = "AutoGPT是什么?"
- for cookies, cb, hist, msg in 连接网络回答问题(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- print("当前问答:", cb[-1][-1].replace("\n"," "))
- for i, it in enumerate(cb): print亮蓝(it[0]); print亮黄(it[1])
-
-def test_解析ipynb文件():
- from crazy_functions.解析JupyterNotebook import 解析ipynb文件
- txt = "crazy_functions/test_samples"
- for cookies, cb, hist, msg in 解析ipynb文件(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- print(cb)
-
-
-def test_数学动画生成manim():
- from crazy_functions.数学动画生成manim import 动画生成
- txt = "A ball split into 2, and then split into 4, and finally split into 8."
- for cookies, cb, hist, msg in 动画生成(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- print(cb)
-
-
-
-def test_Markdown多语言():
- from crazy_functions.批量Markdown翻译 import Markdown翻译指定语言
- txt = "README.md"
- history = []
- for lang in ["English", "French", "Japanese", "Korean", "Russian", "Italian", "German", "Portuguese", "Arabic"]:
- plugin_kwargs = {"advanced_arg": lang}
- for cookies, cb, hist, msg in Markdown翻译指定语言(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- print(cb)
-
-def test_Langchain知识库():
- from crazy_functions.Langchain知识库 import 知识库问答
- txt = "./"
- chatbot = ChatBotWithCookies(llm_kwargs)
- for cookies, cb, hist, msg in silence_stdout(知识库问答)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- cli_printer.print(cb) # print(cb)
-
- chatbot = ChatBotWithCookies(cookies)
- from crazy_functions.Langchain知识库 import 读取知识库作答
- txt = "What is the installation method?"
- for cookies, cb, hist, msg in silence_stdout(读取知识库作答)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- cli_printer.print(cb) # print(cb)
-
-def test_Langchain知识库读取():
- from crazy_functions.Langchain知识库 import 读取知识库作答
- txt = "远程云服务器部署?"
- for cookies, cb, hist, msg in silence_stdout(读取知识库作答)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- cli_printer.print(cb) # print(cb)
-
-def test_Latex():
- from crazy_functions.Latex输出PDF结果 import Latex英文纠错加PDF对比, Latex翻译中文并重新编译PDF
-
- # txt = r"https://arxiv.org/abs/1706.03762"
- # txt = r"https://arxiv.org/abs/1902.03185"
- # txt = r"https://arxiv.org/abs/2305.18290"
- # txt = r"https://arxiv.org/abs/2305.17608"
- # txt = r"https://arxiv.org/abs/2211.16068" # ACE
- # txt = r"C:\Users\x\arxiv_cache\2211.16068\workfolder" # ACE
- # txt = r"https://arxiv.org/abs/2002.09253"
- # txt = r"https://arxiv.org/abs/2306.07831"
- # txt = r"https://arxiv.org/abs/2212.10156"
- # txt = r"https://arxiv.org/abs/2211.11559"
- # txt = r"https://arxiv.org/abs/2303.08774"
- txt = r"https://arxiv.org/abs/2303.12712"
- # txt = r"C:\Users\fuqingxu\arxiv_cache\2303.12712\workfolder"
-
-
- for cookies, cb, hist, msg in (Latex翻译中文并重新编译PDF)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- cli_printer.print(cb) # print(cb)
-
-
-
- # txt = "2302.02948.tar"
- # print(txt)
- # main_tex, work_folder = Latex预处理(txt)
- # print('main tex:', main_tex)
- # res = 编译Latex(main_tex, work_folder)
- # # for cookies, cb, hist, msg in silence_stdout(编译Latex)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
- # cli_printer.print(cb) # print(cb)
-
-
-
-# test_解析一个Python项目()
-# test_Latex英文润色()
-# test_Markdown中译英()
-# test_批量翻译PDF文档()
-# test_谷歌检索小助手()
-# test_总结word文档()
-# test_下载arxiv论文并翻译摘要()
-# test_解析一个Cpp项目()
-# test_联网回答问题()
-# test_解析ipynb文件()
-# test_数学动画生成manim()
-# test_Langchain知识库()
-# test_Langchain知识库读取()
-if __name__ == "__main__":
- test_Latex()
- input("程序完成,回车退出。")
- print("退出。")
\ No newline at end of file
diff --git a/crazy_functions/crazy_utils.py b/crazy_functions/crazy_utils.py
deleted file mode 100644
index 4d3b1953d424c8d0c9ba01882b55c2fe0ee18941..0000000000000000000000000000000000000000
--- a/crazy_functions/crazy_utils.py
+++ /dev/null
@@ -1,606 +0,0 @@
-from toolbox import update_ui, get_conf, trimmed_format_exc, get_max_token, Singleton
-import threading
-import os
-import logging
-
-def input_clipping(inputs, history, max_token_limit):
- import numpy as np
- from request_llms.bridge_all import model_info
- enc = model_info["gpt-3.5-turbo"]['tokenizer']
- def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
-
- mode = 'input-and-history'
- # 当 输入部分的token占比 小于 全文的一半时,只裁剪历史
- input_token_num = get_token_num(inputs)
- if input_token_num < max_token_limit//2:
- mode = 'only-history'
- max_token_limit = max_token_limit - input_token_num
-
- everything = [inputs] if mode == 'input-and-history' else ['']
- everything.extend(history)
- n_token = get_token_num('\n'.join(everything))
- everything_token = [get_token_num(e) for e in everything]
- delta = max(everything_token) // 16 # 截断时的颗粒度
-
- while n_token > max_token_limit:
- where = np.argmax(everything_token)
- encoded = enc.encode(everything[where], disallowed_special=())
- clipped_encoded = encoded[:len(encoded)-delta]
- everything[where] = enc.decode(clipped_encoded)[:-1] # -1 to remove the may-be illegal char
- everything_token[where] = get_token_num(everything[where])
- n_token = get_token_num('\n'.join(everything))
-
- if mode == 'input-and-history':
- inputs = everything[0]
- else:
- pass
- history = everything[1:]
- return inputs, history
-
-def request_gpt_model_in_new_thread_with_ui_alive(
- inputs, inputs_show_user, llm_kwargs,
- chatbot, history, sys_prompt, refresh_interval=0.2,
- handle_token_exceed=True,
- retry_times_at_unknown_error=2,
- ):
- """
- Request GPT model,请求GPT模型同时维持用户界面活跃。
-
- 输入参数 Args (以_array结尾的输入变量都是列表,列表长度为子任务的数量,执行时,会把列表拆解,放到每个子线程中分别执行):
- inputs (string): List of inputs (输入)
- inputs_show_user (string): List of inputs to show user(展现在报告中的输入,借助此参数,在汇总报告中隐藏啰嗦的真实输入,增强报告的可读性)
- top_p (float): Top p value for sampling from model distribution (GPT参数,浮点数)
- temperature (float): Temperature value for sampling from model distribution(GPT参数,浮点数)
- chatbot: chatbot inputs and outputs (用户界面对话窗口句柄,用于数据流可视化)
- history (list): List of chat history (历史,对话历史列表)
- sys_prompt (string): List of system prompts (系统输入,列表,用于输入给GPT的前提提示,比如你是翻译官怎样怎样)
- refresh_interval (float, optional): Refresh interval for UI (default: 0.2) (刷新时间间隔频率,建议低于1,不可高于3,仅仅服务于视觉效果)
- handle_token_exceed:是否自动处理token溢出的情况,如果选择自动处理,则会在溢出时暴力截断,默认开启
- retry_times_at_unknown_error:失败时的重试次数
-
- 输出 Returns:
- future: 输出,GPT返回的结果
- """
- import time
- from concurrent.futures import ThreadPoolExecutor
- from request_llms.bridge_all import predict_no_ui_long_connection
- # 用户反馈
- chatbot.append([inputs_show_user, ""])
- yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面
- executor = ThreadPoolExecutor(max_workers=16)
- mutable = ["", time.time(), ""]
- # 看门狗耐心
- watch_dog_patience = 5
- # 请求任务
- def _req_gpt(inputs, history, sys_prompt):
- retry_op = retry_times_at_unknown_error
- exceeded_cnt = 0
- while True:
- # watchdog error
- if len(mutable) >= 2 and (time.time()-mutable[1]) > watch_dog_patience:
- raise RuntimeError("检测到程序终止。")
- try:
- # 【第一种情况】:顺利完成
- result = predict_no_ui_long_connection(
- inputs=inputs, llm_kwargs=llm_kwargs,
- history=history, sys_prompt=sys_prompt, observe_window=mutable)
- return result
- except ConnectionAbortedError as token_exceeded_error:
- # 【第二种情况】:Token溢出
- if handle_token_exceed:
- exceeded_cnt += 1
- # 【选择处理】 尝试计算比例,尽可能多地保留文本
- from toolbox import get_reduce_token_percent
- p_ratio, n_exceed = get_reduce_token_percent(str(token_exceeded_error))
- MAX_TOKEN = get_max_token(llm_kwargs)
- EXCEED_ALLO = 512 + 512 * exceeded_cnt
- inputs, history = input_clipping(inputs, history, max_token_limit=MAX_TOKEN-EXCEED_ALLO)
- mutable[0] += f'[Local Message] 警告,文本过长将进行截断,Token溢出数:{n_exceed}。\n\n'
- continue # 返回重试
- else:
- # 【选择放弃】
- tb_str = '```\n' + trimmed_format_exc() + '```'
- mutable[0] += f"[Local Message] 警告,在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n"
- return mutable[0] # 放弃
- except:
- # 【第三种情况】:其他错误:重试几次
- tb_str = '```\n' + trimmed_format_exc() + '```'
- print(tb_str)
- mutable[0] += f"[Local Message] 警告,在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n"
- if retry_op > 0:
- retry_op -= 1
- mutable[0] += f"[Local Message] 重试中,请稍等 {retry_times_at_unknown_error-retry_op}/{retry_times_at_unknown_error}:\n\n"
- if ("Rate limit reached" in tb_str) or ("Too Many Requests" in tb_str):
- time.sleep(30)
- time.sleep(5)
- continue # 返回重试
- else:
- time.sleep(5)
- return mutable[0] # 放弃
-
- # 提交任务
- future = executor.submit(_req_gpt, inputs, history, sys_prompt)
- while True:
- # yield一次以刷新前端页面
- time.sleep(refresh_interval)
- # “喂狗”(看门狗)
- mutable[1] = time.time()
- if future.done():
- break
- chatbot[-1] = [chatbot[-1][0], mutable[0]]
- yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面
-
- final_result = future.result()
- chatbot[-1] = [chatbot[-1][0], final_result]
- yield from update_ui(chatbot=chatbot, history=[]) # 如果最后成功了,则删除报错信息
- return final_result
-
-def can_multi_process(llm):
- if llm.startswith('gpt-'): return True
- if llm.startswith('api2d-'): return True
- if llm.startswith('azure-'): return True
- if llm.startswith('spark'): return True
- if llm.startswith('zhipuai'): return True
- return False
-
-def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
- inputs_array, inputs_show_user_array, llm_kwargs,
- chatbot, history_array, sys_prompt_array,
- refresh_interval=0.2, max_workers=-1, scroller_max_len=30,
- handle_token_exceed=True, show_user_at_complete=False,
- retry_times_at_unknown_error=2,
- ):
- """
- Request GPT model using multiple threads with UI and high efficiency
- 请求GPT模型的[多线程]版。
- 具备以下功能:
- 实时在UI上反馈远程数据流
- 使用线程池,可调节线程池的大小避免openai的流量限制错误
- 处理中途中止的情况
- 网络等出问题时,会把traceback和已经接收的数据转入输出
-
- 输入参数 Args (以_array结尾的输入变量都是列表,列表长度为子任务的数量,执行时,会把列表拆解,放到每个子线程中分别执行):
- inputs_array (list): List of inputs (每个子任务的输入)
- inputs_show_user_array (list): List of inputs to show user(每个子任务展现在报告中的输入,借助此参数,在汇总报告中隐藏啰嗦的真实输入,增强报告的可读性)
- llm_kwargs: llm_kwargs参数
- chatbot: chatbot (用户界面对话窗口句柄,用于数据流可视化)
- history_array (list): List of chat history (历史对话输入,双层列表,第一层列表是子任务分解,第二层列表是对话历史)
- sys_prompt_array (list): List of system prompts (系统输入,列表,用于输入给GPT的前提提示,比如你是翻译官怎样怎样)
- refresh_interval (float, optional): Refresh interval for UI (default: 0.2) (刷新时间间隔频率,建议低于1,不可高于3,仅仅服务于视觉效果)
- max_workers (int, optional): Maximum number of threads (default: see config.py) (最大线程数,如果子任务非常多,需要用此选项防止高频地请求openai导致错误)
- scroller_max_len (int, optional): Maximum length for scroller (default: 30)(数据流的显示最后收到的多少个字符,仅仅服务于视觉效果)
- handle_token_exceed (bool, optional): (是否在输入过长时,自动缩减文本)
- handle_token_exceed:是否自动处理token溢出的情况,如果选择自动处理,则会在溢出时暴力截断,默认开启
- show_user_at_complete (bool, optional): (在结束时,把完整输入-输出结果显示在聊天框)
- retry_times_at_unknown_error:子任务失败时的重试次数
-
- 输出 Returns:
- list: List of GPT model responses (每个子任务的输出汇总,如果某个子任务出错,response中会携带traceback报错信息,方便调试和定位问题。)
- """
- import time, random
- from concurrent.futures import ThreadPoolExecutor
- from request_llms.bridge_all import predict_no_ui_long_connection
- assert len(inputs_array) == len(history_array)
- assert len(inputs_array) == len(sys_prompt_array)
- if max_workers == -1: # 读取配置文件
- try: max_workers = get_conf('DEFAULT_WORKER_NUM')
- except: max_workers = 8
- if max_workers <= 0: max_workers = 3
- # 屏蔽掉 chatglm的多线程,可能会导致严重卡顿
- if not can_multi_process(llm_kwargs['llm_model']):
- max_workers = 1
-
- executor = ThreadPoolExecutor(max_workers=max_workers)
- n_frag = len(inputs_array)
- # 用户反馈
- chatbot.append(["请开始多线程操作。", ""])
- yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面
- # 跨线程传递
- mutable = [["", time.time(), "等待中"] for _ in range(n_frag)]
-
- # 看门狗耐心
- watch_dog_patience = 5
-
- # 子线程任务
- def _req_gpt(index, inputs, history, sys_prompt):
- gpt_say = ""
- retry_op = retry_times_at_unknown_error
- exceeded_cnt = 0
- mutable[index][2] = "执行中"
- detect_timeout = lambda: len(mutable[index]) >= 2 and (time.time()-mutable[index][1]) > watch_dog_patience
- while True:
- # watchdog error
- if detect_timeout(): raise RuntimeError("检测到程序终止。")
- try:
- # 【第一种情况】:顺利完成
- gpt_say = predict_no_ui_long_connection(
- inputs=inputs, llm_kwargs=llm_kwargs, history=history,
- sys_prompt=sys_prompt, observe_window=mutable[index], console_slience=True
- )
- mutable[index][2] = "已成功"
- return gpt_say
- except ConnectionAbortedError as token_exceeded_error:
- # 【第二种情况】:Token溢出
- if handle_token_exceed:
- exceeded_cnt += 1
- # 【选择处理】 尝试计算比例,尽可能多地保留文本
- from toolbox import get_reduce_token_percent
- p_ratio, n_exceed = get_reduce_token_percent(str(token_exceeded_error))
- MAX_TOKEN = get_max_token(llm_kwargs)
- EXCEED_ALLO = 512 + 512 * exceeded_cnt
- inputs, history = input_clipping(inputs, history, max_token_limit=MAX_TOKEN-EXCEED_ALLO)
- gpt_say += f'[Local Message] 警告,文本过长将进行截断,Token溢出数:{n_exceed}。\n\n'
- mutable[index][2] = f"截断重试"
- continue # 返回重试
- else:
- # 【选择放弃】
- tb_str = '```\n' + trimmed_format_exc() + '```'
- gpt_say += f"[Local Message] 警告,线程{index}在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n"
- if len(mutable[index][0]) > 0: gpt_say += "此线程失败前收到的回答:\n\n" + mutable[index][0]
- mutable[index][2] = "输入过长已放弃"
- return gpt_say # 放弃
- except:
- # 【第三种情况】:其他错误
- if detect_timeout(): raise RuntimeError("检测到程序终止。")
- tb_str = '```\n' + trimmed_format_exc() + '```'
- print(tb_str)
- gpt_say += f"[Local Message] 警告,线程{index}在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n"
- if len(mutable[index][0]) > 0: gpt_say += "此线程失败前收到的回答:\n\n" + mutable[index][0]
- if retry_op > 0:
- retry_op -= 1
- wait = random.randint(5, 20)
- if ("Rate limit reached" in tb_str) or ("Too Many Requests" in tb_str):
- wait = wait * 3
- fail_info = "OpenAI绑定信用卡可解除频率限制 "
- else:
- fail_info = ""
- # 也许等待十几秒后,情况会好转
- for i in range(wait):
- mutable[index][2] = f"{fail_info}等待重试 {wait-i}"; time.sleep(1)
- # 开始重试
- if detect_timeout(): raise RuntimeError("检测到程序终止。")
- mutable[index][2] = f"重试中 {retry_times_at_unknown_error-retry_op}/{retry_times_at_unknown_error}"
- continue # 返回重试
- else:
- mutable[index][2] = "已失败"
- wait = 5
- time.sleep(5)
- return gpt_say # 放弃
-
- # 异步任务开始
- futures = [executor.submit(_req_gpt, index, inputs, history, sys_prompt) for index, inputs, history, sys_prompt in zip(
- range(len(inputs_array)), inputs_array, history_array, sys_prompt_array)]
- cnt = 0
- while True:
- # yield一次以刷新前端页面
- time.sleep(refresh_interval)
- cnt += 1
- worker_done = [h.done() for h in futures]
- # 更好的UI视觉效果
- observe_win = []
- # 每个线程都要“喂狗”(看门狗)
- for thread_index, _ in enumerate(worker_done):
- mutable[thread_index][1] = time.time()
- # 在前端打印些好玩的东西
- for thread_index, _ in enumerate(worker_done):
- print_something_really_funny = "[ ...`"+mutable[thread_index][0][-scroller_max_len:].\
- replace('\n', '').replace('`', '.').replace(
- ' ', '.').replace('
', '.....').replace('$', '.')+"`... ]"
- observe_win.append(print_something_really_funny)
- # 在前端打印些好玩的东西
- stat_str = ''.join([f'`{mutable[thread_index][2]}`: {obs}\n\n'
- if not done else f'`{mutable[thread_index][2]}`\n\n'
- for thread_index, done, obs in zip(range(len(worker_done)), worker_done, observe_win)])
- # 在前端打印些好玩的东西
- chatbot[-1] = [chatbot[-1][0], f'多线程操作已经开始,完成情况: \n\n{stat_str}' + ''.join(['.']*(cnt % 10+1))]
- yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面
- if all(worker_done):
- executor.shutdown()
- break
-
- # 异步任务结束
- gpt_response_collection = []
- for inputs_show_user, f in zip(inputs_show_user_array, futures):
- gpt_res = f.result()
- gpt_response_collection.extend([inputs_show_user, gpt_res])
-
- # 是否在结束时,在界面上显示结果
- if show_user_at_complete:
- for inputs_show_user, f in zip(inputs_show_user_array, futures):
- gpt_res = f.result()
- chatbot.append([inputs_show_user, gpt_res])
- yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面
- time.sleep(0.5)
- return gpt_response_collection
-
-
-
-def read_and_clean_pdf_text(fp):
- """
- 这个函数用于分割pdf,用了很多trick,逻辑较乱,效果奇好
-
- **输入参数说明**
- - `fp`:需要读取和清理文本的pdf文件路径
-
- **输出参数说明**
- - `meta_txt`:清理后的文本内容字符串
- - `page_one_meta`:第一页清理后的文本内容列表
-
- **函数功能**
- 读取pdf文件并清理其中的文本内容,清理规则包括:
- - 提取所有块元的文本信息,并合并为一个字符串
- - 去除短块(字符数小于100)并替换为回车符
- - 清理多余的空行
- - 合并小写字母开头的段落块并替换为空格
- - 清除重复的换行
- - 将每个换行符替换为两个换行符,使每个段落之间有两个换行符分隔
- """
- import fitz, copy
- import re
- import numpy as np
- from colorful import print亮黄, print亮绿
- fc = 0 # Index 0 文本
- fs = 1 # Index 1 字体
- fb = 2 # Index 2 框框
- REMOVE_FOOT_NOTE = True # 是否丢弃掉 不是正文的内容 (比正文字体小,如参考文献、脚注、图注等)
- REMOVE_FOOT_FFSIZE_PERCENT = 0.95 # 小于正文的?时,判定为不是正文(有些文章的正文部分字体大小不是100%统一的,有肉眼不可见的小变化)
- def primary_ffsize(l):
- """
- 提取文本块主字体
- """
- fsize_statiscs = {}
- for wtf in l['spans']:
- if wtf['size'] not in fsize_statiscs: fsize_statiscs[wtf['size']] = 0
- fsize_statiscs[wtf['size']] += len(wtf['text'])
- return max(fsize_statiscs, key=fsize_statiscs.get)
-
- def ffsize_same(a,b):
- """
- 提取字体大小是否近似相等
- """
- return abs((a-b)/max(a,b)) < 0.02
-
- with fitz.open(fp) as doc:
- meta_txt = []
- meta_font = []
-
- meta_line = []
- meta_span = []
- ############################## <第 1 步,搜集初始信息> ##################################
- for index, page in enumerate(doc):
- # file_content += page.get_text()
- text_areas = page.get_text("dict") # 获取页面上的文本信息
- for t in text_areas['blocks']:
- if 'lines' in t:
- pf = 998
- for l in t['lines']:
- txt_line = "".join([wtf['text'] for wtf in l['spans']])
- if len(txt_line) == 0: continue
- pf = primary_ffsize(l)
- meta_line.append([txt_line, pf, l['bbox'], l])
- for wtf in l['spans']: # for l in t['lines']:
- meta_span.append([wtf['text'], wtf['size'], len(wtf['text'])])
- # meta_line.append(["NEW_BLOCK", pf])
- # 块元提取 for each word segment with in line for each line cross-line words for each block
- meta_txt.extend([" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
- '- ', '') for t in text_areas['blocks'] if 'lines' in t])
- meta_font.extend([np.mean([np.mean([wtf['size'] for wtf in l['spans']])
- for l in t['lines']]) for t in text_areas['blocks'] if 'lines' in t])
- if index == 0:
- page_one_meta = [" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
- '- ', '') for t in text_areas['blocks'] if 'lines' in t]
-
- ############################## <第 2 步,获取正文主字体> ##################################
- try:
- fsize_statiscs = {}
- for span in meta_span:
- if span[1] not in fsize_statiscs: fsize_statiscs[span[1]] = 0
- fsize_statiscs[span[1]] += span[2]
- main_fsize = max(fsize_statiscs, key=fsize_statiscs.get)
- if REMOVE_FOOT_NOTE:
- give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT
- except:
- raise RuntimeError(f'抱歉, 我们暂时无法解析此PDF文档: {fp}。')
- ############################## <第 3 步,切分和重新整合> ##################################
- mega_sec = []
- sec = []
- for index, line in enumerate(meta_line):
- if index == 0:
- sec.append(line[fc])
- continue
- if REMOVE_FOOT_NOTE:
- if meta_line[index][fs] <= give_up_fize_threshold:
- continue
- if ffsize_same(meta_line[index][fs], meta_line[index-1][fs]):
- # 尝试识别段落
- if meta_line[index][fc].endswith('.') and\
- (meta_line[index-1][fc] != 'NEW_BLOCK') and \
- (meta_line[index][fb][2] - meta_line[index][fb][0]) < (meta_line[index-1][fb][2] - meta_line[index-1][fb][0]) * 0.7:
- sec[-1] += line[fc]
- sec[-1] += "\n\n"
- else:
- sec[-1] += " "
- sec[-1] += line[fc]
- else:
- if (index+1 < len(meta_line)) and \
- meta_line[index][fs] > main_fsize:
- # 单行 + 字体大
- mega_sec.append(copy.deepcopy(sec))
- sec = []
- sec.append("# " + line[fc])
- else:
- # 尝试识别section
- if meta_line[index-1][fs] > meta_line[index][fs]:
- sec.append("\n" + line[fc])
- else:
- sec.append(line[fc])
- mega_sec.append(copy.deepcopy(sec))
-
- finals = []
- for ms in mega_sec:
- final = " ".join(ms)
- final = final.replace('- ', ' ')
- finals.append(final)
- meta_txt = finals
-
- ############################## <第 4 步,乱七八糟的后处理> ##################################
- def 把字符太少的块清除为回车(meta_txt):
- for index, block_txt in enumerate(meta_txt):
- if len(block_txt) < 100:
- meta_txt[index] = '\n'
- return meta_txt
- meta_txt = 把字符太少的块清除为回车(meta_txt)
-
- def 清理多余的空行(meta_txt):
- for index in reversed(range(1, len(meta_txt))):
- if meta_txt[index] == '\n' and meta_txt[index-1] == '\n':
- meta_txt.pop(index)
- return meta_txt
- meta_txt = 清理多余的空行(meta_txt)
-
- def 合并小写开头的段落块(meta_txt):
- def starts_with_lowercase_word(s):
- pattern = r"^[a-z]+"
- match = re.match(pattern, s)
- if match:
- return True
- else:
- return False
- for _ in range(100):
- for index, block_txt in enumerate(meta_txt):
- if starts_with_lowercase_word(block_txt):
- if meta_txt[index-1] != '\n':
- meta_txt[index-1] += ' '
- else:
- meta_txt[index-1] = ''
- meta_txt[index-1] += meta_txt[index]
- meta_txt[index] = '\n'
- return meta_txt
- meta_txt = 合并小写开头的段落块(meta_txt)
- meta_txt = 清理多余的空行(meta_txt)
-
- meta_txt = '\n'.join(meta_txt)
- # 清除重复的换行
- for _ in range(5):
- meta_txt = meta_txt.replace('\n\n', '\n')
-
- # 换行 -> 双换行
- meta_txt = meta_txt.replace('\n', '\n\n')
-
- ############################## <第 5 步,展示分割效果> ##################################
- # for f in finals:
- # print亮黄(f)
- # print亮绿('***************************')
-
- return meta_txt, page_one_meta
-
-
-def get_files_from_everything(txt, type): # type='.md'
- """
- 这个函数是用来获取指定目录下所有指定类型(如.md)的文件,并且对于网络上的文件,也可以获取它。
- 下面是对每个参数和返回值的说明:
- 参数
- - txt: 路径或网址,表示要搜索的文件或者文件夹路径或网络上的文件。
- - type: 字符串,表示要搜索的文件类型。默认是.md。
- 返回值
- - success: 布尔值,表示函数是否成功执行。
- - file_manifest: 文件路径列表,里面包含以指定类型为后缀名的所有文件的绝对路径。
- - project_folder: 字符串,表示文件所在的文件夹路径。如果是网络上的文件,就是临时文件夹的路径。
- 该函数详细注释已添加,请确认是否满足您的需要。
- """
- import glob, os
-
- success = True
- if txt.startswith('http'):
- # 网络的远程文件
- import requests
- from toolbox import get_conf
- from toolbox import get_log_folder, gen_time_str
- proxies = get_conf('proxies')
- try:
- r = requests.get(txt, proxies=proxies)
- except:
- raise ConnectionRefusedError(f"无法下载资源{txt},请检查。")
- path = os.path.join(get_log_folder(plugin_name='web_download'), gen_time_str()+type)
- with open(path, 'wb+') as f: f.write(r.content)
- project_folder = get_log_folder(plugin_name='web_download')
- file_manifest = [path]
- elif txt.endswith(type):
- # 直接给定文件
- file_manifest = [txt]
- project_folder = os.path.dirname(txt)
- elif os.path.exists(txt):
- # 本地路径,递归搜索
- project_folder = txt
- file_manifest = [f for f in glob.glob(f'{project_folder}/**/*'+type, recursive=True)]
- if len(file_manifest) == 0:
- success = False
- else:
- project_folder = None
- file_manifest = []
- success = False
-
- return success, file_manifest, project_folder
-
-
-
-@Singleton
-class nougat_interface():
- def __init__(self):
- self.threadLock = threading.Lock()
-
- def nougat_with_timeout(self, command, cwd, timeout=3600):
- import subprocess
- from toolbox import ProxyNetworkActivate
- logging.info(f'正在执行命令 {command}')
- with ProxyNetworkActivate("Nougat_Download"):
- process = subprocess.Popen(command, shell=True, cwd=cwd, env=os.environ)
- try:
- stdout, stderr = process.communicate(timeout=timeout)
- except subprocess.TimeoutExpired:
- process.kill()
- stdout, stderr = process.communicate()
- print("Process timed out!")
- return False
- return True
-
-
- def NOUGAT_parse_pdf(self, fp, chatbot, history):
- from toolbox import update_ui_lastest_msg
-
- yield from update_ui_lastest_msg("正在解析论文, 请稍候。进度:正在排队, 等待线程锁...",
- chatbot=chatbot, history=history, delay=0)
- self.threadLock.acquire()
- import glob, threading, os
- from toolbox import get_log_folder, gen_time_str
- dst = os.path.join(get_log_folder(plugin_name='nougat'), gen_time_str())
- os.makedirs(dst)
-
- yield from update_ui_lastest_msg("正在解析论文, 请稍候。进度:正在加载NOUGAT... (提示:首次运行需要花费较长时间下载NOUGAT参数)",
- chatbot=chatbot, history=history, delay=0)
- self.nougat_with_timeout(f'nougat --out "{os.path.abspath(dst)}" "{os.path.abspath(fp)}"', os.getcwd(), timeout=3600)
- res = glob.glob(os.path.join(dst,'*.mmd'))
- if len(res) == 0:
- self.threadLock.release()
- raise RuntimeError("Nougat解析论文失败。")
- self.threadLock.release()
- return res[0]
-
-
-
-
-def try_install_deps(deps, reload_m=[]):
- import subprocess, sys, importlib
- for dep in deps:
- subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--user', dep])
- import site
- importlib.reload(site)
- for m in reload_m:
- importlib.reload(__import__(m))
-
-
-def get_plugin_arg(plugin_kwargs, key, default):
- # 如果参数是空的
- if (key in plugin_kwargs) and (plugin_kwargs[key] == ""): plugin_kwargs.pop(key)
- # 正常情况
- return plugin_kwargs.get(key, default)
diff --git a/crazy_functions/game_fns/game_ascii_art.py b/crazy_functions/game_fns/game_ascii_art.py
deleted file mode 100644
index e0b700877415f04437413ac1765fa90fe1b0844f..0000000000000000000000000000000000000000
--- a/crazy_functions/game_fns/game_ascii_art.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from toolbox import CatchException, update_ui, update_ui_lastest_msg
-from crazy_functions.multi_stage.multi_stage_utils import GptAcademicGameBaseState
-from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
-from request_llms.bridge_all import predict_no_ui_long_connection
-from crazy_functions.game_fns.game_utils import get_code_block, is_same_thing
-import random
-
-
-class MiniGame_ASCII_Art(GptAcademicGameBaseState):
- def step(self, prompt, chatbot, history):
- if self.step_cnt == 0:
- chatbot.append(["我画你猜(动物)", "请稍等..."])
- else:
- if prompt.strip() == 'exit':
- self.delete_game = True
- yield from update_ui_lastest_msg(lastmsg=f"谜底是{self.obj},游戏结束。", chatbot=chatbot, history=history, delay=0.)
- return
- chatbot.append([prompt, ""])
- yield from update_ui(chatbot=chatbot, history=history)
-
- if self.step_cnt == 0:
- self.lock_plugin(chatbot)
- self.cur_task = 'draw'
-
- if self.cur_task == 'draw':
- avail_obj = ["狗","猫","鸟","鱼","老鼠","蛇"]
- self.obj = random.choice(avail_obj)
- inputs = "I want to play a game called Guess the ASCII art. You can draw the ASCII art and I will try to guess it. " + \
- f"This time you draw a {self.obj}. Note that you must not indicate what you have draw in the text, and you should only produce the ASCII art wrapped by ```. "
- raw_res = predict_no_ui_long_connection(inputs=inputs, llm_kwargs=self.llm_kwargs, history=[], sys_prompt="")
- self.cur_task = 'identify user guess'
- res = get_code_block(raw_res)
- history += ['', f'the answer is {self.obj}', inputs, res]
- yield from update_ui_lastest_msg(lastmsg=res, chatbot=chatbot, history=history, delay=0.)
-
- elif self.cur_task == 'identify user guess':
- if is_same_thing(self.obj, prompt, self.llm_kwargs):
- self.delete_game = True
- yield from update_ui_lastest_msg(lastmsg="你猜对了!", chatbot=chatbot, history=history, delay=0.)
- else:
- self.cur_task = 'identify user guess'
- yield from update_ui_lastest_msg(lastmsg="猜错了,再试试,输入“exit”获取答案。", chatbot=chatbot, history=history, delay=0.)
\ No newline at end of file
diff --git a/crazy_functions/game_fns/game_interactive_story.py b/crazy_functions/game_fns/game_interactive_story.py
deleted file mode 100644
index 5c25f4a350409006ca7a4cd03f010d6b47eb044f..0000000000000000000000000000000000000000
--- a/crazy_functions/game_fns/game_interactive_story.py
+++ /dev/null
@@ -1,212 +0,0 @@
-prompts_hs = """ 请以“{headstart}”为开头,编写一个小说的第一幕。
-
-- 尽量短,不要包含太多情节,因为你接下来将会与用户互动续写下面的情节,要留出足够的互动空间。
-- 出现人物时,给出人物的名字。
-- 积极地运用环境描写、人物描写等手法,让读者能够感受到你的故事世界。
-- 积极地运用修辞手法,比如比喻、拟人、排比、对偶、夸张等等。
-- 字数要求:第一幕的字数少于300字,且少于2个段落。
-"""
-
-prompts_interact = """ 小说的前文回顾:
-「
-{previously_on_story}
-」
-
-你是一个作家,根据以上的情节,给出4种不同的后续剧情发展方向,每个发展方向都精明扼要地用一句话说明。稍后,我将在这4个选择中,挑选一种剧情发展。
-
-输出格式例如:
-1. 后续剧情发展1
-2. 后续剧情发展2
-3. 后续剧情发展3
-4. 后续剧情发展4
-"""
-
-
-prompts_resume = """小说的前文回顾:
-「
-{previously_on_story}
-」
-
-你是一个作家,我们正在互相讨论,确定后续剧情的发展。
-在以下的剧情发展中,
-「
-{choice}
-」
-我认为更合理的是:{user_choice}。
-请在前文的基础上(不要重复前文),围绕我选定的剧情情节,编写小说的下一幕。
-
-- 禁止杜撰不符合我选择的剧情。
-- 尽量短,不要包含太多情节,因为你接下来将会与用户互动续写下面的情节,要留出足够的互动空间。
-- 不要重复前文。
-- 出现人物时,给出人物的名字。
-- 积极地运用环境描写、人物描写等手法,让读者能够感受到你的故事世界。
-- 积极地运用修辞手法,比如比喻、拟人、排比、对偶、夸张等等。
-- 小说的下一幕字数少于300字,且少于2个段落。
-"""
-
-
-prompts_terminate = """小说的前文回顾:
-「
-{previously_on_story}
-」
-
-你是一个作家,我们正在互相讨论,确定后续剧情的发展。
-现在,故事该结束了,我认为最合理的故事结局是:{user_choice}。
-
-请在前文的基础上(不要重复前文),编写小说的最后一幕。
-
-- 不要重复前文。
-- 出现人物时,给出人物的名字。
-- 积极地运用环境描写、人物描写等手法,让读者能够感受到你的故事世界。
-- 积极地运用修辞手法,比如比喻、拟人、排比、对偶、夸张等等。
-- 字数要求:最后一幕的字数少于1000字。
-"""
-
-
-from toolbox import CatchException, update_ui, update_ui_lastest_msg
-from crazy_functions.multi_stage.multi_stage_utils import GptAcademicGameBaseState
-from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
-from request_llms.bridge_all import predict_no_ui_long_connection
-from crazy_functions.game_fns.game_utils import get_code_block, is_same_thing
-import random
-
-
-class MiniGame_ResumeStory(GptAcademicGameBaseState):
- story_headstart = [
- '先行者知道,他现在是全宇宙中唯一的一个人了。',
- '深夜,一个年轻人穿过天安门广场向纪念堂走去。在二十二世纪编年史中,计算机把他的代号定为M102。',
- '他知道,这最后一课要提前讲了。又一阵剧痛从肝部袭来,几乎使他晕厥过去。',
- '在距地球五万光年的远方,在银河系的中心,一场延续了两万年的星际战争已接近尾声。那里的太空中渐渐隐现出一个方形区域,仿佛灿烂的群星的背景被剪出一个方口。',
- '伊依一行三人乘坐一艘游艇在南太平洋上做吟诗航行,他们的目的地是南极,如果几天后能顺利到达那里,他们将钻出地壳去看诗云。',
- '很多人生来就会莫名其妙地迷上一样东西,仿佛他的出生就是要和这东西约会似的,正是这样,圆圆迷上了肥皂泡。'
- ]
-
-
- def begin_game_step_0(self, prompt, chatbot, history):
- # init game at step 0
- self.headstart = random.choice(self.story_headstart)
- self.story = []
- chatbot.append(["互动写故事", f"这次的故事开头是:{self.headstart}"])
- self.sys_prompt_ = '你是一个想象力丰富的杰出作家。正在与你的朋友互动,一起写故事,因此你每次写的故事段落应少于300字(结局除外)。'
-
-
- def generate_story_image(self, story_paragraph):
- try:
- from crazy_functions.图片生成 import gen_image
- prompt_ = predict_no_ui_long_connection(inputs=story_paragraph, llm_kwargs=self.llm_kwargs, history=[], sys_prompt='你需要根据用户给出的小说段落,进行简短的环境描写。要求:80字以内。')
- image_url, image_path = gen_image(self.llm_kwargs, prompt_, '512x512', model="dall-e-2", quality='standard', style='natural')
- return f'
'
- except:
- return ''
-
- def step(self, prompt, chatbot, history):
-
- """
- 首先,处理游戏初始化等特殊情况
- """
- if self.step_cnt == 0:
- self.begin_game_step_0(prompt, chatbot, history)
- self.lock_plugin(chatbot)
- self.cur_task = 'head_start'
- else:
- if prompt.strip() == 'exit' or prompt.strip() == '结束剧情':
- # should we terminate game here?
- self.delete_game = True
- yield from update_ui_lastest_msg(lastmsg=f"游戏结束。", chatbot=chatbot, history=history, delay=0.)
- return
- if '剧情收尾' in prompt:
- self.cur_task = 'story_terminate'
- # # well, game resumes
- # chatbot.append([prompt, ""])
- # update ui, don't keep the user waiting
- yield from update_ui(chatbot=chatbot, history=history)
-
-
- """
- 处理游戏的主体逻辑
- """
- if self.cur_task == 'head_start':
- """
- 这是游戏的第一步
- """
- inputs_ = prompts_hs.format(headstart=self.headstart)
- history_ = []
- story_paragraph = yield from request_gpt_model_in_new_thread_with_ui_alive(
- inputs_, '故事开头', self.llm_kwargs,
- chatbot, history_, self.sys_prompt_
- )
- self.story.append(story_paragraph)
- # # 配图
- yield from update_ui_lastest_msg(lastmsg=story_paragraph + '
正在生成插图中 ...', chatbot=chatbot, history=history, delay=0.)
- yield from update_ui_lastest_msg(lastmsg=story_paragraph + '
'+ self.generate_story_image(story_paragraph), chatbot=chatbot, history=history, delay=0.)
-
- # # 构建后续剧情引导
- previously_on_story = ""
- for s in self.story:
- previously_on_story += s + '\n'
- inputs_ = prompts_interact.format(previously_on_story=previously_on_story)
- history_ = []
- self.next_choices = yield from request_gpt_model_in_new_thread_with_ui_alive(
- inputs_, '请在以下几种故事走向中,选择一种(当然,您也可以选择给出其他故事走向):', self.llm_kwargs,
- chatbot,
- history_,
- self.sys_prompt_
- )
- self.cur_task = 'user_choice'
-
-
- elif self.cur_task == 'user_choice':
- """
- 根据用户的提示,确定故事的下一步
- """
- if '请在以下几种故事走向中,选择一种' in chatbot[-1][0]: chatbot.pop(-1)
- previously_on_story = ""
- for s in self.story:
- previously_on_story += s + '\n'
- inputs_ = prompts_resume.format(previously_on_story=previously_on_story, choice=self.next_choices, user_choice=prompt)
- history_ = []
- story_paragraph = yield from request_gpt_model_in_new_thread_with_ui_alive(
- inputs_, f'下一段故事(您的选择是:{prompt})。', self.llm_kwargs,
- chatbot, history_, self.sys_prompt_
- )
- self.story.append(story_paragraph)
- # # 配图
- yield from update_ui_lastest_msg(lastmsg=story_paragraph + '
正在生成插图中 ...', chatbot=chatbot, history=history, delay=0.)
- yield from update_ui_lastest_msg(lastmsg=story_paragraph + '
'+ self.generate_story_image(story_paragraph), chatbot=chatbot, history=history, delay=0.)
-
- # # 构建后续剧情引导
- previously_on_story = ""
- for s in self.story:
- previously_on_story += s + '\n'
- inputs_ = prompts_interact.format(previously_on_story=previously_on_story)
- history_ = []
- self.next_choices = yield from request_gpt_model_in_new_thread_with_ui_alive(
- inputs_,
- '请在以下几种故事走向中,选择一种。当然,您也可以给出您心中的其他故事走向。另外,如果您希望剧情立即收尾,请输入剧情走向,并以“剧情收尾”四个字提示程序。', self.llm_kwargs,
- chatbot,
- history_,
- self.sys_prompt_
- )
- self.cur_task = 'user_choice'
-
-
- elif self.cur_task == 'story_terminate':
- """
- 根据用户的提示,确定故事的结局
- """
- previously_on_story = ""
- for s in self.story:
- previously_on_story += s + '\n'
- inputs_ = prompts_terminate.format(previously_on_story=previously_on_story, user_choice=prompt)
- history_ = []
- story_paragraph = yield from request_gpt_model_in_new_thread_with_ui_alive(
- inputs_, f'故事收尾(您的选择是:{prompt})。', self.llm_kwargs,
- chatbot, history_, self.sys_prompt_
- )
- # # 配图
- yield from update_ui_lastest_msg(lastmsg=story_paragraph + '
正在生成插图中 ...', chatbot=chatbot, history=history, delay=0.)
- yield from update_ui_lastest_msg(lastmsg=story_paragraph + '
'+ self.generate_story_image(story_paragraph), chatbot=chatbot, history=history, delay=0.)
-
- # terminate game
- self.delete_game = True
- return
diff --git a/crazy_functions/game_fns/game_utils.py b/crazy_functions/game_fns/game_utils.py
deleted file mode 100644
index 09b6f7a935f3e1f254c4cd0f3b74f78e4c2af298..0000000000000000000000000000000000000000
--- a/crazy_functions/game_fns/game_utils.py
+++ /dev/null
@@ -1,35 +0,0 @@
-
-from crazy_functions.json_fns.pydantic_io import GptJsonIO, JsonStringError
-from request_llms.bridge_all import predict_no_ui_long_connection
-def get_code_block(reply):
- import re
- pattern = r"```([\s\S]*?)```" # regex pattern to match code blocks
- matches = re.findall(pattern, reply) # find all code blocks in text
- if len(matches) == 1:
- return "```" + matches[0] + "```" # code block
- raise RuntimeError("GPT is not generating proper code.")
-
-def is_same_thing(a, b, llm_kwargs):
- from pydantic import BaseModel, Field
- class IsSameThing(BaseModel):
- is_same_thing: bool = Field(description="determine whether two objects are same thing.", default=False)
-
- def run_gpt_fn(inputs, sys_prompt, history=[]):
- return predict_no_ui_long_connection(
- inputs=inputs, llm_kwargs=llm_kwargs,
- history=history, sys_prompt=sys_prompt, observe_window=[]
- )
-
- gpt_json_io = GptJsonIO(IsSameThing)
- inputs_01 = "Identity whether the user input and the target is the same thing: \n target object: {a} \n user input object: {b} \n\n\n".format(a=a, b=b)
- inputs_01 += "\n\n\n Note that the user may describe the target object with a different language, e.g. cat and 猫 are the same thing."
- analyze_res_cot_01 = run_gpt_fn(inputs_01, "", [])
-
- inputs_02 = inputs_01 + gpt_json_io.format_instructions
- analyze_res = run_gpt_fn(inputs_02, "", [inputs_01, analyze_res_cot_01])
-
- try:
- res = gpt_json_io.generate_output_auto_repair(analyze_res, run_gpt_fn)
- return res.is_same_thing
- except JsonStringError as e:
- return False
\ No newline at end of file
diff --git a/crazy_functions/gen_fns/gen_fns_shared.py b/crazy_functions/gen_fns/gen_fns_shared.py
deleted file mode 100644
index 8e73794e84437e861d3468d4f0ab799deae6d98c..0000000000000000000000000000000000000000
--- a/crazy_functions/gen_fns/gen_fns_shared.py
+++ /dev/null
@@ -1,70 +0,0 @@
-import time
-import importlib
-from toolbox import trimmed_format_exc, gen_time_str, get_log_folder
-from toolbox import CatchException, update_ui, gen_time_str, trimmed_format_exc, is_the_upload_folder
-from toolbox import promote_file_to_downloadzone, get_log_folder, update_ui_lastest_msg
-import multiprocessing
-
-def get_class_name(class_string):
- import re
- # Use regex to extract the class name
- class_name = re.search(r'class (\w+)\(', class_string).group(1)
- return class_name
-
-def try_make_module(code, chatbot):
- module_file = 'gpt_fn_' + gen_time_str().replace('-','_')
- fn_path = f'{get_log_folder(plugin_name="gen_plugin_verify")}/{module_file}.py'
- with open(fn_path, 'w', encoding='utf8') as f: f.write(code)
- promote_file_to_downloadzone(fn_path, chatbot=chatbot)
- class_name = get_class_name(code)
- manager = multiprocessing.Manager()
- return_dict = manager.dict()
- p = multiprocessing.Process(target=is_function_successfully_generated, args=(fn_path, class_name, return_dict))
- # only has 10 seconds to run
- p.start(); p.join(timeout=10)
- if p.is_alive(): p.terminate(); p.join()
- p.close()
- return return_dict["success"], return_dict['traceback']
-
-# check is_function_successfully_generated
-def is_function_successfully_generated(fn_path, class_name, return_dict):
- return_dict['success'] = False
- return_dict['traceback'] = ""
- try:
- # Create a spec for the module
- module_spec = importlib.util.spec_from_file_location('example_module', fn_path)
- # Load the module
- example_module = importlib.util.module_from_spec(module_spec)
- module_spec.loader.exec_module(example_module)
- # Now you can use the module
- some_class = getattr(example_module, class_name)
- # Now you can create an instance of the class
- instance = some_class()
- return_dict['success'] = True
- return
- except:
- return_dict['traceback'] = trimmed_format_exc()
- return
-
-def subprocess_worker(code, file_path, return_dict):
- return_dict['result'] = None
- return_dict['success'] = False
- return_dict['traceback'] = ""
- try:
- module_file = 'gpt_fn_' + gen_time_str().replace('-','_')
- fn_path = f'{get_log_folder(plugin_name="gen_plugin_run")}/{module_file}.py'
- with open(fn_path, 'w', encoding='utf8') as f: f.write(code)
- class_name = get_class_name(code)
- # Create a spec for the module
- module_spec = importlib.util.spec_from_file_location('example_module', fn_path)
- # Load the module
- example_module = importlib.util.module_from_spec(module_spec)
- module_spec.loader.exec_module(example_module)
- # Now you can use the module
- some_class = getattr(example_module, class_name)
- # Now you can create an instance of the class
- instance = some_class()
- return_dict['result'] = instance.run(file_path)
- return_dict['success'] = True
- except:
- return_dict['traceback'] = trimmed_format_exc()
diff --git a/crazy_functions/ipc_fns/mp.py b/crazy_functions/ipc_fns/mp.py
deleted file mode 100644
index 575d47ccecbb775205193085c58c06a114d3bfc2..0000000000000000000000000000000000000000
--- a/crazy_functions/ipc_fns/mp.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import platform
-import pickle
-import multiprocessing
-
-def run_in_subprocess_wrapper_func(v_args):
- func, args, kwargs, return_dict, exception_dict = pickle.loads(v_args)
- import sys
- try:
- result = func(*args, **kwargs)
- return_dict['result'] = result
- except Exception as e:
- exc_info = sys.exc_info()
- exception_dict['exception'] = exc_info
-
-def run_in_subprocess_with_timeout(func, timeout=60):
- if platform.system() == 'Linux':
- def wrapper(*args, **kwargs):
- return_dict = multiprocessing.Manager().dict()
- exception_dict = multiprocessing.Manager().dict()
- v_args = pickle.dumps((func, args, kwargs, return_dict, exception_dict))
- process = multiprocessing.Process(target=run_in_subprocess_wrapper_func, args=(v_args,))
- process.start()
- process.join(timeout)
- if process.is_alive():
- process.terminate()
- raise TimeoutError(f'功能单元{str(func)}未能在规定时间内完成任务')
- process.close()
- if 'exception' in exception_dict:
- # ooops, the subprocess ran into an exception
- exc_info = exception_dict['exception']
- raise exc_info[1].with_traceback(exc_info[2])
- if 'result' in return_dict.keys():
- # If the subprocess ran successfully, return the result
- return return_dict['result']
- return wrapper
- else:
- return func
\ No newline at end of file
diff --git a/crazy_functions/json_fns/pydantic_io.py b/crazy_functions/json_fns/pydantic_io.py
deleted file mode 100644
index 4e300d65dd918f890d64e68e0cc5a37f36366585..0000000000000000000000000000000000000000
--- a/crazy_functions/json_fns/pydantic_io.py
+++ /dev/null
@@ -1,111 +0,0 @@
-"""
-https://github.com/langchain-ai/langchain/blob/master/docs/extras/modules/model_io/output_parsers/pydantic.ipynb
-
-Example 1.
-
-# Define your desired data structure.
-class Joke(BaseModel):
- setup: str = Field(description="question to set up a joke")
- punchline: str = Field(description="answer to resolve the joke")
-
- # You can add custom validation logic easily with Pydantic.
- @validator("setup")
- def question_ends_with_question_mark(cls, field):
- if field[-1] != "?":
- raise ValueError("Badly formed question!")
- return field
-
-
-Example 2.
-
-# Here's another example, but with a compound typed field.
-class Actor(BaseModel):
- name: str = Field(description="name of an actor")
- film_names: List[str] = Field(description="list of names of films they starred in")
-"""
-
-import json, re, logging
-
-
-PYDANTIC_FORMAT_INSTRUCTIONS = """The output should be formatted as a JSON instance that conforms to the JSON schema below.
-
-As an example, for the schema {{"properties": {{"foo": {{"title": "Foo", "description": "a list of strings", "type": "array", "items": {{"type": "string"}}}}}}, "required": ["foo"]}}
-the object {{"foo": ["bar", "baz"]}} is a well-formatted instance of the schema. The object {{"properties": {{"foo": ["bar", "baz"]}}}} is not well-formatted.
-
-Here is the output schema:
-```
-{schema}
-```"""
-
-
-PYDANTIC_FORMAT_INSTRUCTIONS_SIMPLE = """The output should be formatted as a JSON instance that conforms to the JSON schema below.
-```
-{schema}
-```"""
-
-class JsonStringError(Exception): ...
-
-class GptJsonIO():
-
- def __init__(self, schema, example_instruction=True):
- self.pydantic_object = schema
- self.example_instruction = example_instruction
- self.format_instructions = self.generate_format_instructions()
-
- def generate_format_instructions(self):
- schema = self.pydantic_object.schema()
-
- # Remove extraneous fields.
- reduced_schema = schema
- if "title" in reduced_schema:
- del reduced_schema["title"]
- if "type" in reduced_schema:
- del reduced_schema["type"]
- # Ensure json in context is well-formed with double quotes.
- if self.example_instruction:
- schema_str = json.dumps(reduced_schema)
- return PYDANTIC_FORMAT_INSTRUCTIONS.format(schema=schema_str)
- else:
- return PYDANTIC_FORMAT_INSTRUCTIONS_SIMPLE.format(schema=schema_str)
-
- def generate_output(self, text):
- # Greedy search for 1st json candidate.
- match = re.search(
- r"\{.*\}", text.strip(), re.MULTILINE | re.IGNORECASE | re.DOTALL
- )
- json_str = ""
- if match: json_str = match.group()
- json_object = json.loads(json_str, strict=False)
- final_object = self.pydantic_object.parse_obj(json_object)
- return final_object
-
- def generate_repair_prompt(self, broken_json, error):
- prompt = "Fix a broken json string.\n\n" + \
- "(1) The broken json string need to fix is: \n\n" + \
- "```" + "\n" + \
- broken_json + "\n" + \
- "```" + "\n\n" + \
- "(2) The error message is: \n\n" + \
- error + "\n\n" + \
- "Now, fix this json string. \n\n"
- return prompt
-
- def generate_output_auto_repair(self, response, gpt_gen_fn):
- """
- response: string containing canidate json
- gpt_gen_fn: gpt_gen_fn(inputs, sys_prompt)
- """
- try:
- result = self.generate_output(response)
- except Exception as e:
- try:
- logging.info(f'Repairing json:{response}')
- repair_prompt = self.generate_repair_prompt(broken_json = response, error=repr(e))
- result = self.generate_output(gpt_gen_fn(repair_prompt, self.format_instructions))
- logging.info('Repaire json success.')
- except Exception as e:
- # 没辙了,放弃治疗
- logging.info('Repaire json fail.')
- raise JsonStringError('Cannot repair json.', str(e))
- return result
-
diff --git a/crazy_functions/latex_fns/latex_actions.py b/crazy_functions/latex_fns/latex_actions.py
deleted file mode 100644
index 8772f5e1fb530d72be282deaef2eb18ed9ffa1d2..0000000000000000000000000000000000000000
--- a/crazy_functions/latex_fns/latex_actions.py
+++ /dev/null
@@ -1,467 +0,0 @@
-from toolbox import update_ui, update_ui_lastest_msg, get_log_folder
-from toolbox import get_conf, objdump, objload, promote_file_to_downloadzone
-from .latex_toolbox import PRESERVE, TRANSFORM
-from .latex_toolbox import set_forbidden_text, set_forbidden_text_begin_end, set_forbidden_text_careful_brace
-from .latex_toolbox import reverse_forbidden_text_careful_brace, reverse_forbidden_text, convert_to_linklist, post_process
-from .latex_toolbox import fix_content, find_main_tex_file, merge_tex_files, compile_latex_with_timeout
-from .latex_toolbox import find_title_and_abs
-
-import os, shutil
-import re
-import numpy as np
-
-pj = os.path.join
-
-
-def split_subprocess(txt, project_folder, return_dict, opts):
- """
- break down latex file to a linked list,
- each node use a preserve flag to indicate whether it should
- be proccessed by GPT.
- """
- text = txt
- mask = np.zeros(len(txt), dtype=np.uint8) + TRANSFORM
-
- # 吸收title与作者以上的部分
- text, mask = set_forbidden_text(text, mask, r"^(.*?)\\maketitle", re.DOTALL)
- text, mask = set_forbidden_text(text, mask, r"^(.*?)\\begin{document}", re.DOTALL)
- # 吸收iffalse注释
- text, mask = set_forbidden_text(text, mask, r"\\iffalse(.*?)\\fi", re.DOTALL)
- # 吸收在42行以内的begin-end组合
- text, mask = set_forbidden_text_begin_end(text, mask, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=42)
- # 吸收匿名公式
- text, mask = set_forbidden_text(text, mask, [ r"\$\$([^$]+)\$\$", r"\\\[.*?\\\]" ], re.DOTALL)
- # 吸收其他杂项
- text, mask = set_forbidden_text(text, mask, [ r"\\section\{(.*?)\}", r"\\section\*\{(.*?)\}", r"\\subsection\{(.*?)\}", r"\\subsubsection\{(.*?)\}" ])
- text, mask = set_forbidden_text(text, mask, [ r"\\bibliography\{(.*?)\}", r"\\bibliographystyle\{(.*?)\}" ])
- text, mask = set_forbidden_text(text, mask, r"\\begin\{thebibliography\}.*?\\end\{thebibliography\}", re.DOTALL)
- text, mask = set_forbidden_text(text, mask, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL)
- text, mask = set_forbidden_text(text, mask, r"\\begin\{wraptable\}(.*?)\\end\{wraptable\}", re.DOTALL)
- text, mask = set_forbidden_text(text, mask, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL)
- text, mask = set_forbidden_text(text, mask, [r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}"], re.DOTALL)
- text, mask = set_forbidden_text(text, mask, [r"\\begin\{figure\}(.*?)\\end\{figure\}", r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}"], re.DOTALL)
- text, mask = set_forbidden_text(text, mask, [r"\\begin\{multline\}(.*?)\\end\{multline\}", r"\\begin\{multline\*\}(.*?)\\end\{multline\*\}"], re.DOTALL)
- text, mask = set_forbidden_text(text, mask, [r"\\begin\{table\}(.*?)\\end\{table\}", r"\\begin\{table\*\}(.*?)\\end\{table\*\}"], re.DOTALL)
- text, mask = set_forbidden_text(text, mask, [r"\\begin\{minipage\}(.*?)\\end\{minipage\}", r"\\begin\{minipage\*\}(.*?)\\end\{minipage\*\}"], re.DOTALL)
- text, mask = set_forbidden_text(text, mask, [r"\\begin\{align\*\}(.*?)\\end\{align\*\}", r"\\begin\{align\}(.*?)\\end\{align\}"], re.DOTALL)
- text, mask = set_forbidden_text(text, mask, [r"\\begin\{equation\}(.*?)\\end\{equation\}", r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}"], re.DOTALL)
- text, mask = set_forbidden_text(text, mask, [r"\\includepdf\[(.*?)\]\{(.*?)\}", r"\\clearpage", r"\\newpage", r"\\appendix", r"\\tableofcontents", r"\\include\{(.*?)\}"])
- text, mask = set_forbidden_text(text, mask, [r"\\vspace\{(.*?)\}", r"\\hspace\{(.*?)\}", r"\\label\{(.*?)\}", r"\\begin\{(.*?)\}", r"\\end\{(.*?)\}", r"\\item "])
- text, mask = set_forbidden_text_careful_brace(text, mask, r"\\hl\{(.*?)\}", re.DOTALL)
- # reverse 操作必须放在最后
- text, mask = reverse_forbidden_text_careful_brace(text, mask, r"\\caption\{(.*?)\}", re.DOTALL, forbid_wrapper=True)
- text, mask = reverse_forbidden_text_careful_brace(text, mask, r"\\abstract\{(.*?)\}", re.DOTALL, forbid_wrapper=True)
- text, mask = reverse_forbidden_text(text, mask, r"\\begin\{abstract\}(.*?)\\end\{abstract\}", re.DOTALL, forbid_wrapper=True)
- root = convert_to_linklist(text, mask)
-
- # 最后一步处理,增强稳健性
- root = post_process(root)
-
- # 输出html调试文件,用红色标注处保留区(PRESERVE),用黑色标注转换区(TRANSFORM)
- with open(pj(project_folder, 'debug_log.html'), 'w', encoding='utf8') as f:
- segment_parts_for_gpt = []
- nodes = []
- node = root
- while True:
- nodes.append(node)
- show_html = node.string.replace('\n','
')
- if not node.preserve:
- segment_parts_for_gpt.append(node.string)
- f.write(f'#{node.range}{show_html}#
')
- else:
- f.write(f'{show_html}
')
- node = node.next
- if node is None: break
-
- for n in nodes: n.next = None # break
- return_dict['nodes'] = nodes
- return_dict['segment_parts_for_gpt'] = segment_parts_for_gpt
- return return_dict
-
-class LatexPaperSplit():
- """
- break down latex file to a linked list,
- each node use a preserve flag to indicate whether it should
- be proccessed by GPT.
- """
- def __init__(self) -> None:
- self.nodes = None
- self.msg = "*{\\scriptsize\\textbf{警告:该PDF由GPT-Academic开源项目调用大语言模型+Latex翻译插件一键生成," + \
- "版权归原文作者所有。翻译内容可靠性无保障,请仔细鉴别并以原文为准。" + \
- "项目Github地址 \\url{https://github.com/binary-husky/gpt_academic/}。"
- # 请您不要删除或修改这行警告,除非您是论文的原作者(如果您是论文原作者,欢迎加REAME中的QQ联系开发者)
- self.msg_declare = "为了防止大语言模型的意外谬误产生扩散影响,禁止移除或修改此警告。}}\\\\"
- self.title = "unknown"
- self.abstract = "unknown"
-
- def read_title_and_abstract(self, txt):
- try:
- title, abstract = find_title_and_abs(txt)
- if title is not None:
- self.title = title.replace('\n', ' ').replace('\\\\', ' ').replace(' ', '').replace(' ', '')
- if abstract is not None:
- self.abstract = abstract.replace('\n', ' ').replace('\\\\', ' ').replace(' ', '').replace(' ', '')
- except:
- pass
-
- def merge_result(self, arr, mode, msg, buggy_lines=[], buggy_line_surgery_n_lines=10):
- """
- Merge the result after the GPT process completed
- """
- result_string = ""
- node_cnt = 0
- line_cnt = 0
-
- for node in self.nodes:
- if node.preserve:
- line_cnt += node.string.count('\n')
- result_string += node.string
- else:
- translated_txt = fix_content(arr[node_cnt], node.string)
- begin_line = line_cnt
- end_line = line_cnt + translated_txt.count('\n')
-
- # reverse translation if any error
- if any([begin_line-buggy_line_surgery_n_lines <= b_line <= end_line+buggy_line_surgery_n_lines for b_line in buggy_lines]):
- translated_txt = node.string
-
- result_string += translated_txt
- node_cnt += 1
- line_cnt += translated_txt.count('\n')
-
- if mode == 'translate_zh':
- pattern = re.compile(r'\\begin\{abstract\}.*\n')
- match = pattern.search(result_string)
- if not match:
- # match \abstract{xxxx}
- pattern_compile = re.compile(r"\\abstract\{(.*?)\}", flags=re.DOTALL)
- match = pattern_compile.search(result_string)
- position = match.regs[1][0]
- else:
- # match \begin{abstract}xxxx\end{abstract}
- position = match.end()
- result_string = result_string[:position] + self.msg + msg + self.msg_declare + result_string[position:]
- return result_string
-
-
- def split(self, txt, project_folder, opts):
- """
- break down latex file to a linked list,
- each node use a preserve flag to indicate whether it should
- be proccessed by GPT.
- P.S. use multiprocessing to avoid timeout error
- """
- import multiprocessing
- manager = multiprocessing.Manager()
- return_dict = manager.dict()
- p = multiprocessing.Process(
- target=split_subprocess,
- args=(txt, project_folder, return_dict, opts))
- p.start()
- p.join()
- p.close()
- self.nodes = return_dict['nodes']
- self.sp = return_dict['segment_parts_for_gpt']
- return self.sp
-
-
-class LatexPaperFileGroup():
- """
- use tokenizer to break down text according to max_token_limit
- """
- def __init__(self):
- self.file_paths = []
- self.file_contents = []
- self.sp_file_contents = []
- self.sp_file_index = []
- self.sp_file_tag = []
- # count_token
- from request_llms.bridge_all import model_info
- enc = model_info["gpt-3.5-turbo"]['tokenizer']
- def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
- self.get_token_num = get_token_num
-
- def run_file_split(self, max_token_limit=1900):
- """
- use tokenizer to break down text according to max_token_limit
- """
- for index, file_content in enumerate(self.file_contents):
- if self.get_token_num(file_content) < max_token_limit:
- self.sp_file_contents.append(file_content)
- self.sp_file_index.append(index)
- self.sp_file_tag.append(self.file_paths[index])
- else:
- from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
- segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit)
- for j, segment in enumerate(segments):
- self.sp_file_contents.append(segment)
- self.sp_file_index.append(index)
- self.sp_file_tag.append(self.file_paths[index] + f".part-{j}.tex")
-
- def merge_result(self):
- self.file_result = ["" for _ in range(len(self.file_paths))]
- for r, k in zip(self.sp_file_result, self.sp_file_index):
- self.file_result[k] += r
-
- def write_result(self):
- manifest = []
- for path, res in zip(self.file_paths, self.file_result):
- with open(path + '.polish.tex', 'w', encoding='utf8') as f:
- manifest.append(path + '.polish.tex')
- f.write(res)
- return manifest
-
-
-def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, mode='proofread', switch_prompt=None, opts=[]):
- import time, os, re
- from ..crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
- from .latex_actions import LatexPaperFileGroup, LatexPaperSplit
-
- # <-------- 寻找主tex文件 ---------->
- maintex = find_main_tex_file(file_manifest, mode)
- chatbot.append((f"定位主Latex文件", f'[Local Message] 分析结果:该项目的Latex主文件是{maintex}, 如果分析错误, 请立即终止程序, 删除或修改歧义文件, 然后重试。主程序即将开始, 请稍候。'))
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- time.sleep(3)
-
- # <-------- 读取Latex文件, 将多文件tex工程融合为一个巨型tex ---------->
- main_tex_basename = os.path.basename(maintex)
- assert main_tex_basename.endswith('.tex')
- main_tex_basename_bare = main_tex_basename[:-4]
- may_exist_bbl = pj(project_folder, f'{main_tex_basename_bare}.bbl')
- if os.path.exists(may_exist_bbl):
- shutil.copyfile(may_exist_bbl, pj(project_folder, f'merge.bbl'))
- shutil.copyfile(may_exist_bbl, pj(project_folder, f'merge_{mode}.bbl'))
- shutil.copyfile(may_exist_bbl, pj(project_folder, f'merge_diff.bbl'))
-
- with open(maintex, 'r', encoding='utf-8', errors='replace') as f:
- content = f.read()
- merged_content = merge_tex_files(project_folder, content, mode)
-
- with open(project_folder + '/merge.tex', 'w', encoding='utf-8', errors='replace') as f:
- f.write(merged_content)
-
- # <-------- 精细切分latex文件 ---------->
- chatbot.append((f"Latex文件融合完成", f'[Local Message] 正在精细切分latex文件,这需要一段时间计算,文档越长耗时越长,请耐心等待。'))
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- lps = LatexPaperSplit()
- lps.read_title_and_abstract(merged_content)
- res = lps.split(merged_content, project_folder, opts) # 消耗时间的函数
- # <-------- 拆分过长的latex片段 ---------->
- pfg = LatexPaperFileGroup()
- for index, r in enumerate(res):
- pfg.file_paths.append('segment-' + str(index))
- pfg.file_contents.append(r)
-
- pfg.run_file_split(max_token_limit=1024)
- n_split = len(pfg.sp_file_contents)
-
- # <-------- 根据需要切换prompt ---------->
- inputs_array, sys_prompt_array = switch_prompt(pfg, mode)
- inputs_show_user_array = [f"{mode} {f}" for f in pfg.sp_file_tag]
-
- if os.path.exists(pj(project_folder,'temp.pkl')):
-
- # <-------- 【仅调试】如果存在调试缓存文件,则跳过GPT请求环节 ---------->
- pfg = objload(file=pj(project_folder,'temp.pkl'))
-
- else:
- # <-------- gpt 多线程请求 ---------->
- history_array = [[""] for _ in range(n_split)]
- # LATEX_EXPERIMENTAL, = get_conf('LATEX_EXPERIMENTAL')
- # if LATEX_EXPERIMENTAL:
- # paper_meta = f"The paper you processing is `{lps.title}`, a part of the abstraction is `{lps.abstract}`"
- # paper_meta_max_len = 888
- # history_array = [[ paper_meta[:paper_meta_max_len] + '...', "Understand, what should I do?"] for _ in range(n_split)]
-
- gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
- inputs_array=inputs_array,
- inputs_show_user_array=inputs_show_user_array,
- llm_kwargs=llm_kwargs,
- chatbot=chatbot,
- history_array=history_array,
- sys_prompt_array=sys_prompt_array,
- # max_workers=5, # 并行任务数量限制, 最多同时执行5个, 其他的排队等待
- scroller_max_len = 40
- )
-
- # <-------- 文本碎片重组为完整的tex片段 ---------->
- pfg.sp_file_result = []
- for i_say, gpt_say, orig_content in zip(gpt_response_collection[0::2], gpt_response_collection[1::2], pfg.sp_file_contents):
- pfg.sp_file_result.append(gpt_say)
- pfg.merge_result()
-
- # <-------- 临时存储用于调试 ---------->
- pfg.get_token_num = None
- objdump(pfg, file=pj(project_folder,'temp.pkl'))
-
- write_html(pfg.sp_file_contents, pfg.sp_file_result, chatbot=chatbot, project_folder=project_folder)
-
- # <-------- 写出文件 ---------->
- msg = f"当前大语言模型: {llm_kwargs['llm_model']},当前语言模型温度设定: {llm_kwargs['temperature']}。"
- final_tex = lps.merge_result(pfg.file_result, mode, msg)
- objdump((lps, pfg.file_result, mode, msg), file=pj(project_folder,'merge_result.pkl'))
-
- with open(project_folder + f'/merge_{mode}.tex', 'w', encoding='utf-8', errors='replace') as f:
- if mode != 'translate_zh' or "binary" in final_tex: f.write(final_tex)
-
-
- # <-------- 整理结果, 退出 ---------->
- chatbot.append((f"完成了吗?", 'GPT结果已输出, 即将编译PDF'))
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-
- # <-------- 返回 ---------->
- return project_folder + f'/merge_{mode}.tex'
-
-
-def remove_buggy_lines(file_path, log_path, tex_name, tex_name_pure, n_fix, work_folder_modified, fixed_line=[]):
- try:
- with open(log_path, 'r', encoding='utf-8', errors='replace') as f:
- log = f.read()
- import re
- buggy_lines = re.findall(tex_name+':([0-9]{1,5}):', log)
- buggy_lines = [int(l) for l in buggy_lines]
- buggy_lines = sorted(buggy_lines)
- buggy_line = buggy_lines[0]-1
- print("reversing tex line that has errors", buggy_line)
-
- # 重组,逆转出错的段落
- if buggy_line not in fixed_line:
- fixed_line.append(buggy_line)
-
- lps, file_result, mode, msg = objload(file=pj(work_folder_modified,'merge_result.pkl'))
- final_tex = lps.merge_result(file_result, mode, msg, buggy_lines=fixed_line, buggy_line_surgery_n_lines=5*n_fix)
-
- with open(pj(work_folder_modified, f"{tex_name_pure}_fix_{n_fix}.tex"), 'w', encoding='utf-8', errors='replace') as f:
- f.write(final_tex)
-
- return True, f"{tex_name_pure}_fix_{n_fix}", buggy_lines
- except:
- print("Fatal error occurred, but we cannot identify error, please download zip, read latex log, and compile manually.")
- return False, -1, [-1]
-
-
-def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder, mode='default'):
- import os, time
- n_fix = 1
- fixed_line = []
- max_try = 32
- chatbot.append([f"正在编译PDF文档", f'编译已经开始。当前工作路径为{work_folder},如果程序停顿5分钟以上,请直接去该路径下取回翻译结果,或者重启之后再度尝试 ...']); yield from update_ui(chatbot=chatbot, history=history)
- chatbot.append([f"正在编译PDF文档", '...']); yield from update_ui(chatbot=chatbot, history=history); time.sleep(1); chatbot[-1] = list(chatbot[-1]) # 刷新界面
- yield from update_ui_lastest_msg('编译已经开始...', chatbot, history) # 刷新Gradio前端界面
-
- while True:
- import os
- may_exist_bbl = pj(work_folder_modified, f'merge.bbl')
- target_bbl = pj(work_folder_modified, f'{main_file_modified}.bbl')
- if os.path.exists(may_exist_bbl) and not os.path.exists(target_bbl):
- shutil.copyfile(may_exist_bbl, target_bbl)
-
- # https://stackoverflow.com/questions/738755/dont-make-me-manually-abort-a-latex-compile-when-theres-an-error
- yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译原始PDF ...', chatbot, history) # 刷新Gradio前端界面
- ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original)
-
- yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译转化后的PDF ...', chatbot, history) # 刷新Gradio前端界面
- ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified)
-
- if ok and os.path.exists(pj(work_folder_modified, f'{main_file_modified}.pdf')):
- # 只有第二步成功,才能继续下面的步骤
- yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译BibTex ...', chatbot, history) # 刷新Gradio前端界面
- if not os.path.exists(pj(work_folder_original, f'{main_file_original}.bbl')):
- ok = compile_latex_with_timeout(f'bibtex {main_file_original}.aux', work_folder_original)
- if not os.path.exists(pj(work_folder_modified, f'{main_file_modified}.bbl')):
- ok = compile_latex_with_timeout(f'bibtex {main_file_modified}.aux', work_folder_modified)
-
- yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译文献交叉引用 ...', chatbot, history) # 刷新Gradio前端界面
- ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original)
- ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified)
- ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original)
- ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified)
-
- if mode!='translate_zh':
- yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 使用latexdiff生成论文转化前后对比 ...', chatbot, history) # 刷新Gradio前端界面
- print( f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex')
- ok = compile_latex_with_timeout(f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex', os.getcwd())
-
- yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 正在编译对比PDF ...', chatbot, history) # 刷新Gradio前端界面
- ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder)
- ok = compile_latex_with_timeout(f'bibtex merge_diff.aux', work_folder)
- ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder)
- ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder)
-
- # <---------- 检查结果 ----------->
- results_ = ""
- original_pdf_success = os.path.exists(pj(work_folder_original, f'{main_file_original}.pdf'))
- modified_pdf_success = os.path.exists(pj(work_folder_modified, f'{main_file_modified}.pdf'))
- diff_pdf_success = os.path.exists(pj(work_folder, f'merge_diff.pdf'))
- results_ += f"原始PDF编译是否成功: {original_pdf_success};"
- results_ += f"转化PDF编译是否成功: {modified_pdf_success};"
- results_ += f"对比PDF编译是否成功: {diff_pdf_success};"
- yield from update_ui_lastest_msg(f'第{n_fix}编译结束:
{results_}...', chatbot, history) # 刷新Gradio前端界面
-
- if diff_pdf_success:
- result_pdf = pj(work_folder_modified, f'merge_diff.pdf') # get pdf path
- promote_file_to_downloadzone(result_pdf, rename_file=None, chatbot=chatbot) # promote file to web UI
- if modified_pdf_success:
- yield from update_ui_lastest_msg(f'转化PDF编译已经成功, 正在尝试生成对比PDF, 请稍候 ...', chatbot, history) # 刷新Gradio前端界面
- result_pdf = pj(work_folder_modified, f'{main_file_modified}.pdf') # get pdf path
- origin_pdf = pj(work_folder_original, f'{main_file_original}.pdf') # get pdf path
- if os.path.exists(pj(work_folder, '..', 'translation')):
- shutil.copyfile(result_pdf, pj(work_folder, '..', 'translation', 'translate_zh.pdf'))
- promote_file_to_downloadzone(result_pdf, rename_file=None, chatbot=chatbot) # promote file to web UI
- # 将两个PDF拼接
- if original_pdf_success:
- try:
- from .latex_toolbox import merge_pdfs
- concat_pdf = pj(work_folder_modified, f'comparison.pdf')
- merge_pdfs(origin_pdf, result_pdf, concat_pdf)
- if os.path.exists(pj(work_folder, '..', 'translation')):
- shutil.copyfile(concat_pdf, pj(work_folder, '..', 'translation', 'comparison.pdf'))
- promote_file_to_downloadzone(concat_pdf, rename_file=None, chatbot=chatbot) # promote file to web UI
- except Exception as e:
- print(e)
- pass
- return True # 成功啦
- else:
- if n_fix>=max_try: break
- n_fix += 1
- can_retry, main_file_modified, buggy_lines = remove_buggy_lines(
- file_path=pj(work_folder_modified, f'{main_file_modified}.tex'),
- log_path=pj(work_folder_modified, f'{main_file_modified}.log'),
- tex_name=f'{main_file_modified}.tex',
- tex_name_pure=f'{main_file_modified}',
- n_fix=n_fix,
- work_folder_modified=work_folder_modified,
- fixed_line=fixed_line
- )
- yield from update_ui_lastest_msg(f'由于最为关键的转化PDF编译失败, 将根据报错信息修正tex源文件并重试, 当前报错的latex代码处于第{buggy_lines}行 ...', chatbot, history) # 刷新Gradio前端界面
- if not can_retry: break
-
- return False # 失败啦
-
-
-def write_html(sp_file_contents, sp_file_result, chatbot, project_folder):
- # write html
- try:
- import shutil
- from crazy_functions.pdf_fns.report_gen_html import construct_html
- from toolbox import gen_time_str
- ch = construct_html()
- orig = ""
- trans = ""
- final = []
- for c,r in zip(sp_file_contents, sp_file_result):
- final.append(c)
- final.append(r)
- for i, k in enumerate(final):
- if i%2==0:
- orig = k
- if i%2==1:
- trans = k
- ch.add_row(a=orig, b=trans)
- create_report_file_name = f"{gen_time_str()}.trans.html"
- res = ch.save_file(create_report_file_name)
- shutil.copyfile(res, pj(project_folder, create_report_file_name))
- promote_file_to_downloadzone(file=res, chatbot=chatbot)
- except:
- from toolbox import trimmed_format_exc
- print('writing html result failed:', trimmed_format_exc())
diff --git a/crazy_functions/latex_fns/latex_toolbox.py b/crazy_functions/latex_fns/latex_toolbox.py
deleted file mode 100644
index 0a6a873b50b8299fa28bc41e27cf7a27a16637ae..0000000000000000000000000000000000000000
--- a/crazy_functions/latex_fns/latex_toolbox.py
+++ /dev/null
@@ -1,562 +0,0 @@
-import os, shutil
-import re
-import numpy as np
-PRESERVE = 0
-TRANSFORM = 1
-
-pj = os.path.join
-
-class LinkedListNode():
- """
- Linked List Node
- """
- def __init__(self, string, preserve=True) -> None:
- self.string = string
- self.preserve = preserve
- self.next = None
- self.range = None
- # self.begin_line = 0
- # self.begin_char = 0
-
-def convert_to_linklist(text, mask):
- root = LinkedListNode("", preserve=True)
- current_node = root
- for c, m, i in zip(text, mask, range(len(text))):
- if (m==PRESERVE and current_node.preserve) \
- or (m==TRANSFORM and not current_node.preserve):
- # add
- current_node.string += c
- else:
- current_node.next = LinkedListNode(c, preserve=(m==PRESERVE))
- current_node = current_node.next
- return root
-
-def post_process(root):
- # 修复括号
- node = root
- while True:
- string = node.string
- if node.preserve:
- node = node.next
- if node is None: break
- continue
- def break_check(string):
- str_stack = [""] # (lv, index)
- for i, c in enumerate(string):
- if c == '{':
- str_stack.append('{')
- elif c == '}':
- if len(str_stack) == 1:
- print('stack fix')
- return i
- str_stack.pop(-1)
- else:
- str_stack[-1] += c
- return -1
- bp = break_check(string)
-
- if bp == -1:
- pass
- elif bp == 0:
- node.string = string[:1]
- q = LinkedListNode(string[1:], False)
- q.next = node.next
- node.next = q
- else:
- node.string = string[:bp]
- q = LinkedListNode(string[bp:], False)
- q.next = node.next
- node.next = q
-
- node = node.next
- if node is None: break
-
- # 屏蔽空行和太短的句子
- node = root
- while True:
- if len(node.string.strip('\n').strip(''))==0: node.preserve = True
- if len(node.string.strip('\n').strip(''))<42: node.preserve = True
- node = node.next
- if node is None: break
- node = root
- while True:
- if node.next and node.preserve and node.next.preserve:
- node.string += node.next.string
- node.next = node.next.next
- node = node.next
- if node is None: break
-
- # 将前后断行符脱离
- node = root
- prev_node = None
- while True:
- if not node.preserve:
- lstriped_ = node.string.lstrip().lstrip('\n')
- if (prev_node is not None) and (prev_node.preserve) and (len(lstriped_)!=len(node.string)):
- prev_node.string += node.string[:-len(lstriped_)]
- node.string = lstriped_
- rstriped_ = node.string.rstrip().rstrip('\n')
- if (node.next is not None) and (node.next.preserve) and (len(rstriped_)!=len(node.string)):
- node.next.string = node.string[len(rstriped_):] + node.next.string
- node.string = rstriped_
- # =====
- prev_node = node
- node = node.next
- if node is None: break
-
- # 标注节点的行数范围
- node = root
- n_line = 0
- expansion = 2
- while True:
- n_l = node.string.count('\n')
- node.range = [n_line-expansion, n_line+n_l+expansion] # 失败时,扭转的范围
- n_line = n_line+n_l
- node = node.next
- if node is None: break
- return root
-
-
-"""
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-Latex segmentation with a binary mask (PRESERVE=0, TRANSFORM=1)
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-"""
-
-
-def set_forbidden_text(text, mask, pattern, flags=0):
- """
- Add a preserve text area in this paper
- e.g. with pattern = r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}"
- you can mask out (mask = PRESERVE so that text become untouchable for GPT)
- everything between "\begin{equation}" and "\end{equation}"
- """
- if isinstance(pattern, list): pattern = '|'.join(pattern)
- pattern_compile = re.compile(pattern, flags)
- for res in pattern_compile.finditer(text):
- mask[res.span()[0]:res.span()[1]] = PRESERVE
- return text, mask
-
-def reverse_forbidden_text(text, mask, pattern, flags=0, forbid_wrapper=True):
- """
- Move area out of preserve area (make text editable for GPT)
- count the number of the braces so as to catch compelete text area.
- e.g.
- \begin{abstract} blablablablablabla. \end{abstract}
- """
- if isinstance(pattern, list): pattern = '|'.join(pattern)
- pattern_compile = re.compile(pattern, flags)
- for res in pattern_compile.finditer(text):
- if not forbid_wrapper:
- mask[res.span()[0]:res.span()[1]] = TRANSFORM
- else:
- mask[res.regs[0][0]: res.regs[1][0]] = PRESERVE # '\\begin{abstract}'
- mask[res.regs[1][0]: res.regs[1][1]] = TRANSFORM # abstract
- mask[res.regs[1][1]: res.regs[0][1]] = PRESERVE # abstract
- return text, mask
-
-def set_forbidden_text_careful_brace(text, mask, pattern, flags=0):
- """
- Add a preserve text area in this paper (text become untouchable for GPT).
- count the number of the braces so as to catch compelete text area.
- e.g.
- \caption{blablablablabla\texbf{blablabla}blablabla.}
- """
- pattern_compile = re.compile(pattern, flags)
- for res in pattern_compile.finditer(text):
- brace_level = -1
- p = begin = end = res.regs[0][0]
- for _ in range(1024*16):
- if text[p] == '}' and brace_level == 0: break
- elif text[p] == '}': brace_level -= 1
- elif text[p] == '{': brace_level += 1
- p += 1
- end = p+1
- mask[begin:end] = PRESERVE
- return text, mask
-
-def reverse_forbidden_text_careful_brace(text, mask, pattern, flags=0, forbid_wrapper=True):
- """
- Move area out of preserve area (make text editable for GPT)
- count the number of the braces so as to catch compelete text area.
- e.g.
- \caption{blablablablabla\texbf{blablabla}blablabla.}
- """
- pattern_compile = re.compile(pattern, flags)
- for res in pattern_compile.finditer(text):
- brace_level = 0
- p = begin = end = res.regs[1][0]
- for _ in range(1024*16):
- if text[p] == '}' and brace_level == 0: break
- elif text[p] == '}': brace_level -= 1
- elif text[p] == '{': brace_level += 1
- p += 1
- end = p
- mask[begin:end] = TRANSFORM
- if forbid_wrapper:
- mask[res.regs[0][0]:begin] = PRESERVE
- mask[end:res.regs[0][1]] = PRESERVE
- return text, mask
-
-def set_forbidden_text_begin_end(text, mask, pattern, flags=0, limit_n_lines=42):
- """
- Find all \begin{} ... \end{} text block that with less than limit_n_lines lines.
- Add it to preserve area
- """
- pattern_compile = re.compile(pattern, flags)
- def search_with_line_limit(text, mask):
- for res in pattern_compile.finditer(text):
- cmd = res.group(1) # begin{what}
- this = res.group(2) # content between begin and end
- this_mask = mask[res.regs[2][0]:res.regs[2][1]]
- white_list = ['document', 'abstract', 'lemma', 'definition', 'sproof',
- 'em', 'emph', 'textit', 'textbf', 'itemize', 'enumerate']
- if (cmd in white_list) or this.count('\n') >= limit_n_lines: # use a magical number 42
- this, this_mask = search_with_line_limit(this, this_mask)
- mask[res.regs[2][0]:res.regs[2][1]] = this_mask
- else:
- mask[res.regs[0][0]:res.regs[0][1]] = PRESERVE
- return text, mask
- return search_with_line_limit(text, mask)
-
-
-
-"""
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-Latex Merge File
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-"""
-
-def find_main_tex_file(file_manifest, mode):
- """
- 在多Tex文档中,寻找主文件,必须包含documentclass,返回找到的第一个。
- P.S. 但愿没人把latex模板放在里面传进来 (6.25 加入判定latex模板的代码)
- """
- canidates = []
- for texf in file_manifest:
- if os.path.basename(texf).startswith('merge'):
- continue
- with open(texf, 'r', encoding='utf8', errors='ignore') as f:
- file_content = f.read()
- if r'\documentclass' in file_content:
- canidates.append(texf)
- else:
- continue
-
- if len(canidates) == 0:
- raise RuntimeError('无法找到一个主Tex文件(包含documentclass关键字)')
- elif len(canidates) == 1:
- return canidates[0]
- else: # if len(canidates) >= 2 通过一些Latex模板中常见(但通常不会出现在正文)的单词,对不同latex源文件扣分,取评分最高者返回
- canidates_score = []
- # 给出一些判定模板文档的词作为扣分项
- unexpected_words = ['\LaTeX', 'manuscript', 'Guidelines', 'font', 'citations', 'rejected', 'blind review', 'reviewers']
- expected_words = ['\input', '\ref', '\cite']
- for texf in canidates:
- canidates_score.append(0)
- with open(texf, 'r', encoding='utf8', errors='ignore') as f:
- file_content = f.read()
- file_content = rm_comments(file_content)
- for uw in unexpected_words:
- if uw in file_content:
- canidates_score[-1] -= 1
- for uw in expected_words:
- if uw in file_content:
- canidates_score[-1] += 1
- select = np.argmax(canidates_score) # 取评分最高者返回
- return canidates[select]
-
-def rm_comments(main_file):
- new_file_remove_comment_lines = []
- for l in main_file.splitlines():
- # 删除整行的空注释
- if l.lstrip().startswith("%"):
- pass
- else:
- new_file_remove_comment_lines.append(l)
- main_file = '\n'.join(new_file_remove_comment_lines)
- # main_file = re.sub(r"\\include{(.*?)}", r"\\input{\1}", main_file) # 将 \include 命令转换为 \input 命令
- main_file = re.sub(r'(? 0 and node_string.count('\_') > final_tex.count('\_'):
- # walk and replace any _ without \
- final_tex = re.sub(r"(?= limit_n_lines: # use a magical number 42
- this, this_mask = search_with_line_limit(this, this_mask)
- mask[res.regs[2][0]:res.regs[2][1]] = this_mask
- else:
- mask[res.regs[0][0]:res.regs[0][1]] = PRESERVE
- return text, mask
- return search_with_line_limit(text, mask)
-
-class LinkedListNode():
- """
- Linked List Node
- """
- def __init__(self, string, preserve=True) -> None:
- self.string = string
- self.preserve = preserve
- self.next = None
- # self.begin_line = 0
- # self.begin_char = 0
-
-def convert_to_linklist(text, mask):
- root = LinkedListNode("", preserve=True)
- current_node = root
- for c, m, i in zip(text, mask, range(len(text))):
- if (m==PRESERVE and current_node.preserve) \
- or (m==TRANSFORM and not current_node.preserve):
- # add
- current_node.string += c
- else:
- current_node.next = LinkedListNode(c, preserve=(m==PRESERVE))
- current_node = current_node.next
- return root
-"""
-========================================================================
-Latex Merge File
-========================================================================
-"""
-
-def 寻找Latex主文件(file_manifest, mode):
- """
- 在多Tex文档中,寻找主文件,必须包含documentclass,返回找到的第一个。
- P.S. 但愿没人把latex模板放在里面传进来 (6.25 加入判定latex模板的代码)
- """
- canidates = []
- for texf in file_manifest:
- if os.path.basename(texf).startswith('merge'):
- continue
- with open(texf, 'r', encoding='utf8') as f:
- file_content = f.read()
- if r'\documentclass' in file_content:
- canidates.append(texf)
- else:
- continue
-
- if len(canidates) == 0:
- raise RuntimeError('无法找到一个主Tex文件(包含documentclass关键字)')
- elif len(canidates) == 1:
- return canidates[0]
- else: # if len(canidates) >= 2 通过一些Latex模板中常见(但通常不会出现在正文)的单词,对不同latex源文件扣分,取评分最高者返回
- canidates_score = []
- # 给出一些判定模板文档的词作为扣分项
- unexpected_words = ['\LaTeX', 'manuscript', 'Guidelines', 'font', 'citations', 'rejected', 'blind review', 'reviewers']
- expected_words = ['\input', '\ref', '\cite']
- for texf in canidates:
- canidates_score.append(0)
- with open(texf, 'r', encoding='utf8') as f:
- file_content = f.read()
- for uw in unexpected_words:
- if uw in file_content:
- canidates_score[-1] -= 1
- for uw in expected_words:
- if uw in file_content:
- canidates_score[-1] += 1
- select = np.argmax(canidates_score) # 取评分最高者返回
- return canidates[select]
-
-def rm_comments(main_file):
- new_file_remove_comment_lines = []
- for l in main_file.splitlines():
- # 删除整行的空注释
- if l.lstrip().startswith("%"):
- pass
- else:
- new_file_remove_comment_lines.append(l)
- main_file = '\n'.join(new_file_remove_comment_lines)
- # main_file = re.sub(r"\\include{(.*?)}", r"\\input{\1}", main_file) # 将 \include 命令转换为 \input 命令
- main_file = re.sub(r'(? 0 and node_string.count('\_') > final_tex.count('\_'):
- # walk and replace any _ without \
- final_tex = re.sub(r"(?')
- if not node.preserve:
- segment_parts_for_gpt.append(node.string)
- f.write(f'#{show_html}#
')
- else:
- f.write(f'{show_html}
')
- node = node.next
- if node is None: break
-
- for n in nodes: n.next = None # break
- return_dict['nodes'] = nodes
- return_dict['segment_parts_for_gpt'] = segment_parts_for_gpt
- return return_dict
-
-
-
-class LatexPaperSplit():
- """
- break down latex file to a linked list,
- each node use a preserve flag to indicate whether it should
- be proccessed by GPT.
- """
- def __init__(self) -> None:
- self.nodes = None
- self.msg = "*{\\scriptsize\\textbf{警告:该PDF由GPT-Academic开源项目调用大语言模型+Latex翻译插件一键生成," + \
- "版权归原文作者所有。翻译内容可靠性无保障,请仔细鉴别并以原文为准。" + \
- "项目Github地址 \\url{https://github.com/binary-husky/gpt_academic/}。"
- # 请您不要删除或修改这行警告,除非您是论文的原作者(如果您是论文原作者,欢迎加REAME中的QQ联系开发者)
- self.msg_declare = "为了防止大语言模型的意外谬误产生扩散影响,禁止移除或修改此警告。}}\\\\"
-
- def merge_result(self, arr, mode, msg):
- """
- Merge the result after the GPT process completed
- """
- result_string = ""
- p = 0
- for node in self.nodes:
- if node.preserve:
- result_string += node.string
- else:
- result_string += fix_content(arr[p], node.string)
- p += 1
- if mode == 'translate_zh':
- pattern = re.compile(r'\\begin\{abstract\}.*\n')
- match = pattern.search(result_string)
- if not match:
- # match \abstract{xxxx}
- pattern_compile = re.compile(r"\\abstract\{(.*?)\}", flags=re.DOTALL)
- match = pattern_compile.search(result_string)
- position = match.regs[1][0]
- else:
- # match \begin{abstract}xxxx\end{abstract}
- position = match.end()
- result_string = result_string[:position] + self.msg + msg + self.msg_declare + result_string[position:]
- return result_string
-
- def split(self, txt, project_folder, opts):
- """
- break down latex file to a linked list,
- each node use a preserve flag to indicate whether it should
- be proccessed by GPT.
- P.S. use multiprocessing to avoid timeout error
- """
- import multiprocessing
- manager = multiprocessing.Manager()
- return_dict = manager.dict()
- p = multiprocessing.Process(
- target=split_subprocess,
- args=(txt, project_folder, return_dict, opts))
- p.start()
- p.join()
- p.close()
- self.nodes = return_dict['nodes']
- self.sp = return_dict['segment_parts_for_gpt']
- return self.sp
-
-
-
-class LatexPaperFileGroup():
- """
- use tokenizer to break down text according to max_token_limit
- """
- def __init__(self):
- self.file_paths = []
- self.file_contents = []
- self.sp_file_contents = []
- self.sp_file_index = []
- self.sp_file_tag = []
-
- # count_token
- from request_llm.bridge_all import model_info
- enc = model_info["gpt-3.5-turbo"]['tokenizer']
- def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
- self.get_token_num = get_token_num
-
- def run_file_split(self, max_token_limit=1900):
- """
- use tokenizer to break down text according to max_token_limit
- """
- for index, file_content in enumerate(self.file_contents):
- if self.get_token_num(file_content) < max_token_limit:
- self.sp_file_contents.append(file_content)
- self.sp_file_index.append(index)
- self.sp_file_tag.append(self.file_paths[index])
- else:
- from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
- segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit)
- for j, segment in enumerate(segments):
- self.sp_file_contents.append(segment)
- self.sp_file_index.append(index)
- self.sp_file_tag.append(self.file_paths[index] + f".part-{j}.tex")
- print('Segmentation: done')
-
- def merge_result(self):
- self.file_result = ["" for _ in range(len(self.file_paths))]
- for r, k in zip(self.sp_file_result, self.sp_file_index):
- self.file_result[k] += r
-
- def write_result(self):
- manifest = []
- for path, res in zip(self.file_paths, self.file_result):
- with open(path + '.polish.tex', 'w', encoding='utf8') as f:
- manifest.append(path + '.polish.tex')
- f.write(res)
- return manifest
-
-def write_html(sp_file_contents, sp_file_result, chatbot, project_folder):
-
- # write html
- try:
- import shutil
- from .crazy_utils import construct_html
- from toolbox import gen_time_str
- ch = construct_html()
- orig = ""
- trans = ""
- final = []
- for c,r in zip(sp_file_contents, sp_file_result):
- final.append(c)
- final.append(r)
- for i, k in enumerate(final):
- if i%2==0:
- orig = k
- if i%2==1:
- trans = k
- ch.add_row(a=orig, b=trans)
- create_report_file_name = f"{gen_time_str()}.trans.html"
- ch.save_file(create_report_file_name)
- shutil.copyfile(pj('./gpt_log/', create_report_file_name), pj(project_folder, create_report_file_name))
- promote_file_to_downloadzone(file=f'./gpt_log/{create_report_file_name}', chatbot=chatbot)
- except:
- from toolbox import trimmed_format_exc
- print('writing html result failed:', trimmed_format_exc())
-
-def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, mode='proofread', switch_prompt=None, opts=[]):
- import time, os, re
- from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
- from .latex_utils import LatexPaperFileGroup, merge_tex_files, LatexPaperSplit, 寻找Latex主文件
-
- # <-------- 寻找主tex文件 ---------->
- maintex = 寻找Latex主文件(file_manifest, mode)
- chatbot.append((f"定位主Latex文件", f'[Local Message] 分析结果:该项目的Latex主文件是{maintex}, 如果分析错误, 请立即终止程序, 删除或修改歧义文件, 然后重试。主程序即将开始, 请稍候。'))
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- time.sleep(3)
-
- # <-------- 读取Latex文件, 将多文件tex工程融合为一个巨型tex ---------->
- main_tex_basename = os.path.basename(maintex)
- assert main_tex_basename.endswith('.tex')
- main_tex_basename_bare = main_tex_basename[:-4]
- may_exist_bbl = pj(project_folder, f'{main_tex_basename_bare}.bbl')
- if os.path.exists(may_exist_bbl):
- shutil.copyfile(may_exist_bbl, pj(project_folder, f'merge.bbl'))
- shutil.copyfile(may_exist_bbl, pj(project_folder, f'merge_{mode}.bbl'))
- shutil.copyfile(may_exist_bbl, pj(project_folder, f'merge_diff.bbl'))
-
- with open(maintex, 'r', encoding='utf-8', errors='replace') as f:
- content = f.read()
- merged_content = merge_tex_files(project_folder, content, mode)
-
- with open(project_folder + '/merge.tex', 'w', encoding='utf-8', errors='replace') as f:
- f.write(merged_content)
-
- # <-------- 精细切分latex文件 ---------->
- chatbot.append((f"Latex文件融合完成", f'[Local Message] 正在精细切分latex文件,这需要一段时间计算,文档越长耗时越长,请耐心等待。'))
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
- lps = LatexPaperSplit()
- res = lps.split(merged_content, project_folder, opts) # 消耗时间的函数
-
- # <-------- 拆分过长的latex片段 ---------->
- pfg = LatexPaperFileGroup()
- for index, r in enumerate(res):
- pfg.file_paths.append('segment-' + str(index))
- pfg.file_contents.append(r)
-
- pfg.run_file_split(max_token_limit=1024)
- n_split = len(pfg.sp_file_contents)
-
- # <-------- 根据需要切换prompt ---------->
- inputs_array, sys_prompt_array = switch_prompt(pfg, mode)
- inputs_show_user_array = [f"{mode} {f}" for f in pfg.sp_file_tag]
-
- if os.path.exists(pj(project_folder,'temp.pkl')):
-
- # <-------- 【仅调试】如果存在调试缓存文件,则跳过GPT请求环节 ---------->
- pfg = objload(file=pj(project_folder,'temp.pkl'))
-
- else:
- # <-------- gpt 多线程请求 ---------->
- gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
- inputs_array=inputs_array,
- inputs_show_user_array=inputs_show_user_array,
- llm_kwargs=llm_kwargs,
- chatbot=chatbot,
- history_array=[[""] for _ in range(n_split)],
- sys_prompt_array=sys_prompt_array,
- # max_workers=5, # 并行任务数量限制, 最多同时执行5个, 其他的排队等待
- scroller_max_len = 40
- )
-
- # <-------- 文本碎片重组为完整的tex片段 ---------->
- pfg.sp_file_result = []
- for i_say, gpt_say, orig_content in zip(gpt_response_collection[0::2], gpt_response_collection[1::2], pfg.sp_file_contents):
- pfg.sp_file_result.append(gpt_say)
- pfg.merge_result()
-
- # <-------- 临时存储用于调试 ---------->
- pfg.get_token_num = None
- objdump(pfg, file=pj(project_folder,'temp.pkl'))
-
- write_html(pfg.sp_file_contents, pfg.sp_file_result, chatbot=chatbot, project_folder=project_folder)
-
- # <-------- 写出文件 ---------->
- msg = f"当前大语言模型: {llm_kwargs['llm_model']},当前语言模型温度设定: {llm_kwargs['temperature']}。"
- final_tex = lps.merge_result(pfg.file_result, mode, msg)
- with open(project_folder + f'/merge_{mode}.tex', 'w', encoding='utf-8', errors='replace') as f:
- if mode != 'translate_zh' or "binary" in final_tex: f.write(final_tex)
-
-
- # <-------- 整理结果, 退出 ---------->
- chatbot.append((f"完成了吗?", 'GPT结果已输出, 正在编译PDF'))
- yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-
- # <-------- 返回 ---------->
- return project_folder + f'/merge_{mode}.tex'
-
-
-
-def remove_buggy_lines(file_path, log_path, tex_name, tex_name_pure, n_fix, work_folder_modified):
- try:
- with open(log_path, 'r', encoding='utf-8', errors='replace') as f:
- log = f.read()
- with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
- file_lines = f.readlines()
- import re
- buggy_lines = re.findall(tex_name+':([0-9]{1,5}):', log)
- buggy_lines = [int(l) for l in buggy_lines]
- buggy_lines = sorted(buggy_lines)
- print("removing lines that has errors", buggy_lines)
- file_lines.pop(buggy_lines[0]-1)
- with open(pj(work_folder_modified, f"{tex_name_pure}_fix_{n_fix}.tex"), 'w', encoding='utf-8', errors='replace') as f:
- f.writelines(file_lines)
- return True, f"{tex_name_pure}_fix_{n_fix}", buggy_lines
- except:
- print("Fatal error occurred, but we cannot identify error, please download zip, read latex log, and compile manually.")
- return False, -1, [-1]
-
-def compile_latex_with_timeout(command, cwd, timeout=60):
- import subprocess
- process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=cwd)
- try:
- stdout, stderr = process.communicate(timeout=timeout)
- except subprocess.TimeoutExpired:
- process.kill()
- stdout, stderr = process.communicate()
- print("Process timed out!")
- return False
- return True
-
-def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder, mode='default'):
- import os, time
- current_dir = os.getcwd()
- n_fix = 1
- max_try = 32
- chatbot.append([f"正在编译PDF文档", f'编译已经开始。当前工作路径为{work_folder},如果程序停顿5分钟以上,请直接去该路径下取回翻译结果,或者重启之后再度尝试 ...']); yield from update_ui(chatbot=chatbot, history=history)
- chatbot.append([f"正在编译PDF文档", '...']); yield from update_ui(chatbot=chatbot, history=history); time.sleep(1); chatbot[-1] = list(chatbot[-1]) # 刷新界面
- yield from update_ui_lastest_msg('编译已经开始...', chatbot, history) # 刷新Gradio前端界面
-
- while True:
- import os
-
- # https://stackoverflow.com/questions/738755/dont-make-me-manually-abort-a-latex-compile-when-theres-an-error
- yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译原始PDF ...', chatbot, history) # 刷新Gradio前端界面
- ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original)
-
- yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译转化后的PDF ...', chatbot, history) # 刷新Gradio前端界面
- ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified)
-
- if ok and os.path.exists(pj(work_folder_modified, f'{main_file_modified}.pdf')):
- # 只有第二步成功,才能继续下面的步骤
- yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译BibTex ...', chatbot, history) # 刷新Gradio前端界面
- if not os.path.exists(pj(work_folder_original, f'{main_file_original}.bbl')):
- ok = compile_latex_with_timeout(f'bibtex {main_file_original}.aux', work_folder_original)
- if not os.path.exists(pj(work_folder_modified, f'{main_file_modified}.bbl')):
- ok = compile_latex_with_timeout(f'bibtex {main_file_modified}.aux', work_folder_modified)
-
- yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译文献交叉引用 ...', chatbot, history) # 刷新Gradio前端界面
- ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original)
- ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified)
- ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original)
- ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified)
-
- if mode!='translate_zh':
- yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 使用latexdiff生成论文转化前后对比 ...', chatbot, history) # 刷新Gradio前端界面
- print( f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex')
- ok = compile_latex_with_timeout(f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex')
-
- yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 正在编译对比PDF ...', chatbot, history) # 刷新Gradio前端界面
- ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder)
- ok = compile_latex_with_timeout(f'bibtex merge_diff.aux', work_folder)
- ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder)
- ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder)
-
-
- # <---------- 检查结果 ----------->
- results_ = ""
- original_pdf_success = os.path.exists(pj(work_folder_original, f'{main_file_original}.pdf'))
- modified_pdf_success = os.path.exists(pj(work_folder_modified, f'{main_file_modified}.pdf'))
- diff_pdf_success = os.path.exists(pj(work_folder, f'merge_diff.pdf'))
- results_ += f"原始PDF编译是否成功: {original_pdf_success};"
- results_ += f"转化PDF编译是否成功: {modified_pdf_success};"
- results_ += f"对比PDF编译是否成功: {diff_pdf_success};"
- yield from update_ui_lastest_msg(f'第{n_fix}编译结束:
{results_}...', chatbot, history) # 刷新Gradio前端界面
-
- if diff_pdf_success:
- result_pdf = pj(work_folder_modified, f'merge_diff.pdf') # get pdf path
- promote_file_to_downloadzone(result_pdf, rename_file=None, chatbot=chatbot) # promote file to web UI
- if modified_pdf_success:
- yield from update_ui_lastest_msg(f'转化PDF编译已经成功, 即将退出 ...', chatbot, history) # 刷新Gradio前端界面
- result_pdf = pj(work_folder_modified, f'{main_file_modified}.pdf') # get pdf path
- if os.path.exists(pj(work_folder, '..', 'translation')):
- shutil.copyfile(result_pdf, pj(work_folder, '..', 'translation', 'translate_zh.pdf'))
- promote_file_to_downloadzone(result_pdf, rename_file=None, chatbot=chatbot) # promote file to web UI
- return True # 成功啦
- else:
- if n_fix>=max_try: break
- n_fix += 1
- can_retry, main_file_modified, buggy_lines = remove_buggy_lines(
- file_path=pj(work_folder_modified, f'{main_file_modified}.tex'),
- log_path=pj(work_folder_modified, f'{main_file_modified}.log'),
- tex_name=f'{main_file_modified}.tex',
- tex_name_pure=f'{main_file_modified}',
- n_fix=n_fix,
- work_folder_modified=work_folder_modified,
- )
- yield from update_ui_lastest_msg(f'由于最为关键的转化PDF编译失败, 将根据报错信息修正tex源文件并重试, 当前报错的latex代码处于第{buggy_lines}行 ...', chatbot, history) # 刷新Gradio前端界面
- if not can_retry: break
-
- return False # 失败啦
-
-
-
diff --git a/crazy_functions/live_audio/aliyunASR.py b/crazy_functions/live_audio/aliyunASR.py
deleted file mode 100644
index cba4c01f86be93b4fbb7ef474330a6a104c59431..0000000000000000000000000000000000000000
--- a/crazy_functions/live_audio/aliyunASR.py
+++ /dev/null
@@ -1,261 +0,0 @@
-import time, logging, json, sys, struct
-import numpy as np
-from scipy.io.wavfile import WAVE_FORMAT
-
-def write_numpy_to_wave(filename, rate, data, add_header=False):
- """
- Write a NumPy array as a WAV file.
- """
- def _array_tofile(fid, data):
- # ravel gives a c-contiguous buffer
- fid.write(data.ravel().view('b').data)
-
- if hasattr(filename, 'write'):
- fid = filename
- else:
- fid = open(filename, 'wb')
-
- fs = rate
-
- try:
- dkind = data.dtype.kind
- if not (dkind == 'i' or dkind == 'f' or (dkind == 'u' and
- data.dtype.itemsize == 1)):
- raise ValueError("Unsupported data type '%s'" % data.dtype)
-
- header_data = b''
-
- header_data += b'RIFF'
- header_data += b'\x00\x00\x00\x00'
- header_data += b'WAVE'
-
- # fmt chunk
- header_data += b'fmt '
- if dkind == 'f':
- format_tag = WAVE_FORMAT.IEEE_FLOAT
- else:
- format_tag = WAVE_FORMAT.PCM
- if data.ndim == 1:
- channels = 1
- else:
- channels = data.shape[1]
- bit_depth = data.dtype.itemsize * 8
- bytes_per_second = fs*(bit_depth // 8)*channels
- block_align = channels * (bit_depth // 8)
-
- fmt_chunk_data = struct.pack(' 0xFFFFFFFF:
- raise ValueError("Data exceeds wave file size limit")
- if add_header:
- fid.write(header_data)
- # data chunk
- fid.write(b'data')
- fid.write(struct.pack('' or (data.dtype.byteorder == '=' and
- sys.byteorder == 'big'):
- data = data.byteswap()
- _array_tofile(fid, data)
-
- if add_header:
- # Determine file size and place it in correct
- # position at start of the file.
- size = fid.tell()
- fid.seek(4)
- fid.write(struct.pack('{}".format(args))
- pass
-
- def test_on_close(self, *args):
- self.aliyun_service_ok = False
- pass
-
- def test_on_result_chg(self, message, *args):
- # print("test_on_chg:{}".format(message))
- message = json.loads(message)
- self.parsed_text = message['payload']['result']
- self.event_on_result_chg.set()
-
- def test_on_completed(self, message, *args):
- # print("on_completed:args=>{} message=>{}".format(args, message))
- pass
-
- def audio_convertion_thread(self, uuid):
- # 在一个异步线程中采集音频
- import nls # pip install git+https://github.com/aliyun/alibabacloud-nls-python-sdk.git
- import tempfile
- from scipy import io
- from toolbox import get_conf
- from .audio_io import change_sample_rate
- from .audio_io import RealtimeAudioDistribution
- NEW_SAMPLERATE = 16000
- rad = RealtimeAudioDistribution()
- rad.clean_up()
- temp_folder = tempfile.gettempdir()
- TOKEN, APPKEY = get_conf('ALIYUN_TOKEN', 'ALIYUN_APPKEY')
- if len(TOKEN) == 0:
- TOKEN = self.get_token()
- self.aliyun_service_ok = True
- URL="wss://nls-gateway.aliyuncs.com/ws/v1"
- sr = nls.NlsSpeechTranscriber(
- url=URL,
- token=TOKEN,
- appkey=APPKEY,
- on_sentence_begin=self.test_on_sentence_begin,
- on_sentence_end=self.test_on_sentence_end,
- on_start=self.test_on_start,
- on_result_changed=self.test_on_result_chg,
- on_completed=self.test_on_completed,
- on_error=self.test_on_error,
- on_close=self.test_on_close,
- callback_args=[uuid.hex]
- )
- timeout_limit_second = 20
- r = sr.start(aformat="pcm",
- timeout=timeout_limit_second,
- enable_intermediate_result=True,
- enable_punctuation_prediction=True,
- enable_inverse_text_normalization=True)
-
- import webrtcvad
- vad = webrtcvad.Vad()
- vad.set_mode(1)
-
- is_previous_frame_transmitted = False # 上一帧是否有人说话
- previous_frame_data = None
- echo_cnt = 0 # 在没有声音之后,继续向服务器发送n次音频数据
- echo_cnt_max = 4 # 在没有声音之后,继续向服务器发送n次音频数据
- keep_alive_last_send_time = time.time()
- while not self.stop:
- # time.sleep(self.capture_interval)
- audio = rad.read(uuid.hex)
- if audio is not None:
- # convert to pcm file
- temp_file = f'{temp_folder}/{uuid.hex}.pcm' #
- dsdata = change_sample_rate(audio, rad.rate, NEW_SAMPLERATE) # 48000 --> 16000
- write_numpy_to_wave(temp_file, NEW_SAMPLERATE, dsdata)
- # read pcm binary
- with open(temp_file, "rb") as f: data = f.read()
- is_speaking, info = is_speaker_speaking(vad, data, NEW_SAMPLERATE)
-
- if is_speaking or echo_cnt > 0:
- # 如果话筒激活 / 如果处于回声收尾阶段
- echo_cnt -= 1
- if not is_previous_frame_transmitted: # 上一帧没有人声,但是我们把上一帧同样加上
- if previous_frame_data is not None: data = previous_frame_data + data
- if is_speaking:
- echo_cnt = echo_cnt_max
- slices = zip(*(iter(data),) * 640) # 640个字节为一组
- for i in slices: sr.send_audio(bytes(i))
- keep_alive_last_send_time = time.time()
- is_previous_frame_transmitted = True
- else:
- is_previous_frame_transmitted = False
- echo_cnt = 0
- # 保持链接激活,即使没有声音,也根据时间间隔,发送一些音频片段给服务器
- if time.time() - keep_alive_last_send_time > timeout_limit_second/2:
- slices = zip(*(iter(data),) * 640) # 640个字节为一组
- for i in slices: sr.send_audio(bytes(i))
- keep_alive_last_send_time = time.time()
- is_previous_frame_transmitted = True
- self.audio_shape = info
- else:
- time.sleep(0.1)
-
- if not self.aliyun_service_ok:
- self.stop = True
- self.stop_msg = 'Aliyun音频服务异常,请检查ALIYUN_TOKEN和ALIYUN_APPKEY是否过期。'
- r = sr.stop()
-
- def get_token(self):
- from toolbox import get_conf
- import json
- from aliyunsdkcore.request import CommonRequest
- from aliyunsdkcore.client import AcsClient
- AccessKey_ID, AccessKey_secret = get_conf('ALIYUN_ACCESSKEY', 'ALIYUN_SECRET')
-
- # 创建AcsClient实例
- client = AcsClient(
- AccessKey_ID,
- AccessKey_secret,
- "cn-shanghai"
- )
-
- # 创建request,并设置参数。
- request = CommonRequest()
- request.set_method('POST')
- request.set_domain('nls-meta.cn-shanghai.aliyuncs.com')
- request.set_version('2019-02-28')
- request.set_action_name('CreateToken')
-
- try:
- response = client.do_action_with_exception(request)
- print(response)
- jss = json.loads(response)
- if 'Token' in jss and 'Id' in jss['Token']:
- token = jss['Token']['Id']
- expireTime = jss['Token']['ExpireTime']
- print("token = " + token)
- print("expireTime = " + str(expireTime))
- except Exception as e:
- print(e)
-
- return token
diff --git a/crazy_functions/live_audio/audio_io.py b/crazy_functions/live_audio/audio_io.py
deleted file mode 100644
index 00fd3f2d846ccf20eb300b796bb91842315e3482..0000000000000000000000000000000000000000
--- a/crazy_functions/live_audio/audio_io.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import numpy as np
-from scipy import interpolate
-
-def Singleton(cls):
- _instance = {}
-
- def _singleton(*args, **kargs):
- if cls not in _instance:
- _instance[cls] = cls(*args, **kargs)
- return _instance[cls]
-
- return _singleton
-
-
-@Singleton
-class RealtimeAudioDistribution():
- def __init__(self) -> None:
- self.data = {}
- self.max_len = 1024*1024
- self.rate = 48000 # 只读,每秒采样数量
-
- def clean_up(self):
- self.data = {}
-
- def feed(self, uuid, audio):
- self.rate, audio_ = audio
- # print('feed', len(audio_), audio_[-25:])
- if uuid not in self.data:
- self.data[uuid] = audio_
- else:
- new_arr = np.concatenate((self.data[uuid], audio_))
- if len(new_arr) > self.max_len: new_arr = new_arr[-self.max_len:]
- self.data[uuid] = new_arr
-
- def read(self, uuid):
- if uuid in self.data:
- res = self.data.pop(uuid)
- # print('\r read-', len(res), '-', max(res), end='', flush=True)
- else:
- res = None
- return res
-
-def change_sample_rate(audio, old_sr, new_sr):
- duration = audio.shape[0] / old_sr
-
- time_old = np.linspace(0, duration, audio.shape[0])
- time_new = np.linspace(0, duration, int(audio.shape[0] * new_sr / old_sr))
-
- interpolator = interpolate.interp1d(time_old, audio.T)
- new_audio = interpolator(time_new).T
- return new_audio.astype(np.int16)
\ No newline at end of file
diff --git a/crazy_functions/multi_stage/multi_stage_utils.py b/crazy_functions/multi_stage/multi_stage_utils.py
deleted file mode 100644
index 1395e79ff132de3622d2dd3b3867f3916399e061..0000000000000000000000000000000000000000
--- a/crazy_functions/multi_stage/multi_stage_utils.py
+++ /dev/null
@@ -1,93 +0,0 @@
-from pydantic import BaseModel, Field
-from typing import List
-from toolbox import update_ui_lastest_msg, disable_auto_promotion
-from toolbox import CatchException, update_ui, get_conf, select_api_key, get_log_folder
-from request_llms.bridge_all import predict_no_ui_long_connection
-from crazy_functions.json_fns.pydantic_io import GptJsonIO, JsonStringError
-import time
-import pickle
-
-def have_any_recent_upload_files(chatbot):
- _5min = 5 * 60
- if not chatbot: return False # chatbot is None
- most_recent_uploaded = chatbot._cookies.get("most_recent_uploaded", None)
- if not most_recent_uploaded: return False # most_recent_uploaded is None
- if time.time() - most_recent_uploaded["time"] < _5min: return True # most_recent_uploaded is new
- else: return False # most_recent_uploaded is too old
-
-class GptAcademicState():
- def __init__(self):
- self.reset()
-
- def reset(self):
- pass
-
- def dump_state(self, chatbot):
- chatbot._cookies['plugin_state'] = pickle.dumps(self)
-
- def set_state(self, chatbot, key, value):
- setattr(self, key, value)
- chatbot._cookies['plugin_state'] = pickle.dumps(self)
-
- def get_state(chatbot, cls=None):
- state = chatbot._cookies.get('plugin_state', None)
- if state is not None: state = pickle.loads(state)
- elif cls is not None: state = cls()
- else: state = GptAcademicState()
- state.chatbot = chatbot
- return state
-
-
-class GptAcademicGameBaseState():
- """
- 1. first init: __init__ ->
- """
- def init_game(self, chatbot, lock_plugin):
- self.plugin_name = None
- self.callback_fn = None
- self.delete_game = False
- self.step_cnt = 0
-
- def lock_plugin(self, chatbot):
- if self.callback_fn is None:
- raise ValueError("callback_fn is None")
- chatbot._cookies['lock_plugin'] = self.callback_fn
- self.dump_state(chatbot)
-
- def get_plugin_name(self):
- if self.plugin_name is None:
- raise ValueError("plugin_name is None")
- return self.plugin_name
-
- def dump_state(self, chatbot):
- chatbot._cookies[f'plugin_state/{self.get_plugin_name()}'] = pickle.dumps(self)
-
- def set_state(self, chatbot, key, value):
- setattr(self, key, value)
- chatbot._cookies[f'plugin_state/{self.get_plugin_name()}'] = pickle.dumps(self)
-
- @staticmethod
- def sync_state(chatbot, llm_kwargs, cls, plugin_name, callback_fn, lock_plugin=True):
- state = chatbot._cookies.get(f'plugin_state/{plugin_name}', None)
- if state is not None:
- state = pickle.loads(state)
- else:
- state = cls()
- state.init_game(chatbot, lock_plugin)
- state.plugin_name = plugin_name
- state.llm_kwargs = llm_kwargs
- state.chatbot = chatbot
- state.callback_fn = callback_fn
- return state
-
- def continue_game(self, prompt, chatbot, history):
- # 游戏主体
- yield from self.step(prompt, chatbot, history)
- self.step_cnt += 1
- # 保存状态,收尾
- self.dump_state(chatbot)
- # 如果游戏结束,清理
- if self.delete_game:
- chatbot._cookies['lock_plugin'] = None
- chatbot._cookies[f'plugin_state/{self.get_plugin_name()}'] = None
- yield from update_ui(chatbot=chatbot, history=history)
diff --git a/crazy_functions/pdf_fns/breakdown_txt.py b/crazy_functions/pdf_fns/breakdown_txt.py
deleted file mode 100644
index a9614814020335fc83e63b859319a961300d94b4..0000000000000000000000000000000000000000
--- a/crazy_functions/pdf_fns/breakdown_txt.py
+++ /dev/null
@@ -1,125 +0,0 @@
-from crazy_functions.ipc_fns.mp import run_in_subprocess_with_timeout
-
-def force_breakdown(txt, limit, get_token_fn):
- """ 当无法用标点、空行分割时,我们用最暴力的方法切割
- """
- for i in reversed(range(len(txt))):
- if get_token_fn(txt[:i]) < limit:
- return txt[:i], txt[i:]
- return "Tiktoken未知错误", "Tiktoken未知错误"
-
-
-def maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage):
- """ 为了加速计算,我们采样一个特殊的手段。当 remain_txt_to_cut > `_max` 时, 我们把 _max 后的文字转存至 remain_txt_to_cut_storage
- 当 remain_txt_to_cut < `_min` 时,我们再把 remain_txt_to_cut_storage 中的部分文字取出
- """
- _min = int(5e4)
- _max = int(1e5)
- # print(len(remain_txt_to_cut), len(remain_txt_to_cut_storage))
- if len(remain_txt_to_cut) < _min and len(remain_txt_to_cut_storage) > 0:
- remain_txt_to_cut = remain_txt_to_cut + remain_txt_to_cut_storage
- remain_txt_to_cut_storage = ""
- if len(remain_txt_to_cut) > _max:
- remain_txt_to_cut_storage = remain_txt_to_cut[_max:] + remain_txt_to_cut_storage
- remain_txt_to_cut = remain_txt_to_cut[:_max]
- return remain_txt_to_cut, remain_txt_to_cut_storage
-
-
-def cut(limit, get_token_fn, txt_tocut, must_break_at_empty_line, break_anyway=False):
- """ 文本切分
- """
- res = []
- total_len = len(txt_tocut)
- fin_len = 0
- remain_txt_to_cut = txt_tocut
- remain_txt_to_cut_storage = ""
- # 为了加速计算,我们采样一个特殊的手段。当 remain_txt_to_cut > `_max` 时, 我们把 _max 后的文字转存至 remain_txt_to_cut_storage
- remain_txt_to_cut, remain_txt_to_cut_storage = maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage)
-
- while True:
- if get_token_fn(remain_txt_to_cut) <= limit:
- # 如果剩余文本的token数小于限制,那么就不用切了
- res.append(remain_txt_to_cut); fin_len+=len(remain_txt_to_cut)
- break
- else:
- # 如果剩余文本的token数大于限制,那么就切
- lines = remain_txt_to_cut.split('\n')
-
- # 估计一个切分点
- estimated_line_cut = limit / get_token_fn(remain_txt_to_cut) * len(lines)
- estimated_line_cut = int(estimated_line_cut)
-
- # 开始查找合适切分点的偏移(cnt)
- cnt = 0
- for cnt in reversed(range(estimated_line_cut)):
- if must_break_at_empty_line:
- # 首先尝试用双空行(\n\n)作为切分点
- if lines[cnt] != "":
- continue
- prev = "\n".join(lines[:cnt])
- post = "\n".join(lines[cnt:])
- if get_token_fn(prev) < limit:
- break
-
- if cnt == 0:
- # 如果没有找到合适的切分点
- if break_anyway:
- # 是否允许暴力切分
- prev, post = force_breakdown(txt_tocut, limit, get_token_fn)
- else:
- # 不允许直接报错
- raise RuntimeError(f"存在一行极长的文本!{txt_tocut}")
-
- # 追加列表
- res.append(prev); fin_len+=len(prev)
- # 准备下一次迭代
- remain_txt_to_cut = post
- remain_txt_to_cut, remain_txt_to_cut_storage = maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage)
- process = fin_len/total_len
- print(f'正在文本切分 {int(process*100)}%')
- if len(remain_txt_to_cut.strip()) == 0:
- break
- return res
-
-
-def breakdown_text_to_satisfy_token_limit_(txt, limit, llm_model="gpt-3.5-turbo"):
- """ 使用多种方式尝试切分文本,以满足 token 限制
- """
- from request_llms.bridge_all import model_info
- enc = model_info[llm_model]['tokenizer']
- def get_token_fn(txt): return len(enc.encode(txt, disallowed_special=()))
- try:
- # 第1次尝试,将双空行(\n\n)作为切分点
- return cut(limit, get_token_fn, txt, must_break_at_empty_line=True)
- except RuntimeError:
- try:
- # 第2次尝试,将单空行(\n)作为切分点
- return cut(limit, get_token_fn, txt, must_break_at_empty_line=False)
- except RuntimeError:
- try:
- # 第3次尝试,将英文句号(.)作为切分点
- res = cut(limit, get_token_fn, txt.replace('.', '。\n'), must_break_at_empty_line=False) # 这个中文的句号是故意的,作为一个标识而存在
- return [r.replace('。\n', '.') for r in res]
- except RuntimeError as e:
- try:
- # 第4次尝试,将中文句号(。)作为切分点
- res = cut(limit, get_token_fn, txt.replace('。', '。。\n'), must_break_at_empty_line=False)
- return [r.replace('。。\n', '。') for r in res]
- except RuntimeError as e:
- # 第5次尝试,没办法了,随便切一下吧
- return cut(limit, get_token_fn, txt, must_break_at_empty_line=False, break_anyway=True)
-
-breakdown_text_to_satisfy_token_limit = run_in_subprocess_with_timeout(breakdown_text_to_satisfy_token_limit_, timeout=60)
-
-if __name__ == '__main__':
- from crazy_functions.crazy_utils import read_and_clean_pdf_text
- file_content, page_one = read_and_clean_pdf_text("build/assets/at.pdf")
-
- from request_llms.bridge_all import model_info
- for i in range(5):
- file_content += file_content
-
- print(len(file_content))
- TOKEN_LIMIT_PER_FRAGMENT = 2500
- res = breakdown_text_to_satisfy_token_limit(file_content, TOKEN_LIMIT_PER_FRAGMENT)
-
diff --git a/crazy_functions/pdf_fns/parse_pdf.py b/crazy_functions/pdf_fns/parse_pdf.py
deleted file mode 100644
index fa27de516feb735c0ac92ffa02be97164343d8cf..0000000000000000000000000000000000000000
--- a/crazy_functions/pdf_fns/parse_pdf.py
+++ /dev/null
@@ -1,171 +0,0 @@
-from functools import lru_cache
-from toolbox import gen_time_str
-from toolbox import promote_file_to_downloadzone
-from toolbox import write_history_to_file, promote_file_to_downloadzone
-from toolbox import get_conf
-from toolbox import ProxyNetworkActivate
-from colorful import *
-import requests
-import random
-import copy
-import os
-import math
-
-class GROBID_OFFLINE_EXCEPTION(Exception): pass
-
-def get_avail_grobid_url():
- GROBID_URLS = get_conf('GROBID_URLS')
- if len(GROBID_URLS) == 0: return None
- try:
- _grobid_url = random.choice(GROBID_URLS) # 随机负载均衡
- if _grobid_url.endswith('/'): _grobid_url = _grobid_url.rstrip('/')
- with ProxyNetworkActivate('Connect_Grobid'):
- res = requests.get(_grobid_url+'/api/isalive')
- if res.text=='true': return _grobid_url
- else: return None
- except:
- return None
-
-@lru_cache(maxsize=32)
-def parse_pdf(pdf_path, grobid_url):
- import scipdf # pip install scipdf_parser
- if grobid_url.endswith('/'): grobid_url = grobid_url.rstrip('/')
- try:
- with ProxyNetworkActivate('Connect_Grobid'):
- article_dict = scipdf.parse_pdf_to_dict(pdf_path, grobid_url=grobid_url)
- except GROBID_OFFLINE_EXCEPTION:
- raise GROBID_OFFLINE_EXCEPTION("GROBID服务不可用,请修改config中的GROBID_URL,可修改成本地GROBID服务。")
- except:
- raise RuntimeError("解析PDF失败,请检查PDF是否损坏。")
- return article_dict
-
-
-def produce_report_markdown(gpt_response_collection, meta, paper_meta_info, chatbot, fp, generated_conclusion_files):
- # -=-=-=-=-=-=-=-= 写出第1个文件:翻译前后混合 -=-=-=-=-=-=-=-=
- res_path = write_history_to_file(meta + ["# Meta Translation" , paper_meta_info] + gpt_response_collection, file_basename=f"{gen_time_str()}translated_and_original.md", file_fullname=None)
- promote_file_to_downloadzone(res_path, rename_file=os.path.basename(res_path)+'.md', chatbot=chatbot)
- generated_conclusion_files.append(res_path)
-
- # -=-=-=-=-=-=-=-= 写出第2个文件:仅翻译后的文本 -=-=-=-=-=-=-=-=
- translated_res_array = []
- # 记录当前的大章节标题:
- last_section_name = ""
- for index, value in enumerate(gpt_response_collection):
- # 先挑选偶数序列号:
- if index % 2 != 0:
- # 先提取当前英文标题:
- cur_section_name = gpt_response_collection[index-1].split('\n')[0].split(" Part")[0]
- # 如果index是1的话,则直接使用first section name:
- if cur_section_name != last_section_name:
- cur_value = cur_section_name + '\n'
- last_section_name = copy.deepcopy(cur_section_name)
- else:
- cur_value = ""
- # 再做一个小修改:重新修改当前part的标题,默认用英文的
- cur_value += value
- translated_res_array.append(cur_value)
- res_path = write_history_to_file(meta + ["# Meta Translation" , paper_meta_info] + translated_res_array,
- file_basename = f"{gen_time_str()}-translated_only.md",
- file_fullname = None,
- auto_caption = False)
- promote_file_to_downloadzone(res_path, rename_file=os.path.basename(res_path)+'.md', chatbot=chatbot)
- generated_conclusion_files.append(res_path)
- return res_path
-
-def translate_pdf(article_dict, llm_kwargs, chatbot, fp, generated_conclusion_files, TOKEN_LIMIT_PER_FRAGMENT, DST_LANG):
- from crazy_functions.pdf_fns.report_gen_html import construct_html
- from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
- from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
- from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
-
- prompt = "以下是一篇学术论文的基本信息:\n"
- # title
- title = article_dict.get('title', '无法获取 title'); prompt += f'title:{title}\n\n'
- # authors
- authors = article_dict.get('authors', '无法获取 authors')[:100]; prompt += f'authors:{authors}\n\n'
- # abstract
- abstract = article_dict.get('abstract', '无法获取 abstract'); prompt += f'abstract:{abstract}\n\n'
- # command
- prompt += f"请将题目和摘要翻译为{DST_LANG}。"
- meta = [f'# Title:\n\n', title, f'# Abstract:\n\n', abstract ]
-
- # 单线,获取文章meta信息
- paper_meta_info = yield from request_gpt_model_in_new_thread_with_ui_alive(
- inputs=prompt,
- inputs_show_user=prompt,
- llm_kwargs=llm_kwargs,
- chatbot=chatbot, history=[],
- sys_prompt="You are an academic paper reader。",
- )
-
- # 多线,翻译
- inputs_array = []
- inputs_show_user_array = []
-
- # get_token_num
- from request_llms.bridge_all import model_info
- enc = model_info[llm_kwargs['llm_model']]['tokenizer']
- def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
-
- def break_down(txt):
- raw_token_num = get_token_num(txt)
- if raw_token_num <= TOKEN_LIMIT_PER_FRAGMENT:
- return [txt]
- else:
- # raw_token_num > TOKEN_LIMIT_PER_FRAGMENT
- # find a smooth token limit to achieve even seperation
- count = int(math.ceil(raw_token_num / TOKEN_LIMIT_PER_FRAGMENT))
- token_limit_smooth = raw_token_num // count + count
- return breakdown_text_to_satisfy_token_limit(txt, limit=token_limit_smooth, llm_model=llm_kwargs['llm_model'])
-
- for section in article_dict.get('sections'):
- if len(section['text']) == 0: continue
- section_frags = break_down(section['text'])
- for i, fragment in enumerate(section_frags):
- heading = section['heading']
- if len(section_frags) > 1: heading += f' Part-{i+1}'
- inputs_array.append(
- f"你需要翻译{heading}章节,内容如下: \n\n{fragment}"
- )
- inputs_show_user_array.append(
- f"# {heading}\n\n{fragment}"
- )
-
- gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
- inputs_array=inputs_array,
- inputs_show_user_array=inputs_show_user_array,
- llm_kwargs=llm_kwargs,
- chatbot=chatbot,
- history_array=[meta for _ in inputs_array],
- sys_prompt_array=[
- "请你作为一个学术翻译,负责把学术论文准确翻译成中文。注意文章中的每一句话都要翻译。" for _ in inputs_array],
- )
- # -=-=-=-=-=-=-=-= 写出Markdown文件 -=-=-=-=-=-=-=-=
- produce_report_markdown(gpt_response_collection, meta, paper_meta_info, chatbot, fp, generated_conclusion_files)
-
- # -=-=-=-=-=-=-=-= 写出HTML文件 -=-=-=-=-=-=-=-=
- ch = construct_html()
- orig = ""
- trans = ""
- gpt_response_collection_html = copy.deepcopy(gpt_response_collection)
- for i,k in enumerate(gpt_response_collection_html):
- if i%2==0:
- gpt_response_collection_html[i] = inputs_show_user_array[i//2]
- else:
- # 先提取当前英文标题:
- cur_section_name = gpt_response_collection[i-1].split('\n')[0].split(" Part")[0]
- cur_value = cur_section_name + "\n" + gpt_response_collection_html[i]
- gpt_response_collection_html[i] = cur_value
-
- final = ["", "", "一、论文概况", "", "Abstract", paper_meta_info, "二、论文翻译", ""]
- final.extend(gpt_response_collection_html)
- for i, k in enumerate(final):
- if i%2==0:
- orig = k
- if i%2==1:
- trans = k
- ch.add_row(a=orig, b=trans)
- create_report_file_name = f"{os.path.basename(fp)}.trans.html"
- html_file = ch.save_file(create_report_file_name)
- generated_conclusion_files.append(html_file)
- promote_file_to_downloadzone(html_file, rename_file=os.path.basename(html_file), chatbot=chatbot)
diff --git a/crazy_functions/pdf_fns/report_gen_html.py b/crazy_functions/pdf_fns/report_gen_html.py
deleted file mode 100644
index 21829212ff13a2dfd1492f05ac9abc73907dce7b..0000000000000000000000000000000000000000
--- a/crazy_functions/pdf_fns/report_gen_html.py
+++ /dev/null
@@ -1,58 +0,0 @@
-from toolbox import update_ui, get_conf, trimmed_format_exc, get_log_folder
-import os
-
-
-
-
-class construct_html():
- def __init__(self) -> None:
- self.html_string = ""
-
- def add_row(self, a, b):
- from toolbox import markdown_convertion
- template = """
- {
- primary_col: {
- header: String.raw`__PRIMARY_HEADER__`,
- msg: String.raw`__PRIMARY_MSG__`,
- },
- secondary_rol: {
- header: String.raw`__SECONDARY_HEADER__`,
- msg: String.raw`__SECONDARY_MSG__`,
- }
- },
- """
- def std(str):
- str = str.replace(r'`',r'`')
- if str.endswith("\\"): str += ' '
- if str.endswith("}"): str += ' '
- if str.endswith("$"): str += ' '
- return str
-
- template_ = template
- a_lines = a.split('\n')
- b_lines = b.split('\n')
-
- if len(a_lines) == 1 or len(a_lines[0]) > 50:
- template_ = template_.replace("__PRIMARY_HEADER__", std(a[:20]))
- template_ = template_.replace("__PRIMARY_MSG__", std(markdown_convertion(a)))
- else:
- template_ = template_.replace("__PRIMARY_HEADER__", std(a_lines[0]))
- template_ = template_.replace("__PRIMARY_MSG__", std(markdown_convertion('\n'.join(a_lines[1:]))))
-
- if len(b_lines) == 1 or len(b_lines[0]) > 50:
- template_ = template_.replace("__SECONDARY_HEADER__", std(b[:20]))
- template_ = template_.replace("__SECONDARY_MSG__", std(markdown_convertion(b)))
- else:
- template_ = template_.replace("__SECONDARY_HEADER__", std(b_lines[0]))
- template_ = template_.replace("__SECONDARY_MSG__", std(markdown_convertion('\n'.join(b_lines[1:]))))
- self.html_string += template_
-
- def save_file(self, file_name):
- from toolbox import get_log_folder
- with open('crazy_functions/pdf_fns/report_template.html', 'r', encoding='utf8') as f:
- html_template = f.read()
- html_template = html_template.replace("__TF_ARR__", self.html_string)
- with open(os.path.join(get_log_folder(), file_name), 'w', encoding='utf8') as f:
- f.write(html_template.encode('utf-8', 'ignore').decode())
- return os.path.join(get_log_folder(), file_name)
diff --git a/crazy_functions/pdf_fns/report_template.html b/crazy_functions/pdf_fns/report_template.html
deleted file mode 100644
index 39a1e7ce482949978ff90c4738a9adb8803660e6..0000000000000000000000000000000000000000
--- a/crazy_functions/pdf_fns/report_template.html
+++ /dev/null
@@ -1,104 +0,0 @@
-
-
-
-
-
- __TITLE__
-
-
-
-
-
-
-
-
-
diff --git a/crazy_functions/test_project/cpp/cppipc/buffer.cpp b/crazy_functions/test_project/cpp/cppipc/buffer.cpp
deleted file mode 100644
index 084b8153e9401f4e9dc5a6a67cfb5f48b0183ccb..0000000000000000000000000000000000000000
--- a/crazy_functions/test_project/cpp/cppipc/buffer.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-#include "libipc/buffer.h"
-#include "libipc/utility/pimpl.h"
-
-#include
-
-namespace ipc {
-
-bool operator==(buffer const & b1, buffer const & b2) {
- return (b1.size() == b2.size()) && (std::memcmp(b1.data(), b2.data(), b1.size()) == 0);
-}
-
-bool operator!=(buffer const & b1, buffer const & b2) {
- return !(b1 == b2);
-}
-
-class buffer::buffer_ : public pimpl {
-public:
- void* p_;
- std::size_t s_;
- void* a_;
- buffer::destructor_t d_;
-
- buffer_(void* p, std::size_t s, buffer::destructor_t d, void* a)
- : p_(p), s_(s), a_(a), d_(d) {
- }
-
- ~buffer_() {
- if (d_ == nullptr) return;
- d_((a_ == nullptr) ? p_ : a_, s_);
- }
-};
-
-buffer::buffer()
- : buffer(nullptr, 0, nullptr, nullptr) {
-}
-
-buffer::buffer(void* p, std::size_t s, destructor_t d)
- : p_(p_->make(p, s, d, nullptr)) {
-}
-
-buffer::buffer(void* p, std::size_t s, destructor_t d, void* additional)
- : p_(p_->make(p, s, d, additional)) {
-}
-
-buffer::buffer(void* p, std::size_t s)
- : buffer(p, s, nullptr) {
-}
-
-buffer::buffer(char const & c)
- : buffer(const_cast(&c), 1) {
-}
-
-buffer::buffer(buffer&& rhs)
- : buffer() {
- swap(rhs);
-}
-
-buffer::~buffer() {
- p_->clear();
-}
-
-void buffer::swap(buffer& rhs) {
- std::swap(p_, rhs.p_);
-}
-
-buffer& buffer::operator=(buffer rhs) {
- swap(rhs);
- return *this;
-}
-
-bool buffer::empty() const noexcept {
- return (impl(p_)->p_ == nullptr) || (impl(p_)->s_ == 0);
-}
-
-void* buffer::data() noexcept {
- return impl(p_)->p_;
-}
-
-void const * buffer::data() const noexcept {
- return impl(p_)->p_;
-}
-
-std::size_t buffer::size() const noexcept {
- return impl(p_)->s_;
-}
-
-} // namespace ipc
diff --git a/crazy_functions/test_project/cpp/cppipc/ipc.cpp b/crazy_functions/test_project/cpp/cppipc/ipc.cpp
deleted file mode 100644
index 4dc71c071c524906205cc4e2eae9ca8bac8b2d2c..0000000000000000000000000000000000000000
--- a/crazy_functions/test_project/cpp/cppipc/ipc.cpp
+++ /dev/null
@@ -1,701 +0,0 @@
-
-#include
-#include
-#include
-#include // std::pair, std::move, std::forward
-#include
-#include // aligned_storage_t
-#include
-#include
-#include
-#include
-
-#include "libipc/ipc.h"
-#include "libipc/def.h"
-#include "libipc/shm.h"
-#include "libipc/pool_alloc.h"
-#include "libipc/queue.h"
-#include "libipc/policy.h"
-#include "libipc/rw_lock.h"
-#include "libipc/waiter.h"
-
-#include "libipc/utility/log.h"
-#include "libipc/utility/id_pool.h"
-#include "libipc/utility/scope_guard.h"
-#include "libipc/utility/utility.h"
-
-#include "libipc/memory/resource.h"
-#include "libipc/platform/detail.h"
-#include "libipc/circ/elem_array.h"
-
-namespace {
-
-using msg_id_t = std::uint32_t;
-using acc_t = std::atomic;
-
-template
-struct msg_t;
-
-template
-struct msg_t<0, AlignSize> {
- msg_id_t cc_id_;
- msg_id_t id_;
- std::int32_t remain_;
- bool storage_;
-};
-
-template
-struct msg_t : msg_t<0, AlignSize> {
- std::aligned_storage_t data_ {};
-
- msg_t() = default;
- msg_t(msg_id_t cc_id, msg_id_t id, std::int32_t remain, void const * data, std::size_t size)
- : msg_t<0, AlignSize> {cc_id, id, remain, (data == nullptr) || (size == 0)} {
- if (this->storage_) {
- if (data != nullptr) {
- // copy storage-id
- *reinterpret_cast(&data_) =
- *static_cast(data);
- }
- }
- else std::memcpy(&data_, data, size);
- }
-};
-
-template
-ipc::buff_t make_cache(T& data, std::size_t size) {
- auto ptr = ipc::mem::alloc(size);
- std::memcpy(ptr, &data, (ipc::detail::min)(sizeof(data), size));
- return { ptr, size, ipc::mem::free };
-}
-
-struct cache_t {
- std::size_t fill_;
- ipc::buff_t buff_;
-
- cache_t(std::size_t f, ipc::buff_t && b)
- : fill_(f), buff_(std::move(b))
- {}
-
- void append(void const * data, std::size_t size) {
- if (fill_ >= buff_.size() || data == nullptr || size == 0) return;
- auto new_fill = (ipc::detail::min)(fill_ + size, buff_.size());
- std::memcpy(static_cast(buff_.data()) + fill_, data, new_fill - fill_);
- fill_ = new_fill;
- }
-};
-
-auto cc_acc() {
- static ipc::shm::handle acc_h("__CA_CONN__", sizeof(acc_t));
- return static_cast(acc_h.get());
-}
-
-IPC_CONSTEXPR_ std::size_t align_chunk_size(std::size_t size) noexcept {
- return (((size - 1) / ipc::large_msg_align) + 1) * ipc::large_msg_align;
-}
-
-IPC_CONSTEXPR_ std::size_t calc_chunk_size(std::size_t size) noexcept {
- return ipc::make_align(alignof(std::max_align_t), align_chunk_size(
- ipc::make_align(alignof(std::max_align_t), sizeof(std::atomic)) + size));
-}
-
-struct chunk_t {
- std::atomic &conns() noexcept {
- return *reinterpret_cast *>(this);
- }
-
- void *data() noexcept {
- return reinterpret_cast(this)
- + ipc::make_align(alignof(std::max_align_t), sizeof(std::atomic));
- }
-};
-
-struct chunk_info_t {
- ipc::id_pool<> pool_;
- ipc::spin_lock lock_;
-
- IPC_CONSTEXPR_ static std::size_t chunks_mem_size(std::size_t chunk_size) noexcept {
- return ipc::id_pool<>::max_count * chunk_size;
- }
-
- ipc::byte_t *chunks_mem() noexcept {
- return reinterpret_cast(this + 1);
- }
-
- chunk_t *at(std::size_t chunk_size, ipc::storage_id_t id) noexcept {
- if (id < 0) return nullptr;
- return reinterpret_cast(chunks_mem() + (chunk_size * id));
- }
-};
-
-auto& chunk_storages() {
- class chunk_handle_t {
- ipc::shm::handle handle_;
-
- public:
- chunk_info_t *get_info(std::size_t chunk_size) {
- if (!handle_.valid() &&
- !handle_.acquire( ("__CHUNK_INFO__" + ipc::to_string(chunk_size)).c_str(),
- sizeof(chunk_info_t) + chunk_info_t::chunks_mem_size(chunk_size) )) {
- ipc::error("[chunk_storages] chunk_shm.id_info_.acquire failed: chunk_size = %zd\n", chunk_size);
- return nullptr;
- }
- auto info = static_cast(handle_.get());
- if (info == nullptr) {
- ipc::error("[chunk_storages] chunk_shm.id_info_.get failed: chunk_size = %zd\n", chunk_size);
- return nullptr;
- }
- return info;
- }
- };
- static ipc::map chunk_hs;
- return chunk_hs;
-}
-
-chunk_info_t *chunk_storage_info(std::size_t chunk_size) {
- auto &storages = chunk_storages();
- std::decay_t::iterator it;
- {
- static ipc::rw_lock lock;
- IPC_UNUSED_ std::shared_lock guard {lock};
- if ((it = storages.find(chunk_size)) == storages.end()) {
- using chunk_handle_t = std::decay_t::value_type::second_type;
- guard.unlock();
- IPC_UNUSED_ std::lock_guard guard {lock};
- it = storages.emplace(chunk_size, chunk_handle_t{}).first;
- }
- }
- return it->second.get_info(chunk_size);
-}
-
-std::pair acquire_storage(std::size_t size, ipc::circ::cc_t conns) {
- std::size_t chunk_size = calc_chunk_size(size);
- auto info = chunk_storage_info(chunk_size);
- if (info == nullptr) return {};
-
- info->lock_.lock();
- info->pool_.prepare();
- // got an unique id
- auto id = info->pool_.acquire();
- info->lock_.unlock();
-
- auto chunk = info->at(chunk_size, id);
- if (chunk == nullptr) return {};
- chunk->conns().store(conns, std::memory_order_relaxed);
- return { id, chunk->data() };
-}
-
-void *find_storage(ipc::storage_id_t id, std::size_t size) {
- if (id < 0) {
- ipc::error("[find_storage] id is invalid: id = %ld, size = %zd\n", (long)id, size);
- return nullptr;
- }
- std::size_t chunk_size = calc_chunk_size(size);
- auto info = chunk_storage_info(chunk_size);
- if (info == nullptr) return nullptr;
- return info->at(chunk_size, id)->data();
-}
-
-void release_storage(ipc::storage_id_t id, std::size_t size) {
- if (id < 0) {
- ipc::error("[release_storage] id is invalid: id = %ld, size = %zd\n", (long)id, size);
- return;
- }
- std::size_t chunk_size = calc_chunk_size(size);
- auto info = chunk_storage_info(chunk_size);
- if (info == nullptr) return;
- info->lock_.lock();
- info->pool_.release(id);
- info->lock_.unlock();
-}
-
-template
-bool sub_rc(ipc::wr