diff --git a/crazy_functions/CodeInterpreter.py b/crazy_functions/CodeInterpreter.py deleted file mode 100644 index 283dd87a93140c5621579e62c9d6d368537e4824..0000000000000000000000000000000000000000 --- a/crazy_functions/CodeInterpreter.py +++ /dev/null @@ -1,232 +0,0 @@ -from collections.abc import Callable, Iterable, Mapping -from typing import Any -from toolbox import CatchException, update_ui, gen_time_str, trimmed_format_exc -from toolbox import promote_file_to_downloadzone, get_log_folder -from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive -from .crazy_utils import input_clipping, try_install_deps -from multiprocessing import Process, Pipe -import os -import time - -templete = """ -```python -import ... # Put dependencies here, e.g. import numpy as np - -class TerminalFunction(object): # Do not change the name of the class, The name of the class must be `TerminalFunction` - - def run(self, path): # The name of the function must be `run`, it takes only a positional argument. - # rewrite the function you have just written here - ... - return generated_file_path -``` -""" - -def inspect_dependency(chatbot, history): - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return True - -def get_code_block(reply): - import re - pattern = r"```([\s\S]*?)```" # regex pattern to match code blocks - matches = re.findall(pattern, reply) # find all code blocks in text - if len(matches) == 1: - return matches[0].strip('python') # code block - for match in matches: - if 'class TerminalFunction' in match: - return match.strip('python') # code block - raise RuntimeError("GPT is not generating proper code.") - -def gpt_interact_multi_step(txt, file_type, llm_kwargs, chatbot, history): - # 输入 - prompt_compose = [ - f'Your job:\n' - f'1. write a single Python function, which takes a path of a `{file_type}` file as the only argument and returns a `string` containing the result of analysis or the path of generated files. \n', - f"2. You should write this function to perform following task: " + txt + "\n", - f"3. Wrap the output python function with markdown codeblock." - ] - i_say = "".join(prompt_compose) - demo = [] - - # 第一步 - gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive( - inputs=i_say, inputs_show_user=i_say, - llm_kwargs=llm_kwargs, chatbot=chatbot, history=demo, - sys_prompt= r"You are a programmer." - ) - history.extend([i_say, gpt_say]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 界面更新 - - # 第二步 - prompt_compose = [ - "If previous stage is successful, rewrite the function you have just written to satisfy following templete: \n", - templete - ] - i_say = "".join(prompt_compose); inputs_show_user = "If previous stage is successful, rewrite the function you have just written to satisfy executable templete. " - gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive( - inputs=i_say, inputs_show_user=inputs_show_user, - llm_kwargs=llm_kwargs, chatbot=chatbot, history=history, - sys_prompt= r"You are a programmer." - ) - code_to_return = gpt_say - history.extend([i_say, gpt_say]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 界面更新 - - # # 第三步 - # i_say = "Please list to packages to install to run the code above. Then show me how to use `try_install_deps` function to install them." - # i_say += 'For instance. 
`try_install_deps(["opencv-python", "scipy", "numpy"])`' - # installation_advance = yield from request_gpt_model_in_new_thread_with_ui_alive( - # inputs=i_say, inputs_show_user=inputs_show_user, - # llm_kwargs=llm_kwargs, chatbot=chatbot, history=history, - # sys_prompt= r"You are a programmer." - # ) - # # # 第三步 - # i_say = "Show me how to use `pip` to install packages to run the code above. " - # i_say += 'For instance. `pip install -r opencv-python scipy numpy`' - # installation_advance = yield from request_gpt_model_in_new_thread_with_ui_alive( - # inputs=i_say, inputs_show_user=i_say, - # llm_kwargs=llm_kwargs, chatbot=chatbot, history=history, - # sys_prompt= r"You are a programmer." - # ) - installation_advance = "" - - return code_to_return, installation_advance, txt, file_type, llm_kwargs, chatbot, history - -def make_module(code): - module_file = 'gpt_fn_' + gen_time_str().replace('-','_') - with open(f'{get_log_folder()}/{module_file}.py', 'w', encoding='utf8') as f: - f.write(code) - - def get_class_name(class_string): - import re - # Use regex to extract the class name - class_name = re.search(r'class (\w+)\(', class_string).group(1) - return class_name - - class_name = get_class_name(code) - return f"{get_log_folder().replace('/', '.')}.{module_file}->{class_name}" - -def init_module_instance(module): - import importlib - module_, class_ = module.split('->') - init_f = getattr(importlib.import_module(module_), class_) - return init_f() - -def for_immediate_show_off_when_possible(file_type, fp, chatbot): - if file_type in ['png', 'jpg']: - image_path = os.path.abspath(fp) - chatbot.append(['这是一张图片, 展示如下:', - f'本地文件地址:
<br/>`{image_path}`<br/>'+ - f'本地文件预览: <br/><div align="center"><img src="file={image_path}"></div>
' - ]) - return chatbot - -def subprocess_worker(instance, file_path, return_dict): - return_dict['result'] = instance.run(file_path) - -def have_any_recent_upload_files(chatbot): - _5min = 5 * 60 - if not chatbot: return False # chatbot is None - most_recent_uploaded = chatbot._cookies.get("most_recent_uploaded", None) - if not most_recent_uploaded: return False # most_recent_uploaded is None - if time.time() - most_recent_uploaded["time"] < _5min: return True # most_recent_uploaded is new - else: return False # most_recent_uploaded is too old - -def get_recent_file_prompt_support(chatbot): - most_recent_uploaded = chatbot._cookies.get("most_recent_uploaded", None) - path = most_recent_uploaded['path'] - return path - -@CatchException -def 虚空终端CodeInterpreter(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - """ - txt 输入栏用户输入的文本,例如需要翻译的一段话,再例如一个包含了待处理文件的路径 - llm_kwargs gpt模型参数,如温度和top_p等,一般原样传递下去就行 - plugin_kwargs 插件模型的参数,暂时没有用武之地 - chatbot 聊天显示框的句柄,用于显示给用户 - history 聊天历史,前情提要 - system_prompt 给gpt的静默提醒 - web_port 当前软件运行的端口号 - """ - raise NotImplementedError - - # 清空历史,以免输入溢出 - history = []; clear_file_downloadzone(chatbot) - - # 基本信息:功能、贡献者 - chatbot.append([ - "函数插件功能?", - "CodeInterpreter开源版, 此插件处于开发阶段, 建议暂时不要使用, 插件初始化中 ..." - ]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - if have_any_recent_upload_files(chatbot): - file_path = get_recent_file_prompt_support(chatbot) - else: - chatbot.append(["文件检索", "没有发现任何近期上传的文件。"]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - # 读取文件 - if ("recently_uploaded_files" in plugin_kwargs) and (plugin_kwargs["recently_uploaded_files"] == ""): plugin_kwargs.pop("recently_uploaded_files") - recently_uploaded_files = plugin_kwargs.get("recently_uploaded_files", None) - file_path = recently_uploaded_files[-1] - file_type = file_path.split('.')[-1] - - # 粗心检查 - if is_the_upload_folder(txt): - chatbot.append([ - "...", - f"请在输入框内填写需求,然后再次点击该插件(文件路径 {file_path} 已经被记忆)" - ]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - - # 开始干正事 - for j in range(5): # 最多重试5次 - try: - code, installation_advance, txt, file_type, llm_kwargs, chatbot, history = \ - yield from gpt_interact_multi_step(txt, file_type, llm_kwargs, chatbot, history) - code = get_code_block(code) - res = make_module(code) - instance = init_module_instance(res) - break - except Exception as e: - chatbot.append([f"第{j}次代码生成尝试,失败了", f"错误追踪\n```\n{trimmed_format_exc()}\n```\n"]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - # 代码生成结束, 开始执行 - try: - import multiprocessing - manager = multiprocessing.Manager() - return_dict = manager.dict() - - p = multiprocessing.Process(target=subprocess_worker, args=(instance, file_path, return_dict)) - # only has 10 seconds to run - p.start(); p.join(timeout=10) - if p.is_alive(): p.terminate(); p.join() - p.close() - res = return_dict['result'] - # res = instance.run(file_path) - except Exception as e: - chatbot.append(["执行失败了", f"错误追踪\n```\n{trimmed_format_exc()}\n```\n"]) - # chatbot.append(["如果是缺乏依赖,请参考以下建议", installation_advance]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - - # 顺利完成,收尾 - res = str(res) - if os.path.exists(res): - chatbot.append(["执行成功了,结果是一个有效文件", "结果:" + res]) - new_file_path = promote_file_to_downloadzone(res, chatbot=chatbot) - chatbot = for_immediate_show_off_when_possible(file_type, new_file_path, chatbot) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 界面更新 - else: - 
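The block above executes model-generated code in a child process with a hard 10-second budget. A minimal, self-contained sketch of that subprocess-timeout pattern (helper names are illustrative, not the plugin's API):

```python
import multiprocessing

def _worker(fn, arg, return_dict):
    # Child process: run the payload and hand the result back via the managed dict.
    return_dict["result"] = fn(arg)

def run_with_timeout(fn, arg, timeout=10):
    # Execute fn(arg) in a separate process, terminating it after `timeout`
    # seconds -- mirroring the p.start(); p.join(timeout=10); p.terminate()
    # sequence in the deleted plugin.
    manager = multiprocessing.Manager()
    return_dict = manager.dict()
    p = multiprocessing.Process(target=_worker, args=(fn, arg, return_dict))
    p.start()
    p.join(timeout=timeout)
    if p.is_alive():
        p.terminate()
        p.join()
    return return_dict.get("result")  # None if the child timed out

def _double(x):  # trivial payload for demonstration
    return x * 2

if __name__ == "__main__":
    print(run_with_timeout(_double, 21))  # 42
```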
chatbot.append(["执行成功了,结果是一个字符串", "结果:" + res]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 界面更新 - -""" -测试: - 裁剪图像,保留下半部分 - 交换图像的蓝色通道和红色通道 - 将图像转为灰度图像 - 将csv文件转excel表格 -""" \ No newline at end of file diff --git "a/crazy_functions/Langchain\347\237\245\350\257\206\345\272\223.py" "b/crazy_functions/Langchain\347\237\245\350\257\206\345\272\223.py" deleted file mode 100644 index 8433895f538e826e4294b7d6503583aafc2b34c8..0000000000000000000000000000000000000000 --- "a/crazy_functions/Langchain\347\237\245\350\257\206\345\272\223.py" +++ /dev/null @@ -1,106 +0,0 @@ -from toolbox import CatchException, update_ui, ProxyNetworkActivate, update_ui_lastest_msg -from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive, get_files_from_everything - - - -@CatchException -def 知识库问答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - """ - txt 输入栏用户输入的文本,例如需要翻译的一段话,再例如一个包含了待处理文件的路径 - llm_kwargs gpt模型参数, 如温度和top_p等, 一般原样传递下去就行 - plugin_kwargs 插件模型的参数,暂时没有用武之地 - chatbot 聊天显示框的句柄,用于显示给用户 - history 聊天历史,前情提要 - system_prompt 给gpt的静默提醒 - web_port 当前软件运行的端口号 - """ - history = [] # 清空历史,以免输入溢出 - - # < --------------------读取参数--------------- > - if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg") - kai_id = plugin_kwargs.get("advanced_arg", 'default') - - chatbot.append((f"向`{kai_id}`知识库中添加文件。", "[Local Message] 从一批文件(txt, md, tex)中读取数据构建知识库, 然后进行问答。")) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - # resolve deps - try: - from zh_langchain import construct_vector_store - from langchain.embeddings.huggingface import HuggingFaceEmbeddings - from .crazy_utils import knowledge_archive_interface - except Exception as e: - chatbot.append(["依赖不足", "导入依赖失败。正在尝试自动安装,请查看终端的输出或耐心等待..."]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - from .crazy_utils import try_install_deps - try_install_deps(['zh_langchain==0.2.1', 'pypinyin'], reload_m=['pypinyin', 'zh_langchain']) - yield from update_ui_lastest_msg("安装完成,您可以再次重试。", chatbot, history) - return - - # < --------------------读取文件--------------- > - file_manifest = [] - spl = ["txt", "doc", "docx", "email", "epub", "html", "json", "md", "msg", "pdf", "ppt", "pptx", "rtf"] - for sp in spl: - _, file_manifest_tmp, _ = get_files_from_everything(txt, type=f'.{sp}') - file_manifest += file_manifest_tmp - - if len(file_manifest) == 0: - chatbot.append(["没有找到任何可读取文件", "当前支持的格式包括: txt, md, docx, pptx, pdf, json等"]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - - # < -------------------预热文本向量化模组--------------- > - chatbot.append(['
<br/>
'.join(file_manifest), "正在预热文本向量化模组, 如果是第一次运行, 将消耗较长时间下载中文向量化模型..."]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - print('Checking Text2vec ...') - from langchain.embeddings.huggingface import HuggingFaceEmbeddings - with ProxyNetworkActivate('Download_LLM'): # 临时地激活代理网络 - HuggingFaceEmbeddings(model_name="GanymedeNil/text2vec-large-chinese") - - # < -------------------构建知识库--------------- > - chatbot.append(['
<br/>
'.join(file_manifest), "正在构建知识库..."]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - print('Establishing knowledge archive ...') - with ProxyNetworkActivate('Download_LLM'): # 临时地激活代理网络 - kai = knowledge_archive_interface() - kai.feed_archive(file_manifest=file_manifest, id=kai_id) - kai_files = kai.get_loaded_file() - kai_files = '
<br/>
'.join(kai_files) - # chatbot.append(['知识库构建成功', "正在将知识库存储至cookie中"]) - # yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - # chatbot._cookies['langchain_plugin_embedding'] = kai.get_current_archive_id() - # chatbot._cookies['lock_plugin'] = 'crazy_functions.Langchain知识库->读取知识库作答' - # chatbot.append(['完成', "“根据知识库作答”函数插件已经接管问答系统, 提问吧! 但注意, 您接下来不能再使用其他插件了,刷新页面即可以退出知识库问答模式。"]) - chatbot.append(['构建完成', f"当前知识库内的有效文件:\n\n---\n\n{kai_files}\n\n---\n\n请切换至“知识库问答”插件进行知识库访问, 或者使用此插件继续上传更多文件。"]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间,我们先及时地做一次界面更新 - -@CatchException -def 读取知识库作答(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port=-1): - # resolve deps - try: - from zh_langchain import construct_vector_store - from langchain.embeddings.huggingface import HuggingFaceEmbeddings - from .crazy_utils import knowledge_archive_interface - except Exception as e: - chatbot.append(["依赖不足", "导入依赖失败。正在尝试自动安装,请查看终端的输出或耐心等待..."]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - from .crazy_utils import try_install_deps - try_install_deps(['zh_langchain==0.2.1', 'pypinyin'], reload_m=['pypinyin', 'zh_langchain']) - yield from update_ui_lastest_msg("安装完成,您可以再次重试。", chatbot, history) - return - - # < ------------------- --------------- > - kai = knowledge_archive_interface() - - if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg") - kai_id = plugin_kwargs.get("advanced_arg", 'default') - resp, prompt = kai.answer_with_archive_by_id(txt, kai_id) - - chatbot.append((txt, f'[知识库 {kai_id}] ' + prompt)) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间,我们先及时地做一次界面更新 - gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive( - inputs=prompt, inputs_show_user=txt, - llm_kwargs=llm_kwargs, chatbot=chatbot, history=[], - sys_prompt=system_prompt - ) - history.extend((prompt, gpt_say)) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 # 由于请求gpt需要一段时间,我们先及时地做一次界面更新 diff --git "a/crazy_functions/Latex\345\205\250\346\226\207\346\266\246\350\211\262.py" "b/crazy_functions/Latex\345\205\250\346\226\207\346\266\246\350\211\262.py" deleted file mode 100644 index b736fe896979cf3c8b08910c8bb21bfb4809c9a4..0000000000000000000000000000000000000000 --- "a/crazy_functions/Latex\345\205\250\346\226\207\346\266\246\350\211\262.py" +++ /dev/null @@ -1,245 +0,0 @@ -from toolbox import update_ui, trimmed_format_exc, promote_file_to_downloadzone, get_log_folder -from toolbox import CatchException, report_exception, write_history_to_file, zip_folder - - -class PaperFileGroup(): - def __init__(self): - self.file_paths = [] - self.file_contents = [] - self.sp_file_contents = [] - self.sp_file_index = [] - self.sp_file_tag = [] - - # count_token - from request_llms.bridge_all import model_info - enc = model_info["gpt-3.5-turbo"]['tokenizer'] - def get_token_num(txt): return len(enc.encode(txt, disallowed_special=())) - self.get_token_num = get_token_num - - def run_file_split(self, max_token_limit=1900): - """ - 将长文本分离开来 - """ - for index, file_content in enumerate(self.file_contents): - if self.get_token_num(file_content) < max_token_limit: - self.sp_file_contents.append(file_content) - self.sp_file_index.append(index) - self.sp_file_tag.append(self.file_paths[index]) - else: - from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit - segments = breakdown_text_to_satisfy_token_limit(file_content, 
max_token_limit) - for j, segment in enumerate(segments): - self.sp_file_contents.append(segment) - self.sp_file_index.append(index) - self.sp_file_tag.append(self.file_paths[index] + f".part-{j}.tex") - - print('Segmentation: done') - def merge_result(self): - self.file_result = ["" for _ in range(len(self.file_paths))] - for r, k in zip(self.sp_file_result, self.sp_file_index): - self.file_result[k] += r - - def write_result(self): - manifest = [] - for path, res in zip(self.file_paths, self.file_result): - with open(path + '.polish.tex', 'w', encoding='utf8') as f: - manifest.append(path + '.polish.tex') - f.write(res) - return manifest - - def zip_result(self): - import os, time - folder = os.path.dirname(self.file_paths[0]) - t = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) - zip_folder(folder, get_log_folder(), f'{t}-polished.zip') - - -def 多文件润色(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en', mode='polish'): - import time, os, re - from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency - - - # <-------- 读取Latex文件,删除其中的所有注释 ----------> - pfg = PaperFileGroup() - - for index, fp in enumerate(file_manifest): - with open(fp, 'r', encoding='utf-8', errors='replace') as f: - file_content = f.read() - # 定义注释的正则表达式 - comment_pattern = r'(? - pfg.run_file_split(max_token_limit=1024) - n_split = len(pfg.sp_file_contents) - - - # <-------- 多线程润色开始 ----------> - if language == 'en': - if mode == 'polish': - inputs_array = ["Below is a section from an academic paper, polish this section to meet the academic standard, " + - "improve the grammar, clarity and overall readability, do not modify any latex command such as \section, \cite and equations:" + - f"\n\n{frag}" for frag in pfg.sp_file_contents] - else: - inputs_array = [r"Below is a section from an academic paper, proofread this section." + - r"Do not modify any latex command such as \section, \cite, \begin, \item and equations. " + - r"Answer me only with the revised text:" + - f"\n\n{frag}" for frag in pfg.sp_file_contents] - inputs_show_user_array = [f"Polish {f}" for f in pfg.sp_file_tag] - sys_prompt_array = ["You are a professional academic paper writer." 
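Markup stripping has eaten part of this hunk, from `comment_pattern = r'(?` onward through the next section banner; the same gap recurs in the 多文件翻译 hunk further down. From context, the lost step removes un-escaped `%` comments and then splits each file under a token budget. A hedged reconstruction, with a greedy splitter standing in for the project's `breakdown_text_to_satisfy_token_limit`:

```python
import re
import tiktoken

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

def strip_latex_comments(tex):
    # Drop % comments; the negative lookbehind keeps escaped \% intact.
    return re.sub(r"(?<!\\)%.*", "", tex)

def split_by_token_limit(text, max_tokens=1024):
    # Greedy line-based packing under a token budget (a stand-in for the
    # project's breakdown_text_to_satisfy_token_limit helper).
    chunks, current = [], ""
    for line in text.splitlines(keepends=True):
        candidate = current + line
        if current and len(enc.encode(candidate, disallowed_special=())) > max_tokens:
            chunks.append(current)
            current = line
        else:
            current = candidate
    if current:
        chunks.append(current)
    return chunks
```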
for _ in range(n_split)] - elif language == 'zh': - if mode == 'polish': - inputs_array = [f"以下是一篇学术论文中的一段内容,请将此部分润色以满足学术标准,提高语法、清晰度和整体可读性,不要修改任何LaTeX命令,例如\section,\cite和方程式:" + - f"\n\n{frag}" for frag in pfg.sp_file_contents] - else: - inputs_array = [f"以下是一篇学术论文中的一段内容,请对这部分内容进行语法矫正。不要修改任何LaTeX命令,例如\section,\cite和方程式:" + - f"\n\n{frag}" for frag in pfg.sp_file_contents] - inputs_show_user_array = [f"润色 {f}" for f in pfg.sp_file_tag] - sys_prompt_array=["你是一位专业的中文学术论文作家。" for _ in range(n_split)] - - - gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( - inputs_array=inputs_array, - inputs_show_user_array=inputs_show_user_array, - llm_kwargs=llm_kwargs, - chatbot=chatbot, - history_array=[[""] for _ in range(n_split)], - sys_prompt_array=sys_prompt_array, - # max_workers=5, # 并行任务数量限制,最多同时执行5个,其他的排队等待 - scroller_max_len = 80 - ) - - # <-------- 文本碎片重组为完整的tex文件,整理结果为压缩包 ----------> - try: - pfg.sp_file_result = [] - for i_say, gpt_say in zip(gpt_response_collection[0::2], gpt_response_collection[1::2]): - pfg.sp_file_result.append(gpt_say) - pfg.merge_result() - pfg.write_result() - pfg.zip_result() - except: - print(trimmed_format_exc()) - - # <-------- 整理结果,退出 ----------> - create_report_file_name = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) + f"-chatgpt.polish.md" - res = write_history_to_file(gpt_response_collection, file_basename=create_report_file_name) - promote_file_to_downloadzone(res, chatbot=chatbot) - - history = gpt_response_collection - chatbot.append((f"{fp}完成了吗?", res)) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - -@CatchException -def Latex英文润色(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - # 基本信息:功能、贡献者 - chatbot.append([ - "函数插件功能?", - "对整个Latex项目进行润色。函数插件贡献者: Binary-Husky。(注意,此插件不调用Latex,如果有Latex环境,请使用“Latex英文纠错+高亮”插件)"]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - # 尝试导入依赖,如果缺少依赖,则给出安装建议 - try: - import tiktoken - except: - report_exception(chatbot, history, - a=f"解析项目: {txt}", - b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade tiktoken```。") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - history = [] # 清空历史,以免输入溢出 - import glob, os - if os.path.exists(txt): - project_folder = txt - else: - if txt == "": txt = '空空如也的输入栏' - report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] - if len(file_manifest) == 0: - report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - yield from 多文件润色(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en') - - - - - - -@CatchException -def Latex中文润色(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - # 基本信息:功能、贡献者 - chatbot.append([ - "函数插件功能?", - "对整个Latex项目进行润色。函数插件贡献者: Binary-Husky"]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - # 尝试导入依赖,如果缺少依赖,则给出安装建议 - try: - import tiktoken - except: - report_exception(chatbot, history, - a=f"解析项目: {txt}", - b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade tiktoken```。") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - history = [] # 清空历史,以免输入溢出 - import glob, os - if os.path.exists(txt): - project_folder = txt - 
else: - if txt == "": txt = '空空如也的输入栏' - report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] - if len(file_manifest) == 0: - report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - yield from 多文件润色(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='zh') - - - - -@CatchException -def Latex英文纠错(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - # 基本信息:功能、贡献者 - chatbot.append([ - "函数插件功能?", - "对整个Latex项目进行纠错。函数插件贡献者: Binary-Husky"]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - # 尝试导入依赖,如果缺少依赖,则给出安装建议 - try: - import tiktoken - except: - report_exception(chatbot, history, - a=f"解析项目: {txt}", - b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade tiktoken```。") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - history = [] # 清空历史,以免输入溢出 - import glob, os - if os.path.exists(txt): - project_folder = txt - else: - if txt == "": txt = '空空如也的输入栏' - report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] - if len(file_manifest) == 0: - report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - yield from 多文件润色(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en', mode='proofread') - - - diff --git "a/crazy_functions/Latex\345\205\250\346\226\207\347\277\273\350\257\221.py" "b/crazy_functions/Latex\345\205\250\346\226\207\347\277\273\350\257\221.py" deleted file mode 100644 index 49470c864e59b790b09789b97227e7b00768ccfd..0000000000000000000000000000000000000000 --- "a/crazy_functions/Latex\345\205\250\346\226\207\347\277\273\350\257\221.py" +++ /dev/null @@ -1,176 +0,0 @@ -from toolbox import update_ui, promote_file_to_downloadzone -from toolbox import CatchException, report_exception, write_history_to_file -fast_debug = False - -class PaperFileGroup(): - def __init__(self): - self.file_paths = [] - self.file_contents = [] - self.sp_file_contents = [] - self.sp_file_index = [] - self.sp_file_tag = [] - - # count_token - from request_llms.bridge_all import model_info - enc = model_info["gpt-3.5-turbo"]['tokenizer'] - def get_token_num(txt): return len(enc.encode(txt, disallowed_special=())) - self.get_token_num = get_token_num - - def run_file_split(self, max_token_limit=1900): - """ - 将长文本分离开来 - """ - for index, file_content in enumerate(self.file_contents): - if self.get_token_num(file_content) < max_token_limit: - self.sp_file_contents.append(file_content) - self.sp_file_index.append(index) - self.sp_file_tag.append(self.file_paths[index]) - else: - from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit - segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit) - for j, segment in enumerate(segments): - self.sp_file_contents.append(segment) - self.sp_file_index.append(index) - self.sp_file_tag.append(self.file_paths[index] + f".part-{j}.tex") - - print('Segmentation: done') - -def 多文件翻译(file_manifest, 
project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en'): - import time, os, re - from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency - - # <-------- 读取Latex文件,删除其中的所有注释 ----------> - pfg = PaperFileGroup() - - for index, fp in enumerate(file_manifest): - with open(fp, 'r', encoding='utf-8', errors='replace') as f: - file_content = f.read() - # 定义注释的正则表达式 - comment_pattern = r'(? - pfg.run_file_split(max_token_limit=1024) - n_split = len(pfg.sp_file_contents) - - # <-------- 抽取摘要 ----------> - # if language == 'en': - # abs_extract_inputs = f"Please write an abstract for this paper" - - # # 单线,获取文章meta信息 - # paper_meta_info = yield from request_gpt_model_in_new_thread_with_ui_alive( - # inputs=abs_extract_inputs, - # inputs_show_user=f"正在抽取摘要信息。", - # llm_kwargs=llm_kwargs, - # chatbot=chatbot, history=[], - # sys_prompt="Your job is to collect information from materials。", - # ) - - # <-------- 多线程润色开始 ----------> - if language == 'en->zh': - inputs_array = ["Below is a section from an English academic paper, translate it into Chinese, do not modify any latex command such as \section, \cite and equations:" + - f"\n\n{frag}" for frag in pfg.sp_file_contents] - inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag] - sys_prompt_array = ["You are a professional academic paper translator." for _ in range(n_split)] - elif language == 'zh->en': - inputs_array = [f"Below is a section from a Chinese academic paper, translate it into English, do not modify any latex command such as \section, \cite and equations:" + - f"\n\n{frag}" for frag in pfg.sp_file_contents] - inputs_show_user_array = [f"翻译 {f}" for f in pfg.sp_file_tag] - sys_prompt_array = ["You are a professional academic paper translator." 
for _ in range(n_split)] - - gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( - inputs_array=inputs_array, - inputs_show_user_array=inputs_show_user_array, - llm_kwargs=llm_kwargs, - chatbot=chatbot, - history_array=[[""] for _ in range(n_split)], - sys_prompt_array=sys_prompt_array, - # max_workers=5, # OpenAI所允许的最大并行过载 - scroller_max_len = 80 - ) - - # <-------- 整理结果,退出 ----------> - create_report_file_name = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()) + f"-chatgpt.polish.md" - res = write_history_to_file(gpt_response_collection, create_report_file_name) - promote_file_to_downloadzone(res, chatbot=chatbot) - history = gpt_response_collection - chatbot.append((f"{fp}完成了吗?", res)) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - - - - -@CatchException -def Latex英译中(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - # 基本信息:功能、贡献者 - chatbot.append([ - "函数插件功能?", - "对整个Latex项目进行翻译。函数插件贡献者: Binary-Husky"]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - # 尝试导入依赖,如果缺少依赖,则给出安装建议 - try: - import tiktoken - except: - report_exception(chatbot, history, - a=f"解析项目: {txt}", - b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade tiktoken```。") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - history = [] # 清空历史,以免输入溢出 - import glob, os - if os.path.exists(txt): - project_folder = txt - else: - if txt == "": txt = '空空如也的输入栏' - report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] - if len(file_manifest) == 0: - report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - yield from 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='en->zh') - - - - - -@CatchException -def Latex中译英(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - # 基本信息:功能、贡献者 - chatbot.append([ - "函数插件功能?", - "对整个Latex项目进行翻译。函数插件贡献者: Binary-Husky"]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - # 尝试导入依赖,如果缺少依赖,则给出安装建议 - try: - import tiktoken - except: - report_exception(chatbot, history, - a=f"解析项目: {txt}", - b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade tiktoken```。") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - history = [] # 清空历史,以免输入溢出 - import glob, os - if os.path.exists(txt): - project_folder = txt - else: - if txt == "": txt = '空空如也的输入栏' - report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] - if len(file_manifest) == 0: - report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - yield from 多文件翻译(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, language='zh->en') \ No newline at end of file diff --git "a/crazy_functions/Latex\350\276\223\345\207\272PDF\347\273\223\346\236\234.py" "b/crazy_functions/Latex\350\276\223\345\207\272PDF\347\273\223\346\236\234.py" deleted file mode 100644 index 
18a8d1bab26af31e7ac0671b95c91660d6d7f02d..0000000000000000000000000000000000000000 --- "a/crazy_functions/Latex\350\276\223\345\207\272PDF\347\273\223\346\236\234.py" +++ /dev/null @@ -1,306 +0,0 @@ -from toolbox import update_ui, trimmed_format_exc, get_conf, get_log_folder, promote_file_to_downloadzone -from toolbox import CatchException, report_exception, update_ui_lastest_msg, zip_result, gen_time_str -from functools import partial -import glob, os, requests, time -pj = os.path.join -ARXIV_CACHE_DIR = os.path.expanduser(f"~/arxiv_cache/") - -# =================================== 工具函数 =============================================== -# 专业词汇声明 = 'If the term "agent" is used in this section, it should be translated to "智能体". ' -def switch_prompt(pfg, mode, more_requirement): - """ - Generate prompts and system prompts based on the mode for proofreading or translating. - Args: - - pfg: Proofreader or Translator instance. - - mode: A string specifying the mode, either 'proofread' or 'translate_zh'. - - Returns: - - inputs_array: A list of strings containing prompts for users to respond to. - - sys_prompt_array: A list of strings containing prompts for system prompts. - """ - n_split = len(pfg.sp_file_contents) - if mode == 'proofread_en': - inputs_array = [r"Below is a section from an academic paper, proofread this section." + - r"Do not modify any latex command such as \section, \cite, \begin, \item and equations. " + more_requirement + - r"Answer me only with the revised text:" + - f"\n\n{frag}" for frag in pfg.sp_file_contents] - sys_prompt_array = ["You are a professional academic paper writer." for _ in range(n_split)] - elif mode == 'translate_zh': - inputs_array = [r"Below is a section from an English academic paper, translate it into Chinese. " + more_requirement + - r"Do not modify any latex command such as \section, \cite, \begin, \item and equations. " + - r"Answer me only with the translated text:" + - f"\n\n{frag}" for frag in pfg.sp_file_contents] - sys_prompt_array = ["You are a professional translator." for _ in range(n_split)] - else: - assert False, "未知指令" - return inputs_array, sys_prompt_array - -def desend_to_extracted_folder_if_exist(project_folder): - """ - Descend into the extracted folder if it exists, otherwise return the original folder. - - Args: - - project_folder: A string specifying the folder path. - - Returns: - - A string specifying the path to the extracted folder, or the original folder if there is no extracted folder. - """ - maybe_dir = [f for f in glob.glob(f'{project_folder}/*') if os.path.isdir(f)] - if len(maybe_dir) == 0: return project_folder - if maybe_dir[0].endswith('.extract'): return maybe_dir[0] - return project_folder - -def move_project(project_folder, arxiv_id=None): - """ - Create a new work folder and copy the project folder to it. - - Args: - - project_folder: A string specifying the folder path of the project. - - Returns: - - A string specifying the path to the new work folder. 
- """ - import shutil, time - time.sleep(2) # avoid time string conflict - if arxiv_id is not None: - new_workfolder = pj(ARXIV_CACHE_DIR, arxiv_id, 'workfolder') - else: - new_workfolder = f'{get_log_folder()}/{gen_time_str()}' - try: - shutil.rmtree(new_workfolder) - except: - pass - - # align subfolder if there is a folder wrapper - items = glob.glob(pj(project_folder,'*')) - items = [item for item in items if os.path.basename(item)!='__MACOSX'] - if len(glob.glob(pj(project_folder,'*.tex'))) == 0 and len(items) == 1: - if os.path.isdir(items[0]): project_folder = items[0] - - shutil.copytree(src=project_folder, dst=new_workfolder) - return new_workfolder - -def arxiv_download(chatbot, history, txt, allow_cache=True): - def check_cached_translation_pdf(arxiv_id): - translation_dir = pj(ARXIV_CACHE_DIR, arxiv_id, 'translation') - if not os.path.exists(translation_dir): - os.makedirs(translation_dir) - target_file = pj(translation_dir, 'translate_zh.pdf') - if os.path.exists(target_file): - promote_file_to_downloadzone(target_file, rename_file=None, chatbot=chatbot) - target_file_compare = pj(translation_dir, 'comparison.pdf') - if os.path.exists(target_file_compare): - promote_file_to_downloadzone(target_file_compare, rename_file=None, chatbot=chatbot) - return target_file - return False - def is_float(s): - try: - float(s) - return True - except ValueError: - return False - if ('.' in txt) and ('/' not in txt) and is_float(txt): # is arxiv ID - txt = 'https://arxiv.org/abs/' + txt.strip() - if ('.' in txt) and ('/' not in txt) and is_float(txt[:10]): # is arxiv ID - txt = 'https://arxiv.org/abs/' + txt[:10] - if not txt.startswith('https://arxiv.org'): - return txt, None - - # <-------------- inspect format -------------> - chatbot.append([f"检测到arxiv文档连接", '尝试下载 ...']) - yield from update_ui(chatbot=chatbot, history=history) - time.sleep(1) # 刷新界面 - - url_ = txt # https://arxiv.org/abs/1707.06690 - if not txt.startswith('https://arxiv.org/abs/'): - msg = f"解析arxiv网址失败, 期望格式例如: https://arxiv.org/abs/1707.06690。实际得到格式: {url_}。" - yield from update_ui_lastest_msg(msg, chatbot=chatbot, history=history) # 刷新界面 - return msg, None - # <-------------- set format -------------> - arxiv_id = url_.split('/abs/')[-1] - if 'v' in arxiv_id: arxiv_id = arxiv_id[:10] - cached_translation_pdf = check_cached_translation_pdf(arxiv_id) - if cached_translation_pdf and allow_cache: return cached_translation_pdf, arxiv_id - - url_tar = url_.replace('/abs/', '/e-print/') - translation_dir = pj(ARXIV_CACHE_DIR, arxiv_id, 'e-print') - extract_dst = pj(ARXIV_CACHE_DIR, arxiv_id, 'extract') - os.makedirs(translation_dir, exist_ok=True) - - # <-------------- download arxiv source file -------------> - dst = pj(translation_dir, arxiv_id+'.tar') - if os.path.exists(dst): - yield from update_ui_lastest_msg("调用缓存", chatbot=chatbot, history=history) # 刷新界面 - else: - yield from update_ui_lastest_msg("开始下载", chatbot=chatbot, history=history) # 刷新界面 - proxies = get_conf('proxies') - r = requests.get(url_tar, proxies=proxies) - with open(dst, 'wb+') as f: - f.write(r.content) - # <-------------- extract file -------------> - yield from update_ui_lastest_msg("下载完成", chatbot=chatbot, history=history) # 刷新界面 - from toolbox import extract_archive - extract_archive(file_path=dst, dest_dir=extract_dst) - return extract_dst, arxiv_id -# ========================================= 插件主程序1 ===================================================== - - -@CatchException -def Latex英文纠错加PDF对比(txt, llm_kwargs, plugin_kwargs, chatbot, history, 
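`arxiv_download` above turns bare IDs such as `1707.06690` into an abs URL and derives the e-print tarball URL from it. A standalone sketch of that normalization; the `is_float` heuristic and the 10-character ID truncation mirror the deleted code:

```python
def normalize_arxiv_input(txt):
    # Return (tar_url, arxiv_id) for arXiv inputs, or (txt, None) otherwise.
    def is_float(s):
        try:
            float(s)
            return True
        except ValueError:
            return False

    if ("." in txt) and ("/" not in txt) and is_float(txt[:10]):
        txt = "https://arxiv.org/abs/" + txt[:10]   # bare ID -> abs URL
    if not txt.startswith("https://arxiv.org/abs/"):
        return txt, None                             # not an arXiv reference
    arxiv_id = txt.split("/abs/")[-1]
    if "v" in arxiv_id:                              # drop version suffix, e.g. ...v2
        arxiv_id = arxiv_id[:10]
    return txt.replace("/abs/", "/e-print/"), arxiv_id

if __name__ == "__main__":
    print(normalize_arxiv_input("1707.06690"))
    # ('https://arxiv.org/e-print/1707.06690', '1707.06690')
```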
system_prompt, web_port): - # <-------------- information about this plugin -------------> - chatbot.append([ "函数插件功能?", - "对整个Latex项目进行纠错, 用latex编译为PDF对修正处做高亮。函数插件贡献者: Binary-Husky。注意事项: 目前仅支持GPT3.5/GPT4,其他模型转化效果未知。目前对机器学习类文献转化效果最好,其他类型文献转化效果未知。仅在Windows系统进行了测试,其他操作系统表现未知。"]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - # <-------------- more requirements -------------> - if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg") - more_req = plugin_kwargs.get("advanced_arg", "") - _switch_prompt_ = partial(switch_prompt, more_requirement=more_req) - - # <-------------- check deps -------------> - try: - import glob, os, time, subprocess - subprocess.Popen(['pdflatex', '-version']) - from .latex_fns.latex_actions import Latex精细分解与转化, 编译Latex - except Exception as e: - chatbot.append([ f"解析项目: {txt}", - f"尝试执行Latex指令失败。Latex没有安装, 或者不在环境变量PATH中。安装方法https://tug.org/texlive/。报错信息\n\n```\n\n{trimmed_format_exc()}\n\n```\n\n"]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - - - # <-------------- clear history and read input -------------> - history = [] - if os.path.exists(txt): - project_folder = txt - else: - if txt == "": txt = '空空如也的输入栏' - report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] - if len(file_manifest) == 0: - report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - - - # <-------------- if is a zip/tar file -------------> - project_folder = desend_to_extracted_folder_if_exist(project_folder) - - - # <-------------- move latex project away from temp folder -------------> - project_folder = move_project(project_folder, arxiv_id=None) - - - # <-------------- if merge_translate_zh is already generated, skip gpt req -------------> - if not os.path.exists(project_folder + '/merge_proofread_en.tex'): - yield from Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs, - chatbot, history, system_prompt, mode='proofread_en', switch_prompt=_switch_prompt_) - - - # <-------------- compile PDF -------------> - success = yield from 编译Latex(chatbot, history, main_file_original='merge', main_file_modified='merge_proofread_en', - work_folder_original=project_folder, work_folder_modified=project_folder, work_folder=project_folder) - - - # <-------------- zip PDF -------------> - zip_res = zip_result(project_folder) - if success: - chatbot.append((f"成功啦", '请查收结果(压缩包)...')) - yield from update_ui(chatbot=chatbot, history=history); time.sleep(1) # 刷新界面 - promote_file_to_downloadzone(file=zip_res, chatbot=chatbot) - else: - chatbot.append((f"失败了", '虽然PDF生成失败了, 但请查收结果(压缩包), 内含已经翻译的Tex文档, 也是可读的, 您可以到Github Issue区, 用该压缩包+对话历史存档进行反馈 ...')) - yield from update_ui(chatbot=chatbot, history=history); time.sleep(1) # 刷新界面 - promote_file_to_downloadzone(file=zip_res, chatbot=chatbot) - - # <-------------- we are done -------------> - return success - -# ========================================= 插件主程序2 ===================================================== - -@CatchException -def Latex翻译中文并重新编译PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - # <-------------- information about this plugin -------------> - chatbot.append([ - "函数插件功能?", - "对整个Latex项目进行翻译, 生成中文PDF。函数插件贡献者: 
Binary-Husky。注意事项: 此插件Windows支持最佳,Linux下必须使用Docker安装,详见项目主README.md。目前仅支持GPT3.5/GPT4,其他模型转化效果未知。目前对机器学习类文献转化效果最好,其他类型文献转化效果未知。"]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - # <-------------- more requirements -------------> - if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg") - more_req = plugin_kwargs.get("advanced_arg", "") - no_cache = more_req.startswith("--no-cache") - if no_cache: more_req.lstrip("--no-cache") - allow_cache = not no_cache - _switch_prompt_ = partial(switch_prompt, more_requirement=more_req) - - # <-------------- check deps -------------> - try: - import glob, os, time, subprocess - subprocess.Popen(['pdflatex', '-version']) - from .latex_fns.latex_actions import Latex精细分解与转化, 编译Latex - except Exception as e: - chatbot.append([ f"解析项目: {txt}", - f"尝试执行Latex指令失败。Latex没有安装, 或者不在环境变量PATH中。安装方法https://tug.org/texlive/。报错信息\n\n```\n\n{trimmed_format_exc()}\n\n```\n\n"]) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - - - # <-------------- clear history and read input -------------> - history = [] - txt, arxiv_id = yield from arxiv_download(chatbot, history, txt, allow_cache) - if txt.endswith('.pdf'): - report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"发现已经存在翻译好的PDF文档") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - - - if os.path.exists(txt): - project_folder = txt - else: - if txt == "": txt = '空空如也的输入栏' - report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无法处理: {txt}") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - - file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] - if len(file_manifest) == 0: - report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}") - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - return - - - # <-------------- if is a zip/tar file -------------> - project_folder = desend_to_extracted_folder_if_exist(project_folder) - - - # <-------------- move latex project away from temp folder -------------> - project_folder = move_project(project_folder, arxiv_id) - - - # <-------------- if merge_translate_zh is already generated, skip gpt req -------------> - if not os.path.exists(project_folder + '/merge_translate_zh.tex'): - yield from Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs, - chatbot, history, system_prompt, mode='translate_zh', switch_prompt=_switch_prompt_) - - - # <-------------- compile PDF -------------> - success = yield from 编译Latex(chatbot, history, main_file_original='merge', main_file_modified='merge_translate_zh', mode='translate_zh', - work_folder_original=project_folder, work_folder_modified=project_folder, work_folder=project_folder) - - # <-------------- zip PDF -------------> - zip_res = zip_result(project_folder) - if success: - chatbot.append((f"成功啦", '请查收结果(压缩包)...')) - yield from update_ui(chatbot=chatbot, history=history); time.sleep(1) # 刷新界面 - promote_file_to_downloadzone(file=zip_res, chatbot=chatbot) - else: - chatbot.append((f"失败了", '虽然PDF生成失败了, 但请查收结果(压缩包), 内含已经翻译的Tex文档, 您可以到Github Issue区, 用该压缩包进行反馈。如系统是Linux,请检查系统字体(见Github wiki) ...')) - yield from update_ui(chatbot=chatbot, history=history); time.sleep(1) # 刷新界面 - promote_file_to_downloadzone(file=zip_res, chatbot=chatbot) - - - # <-------------- we are done -------------> - return success diff --git a/crazy_functions/__init__.py b/crazy_functions/__init__.py deleted file mode 
100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/crazy_functions/agent_fns/auto_agent.py b/crazy_functions/agent_fns/auto_agent.py deleted file mode 100644 index 4f8fda9d5872db9c178321d43415b24dbea024bb..0000000000000000000000000000000000000000 --- a/crazy_functions/agent_fns/auto_agent.py +++ /dev/null @@ -1,23 +0,0 @@ -from toolbox import CatchException, update_ui, gen_time_str, trimmed_format_exc, ProxyNetworkActivate -from toolbox import report_exception, get_log_folder, update_ui_lastest_msg, Singleton -from crazy_functions.agent_fns.pipe import PluginMultiprocessManager, PipeCom -from crazy_functions.agent_fns.general import AutoGenGeneral - - - -class AutoGenMath(AutoGenGeneral): - - def define_agents(self): - from autogen import AssistantAgent, UserProxyAgent - return [ - { - "name": "assistant", # name of the agent. - "cls": AssistantAgent, # class of the agent. - }, - { - "name": "user_proxy", # name of the agent. - "cls": UserProxyAgent, # class of the agent. - "human_input_mode": "ALWAYS", # always ask for human input. - "llm_config": False, # disables llm-based auto reply. - }, - ] \ No newline at end of file diff --git a/crazy_functions/agent_fns/echo_agent.py b/crazy_functions/agent_fns/echo_agent.py deleted file mode 100644 index 52bf72debc7a56a89b277ced80078ea6b985e1fa..0000000000000000000000000000000000000000 --- a/crazy_functions/agent_fns/echo_agent.py +++ /dev/null @@ -1,19 +0,0 @@ -from crazy_functions.agent_fns.pipe import PluginMultiprocessManager, PipeCom - -class EchoDemo(PluginMultiprocessManager): - def subprocess_worker(self, child_conn): - # ⭐⭐ 子进程 - self.child_conn = child_conn - while True: - msg = self.child_conn.recv() # PipeCom - if msg.cmd == "user_input": - # wait futher user input - self.child_conn.send(PipeCom("show", msg.content)) - wait_success = self.subprocess_worker_wait_user_feedback(wait_msg="我准备好处理下一个问题了.") - if not wait_success: - # wait timeout, terminate this subprocess_worker - break - elif msg.cmd == "terminate": - self.child_conn.send(PipeCom("done", "")) - break - print('[debug] subprocess_worker terminated') \ No newline at end of file diff --git a/crazy_functions/agent_fns/general.py b/crazy_functions/agent_fns/general.py deleted file mode 100644 index 49bc4dc89e9e1244891c15ff73bb0ae065d51821..0000000000000000000000000000000000000000 --- a/crazy_functions/agent_fns/general.py +++ /dev/null @@ -1,134 +0,0 @@ -from toolbox import trimmed_format_exc, get_conf, ProxyNetworkActivate -from crazy_functions.agent_fns.pipe import PluginMultiprocessManager, PipeCom -from request_llms.bridge_all import predict_no_ui_long_connection -import time - -def gpt_academic_generate_oai_reply( - self, - messages, - sender, - config, -): - llm_config = self.llm_config if config is None else config - if llm_config is False: - return False, None - if messages is None: - messages = self._oai_messages[sender] - - inputs = messages[-1]['content'] - history = [] - for message in messages[:-1]: - history.append(message['content']) - context=messages[-1].pop("context", None) - assert context is None, "预留参数 context 未实现" - - reply = predict_no_ui_long_connection( - inputs=inputs, - llm_kwargs=llm_config, - history=history, - sys_prompt=self._oai_system_message[0]['content'], - console_slience=True - ) - assumed_done = reply.endswith('\nTERMINATE') - return True, reply - -class AutoGenGeneral(PluginMultiprocessManager): - def gpt_academic_print_override(self, user_proxy, message, sender): - # ⭐⭐ run in 
subprocess - self.child_conn.send(PipeCom("show", sender.name + "\n\n---\n\n" + message["content"])) - - def gpt_academic_get_human_input(self, user_proxy, message): - # ⭐⭐ run in subprocess - patience = 300 - begin_waiting_time = time.time() - self.child_conn.send(PipeCom("interact", message)) - while True: - time.sleep(0.5) - if self.child_conn.poll(): - wait_success = True - break - if time.time() - begin_waiting_time > patience: - self.child_conn.send(PipeCom("done", "")) - wait_success = False - break - if wait_success: - return self.child_conn.recv().content - else: - raise TimeoutError("等待用户输入超时") - - def define_agents(self): - raise NotImplementedError - - def exe_autogen(self, input): - # ⭐⭐ run in subprocess - input = input.content - with ProxyNetworkActivate("AutoGen"): - code_execution_config = {"work_dir": self.autogen_work_dir, "use_docker": self.use_docker} - agents = self.define_agents() - user_proxy = None - assistant = None - for agent_kwargs in agents: - agent_cls = agent_kwargs.pop('cls') - kwargs = { - 'llm_config':self.llm_kwargs, - 'code_execution_config':code_execution_config - } - kwargs.update(agent_kwargs) - agent_handle = agent_cls(**kwargs) - agent_handle._print_received_message = lambda a,b: self.gpt_academic_print_override(agent_kwargs, a, b) - for d in agent_handle._reply_func_list: - if hasattr(d['reply_func'],'__name__') and d['reply_func'].__name__ == 'generate_oai_reply': - d['reply_func'] = gpt_academic_generate_oai_reply - if agent_kwargs['name'] == 'user_proxy': - agent_handle.get_human_input = lambda a: self.gpt_academic_get_human_input(user_proxy, a) - user_proxy = agent_handle - if agent_kwargs['name'] == 'assistant': assistant = agent_handle - try: - if user_proxy is None or assistant is None: raise Exception("用户代理或助理代理未定义") - user_proxy.initiate_chat(assistant, message=input) - except Exception as e: - tb_str = '```\n' + trimmed_format_exc() + '```' - self.child_conn.send(PipeCom("done", "AutoGen 执行失败: \n\n" + tb_str)) - - def subprocess_worker(self, child_conn): - # ⭐⭐ run in subprocess - self.child_conn = child_conn - while True: - msg = self.child_conn.recv() # PipeCom - self.exe_autogen(msg) - - -class AutoGenGroupChat(AutoGenGeneral): - def exe_autogen(self, input): - # ⭐⭐ run in subprocess - import autogen - - input = input.content - with ProxyNetworkActivate("AutoGen"): - code_execution_config = {"work_dir": self.autogen_work_dir, "use_docker": self.use_docker} - agents = self.define_agents() - agents_instances = [] - for agent_kwargs in agents: - agent_cls = agent_kwargs.pop("cls") - kwargs = {"code_execution_config": code_execution_config} - kwargs.update(agent_kwargs) - agent_handle = agent_cls(**kwargs) - agent_handle._print_received_message = lambda a, b: self.gpt_academic_print_override(agent_kwargs, a, b) - agents_instances.append(agent_handle) - if agent_kwargs["name"] == "user_proxy": - user_proxy = agent_handle - user_proxy.get_human_input = lambda a: self.gpt_academic_get_human_input(user_proxy, a) - try: - groupchat = autogen.GroupChat(agents=agents_instances, messages=[], max_round=50) - manager = autogen.GroupChatManager(groupchat=groupchat, **self.define_group_chat_manager_config()) - manager._print_received_message = lambda a, b: self.gpt_academic_print_override(agent_kwargs, a, b) - manager.get_human_input = lambda a: self.gpt_academic_get_human_input(manager, a) - if user_proxy is None: - raise Exception("user_proxy is not defined") - user_proxy.initiate_chat(manager, message=input) - except Exception: - tb_str = "```\n" 
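The parent/child handshake here works by polling a duplex `Pipe` under a patience timeout (300 s for human input in `gpt_academic_get_human_input`). A self-contained sketch of that polling loop:

```python
from multiprocessing import Pipe
import time

def wait_for_reply(conn, patience=300.0, poll=0.5):
    # Poll one end of a duplex Pipe until a message arrives,
    # raising if `patience` seconds pass without one.
    deadline = time.time() + patience
    while time.time() < deadline:
        if conn.poll():
            return conn.recv()
        time.sleep(poll)
    raise TimeoutError("no reply within the patience window")

if __name__ == "__main__":
    parent_conn, child_conn = Pipe()
    child_conn.send("hello")
    print(wait_for_reply(parent_conn, patience=2))  # -> hello
```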
+ trimmed_format_exc() + "```" - self.child_conn.send(PipeCom("done", "AutoGen exe failed: \n\n" + tb_str)) - - def define_group_chat_manager_config(self): - raise NotImplementedError diff --git a/crazy_functions/agent_fns/persistent.py b/crazy_functions/agent_fns/persistent.py deleted file mode 100644 index 82c869cb18ceba5c56e05d3d8b18bb968cf3b35e..0000000000000000000000000000000000000000 --- a/crazy_functions/agent_fns/persistent.py +++ /dev/null @@ -1,16 +0,0 @@ -from toolbox import Singleton -@Singleton -class GradioMultiuserManagerForPersistentClasses(): - def __init__(self): - self.mapping = {} - - def already_alive(self, key): - return (key in self.mapping) and (self.mapping[key].is_alive()) - - def set(self, key, x): - self.mapping[key] = x - return self.mapping[key] - - def get(self, key): - return self.mapping[key] - diff --git a/crazy_functions/agent_fns/pipe.py b/crazy_functions/agent_fns/pipe.py deleted file mode 100644 index bb3bc78520d50b0a7995d0390208f69867c5b7e1..0000000000000000000000000000000000000000 --- a/crazy_functions/agent_fns/pipe.py +++ /dev/null @@ -1,194 +0,0 @@ -from toolbox import get_log_folder, update_ui, gen_time_str, get_conf, promote_file_to_downloadzone -from crazy_functions.agent_fns.watchdog import WatchDog -import time, os - -class PipeCom: - def __init__(self, cmd, content) -> None: - self.cmd = cmd - self.content = content - - -class PluginMultiprocessManager: - def __init__(self, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - # ⭐ run in main process - self.autogen_work_dir = os.path.join(get_log_folder("autogen"), gen_time_str()) - self.previous_work_dir_files = {} - self.llm_kwargs = llm_kwargs - self.plugin_kwargs = plugin_kwargs - self.chatbot = chatbot - self.history = history - self.system_prompt = system_prompt - # self.web_port = web_port - self.alive = True - self.use_docker = get_conf("AUTOGEN_USE_DOCKER") - self.last_user_input = "" - # create a thread to monitor self.heartbeat, terminate the instance if no heartbeat for a long time - timeout_seconds = 5 * 60 - self.heartbeat_watchdog = WatchDog(timeout=timeout_seconds, bark_fn=self.terminate, interval=5) - self.heartbeat_watchdog.begin_watch() - - def feed_heartbeat_watchdog(self): - # feed this `dog`, so the dog will not `bark` (bark_fn will terminate the instance) - self.heartbeat_watchdog.feed() - - def is_alive(self): - return self.alive - - def launch_subprocess_with_pipe(self): - # ⭐ run in main process - from multiprocessing import Process, Pipe - - parent_conn, child_conn = Pipe() - self.p = Process(target=self.subprocess_worker, args=(child_conn,)) - self.p.daemon = True - self.p.start() - return parent_conn - - def terminate(self): - self.p.terminate() - self.alive = False - print("[debug] instance terminated") - - def subprocess_worker(self, child_conn): - # ⭐⭐ run in subprocess - raise NotImplementedError - - def send_command(self, cmd): - # ⭐ run in main process - repeated = False - if cmd == self.last_user_input: - repeated = True - cmd = "" - else: - self.last_user_input = cmd - self.parent_conn.send(PipeCom("user_input", cmd)) - return repeated, cmd - - def immediate_showoff_when_possible(self, fp): - # ⭐ 主进程 - # 获取fp的拓展名 - file_type = fp.split('.')[-1] - # 如果是文本文件, 则直接显示文本内容 - if file_type.lower() in ['png', 'jpg']: - image_path = os.path.abspath(fp) - self.chatbot.append([ - '检测到新生图像:', - f'本地文件预览:
<br/><div align="center"><img src="file={image_path}"></div>
' - ]) - yield from update_ui(chatbot=self.chatbot, history=self.history) - - def overwatch_workdir_file_change(self): - # ⭐ 主进程 Docker 外挂文件夹监控 - path_to_overwatch = self.autogen_work_dir - change_list = [] - # 扫描路径下的所有文件, 并与self.previous_work_dir_files中所记录的文件进行对比, - # 如果有新文件出现,或者文件的修改时间发生变化,则更新self.previous_work_dir_files中 - # 把新文件和发生变化的文件的路径记录到 change_list 中 - for root, dirs, files in os.walk(path_to_overwatch): - for file in files: - file_path = os.path.join(root, file) - if file_path not in self.previous_work_dir_files.keys(): - last_modified_time = os.stat(file_path).st_mtime - self.previous_work_dir_files.update({file_path: last_modified_time}) - change_list.append(file_path) - else: - last_modified_time = os.stat(file_path).st_mtime - if last_modified_time != self.previous_work_dir_files[file_path]: - self.previous_work_dir_files[file_path] = last_modified_time - change_list.append(file_path) - if len(change_list) > 0: - file_links = "" - for f in change_list: - res = promote_file_to_downloadzone(f) - file_links += f'
<br/>
{res}' - yield from self.immediate_showoff_when_possible(f) - - self.chatbot.append(['检测到新生文档.', f'文档清单如下: {file_links}']) - yield from update_ui(chatbot=self.chatbot, history=self.history) - return change_list - - - def main_process_ui_control(self, txt, create_or_resume) -> str: - # ⭐ 主进程 - if create_or_resume == 'create': - self.cnt = 1 - self.parent_conn = self.launch_subprocess_with_pipe() # ⭐⭐⭐ - repeated, cmd_to_autogen = self.send_command(txt) - if txt == 'exit': - self.chatbot.append([f"结束", "结束信号已明确,终止AutoGen程序。"]) - yield from update_ui(chatbot=self.chatbot, history=self.history) - self.terminate() - return "terminate" - - # patience = 10 - - while True: - time.sleep(0.5) - if not self.alive: - # the heartbeat watchdog might have it killed - self.terminate() - return "terminate" - if self.parent_conn.poll(): - self.feed_heartbeat_watchdog() - if "[GPT-Academic] 等待中" in self.chatbot[-1][-1]: - self.chatbot.pop(-1) # remove the last line - if "等待您的进一步指令" in self.chatbot[-1][-1]: - self.chatbot.pop(-1) # remove the last line - if '[GPT-Academic] 等待中' in self.chatbot[-1][-1]: - self.chatbot.pop(-1) # remove the last line - msg = self.parent_conn.recv() # PipeCom - if msg.cmd == "done": - self.chatbot.append([f"结束", msg.content]) - self.cnt += 1 - yield from update_ui(chatbot=self.chatbot, history=self.history) - self.terminate() - break - if msg.cmd == "show": - yield from self.overwatch_workdir_file_change() - notice = "" - if repeated: notice = "(自动忽略重复的输入)" - self.chatbot.append([f"运行阶段-{self.cnt}(上次用户反馈输入为: 「{cmd_to_autogen}」{notice}", msg.content]) - self.cnt += 1 - yield from update_ui(chatbot=self.chatbot, history=self.history) - if msg.cmd == "interact": - yield from self.overwatch_workdir_file_change() - self.chatbot.append([f"程序抵达用户反馈节点.", msg.content + - "\n\n等待您的进一步指令." + - "\n\n(1) 一般情况下您不需要说什么, 清空输入区, 然后直接点击“提交”以继续. " + - "\n\n(2) 如果您需要补充些什么, 输入要反馈的内容, 直接点击“提交”以继续. " + - "\n\n(3) 如果您想终止程序, 输入exit, 直接点击“提交”以终止AutoGen并解锁. 
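`overwatch_workdir_file_change` above spots new output files by snapshotting modification times. A minimal standalone version of the mtime-diff idea, where the snapshot dict plays the role of `self.previous_work_dir_files`:

```python
import os

def detect_changes(root, snapshot):
    # Walk `root`, compare each file's mtime against `snapshot`
    # (path -> mtime), update the snapshot, and return new or modified paths.
    changed = []
    for dirpath, _, files in os.walk(root):
        for name in files:
            path = os.path.join(dirpath, name)
            mtime = os.stat(path).st_mtime
            if snapshot.get(path) != mtime:
                snapshot[path] = mtime
                changed.append(path)
    return changed
```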
" - ]) - yield from update_ui(chatbot=self.chatbot, history=self.history) - # do not terminate here, leave the subprocess_worker instance alive - return "wait_feedback" - else: - self.feed_heartbeat_watchdog() - if '[GPT-Academic] 等待中' not in self.chatbot[-1][-1]: - # begin_waiting_time = time.time() - self.chatbot.append(["[GPT-Academic] 等待AutoGen执行结果 ...", "[GPT-Academic] 等待中"]) - self.chatbot[-1] = [self.chatbot[-1][0], self.chatbot[-1][1].replace("[GPT-Academic] 等待中", "[GPT-Academic] 等待中.")] - yield from update_ui(chatbot=self.chatbot, history=self.history) - # if time.time() - begin_waiting_time > patience: - # self.chatbot.append([f"结束", "等待超时, 终止AutoGen程序。"]) - # yield from update_ui(chatbot=self.chatbot, history=self.history) - # self.terminate() - # return "terminate" - - self.terminate() - return "terminate" - - def subprocess_worker_wait_user_feedback(self, wait_msg="wait user feedback"): - # ⭐⭐ run in subprocess - patience = 5 * 60 - begin_waiting_time = time.time() - self.child_conn.send(PipeCom("interact", wait_msg)) - while True: - time.sleep(0.5) - if self.child_conn.poll(): - wait_success = True - break - if time.time() - begin_waiting_time > patience: - self.child_conn.send(PipeCom("done", "")) - wait_success = False - break - return wait_success diff --git a/crazy_functions/agent_fns/watchdog.py b/crazy_functions/agent_fns/watchdog.py deleted file mode 100644 index 2a2bdfab95097d6c4ad36329ab1fa02dd2ebe868..0000000000000000000000000000000000000000 --- a/crazy_functions/agent_fns/watchdog.py +++ /dev/null @@ -1,28 +0,0 @@ -import threading, time - -class WatchDog(): - def __init__(self, timeout, bark_fn, interval=3, msg="") -> None: - self.last_feed = None - self.timeout = timeout - self.bark_fn = bark_fn - self.interval = interval - self.msg = msg - self.kill_dog = False - - def watch(self): - while True: - if self.kill_dog: break - if time.time() - self.last_feed > self.timeout: - if len(self.msg) > 0: print(self.msg) - self.bark_fn() - break - time.sleep(self.interval) - - def begin_watch(self): - self.last_feed = time.time() - th = threading.Thread(target=self.watch) - th.daemon = True - th.start() - - def feed(self): - self.last_feed = time.time() diff --git "a/crazy_functions/chatglm\345\276\256\350\260\203\345\267\245\345\205\267.py" "b/crazy_functions/chatglm\345\276\256\350\260\203\345\267\245\345\205\267.py" deleted file mode 100644 index 336d7cfc85ac159841758123fa057bd20a0bbbec..0000000000000000000000000000000000000000 --- "a/crazy_functions/chatglm\345\276\256\350\260\203\345\267\245\345\205\267.py" +++ /dev/null @@ -1,141 +0,0 @@ -from toolbox import CatchException, update_ui, promote_file_to_downloadzone -from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency -import datetime, json - -def fetch_items(list_of_items, batch_size): - for i in range(0, len(list_of_items), batch_size): - yield list_of_items[i:i + batch_size] - -def string_to_options(arguments): - import argparse - import shlex - - # Create an argparse.ArgumentParser instance - parser = argparse.ArgumentParser() - - # Add command-line arguments - parser.add_argument("--llm_to_learn", type=str, help="LLM model to learn", default="gpt-3.5-turbo") - parser.add_argument("--prompt_prefix", type=str, help="Prompt prefix", default='') - parser.add_argument("--system_prompt", type=str, help="System prompt", default='') - parser.add_argument("--batch", type=int, help="System prompt", default=50) - parser.add_argument("--pre_seq_len", type=int, help="pre_seq_len", 
default=50) - parser.add_argument("--learning_rate", type=float, help="learning_rate", default=2e-2) - parser.add_argument("--num_gpus", type=int, help="num_gpus", default=1) - parser.add_argument("--json_dataset", type=str, help="json_dataset", default="") - parser.add_argument("--ptuning_directory", type=str, help="ptuning_directory", default="") - - - - # Parse the arguments - args = parser.parse_args(shlex.split(arguments)) - - return args - -@CatchException -def 微调数据集生成(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - """ - txt 输入栏用户输入的文本,例如需要翻译的一段话,再例如一个包含了待处理文件的路径 - llm_kwargs gpt模型参数,如温度和top_p等,一般原样传递下去就行 - plugin_kwargs 插件模型的参数 - chatbot 聊天显示框的句柄,用于显示给用户 - history 聊天历史,前情提要 - system_prompt 给gpt的静默提醒 - web_port 当前软件运行的端口号 - """ - history = [] # 清空历史,以免输入溢出 - chatbot.append(("这是什么功能?", "[Local Message] 微调数据集生成")) - if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg") - args = plugin_kwargs.get("advanced_arg", None) - if args is None: - chatbot.append(("没给定指令", "退出")) - yield from update_ui(chatbot=chatbot, history=history); return - else: - arguments = string_to_options(arguments=args) - - dat = [] - with open(txt, 'r', encoding='utf8') as f: - for line in f.readlines(): - json_dat = json.loads(line) - dat.append(json_dat["content"]) - - llm_kwargs['llm_model'] = arguments.llm_to_learn - for batch in fetch_items(dat, arguments.batch): - res = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( - inputs_array=[f"{arguments.prompt_prefix}\n\n{b}" for b in (batch)], - inputs_show_user_array=[f"Show Nothing" for _ in (batch)], - llm_kwargs=llm_kwargs, - chatbot=chatbot, - history_array=[[] for _ in (batch)], - sys_prompt_array=[arguments.system_prompt for _ in (batch)], - max_workers=10 # OpenAI所允许的最大并行过载 - ) - - with open(txt+'.generated.json', 'a+', encoding='utf8') as f: - for b, r in zip(batch, res[1::2]): - f.write(json.dumps({"content":b, "summary":r}, ensure_ascii=False)+'\n') - - promote_file_to_downloadzone(txt+'.generated.json', rename_file='generated.json', chatbot=chatbot) - return - - - -@CatchException -def 启动微调(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - """ - txt 输入栏用户输入的文本,例如需要翻译的一段话,再例如一个包含了待处理文件的路径 - llm_kwargs gpt模型参数,如温度和top_p等,一般原样传递下去就行 - plugin_kwargs 插件模型的参数 - chatbot 聊天显示框的句柄,用于显示给用户 - history 聊天历史,前情提要 - system_prompt 给gpt的静默提醒 - web_port 当前软件运行的端口号 - """ - import subprocess - history = [] # 清空历史,以免输入溢出 - chatbot.append(("这是什么功能?", "[Local Message] 微调数据集生成")) - if ("advanced_arg" in plugin_kwargs) and (plugin_kwargs["advanced_arg"] == ""): plugin_kwargs.pop("advanced_arg") - args = plugin_kwargs.get("advanced_arg", None) - if args is None: - chatbot.append(("没给定指令", "退出")) - yield from update_ui(chatbot=chatbot, history=history); return - else: - arguments = string_to_options(arguments=args) - - - - pre_seq_len = arguments.pre_seq_len # 128 - learning_rate = arguments.learning_rate # 2e-2 - num_gpus = arguments.num_gpus # 1 - json_dataset = arguments.json_dataset # 't_code.json' - ptuning_directory = arguments.ptuning_directory # '/home/hmp/ChatGLM2-6B/ptuning' - - command = f"torchrun --standalone --nnodes=1 --nproc-per-node={num_gpus} main.py \ - --do_train \ - --train_file AdvertiseGen/{json_dataset} \ - --validation_file AdvertiseGen/{json_dataset} \ - --preprocessing_num_workers 20 \ - --prompt_column content \ - --response_column summary \ - --overwrite_cache \ - --model_name_or_path 
THUDM/chatglm2-6b \ - --output_dir output/clothgen-chatglm2-6b-pt-{pre_seq_len}-{learning_rate} \ - --overwrite_output_dir \ - --max_source_length 256 \ - --max_target_length 256 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 16 \ - --predict_with_generate \ - --max_steps 100 \ - --logging_steps 10 \ - --save_steps 20 \ - --learning_rate {learning_rate} \ - --pre_seq_len {pre_seq_len} \ - --quantization_bit 4" - - process = subprocess.Popen(command, shell=True, cwd=ptuning_directory) - try: - process.communicate(timeout=3600*24) - except subprocess.TimeoutExpired: - process.kill() - return diff --git a/crazy_functions/crazy_functions_test.py b/crazy_functions/crazy_functions_test.py deleted file mode 100644 index 0c623b8e027858b2579a021769bb304e34c4e373..0000000000000000000000000000000000000000 --- a/crazy_functions/crazy_functions_test.py +++ /dev/null @@ -1,231 +0,0 @@ -""" -这是什么? - 这个文件用于函数插件的单元测试 - 运行方法 python crazy_functions/crazy_functions_test.py -""" - -# ============================================================================================================================== - -def validate_path(): - import os, sys - dir_name = os.path.dirname(__file__) - root_dir_assume = os.path.abspath(os.path.dirname(__file__) + '/..') - os.chdir(root_dir_assume) - sys.path.append(root_dir_assume) -validate_path() # validate path so you can run from base directory - -# ============================================================================================================================== - -from colorful import * -from toolbox import get_conf, ChatBotWithCookies -import contextlib -import os -import sys -from functools import wraps -proxies, WEB_PORT, LLM_MODEL, CONCURRENT_COUNT, AUTHENTICATION, CHATBOT_HEIGHT, LAYOUT, API_KEY = \ - get_conf('proxies', 'WEB_PORT', 'LLM_MODEL', 'CONCURRENT_COUNT', 'AUTHENTICATION', 'CHATBOT_HEIGHT', 'LAYOUT', 'API_KEY') - -llm_kwargs = { - 'api_key': API_KEY, - 'llm_model': LLM_MODEL, - 'top_p':1.0, - 'max_length': None, - 'temperature':1.0, -} -plugin_kwargs = { } -chatbot = ChatBotWithCookies(llm_kwargs) -history = [] -system_prompt = "Serve me as a writing and programming assistant." 
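The test functions that follow all share one driver convention: every plugin is a generator that yields a `(cookies, chatbot, history, msg)` tuple whenever the UI should refresh, so a unit test simply drains the generator in a loop. A minimal sketch of that driver, reusing the harness globals defined in this file (the helper name `drive_plugin` is illustrative, not part of the original test file):

```python
def drive_plugin(plugin_fn, txt):
    # Drain the plugin generator; every yield is one UI refresh, and the
    # chatbot list always carries the newest (question, answer) pair last.
    for cookies, cb, hist, msg in plugin_fn(txt, llm_kwargs, plugin_kwargs,
                                            chatbot, history, system_prompt,
                                            web_port):
        print(cb[-1])

# Usage, mirroring the tests below:
#   drive_plugin(解析一个Python项目, "crazy_functions/test_project/python/dqn")
```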
-web_port = 1024 - -# ============================================================================================================================== - -def silence_stdout(func): - @wraps(func) - def wrapper(*args, **kwargs): - # Mute the wrapped generator: stdout points at os.devnull while the plugin - # runs, and is restored around every yield so the caller can still print. - # The devnull handle is opened once and reused to avoid leaking handles. - _original_stdout = sys.stdout - devnull = open(os.devnull, 'w') - sys.stdout = devnull - for q in func(*args, **kwargs): - sys.stdout = _original_stdout - yield q - sys.stdout = devnull - devnull.close() - sys.stdout = _original_stdout - return wrapper - -class CLI_Printer(): - def __init__(self) -> None: - self.pre_buf = "" - - def print(self, buf): - # Incremental rendering: if the new buffer extends the previous one, - # print only the delta; otherwise reprint the whole conversation. - bufp = "" - for index, chat in enumerate(buf): - a, b = chat - bufp += sprint亮靛('[Me]:' + a) + '\n' - bufp += '[GPT]:' + b - if index < len(buf)-1: - bufp += '\n' - - if self.pre_buf!="" and bufp.startswith(self.pre_buf): - print(bufp[len(self.pre_buf):], end='') - else: - print('\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n'+bufp, end='') - self.pre_buf = bufp - return - -cli_printer = CLI_Printer() -# ============================================================================================================================== -def test_解析一个Python项目(): - from crazy_functions.解析项目源代码 import 解析一个Python项目 - txt = "crazy_functions/test_project/python/dqn" - for cookies, cb, hist, msg in 解析一个Python项目(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - print(cb) - -def test_解析一个Cpp项目(): - from crazy_functions.解析项目源代码 import 解析一个C项目 - txt = "crazy_functions/test_project/cpp/cppipc" - for cookies, cb, hist, msg in 解析一个C项目(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - print(cb) - -def test_Latex英文润色(): - from crazy_functions.Latex全文润色 import Latex英文润色 - txt = "crazy_functions/test_project/latex/attention" - for cookies, cb, hist, msg in Latex英文润色(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - print(cb) - -def test_Markdown中译英(): - from crazy_functions.批量Markdown翻译 import Markdown中译英 - txt = "README.md" - for cookies, cb, hist, msg in Markdown中译英(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - print(cb) - -def test_批量翻译PDF文档(): - from crazy_functions.批量翻译PDF文档_多线程 import 批量翻译PDF文档 - txt = "crazy_functions/test_project/pdf_and_word" - for cookies, cb, hist, msg in 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - print(cb) - -def test_谷歌检索小助手(): - from crazy_functions.谷歌检索小助手 import 谷歌检索小助手 - txt = "https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=auto+reinforcement+learning&btnG=" - for cookies, cb, hist, msg in 谷歌检索小助手(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - print(cb) - -def test_总结word文档(): - from crazy_functions.总结word文档 import 总结word文档 - txt = "crazy_functions/test_project/pdf_and_word" - for cookies, cb, hist, msg in 总结word文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - print(cb) - -def test_下载arxiv论文并翻译摘要(): - from crazy_functions.下载arxiv论文翻译摘要 import 下载arxiv论文并翻译摘要 - txt = "1812.10695" - for cookies, cb, hist, msg in 下载arxiv论文并翻译摘要(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - print(cb) - -def test_联网回答问题(): - from crazy_functions.联网的ChatGPT import 连接网络回答问题 - # txt = "谁是应急食品?" - # >> '根据以上搜索结果可以得知,应急食品是“原神”游戏中的角色派蒙的外号。' - # txt = "道路千万条,安全第一条。后面两句是?" - # >> '行车不规范,亲人两行泪。' - # txt = "You should have gone for the head. What does that mean?" - # >> The phrase "You should have gone for the head" is a quote from the Marvel movies, Avengers: Infinity War and Avengers: Endgame. 
It was spoken by the character Thanos in Infinity War and by Thor in Endgame. - txt = "AutoGPT是什么?" - for cookies, cb, hist, msg in 连接网络回答问题(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - print("当前问答:", cb[-1][-1].replace("\n"," ")) - for i, it in enumerate(cb): print亮蓝(it[0]); print亮黄(it[1]) - -def test_解析ipynb文件(): - from crazy_functions.解析JupyterNotebook import 解析ipynb文件 - txt = "crazy_functions/test_samples" - for cookies, cb, hist, msg in 解析ipynb文件(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - print(cb) - - -def test_数学动画生成manim(): - from crazy_functions.数学动画生成manim import 动画生成 - txt = "A ball split into 2, and then split into 4, and finally split into 8." - for cookies, cb, hist, msg in 动画生成(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - print(cb) - - - -def test_Markdown多语言(): - from crazy_functions.批量Markdown翻译 import Markdown翻译指定语言 - txt = "README.md" - history = [] - for lang in ["English", "French", "Japanese", "Korean", "Russian", "Italian", "German", "Portuguese", "Arabic"]: - plugin_kwargs = {"advanced_arg": lang} - for cookies, cb, hist, msg in Markdown翻译指定语言(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - print(cb) - -def test_Langchain知识库(): - from crazy_functions.Langchain知识库 import 知识库问答 - txt = "./" - chatbot = ChatBotWithCookies(llm_kwargs) - for cookies, cb, hist, msg in silence_stdout(知识库问答)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - cli_printer.print(cb) # print(cb) - - chatbot = ChatBotWithCookies(cookies) - from crazy_functions.Langchain知识库 import 读取知识库作答 - txt = "What is the installation method?" - for cookies, cb, hist, msg in silence_stdout(读取知识库作答)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - cli_printer.print(cb) # print(cb) - -def test_Langchain知识库读取(): - from crazy_functions.Langchain知识库 import 读取知识库作答 - txt = "远程云服务器部署?" 
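# The call below wraps the plugin in silence_stdout (defined at the top of this
# file) so that its console spam is swallowed by os.devnull while cli_printer
# keeps the real terminal. A leaner equivalent built on contextlib.redirect_stdout,
# sketched here as an assumption rather than code from the original file:
#
#     import contextlib, os
#     def silenced(gen_fn, *args):
#         with open(os.devnull, 'w') as devnull:
#             gen = gen_fn(*args)
#             while True:
#                 with contextlib.redirect_stdout(devnull):
#                     try:
#                         item = next(gen)      # plugin runs muted
#                     except StopIteration:
#                         break
#                 yield item                    # caller prints normally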
- for cookies, cb, hist, msg in silence_stdout(读取知识库作答)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - cli_printer.print(cb) # print(cb) - -def test_Latex(): - from crazy_functions.Latex输出PDF结果 import Latex英文纠错加PDF对比, Latex翻译中文并重新编译PDF - - # txt = r"https://arxiv.org/abs/1706.03762" - # txt = r"https://arxiv.org/abs/1902.03185" - # txt = r"https://arxiv.org/abs/2305.18290" - # txt = r"https://arxiv.org/abs/2305.17608" - # txt = r"https://arxiv.org/abs/2211.16068" # ACE - # txt = r"C:\Users\x\arxiv_cache\2211.16068\workfolder" # ACE - # txt = r"https://arxiv.org/abs/2002.09253" - # txt = r"https://arxiv.org/abs/2306.07831" - # txt = r"https://arxiv.org/abs/2212.10156" - # txt = r"https://arxiv.org/abs/2211.11559" - # txt = r"https://arxiv.org/abs/2303.08774" - txt = r"https://arxiv.org/abs/2303.12712" - # txt = r"C:\Users\fuqingxu\arxiv_cache\2303.12712\workfolder" - - - for cookies, cb, hist, msg in (Latex翻译中文并重新编译PDF)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - cli_printer.print(cb) # print(cb) - - - - # txt = "2302.02948.tar" - # print(txt) - # main_tex, work_folder = Latex预处理(txt) - # print('main tex:', main_tex) - # res = 编译Latex(main_tex, work_folder) - # # for cookies, cb, hist, msg in silence_stdout(编译Latex)(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port): - # cli_printer.print(cb) # print(cb) - - - -# test_解析一个Python项目() -# test_Latex英文润色() -# test_Markdown中译英() -# test_批量翻译PDF文档() -# test_谷歌检索小助手() -# test_总结word文档() -# test_下载arxiv论文并翻译摘要() -# test_解析一个Cpp项目() -# test_联网回答问题() -# test_解析ipynb文件() -# test_数学动画生成manim() -# test_Langchain知识库() -# test_Langchain知识库读取() -if __name__ == "__main__": - test_Latex() - input("程序完成,回车退出。") - print("退出。") \ No newline at end of file diff --git a/crazy_functions/crazy_utils.py b/crazy_functions/crazy_utils.py deleted file mode 100644 index 4d3b1953d424c8d0c9ba01882b55c2fe0ee18941..0000000000000000000000000000000000000000 --- a/crazy_functions/crazy_utils.py +++ /dev/null @@ -1,606 +0,0 @@ -from toolbox import update_ui, get_conf, trimmed_format_exc, get_max_token, Singleton -import threading -import os -import logging - -def input_clipping(inputs, history, max_token_limit): - import numpy as np - from request_llms.bridge_all import model_info - enc = model_info["gpt-3.5-turbo"]['tokenizer'] - def get_token_num(txt): return len(enc.encode(txt, disallowed_special=())) - - mode = 'input-and-history' - # 当 输入部分的token占比 小于 全文的一半时,只裁剪历史 - input_token_num = get_token_num(inputs) - if input_token_num < max_token_limit//2: - mode = 'only-history' - max_token_limit = max_token_limit - input_token_num - - everything = [inputs] if mode == 'input-and-history' else [''] - everything.extend(history) - n_token = get_token_num('\n'.join(everything)) - everything_token = [get_token_num(e) for e in everything] - delta = max(everything_token) // 16 # 截断时的颗粒度 - - while n_token > max_token_limit: - where = np.argmax(everything_token) - encoded = enc.encode(everything[where], disallowed_special=()) - clipped_encoded = encoded[:len(encoded)-delta] - everything[where] = enc.decode(clipped_encoded)[:-1] # -1 to remove the may-be illegal char - everything_token[where] = get_token_num(everything[where]) - n_token = get_token_num('\n'.join(everything)) - - if mode == 'input-and-history': - inputs = everything[0] - else: - pass - history = everything[1:] - return inputs, history - -def request_gpt_model_in_new_thread_with_ui_alive( - inputs, inputs_show_user, llm_kwargs, - chatbot, 
history, sys_prompt, refresh_interval=0.2, - handle_token_exceed=True, - retry_times_at_unknown_error=2, - ): - """ - Request GPT model,请求GPT模型同时维持用户界面活跃。 - - 输入参数 Args (以_array结尾的输入变量都是列表,列表长度为子任务的数量,执行时,会把列表拆解,放到每个子线程中分别执行): - inputs (string): List of inputs (输入) - inputs_show_user (string): List of inputs to show user(展现在报告中的输入,借助此参数,在汇总报告中隐藏啰嗦的真实输入,增强报告的可读性) - top_p (float): Top p value for sampling from model distribution (GPT参数,浮点数) - temperature (float): Temperature value for sampling from model distribution(GPT参数,浮点数) - chatbot: chatbot inputs and outputs (用户界面对话窗口句柄,用于数据流可视化) - history (list): List of chat history (历史,对话历史列表) - sys_prompt (string): List of system prompts (系统输入,列表,用于输入给GPT的前提提示,比如你是翻译官怎样怎样) - refresh_interval (float, optional): Refresh interval for UI (default: 0.2) (刷新时间间隔频率,建议低于1,不可高于3,仅仅服务于视觉效果) - handle_token_exceed:是否自动处理token溢出的情况,如果选择自动处理,则会在溢出时暴力截断,默认开启 - retry_times_at_unknown_error:失败时的重试次数 - - 输出 Returns: - future: 输出,GPT返回的结果 - """ - import time - from concurrent.futures import ThreadPoolExecutor - from request_llms.bridge_all import predict_no_ui_long_connection - # 用户反馈 - chatbot.append([inputs_show_user, ""]) - yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面 - executor = ThreadPoolExecutor(max_workers=16) - mutable = ["", time.time(), ""] - # 看门狗耐心 - watch_dog_patience = 5 - # 请求任务 - def _req_gpt(inputs, history, sys_prompt): - retry_op = retry_times_at_unknown_error - exceeded_cnt = 0 - while True: - # watchdog error - if len(mutable) >= 2 and (time.time()-mutable[1]) > watch_dog_patience: - raise RuntimeError("检测到程序终止。") - try: - # 【第一种情况】:顺利完成 - result = predict_no_ui_long_connection( - inputs=inputs, llm_kwargs=llm_kwargs, - history=history, sys_prompt=sys_prompt, observe_window=mutable) - return result - except ConnectionAbortedError as token_exceeded_error: - # 【第二种情况】:Token溢出 - if handle_token_exceed: - exceeded_cnt += 1 - # 【选择处理】 尝试计算比例,尽可能多地保留文本 - from toolbox import get_reduce_token_percent - p_ratio, n_exceed = get_reduce_token_percent(str(token_exceeded_error)) - MAX_TOKEN = get_max_token(llm_kwargs) - EXCEED_ALLO = 512 + 512 * exceeded_cnt - inputs, history = input_clipping(inputs, history, max_token_limit=MAX_TOKEN-EXCEED_ALLO) - mutable[0] += f'[Local Message] 警告,文本过长将进行截断,Token溢出数:{n_exceed}。\n\n' - continue # 返回重试 - else: - # 【选择放弃】 - tb_str = '```\n' + trimmed_format_exc() + '```' - mutable[0] += f"[Local Message] 警告,在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n" - return mutable[0] # 放弃 - except: - # 【第三种情况】:其他错误:重试几次 - tb_str = '```\n' + trimmed_format_exc() + '```' - print(tb_str) - mutable[0] += f"[Local Message] 警告,在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n" - if retry_op > 0: - retry_op -= 1 - mutable[0] += f"[Local Message] 重试中,请稍等 {retry_times_at_unknown_error-retry_op}/{retry_times_at_unknown_error}:\n\n" - if ("Rate limit reached" in tb_str) or ("Too Many Requests" in tb_str): - time.sleep(30) - time.sleep(5) - continue # 返回重试 - else: - time.sleep(5) - return mutable[0] # 放弃 - - # 提交任务 - future = executor.submit(_req_gpt, inputs, history, sys_prompt) - while True: - # yield一次以刷新前端页面 - time.sleep(refresh_interval) - # “喂狗”(看门狗) - mutable[1] = time.time() - if future.done(): - break - chatbot[-1] = [chatbot[-1][0], mutable[0]] - yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面 - - final_result = future.result() - chatbot[-1] = [chatbot[-1][0], final_result] - yield from update_ui(chatbot=chatbot, history=[]) # 如果最后成功了,则删除报错信息 - return final_result - -def can_multi_process(llm): - if llm.startswith('gpt-'): return True 
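# Note: str.startswith also accepts a tuple of prefixes, so this whitelist could
# be collapsed into a single check, e.g.
#     return llm.startswith(('gpt-', 'api2d-', 'azure-', 'spark', 'zhipuai'))
# The one-check-per-line form used here keeps new model prefixes easy to add.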
- if llm.startswith('api2d-'): return True - if llm.startswith('azure-'): return True - if llm.startswith('spark'): return True - if llm.startswith('zhipuai'): return True - return False - -def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( - inputs_array, inputs_show_user_array, llm_kwargs, - chatbot, history_array, sys_prompt_array, - refresh_interval=0.2, max_workers=-1, scroller_max_len=30, - handle_token_exceed=True, show_user_at_complete=False, - retry_times_at_unknown_error=2, - ): - """ - Request GPT model using multiple threads with UI and high efficiency - 请求GPT模型的[多线程]版。 - 具备以下功能: - 实时在UI上反馈远程数据流 - 使用线程池,可调节线程池的大小避免openai的流量限制错误 - 处理中途中止的情况 - 网络等出问题时,会把traceback和已经接收的数据转入输出 - - 输入参数 Args (以_array结尾的输入变量都是列表,列表长度为子任务的数量,执行时,会把列表拆解,放到每个子线程中分别执行): - inputs_array (list): List of inputs (每个子任务的输入) - inputs_show_user_array (list): List of inputs to show user(每个子任务展现在报告中的输入,借助此参数,在汇总报告中隐藏啰嗦的真实输入,增强报告的可读性) - llm_kwargs: llm_kwargs参数 - chatbot: chatbot (用户界面对话窗口句柄,用于数据流可视化) - history_array (list): List of chat history (历史对话输入,双层列表,第一层列表是子任务分解,第二层列表是对话历史) - sys_prompt_array (list): List of system prompts (系统输入,列表,用于输入给GPT的前提提示,比如你是翻译官怎样怎样) - refresh_interval (float, optional): Refresh interval for UI (default: 0.2) (刷新时间间隔频率,建议低于1,不可高于3,仅仅服务于视觉效果) - max_workers (int, optional): Maximum number of threads (default: see config.py) (最大线程数,如果子任务非常多,需要用此选项防止高频地请求openai导致错误) - scroller_max_len (int, optional): Maximum length for scroller (default: 30)(数据流的显示最后收到的多少个字符,仅仅服务于视觉效果) - handle_token_exceed (bool, optional): (是否在输入过长时,自动缩减文本) - handle_token_exceed:是否自动处理token溢出的情况,如果选择自动处理,则会在溢出时暴力截断,默认开启 - show_user_at_complete (bool, optional): (在结束时,把完整输入-输出结果显示在聊天框) - retry_times_at_unknown_error:子任务失败时的重试次数 - - 输出 Returns: - list: List of GPT model responses (每个子任务的输出汇总,如果某个子任务出错,response中会携带traceback报错信息,方便调试和定位问题。) - """ - import time, random - from concurrent.futures import ThreadPoolExecutor - from request_llms.bridge_all import predict_no_ui_long_connection - assert len(inputs_array) == len(history_array) - assert len(inputs_array) == len(sys_prompt_array) - if max_workers == -1: # 读取配置文件 - try: max_workers = get_conf('DEFAULT_WORKER_NUM') - except: max_workers = 8 - if max_workers <= 0: max_workers = 3 - # 屏蔽掉 chatglm的多线程,可能会导致严重卡顿 - if not can_multi_process(llm_kwargs['llm_model']): - max_workers = 1 - - executor = ThreadPoolExecutor(max_workers=max_workers) - n_frag = len(inputs_array) - # 用户反馈 - chatbot.append(["请开始多线程操作。", ""]) - yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面 - # 跨线程传递 - mutable = [["", time.time(), "等待中"] for _ in range(n_frag)] - - # 看门狗耐心 - watch_dog_patience = 5 - - # 子线程任务 - def _req_gpt(index, inputs, history, sys_prompt): - gpt_say = "" - retry_op = retry_times_at_unknown_error - exceeded_cnt = 0 - mutable[index][2] = "执行中" - detect_timeout = lambda: len(mutable[index]) >= 2 and (time.time()-mutable[index][1]) > watch_dog_patience - while True: - # watchdog error - if detect_timeout(): raise RuntimeError("检测到程序终止。") - try: - # 【第一种情况】:顺利完成 - gpt_say = predict_no_ui_long_connection( - inputs=inputs, llm_kwargs=llm_kwargs, history=history, - sys_prompt=sys_prompt, observe_window=mutable[index], console_slience=True - ) - mutable[index][2] = "已成功" - return gpt_say - except ConnectionAbortedError as token_exceeded_error: - # 【第二种情况】:Token溢出 - if handle_token_exceed: - exceeded_cnt += 1 - # 【选择处理】 尝试计算比例,尽可能多地保留文本 - from toolbox import get_reduce_token_percent - p_ratio, n_exceed = get_reduce_token_percent(str(token_exceeded_error)) 
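# The lines below shrink the token budget before retrying: get_reduce_token_percent
# has just parsed the overflow count n_exceed out of the provider's error message,
# and EXCEED_ALLO widens the safety margin by an extra 512 tokens per failed
# attempt. Worked example (numbers assumed for illustration only): with
# MAX_TOKEN = 4096 and a second overflow (exceeded_cnt = 2), input_clipping is
# asked to fit inputs plus history into 4096 - (512 + 512*2) = 2560 tokens.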
- MAX_TOKEN = get_max_token(llm_kwargs) - EXCEED_ALLO = 512 + 512 * exceeded_cnt - inputs, history = input_clipping(inputs, history, max_token_limit=MAX_TOKEN-EXCEED_ALLO) - gpt_say += f'[Local Message] 警告,文本过长将进行截断,Token溢出数:{n_exceed}。\n\n' - mutable[index][2] = f"截断重试" - continue # 返回重试 - else: - # 【选择放弃】 - tb_str = '```\n' + trimmed_format_exc() + '```' - gpt_say += f"[Local Message] 警告,线程{index}在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n" - if len(mutable[index][0]) > 0: gpt_say += "此线程失败前收到的回答:\n\n" + mutable[index][0] - mutable[index][2] = "输入过长已放弃" - return gpt_say # 放弃 - except: - # 【第三种情况】:其他错误 - if detect_timeout(): raise RuntimeError("检测到程序终止。") - tb_str = '```\n' + trimmed_format_exc() + '```' - print(tb_str) - gpt_say += f"[Local Message] 警告,线程{index}在执行过程中遭遇问题, Traceback:\n\n{tb_str}\n\n" - if len(mutable[index][0]) > 0: gpt_say += "此线程失败前收到的回答:\n\n" + mutable[index][0] - if retry_op > 0: - retry_op -= 1 - wait = random.randint(5, 20) - if ("Rate limit reached" in tb_str) or ("Too Many Requests" in tb_str): - wait = wait * 3 - fail_info = "OpenAI绑定信用卡可解除频率限制 " - else: - fail_info = "" - # 也许等待十几秒后,情况会好转 - for i in range(wait): - mutable[index][2] = f"{fail_info}等待重试 {wait-i}"; time.sleep(1) - # 开始重试 - if detect_timeout(): raise RuntimeError("检测到程序终止。") - mutable[index][2] = f"重试中 {retry_times_at_unknown_error-retry_op}/{retry_times_at_unknown_error}" - continue # 返回重试 - else: - mutable[index][2] = "已失败" - wait = 5 - time.sleep(5) - return gpt_say # 放弃 - - # 异步任务开始 - futures = [executor.submit(_req_gpt, index, inputs, history, sys_prompt) for index, inputs, history, sys_prompt in zip( - range(len(inputs_array)), inputs_array, history_array, sys_prompt_array)] - cnt = 0 - while True: - # yield一次以刷新前端页面 - time.sleep(refresh_interval) - cnt += 1 - worker_done = [h.done() for h in futures] - # 更好的UI视觉效果 - observe_win = [] - # 每个线程都要“喂狗”(看门狗) - for thread_index, _ in enumerate(worker_done): - mutable[thread_index][1] = time.time() - # 在前端打印些好玩的东西 - for thread_index, _ in enumerate(worker_done): - print_something_really_funny = "[ ...`"+mutable[thread_index][0][-scroller_max_len:].\ - replace('\n', '').replace('`', '.').replace( - ' ', '.').replace('
', '.....').replace('$', '.')+"`... ]" - observe_win.append(print_something_really_funny) - # 在前端打印些好玩的东西 - stat_str = ''.join([f'`{mutable[thread_index][2]}`: {obs}\n\n' - if not done else f'`{mutable[thread_index][2]}`\n\n' - for thread_index, done, obs in zip(range(len(worker_done)), worker_done, observe_win)]) - # 在前端打印些好玩的东西 - chatbot[-1] = [chatbot[-1][0], f'多线程操作已经开始,完成情况: \n\n{stat_str}' + ''.join(['.']*(cnt % 10+1))] - yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面 - if all(worker_done): - executor.shutdown() - break - - # 异步任务结束 - gpt_response_collection = [] - for inputs_show_user, f in zip(inputs_show_user_array, futures): - gpt_res = f.result() - gpt_response_collection.extend([inputs_show_user, gpt_res]) - - # 是否在结束时,在界面上显示结果 - if show_user_at_complete: - for inputs_show_user, f in zip(inputs_show_user_array, futures): - gpt_res = f.result() - chatbot.append([inputs_show_user, gpt_res]) - yield from update_ui(chatbot=chatbot, history=[]) # 刷新界面 - time.sleep(0.5) - return gpt_response_collection - - - -def read_and_clean_pdf_text(fp): - """ - 这个函数用于分割pdf,用了很多trick,逻辑较乱,效果奇好 - - **输入参数说明** - - `fp`:需要读取和清理文本的pdf文件路径 - - **输出参数说明** - - `meta_txt`:清理后的文本内容字符串 - - `page_one_meta`:第一页清理后的文本内容列表 - - **函数功能** - 读取pdf文件并清理其中的文本内容,清理规则包括: - - 提取所有块元的文本信息,并合并为一个字符串 - - 去除短块(字符数小于100)并替换为回车符 - - 清理多余的空行 - - 合并小写字母开头的段落块并替换为空格 - - 清除重复的换行 - - 将每个换行符替换为两个换行符,使每个段落之间有两个换行符分隔 - """ - import fitz, copy - import re - import numpy as np - from colorful import print亮黄, print亮绿 - fc = 0 # Index 0 文本 - fs = 1 # Index 1 字体 - fb = 2 # Index 2 框框 - REMOVE_FOOT_NOTE = True # 是否丢弃掉 不是正文的内容 (比正文字体小,如参考文献、脚注、图注等) - REMOVE_FOOT_FFSIZE_PERCENT = 0.95 # 小于正文的?时,判定为不是正文(有些文章的正文部分字体大小不是100%统一的,有肉眼不可见的小变化) - def primary_ffsize(l): - """ - 提取文本块主字体 - """ - fsize_statiscs = {} - for wtf in l['spans']: - if wtf['size'] not in fsize_statiscs: fsize_statiscs[wtf['size']] = 0 - fsize_statiscs[wtf['size']] += len(wtf['text']) - return max(fsize_statiscs, key=fsize_statiscs.get) - - def ffsize_same(a,b): - """ - 提取字体大小是否近似相等 - """ - return abs((a-b)/max(a,b)) < 0.02 - - with fitz.open(fp) as doc: - meta_txt = [] - meta_font = [] - - meta_line = [] - meta_span = [] - ############################## <第 1 步,搜集初始信息> ################################## - for index, page in enumerate(doc): - # file_content += page.get_text() - text_areas = page.get_text("dict") # 获取页面上的文本信息 - for t in text_areas['blocks']: - if 'lines' in t: - pf = 998 - for l in t['lines']: - txt_line = "".join([wtf['text'] for wtf in l['spans']]) - if len(txt_line) == 0: continue - pf = primary_ffsize(l) - meta_line.append([txt_line, pf, l['bbox'], l]) - for wtf in l['spans']: # for l in t['lines']: - meta_span.append([wtf['text'], wtf['size'], len(wtf['text'])]) - # meta_line.append(["NEW_BLOCK", pf]) - # 块元提取 for each word segment with in line for each line cross-line words for each block - meta_txt.extend([" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace( - '- ', '') for t in text_areas['blocks'] if 'lines' in t]) - meta_font.extend([np.mean([np.mean([wtf['size'] for wtf in l['spans']]) - for l in t['lines']]) for t in text_areas['blocks'] if 'lines' in t]) - if index == 0: - page_one_meta = [" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace( - '- ', '') for t in text_areas['blocks'] if 'lines' in t] - - ############################## <第 2 步,获取正文主字体> ################################## - try: - fsize_statiscs = {} - for span in meta_span: - if span[1] not in fsize_statiscs: 
fsize_statiscs[span[1]] = 0 - fsize_statiscs[span[1]] += span[2] - main_fsize = max(fsize_statiscs, key=fsize_statiscs.get) - if REMOVE_FOOT_NOTE: - give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT - except: - raise RuntimeError(f'抱歉, 我们暂时无法解析此PDF文档: {fp}。') - ############################## <第 3 步,切分和重新整合> ################################## - mega_sec = [] - sec = [] - for index, line in enumerate(meta_line): - if index == 0: - sec.append(line[fc]) - continue - if REMOVE_FOOT_NOTE: - if meta_line[index][fs] <= give_up_fize_threshold: - continue - if ffsize_same(meta_line[index][fs], meta_line[index-1][fs]): - # 尝试识别段落 - if meta_line[index][fc].endswith('.') and\ - (meta_line[index-1][fc] != 'NEW_BLOCK') and \ - (meta_line[index][fb][2] - meta_line[index][fb][0]) < (meta_line[index-1][fb][2] - meta_line[index-1][fb][0]) * 0.7: - sec[-1] += line[fc] - sec[-1] += "\n\n" - else: - sec[-1] += " " - sec[-1] += line[fc] - else: - if (index+1 < len(meta_line)) and \ - meta_line[index][fs] > main_fsize: - # 单行 + 字体大 - mega_sec.append(copy.deepcopy(sec)) - sec = [] - sec.append("# " + line[fc]) - else: - # 尝试识别section - if meta_line[index-1][fs] > meta_line[index][fs]: - sec.append("\n" + line[fc]) - else: - sec.append(line[fc]) - mega_sec.append(copy.deepcopy(sec)) - - finals = [] - for ms in mega_sec: - final = " ".join(ms) - final = final.replace('- ', ' ') - finals.append(final) - meta_txt = finals - - ############################## <第 4 步,乱七八糟的后处理> ################################## - def 把字符太少的块清除为回车(meta_txt): - for index, block_txt in enumerate(meta_txt): - if len(block_txt) < 100: - meta_txt[index] = '\n' - return meta_txt - meta_txt = 把字符太少的块清除为回车(meta_txt) - - def 清理多余的空行(meta_txt): - for index in reversed(range(1, len(meta_txt))): - if meta_txt[index] == '\n' and meta_txt[index-1] == '\n': - meta_txt.pop(index) - return meta_txt - meta_txt = 清理多余的空行(meta_txt) - - def 合并小写开头的段落块(meta_txt): - def starts_with_lowercase_word(s): - pattern = r"^[a-z]+" - match = re.match(pattern, s) - if match: - return True - else: - return False - for _ in range(100): - for index, block_txt in enumerate(meta_txt): - if starts_with_lowercase_word(block_txt): - if meta_txt[index-1] != '\n': - meta_txt[index-1] += ' ' - else: - meta_txt[index-1] = '' - meta_txt[index-1] += meta_txt[index] - meta_txt[index] = '\n' - return meta_txt - meta_txt = 合并小写开头的段落块(meta_txt) - meta_txt = 清理多余的空行(meta_txt) - - meta_txt = '\n'.join(meta_txt) - # 清除重复的换行 - for _ in range(5): - meta_txt = meta_txt.replace('\n\n', '\n') - - # 换行 -> 双换行 - meta_txt = meta_txt.replace('\n', '\n\n') - - ############################## <第 5 步,展示分割效果> ################################## - # for f in finals: - # print亮黄(f) - # print亮绿('***************************') - - return meta_txt, page_one_meta - - -def get_files_from_everything(txt, type): # type='.md' - """ - 这个函数是用来获取指定目录下所有指定类型(如.md)的文件,并且对于网络上的文件,也可以获取它。 - 下面是对每个参数和返回值的说明: - 参数 - - txt: 路径或网址,表示要搜索的文件或者文件夹路径或网络上的文件。 - - type: 字符串,表示要搜索的文件类型。默认是.md。 - 返回值 - - success: 布尔值,表示函数是否成功执行。 - - file_manifest: 文件路径列表,里面包含以指定类型为后缀名的所有文件的绝对路径。 - - project_folder: 字符串,表示文件所在的文件夹路径。如果是网络上的文件,就是临时文件夹的路径。 - 该函数详细注释已添加,请确认是否满足您的需要。 - """ - import glob, os - - success = True - if txt.startswith('http'): - # 网络的远程文件 - import requests - from toolbox import get_conf - from toolbox import get_log_folder, gen_time_str - proxies = get_conf('proxies') - try: - r = requests.get(txt, proxies=proxies) - except: - raise ConnectionRefusedError(f"无法下载资源{txt},请检查。") - path = 
os.path.join(get_log_folder(plugin_name='web_download'), gen_time_str()+type) - with open(path, 'wb+') as f: f.write(r.content) - project_folder = get_log_folder(plugin_name='web_download') - file_manifest = [path] - elif txt.endswith(type): - # 直接给定文件 - file_manifest = [txt] - project_folder = os.path.dirname(txt) - elif os.path.exists(txt): - # 本地路径,递归搜索 - project_folder = txt - file_manifest = [f for f in glob.glob(f'{project_folder}/**/*'+type, recursive=True)] - if len(file_manifest) == 0: - success = False - else: - project_folder = None - file_manifest = [] - success = False - - return success, file_manifest, project_folder - - - -@Singleton -class nougat_interface(): - def __init__(self): - self.threadLock = threading.Lock() - - def nougat_with_timeout(self, command, cwd, timeout=3600): - import subprocess - from toolbox import ProxyNetworkActivate - logging.info(f'正在执行命令 {command}') - with ProxyNetworkActivate("Nougat_Download"): - process = subprocess.Popen(command, shell=True, cwd=cwd, env=os.environ) - try: - stdout, stderr = process.communicate(timeout=timeout) - except subprocess.TimeoutExpired: - process.kill() - stdout, stderr = process.communicate() - print("Process timed out!") - return False - return True - - - def NOUGAT_parse_pdf(self, fp, chatbot, history): - from toolbox import update_ui_lastest_msg - - yield from update_ui_lastest_msg("正在解析论文, 请稍候。进度:正在排队, 等待线程锁...", - chatbot=chatbot, history=history, delay=0) - self.threadLock.acquire() - import glob, threading, os - from toolbox import get_log_folder, gen_time_str - dst = os.path.join(get_log_folder(plugin_name='nougat'), gen_time_str()) - os.makedirs(dst) - - yield from update_ui_lastest_msg("正在解析论文, 请稍候。进度:正在加载NOUGAT... (提示:首次运行需要花费较长时间下载NOUGAT参数)", - chatbot=chatbot, history=history, delay=0) - self.nougat_with_timeout(f'nougat --out "{os.path.abspath(dst)}" "{os.path.abspath(fp)}"', os.getcwd(), timeout=3600) - res = glob.glob(os.path.join(dst,'*.mmd')) - if len(res) == 0: - self.threadLock.release() - raise RuntimeError("Nougat解析论文失败。") - self.threadLock.release() - return res[0] - - - - -def try_install_deps(deps, reload_m=[]): - import subprocess, sys, importlib - for dep in deps: - subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--user', dep]) - import site - importlib.reload(site) - for m in reload_m: - importlib.reload(__import__(m)) - - -def get_plugin_arg(plugin_kwargs, key, default): - # 如果参数是空的 - if (key in plugin_kwargs) and (plugin_kwargs[key] == ""): plugin_kwargs.pop(key) - # 正常情况 - return plugin_kwargs.get(key, default) diff --git a/crazy_functions/game_fns/game_ascii_art.py b/crazy_functions/game_fns/game_ascii_art.py deleted file mode 100644 index e0b700877415f04437413ac1765fa90fe1b0844f..0000000000000000000000000000000000000000 --- a/crazy_functions/game_fns/game_ascii_art.py +++ /dev/null @@ -1,42 +0,0 @@ -from toolbox import CatchException, update_ui, update_ui_lastest_msg -from crazy_functions.multi_stage.multi_stage_utils import GptAcademicGameBaseState -from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive -from request_llms.bridge_all import predict_no_ui_long_connection -from crazy_functions.game_fns.game_utils import get_code_block, is_same_thing -import random - - -class MiniGame_ASCII_Art(GptAcademicGameBaseState): - def step(self, prompt, chatbot, history): - if self.step_cnt == 0: - chatbot.append(["我画你猜(动物)", "请稍等..."]) - else: - if prompt.strip() == 'exit': - self.delete_game = True - yield from 
update_ui_lastest_msg(lastmsg=f"谜底是{self.obj},游戏结束。", chatbot=chatbot, history=history, delay=0.) - return - chatbot.append([prompt, ""]) - yield from update_ui(chatbot=chatbot, history=history) - - if self.step_cnt == 0: - self.lock_plugin(chatbot) - self.cur_task = 'draw' - - if self.cur_task == 'draw': - avail_obj = ["狗","猫","鸟","鱼","老鼠","蛇"] - self.obj = random.choice(avail_obj) - inputs = "I want to play a game called Guess the ASCII art. You can draw the ASCII art and I will try to guess it. " + \ - f"This time you draw a {self.obj}. Note that you must not indicate what you have draw in the text, and you should only produce the ASCII art wrapped by ```. " - raw_res = predict_no_ui_long_connection(inputs=inputs, llm_kwargs=self.llm_kwargs, history=[], sys_prompt="") - self.cur_task = 'identify user guess' - res = get_code_block(raw_res) - history += ['', f'the answer is {self.obj}', inputs, res] - yield from update_ui_lastest_msg(lastmsg=res, chatbot=chatbot, history=history, delay=0.) - - elif self.cur_task == 'identify user guess': - if is_same_thing(self.obj, prompt, self.llm_kwargs): - self.delete_game = True - yield from update_ui_lastest_msg(lastmsg="你猜对了!", chatbot=chatbot, history=history, delay=0.) - else: - self.cur_task = 'identify user guess' - yield from update_ui_lastest_msg(lastmsg="猜错了,再试试,输入“exit”获取答案。", chatbot=chatbot, history=history, delay=0.) \ No newline at end of file diff --git a/crazy_functions/game_fns/game_interactive_story.py b/crazy_functions/game_fns/game_interactive_story.py deleted file mode 100644 index 5c25f4a350409006ca7a4cd03f010d6b47eb044f..0000000000000000000000000000000000000000 --- a/crazy_functions/game_fns/game_interactive_story.py +++ /dev/null @@ -1,212 +0,0 @@ -prompts_hs = """ 请以“{headstart}”为开头,编写一个小说的第一幕。 - -- 尽量短,不要包含太多情节,因为你接下来将会与用户互动续写下面的情节,要留出足够的互动空间。 -- 出现人物时,给出人物的名字。 -- 积极地运用环境描写、人物描写等手法,让读者能够感受到你的故事世界。 -- 积极地运用修辞手法,比如比喻、拟人、排比、对偶、夸张等等。 -- 字数要求:第一幕的字数少于300字,且少于2个段落。 -""" - -prompts_interact = """ 小说的前文回顾: -「 -{previously_on_story} -」 - -你是一个作家,根据以上的情节,给出4种不同的后续剧情发展方向,每个发展方向都精明扼要地用一句话说明。稍后,我将在这4个选择中,挑选一种剧情发展。 - -输出格式例如: -1. 后续剧情发展1 -2. 后续剧情发展2 -3. 后续剧情发展3 -4. 
后续剧情发展4 -""" - - -prompts_resume = """小说的前文回顾: -「 -{previously_on_story} -」 - -你是一个作家,我们正在互相讨论,确定后续剧情的发展。 -在以下的剧情发展中, -「 -{choice} -」 -我认为更合理的是:{user_choice}。 -请在前文的基础上(不要重复前文),围绕我选定的剧情情节,编写小说的下一幕。 - -- 禁止杜撰不符合我选择的剧情。 -- 尽量短,不要包含太多情节,因为你接下来将会与用户互动续写下面的情节,要留出足够的互动空间。 -- 不要重复前文。 -- 出现人物时,给出人物的名字。 -- 积极地运用环境描写、人物描写等手法,让读者能够感受到你的故事世界。 -- 积极地运用修辞手法,比如比喻、拟人、排比、对偶、夸张等等。 -- 小说的下一幕字数少于300字,且少于2个段落。 -""" - - -prompts_terminate = """小说的前文回顾: -「 -{previously_on_story} -」 - -你是一个作家,我们正在互相讨论,确定后续剧情的发展。 -现在,故事该结束了,我认为最合理的故事结局是:{user_choice}。 - -请在前文的基础上(不要重复前文),编写小说的最后一幕。 - -- 不要重复前文。 -- 出现人物时,给出人物的名字。 -- 积极地运用环境描写、人物描写等手法,让读者能够感受到你的故事世界。 -- 积极地运用修辞手法,比如比喻、拟人、排比、对偶、夸张等等。 -- 字数要求:最后一幕的字数少于1000字。 -""" - - -from toolbox import CatchException, update_ui, update_ui_lastest_msg -from crazy_functions.multi_stage.multi_stage_utils import GptAcademicGameBaseState -from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive -from request_llms.bridge_all import predict_no_ui_long_connection -from crazy_functions.game_fns.game_utils import get_code_block, is_same_thing -import random - - -class MiniGame_ResumeStory(GptAcademicGameBaseState): - story_headstart = [ - '先行者知道,他现在是全宇宙中唯一的一个人了。', - '深夜,一个年轻人穿过天安门广场向纪念堂走去。在二十二世纪编年史中,计算机把他的代号定为M102。', - '他知道,这最后一课要提前讲了。又一阵剧痛从肝部袭来,几乎使他晕厥过去。', - '在距地球五万光年的远方,在银河系的中心,一场延续了两万年的星际战争已接近尾声。那里的太空中渐渐隐现出一个方形区域,仿佛灿烂的群星的背景被剪出一个方口。', - '伊依一行三人乘坐一艘游艇在南太平洋上做吟诗航行,他们的目的地是南极,如果几天后能顺利到达那里,他们将钻出地壳去看诗云。', - '很多人生来就会莫名其妙地迷上一样东西,仿佛他的出生就是要和这东西约会似的,正是这样,圆圆迷上了肥皂泡。' - ] - - - def begin_game_step_0(self, prompt, chatbot, history): - # init game at step 0 - self.headstart = random.choice(self.story_headstart) - self.story = [] - chatbot.append(["互动写故事", f"这次的故事开头是:{self.headstart}"]) - self.sys_prompt_ = '你是一个想象力丰富的杰出作家。正在与你的朋友互动,一起写故事,因此你每次写的故事段落应少于300字(结局除外)。' - - - def generate_story_image(self, story_paragraph): - try: - from crazy_functions.图片生成 import gen_image - prompt_ = predict_no_ui_long_connection(inputs=story_paragraph, llm_kwargs=self.llm_kwargs, history=[], sys_prompt='你需要根据用户给出的小说段落,进行简短的环境描写。要求:80字以内。') - image_url, image_path = gen_image(self.llm_kwargs, prompt_, '512x512', model="dall-e-2", quality='standard', style='natural') - return f'
' - except: - return '' - - def step(self, prompt, chatbot, history): - - """ - 首先,处理游戏初始化等特殊情况 - """ - if self.step_cnt == 0: - self.begin_game_step_0(prompt, chatbot, history) - self.lock_plugin(chatbot) - self.cur_task = 'head_start' - else: - if prompt.strip() == 'exit' or prompt.strip() == '结束剧情': - # should we terminate game here? - self.delete_game = True - yield from update_ui_lastest_msg(lastmsg=f"游戏结束。", chatbot=chatbot, history=history, delay=0.) - return - if '剧情收尾' in prompt: - self.cur_task = 'story_terminate' - # # well, game resumes - # chatbot.append([prompt, ""]) - # update ui, don't keep the user waiting - yield from update_ui(chatbot=chatbot, history=history) - - - """ - 处理游戏的主体逻辑 - """ - if self.cur_task == 'head_start': - """ - 这是游戏的第一步 - """ - inputs_ = prompts_hs.format(headstart=self.headstart) - history_ = [] - story_paragraph = yield from request_gpt_model_in_new_thread_with_ui_alive( - inputs_, '故事开头', self.llm_kwargs, - chatbot, history_, self.sys_prompt_ - ) - self.story.append(story_paragraph) - # # 配图 - yield from update_ui_lastest_msg(lastmsg=story_paragraph + '
正在生成插图中 ...', chatbot=chatbot, history=history, delay=0.) - yield from update_ui_lastest_msg(lastmsg=story_paragraph + '
'+ self.generate_story_image(story_paragraph), chatbot=chatbot, history=history, delay=0.) - - # # 构建后续剧情引导 - previously_on_story = "" - for s in self.story: - previously_on_story += s + '\n' - inputs_ = prompts_interact.format(previously_on_story=previously_on_story) - history_ = [] - self.next_choices = yield from request_gpt_model_in_new_thread_with_ui_alive( - inputs_, '请在以下几种故事走向中,选择一种(当然,您也可以选择给出其他故事走向):', self.llm_kwargs, - chatbot, - history_, - self.sys_prompt_ - ) - self.cur_task = 'user_choice' - - - elif self.cur_task == 'user_choice': - """ - 根据用户的提示,确定故事的下一步 - """ - if '请在以下几种故事走向中,选择一种' in chatbot[-1][0]: chatbot.pop(-1) - previously_on_story = "" - for s in self.story: - previously_on_story += s + '\n' - inputs_ = prompts_resume.format(previously_on_story=previously_on_story, choice=self.next_choices, user_choice=prompt) - history_ = [] - story_paragraph = yield from request_gpt_model_in_new_thread_with_ui_alive( - inputs_, f'下一段故事(您的选择是:{prompt})。', self.llm_kwargs, - chatbot, history_, self.sys_prompt_ - ) - self.story.append(story_paragraph) - # # 配图 - yield from update_ui_lastest_msg(lastmsg=story_paragraph + '
正在生成插图中 ...', chatbot=chatbot, history=history, delay=0.) - yield from update_ui_lastest_msg(lastmsg=story_paragraph + '
'+ self.generate_story_image(story_paragraph), chatbot=chatbot, history=history, delay=0.) - - # # 构建后续剧情引导 - previously_on_story = "" - for s in self.story: - previously_on_story += s + '\n' - inputs_ = prompts_interact.format(previously_on_story=previously_on_story) - history_ = [] - self.next_choices = yield from request_gpt_model_in_new_thread_with_ui_alive( - inputs_, - '请在以下几种故事走向中,选择一种。当然,您也可以给出您心中的其他故事走向。另外,如果您希望剧情立即收尾,请输入剧情走向,并以“剧情收尾”四个字提示程序。', self.llm_kwargs, - chatbot, - history_, - self.sys_prompt_ - ) - self.cur_task = 'user_choice' - - - elif self.cur_task == 'story_terminate': - """ - 根据用户的提示,确定故事的结局 - """ - previously_on_story = "" - for s in self.story: - previously_on_story += s + '\n' - inputs_ = prompts_terminate.format(previously_on_story=previously_on_story, user_choice=prompt) - history_ = [] - story_paragraph = yield from request_gpt_model_in_new_thread_with_ui_alive( - inputs_, f'故事收尾(您的选择是:{prompt})。', self.llm_kwargs, - chatbot, history_, self.sys_prompt_ - ) - # # 配图 - yield from update_ui_lastest_msg(lastmsg=story_paragraph + '
正在生成插图中 ...', chatbot=chatbot, history=history, delay=0.) - yield from update_ui_lastest_msg(lastmsg=story_paragraph + '
'+ self.generate_story_image(story_paragraph), chatbot=chatbot, history=history, delay=0.) - - # terminate game - self.delete_game = True - return diff --git a/crazy_functions/game_fns/game_utils.py b/crazy_functions/game_fns/game_utils.py deleted file mode 100644 index 09b6f7a935f3e1f254c4cd0f3b74f78e4c2af298..0000000000000000000000000000000000000000 --- a/crazy_functions/game_fns/game_utils.py +++ /dev/null @@ -1,35 +0,0 @@ - -from crazy_functions.json_fns.pydantic_io import GptJsonIO, JsonStringError -from request_llms.bridge_all import predict_no_ui_long_connection -def get_code_block(reply): - import re - pattern = r"```([\s\S]*?)```" # regex pattern to match code blocks - matches = re.findall(pattern, reply) # find all code blocks in text - if len(matches) == 1: - return "```" + matches[0] + "```" # code block - raise RuntimeError("GPT is not generating proper code.") - -def is_same_thing(a, b, llm_kwargs): - from pydantic import BaseModel, Field - class IsSameThing(BaseModel): - is_same_thing: bool = Field(description="determine whether two objects are same thing.", default=False) - - def run_gpt_fn(inputs, sys_prompt, history=[]): - return predict_no_ui_long_connection( - inputs=inputs, llm_kwargs=llm_kwargs, - history=history, sys_prompt=sys_prompt, observe_window=[] - ) - - gpt_json_io = GptJsonIO(IsSameThing) - inputs_01 = "Identity whether the user input and the target is the same thing: \n target object: {a} \n user input object: {b} \n\n\n".format(a=a, b=b) - inputs_01 += "\n\n\n Note that the user may describe the target object with a different language, e.g. cat and 猫 are the same thing." - analyze_res_cot_01 = run_gpt_fn(inputs_01, "", []) - - inputs_02 = inputs_01 + gpt_json_io.format_instructions - analyze_res = run_gpt_fn(inputs_02, "", [inputs_01, analyze_res_cot_01]) - - try: - res = gpt_json_io.generate_output_auto_repair(analyze_res, run_gpt_fn) - return res.is_same_thing - except JsonStringError as e: - return False \ No newline at end of file diff --git a/crazy_functions/gen_fns/gen_fns_shared.py b/crazy_functions/gen_fns/gen_fns_shared.py deleted file mode 100644 index 8e73794e84437e861d3468d4f0ab799deae6d98c..0000000000000000000000000000000000000000 --- a/crazy_functions/gen_fns/gen_fns_shared.py +++ /dev/null @@ -1,70 +0,0 @@ -import time -import importlib -from toolbox import trimmed_format_exc, gen_time_str, get_log_folder -from toolbox import CatchException, update_ui, gen_time_str, trimmed_format_exc, is_the_upload_folder -from toolbox import promote_file_to_downloadzone, get_log_folder, update_ui_lastest_msg -import multiprocessing - -def get_class_name(class_string): - import re - # Use regex to extract the class name - class_name = re.search(r'class (\w+)\(', class_string).group(1) - return class_name - -def try_make_module(code, chatbot): - module_file = 'gpt_fn_' + gen_time_str().replace('-','_') - fn_path = f'{get_log_folder(plugin_name="gen_plugin_verify")}/{module_file}.py' - with open(fn_path, 'w', encoding='utf8') as f: f.write(code) - promote_file_to_downloadzone(fn_path, chatbot=chatbot) - class_name = get_class_name(code) - manager = multiprocessing.Manager() - return_dict = manager.dict() - p = multiprocessing.Process(target=is_function_successfully_generated, args=(fn_path, class_name, return_dict)) - # only has 10 seconds to run - p.start(); p.join(timeout=10) - if p.is_alive(): p.terminate(); p.join() - p.close() - return return_dict["success"], return_dict['traceback'] - -# check is_function_successfully_generated -def 
is_function_successfully_generated(fn_path, class_name, return_dict): - return_dict['success'] = False - return_dict['traceback'] = "" - try: - # Create a spec for the module - module_spec = importlib.util.spec_from_file_location('example_module', fn_path) - # Load the module - example_module = importlib.util.module_from_spec(module_spec) - module_spec.loader.exec_module(example_module) - # Now you can use the module - some_class = getattr(example_module, class_name) - # Now you can create an instance of the class - instance = some_class() - return_dict['success'] = True - return - except: - return_dict['traceback'] = trimmed_format_exc() - return - -def subprocess_worker(code, file_path, return_dict): - return_dict['result'] = None - return_dict['success'] = False - return_dict['traceback'] = "" - try: - module_file = 'gpt_fn_' + gen_time_str().replace('-','_') - fn_path = f'{get_log_folder(plugin_name="gen_plugin_run")}/{module_file}.py' - with open(fn_path, 'w', encoding='utf8') as f: f.write(code) - class_name = get_class_name(code) - # Create a spec for the module - module_spec = importlib.util.spec_from_file_location('example_module', fn_path) - # Load the module - example_module = importlib.util.module_from_spec(module_spec) - module_spec.loader.exec_module(example_module) - # Now you can use the module - some_class = getattr(example_module, class_name) - # Now you can create an instance of the class - instance = some_class() - return_dict['result'] = instance.run(file_path) - return_dict['success'] = True - except: - return_dict['traceback'] = trimmed_format_exc() diff --git a/crazy_functions/ipc_fns/mp.py b/crazy_functions/ipc_fns/mp.py deleted file mode 100644 index 575d47ccecbb775205193085c58c06a114d3bfc2..0000000000000000000000000000000000000000 --- a/crazy_functions/ipc_fns/mp.py +++ /dev/null @@ -1,37 +0,0 @@ -import platform -import pickle -import multiprocessing - -def run_in_subprocess_wrapper_func(v_args): - func, args, kwargs, return_dict, exception_dict = pickle.loads(v_args) - import sys - try: - result = func(*args, **kwargs) - return_dict['result'] = result - except Exception as e: - exc_info = sys.exc_info() - exception_dict['exception'] = exc_info - -def run_in_subprocess_with_timeout(func, timeout=60): - if platform.system() == 'Linux': - def wrapper(*args, **kwargs): - return_dict = multiprocessing.Manager().dict() - exception_dict = multiprocessing.Manager().dict() - v_args = pickle.dumps((func, args, kwargs, return_dict, exception_dict)) - process = multiprocessing.Process(target=run_in_subprocess_wrapper_func, args=(v_args,)) - process.start() - process.join(timeout) - if process.is_alive(): - process.terminate() - raise TimeoutError(f'功能单元{str(func)}未能在规定时间内完成任务') - process.close() - if 'exception' in exception_dict: - # ooops, the subprocess ran into an exception - exc_info = exception_dict['exception'] - raise exc_info[1].with_traceback(exc_info[2]) - if 'result' in return_dict.keys(): - # If the subprocess ran successfully, return the result - return return_dict['result'] - return wrapper - else: - return func \ No newline at end of file diff --git a/crazy_functions/json_fns/pydantic_io.py b/crazy_functions/json_fns/pydantic_io.py deleted file mode 100644 index 4e300d65dd918f890d64e68e0cc5a37f36366585..0000000000000000000000000000000000000000 --- a/crazy_functions/json_fns/pydantic_io.py +++ /dev/null @@ -1,111 +0,0 @@ -""" -https://github.com/langchain-ai/langchain/blob/master/docs/extras/modules/model_io/output_parsers/pydantic.ipynb - -Example 1. 
- -# Define your desired data structure. -class Joke(BaseModel): - setup: str = Field(description="question to set up a joke") - punchline: str = Field(description="answer to resolve the joke") - - # You can add custom validation logic easily with Pydantic. - @validator("setup") - def question_ends_with_question_mark(cls, field): - if field[-1] != "?": - raise ValueError("Badly formed question!") - return field - - -Example 2. - -# Here's another example, but with a compound typed field. -class Actor(BaseModel): - name: str = Field(description="name of an actor") - film_names: List[str] = Field(description="list of names of films they starred in") -""" - -import json, re, logging - - -PYDANTIC_FORMAT_INSTRUCTIONS = """The output should be formatted as a JSON instance that conforms to the JSON schema below. - -As an example, for the schema {{"properties": {{"foo": {{"title": "Foo", "description": "a list of strings", "type": "array", "items": {{"type": "string"}}}}}}, "required": ["foo"]}} -the object {{"foo": ["bar", "baz"]}} is a well-formatted instance of the schema. The object {{"properties": {{"foo": ["bar", "baz"]}}}} is not well-formatted. - -Here is the output schema: -``` -{schema} -```""" - - -PYDANTIC_FORMAT_INSTRUCTIONS_SIMPLE = """The output should be formatted as a JSON instance that conforms to the JSON schema below. -``` -{schema} -```""" - -class JsonStringError(Exception): ... - -class GptJsonIO(): - - def __init__(self, schema, example_instruction=True): - self.pydantic_object = schema - self.example_instruction = example_instruction - self.format_instructions = self.generate_format_instructions() - - def generate_format_instructions(self): - schema = self.pydantic_object.schema() - - # Remove extraneous fields. - reduced_schema = schema - if "title" in reduced_schema: - del reduced_schema["title"] - if "type" in reduced_schema: - del reduced_schema["type"] - # Ensure json in context is well-formed with double quotes; computed - # before the branch so that both templates can format it. - schema_str = json.dumps(reduced_schema) - if self.example_instruction: - return PYDANTIC_FORMAT_INSTRUCTIONS.format(schema=schema_str) - else: - return PYDANTIC_FORMAT_INSTRUCTIONS_SIMPLE.format(schema=schema_str) - - def generate_output(self, text): - # Greedy search for 1st json candidate. - match = re.search( - r"\{.*\}", text.strip(), re.MULTILINE | re.IGNORECASE | re.DOTALL - ) - json_str = "" - if match: json_str = match.group() - json_object = json.loads(json_str, strict=False) - final_object = self.pydantic_object.parse_obj(json_object) - return final_object - - def generate_repair_prompt(self, broken_json, error): - prompt = "Fix a broken json string.\n\n" + \ - "(1) The broken json string that needs fixing is: \n\n" + \ - "```" + "\n" + \ - broken_json + "\n" + \ - "```" + "\n\n" + \ - "(2) The error message is: \n\n" + \ - error + "\n\n" + \ - "Now, fix this json string. \n\n" - return prompt - - def generate_output_auto_repair(self, response, gpt_gen_fn): - """ - response: string containing candidate json - gpt_gen_fn: gpt_gen_fn(inputs, sys_prompt) - """ - try: - result = self.generate_output(response) - except Exception as e: - try: - logging.info(f'Repairing json:{response}') - repair_prompt = self.generate_repair_prompt(broken_json = response, error=repr(e)) - result = self.generate_output(gpt_gen_fn(repair_prompt, self.format_instructions)) - logging.info('Json repair succeeded.') - except Exception as e: - # out of options, give up the repair - logging.info('Json repair failed.') - raise JsonStringError('Cannot repair json.', str(e)) - return result - diff --git a/crazy_functions/latex_fns/latex_actions.py b/crazy_functions/latex_fns/latex_actions.py deleted file mode 100644 index 8772f5e1fb530d72be282deaef2eb18ed9ffa1d2..0000000000000000000000000000000000000000 --- a/crazy_functions/latex_fns/latex_actions.py +++ /dev/null @@ -1,467 +0,0 @@ -from toolbox import update_ui, update_ui_lastest_msg, get_log_folder -from toolbox import get_conf, objdump, objload, promote_file_to_downloadzone -from .latex_toolbox import PRESERVE, TRANSFORM -from .latex_toolbox import set_forbidden_text, set_forbidden_text_begin_end, set_forbidden_text_careful_brace -from .latex_toolbox import reverse_forbidden_text_careful_brace, reverse_forbidden_text, convert_to_linklist, post_process -from .latex_toolbox import fix_content, find_main_tex_file, merge_tex_files, compile_latex_with_timeout -from .latex_toolbox import find_title_and_abs - -import os, shutil -import re -import numpy as np - -pj = os.path.join - - -def split_subprocess(txt, project_folder, return_dict, opts): - """ - break down the LaTeX file into a linked list; - each node uses a preserve flag to indicate whether it should - be processed by GPT. 
- """ - text = txt - mask = np.zeros(len(txt), dtype=np.uint8) + TRANSFORM - - # 吸收title与作者以上的部分 - text, mask = set_forbidden_text(text, mask, r"^(.*?)\\maketitle", re.DOTALL) - text, mask = set_forbidden_text(text, mask, r"^(.*?)\\begin{document}", re.DOTALL) - # 吸收iffalse注释 - text, mask = set_forbidden_text(text, mask, r"\\iffalse(.*?)\\fi", re.DOTALL) - # 吸收在42行以内的begin-end组合 - text, mask = set_forbidden_text_begin_end(text, mask, r"\\begin\{([a-z\*]*)\}(.*?)\\end\{\1\}", re.DOTALL, limit_n_lines=42) - # 吸收匿名公式 - text, mask = set_forbidden_text(text, mask, [ r"\$\$([^$]+)\$\$", r"\\\[.*?\\\]" ], re.DOTALL) - # 吸收其他杂项 - text, mask = set_forbidden_text(text, mask, [ r"\\section\{(.*?)\}", r"\\section\*\{(.*?)\}", r"\\subsection\{(.*?)\}", r"\\subsubsection\{(.*?)\}" ]) - text, mask = set_forbidden_text(text, mask, [ r"\\bibliography\{(.*?)\}", r"\\bibliographystyle\{(.*?)\}" ]) - text, mask = set_forbidden_text(text, mask, r"\\begin\{thebibliography\}.*?\\end\{thebibliography\}", re.DOTALL) - text, mask = set_forbidden_text(text, mask, r"\\begin\{lstlisting\}(.*?)\\end\{lstlisting\}", re.DOTALL) - text, mask = set_forbidden_text(text, mask, r"\\begin\{wraptable\}(.*?)\\end\{wraptable\}", re.DOTALL) - text, mask = set_forbidden_text(text, mask, r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}", re.DOTALL) - text, mask = set_forbidden_text(text, mask, [r"\\begin\{wrapfigure\}(.*?)\\end\{wrapfigure\}", r"\\begin\{wrapfigure\*\}(.*?)\\end\{wrapfigure\*\}"], re.DOTALL) - text, mask = set_forbidden_text(text, mask, [r"\\begin\{figure\}(.*?)\\end\{figure\}", r"\\begin\{figure\*\}(.*?)\\end\{figure\*\}"], re.DOTALL) - text, mask = set_forbidden_text(text, mask, [r"\\begin\{multline\}(.*?)\\end\{multline\}", r"\\begin\{multline\*\}(.*?)\\end\{multline\*\}"], re.DOTALL) - text, mask = set_forbidden_text(text, mask, [r"\\begin\{table\}(.*?)\\end\{table\}", r"\\begin\{table\*\}(.*?)\\end\{table\*\}"], re.DOTALL) - text, mask = set_forbidden_text(text, mask, [r"\\begin\{minipage\}(.*?)\\end\{minipage\}", r"\\begin\{minipage\*\}(.*?)\\end\{minipage\*\}"], re.DOTALL) - text, mask = set_forbidden_text(text, mask, [r"\\begin\{align\*\}(.*?)\\end\{align\*\}", r"\\begin\{align\}(.*?)\\end\{align\}"], re.DOTALL) - text, mask = set_forbidden_text(text, mask, [r"\\begin\{equation\}(.*?)\\end\{equation\}", r"\\begin\{equation\*\}(.*?)\\end\{equation\*\}"], re.DOTALL) - text, mask = set_forbidden_text(text, mask, [r"\\includepdf\[(.*?)\]\{(.*?)\}", r"\\clearpage", r"\\newpage", r"\\appendix", r"\\tableofcontents", r"\\include\{(.*?)\}"]) - text, mask = set_forbidden_text(text, mask, [r"\\vspace\{(.*?)\}", r"\\hspace\{(.*?)\}", r"\\label\{(.*?)\}", r"\\begin\{(.*?)\}", r"\\end\{(.*?)\}", r"\\item "]) - text, mask = set_forbidden_text_careful_brace(text, mask, r"\\hl\{(.*?)\}", re.DOTALL) - # reverse 操作必须放在最后 - text, mask = reverse_forbidden_text_careful_brace(text, mask, r"\\caption\{(.*?)\}", re.DOTALL, forbid_wrapper=True) - text, mask = reverse_forbidden_text_careful_brace(text, mask, r"\\abstract\{(.*?)\}", re.DOTALL, forbid_wrapper=True) - text, mask = reverse_forbidden_text(text, mask, r"\\begin\{abstract\}(.*?)\\end\{abstract\}", re.DOTALL, forbid_wrapper=True) - root = convert_to_linklist(text, mask) - - # 最后一步处理,增强稳健性 - root = post_process(root) - - # 输出html调试文件,用红色标注处保留区(PRESERVE),用黑色标注转换区(TRANSFORM) - with open(pj(project_folder, 'debug_log.html'), 'w', encoding='utf8') as f: - segment_parts_for_gpt = [] - nodes = [] - node = root - while True: - nodes.append(node) - show_html = node.string.replace('\n','
<br/>')
-            if not node.preserve:
-                segment_parts_for_gpt.append(node.string)
-                f.write(f'<p style="color:black;">#{node.range}{show_html}#</p>')
-            else:
-                f.write(f'<p style="color:red;">{show_html}</p>
') - node = node.next - if node is None: break - - for n in nodes: n.next = None # break - return_dict['nodes'] = nodes - return_dict['segment_parts_for_gpt'] = segment_parts_for_gpt - return return_dict - -class LatexPaperSplit(): - """ - break down latex file to a linked list, - each node use a preserve flag to indicate whether it should - be proccessed by GPT. - """ - def __init__(self) -> None: - self.nodes = None - self.msg = "*{\\scriptsize\\textbf{警告:该PDF由GPT-Academic开源项目调用大语言模型+Latex翻译插件一键生成," + \ - "版权归原文作者所有。翻译内容可靠性无保障,请仔细鉴别并以原文为准。" + \ - "项目Github地址 \\url{https://github.com/binary-husky/gpt_academic/}。" - # 请您不要删除或修改这行警告,除非您是论文的原作者(如果您是论文原作者,欢迎加REAME中的QQ联系开发者) - self.msg_declare = "为了防止大语言模型的意外谬误产生扩散影响,禁止移除或修改此警告。}}\\\\" - self.title = "unknown" - self.abstract = "unknown" - - def read_title_and_abstract(self, txt): - try: - title, abstract = find_title_and_abs(txt) - if title is not None: - self.title = title.replace('\n', ' ').replace('\\\\', ' ').replace(' ', '').replace(' ', '') - if abstract is not None: - self.abstract = abstract.replace('\n', ' ').replace('\\\\', ' ').replace(' ', '').replace(' ', '') - except: - pass - - def merge_result(self, arr, mode, msg, buggy_lines=[], buggy_line_surgery_n_lines=10): - """ - Merge the result after the GPT process completed - """ - result_string = "" - node_cnt = 0 - line_cnt = 0 - - for node in self.nodes: - if node.preserve: - line_cnt += node.string.count('\n') - result_string += node.string - else: - translated_txt = fix_content(arr[node_cnt], node.string) - begin_line = line_cnt - end_line = line_cnt + translated_txt.count('\n') - - # reverse translation if any error - if any([begin_line-buggy_line_surgery_n_lines <= b_line <= end_line+buggy_line_surgery_n_lines for b_line in buggy_lines]): - translated_txt = node.string - - result_string += translated_txt - node_cnt += 1 - line_cnt += translated_txt.count('\n') - - if mode == 'translate_zh': - pattern = re.compile(r'\\begin\{abstract\}.*\n') - match = pattern.search(result_string) - if not match: - # match \abstract{xxxx} - pattern_compile = re.compile(r"\\abstract\{(.*?)\}", flags=re.DOTALL) - match = pattern_compile.search(result_string) - position = match.regs[1][0] - else: - # match \begin{abstract}xxxx\end{abstract} - position = match.end() - result_string = result_string[:position] + self.msg + msg + self.msg_declare + result_string[position:] - return result_string - - - def split(self, txt, project_folder, opts): - """ - break down latex file to a linked list, - each node use a preserve flag to indicate whether it should - be proccessed by GPT. - P.S. 
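A hedged sketch of the isolate-in-a-subprocess pattern that `LatexPaperSplit.split` below relies on; the wrapper name is illustrative, and the deleted code inlines this logic rather than calling a helper:

```python
import multiprocessing

def run_split_in_subprocess(fn, *args):
    # Run fn(*args, return_dict) in a child process and collect its outputs.
    # A hung child (e.g. catastrophic regex backtracking on a huge .tex file)
    # can then be killed without freezing the main Gradio thread.
    manager = multiprocessing.Manager()
    return_dict = manager.dict()
    p = multiprocessing.Process(target=fn, args=(*args, return_dict))
    p.start()
    p.join()  # the deleted code joins without a timeout; pass one here for a hard cap
    p.close()
    return dict(return_dict)
```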
use multiprocessing to avoid timeout error - """ - import multiprocessing - manager = multiprocessing.Manager() - return_dict = manager.dict() - p = multiprocessing.Process( - target=split_subprocess, - args=(txt, project_folder, return_dict, opts)) - p.start() - p.join() - p.close() - self.nodes = return_dict['nodes'] - self.sp = return_dict['segment_parts_for_gpt'] - return self.sp - - -class LatexPaperFileGroup(): - """ - use tokenizer to break down text according to max_token_limit - """ - def __init__(self): - self.file_paths = [] - self.file_contents = [] - self.sp_file_contents = [] - self.sp_file_index = [] - self.sp_file_tag = [] - # count_token - from request_llms.bridge_all import model_info - enc = model_info["gpt-3.5-turbo"]['tokenizer'] - def get_token_num(txt): return len(enc.encode(txt, disallowed_special=())) - self.get_token_num = get_token_num - - def run_file_split(self, max_token_limit=1900): - """ - use tokenizer to break down text according to max_token_limit - """ - for index, file_content in enumerate(self.file_contents): - if self.get_token_num(file_content) < max_token_limit: - self.sp_file_contents.append(file_content) - self.sp_file_index.append(index) - self.sp_file_tag.append(self.file_paths[index]) - else: - from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit - segments = breakdown_text_to_satisfy_token_limit(file_content, max_token_limit) - for j, segment in enumerate(segments): - self.sp_file_contents.append(segment) - self.sp_file_index.append(index) - self.sp_file_tag.append(self.file_paths[index] + f".part-{j}.tex") - - def merge_result(self): - self.file_result = ["" for _ in range(len(self.file_paths))] - for r, k in zip(self.sp_file_result, self.sp_file_index): - self.file_result[k] += r - - def write_result(self): - manifest = [] - for path, res in zip(self.file_paths, self.file_result): - with open(path + '.polish.tex', 'w', encoding='utf8') as f: - manifest.append(path + '.polish.tex') - f.write(res) - return manifest - - -def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, mode='proofread', switch_prompt=None, opts=[]): - import time, os, re - from ..crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency - from .latex_actions import LatexPaperFileGroup, LatexPaperSplit - - # <-------- 寻找主tex文件 ----------> - maintex = find_main_tex_file(file_manifest, mode) - chatbot.append((f"定位主Latex文件", f'[Local Message] 分析结果:该项目的Latex主文件是{maintex}, 如果分析错误, 请立即终止程序, 删除或修改歧义文件, 然后重试。主程序即将开始, 请稍候。')) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - time.sleep(3) - - # <-------- 读取Latex文件, 将多文件tex工程融合为一个巨型tex ----------> - main_tex_basename = os.path.basename(maintex) - assert main_tex_basename.endswith('.tex') - main_tex_basename_bare = main_tex_basename[:-4] - may_exist_bbl = pj(project_folder, f'{main_tex_basename_bare}.bbl') - if os.path.exists(may_exist_bbl): - shutil.copyfile(may_exist_bbl, pj(project_folder, f'merge.bbl')) - shutil.copyfile(may_exist_bbl, pj(project_folder, f'merge_{mode}.bbl')) - shutil.copyfile(may_exist_bbl, pj(project_folder, f'merge_diff.bbl')) - - with open(maintex, 'r', encoding='utf-8', errors='replace') as f: - content = f.read() - merged_content = merge_tex_files(project_folder, content, mode) - - with open(project_folder + '/merge.tex', 'w', encoding='utf-8', errors='replace') as f: - f.write(merged_content) - - # <-------- 精细切分latex文件 ----------> - chatbot.append((f"Latex文件融合完成", 
f'[Local Message] 正在精细切分latex文件,这需要一段时间计算,文档越长耗时越长,请耐心等待。')) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - lps = LatexPaperSplit() - lps.read_title_and_abstract(merged_content) - res = lps.split(merged_content, project_folder, opts) # 消耗时间的函数 - # <-------- 拆分过长的latex片段 ----------> - pfg = LatexPaperFileGroup() - for index, r in enumerate(res): - pfg.file_paths.append('segment-' + str(index)) - pfg.file_contents.append(r) - - pfg.run_file_split(max_token_limit=1024) - n_split = len(pfg.sp_file_contents) - - # <-------- 根据需要切换prompt ----------> - inputs_array, sys_prompt_array = switch_prompt(pfg, mode) - inputs_show_user_array = [f"{mode} {f}" for f in pfg.sp_file_tag] - - if os.path.exists(pj(project_folder,'temp.pkl')): - - # <-------- 【仅调试】如果存在调试缓存文件,则跳过GPT请求环节 ----------> - pfg = objload(file=pj(project_folder,'temp.pkl')) - - else: - # <-------- gpt 多线程请求 ----------> - history_array = [[""] for _ in range(n_split)] - # LATEX_EXPERIMENTAL, = get_conf('LATEX_EXPERIMENTAL') - # if LATEX_EXPERIMENTAL: - # paper_meta = f"The paper you processing is `{lps.title}`, a part of the abstraction is `{lps.abstract}`" - # paper_meta_max_len = 888 - # history_array = [[ paper_meta[:paper_meta_max_len] + '...', "Understand, what should I do?"] for _ in range(n_split)] - - gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( - inputs_array=inputs_array, - inputs_show_user_array=inputs_show_user_array, - llm_kwargs=llm_kwargs, - chatbot=chatbot, - history_array=history_array, - sys_prompt_array=sys_prompt_array, - # max_workers=5, # 并行任务数量限制, 最多同时执行5个, 其他的排队等待 - scroller_max_len = 40 - ) - - # <-------- 文本碎片重组为完整的tex片段 ----------> - pfg.sp_file_result = [] - for i_say, gpt_say, orig_content in zip(gpt_response_collection[0::2], gpt_response_collection[1::2], pfg.sp_file_contents): - pfg.sp_file_result.append(gpt_say) - pfg.merge_result() - - # <-------- 临时存储用于调试 ----------> - pfg.get_token_num = None - objdump(pfg, file=pj(project_folder,'temp.pkl')) - - write_html(pfg.sp_file_contents, pfg.sp_file_result, chatbot=chatbot, project_folder=project_folder) - - # <-------- 写出文件 ----------> - msg = f"当前大语言模型: {llm_kwargs['llm_model']},当前语言模型温度设定: {llm_kwargs['temperature']}。" - final_tex = lps.merge_result(pfg.file_result, mode, msg) - objdump((lps, pfg.file_result, mode, msg), file=pj(project_folder,'merge_result.pkl')) - - with open(project_folder + f'/merge_{mode}.tex', 'w', encoding='utf-8', errors='replace') as f: - if mode != 'translate_zh' or "binary" in final_tex: f.write(final_tex) - - - # <-------- 整理结果, 退出 ----------> - chatbot.append((f"完成了吗?", 'GPT结果已输出, 即将编译PDF')) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - # <-------- 返回 ----------> - return project_folder + f'/merge_{mode}.tex' - - -def remove_buggy_lines(file_path, log_path, tex_name, tex_name_pure, n_fix, work_folder_modified, fixed_line=[]): - try: - with open(log_path, 'r', encoding='utf-8', errors='replace') as f: - log = f.read() - import re - buggy_lines = re.findall(tex_name+':([0-9]{1,5}):', log) - buggy_lines = [int(l) for l in buggy_lines] - buggy_lines = sorted(buggy_lines) - buggy_line = buggy_lines[0]-1 - print("reversing tex line that has errors", buggy_line) - - # 重组,逆转出错的段落 - if buggy_line not in fixed_line: - fixed_line.append(buggy_line) - - lps, file_result, mode, msg = objload(file=pj(work_folder_modified,'merge_result.pkl')) - final_tex = lps.merge_result(file_result, mode, msg, buggy_lines=fixed_line, 
buggy_line_surgery_n_lines=5*n_fix) - - with open(pj(work_folder_modified, f"{tex_name_pure}_fix_{n_fix}.tex"), 'w', encoding='utf-8', errors='replace') as f: - f.write(final_tex) - - return True, f"{tex_name_pure}_fix_{n_fix}", buggy_lines - except: - print("Fatal error occurred, but we cannot identify error, please download zip, read latex log, and compile manually.") - return False, -1, [-1] - - -def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder, mode='default'): - import os, time - n_fix = 1 - fixed_line = [] - max_try = 32 - chatbot.append([f"正在编译PDF文档", f'编译已经开始。当前工作路径为{work_folder},如果程序停顿5分钟以上,请直接去该路径下取回翻译结果,或者重启之后再度尝试 ...']); yield from update_ui(chatbot=chatbot, history=history) - chatbot.append([f"正在编译PDF文档", '...']); yield from update_ui(chatbot=chatbot, history=history); time.sleep(1); chatbot[-1] = list(chatbot[-1]) # 刷新界面 - yield from update_ui_lastest_msg('编译已经开始...', chatbot, history) # 刷新Gradio前端界面 - - while True: - import os - may_exist_bbl = pj(work_folder_modified, f'merge.bbl') - target_bbl = pj(work_folder_modified, f'{main_file_modified}.bbl') - if os.path.exists(may_exist_bbl) and not os.path.exists(target_bbl): - shutil.copyfile(may_exist_bbl, target_bbl) - - # https://stackoverflow.com/questions/738755/dont-make-me-manually-abort-a-latex-compile-when-theres-an-error - yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译原始PDF ...', chatbot, history) # 刷新Gradio前端界面 - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original) - - yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译转化后的PDF ...', chatbot, history) # 刷新Gradio前端界面 - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified) - - if ok and os.path.exists(pj(work_folder_modified, f'{main_file_modified}.pdf')): - # 只有第二步成功,才能继续下面的步骤 - yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译BibTex ...', chatbot, history) # 刷新Gradio前端界面 - if not os.path.exists(pj(work_folder_original, f'{main_file_original}.bbl')): - ok = compile_latex_with_timeout(f'bibtex {main_file_original}.aux', work_folder_original) - if not os.path.exists(pj(work_folder_modified, f'{main_file_modified}.bbl')): - ok = compile_latex_with_timeout(f'bibtex {main_file_modified}.aux', work_folder_modified) - - yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译文献交叉引用 ...', chatbot, history) # 刷新Gradio前端界面 - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original) - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified) - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original) - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified) - - if mode!='translate_zh': - yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 使用latexdiff生成论文转化前后对比 ...', chatbot, history) # 刷新Gradio前端界面 - print( f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex') - ok = compile_latex_with_timeout(f'latexdiff --encoding=utf8 --append-safecmd=subfile 
{work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex', os.getcwd()) - - yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 正在编译对比PDF ...', chatbot, history) # 刷新Gradio前端界面 - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder) - ok = compile_latex_with_timeout(f'bibtex merge_diff.aux', work_folder) - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder) - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder) - - # <---------- 检查结果 -----------> - results_ = "" - original_pdf_success = os.path.exists(pj(work_folder_original, f'{main_file_original}.pdf')) - modified_pdf_success = os.path.exists(pj(work_folder_modified, f'{main_file_modified}.pdf')) - diff_pdf_success = os.path.exists(pj(work_folder, f'merge_diff.pdf')) - results_ += f"原始PDF编译是否成功: {original_pdf_success};" - results_ += f"转化PDF编译是否成功: {modified_pdf_success};" - results_ += f"对比PDF编译是否成功: {diff_pdf_success};" - yield from update_ui_lastest_msg(f'第{n_fix}编译结束:
{results_}...', chatbot, history) # 刷新Gradio前端界面 - - if diff_pdf_success: - result_pdf = pj(work_folder_modified, f'merge_diff.pdf') # get pdf path - promote_file_to_downloadzone(result_pdf, rename_file=None, chatbot=chatbot) # promote file to web UI - if modified_pdf_success: - yield from update_ui_lastest_msg(f'转化PDF编译已经成功, 正在尝试生成对比PDF, 请稍候 ...', chatbot, history) # 刷新Gradio前端界面 - result_pdf = pj(work_folder_modified, f'{main_file_modified}.pdf') # get pdf path - origin_pdf = pj(work_folder_original, f'{main_file_original}.pdf') # get pdf path - if os.path.exists(pj(work_folder, '..', 'translation')): - shutil.copyfile(result_pdf, pj(work_folder, '..', 'translation', 'translate_zh.pdf')) - promote_file_to_downloadzone(result_pdf, rename_file=None, chatbot=chatbot) # promote file to web UI - # 将两个PDF拼接 - if original_pdf_success: - try: - from .latex_toolbox import merge_pdfs - concat_pdf = pj(work_folder_modified, f'comparison.pdf') - merge_pdfs(origin_pdf, result_pdf, concat_pdf) - if os.path.exists(pj(work_folder, '..', 'translation')): - shutil.copyfile(concat_pdf, pj(work_folder, '..', 'translation', 'comparison.pdf')) - promote_file_to_downloadzone(concat_pdf, rename_file=None, chatbot=chatbot) # promote file to web UI - except Exception as e: - print(e) - pass - return True # 成功啦 - else: - if n_fix>=max_try: break - n_fix += 1 - can_retry, main_file_modified, buggy_lines = remove_buggy_lines( - file_path=pj(work_folder_modified, f'{main_file_modified}.tex'), - log_path=pj(work_folder_modified, f'{main_file_modified}.log'), - tex_name=f'{main_file_modified}.tex', - tex_name_pure=f'{main_file_modified}', - n_fix=n_fix, - work_folder_modified=work_folder_modified, - fixed_line=fixed_line - ) - yield from update_ui_lastest_msg(f'由于最为关键的转化PDF编译失败, 将根据报错信息修正tex源文件并重试, 当前报错的latex代码处于第{buggy_lines}行 ...', chatbot, history) # 刷新Gradio前端界面 - if not can_retry: break - - return False # 失败啦 - - -def write_html(sp_file_contents, sp_file_result, chatbot, project_folder): - # write html - try: - import shutil - from crazy_functions.pdf_fns.report_gen_html import construct_html - from toolbox import gen_time_str - ch = construct_html() - orig = "" - trans = "" - final = [] - for c,r in zip(sp_file_contents, sp_file_result): - final.append(c) - final.append(r) - for i, k in enumerate(final): - if i%2==0: - orig = k - if i%2==1: - trans = k - ch.add_row(a=orig, b=trans) - create_report_file_name = f"{gen_time_str()}.trans.html" - res = ch.save_file(create_report_file_name) - shutil.copyfile(res, pj(project_folder, create_report_file_name)) - promote_file_to_downloadzone(file=res, chatbot=chatbot) - except: - from toolbox import trimmed_format_exc - print('writing html result failed:', trimmed_format_exc()) diff --git a/crazy_functions/latex_fns/latex_toolbox.py b/crazy_functions/latex_fns/latex_toolbox.py deleted file mode 100644 index 0a6a873b50b8299fa28bc41e27cf7a27a16637ae..0000000000000000000000000000000000000000 --- a/crazy_functions/latex_fns/latex_toolbox.py +++ /dev/null @@ -1,562 +0,0 @@ -import os, shutil -import re -import numpy as np -PRESERVE = 0 -TRANSFORM = 1 - -pj = os.path.join - -class LinkedListNode(): - """ - Linked List Node - """ - def __init__(self, string, preserve=True) -> None: - self.string = string - self.preserve = preserve - self.next = None - self.range = None - # self.begin_line = 0 - # self.begin_char = 0 - -def convert_to_linklist(text, mask): - root = LinkedListNode("", preserve=True) - current_node = root - for c, m, i in zip(text, mask, range(len(text))): - if 
(m==PRESERVE and current_node.preserve) \ - or (m==TRANSFORM and not current_node.preserve): - # add - current_node.string += c - else: - current_node.next = LinkedListNode(c, preserve=(m==PRESERVE)) - current_node = current_node.next - return root - -def post_process(root): - # 修复括号 - node = root - while True: - string = node.string - if node.preserve: - node = node.next - if node is None: break - continue - def break_check(string): - str_stack = [""] # (lv, index) - for i, c in enumerate(string): - if c == '{': - str_stack.append('{') - elif c == '}': - if len(str_stack) == 1: - print('stack fix') - return i - str_stack.pop(-1) - else: - str_stack[-1] += c - return -1 - bp = break_check(string) - - if bp == -1: - pass - elif bp == 0: - node.string = string[:1] - q = LinkedListNode(string[1:], False) - q.next = node.next - node.next = q - else: - node.string = string[:bp] - q = LinkedListNode(string[bp:], False) - q.next = node.next - node.next = q - - node = node.next - if node is None: break - - # 屏蔽空行和太短的句子 - node = root - while True: - if len(node.string.strip('\n').strip(''))==0: node.preserve = True - if len(node.string.strip('\n').strip(''))<42: node.preserve = True - node = node.next - if node is None: break - node = root - while True: - if node.next and node.preserve and node.next.preserve: - node.string += node.next.string - node.next = node.next.next - node = node.next - if node is None: break - - # 将前后断行符脱离 - node = root - prev_node = None - while True: - if not node.preserve: - lstriped_ = node.string.lstrip().lstrip('\n') - if (prev_node is not None) and (prev_node.preserve) and (len(lstriped_)!=len(node.string)): - prev_node.string += node.string[:-len(lstriped_)] - node.string = lstriped_ - rstriped_ = node.string.rstrip().rstrip('\n') - if (node.next is not None) and (node.next.preserve) and (len(rstriped_)!=len(node.string)): - node.next.string = node.string[len(rstriped_):] + node.next.string - node.string = rstriped_ - # ===== - prev_node = node - node = node.next - if node is None: break - - # 标注节点的行数范围 - node = root - n_line = 0 - expansion = 2 - while True: - n_l = node.string.count('\n') - node.range = [n_line-expansion, n_line+n_l+expansion] # 失败时,扭转的范围 - n_line = n_line+n_l - node = node.next - if node is None: break - return root - - -""" -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -Latex segmentation with a binary mask (PRESERVE=0, TRANSFORM=1) -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -""" - - -def set_forbidden_text(text, mask, pattern, flags=0): - """ - Add a preserve text area in this paper - e.g. with pattern = r"\\begin\{algorithm\}(.*?)\\end\{algorithm\}" - you can mask out (mask = PRESERVE so that text become untouchable for GPT) - everything between "\begin{equation}" and "\end{equation}" - """ - if isinstance(pattern, list): pattern = '|'.join(pattern) - pattern_compile = re.compile(pattern, flags) - for res in pattern_compile.finditer(text): - mask[res.span()[0]:res.span()[1]] = PRESERVE - return text, mask - -def reverse_forbidden_text(text, mask, pattern, flags=0, forbid_wrapper=True): - """ - Move area out of preserve area (make text editable for GPT) - count the number of the braces so as to catch compelete text area. - e.g. - \begin{abstract} blablablablablabla. 
\end{abstract} - """ - if isinstance(pattern, list): pattern = '|'.join(pattern) - pattern_compile = re.compile(pattern, flags) - for res in pattern_compile.finditer(text): - if not forbid_wrapper: - mask[res.span()[0]:res.span()[1]] = TRANSFORM - else: - mask[res.regs[0][0]: res.regs[1][0]] = PRESERVE # '\\begin{abstract}' - mask[res.regs[1][0]: res.regs[1][1]] = TRANSFORM # abstract - mask[res.regs[1][1]: res.regs[0][1]] = PRESERVE # abstract - return text, mask - -def set_forbidden_text_careful_brace(text, mask, pattern, flags=0): - """ - Add a preserve text area in this paper (text become untouchable for GPT). - count the number of the braces so as to catch compelete text area. - e.g. - \caption{blablablablabla\texbf{blablabla}blablabla.} - """ - pattern_compile = re.compile(pattern, flags) - for res in pattern_compile.finditer(text): - brace_level = -1 - p = begin = end = res.regs[0][0] - for _ in range(1024*16): - if text[p] == '}' and brace_level == 0: break - elif text[p] == '}': brace_level -= 1 - elif text[p] == '{': brace_level += 1 - p += 1 - end = p+1 - mask[begin:end] = PRESERVE - return text, mask - -def reverse_forbidden_text_careful_brace(text, mask, pattern, flags=0, forbid_wrapper=True): - """ - Move area out of preserve area (make text editable for GPT) - count the number of the braces so as to catch compelete text area. - e.g. - \caption{blablablablabla\texbf{blablabla}blablabla.} - """ - pattern_compile = re.compile(pattern, flags) - for res in pattern_compile.finditer(text): - brace_level = 0 - p = begin = end = res.regs[1][0] - for _ in range(1024*16): - if text[p] == '}' and brace_level == 0: break - elif text[p] == '}': brace_level -= 1 - elif text[p] == '{': brace_level += 1 - p += 1 - end = p - mask[begin:end] = TRANSFORM - if forbid_wrapper: - mask[res.regs[0][0]:begin] = PRESERVE - mask[end:res.regs[0][1]] = PRESERVE - return text, mask - -def set_forbidden_text_begin_end(text, mask, pattern, flags=0, limit_n_lines=42): - """ - Find all \begin{} ... \end{} text block that with less than limit_n_lines lines. - Add it to preserve area - """ - pattern_compile = re.compile(pattern, flags) - def search_with_line_limit(text, mask): - for res in pattern_compile.finditer(text): - cmd = res.group(1) # begin{what} - this = res.group(2) # content between begin and end - this_mask = mask[res.regs[2][0]:res.regs[2][1]] - white_list = ['document', 'abstract', 'lemma', 'definition', 'sproof', - 'em', 'emph', 'textit', 'textbf', 'itemize', 'enumerate'] - if (cmd in white_list) or this.count('\n') >= limit_n_lines: # use a magical number 42 - this, this_mask = search_with_line_limit(this, this_mask) - mask[res.regs[2][0]:res.regs[2][1]] = this_mask - else: - mask[res.regs[0][0]:res.regs[0][1]] = PRESERVE - return text, mask - return search_with_line_limit(text, mask) - - - -""" -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -Latex Merge File -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -""" - -def find_main_tex_file(file_manifest, mode): - """ - 在多Tex文档中,寻找主文件,必须包含documentclass,返回找到的第一个。 - P.S. 
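A self-contained toy run of the masking scheme implemented above, re-implemented inline for illustration. It assumes `PRESERVE=0` and `TRANSFORM=1` as defined at the top of this file; the run-splitting loop stands in for `convert_to_linklist`:

```python
import re
import numpy as np

PRESERVE, TRANSFORM = 0, 1
text = r"Intro text \begin{equation} E = mc^2 \end{equation} outro text"
mask = np.full(len(text), TRANSFORM, dtype=np.uint8)

# flip every regex match to PRESERVE, exactly as set_forbidden_text does
for m in re.finditer(r"\\begin\{equation\}(.*?)\\end\{equation\}", text, re.DOTALL):
    mask[m.start():m.end()] = PRESERVE

# contiguous runs of equal mask value become linked-list nodes
runs = []
start = 0
for i in range(1, len(text) + 1):
    if i == len(text) or mask[i] != mask[start]:
        runs.append((text[start:i], mask[start] == PRESERVE))
        start = i
# runs -> [('Intro text ', False), ('\\begin{equation}...\\end{equation}', True), (' outro text', False)]
```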
但愿没人把latex模板放在里面传进来 (6.25 加入判定latex模板的代码) - """ - canidates = [] - for texf in file_manifest: - if os.path.basename(texf).startswith('merge'): - continue - with open(texf, 'r', encoding='utf8', errors='ignore') as f: - file_content = f.read() - if r'\documentclass' in file_content: - canidates.append(texf) - else: - continue - - if len(canidates) == 0: - raise RuntimeError('无法找到一个主Tex文件(包含documentclass关键字)') - elif len(canidates) == 1: - return canidates[0] - else: # if len(canidates) >= 2 通过一些Latex模板中常见(但通常不会出现在正文)的单词,对不同latex源文件扣分,取评分最高者返回 - canidates_score = [] - # 给出一些判定模板文档的词作为扣分项 - unexpected_words = ['\LaTeX', 'manuscript', 'Guidelines', 'font', 'citations', 'rejected', 'blind review', 'reviewers'] - expected_words = ['\input', '\ref', '\cite'] - for texf in canidates: - canidates_score.append(0) - with open(texf, 'r', encoding='utf8', errors='ignore') as f: - file_content = f.read() - file_content = rm_comments(file_content) - for uw in unexpected_words: - if uw in file_content: - canidates_score[-1] -= 1 - for uw in expected_words: - if uw in file_content: - canidates_score[-1] += 1 - select = np.argmax(canidates_score) # 取评分最高者返回 - return canidates[select] - -def rm_comments(main_file): - new_file_remove_comment_lines = [] - for l in main_file.splitlines(): - # 删除整行的空注释 - if l.lstrip().startswith("%"): - pass - else: - new_file_remove_comment_lines.append(l) - main_file = '\n'.join(new_file_remove_comment_lines) - # main_file = re.sub(r"\\include{(.*?)}", r"\\input{\1}", main_file) # 将 \include 命令转换为 \input 命令 - main_file = re.sub(r'(? 0 and node_string.count('\_') > final_tex.count('\_'): - # walk and replace any _ without \ - final_tex = re.sub(r"(?= limit_n_lines: # use a magical number 42 - this, this_mask = search_with_line_limit(this, this_mask) - mask[res.regs[2][0]:res.regs[2][1]] = this_mask - else: - mask[res.regs[0][0]:res.regs[0][1]] = PRESERVE - return text, mask - return search_with_line_limit(text, mask) - -class LinkedListNode(): - """ - Linked List Node - """ - def __init__(self, string, preserve=True) -> None: - self.string = string - self.preserve = preserve - self.next = None - # self.begin_line = 0 - # self.begin_char = 0 - -def convert_to_linklist(text, mask): - root = LinkedListNode("", preserve=True) - current_node = root - for c, m, i in zip(text, mask, range(len(text))): - if (m==PRESERVE and current_node.preserve) \ - or (m==TRANSFORM and not current_node.preserve): - # add - current_node.string += c - else: - current_node.next = LinkedListNode(c, preserve=(m==PRESERVE)) - current_node = current_node.next - return root -""" -======================================================================== -Latex Merge File -======================================================================== -""" - -def 寻找Latex主文件(file_manifest, mode): - """ - 在多Tex文档中,寻找主文件,必须包含documentclass,返回找到的第一个。 - P.S. 
但愿没人把latex模板放在里面传进来 (6.25 加入判定latex模板的代码) - """ - canidates = [] - for texf in file_manifest: - if os.path.basename(texf).startswith('merge'): - continue - with open(texf, 'r', encoding='utf8') as f: - file_content = f.read() - if r'\documentclass' in file_content: - canidates.append(texf) - else: - continue - - if len(canidates) == 0: - raise RuntimeError('无法找到一个主Tex文件(包含documentclass关键字)') - elif len(canidates) == 1: - return canidates[0] - else: # if len(canidates) >= 2 通过一些Latex模板中常见(但通常不会出现在正文)的单词,对不同latex源文件扣分,取评分最高者返回 - canidates_score = [] - # 给出一些判定模板文档的词作为扣分项 - unexpected_words = ['\LaTeX', 'manuscript', 'Guidelines', 'font', 'citations', 'rejected', 'blind review', 'reviewers'] - expected_words = ['\input', '\ref', '\cite'] - for texf in canidates: - canidates_score.append(0) - with open(texf, 'r', encoding='utf8') as f: - file_content = f.read() - for uw in unexpected_words: - if uw in file_content: - canidates_score[-1] -= 1 - for uw in expected_words: - if uw in file_content: - canidates_score[-1] += 1 - select = np.argmax(canidates_score) # 取评分最高者返回 - return canidates[select] - -def rm_comments(main_file): - new_file_remove_comment_lines = [] - for l in main_file.splitlines(): - # 删除整行的空注释 - if l.lstrip().startswith("%"): - pass - else: - new_file_remove_comment_lines.append(l) - main_file = '\n'.join(new_file_remove_comment_lines) - # main_file = re.sub(r"\\include{(.*?)}", r"\\input{\1}", main_file) # 将 \include 命令转换为 \input 命令 - main_file = re.sub(r'(? 0 and node_string.count('\_') > final_tex.count('\_'): - # walk and replace any _ without \ - final_tex = re.sub(r"(?') - if not node.preserve: - segment_parts_for_gpt.append(node.string) - f.write(f'
<p style="color:black;">#{show_html}#</p>')
-            else:
-                f.write(f'<p style="color:red;">{show_html}</p>
') - node = node.next - if node is None: break - - for n in nodes: n.next = None # break - return_dict['nodes'] = nodes - return_dict['segment_parts_for_gpt'] = segment_parts_for_gpt - return return_dict - - - -class LatexPaperSplit(): - """ - break down latex file to a linked list, - each node use a preserve flag to indicate whether it should - be proccessed by GPT. - """ - def __init__(self) -> None: - self.nodes = None - self.msg = "*{\\scriptsize\\textbf{警告:该PDF由GPT-Academic开源项目调用大语言模型+Latex翻译插件一键生成," + \ - "版权归原文作者所有。翻译内容可靠性无保障,请仔细鉴别并以原文为准。" + \ - "项目Github地址 \\url{https://github.com/binary-husky/gpt_academic/}。" - # 请您不要删除或修改这行警告,除非您是论文的原作者(如果您是论文原作者,欢迎加REAME中的QQ联系开发者) - self.msg_declare = "为了防止大语言模型的意外谬误产生扩散影响,禁止移除或修改此警告。}}\\\\" - - def merge_result(self, arr, mode, msg): - """ - Merge the result after the GPT process completed - """ - result_string = "" - p = 0 - for node in self.nodes: - if node.preserve: - result_string += node.string - else: - result_string += fix_content(arr[p], node.string) - p += 1 - if mode == 'translate_zh': - pattern = re.compile(r'\\begin\{abstract\}.*\n') - match = pattern.search(result_string) - if not match: - # match \abstract{xxxx} - pattern_compile = re.compile(r"\\abstract\{(.*?)\}", flags=re.DOTALL) - match = pattern_compile.search(result_string) - position = match.regs[1][0] - else: - # match \begin{abstract}xxxx\end{abstract} - position = match.end() - result_string = result_string[:position] + self.msg + msg + self.msg_declare + result_string[position:] - return result_string - - def split(self, txt, project_folder, opts): - """ - break down latex file to a linked list, - each node use a preserve flag to indicate whether it should - be proccessed by GPT. - P.S. use multiprocessing to avoid timeout error - """ - import multiprocessing - manager = multiprocessing.Manager() - return_dict = manager.dict() - p = multiprocessing.Process( - target=split_subprocess, - args=(txt, project_folder, return_dict, opts)) - p.start() - p.join() - p.close() - self.nodes = return_dict['nodes'] - self.sp = return_dict['segment_parts_for_gpt'] - return self.sp - - - -class LatexPaperFileGroup(): - """ - use tokenizer to break down text according to max_token_limit - """ - def __init__(self): - self.file_paths = [] - self.file_contents = [] - self.sp_file_contents = [] - self.sp_file_index = [] - self.sp_file_tag = [] - - # count_token - from request_llm.bridge_all import model_info - enc = model_info["gpt-3.5-turbo"]['tokenizer'] - def get_token_num(txt): return len(enc.encode(txt, disallowed_special=())) - self.get_token_num = get_token_num - - def run_file_split(self, max_token_limit=1900): - """ - use tokenizer to break down text according to max_token_limit - """ - for index, file_content in enumerate(self.file_contents): - if self.get_token_num(file_content) < max_token_limit: - self.sp_file_contents.append(file_content) - self.sp_file_index.append(index) - self.sp_file_tag.append(self.file_paths[index]) - else: - from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf - segments = breakdown_txt_to_satisfy_token_limit_for_pdf(file_content, self.get_token_num, max_token_limit) - for j, segment in enumerate(segments): - self.sp_file_contents.append(segment) - self.sp_file_index.append(index) - self.sp_file_tag.append(self.file_paths[index] + f".part-{j}.tex") - print('Segmentation: done') - - def merge_result(self): - self.file_result = ["" for _ in range(len(self.file_paths))] - for r, k in zip(self.sp_file_result, self.sp_file_index): - 
self.file_result[k] += r - - def write_result(self): - manifest = [] - for path, res in zip(self.file_paths, self.file_result): - with open(path + '.polish.tex', 'w', encoding='utf8') as f: - manifest.append(path + '.polish.tex') - f.write(res) - return manifest - -def write_html(sp_file_contents, sp_file_result, chatbot, project_folder): - - # write html - try: - import shutil - from .crazy_utils import construct_html - from toolbox import gen_time_str - ch = construct_html() - orig = "" - trans = "" - final = [] - for c,r in zip(sp_file_contents, sp_file_result): - final.append(c) - final.append(r) - for i, k in enumerate(final): - if i%2==0: - orig = k - if i%2==1: - trans = k - ch.add_row(a=orig, b=trans) - create_report_file_name = f"{gen_time_str()}.trans.html" - ch.save_file(create_report_file_name) - shutil.copyfile(pj('./gpt_log/', create_report_file_name), pj(project_folder, create_report_file_name)) - promote_file_to_downloadzone(file=f'./gpt_log/{create_report_file_name}', chatbot=chatbot) - except: - from toolbox import trimmed_format_exc - print('writing html result failed:', trimmed_format_exc()) - -def Latex精细分解与转化(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, mode='proofread', switch_prompt=None, opts=[]): - import time, os, re - from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency - from .latex_utils import LatexPaperFileGroup, merge_tex_files, LatexPaperSplit, 寻找Latex主文件 - - # <-------- 寻找主tex文件 ----------> - maintex = 寻找Latex主文件(file_manifest, mode) - chatbot.append((f"定位主Latex文件", f'[Local Message] 分析结果:该项目的Latex主文件是{maintex}, 如果分析错误, 请立即终止程序, 删除或修改歧义文件, 然后重试。主程序即将开始, 请稍候。')) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - time.sleep(3) - - # <-------- 读取Latex文件, 将多文件tex工程融合为一个巨型tex ----------> - main_tex_basename = os.path.basename(maintex) - assert main_tex_basename.endswith('.tex') - main_tex_basename_bare = main_tex_basename[:-4] - may_exist_bbl = pj(project_folder, f'{main_tex_basename_bare}.bbl') - if os.path.exists(may_exist_bbl): - shutil.copyfile(may_exist_bbl, pj(project_folder, f'merge.bbl')) - shutil.copyfile(may_exist_bbl, pj(project_folder, f'merge_{mode}.bbl')) - shutil.copyfile(may_exist_bbl, pj(project_folder, f'merge_diff.bbl')) - - with open(maintex, 'r', encoding='utf-8', errors='replace') as f: - content = f.read() - merged_content = merge_tex_files(project_folder, content, mode) - - with open(project_folder + '/merge.tex', 'w', encoding='utf-8', errors='replace') as f: - f.write(merged_content) - - # <-------- 精细切分latex文件 ----------> - chatbot.append((f"Latex文件融合完成", f'[Local Message] 正在精细切分latex文件,这需要一段时间计算,文档越长耗时越长,请耐心等待。')) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - lps = LatexPaperSplit() - res = lps.split(merged_content, project_folder, opts) # 消耗时间的函数 - - # <-------- 拆分过长的latex片段 ----------> - pfg = LatexPaperFileGroup() - for index, r in enumerate(res): - pfg.file_paths.append('segment-' + str(index)) - pfg.file_contents.append(r) - - pfg.run_file_split(max_token_limit=1024) - n_split = len(pfg.sp_file_contents) - - # <-------- 根据需要切换prompt ----------> - inputs_array, sys_prompt_array = switch_prompt(pfg, mode) - inputs_show_user_array = [f"{mode} {f}" for f in pfg.sp_file_tag] - - if os.path.exists(pj(project_folder,'temp.pkl')): - - # <-------- 【仅调试】如果存在调试缓存文件,则跳过GPT请求环节 ----------> - pfg = objload(file=pj(project_folder,'temp.pkl')) - - else: - # <-------- gpt 多线程请求 ----------> - gpt_response_collection = 
yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency( - inputs_array=inputs_array, - inputs_show_user_array=inputs_show_user_array, - llm_kwargs=llm_kwargs, - chatbot=chatbot, - history_array=[[""] for _ in range(n_split)], - sys_prompt_array=sys_prompt_array, - # max_workers=5, # 并行任务数量限制, 最多同时执行5个, 其他的排队等待 - scroller_max_len = 40 - ) - - # <-------- 文本碎片重组为完整的tex片段 ----------> - pfg.sp_file_result = [] - for i_say, gpt_say, orig_content in zip(gpt_response_collection[0::2], gpt_response_collection[1::2], pfg.sp_file_contents): - pfg.sp_file_result.append(gpt_say) - pfg.merge_result() - - # <-------- 临时存储用于调试 ----------> - pfg.get_token_num = None - objdump(pfg, file=pj(project_folder,'temp.pkl')) - - write_html(pfg.sp_file_contents, pfg.sp_file_result, chatbot=chatbot, project_folder=project_folder) - - # <-------- 写出文件 ----------> - msg = f"当前大语言模型: {llm_kwargs['llm_model']},当前语言模型温度设定: {llm_kwargs['temperature']}。" - final_tex = lps.merge_result(pfg.file_result, mode, msg) - with open(project_folder + f'/merge_{mode}.tex', 'w', encoding='utf-8', errors='replace') as f: - if mode != 'translate_zh' or "binary" in final_tex: f.write(final_tex) - - - # <-------- 整理结果, 退出 ----------> - chatbot.append((f"完成了吗?", 'GPT结果已输出, 正在编译PDF')) - yield from update_ui(chatbot=chatbot, history=history) # 刷新界面 - - # <-------- 返回 ----------> - return project_folder + f'/merge_{mode}.tex' - - - -def remove_buggy_lines(file_path, log_path, tex_name, tex_name_pure, n_fix, work_folder_modified): - try: - with open(log_path, 'r', encoding='utf-8', errors='replace') as f: - log = f.read() - with open(file_path, 'r', encoding='utf-8', errors='replace') as f: - file_lines = f.readlines() - import re - buggy_lines = re.findall(tex_name+':([0-9]{1,5}):', log) - buggy_lines = [int(l) for l in buggy_lines] - buggy_lines = sorted(buggy_lines) - print("removing lines that has errors", buggy_lines) - file_lines.pop(buggy_lines[0]-1) - with open(pj(work_folder_modified, f"{tex_name_pure}_fix_{n_fix}.tex"), 'w', encoding='utf-8', errors='replace') as f: - f.writelines(file_lines) - return True, f"{tex_name_pure}_fix_{n_fix}", buggy_lines - except: - print("Fatal error occurred, but we cannot identify error, please download zip, read latex log, and compile manually.") - return False, -1, [-1] - -def compile_latex_with_timeout(command, cwd, timeout=60): - import subprocess - process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=cwd) - try: - stdout, stderr = process.communicate(timeout=timeout) - except subprocess.TimeoutExpired: - process.kill() - stdout, stderr = process.communicate() - print("Process timed out!") - return False - return True - -def 编译Latex(chatbot, history, main_file_original, main_file_modified, work_folder_original, work_folder_modified, work_folder, mode='default'): - import os, time - current_dir = os.getcwd() - n_fix = 1 - max_try = 32 - chatbot.append([f"正在编译PDF文档", f'编译已经开始。当前工作路径为{work_folder},如果程序停顿5分钟以上,请直接去该路径下取回翻译结果,或者重启之后再度尝试 ...']); yield from update_ui(chatbot=chatbot, history=history) - chatbot.append([f"正在编译PDF文档", '...']); yield from update_ui(chatbot=chatbot, history=history); time.sleep(1); chatbot[-1] = list(chatbot[-1]) # 刷新界面 - yield from update_ui_lastest_msg('编译已经开始...', chatbot, history) # 刷新Gradio前端界面 - - while True: - import os - - # https://stackoverflow.com/questions/738755/dont-make-me-manually-abort-a-latex-compile-when-theres-an-error - yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 
编译原始PDF ...', chatbot, history) # 刷新Gradio前端界面 - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original) - - yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译转化后的PDF ...', chatbot, history) # 刷新Gradio前端界面 - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified) - - if ok and os.path.exists(pj(work_folder_modified, f'{main_file_modified}.pdf')): - # 只有第二步成功,才能继续下面的步骤 - yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译BibTex ...', chatbot, history) # 刷新Gradio前端界面 - if not os.path.exists(pj(work_folder_original, f'{main_file_original}.bbl')): - ok = compile_latex_with_timeout(f'bibtex {main_file_original}.aux', work_folder_original) - if not os.path.exists(pj(work_folder_modified, f'{main_file_modified}.bbl')): - ok = compile_latex_with_timeout(f'bibtex {main_file_modified}.aux', work_folder_modified) - - yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 编译文献交叉引用 ...', chatbot, history) # 刷新Gradio前端界面 - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original) - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified) - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_original}.tex', work_folder_original) - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error {main_file_modified}.tex', work_folder_modified) - - if mode!='translate_zh': - yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 使用latexdiff生成论文转化前后对比 ...', chatbot, history) # 刷新Gradio前端界面 - print( f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex') - ok = compile_latex_with_timeout(f'latexdiff --encoding=utf8 --append-safecmd=subfile {work_folder_original}/{main_file_original}.tex {work_folder_modified}/{main_file_modified}.tex --flatten > {work_folder}/merge_diff.tex') - - yield from update_ui_lastest_msg(f'尝试第 {n_fix}/{max_try} 次编译, 正在编译对比PDF ...', chatbot, history) # 刷新Gradio前端界面 - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder) - ok = compile_latex_with_timeout(f'bibtex merge_diff.aux', work_folder) - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder) - ok = compile_latex_with_timeout(f'pdflatex -interaction=batchmode -file-line-error merge_diff.tex', work_folder) - - - # <---------- 检查结果 -----------> - results_ = "" - original_pdf_success = os.path.exists(pj(work_folder_original, f'{main_file_original}.pdf')) - modified_pdf_success = os.path.exists(pj(work_folder_modified, f'{main_file_modified}.pdf')) - diff_pdf_success = os.path.exists(pj(work_folder, f'merge_diff.pdf')) - results_ += f"原始PDF编译是否成功: {original_pdf_success};" - results_ += f"转化PDF编译是否成功: {modified_pdf_success};" - results_ += f"对比PDF编译是否成功: {diff_pdf_success};" - yield from update_ui_lastest_msg(f'第{n_fix}编译结束:
{results_}...', chatbot, history) # 刷新Gradio前端界面 - - if diff_pdf_success: - result_pdf = pj(work_folder_modified, f'merge_diff.pdf') # get pdf path - promote_file_to_downloadzone(result_pdf, rename_file=None, chatbot=chatbot) # promote file to web UI - if modified_pdf_success: - yield from update_ui_lastest_msg(f'转化PDF编译已经成功, 即将退出 ...', chatbot, history) # 刷新Gradio前端界面 - result_pdf = pj(work_folder_modified, f'{main_file_modified}.pdf') # get pdf path - if os.path.exists(pj(work_folder, '..', 'translation')): - shutil.copyfile(result_pdf, pj(work_folder, '..', 'translation', 'translate_zh.pdf')) - promote_file_to_downloadzone(result_pdf, rename_file=None, chatbot=chatbot) # promote file to web UI - return True # 成功啦 - else: - if n_fix>=max_try: break - n_fix += 1 - can_retry, main_file_modified, buggy_lines = remove_buggy_lines( - file_path=pj(work_folder_modified, f'{main_file_modified}.tex'), - log_path=pj(work_folder_modified, f'{main_file_modified}.log'), - tex_name=f'{main_file_modified}.tex', - tex_name_pure=f'{main_file_modified}', - n_fix=n_fix, - work_folder_modified=work_folder_modified, - ) - yield from update_ui_lastest_msg(f'由于最为关键的转化PDF编译失败, 将根据报错信息修正tex源文件并重试, 当前报错的latex代码处于第{buggy_lines}行 ...', chatbot, history) # 刷新Gradio前端界面 - if not can_retry: break - - return False # 失败啦 - - - diff --git a/crazy_functions/live_audio/aliyunASR.py b/crazy_functions/live_audio/aliyunASR.py deleted file mode 100644 index cba4c01f86be93b4fbb7ef474330a6a104c59431..0000000000000000000000000000000000000000 --- a/crazy_functions/live_audio/aliyunASR.py +++ /dev/null @@ -1,261 +0,0 @@ -import time, logging, json, sys, struct -import numpy as np -from scipy.io.wavfile import WAVE_FORMAT - -def write_numpy_to_wave(filename, rate, data, add_header=False): - """ - Write a NumPy array as a WAV file. - """ - def _array_tofile(fid, data): - # ravel gives a c-contiguous buffer - fid.write(data.ravel().view('b').data) - - if hasattr(filename, 'write'): - fid = filename - else: - fid = open(filename, 'wb') - - fs = rate - - try: - dkind = data.dtype.kind - if not (dkind == 'i' or dkind == 'f' or (dkind == 'u' and - data.dtype.itemsize == 1)): - raise ValueError("Unsupported data type '%s'" % data.dtype) - - header_data = b'' - - header_data += b'RIFF' - header_data += b'\x00\x00\x00\x00' - header_data += b'WAVE' - - # fmt chunk - header_data += b'fmt ' - if dkind == 'f': - format_tag = WAVE_FORMAT.IEEE_FLOAT - else: - format_tag = WAVE_FORMAT.PCM - if data.ndim == 1: - channels = 1 - else: - channels = data.shape[1] - bit_depth = data.dtype.itemsize * 8 - bytes_per_second = fs*(bit_depth // 8)*channels - block_align = channels * (bit_depth // 8) - - fmt_chunk_data = struct.pack(' 0xFFFFFFFF: - raise ValueError("Data exceeds wave file size limit") - if add_header: - fid.write(header_data) - # data chunk - fid.write(b'data') - fid.write(struct.pack('' or (data.dtype.byteorder == '=' and - sys.byteorder == 'big'): - data = data.byteswap() - _array_tofile(fid, data) - - if add_header: - # Determine file size and place it in correct - # position at start of the file. 
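# RIFF back-patch: the 4 bytes at offset 4 of a WAV file hold the total file
# size minus the 8 bytes of the 'RIFF' tag and the size field itself, a value
# that is only known after the data chunk has been written; hence the seek
# back to offset 4 below.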
- size = fid.tell() - fid.seek(4) - fid.write(struct.pack('{}".format(args)) - pass - - def test_on_close(self, *args): - self.aliyun_service_ok = False - pass - - def test_on_result_chg(self, message, *args): - # print("test_on_chg:{}".format(message)) - message = json.loads(message) - self.parsed_text = message['payload']['result'] - self.event_on_result_chg.set() - - def test_on_completed(self, message, *args): - # print("on_completed:args=>{} message=>{}".format(args, message)) - pass - - def audio_convertion_thread(self, uuid): - # 在一个异步线程中采集音频 - import nls # pip install git+https://github.com/aliyun/alibabacloud-nls-python-sdk.git - import tempfile - from scipy import io - from toolbox import get_conf - from .audio_io import change_sample_rate - from .audio_io import RealtimeAudioDistribution - NEW_SAMPLERATE = 16000 - rad = RealtimeAudioDistribution() - rad.clean_up() - temp_folder = tempfile.gettempdir() - TOKEN, APPKEY = get_conf('ALIYUN_TOKEN', 'ALIYUN_APPKEY') - if len(TOKEN) == 0: - TOKEN = self.get_token() - self.aliyun_service_ok = True - URL="wss://nls-gateway.aliyuncs.com/ws/v1" - sr = nls.NlsSpeechTranscriber( - url=URL, - token=TOKEN, - appkey=APPKEY, - on_sentence_begin=self.test_on_sentence_begin, - on_sentence_end=self.test_on_sentence_end, - on_start=self.test_on_start, - on_result_changed=self.test_on_result_chg, - on_completed=self.test_on_completed, - on_error=self.test_on_error, - on_close=self.test_on_close, - callback_args=[uuid.hex] - ) - timeout_limit_second = 20 - r = sr.start(aformat="pcm", - timeout=timeout_limit_second, - enable_intermediate_result=True, - enable_punctuation_prediction=True, - enable_inverse_text_normalization=True) - - import webrtcvad - vad = webrtcvad.Vad() - vad.set_mode(1) - - is_previous_frame_transmitted = False # 上一帧是否有人说话 - previous_frame_data = None - echo_cnt = 0 # 在没有声音之后,继续向服务器发送n次音频数据 - echo_cnt_max = 4 # 在没有声音之后,继续向服务器发送n次音频数据 - keep_alive_last_send_time = time.time() - while not self.stop: - # time.sleep(self.capture_interval) - audio = rad.read(uuid.hex) - if audio is not None: - # convert to pcm file - temp_file = f'{temp_folder}/{uuid.hex}.pcm' # - dsdata = change_sample_rate(audio, rad.rate, NEW_SAMPLERATE) # 48000 --> 16000 - write_numpy_to_wave(temp_file, NEW_SAMPLERATE, dsdata) - # read pcm binary - with open(temp_file, "rb") as f: data = f.read() - is_speaking, info = is_speaker_speaking(vad, data, NEW_SAMPLERATE) - - if is_speaking or echo_cnt > 0: - # 如果话筒激活 / 如果处于回声收尾阶段 - echo_cnt -= 1 - if not is_previous_frame_transmitted: # 上一帧没有人声,但是我们把上一帧同样加上 - if previous_frame_data is not None: data = previous_frame_data + data - if is_speaking: - echo_cnt = echo_cnt_max - slices = zip(*(iter(data),) * 640) # 640个字节为一组 - for i in slices: sr.send_audio(bytes(i)) - keep_alive_last_send_time = time.time() - is_previous_frame_transmitted = True - else: - is_previous_frame_transmitted = False - echo_cnt = 0 - # 保持链接激活,即使没有声音,也根据时间间隔,发送一些音频片段给服务器 - if time.time() - keep_alive_last_send_time > timeout_limit_second/2: - slices = zip(*(iter(data),) * 640) # 640个字节为一组 - for i in slices: sr.send_audio(bytes(i)) - keep_alive_last_send_time = time.time() - is_previous_frame_transmitted = True - self.audio_shape = info - else: - time.sleep(0.1) - - if not self.aliyun_service_ok: - self.stop = True - self.stop_msg = 'Aliyun音频服务异常,请检查ALIYUN_TOKEN和ALIYUN_APPKEY是否过期。' - r = sr.stop() - - def get_token(self): - from toolbox import get_conf - import json - from aliyunsdkcore.request import CommonRequest - from aliyunsdkcore.client import 
AcsClient - AccessKey_ID, AccessKey_secret = get_conf('ALIYUN_ACCESSKEY', 'ALIYUN_SECRET') - - # 创建AcsClient实例 - client = AcsClient( - AccessKey_ID, - AccessKey_secret, - "cn-shanghai" - ) - - # 创建request,并设置参数。 - request = CommonRequest() - request.set_method('POST') - request.set_domain('nls-meta.cn-shanghai.aliyuncs.com') - request.set_version('2019-02-28') - request.set_action_name('CreateToken') - - try: - response = client.do_action_with_exception(request) - print(response) - jss = json.loads(response) - if 'Token' in jss and 'Id' in jss['Token']: - token = jss['Token']['Id'] - expireTime = jss['Token']['ExpireTime'] - print("token = " + token) - print("expireTime = " + str(expireTime)) - except Exception as e: - print(e) - - return token diff --git a/crazy_functions/live_audio/audio_io.py b/crazy_functions/live_audio/audio_io.py deleted file mode 100644 index 00fd3f2d846ccf20eb300b796bb91842315e3482..0000000000000000000000000000000000000000 --- a/crazy_functions/live_audio/audio_io.py +++ /dev/null @@ -1,51 +0,0 @@ -import numpy as np -from scipy import interpolate - -def Singleton(cls): - _instance = {} - - def _singleton(*args, **kargs): - if cls not in _instance: - _instance[cls] = cls(*args, **kargs) - return _instance[cls] - - return _singleton - - -@Singleton -class RealtimeAudioDistribution(): - def __init__(self) -> None: - self.data = {} - self.max_len = 1024*1024 - self.rate = 48000 # 只读,每秒采样数量 - - def clean_up(self): - self.data = {} - - def feed(self, uuid, audio): - self.rate, audio_ = audio - # print('feed', len(audio_), audio_[-25:]) - if uuid not in self.data: - self.data[uuid] = audio_ - else: - new_arr = np.concatenate((self.data[uuid], audio_)) - if len(new_arr) > self.max_len: new_arr = new_arr[-self.max_len:] - self.data[uuid] = new_arr - - def read(self, uuid): - if uuid in self.data: - res = self.data.pop(uuid) - # print('\r read-', len(res), '-', max(res), end='', flush=True) - else: - res = None - return res - -def change_sample_rate(audio, old_sr, new_sr): - duration = audio.shape[0] / old_sr - - time_old = np.linspace(0, duration, audio.shape[0]) - time_new = np.linspace(0, duration, int(audio.shape[0] * new_sr / old_sr)) - - interpolator = interpolate.interp1d(time_old, audio.T) - new_audio = interpolator(time_new).T - return new_audio.astype(np.int16) \ No newline at end of file diff --git a/crazy_functions/multi_stage/multi_stage_utils.py b/crazy_functions/multi_stage/multi_stage_utils.py deleted file mode 100644 index 1395e79ff132de3622d2dd3b3867f3916399e061..0000000000000000000000000000000000000000 --- a/crazy_functions/multi_stage/multi_stage_utils.py +++ /dev/null @@ -1,93 +0,0 @@ -from pydantic import BaseModel, Field -from typing import List -from toolbox import update_ui_lastest_msg, disable_auto_promotion -from toolbox import CatchException, update_ui, get_conf, select_api_key, get_log_folder -from request_llms.bridge_all import predict_no_ui_long_connection -from crazy_functions.json_fns.pydantic_io import GptJsonIO, JsonStringError -import time -import pickle - -def have_any_recent_upload_files(chatbot): - _5min = 5 * 60 - if not chatbot: return False # chatbot is None - most_recent_uploaded = chatbot._cookies.get("most_recent_uploaded", None) - if not most_recent_uploaded: return False # most_recent_uploaded is None - if time.time() - most_recent_uploaded["time"] < _5min: return True # most_recent_uploaded is new - else: return False # most_recent_uploaded is too old - -class GptAcademicState(): - def __init__(self): - self.reset() - - def 
diff --git a/crazy_functions/multi_stage/multi_stage_utils.py b/crazy_functions/multi_stage/multi_stage_utils.py
deleted file mode 100644
index 1395e79ff132de3622d2dd3b3867f3916399e061..0000000000000000000000000000000000000000
--- a/crazy_functions/multi_stage/multi_stage_utils.py
+++ /dev/null
@@ -1,93 +0,0 @@
-from pydantic import BaseModel, Field
-from typing import List
-from toolbox import update_ui_lastest_msg, disable_auto_promotion
-from toolbox import CatchException, update_ui, get_conf, select_api_key, get_log_folder
-from request_llms.bridge_all import predict_no_ui_long_connection
-from crazy_functions.json_fns.pydantic_io import GptJsonIO, JsonStringError
-import time
-import pickle
-
-def have_any_recent_upload_files(chatbot):
-    _5min = 5 * 60
-    if not chatbot: return False  # chatbot is None
-    most_recent_uploaded = chatbot._cookies.get("most_recent_uploaded", None)
-    if not most_recent_uploaded: return False  # most_recent_uploaded is None
-    if time.time() - most_recent_uploaded["time"] < _5min: return True  # most_recent_uploaded is new
-    else: return False  # most_recent_uploaded is too old
-
-class GptAcademicState():
-    def __init__(self):
-        self.reset()
-
-    def reset(self):
-        pass
-
-    def dump_state(self, chatbot):
-        chatbot._cookies['plugin_state'] = pickle.dumps(self)
-
-    def set_state(self, chatbot, key, value):
-        setattr(self, key, value)
-        chatbot._cookies['plugin_state'] = pickle.dumps(self)
-
-    @staticmethod
-    def get_state(chatbot, cls=None):
-        state = chatbot._cookies.get('plugin_state', None)
-        if state is not None: state = pickle.loads(state)
-        elif cls is not None: state = cls()
-        else: state = GptAcademicState()
-        state.chatbot = chatbot
-        return state
-
-
-class GptAcademicGameBaseState():
-    """
-    1. first init: __init__ ->
-    """
-    def init_game(self, chatbot, lock_plugin):
-        self.plugin_name = None
-        self.callback_fn = None
-        self.delete_game = False
-        self.step_cnt = 0
-
-    def lock_plugin(self, chatbot):
-        if self.callback_fn is None:
-            raise ValueError("callback_fn is None")
-        chatbot._cookies['lock_plugin'] = self.callback_fn
-        self.dump_state(chatbot)
-
-    def get_plugin_name(self):
-        if self.plugin_name is None:
-            raise ValueError("plugin_name is None")
-        return self.plugin_name
-
-    def dump_state(self, chatbot):
-        chatbot._cookies[f'plugin_state/{self.get_plugin_name()}'] = pickle.dumps(self)
-
-    def set_state(self, chatbot, key, value):
-        setattr(self, key, value)
-        chatbot._cookies[f'plugin_state/{self.get_plugin_name()}'] = pickle.dumps(self)
-
-    @staticmethod
-    def sync_state(chatbot, llm_kwargs, cls, plugin_name, callback_fn, lock_plugin=True):
-        state = chatbot._cookies.get(f'plugin_state/{plugin_name}', None)
-        if state is not None:
-            state = pickle.loads(state)
-        else:
-            state = cls()
-            state.init_game(chatbot, lock_plugin)
-        state.plugin_name = plugin_name
-        state.llm_kwargs = llm_kwargs
-        state.chatbot = chatbot
-        state.callback_fn = callback_fn
-        return state
-
-    def continue_game(self, prompt, chatbot, history):
-        # main body of the game
-        yield from self.step(prompt, chatbot, history)
-        self.step_cnt += 1
-        # save state and wrap up
-        self.dump_state(chatbot)
-        # if the game has ended, clean up
-        if self.delete_game:
-            chatbot._cookies['lock_plugin'] = None
-            chatbot._cookies[f'plugin_state/{self.get_plugin_name()}'] = None
-        yield from update_ui(chatbot=chatbot, history=history)
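Both state classes persist themselves by pickling into `chatbot._cookies`, so plugin state survives across UI callbacks without globals. A hedged round-trip sketch; the `FakeChatbot` stand-in is an assumption (only the `_cookies` dict is actually exercised by the helpers above):

```python
# Cookie-pickle round trip for GptAcademicState. FakeChatbot is hypothetical;
# the real framework passes its own cookie-carrying chatbot object.
class FakeChatbot(list):
    _cookies = {}

class CounterState(GptAcademicState):
    def reset(self):
        self.clicks = 0

chatbot = FakeChatbot()
state = GptAcademicState.get_state(chatbot, cls=CounterState)  # no saved state -> fresh CounterState
state.set_state(chatbot, 'clicks', state.clicks + 1)           # mutates, then re-pickles into cookies
state2 = GptAcademicState.get_state(chatbot)                   # next callback: unpickles the same state
print(state2.clicks)  # 1
```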
""" 文本切分 - """ - res = [] - total_len = len(txt_tocut) - fin_len = 0 - remain_txt_to_cut = txt_tocut - remain_txt_to_cut_storage = "" - # 为了加速计算,我们采样一个特殊的手段。当 remain_txt_to_cut > `_max` 时, 我们把 _max 后的文字转存至 remain_txt_to_cut_storage - remain_txt_to_cut, remain_txt_to_cut_storage = maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage) - - while True: - if get_token_fn(remain_txt_to_cut) <= limit: - # 如果剩余文本的token数小于限制,那么就不用切了 - res.append(remain_txt_to_cut); fin_len+=len(remain_txt_to_cut) - break - else: - # 如果剩余文本的token数大于限制,那么就切 - lines = remain_txt_to_cut.split('\n') - - # 估计一个切分点 - estimated_line_cut = limit / get_token_fn(remain_txt_to_cut) * len(lines) - estimated_line_cut = int(estimated_line_cut) - - # 开始查找合适切分点的偏移(cnt) - cnt = 0 - for cnt in reversed(range(estimated_line_cut)): - if must_break_at_empty_line: - # 首先尝试用双空行(\n\n)作为切分点 - if lines[cnt] != "": - continue - prev = "\n".join(lines[:cnt]) - post = "\n".join(lines[cnt:]) - if get_token_fn(prev) < limit: - break - - if cnt == 0: - # 如果没有找到合适的切分点 - if break_anyway: - # 是否允许暴力切分 - prev, post = force_breakdown(txt_tocut, limit, get_token_fn) - else: - # 不允许直接报错 - raise RuntimeError(f"存在一行极长的文本!{txt_tocut}") - - # 追加列表 - res.append(prev); fin_len+=len(prev) - # 准备下一次迭代 - remain_txt_to_cut = post - remain_txt_to_cut, remain_txt_to_cut_storage = maintain_storage(remain_txt_to_cut, remain_txt_to_cut_storage) - process = fin_len/total_len - print(f'正在文本切分 {int(process*100)}%') - if len(remain_txt_to_cut.strip()) == 0: - break - return res - - -def breakdown_text_to_satisfy_token_limit_(txt, limit, llm_model="gpt-3.5-turbo"): - """ 使用多种方式尝试切分文本,以满足 token 限制 - """ - from request_llms.bridge_all import model_info - enc = model_info[llm_model]['tokenizer'] - def get_token_fn(txt): return len(enc.encode(txt, disallowed_special=())) - try: - # 第1次尝试,将双空行(\n\n)作为切分点 - return cut(limit, get_token_fn, txt, must_break_at_empty_line=True) - except RuntimeError: - try: - # 第2次尝试,将单空行(\n)作为切分点 - return cut(limit, get_token_fn, txt, must_break_at_empty_line=False) - except RuntimeError: - try: - # 第3次尝试,将英文句号(.)作为切分点 - res = cut(limit, get_token_fn, txt.replace('.', '。\n'), must_break_at_empty_line=False) # 这个中文的句号是故意的,作为一个标识而存在 - return [r.replace('。\n', '.') for r in res] - except RuntimeError as e: - try: - # 第4次尝试,将中文句号(。)作为切分点 - res = cut(limit, get_token_fn, txt.replace('。', '。。\n'), must_break_at_empty_line=False) - return [r.replace('。。\n', '。') for r in res] - except RuntimeError as e: - # 第5次尝试,没办法了,随便切一下吧 - return cut(limit, get_token_fn, txt, must_break_at_empty_line=False, break_anyway=True) - -breakdown_text_to_satisfy_token_limit = run_in_subprocess_with_timeout(breakdown_text_to_satisfy_token_limit_, timeout=60) - -if __name__ == '__main__': - from crazy_functions.crazy_utils import read_and_clean_pdf_text - file_content, page_one = read_and_clean_pdf_text("build/assets/at.pdf") - - from request_llms.bridge_all import model_info - for i in range(5): - file_content += file_content - - print(len(file_content)) - TOKEN_LIMIT_PER_FRAGMENT = 2500 - res = breakdown_text_to_satisfy_token_limit(file_content, TOKEN_LIMIT_PER_FRAGMENT) - diff --git a/crazy_functions/pdf_fns/parse_pdf.py b/crazy_functions/pdf_fns/parse_pdf.py deleted file mode 100644 index fa27de516feb735c0ac92ffa02be97164343d8cf..0000000000000000000000000000000000000000 --- a/crazy_functions/pdf_fns/parse_pdf.py +++ /dev/null @@ -1,171 +0,0 @@ -from functools import lru_cache -from toolbox import gen_time_str -from toolbox import promote_file_to_downloadzone -from 
diff --git a/crazy_functions/pdf_fns/parse_pdf.py b/crazy_functions/pdf_fns/parse_pdf.py
deleted file mode 100644
index fa27de516feb735c0ac92ffa02be97164343d8cf..0000000000000000000000000000000000000000
--- a/crazy_functions/pdf_fns/parse_pdf.py
+++ /dev/null
@@ -1,171 +0,0 @@
-from functools import lru_cache
-from toolbox import gen_time_str
-from toolbox import promote_file_to_downloadzone
-from toolbox import write_history_to_file, promote_file_to_downloadzone
-from toolbox import get_conf
-from toolbox import ProxyNetworkActivate
-from colorful import *
-import requests
-import random
-import copy
-import os
-import math
-
-class GROBID_OFFLINE_EXCEPTION(Exception): pass
-
-def get_avail_grobid_url():
-    GROBID_URLS = get_conf('GROBID_URLS')
-    if len(GROBID_URLS) == 0: return None
-    try:
-        _grobid_url = random.choice(GROBID_URLS)  # random load balancing
-        if _grobid_url.endswith('/'): _grobid_url = _grobid_url.rstrip('/')
-        with ProxyNetworkActivate('Connect_Grobid'):
-            res = requests.get(_grobid_url + '/api/isalive')
-        if res.text == 'true': return _grobid_url
-        else: return None
-    except:
-        return None
-
-@lru_cache(maxsize=32)
-def parse_pdf(pdf_path, grobid_url):
-    import scipdf  # pip install scipdf_parser
-    if grobid_url.endswith('/'): grobid_url = grobid_url.rstrip('/')
-    try:
-        with ProxyNetworkActivate('Connect_Grobid'):
-            article_dict = scipdf.parse_pdf_to_dict(pdf_path, grobid_url=grobid_url)
-    except GROBID_OFFLINE_EXCEPTION:
-        raise GROBID_OFFLINE_EXCEPTION("GROBID服务不可用,请修改config中的GROBID_URL,可修改成本地GROBID服务。")
-    except:
-        raise RuntimeError("解析PDF失败,请检查PDF是否损坏。")
-    return article_dict
-
-
-def produce_report_markdown(gpt_response_collection, meta, paper_meta_info, chatbot, fp, generated_conclusion_files):
-    # -=-=-=-=-=-=-=-= Output file 1: translation and original mixed together -=-=-=-=-=-=-=-=
-    res_path = write_history_to_file(meta + ["# Meta Translation", paper_meta_info] + gpt_response_collection, file_basename=f"{gen_time_str()}translated_and_original.md", file_fullname=None)
-    promote_file_to_downloadzone(res_path, rename_file=os.path.basename(res_path)+'.md', chatbot=chatbot)
-    generated_conclusion_files.append(res_path)
-
-    # -=-=-=-=-=-=-=-= Output file 2: translated text only -=-=-=-=-=-=-=-=
-    translated_res_array = []
-    # track the current top-level section heading:
-    last_section_name = ""
-    for index, value in enumerate(gpt_response_collection):
-        # the GPT responses sit at the odd indices:
-        if index % 2 != 0:
-            # extract the English heading of the current section:
-            cur_section_name = gpt_response_collection[index-1].split('\n')[0].split(" Part")[0]
-            # emit the section name whenever it changes (the first section always counts as a change):
-            if cur_section_name != last_section_name:
-                cur_value = cur_section_name + '\n'
-                last_section_name = copy.deepcopy(cur_section_name)
-            else:
-                cur_value = ""
-            # one small tweak: rewrite the current part's heading, defaulting to the English one
-            cur_value += value
-            translated_res_array.append(cur_value)
-    res_path = write_history_to_file(meta + ["# Meta Translation", paper_meta_info] + translated_res_array,
-                                     file_basename=f"{gen_time_str()}-translated_only.md",
-                                     file_fullname=None,
-                                     auto_caption=False)
-    promote_file_to_downloadzone(res_path, rename_file=os.path.basename(res_path)+'.md', chatbot=chatbot)
-    generated_conclusion_files.append(res_path)
-    return res_path
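`get_avail_grobid_url` probes `/api/isalive` on a randomly chosen configured endpoint, and the LRU-cached `parse_pdf` hands the file to scipdf/GROBID. A minimal driver sketch tying the two together; it assumes a live GROBID endpoint is configured in `GROBID_URLS` and that a `paper.pdf` exists locally:

```python
# Hedged driver combining the two helpers above.
grobid_url = get_avail_grobid_url()
if grobid_url is None:
    raise GROBID_OFFLINE_EXCEPTION("no live GROBID endpoint configured")
article_dict = parse_pdf("paper.pdf", grobid_url)
print(article_dict['title'], len(article_dict['sections']))
```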
-def translate_pdf(article_dict, llm_kwargs, chatbot, fp, generated_conclusion_files, TOKEN_LIMIT_PER_FRAGMENT, DST_LANG):
-    from crazy_functions.pdf_fns.report_gen_html import construct_html
-    from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
-    from crazy_functions.crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
-    from crazy_functions.crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
-
-    prompt = "以下是一篇学术论文的基本信息:\n"
-    # title
-    title = article_dict.get('title', '无法获取 title'); prompt += f'title:{title}\n\n'
-    # authors
-    authors = article_dict.get('authors', '无法获取 authors')[:100]; prompt += f'authors:{authors}\n\n'
-    # abstract
-    abstract = article_dict.get('abstract', '无法获取 abstract'); prompt += f'abstract:{abstract}\n\n'
-    # command
-    prompt += f"请将题目和摘要翻译为{DST_LANG}。"
-    meta = [f'# Title:\n\n', title, f'# Abstract:\n\n', abstract]
-
-    # single thread: fetch the paper's meta information
-    paper_meta_info = yield from request_gpt_model_in_new_thread_with_ui_alive(
-        inputs=prompt,
-        inputs_show_user=prompt,
-        llm_kwargs=llm_kwargs,
-        chatbot=chatbot, history=[],
-        sys_prompt="You are an academic paper reader.",
-    )
-
-    # multi-threaded translation
-    inputs_array = []
-    inputs_show_user_array = []
-
-    # get_token_num
-    from request_llms.bridge_all import model_info
-    enc = model_info[llm_kwargs['llm_model']]['tokenizer']
-    def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
-
-    def break_down(txt):
-        raw_token_num = get_token_num(txt)
-        if raw_token_num <= TOKEN_LIMIT_PER_FRAGMENT:
-            return [txt]
-        else:
-            # raw_token_num > TOKEN_LIMIT_PER_FRAGMENT
-            # find a smooth token limit to achieve even separation
-            count = int(math.ceil(raw_token_num / TOKEN_LIMIT_PER_FRAGMENT))
-            token_limit_smooth = raw_token_num // count + count
-            return breakdown_text_to_satisfy_token_limit(txt, limit=token_limit_smooth, llm_model=llm_kwargs['llm_model'])
-
-    for section in article_dict.get('sections'):
-        if len(section['text']) == 0: continue
-        section_frags = break_down(section['text'])
-        for i, fragment in enumerate(section_frags):
-            heading = section['heading']
-            if len(section_frags) > 1: heading += f' Part-{i+1}'
-            inputs_array.append(
-                f"你需要翻译{heading}章节,内容如下: \n\n{fragment}"
-            )
-            inputs_show_user_array.append(
-                f"# {heading}\n\n{fragment}"
-            )
-
-    gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
-        inputs_array=inputs_array,
-        inputs_show_user_array=inputs_show_user_array,
-        llm_kwargs=llm_kwargs,
-        chatbot=chatbot,
-        history_array=[meta for _ in inputs_array],
-        sys_prompt_array=[
-            "请你作为一个学术翻译,负责把学术论文准确翻译成中文。注意文章中的每一句话都要翻译。" for _ in inputs_array],
-    )
-    # -=-=-=-=-=-=-=-= Write out the Markdown files -=-=-=-=-=-=-=-=
-    produce_report_markdown(gpt_response_collection, meta, paper_meta_info, chatbot, fp, generated_conclusion_files)
-
-    # -=-=-=-=-=-=-=-= Write out the HTML file -=-=-=-=-=-=-=-=
-    ch = construct_html()
-    orig = ""
-    trans = ""
-    gpt_response_collection_html = copy.deepcopy(gpt_response_collection)
-    for i, k in enumerate(gpt_response_collection_html):
-        if i % 2 == 0:
-            gpt_response_collection_html[i] = inputs_show_user_array[i//2]
-        else:
-            # extract the English heading of the current section:
-            cur_section_name = gpt_response_collection[i-1].split('\n')[0].split(" Part")[0]
-            cur_value = cur_section_name + "\n" + gpt_response_collection_html[i]
-            gpt_response_collection_html[i] = cur_value
-
-    final = ["", "", "一、论文概况", "", "Abstract", paper_meta_info, "二、论文翻译", ""]
-    final.extend(gpt_response_collection_html)
-    for i, k in enumerate(final):
-        if i % 2 == 0:
-            orig = k
-        if i % 2 == 1:
-            trans = k
-            ch.add_row(a=orig, b=trans)
-    create_report_file_name = f"{os.path.basename(fp)}.trans.html"
-    html_file = ch.save_file(create_report_file_name)
-    generated_conclusion_files.append(html_file)
-    promote_file_to_downloadzone(html_file, rename_file=os.path.basename(html_file), chatbot=chatbot)
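The `token_limit_smooth` trick in `break_down` above avoids a lopsided final fragment: instead of cutting at the hard cap and leaving a short remainder, it divides the section into `count` near-equal pieces (the `+ count` term adds a little slack so rounding never produces an extra fragment). A worked example:

```python
# Worked example of the even-separation formula used by break_down() above.
import math

raw_token_num, TOKEN_LIMIT_PER_FRAGMENT = 6000, 2500
count = int(math.ceil(raw_token_num / TOKEN_LIMIT_PER_FRAGMENT))  # ceil(2.4) = 3 fragments
token_limit_smooth = raw_token_num // count + count               # 2000 + 3 = 2003
print(count, token_limit_smooth)  # three ~2000-token pieces instead of 2500 + 2500 + 1000
```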
diff --git a/crazy_functions/pdf_fns/report_gen_html.py b/crazy_functions/pdf_fns/report_gen_html.py
deleted file mode 100644
index 21829212ff13a2dfd1492f05ac9abc73907dce7b..0000000000000000000000000000000000000000
--- a/crazy_functions/pdf_fns/report_gen_html.py
+++ /dev/null
@@ -1,58 +0,0 @@
-from toolbox import update_ui, get_conf, trimmed_format_exc, get_log_folder
-import os
-
-
-class construct_html():
-    def __init__(self) -> None:
-        self.html_string = ""
-
-    def add_row(self, a, b):
-        from toolbox import markdown_convertion
-        template = """
-            {
-                primary_col: {
-                    header: String.raw`__PRIMARY_HEADER__`,
-                    msg: String.raw`__PRIMARY_MSG__`,
-                },
-                secondary_rol: {
-                    header: String.raw`__SECONDARY_HEADER__`,
-                    msg: String.raw`__SECONDARY_MSG__`,
-                }
-            },
-        """
-        def std(str):
-            str = str.replace(r'`', r'&#96;')  # escape backticks so they survive inside String.raw`...`
-            if str.endswith("\\"): str += ' '
-            if str.endswith("}"): str += ' '
-            if str.endswith("$"): str += ' '
-            return str
-
-        template_ = template
-        a_lines = a.split('\n')
-        b_lines = b.split('\n')
-
-        if len(a_lines) == 1 or len(a_lines[0]) > 50:
-            template_ = template_.replace("__PRIMARY_HEADER__", std(a[:20]))
-            template_ = template_.replace("__PRIMARY_MSG__", std(markdown_convertion(a)))
-        else:
-            template_ = template_.replace("__PRIMARY_HEADER__", std(a_lines[0]))
-            template_ = template_.replace("__PRIMARY_MSG__", std(markdown_convertion('\n'.join(a_lines[1:]))))
-
-        if len(b_lines) == 1 or len(b_lines[0]) > 50:
-            template_ = template_.replace("__SECONDARY_HEADER__", std(b[:20]))
-            template_ = template_.replace("__SECONDARY_MSG__", std(markdown_convertion(b)))
-        else:
-            template_ = template_.replace("__SECONDARY_HEADER__", std(b_lines[0]))
-            template_ = template_.replace("__SECONDARY_MSG__", std(markdown_convertion('\n'.join(b_lines[1:]))))
-        self.html_string += template_
-
-    def save_file(self, file_name):
-        from toolbox import get_log_folder
-        with open('crazy_functions/pdf_fns/report_template.html', 'r', encoding='utf8') as f:
-            html_template = f.read()
-        html_template = html_template.replace("__TF_ARR__", self.html_string)
-        with open(os.path.join(get_log_folder(), file_name), 'w', encoding='utf8') as f:
-            f.write(html_template.encode('utf-8', 'ignore').decode())
-        return os.path.join(get_log_folder(), file_name)
diff --git a/crazy_functions/pdf_fns/report_template.html b/crazy_functions/pdf_fns/report_template.html
deleted file mode 100644
index 39a1e7ce482949978ff90c4738a9adb8803660e6..0000000000000000000000000000000000000000
--- a/crazy_functions/pdf_fns/report_template.html
+++ /dev/null
@@ -1,104 +0,0 @@
-[The 104-line HTML template was reduced to bare text during extraction; of its markup, only the __TITLE__ placeholder, the "文章目录" (table of contents) label, and the __TF_ARR__ substitution point used by save_file() above remain recoverable.]
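`construct_html` accumulates one JavaScript object literal per `add_row` call and splices the whole list into the template at `__TF_ARR__`. A hedged usage sketch mirroring how `translate_pdf` drives it; it assumes the repository's `toolbox` helpers and the (here deleted) report_template.html are present, as they were before this commit:

```python
# Usage sketch for construct_html: one bilingual row, then render to disk.
ch = construct_html()
ch.add_row(a="# Introduction\n\nOriginal English text of the section...",
           b="# Introduction\n\n翻译后的中文文本……")
html_path = ch.save_file("demo.trans.html")  # written under the log folder
print(html_path)
```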
- - - diff --git a/crazy_functions/test_project/cpp/cppipc/buffer.cpp b/crazy_functions/test_project/cpp/cppipc/buffer.cpp deleted file mode 100644 index 084b8153e9401f4e9dc5a6a67cfb5f48b0183ccb..0000000000000000000000000000000000000000 --- a/crazy_functions/test_project/cpp/cppipc/buffer.cpp +++ /dev/null @@ -1,87 +0,0 @@ -#include "libipc/buffer.h" -#include "libipc/utility/pimpl.h" - -#include - -namespace ipc { - -bool operator==(buffer const & b1, buffer const & b2) { - return (b1.size() == b2.size()) && (std::memcmp(b1.data(), b2.data(), b1.size()) == 0); -} - -bool operator!=(buffer const & b1, buffer const & b2) { - return !(b1 == b2); -} - -class buffer::buffer_ : public pimpl { -public: - void* p_; - std::size_t s_; - void* a_; - buffer::destructor_t d_; - - buffer_(void* p, std::size_t s, buffer::destructor_t d, void* a) - : p_(p), s_(s), a_(a), d_(d) { - } - - ~buffer_() { - if (d_ == nullptr) return; - d_((a_ == nullptr) ? p_ : a_, s_); - } -}; - -buffer::buffer() - : buffer(nullptr, 0, nullptr, nullptr) { -} - -buffer::buffer(void* p, std::size_t s, destructor_t d) - : p_(p_->make(p, s, d, nullptr)) { -} - -buffer::buffer(void* p, std::size_t s, destructor_t d, void* additional) - : p_(p_->make(p, s, d, additional)) { -} - -buffer::buffer(void* p, std::size_t s) - : buffer(p, s, nullptr) { -} - -buffer::buffer(char const & c) - : buffer(const_cast(&c), 1) { -} - -buffer::buffer(buffer&& rhs) - : buffer() { - swap(rhs); -} - -buffer::~buffer() { - p_->clear(); -} - -void buffer::swap(buffer& rhs) { - std::swap(p_, rhs.p_); -} - -buffer& buffer::operator=(buffer rhs) { - swap(rhs); - return *this; -} - -bool buffer::empty() const noexcept { - return (impl(p_)->p_ == nullptr) || (impl(p_)->s_ == 0); -} - -void* buffer::data() noexcept { - return impl(p_)->p_; -} - -void const * buffer::data() const noexcept { - return impl(p_)->p_; -} - -std::size_t buffer::size() const noexcept { - return impl(p_)->s_; -} - -} // namespace ipc diff --git a/crazy_functions/test_project/cpp/cppipc/ipc.cpp b/crazy_functions/test_project/cpp/cppipc/ipc.cpp deleted file mode 100644 index 4dc71c071c524906205cc4e2eae9ca8bac8b2d2c..0000000000000000000000000000000000000000 --- a/crazy_functions/test_project/cpp/cppipc/ipc.cpp +++ /dev/null @@ -1,701 +0,0 @@ - -#include -#include -#include -#include // std::pair, std::move, std::forward -#include -#include // aligned_storage_t -#include -#include -#include -#include - -#include "libipc/ipc.h" -#include "libipc/def.h" -#include "libipc/shm.h" -#include "libipc/pool_alloc.h" -#include "libipc/queue.h" -#include "libipc/policy.h" -#include "libipc/rw_lock.h" -#include "libipc/waiter.h" - -#include "libipc/utility/log.h" -#include "libipc/utility/id_pool.h" -#include "libipc/utility/scope_guard.h" -#include "libipc/utility/utility.h" - -#include "libipc/memory/resource.h" -#include "libipc/platform/detail.h" -#include "libipc/circ/elem_array.h" - -namespace { - -using msg_id_t = std::uint32_t; -using acc_t = std::atomic; - -template -struct msg_t; - -template -struct msg_t<0, AlignSize> { - msg_id_t cc_id_; - msg_id_t id_; - std::int32_t remain_; - bool storage_; -}; - -template -struct msg_t : msg_t<0, AlignSize> { - std::aligned_storage_t data_ {}; - - msg_t() = default; - msg_t(msg_id_t cc_id, msg_id_t id, std::int32_t remain, void const * data, std::size_t size) - : msg_t<0, AlignSize> {cc_id, id, remain, (data == nullptr) || (size == 0)} { - if (this->storage_) { - if (data != nullptr) { - // copy storage-id - *reinterpret_cast(&data_) = - 
*static_cast(data); - } - } - else std::memcpy(&data_, data, size); - } -}; - -template -ipc::buff_t make_cache(T& data, std::size_t size) { - auto ptr = ipc::mem::alloc(size); - std::memcpy(ptr, &data, (ipc::detail::min)(sizeof(data), size)); - return { ptr, size, ipc::mem::free }; -} - -struct cache_t { - std::size_t fill_; - ipc::buff_t buff_; - - cache_t(std::size_t f, ipc::buff_t && b) - : fill_(f), buff_(std::move(b)) - {} - - void append(void const * data, std::size_t size) { - if (fill_ >= buff_.size() || data == nullptr || size == 0) return; - auto new_fill = (ipc::detail::min)(fill_ + size, buff_.size()); - std::memcpy(static_cast(buff_.data()) + fill_, data, new_fill - fill_); - fill_ = new_fill; - } -}; - -auto cc_acc() { - static ipc::shm::handle acc_h("__CA_CONN__", sizeof(acc_t)); - return static_cast(acc_h.get()); -} - -IPC_CONSTEXPR_ std::size_t align_chunk_size(std::size_t size) noexcept { - return (((size - 1) / ipc::large_msg_align) + 1) * ipc::large_msg_align; -} - -IPC_CONSTEXPR_ std::size_t calc_chunk_size(std::size_t size) noexcept { - return ipc::make_align(alignof(std::max_align_t), align_chunk_size( - ipc::make_align(alignof(std::max_align_t), sizeof(std::atomic)) + size)); -} - -struct chunk_t { - std::atomic &conns() noexcept { - return *reinterpret_cast *>(this); - } - - void *data() noexcept { - return reinterpret_cast(this) - + ipc::make_align(alignof(std::max_align_t), sizeof(std::atomic)); - } -}; - -struct chunk_info_t { - ipc::id_pool<> pool_; - ipc::spin_lock lock_; - - IPC_CONSTEXPR_ static std::size_t chunks_mem_size(std::size_t chunk_size) noexcept { - return ipc::id_pool<>::max_count * chunk_size; - } - - ipc::byte_t *chunks_mem() noexcept { - return reinterpret_cast(this + 1); - } - - chunk_t *at(std::size_t chunk_size, ipc::storage_id_t id) noexcept { - if (id < 0) return nullptr; - return reinterpret_cast(chunks_mem() + (chunk_size * id)); - } -}; - -auto& chunk_storages() { - class chunk_handle_t { - ipc::shm::handle handle_; - - public: - chunk_info_t *get_info(std::size_t chunk_size) { - if (!handle_.valid() && - !handle_.acquire( ("__CHUNK_INFO__" + ipc::to_string(chunk_size)).c_str(), - sizeof(chunk_info_t) + chunk_info_t::chunks_mem_size(chunk_size) )) { - ipc::error("[chunk_storages] chunk_shm.id_info_.acquire failed: chunk_size = %zd\n", chunk_size); - return nullptr; - } - auto info = static_cast(handle_.get()); - if (info == nullptr) { - ipc::error("[chunk_storages] chunk_shm.id_info_.get failed: chunk_size = %zd\n", chunk_size); - return nullptr; - } - return info; - } - }; - static ipc::map chunk_hs; - return chunk_hs; -} - -chunk_info_t *chunk_storage_info(std::size_t chunk_size) { - auto &storages = chunk_storages(); - std::decay_t::iterator it; - { - static ipc::rw_lock lock; - IPC_UNUSED_ std::shared_lock guard {lock}; - if ((it = storages.find(chunk_size)) == storages.end()) { - using chunk_handle_t = std::decay_t::value_type::second_type; - guard.unlock(); - IPC_UNUSED_ std::lock_guard guard {lock}; - it = storages.emplace(chunk_size, chunk_handle_t{}).first; - } - } - return it->second.get_info(chunk_size); -} - -std::pair acquire_storage(std::size_t size, ipc::circ::cc_t conns) { - std::size_t chunk_size = calc_chunk_size(size); - auto info = chunk_storage_info(chunk_size); - if (info == nullptr) return {}; - - info->lock_.lock(); - info->pool_.prepare(); - // got an unique id - auto id = info->pool_.acquire(); - info->lock_.unlock(); - - auto chunk = info->at(chunk_size, id); - if (chunk == nullptr) return {}; - 
chunk->conns().store(conns, std::memory_order_relaxed); - return { id, chunk->data() }; -} - -void *find_storage(ipc::storage_id_t id, std::size_t size) { - if (id < 0) { - ipc::error("[find_storage] id is invalid: id = %ld, size = %zd\n", (long)id, size); - return nullptr; - } - std::size_t chunk_size = calc_chunk_size(size); - auto info = chunk_storage_info(chunk_size); - if (info == nullptr) return nullptr; - return info->at(chunk_size, id)->data(); -} - -void release_storage(ipc::storage_id_t id, std::size_t size) { - if (id < 0) { - ipc::error("[release_storage] id is invalid: id = %ld, size = %zd\n", (long)id, size); - return; - } - std::size_t chunk_size = calc_chunk_size(size); - auto info = chunk_storage_info(chunk_size); - if (info == nullptr) return; - info->lock_.lock(); - info->pool_.release(id); - info->lock_.unlock(); -} - -template -bool sub_rc(ipc::wr, - std::atomic &/*conns*/, ipc::circ::cc_t /*curr_conns*/, ipc::circ::cc_t /*conn_id*/) noexcept { - return true; -} - -template -bool sub_rc(ipc::wr, - std::atomic &conns, ipc::circ::cc_t curr_conns, ipc::circ::cc_t conn_id) noexcept { - auto last_conns = curr_conns & ~conn_id; - for (unsigned k = 0;;) { - auto chunk_conns = conns.load(std::memory_order_acquire); - if (conns.compare_exchange_weak(chunk_conns, chunk_conns & last_conns, std::memory_order_release)) { - return (chunk_conns & last_conns) == 0; - } - ipc::yield(k); - } -} - -template -void recycle_storage(ipc::storage_id_t id, std::size_t size, ipc::circ::cc_t curr_conns, ipc::circ::cc_t conn_id) { - if (id < 0) { - ipc::error("[recycle_storage] id is invalid: id = %ld, size = %zd\n", (long)id, size); - return; - } - std::size_t chunk_size = calc_chunk_size(size); - auto info = chunk_storage_info(chunk_size); - if (info == nullptr) return; - - auto chunk = info->at(chunk_size, id); - if (chunk == nullptr) return; - - if (!sub_rc(Flag{}, chunk->conns(), curr_conns, conn_id)) { - return; - } - info->lock_.lock(); - info->pool_.release(id); - info->lock_.unlock(); -} - -template -bool clear_message(void* p) { - auto msg = static_cast(p); - if (msg->storage_) { - std::int32_t r_size = static_cast(ipc::data_length) + msg->remain_; - if (r_size <= 0) { - ipc::error("[clear_message] invalid msg size: %d\n", (int)r_size); - return true; - } - release_storage( - *reinterpret_cast(&msg->data_), - static_cast(r_size)); - } - return true; -} - -struct conn_info_head { - - ipc::string name_; - msg_id_t cc_id_; // connection-info id - ipc::detail::waiter cc_waiter_, wt_waiter_, rd_waiter_; - ipc::shm::handle acc_h_; - - conn_info_head(char const * name) - : name_ {name} - , cc_id_ {(cc_acc() == nullptr) ? 
0 : cc_acc()->fetch_add(1, std::memory_order_relaxed)} - , cc_waiter_{("__CC_CONN__" + name_).c_str()} - , wt_waiter_{("__WT_CONN__" + name_).c_str()} - , rd_waiter_{("__RD_CONN__" + name_).c_str()} - , acc_h_ {("__AC_CONN__" + name_).c_str(), sizeof(acc_t)} { - } - - void quit_waiting() { - cc_waiter_.quit_waiting(); - wt_waiter_.quit_waiting(); - rd_waiter_.quit_waiting(); - } - - auto acc() { - return static_cast(acc_h_.get()); - } - - auto& recv_cache() { - thread_local ipc::unordered_map tls; - return tls; - } -}; - -template -bool wait_for(W& waiter, F&& pred, std::uint64_t tm) { - if (tm == 0) return !pred(); - for (unsigned k = 0; pred();) { - bool ret = true; - ipc::sleep(k, [&k, &ret, &waiter, &pred, tm] { - ret = waiter.wait_if(std::forward(pred), tm); - k = 0; - }); - if (!ret) return false; // timeout or fail - if (k == 0) break; // k has been reset - } - return true; -} - -template -struct queue_generator { - - using queue_t = ipc::queue, Policy>; - - struct conn_info_t : conn_info_head { - queue_t que_; - - conn_info_t(char const * name) - : conn_info_head{name} - , que_{("__QU_CONN__" + - ipc::to_string(DataSize) + "__" + - ipc::to_string(AlignSize) + "__" + name).c_str()} { - } - - void disconnect_receiver() { - bool dis = que_.disconnect(); - this->quit_waiting(); - if (dis) { - this->recv_cache().clear(); - } - } - }; -}; - -template -struct detail_impl { - -using policy_t = Policy; -using flag_t = typename policy_t::flag_t; -using queue_t = typename queue_generator::queue_t; -using conn_info_t = typename queue_generator::conn_info_t; - -constexpr static conn_info_t* info_of(ipc::handle_t h) noexcept { - return static_cast(h); -} - -constexpr static queue_t* queue_of(ipc::handle_t h) noexcept { - return (info_of(h) == nullptr) ? nullptr : &(info_of(h)->que_); -} - -/* API implementations */ - -static void disconnect(ipc::handle_t h) { - auto que = queue_of(h); - if (que == nullptr) { - return; - } - que->shut_sending(); - assert(info_of(h) != nullptr); - info_of(h)->disconnect_receiver(); -} - -static bool reconnect(ipc::handle_t * ph, bool start_to_recv) { - assert(ph != nullptr); - assert(*ph != nullptr); - auto que = queue_of(*ph); - if (que == nullptr) { - return false; - } - if (start_to_recv) { - que->shut_sending(); - if (que->connect()) { // wouldn't connect twice - info_of(*ph)->cc_waiter_.broadcast(); - return true; - } - return false; - } - // start_to_recv == false - if (que->connected()) { - info_of(*ph)->disconnect_receiver(); - } - return que->ready_sending(); -} - -static bool connect(ipc::handle_t * ph, char const * name, bool start_to_recv) { - assert(ph != nullptr); - if (*ph == nullptr) { - *ph = ipc::mem::alloc(name); - } - return reconnect(ph, start_to_recv); -} - -static void destroy(ipc::handle_t h) { - disconnect(h); - ipc::mem::free(info_of(h)); -} - -static std::size_t recv_count(ipc::handle_t h) noexcept { - auto que = queue_of(h); - if (que == nullptr) { - return ipc::invalid_value; - } - return que->conn_count(); -} - -static bool wait_for_recv(ipc::handle_t h, std::size_t r_count, std::uint64_t tm) { - auto que = queue_of(h); - if (que == nullptr) { - return false; - } - return wait_for(info_of(h)->cc_waiter_, [que, r_count] { - return que->conn_count() < r_count; - }, tm); -} - -template -static bool send(F&& gen_push, ipc::handle_t h, void const * data, std::size_t size) { - if (data == nullptr || size == 0) { - ipc::error("fail: send(%p, %zd)\n", data, size); - return false; - } - auto que = queue_of(h); - if (que == nullptr) { - 
ipc::error("fail: send, queue_of(h) == nullptr\n"); - return false; - } - if (que->elems() == nullptr) { - ipc::error("fail: send, queue_of(h)->elems() == nullptr\n"); - return false; - } - if (!que->ready_sending()) { - ipc::error("fail: send, que->ready_sending() == false\n"); - return false; - } - ipc::circ::cc_t conns = que->elems()->connections(std::memory_order_relaxed); - if (conns == 0) { - ipc::error("fail: send, there is no receiver on this connection.\n"); - return false; - } - // calc a new message id - auto acc = info_of(h)->acc(); - if (acc == nullptr) { - ipc::error("fail: send, info_of(h)->acc() == nullptr\n"); - return false; - } - auto msg_id = acc->fetch_add(1, std::memory_order_relaxed); - auto try_push = std::forward(gen_push)(info_of(h), que, msg_id); - if (size > ipc::large_msg_limit) { - auto dat = acquire_storage(size, conns); - void * buf = dat.second; - if (buf != nullptr) { - std::memcpy(buf, data, size); - return try_push(static_cast(size) - - static_cast(ipc::data_length), &(dat.first), 0); - } - // try using message fragment - //ipc::log("fail: shm::handle for big message. msg_id: %zd, size: %zd\n", msg_id, size); - } - // push message fragment - std::int32_t offset = 0; - for (std::int32_t i = 0; i < static_cast(size / ipc::data_length); ++i, offset += ipc::data_length) { - if (!try_push(static_cast(size) - offset - static_cast(ipc::data_length), - static_cast(data) + offset, ipc::data_length)) { - return false; - } - } - // if remain > 0, this is the last message fragment - std::int32_t remain = static_cast(size) - offset; - if (remain > 0) { - if (!try_push(remain - static_cast(ipc::data_length), - static_cast(data) + offset, - static_cast(remain))) { - return false; - } - } - return true; -} - -static bool send(ipc::handle_t h, void const * data, std::size_t size, std::uint64_t tm) { - return send([tm](auto info, auto que, auto msg_id) { - return [tm, info, que, msg_id](std::int32_t remain, void const * data, std::size_t size) { - if (!wait_for(info->wt_waiter_, [&] { - return !que->push( - [](void*) { return true; }, - info->cc_id_, msg_id, remain, data, size); - }, tm)) { - ipc::log("force_push: msg_id = %zd, remain = %d, size = %zd\n", msg_id, remain, size); - if (!que->force_push( - clear_message, - info->cc_id_, msg_id, remain, data, size)) { - return false; - } - } - info->rd_waiter_.broadcast(); - return true; - }; - }, h, data, size); -} - -static bool try_send(ipc::handle_t h, void const * data, std::size_t size, std::uint64_t tm) { - return send([tm](auto info, auto que, auto msg_id) { - return [tm, info, que, msg_id](std::int32_t remain, void const * data, std::size_t size) { - if (!wait_for(info->wt_waiter_, [&] { - return !que->push( - [](void*) { return true; }, - info->cc_id_, msg_id, remain, data, size); - }, tm)) { - return false; - } - info->rd_waiter_.broadcast(); - return true; - }; - }, h, data, size); -} - -static ipc::buff_t recv(ipc::handle_t h, std::uint64_t tm) { - auto que = queue_of(h); - if (que == nullptr) { - ipc::error("fail: recv, queue_of(h) == nullptr\n"); - return {}; - } - if (!que->connected()) { - // hasn't connected yet, just return. - return {}; - } - auto& rc = info_of(h)->recv_cache(); - for (;;) { - // pop a new message - typename queue_t::value_t msg; - if (!wait_for(info_of(h)->rd_waiter_, [que, &msg] { - return !que->pop(msg); - }, tm)) { - // pop failed, just return. 
- return {}; - } - info_of(h)->wt_waiter_.broadcast(); - if ((info_of(h)->acc() != nullptr) && (msg.cc_id_ == info_of(h)->cc_id_)) { - continue; // ignore message to self - } - // msg.remain_ may minus & abs(msg.remain_) < data_length - std::int32_t r_size = static_cast(ipc::data_length) + msg.remain_; - if (r_size <= 0) { - ipc::error("fail: recv, r_size = %d\n", (int)r_size); - return {}; - } - std::size_t msg_size = static_cast(r_size); - // large message - if (msg.storage_) { - ipc::storage_id_t buf_id = *reinterpret_cast(&msg.data_); - void* buf = find_storage(buf_id, msg_size); - if (buf != nullptr) { - struct recycle_t { - ipc::storage_id_t storage_id; - ipc::circ::cc_t curr_conns; - ipc::circ::cc_t conn_id; - } *r_info = ipc::mem::alloc(recycle_t{ - buf_id, que->elems()->connections(std::memory_order_relaxed), que->connected_id() - }); - if (r_info == nullptr) { - ipc::log("fail: ipc::mem::alloc.\n"); - return ipc::buff_t{buf, msg_size}; // no recycle - } else { - return ipc::buff_t{buf, msg_size, [](void* p_info, std::size_t size) { - auto r_info = static_cast(p_info); - IPC_UNUSED_ auto finally = ipc::guard([r_info] { - ipc::mem::free(r_info); - }); - recycle_storage(r_info->storage_id, size, r_info->curr_conns, r_info->conn_id); - }, r_info}; - } - } else { - ipc::log("fail: shm::handle for large message. msg_id: %zd, buf_id: %zd, size: %zd\n", msg.id_, buf_id, msg_size); - continue; - } - } - // find cache with msg.id_ - auto cac_it = rc.find(msg.id_); - if (cac_it == rc.end()) { - if (msg_size <= ipc::data_length) { - return make_cache(msg.data_, msg_size); - } - // gc - if (rc.size() > 1024) { - std::vector need_del; - for (auto const & pair : rc) { - auto cmp = std::minmax(msg.id_, pair.first); - if (cmp.second - cmp.first > 8192) { - need_del.push_back(pair.first); - } - } - for (auto id : need_del) rc.erase(id); - } - // cache the first message fragment - rc.emplace(msg.id_, cache_t { ipc::data_length, make_cache(msg.data_, msg_size) }); - } - // has cached before this message - else { - auto& cac = cac_it->second; - // this is the last message fragment - if (msg.remain_ <= 0) { - cac.append(&(msg.data_), msg_size); - // finish this message, erase it from cache - auto buff = std::move(cac.buff_); - rc.erase(cac_it); - return buff; - } - // there are remain datas after this message - cac.append(&(msg.data_), ipc::data_length); - } - } -} - -static ipc::buff_t try_recv(ipc::handle_t h) { - return recv(h, 0); -} - -}; // detail_impl - -template -using policy_t = ipc::policy::choose; - -} // internal-linkage - -namespace ipc { - -template -ipc::handle_t chan_impl::inited() { - ipc::detail::waiter::init(); - return nullptr; -} - -template -bool chan_impl::connect(ipc::handle_t * ph, char const * name, unsigned mode) { - return detail_impl>::connect(ph, name, mode & receiver); -} - -template -bool chan_impl::reconnect(ipc::handle_t * ph, unsigned mode) { - return detail_impl>::reconnect(ph, mode & receiver); -} - -template -void chan_impl::disconnect(ipc::handle_t h) { - detail_impl>::disconnect(h); -} - -template -void chan_impl::destroy(ipc::handle_t h) { - detail_impl>::destroy(h); -} - -template -char const * chan_impl::name(ipc::handle_t h) { - auto info = detail_impl>::info_of(h); - return (info == nullptr) ? 
nullptr : info->name_.c_str(); -} - -template -std::size_t chan_impl::recv_count(ipc::handle_t h) { - return detail_impl>::recv_count(h); -} - -template -bool chan_impl::wait_for_recv(ipc::handle_t h, std::size_t r_count, std::uint64_t tm) { - return detail_impl>::wait_for_recv(h, r_count, tm); -} - -template -bool chan_impl::send(ipc::handle_t h, void const * data, std::size_t size, std::uint64_t tm) { - return detail_impl>::send(h, data, size, tm); -} - -template -buff_t chan_impl::recv(ipc::handle_t h, std::uint64_t tm) { - return detail_impl>::recv(h, tm); -} - -template -bool chan_impl::try_send(ipc::handle_t h, void const * data, std::size_t size, std::uint64_t tm) { - return detail_impl>::try_send(h, data, size, tm); -} - -template -buff_t chan_impl::try_recv(ipc::handle_t h) { - return detail_impl>::try_recv(h); -} - -template struct chan_impl>; -// template struct chan_impl>; // TBD -// template struct chan_impl>; // TBD -template struct chan_impl>; -template struct chan_impl>; - -} // namespace ipc diff --git a/crazy_functions/test_project/cpp/cppipc/policy.h b/crazy_functions/test_project/cpp/cppipc/policy.h deleted file mode 100644 index 89596079e2cbb3ffa4ce68264a9b67a4c0f363b5..0000000000000000000000000000000000000000 --- a/crazy_functions/test_project/cpp/cppipc/policy.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include - -#include "libipc/def.h" -#include "libipc/prod_cons.h" - -#include "libipc/circ/elem_array.h" - -namespace ipc { -namespace policy { - -template