| | import json |
| | import os |
| | import shutil |
| | import time |
| | import zipfile |
| |
|
| | import requests |
| | import streamlit as st |
| | from langchain_core.messages import SystemMessage |
| | from langchain_core.prompts import ChatPromptTemplate |
| | from langchain_openai import ChatOpenAI |
| | from langchain_text_splitters import MarkdownHeaderTextSplitter |
| | from loguru import logger |
| |
|
| | from ui.Component import side_bar_links |
| | from utils.Doc2x import pre_upload, put_file, get_status, get_md |
| |
|
# Page-level settings: browser tab title/icon and the wide layout.
st.set_page_config(page_title='工具箱', page_icon='🔨', layout='wide')

# Render the shared link component from ui.Component inside the sidebar.
with st.sidebar:
    side_bar_links()
| |
|
| |
|
def ac_translate(original_text: str):
    """Stream an English-to-Chinese translation of a Markdown fragment.

    Builds a two-message chat prompt (a fixed system instruction plus a human
    template that receives ``original_text``), pipes it into a streaming
    ``ChatOpenAI`` client configured for the ``glm-4-flash`` model, and
    returns whatever ``chain.stream(...)`` yields.

    Args:
        original_text: The Markdown text to translate; substituted into the
            ``{original_text}`` placeholder of the human message.

    Returns:
        The streaming result of the prompt | model chain.
    """
    # Fixed system instruction. It is wrapped in SystemMessage (not a template
    # tuple), so its content is passed through without placeholder expansion.
    system_text = """你是一个能够高效准确翻译学术论文的助手。你的任务是将用户提供的学术论文从英文翻译成中文,并保留原始的Markdown格式完全不变。为了确保输出结果正确,请注意以下规则:

1. **保留Markdown结构**:包括标题、列表、表格、代码块、引用、脚注等,确保格式一致。
2. **翻译正文内容**:将正文内容翻译成流畅自然的中文,但保留学术术语、专有名词或引用标记(如`[引用]`、`<term>`),除非用户另有要求。
3. **避免错误**:
- 确保翻译结果中仅保留与原始Markdown相同的标题结构(如`#`等符号的使用)。
- 不要误将普通段落转换为标题。
4. **输出为纯Markdown文本**:不要添加额外的Markdown代码块标记(如`````markdown````或类似结构),直接返回翻译后的Markdown内容。

在完成任务时,请专注于准确性和格式一致性。如果有任何不确定的内容,请保持原文不变。
"""

    # Human-turn template; {original_text} is filled in at stream time.
    human_template = """请将以下Markdown格式的学术论文从英文翻译成中文,并严格按照以下要求处理:

1. 保留原始Markdown结构,包括标题、列表、表格、代码块等。
2. 翻译正文内容为流畅自然的中文,但保留特定术语或标记(如`[引用]`、`<term>`)不被翻译。
3. 确保段落与标题的区分准确,避免错误地将正文内容标记为标题。
4. 直接返回翻译后的Markdown文本,不要额外包裹在代码块标记中。

以下是需要翻译的内容:

{original_text}
"""

    translation_prompt = ChatPromptTemplate.from_messages(
        [
            SystemMessage(content=system_text),
            ("human", human_template),
        ]
    )

    # OpenAI-compatible client pointed at the bigmodel endpoint; the API key
    # is read from the Streamlit secrets entry named 'gml_key'.
    chat_model = ChatOpenAI(
        model_name="glm-4-flash",
        openai_api_base='https://open.bigmodel.cn/api/paas/v4/',
        temperature=0.5,
        openai_api_key=st.secrets['gml_key'],
        streaming=True
    )

    translation_chain = translation_prompt | chat_model
    return translation_chain.stream({"original_text": original_text})
| |
|
| |
|
# Seconds to wait between two consecutive Doc2x status polls.
_POLL_INTERVAL_SECONDS = 3

# Upper bound (seconds) for downloading the exported archive.
_DOWNLOAD_TIMEOUT_SECONDS = 60


def _poll_until_done(fetch_status, on_progress=None):
    """Poll ``fetch_status()`` until it reports success and return that payload.

    Args:
        fetch_status: Zero-argument callable returning a dict with a
            ``"status"`` key (``"success"``, ``"failed"`` or ``"processing"``).
        on_progress: Optional callable invoked with ``payload["progress"]``
            each time the status is ``"processing"``.

    Returns:
        The payload dict whose status is ``"success"``.

    Raises:
        Exception: When the status is ``"failed"``; the message carries the
            payload's ``"detail"`` value.
    """
    while True:
        status_data = fetch_status()
        status = status_data["status"]
        if status == "success":
            return status_data
        if status == "failed":
            raise Exception(f"parse failed: {status_data['detail']}")
        if status == "processing" and on_progress is not None:
            on_progress(status_data["progress"])
        # Sleep on every non-terminal status so an unexpected status value
        # cannot turn this loop into a busy spin against the API.
        time.sleep(_POLL_INTERVAL_SECONDS)


def _split_front_matter(lines):
    """Split Markdown lines into ``(front_matter_lines, body_lines)``.

    Front matter is recognised only when the first line is ``---`` and a
    closing ``---`` line follows; both delimiters are included in the first
    list. Otherwise the front matter is empty and every line is body.
    """
    if not lines or lines[0] != '---':
        return [], lines
    try:
        closing_index = lines.index('---', 1)
    except ValueError:
        # Unterminated front matter: treat the whole document as body.
        return [], lines
    return lines[:closing_index + 1], lines[closing_index + 1:]


pdf_file = st.file_uploader(
    '选择PDF文件',
    type=['pdf'],
)
col1, col2, _ = st.columns([1, 1, 5])
translate_container = st.container(height=550, border=True)
if upload_btn := col1.button('翻译', disabled=pdf_file is None):
    doc2x_key = st.secrets['doc2x']
    os.makedirs('tmp/translate', exist_ok=True)

    # The upload name is client-supplied; keep only its final component so it
    # cannot point outside the scratch directory.
    pdf_path = os.path.join('tmp', os.path.basename(pdf_file.name))
    with open(pdf_path, 'wb') as f:
        f.write(pdf_file.getbuffer())

    with st.status('pre upload...'):
        upload_data = pre_upload(doc2x_key)
        url = upload_data["url"]
        uid = upload_data["uid"]
        put_file(pdf_path, url)

    # Wait for parsing to finish, mirroring the reported progress in the UI.
    progress_bar = st.progress(0, '解析中...')
    parse_data = _poll_until_done(
        lambda: get_status(uid, doc2x_key),
        on_progress=lambda value: progress_bar.progress(value, '解析中...'),
    )
    with open("result.json", "w") as f:
        json.dump(parse_data["result"], f)
    progress_bar.progress(100, '解析完毕')

    # The uid used for the export is the one returned by pre_upload above, so
    # the exported Markdown belongs to the file the user just uploaded.
    logger.info(uid)

    with st.status('导出为markdown'):
        # First call (flag True) precedes polling; later calls (flag False)
        # are polled for the export status.
        get_md(uid, doc2x_key, True)
        export_data = _poll_until_done(lambda: get_md(uid, doc2x_key, False))

        archive_response = requests.get(
            export_data["url"], timeout=_DOWNLOAD_TIMEOUT_SECONDS
        )
        # Fail loudly instead of saving an HTTP error body as the zip file.
        archive_response.raise_for_status()
        with open('tmp/downloaded_file.zip', 'wb') as f:
            f.write(archive_response.content)

    with zipfile.ZipFile('tmp/downloaded_file.zip', 'r') as zip_ref:
        zip_ref.extractall('tmp/translate')

    with open('tmp/translate/origin.md', 'r', encoding='utf-8') as md_file:
        md_docs = md_file.read().splitlines()

    with open('tmp/translate/translated.md', 'w', encoding='utf-8') as f:
        # Front matter is copied verbatim, one line per line (including the
        # opening delimiter, which needs its own newline).
        front_matter, body_lines = _split_front_matter(md_docs)
        f.writelines(f"{line}\n" for line in front_matter)

        for doc in body_lines:
            # Blank lines, horizontal rules and image lines are passed
            # through without translation.
            if not doc or doc == '---' or doc.startswith("!["):
                f.write(f"{doc}\n")
                continue

            token_stream = ac_translate(doc)
            translate_result = translate_container.write_stream(token_stream)
            if doc.startswith('#'):
                f.write(f"{translate_result}\n")
            else:
                # Source line was not a heading: strip any leading '#' the
                # model may have added.
                f.write(f"{translate_result.lstrip('#')}\n \n")

    # Bundle the extracted export plus translated.md into ./translate.zip,
    # then drop the scratch directory.
    shutil.make_archive(
        'translate',
        'zip',
        'tmp/translate',
        './'
    )
    shutil.rmtree('tmp')
| |
|
# Offer the archive in the working directory for download when it exists.
archive_path = 'translate.zip'
if os.path.exists(archive_path):
    with open(archive_path, 'rb') as archive_file:
        col2.download_button(
            label="下载",
            data=archive_file,
            type="primary",
            file_name='downloaded_file.zip',
            mime="application/octet-stream",
        )
| |
|