JohnnyEudora committed on
Commit
08f57fb
·
1 Parent(s): 47d9a92
Files changed (2) hide show
  1. app.py +193 -0
  2. requirements.txt +3 -1
app.py CHANGED
@@ -1141,6 +1141,20 @@ def create_gradio_app():
1141
  inputs=[srt_quantity, srt_layout_selector],
1142
  outputs=srt_unit_prices + srt_total_prices + [srt_service_choice_radio, srt_grand_total_display]
1143
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1144
 
1145
  return demo
1146
 
@@ -1785,6 +1799,185 @@ def project_handle_form_submission(
1785
  # 调用核心函数生成PDF,并指定使用新的模板
1786
  generated_file = generate_quote_pdf(quote_data, output_path, "project.tex")
1787
  return generated_file
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1788
  # ==============================================================================
1789
  # 运行应用
1790
  # ==============================================================================
 
1141
  inputs=[srt_quantity, srt_layout_selector],
1142
  outputs=srt_unit_prices + srt_total_prices + [srt_service_choice_radio, srt_grand_total_display]
1143
  )
1144
+ with gr.Tab("其他工具"):
1145
+ with gr.Tab("双语"):
1146
+ input_translated_json = gr.File(label="输入翻译获得的Json文件")
1147
+ output_one_lang_txt = gr.File(label="输出仅译文")
1148
+ output_double_lang_txt = gr.File(label="输出双语内容")
1149
+ double_gen_button = gr.Button("转换")
1150
+
1151
+ double_gen_button.click(fn=translated_json2txt_file,inputs=[input_translated_json],outputs=[output_one_lang_txt, output_double_lang_txt])
1152
+ with gr.Tab("LaTeX格式化"):
1153
+ input_LaTeX_text = gr.File(label="待格式化的LaTeX文本")
1154
+ output_LaTeX_txt = gr.File(label="格式化后文本")
1155
+ LaTeX_gen_button = gr.Button("格式化")
1156
+
1157
+ LaTeX_gen_button.click(fn=latex2txt_blocks,inputs=[input_LaTeX_text], outputs=[output_LaTeX_txt])
1158
 
1159
  return demo
1160
 
 
1799
  # 调用核心函数生成PDF,并指定使用新的模板
1800
  generated_file = generate_quote_pdf(quote_data, output_path, "project.tex")
1801
  return generated_file
1802
+
1803
+ from pathlib import Path
1804
+ import json
1805
+ from typing import Union
1806
+
1807
# ---------- Helper: escape '%' characters that are not already backslash-protected ----------
_PERCENT_UNESCAPED_RE = re.compile(r'(?<!\\)%')  # a '%' whose preceding char is not a backslash

def escape_percent(text: str) -> str:
    r"""Return *text* with every unescaped ``%`` replaced by ``\%`` (LaTeX-safe)."""
    return _PERCENT_UNESCAPED_RE.sub(r'\\%', text)
1811
+ # -------------------------------------------------
1812
+
1813
def translated_json2txt_file(json_path: Union[str, Path]) -> tuple[str, str]:
    """Convert a translation JSON into two TXT files written next to the input.

    The JSON must carry a top-level ``chunks`` list; each chunk may provide
    ``original_chunk`` plus ``refined_chunk`` or ``translated_chunk``.
    Chunks are ordered by ``chunk_index`` and every translation has its
    unescaped ``%`` characters LaTeX-escaped.

    Outputs:
      * bilingual TXT  (``<stem>.txt``): original / translation pairs
        separated by blank lines;
      * translation-only TXT (``<stem>_translated.txt``).

    Returns:
        (translation-only path, bilingual path) as strings.

    Raises:
        FileNotFoundError: if *json_path* does not exist.
        ValueError: if the JSON has no ``chunks`` list.
    """
    json_path = Path(json_path).expanduser().resolve()
    if not json_path.is_file():
        raise FileNotFoundError(f"找不到 JSON 文件: {json_path}")

    # Destination files live beside the source JSON.
    bilingual_path = json_path.with_suffix(".txt")
    pure_path = json_path.with_name(f"{json_path.stem}_translated.txt")

    data = json.loads(json_path.read_text(encoding="utf-8"))
    if "chunks" not in data or not isinstance(data["chunks"], list):
        raise ValueError("输入 JSON 缺少 'chunks' 列表")

    # Collect (original, escaped translation) pairs in chunk_index order.
    pairs: list[tuple[str, str]] = []
    for chunk in sorted(data["chunks"], key=lambda c: c.get("chunk_index", 0)):
        source_text = (chunk.get("original_chunk") or "").strip()
        target_text = escape_percent(
            (chunk.get("refined_chunk") or chunk.get("translated_chunk") or "").strip()
        )
        pairs.append((source_text, target_text))

    # Strip redundant LaTeX line breaks from both output variants.
    bilingual_segments = [
        _strip_linebreak_backslash(f"{src}\n\n{dst}") for src, dst in pairs
    ]
    translation_segments = [_strip_linebreak_backslash(dst) for _, dst in pairs]

    bilingual_path.write_text("\n\n".join(bilingual_segments), encoding="utf-8")
    pure_path.write_text("\n\n".join(translation_segments), encoding="utf-8")

    return str(pure_path), str(bilingual_path)
1856
+
1857
+ from uuid import uuid4
1858
+
1859
+ from langchain_text_splitters.latex import LatexTextSplitter
1860
+ from typing import Union
1861
+
1862
# A '$' preceded by an even number of backslashes, i.e. not itself escaped.
_UNESCAPED_DOLLAR_RE = re.compile(r'(?<!\\)(?:\\\\)*\$')

def count_dollar_signs(text: str) -> int:
    r"""Count the ``$`` characters in a LaTeX string that are NOT escaped.

    Rules:
      * ``\$``  -> not counted (single backslash escapes the dollar)
      * ``\\$`` -> counted (the two backslashes escape each other,
        leaving the ``$`` unescaped)
    """
    return sum(1 for _ in _UNESCAPED_DOLLAR_RE.finditer(text))
1874
+
1875
+
1876
+ def _strip_linebreak_backslash(block: str) -> str:
1877
+ r"""
1878
+ 若区块不含 ``\begin``,根据四条规则清理多余的 ``\\``:
1879
+
1880
+ 1. “…X\\\n” → “…X” (X 不是换行 / . / 。;后面是换行)
1881
+ 2. “…X\\Y” → “…XY” (X 同上;后面不是换行)
1882
+ 3. “\n\\Y” → “Y” (前面是换行;后面不是换行)
1883
+ 4. “…[.。]\\” → “…[.。]\n”(前面是 . 或 。)
1884
+ """
1885
+ if r"\begin" in block:
1886
+ return block
1887
+
1888
+ # 1) 前面非换行/./。 且后面是换行: 删除 \\ 和其后的换行
1889
+ block = re.sub(r"([^\n.。])\\\\\n", r"\1", block)
1890
+
1891
+ # 3) 前面是换行,后面不是换行: 删除前面的换行和 \\
1892
+ block = re.sub(r"\n\\\\([^\n])", r"\1", block)
1893
+
1894
+ # 4) 前面是句点 . 或 全角句号 。: 把 \\ 替换为换行
1895
+ block = re.sub(r"([.。])\\\\", r"\1\n\n", block)
1896
+
1897
+ # 2) 前面非换行/./。 且后面不是换行: 仅删除 \\
1898
+ block = re.sub(r"([^\n.。])\\\\(?!\n)", r"\1", block)
1899
+
1900
+ return block
1901
+
1902
+
1903
+
1904
def latex2txt_blocks(
    latex_txt_path: Union[str, Path, None] = None,
    *,
    output_path: Union[str, Path, None] = None,
    chunk_size: int = 20_000,
    chunk_overlap: int = 0,
) -> str:
    r"""Split a LaTeX file into cleaned text blocks and write them to a TXT.

    Pipeline
    --------
    1. Read the file and keep only the ``\begin{document}…\end{document}``
       body (the whole file when no such pair exists).
    2. Coarse-split on blank lines (``\n\n``) into raw blocks.
    3. Refine each raw block with ``LatexTextSplitter``; the count of
       unescaped ``$`` must be unchanged by the split, else ``ValueError``.
    4. Strip redundant ``\\`` from blocks without ``\begin``
       (see ``_strip_linebreak_backslash``).
    5. Join with blank lines and apply tabular / width post-processing.
    6. Write the result and return its absolute path.

    Args:
        latex_txt_path: path of the LaTeX source file (required).
        output_path: destination TXT; a uuid-named file in the CWD if omitted.
        chunk_size: forwarded to ``LatexTextSplitter``.
        chunk_overlap: forwarded to ``LatexTextSplitter``.

    Returns:
        Absolute path of the written TXT file, as ``str``.

    Raises:
        ValueError: if *latex_txt_path* is ``None``, or the splitter changed
            the number of unescaped ``$`` signs.
        FileNotFoundError: if the input file does not exist.
    """
    # ---------- 1. extract the document body ----------
    # Fix: the default None used to fall straight into Path(None) and crash
    # with an opaque TypeError; fail early with an actionable message.
    if latex_txt_path is None:
        raise ValueError("latex_txt_path 不能为空:请提供待格式化的 LaTeX 文件路径")
    latex_txt_path = Path(latex_txt_path).expanduser().resolve()
    if not latex_txt_path.is_file():
        raise FileNotFoundError(f"找不到 LaTeX 文件: {latex_txt_path}")
    content = latex_txt_path.read_text(encoding="utf-8")
    doc_match = re.search(
        r"\\begin\{document\}(.*?)\\end\{document\}",
        content,
        flags=re.DOTALL | re.IGNORECASE,
    )
    content = doc_match.group(1) if doc_match else content

    # ---------- 2 & 3. coarse split + fine split + validation ----------
    splitter = LatexTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    fine_blocks: list[str] = []

    for raw_block in content.split("\n\n"):
        raw_block = raw_block.strip()
        if not raw_block:
            continue

        split_blocks = [b.strip() for b in splitter.split_text(raw_block) if b.strip()]

        # The splitter must neither create nor drop unescaped '$' signs,
        # otherwise math delimiters were corrupted.
        before = count_dollar_signs(raw_block)
        after = sum(count_dollar_signs(b) for b in split_blocks)
        if before != after:
            raise ValueError(
                "LatexTextSplitter 改变了 `$` 数量!\n\n"
                f"===== 原始块({before} 个 $) =====\n{raw_block}\n\n"
                f"===== 细分合并({after} 个 $) =====\n"
                + "\n\n==== 分块分隔 ====\n\n".join(split_blocks)
            )

        fine_blocks.extend(split_blocks)

    # ---------- 4. drop redundant '\\' line breaks ----------
    fine_blocks = [_strip_linebreak_backslash(b) for b in fine_blocks]
    txt = "\n\n".join(fine_blocks)

    # ---------- 5. post-processing ----------
    # Wrap every tabular in \resizebox{\textwidth}{!}{...} so wide tables
    # are scaled to the text width.
    txt = re.sub(
        r"(\\begin\{tabular\}\{[^}]*\})",
        r"\\resizebox{\\textwidth}{!}{\n\1",
        txt,
    )
    txt = re.sub(r"(\\end\{tabular\})", r"\1\n}", txt)
    # Rewrite "max width=\textwidth" options into explicit width/height bounds.
    txt = re.sub(r"max\s*width=\\textwidth", r"width =\\textwidth, height =\\textheight", txt)
    # Keep display math ($$) attached to the preceding paragraph.
    txt = re.sub(r"\n\n(\$\$)", r"\n\1", txt)

    # ---------- 6. write the final file ----------
    if output_path is None:
        output_path = Path.cwd() / f"latex_blocks_{uuid4().hex[:8]}.txt"
    output_path = Path(output_path).expanduser().resolve()
    output_path.write_text(txt, encoding="utf-8")

    return str(output_path)
1979
+
1980
+
1981
  # ==============================================================================
1982
  # 运行应用
1983
  # ==============================================================================
requirements.txt CHANGED
@@ -1 +1,3 @@
1
- gradio
 
 
 
1
+ gradio
2
+ langchain-text-splitters
3
+ charset-normalizer