JohnnyEudora committed on
Commit
08f57fb
·
1 Parent(s): 47d9a92
Files changed (2) hide show
  1. app.py +193 -0
  2. requirements.txt +3 -1
app.py CHANGED
@@ -1141,6 +1141,20 @@ def create_gradio_app():
1141
  inputs=[srt_quantity, srt_layout_selector],
1142
  outputs=srt_unit_prices + srt_total_prices + [srt_service_choice_radio, srt_grand_total_display]
1143
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1144
 
1145
  return demo
1146
 
@@ -1785,6 +1799,185 @@ def project_handle_form_submission(
1785
  # 调用核心函数生成PDF,并指定使用新的模板
1786
  generated_file = generate_quote_pdf(quote_data, output_path, "project.tex")
1787
  return generated_file
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1788
  # ==============================================================================
1789
  # 运行应用
1790
  # ==============================================================================
 
1141
  inputs=[srt_quantity, srt_layout_selector],
1142
  outputs=srt_unit_prices + srt_total_prices + [srt_service_choice_radio, srt_grand_total_display]
1143
  )
1144
+ with gr.Tab("其他工具"):
1145
+ with gr.Tab("双语"):
1146
+ input_translated_json = gr.File(label="输入翻译获得的Json文件")
1147
+ output_one_lang_txt = gr.File(label="输出仅译文")
1148
+ output_double_lang_txt = gr.File(label="输出双语内容")
1149
+ double_gen_button = gr.Button("转换")
1150
+
1151
+ double_gen_button.click(fn=translated_json2txt_file,inputs=[input_translated_json],outputs=[output_one_lang_txt, output_double_lang_txt])
1152
+ with gr.Tab("LaTeX格式化"):
1153
+ input_LaTeX_text = gr.File(label="待格式化的LaTeX文本")
1154
+ output_LaTeX_txt = gr.File(label="格式化后文本")
1155
+ LaTeX_gen_button = gr.Button("格式化")
1156
+
1157
+ LaTeX_gen_button.click(fn=latex2txt_blocks,inputs=[input_LaTeX_text], outputs=[output_LaTeX_txt])
1158
 
1159
  return demo
1160
 
 
1799
  # 调用核心函数生成PDF,并指定使用新的模板
1800
  generated_file = generate_quote_pdf(quote_data, output_path, "project.tex")
1801
  return generated_file
1802
+
1803
+ from pathlib import Path
1804
+ import json
1805
+ from typing import Union
1806
+
1807
# ---------- Helper: escape '%' characters that are not already backslash-protected ----------
_PERCENT_UNESCAPED_RE = re.compile(r'(?<!\\)%')  # a '%' whose preceding char is not a backslash

def escape_percent(text: str) -> str:
    r"""Return *text* with every unescaped ``%`` replaced by ``\%`` (LaTeX-safe)."""
    return _PERCENT_UNESCAPED_RE.sub(r'\\%', text)
1811
+ # -------------------------------------------------
1812
+
1813
def translated_json2txt_file(json_path: Union[str, Path]) -> tuple[str, str]:
    """Convert a translation JSON into two TXT files written next to the input.

    The JSON must carry a top-level ``chunks`` list; each chunk may provide
    ``original_chunk`` plus ``refined_chunk`` or ``translated_chunk``.
    Chunks are ordered by ``chunk_index`` and every translation has its
    unescaped ``%`` characters LaTeX-escaped.

    Outputs:
      * bilingual TXT  (``<stem>.txt``): original / translation pairs
        separated by blank lines;
      * translation-only TXT (``<stem>_translated.txt``).

    Returns:
        (translation-only path, bilingual path) as strings.

    Raises:
        FileNotFoundError: if *json_path* does not exist.
        ValueError: if the JSON has no ``chunks`` list.
    """
    json_path = Path(json_path).expanduser().resolve()
    if not json_path.is_file():
        raise FileNotFoundError(f"找不到 JSON 文件: {json_path}")

    # Destination files live beside the source JSON.
    bilingual_path = json_path.with_suffix(".txt")
    pure_path = json_path.with_name(f"{json_path.stem}_translated.txt")

    data = json.loads(json_path.read_text(encoding="utf-8"))
    if "chunks" not in data or not isinstance(data["chunks"], list):
        raise ValueError("输入 JSON 缺少 'chunks' 列表")

    # Collect (original, escaped translation) pairs in chunk_index order.
    pairs: list[tuple[str, str]] = []
    for chunk in sorted(data["chunks"], key=lambda c: c.get("chunk_index", 0)):
        source_text = (chunk.get("original_chunk") or "").strip()
        target_text = escape_percent(
            (chunk.get("refined_chunk") or chunk.get("translated_chunk") or "").strip()
        )
        pairs.append((source_text, target_text))

    # Strip redundant LaTeX line breaks from both output variants.
    bilingual_segments = [
        _strip_linebreak_backslash(f"{src}\n\n{dst}") for src, dst in pairs
    ]
    translation_segments = [_strip_linebreak_backslash(dst) for _, dst in pairs]

    bilingual_path.write_text("\n\n".join(bilingual_segments), encoding="utf-8")
    pure_path.write_text("\n\n".join(translation_segments), encoding="utf-8")

    return str(pure_path), str(bilingual_path)
1856
+
1857
+ from uuid import uuid4
1858
+
1859
+ from langchain_text_splitters.latex import LatexTextSplitter
1860
+ from typing import Union
1861
+
1862
# A '$' preceded by an even number of backslashes, i.e. not itself escaped.
_UNESCAPED_DOLLAR_RE = re.compile(r'(?<!\\)(?:\\\\)*\$')

def count_dollar_signs(text: str) -> int:
    r"""Count the ``$`` characters in a LaTeX string that are NOT escaped.

    Rules:
      * ``\$``  -> not counted (single backslash escapes the dollar)
      * ``\\$`` -> counted (the two backslashes escape each other,
        leaving the ``$`` unescaped)
    """
    return sum(1 for _ in _UNESCAPED_DOLLAR_RE.finditer(text))
1874
+
1875
+
1876
+ def _strip_linebreak_backslash(block: str) -> str:
1877
+ r"""
1878
+ 若区块不含 ``\begin``,根据四条规则清理多余的 ``\\``:
1879
+
1880
+ 1. “…X\\\n” → “…X” (X 不是换行 / . / 。;后面是换行)
1881
+ 2. “…X\\Y” → “…XY” (X 同上;后面不是换行)
1882
+ 3. “\n\\Y” → “Y” (前面是换行;后面不是换行)
1883
+ 4. “…[.。]\\” → “…[.。]\n”(前面是 . 或 。)
1884
+ """
1885
+ if r"\begin" in block:
1886
+ return block
1887
+
1888
+ # 1) 前面非换行/./。 且后面是换行: 删除 \\ 和其后的换行
1889
+ block = re.sub(r"([^\n.。])\\\\\n", r"\1", block)
1890
+
1891
+ # 3) 前面是换行,后面不是换行: 删除前面的换行和 \\
1892
+ block = re.sub(r"\n\\\\([^\n])", r"\1", block)
1893
+
1894
+ # 4) 前面是句点 . 或 全角句号 。: 把 \\ 替换为换行
1895
+ block = re.sub(r"([.。])\\\\", r"\1\n\n", block)
1896
+
1897
+ # 2) 前面非换行/./。 且后面不是换行: 仅删除 \\
1898
+ block = re.sub(r"([^\n.。])\\\\(?!\n)", r"\1", block)
1899
+
1900
+ return block
1901
+
1902
+
1903
+
1904
def latex2txt_blocks(
    latex_txt_path: Union[str, Path, None] = None,
    *,
    output_path: Union[str, Path, None] = None,
    chunk_size: int = 20_000,
    chunk_overlap: int = 0,
) -> str:
    r"""Split a LaTeX file into cleaned text blocks and write them to a TXT.

    Pipeline
    --------
    1. Read the file and keep only the ``\begin{document}…\end{document}``
       body (the whole file when no such pair exists).
    2. Coarse-split on blank lines (``\n\n``) into raw blocks.
    3. Refine each raw block with ``LatexTextSplitter``; the count of
       unescaped ``$`` must be unchanged by the split, else ``ValueError``.
    4. Strip redundant ``\\`` from blocks without ``\begin``
       (see ``_strip_linebreak_backslash``).
    5. Join with blank lines and apply tabular / width post-processing.
    6. Write the result and return its absolute path.

    Args:
        latex_txt_path: path of the LaTeX source file (required).
        output_path: destination TXT; a uuid-named file in the CWD if omitted.
        chunk_size: forwarded to ``LatexTextSplitter``.
        chunk_overlap: forwarded to ``LatexTextSplitter``.

    Returns:
        Absolute path of the written TXT file, as ``str``.

    Raises:
        ValueError: if *latex_txt_path* is ``None``, or the splitter changed
            the number of unescaped ``$`` signs.
        FileNotFoundError: if the input file does not exist.
    """
    # ---------- 1. extract the document body ----------
    # Fix: the default None used to fall straight into Path(None) and crash
    # with an opaque TypeError; fail early with an actionable message.
    if latex_txt_path is None:
        raise ValueError("latex_txt_path 不能为空:请提供待格式化的 LaTeX 文件路径")
    latex_txt_path = Path(latex_txt_path).expanduser().resolve()
    if not latex_txt_path.is_file():
        raise FileNotFoundError(f"找不到 LaTeX 文件: {latex_txt_path}")
    content = latex_txt_path.read_text(encoding="utf-8")
    doc_match = re.search(
        r"\\begin\{document\}(.*?)\\end\{document\}",
        content,
        flags=re.DOTALL | re.IGNORECASE,
    )
    content = doc_match.group(1) if doc_match else content

    # ---------- 2 & 3. coarse split + fine split + validation ----------
    splitter = LatexTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    fine_blocks: list[str] = []

    for raw_block in content.split("\n\n"):
        raw_block = raw_block.strip()
        if not raw_block:
            continue

        split_blocks = [b.strip() for b in splitter.split_text(raw_block) if b.strip()]

        # The splitter must neither create nor drop unescaped '$' signs,
        # otherwise math delimiters were corrupted.
        before = count_dollar_signs(raw_block)
        after = sum(count_dollar_signs(b) for b in split_blocks)
        if before != after:
            raise ValueError(
                "LatexTextSplitter 改变了 `$` 数量!\n\n"
                f"===== 原始块({before} 个 $) =====\n{raw_block}\n\n"
                f"===== 细分合并({after} 个 $) =====\n"
                + "\n\n==== 分块分隔 ====\n\n".join(split_blocks)
            )

        fine_blocks.extend(split_blocks)

    # ---------- 4. drop redundant '\\' line breaks ----------
    fine_blocks = [_strip_linebreak_backslash(b) for b in fine_blocks]
    txt = "\n\n".join(fine_blocks)

    # ---------- 5. post-processing ----------
    # Wrap every tabular in \resizebox{\textwidth}{!}{...} so wide tables
    # are scaled to the text width.
    txt = re.sub(
        r"(\\begin\{tabular\}\{[^}]*\})",
        r"\\resizebox{\\textwidth}{!}{\n\1",
        txt,
    )
    txt = re.sub(r"(\\end\{tabular\})", r"\1\n}", txt)
    # Rewrite "max width=\textwidth" options into explicit width/height bounds.
    txt = re.sub(r"max\s*width=\\textwidth", r"width =\\textwidth, height =\\textheight", txt)
    # Keep display math ($$) attached to the preceding paragraph.
    txt = re.sub(r"\n\n(\$\$)", r"\n\1", txt)

    # ---------- 6. write the final file ----------
    if output_path is None:
        output_path = Path.cwd() / f"latex_blocks_{uuid4().hex[:8]}.txt"
    output_path = Path(output_path).expanduser().resolve()
    output_path.write_text(txt, encoding="utf-8")

    return str(output_path)
1979
+
1980
+
1981
  # ==============================================================================
1982
  # 运行应用
1983
  # ==============================================================================
requirements.txt CHANGED
@@ -1 +1,3 @@
1
- gradio
 
 
 
1
+ gradio
2
+ langchain-text-splitters
3
+ charset-normalizer