Spaces:
Sleeping
Sleeping
Commit ·
08f57fb
1
Parent(s): 47d9a92
Update
Browse files
- app.py +193 -0
- requirements.txt +3 -1
app.py
CHANGED
|
@@ -1141,6 +1141,20 @@ def create_gradio_app():
|
|
| 1141 |
inputs=[srt_quantity, srt_layout_selector],
|
| 1142 |
outputs=srt_unit_prices + srt_total_prices + [srt_service_choice_radio, srt_grand_total_display]
|
| 1143 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1144 |
|
| 1145 |
return demo
|
| 1146 |
|
|
@@ -1785,6 +1799,185 @@ def project_handle_form_submission(
|
|
| 1785 |
# 调用核心函数生成PDF,并指定使用新的模板
|
| 1786 |
generated_file = generate_quote_pdf(quote_data, output_path, "project.tex")
|
| 1787 |
return generated_file
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1788 |
# ==============================================================================
|
| 1789 |
# 运行应用
|
| 1790 |
# ==============================================================================
|
|
|
|
| 1141 |
inputs=[srt_quantity, srt_layout_selector],
|
| 1142 |
outputs=srt_unit_prices + srt_total_prices + [srt_service_choice_radio, srt_grand_total_display]
|
| 1143 |
)
|
| 1144 |
+
with gr.Tab("其他工具"):
|
| 1145 |
+
with gr.Tab("双语"):
|
| 1146 |
+
input_translated_json = gr.File(label="输入翻译获得的Json文件")
|
| 1147 |
+
output_one_lang_txt = gr.File(label="输出仅译文")
|
| 1148 |
+
output_double_lang_txt = gr.File(label="输出双语内容")
|
| 1149 |
+
double_gen_button = gr.Button("转换")
|
| 1150 |
+
|
| 1151 |
+
double_gen_button.click(fn=translated_json2txt_file,inputs=[input_translated_json],outputs=[output_one_lang_txt, output_double_lang_txt])
|
| 1152 |
+
with gr.Tab("LaTeX格式化"):
|
| 1153 |
+
input_LaTeX_text = gr.File(label="待格式化的LaTeX文本")
|
| 1154 |
+
output_LaTeX_txt = gr.File(label="格式化后文本")
|
| 1155 |
+
LaTeX_gen_button = gr.Button("格式化")
|
| 1156 |
+
|
| 1157 |
+
LaTeX_gen_button.click(fn=latex2txt_blocks,inputs=[input_LaTeX_text], outputs=[output_LaTeX_txt])
|
| 1158 |
|
| 1159 |
return demo
|
| 1160 |
|
|
|
|
| 1799 |
# 调用核心函数生成PDF,并指定使用新的模板
|
| 1800 |
generated_file = generate_quote_pdf(quote_data, output_path, "project.tex")
|
| 1801 |
return generated_file
|
| 1802 |
+
|
| 1803 |
+
from pathlib import Path
|
| 1804 |
+
import json
|
| 1805 |
+
from typing import Union
|
| 1806 |
+
|
| 1807 |
+
# ---------- Helper: escape '%' characters not protected by a backslash ----------
_ESCAPE_PERCENT_RE = re.compile(r'(?<!\\)%')  # a '%' whose preceding char is not '\'


def escape_percent(text: str) -> str:
    """Return *text* with every unescaped '%' rewritten as '\\%' (LaTeX-safe).

    A '%' that already carries a single backslash escape is left untouched.
    """
    escaped, _n = _ESCAPE_PERCENT_RE.subn(r'\\%', text)
    return escaped
# -------------------------------------------------
|
| 1812 |
+
|
| 1813 |
+
def translated_json2txt_file(json_path: Union[str, Path]) -> tuple[str, str]:
    """Convert a translation JSON (with a 'chunks' list) into two TXT files.

    Produces two outputs next to the input file:
      * a bilingual TXT  — "original\\n\\ntranslation" pairs joined by blank lines
      * a translation-only TXT — translations joined by blank lines

    Returns:
        (path to translation-only TXT, path to bilingual TXT)

    Raises:
        FileNotFoundError: if *json_path* does not exist.
        ValueError: if the JSON has no 'chunks' list.
    """
    src = Path(json_path).expanduser().resolve()
    if not src.is_file():
        raise FileNotFoundError(f"找不到 JSON 文件: {src}")

    # Output locations: bilingual shares the stem, pure translation gets a suffix.
    bilingual_path = src.with_suffix(".txt")
    pure_path = src.with_name(f"{src.stem}_translated.txt")

    with src.open("r", encoding="utf-8") as fh:
        payload = json.load(fh)
    chunks = payload.get("chunks")
    if not isinstance(chunks, list):
        raise ValueError("输入 JSON 缺少 'chunks' 列表")

    # Pair originals with translations in chunk order; prefer the refined
    # translation when available and escape bare '%' for LaTeX safety.
    bilingual_segments: list[str] = []
    translation_segments: list[str] = []
    for chunk in sorted(chunks, key=lambda c: c.get("chunk_index", 0)):
        source_text = (chunk.get("original_chunk") or "").strip()
        target_text = escape_percent(
            (chunk.get("refined_chunk") or chunk.get("translated_chunk") or "").strip()
        )
        # Clean stray LaTeX line-break backslashes from each segment as it is built.
        bilingual_segments.append(
            _strip_linebreak_backslash(f"{source_text}\n\n{target_text}")
        )
        translation_segments.append(_strip_linebreak_backslash(target_text))

    bilingual_path.write_text("\n\n".join(bilingual_segments), encoding="utf-8")
    pure_path.write_text("\n\n".join(translation_segments), encoding="utf-8")

    return str(pure_path), str(bilingual_path)
|
| 1856 |
+
|
| 1857 |
+
from uuid import uuid4
|
| 1858 |
+
|
| 1859 |
+
from langchain_text_splitters.latex import LatexTextSplitter
|
| 1860 |
+
from typing import Union
|
| 1861 |
+
|
| 1862 |
+
_DOLLAR_RE = re.compile(r'(?<!\\)(?:\\\\)*\$')


def count_dollar_signs(text: str) -> int:
    r"""Count the '$' characters in a LaTeX string that are NOT escaped.

    Rules
    -----
    - ``\$``  -> not counted (single backslash escapes the dollar)
    - ``\\$`` -> counted (the pair renders a literal '\', leaving '$' bare)
    """
    return sum(1 for _ in _DOLLAR_RE.finditer(text))
|
| 1874 |
+
|
| 1875 |
+
|
| 1876 |
+
def _strip_linebreak_backslash(block: str) -> str:
|
| 1877 |
+
r"""
|
| 1878 |
+
若区块不含 ``\begin``,根据四条规则清理多余的 ``\\``:
|
| 1879 |
+
|
| 1880 |
+
1. “…X\\\n” → “…X” (X 不是换行 / . / 。;后面是换行)
|
| 1881 |
+
2. “…X\\Y” → “…XY” (X 同上;后面不是换行)
|
| 1882 |
+
3. “\n\\Y” → “Y” (前面是换行;后面不是换行)
|
| 1883 |
+
4. “…[.。]\\” → “…[.。]\n”(前面是 . 或 。)
|
| 1884 |
+
"""
|
| 1885 |
+
if r"\begin" in block:
|
| 1886 |
+
return block
|
| 1887 |
+
|
| 1888 |
+
# 1) 前面非换行/./。 且后面是换行: 删除 \\ 和其后的换行
|
| 1889 |
+
block = re.sub(r"([^\n.。])\\\\\n", r"\1", block)
|
| 1890 |
+
|
| 1891 |
+
# 3) 前面是换行,后面不是换行: 删除前面的换行和 \\
|
| 1892 |
+
block = re.sub(r"\n\\\\([^\n])", r"\1", block)
|
| 1893 |
+
|
| 1894 |
+
# 4) 前面是句点 . 或 全角句号 。: 把 \\ 替换为换行
|
| 1895 |
+
block = re.sub(r"([.。])\\\\", r"\1\n\n", block)
|
| 1896 |
+
|
| 1897 |
+
# 2) 前面非换行/./。 且后面不是换行: 仅删除 \\
|
| 1898 |
+
block = re.sub(r"([^\n.。])\\\\(?!\n)", r"\1", block)
|
| 1899 |
+
|
| 1900 |
+
return block
|
| 1901 |
+
|
| 1902 |
+
|
| 1903 |
+
|
| 1904 |
+
def latex2txt_blocks(
    latex_txt_path: Union[str, Path, None] = None,
    *,
    output_path: Union[str, Path, None] = None,
    chunk_size: int = 20_000,
    chunk_overlap: int = 0,
) -> str:
    r"""Split a LaTeX file into text blocks and write them to a TXT file.

    Pipeline
    --------
    1. Read the file and keep only ``\begin{document}…\end{document}``
       (fall back to the whole file when no document environment exists).
    2. Coarse-split on blank lines (``\n\n``) into raw blocks.
    3. Fine-split each raw block with ``LatexTextSplitter``; verify that the
       count of unescaped ``$`` is unchanged, otherwise raise ``ValueError``.
    4. For blocks without ``\begin``, strip superfluous ``\\`` line breaks
       (see ``_strip_linebreak_backslash``).
    5. Join the blocks with blank lines and apply the tabular / display-math
       post-processing fixes.
    6. Write the final TXT (auto-named in the CWD when *output_path* is
       omitted) and return its absolute path as a string.

    Raises
    ------
    ValueError: when no input path was given, or the splitter changed the
        number of unescaped ``$`` characters.
    FileNotFoundError: when *latex_txt_path* does not exist.
    """
    # ---------- 0. Validate input (Path(None) would raise a cryptic TypeError) ----------
    if latex_txt_path is None:
        raise ValueError("latex_txt_path 不能为空:请提供待格式化的 LaTeX 文件路径")
    latex_txt_path = Path(latex_txt_path).expanduser().resolve()
    if not latex_txt_path.is_file():
        raise FileNotFoundError(f"找不到 LaTeX 文件: {latex_txt_path}")

    # ---------- 1. Extract the document body ----------
    content = latex_txt_path.read_text(encoding="utf-8")
    doc_match = re.search(
        r"\\begin\{document\}(.*?)\\end\{document\}",
        content,
        flags=re.DOTALL | re.IGNORECASE,
    )
    content = doc_match.group(1) if doc_match else content

    # ---------- 2 & 3. Coarse split + fine split + '$' count check ----------
    splitter = LatexTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    fine_blocks: list[str] = []

    for raw_block in content.split("\n\n"):
        raw_block = raw_block.strip()
        if not raw_block:
            continue

        split_blocks = [b.strip() for b in splitter.split_text(raw_block) if b.strip()]

        # The splitter must not eat or duplicate unescaped '$' math delimiters.
        before = count_dollar_signs(raw_block)
        after = sum(count_dollar_signs(b) for b in split_blocks)
        if before != after:
            raise ValueError(
                "LatexTextSplitter 改变了 `$` 数量!\n\n"
                f"===== 原始块({before} 个 $) =====\n{raw_block}\n\n"
                f"===== 细分合并({after} 个 $) =====\n"
                + "\n\n==== 分块分隔 ====\n\n".join(split_blocks)
            )

        fine_blocks.extend(split_blocks)

    # ---------- 4. Drop superfluous '\\' line breaks ----------
    fine_blocks = [_strip_linebreak_backslash(b) for b in fine_blocks]
    txt = "\n\n".join(fine_blocks)

    # ---------- 5. Post-processing ----------
    # Wrap tabulars in \resizebox so wide tables fit the page width.
    txt = re.sub(
        r"(\\begin\{tabular\}\{[^}]*\})",
        r"\\resizebox{\\textwidth}{!}{\n\1",
        txt,
    )
    txt = re.sub(r"(\\end\{tabular\})", r"\1\n}", txt)
    # Replace 'max width' adjustbox options with explicit width/height bounds.
    txt = re.sub(r"max\s*width=\\textwidth", r"width =\\textwidth, height =\\textheight", txt)
    # Keep display math ($$) attached to the preceding paragraph.
    txt = re.sub(r"\n\n(\$\$)", r"\n\1", txt)

    # ---------- 6. Write the final file ----------
    if output_path is None:
        output_path = Path.cwd() / f"latex_blocks_{uuid4().hex[:8]}.txt"
    output_path = Path(output_path).expanduser().resolve()
    output_path.write_text(txt, encoding="utf-8")

    return str(output_path)
|
| 1979 |
+
|
| 1980 |
+
|
| 1981 |
# ==============================================================================
|
| 1982 |
# 运行应用
|
| 1983 |
# ==============================================================================
|
requirements.txt
CHANGED
|
@@ -1 +1,3 @@
|
|
| 1 |
-
gradio
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
langchain-text-splitters
|
| 3 |
+
charset-normalizer
|