Spaces:
Sleeping
Sleeping
Commit ·
e4670ae
1
Parent(s): ad68d34
update
Browse files
- app.py +214 -4
- requirements.txt +2 -1
app.py
CHANGED
|
@@ -4,7 +4,12 @@ import subprocess
|
|
| 4 |
import tempfile
|
| 5 |
import shutil
|
| 6 |
import re
|
|
|
|
|
|
|
| 7 |
from datetime import datetime
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
# ==============================================================================
|
|
@@ -1163,6 +1168,16 @@ def create_gradio_app():
|
|
| 1163 |
LaTeX_gen_button = gr.Button("格式化")
|
| 1164 |
|
| 1165 |
LaTeX_gen_button.click(fn=latex2txt_blocks, inputs=[input_LaTeX_text], outputs=[output_LaTeX_txt])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1166 |
|
| 1167 |
return demo
|
| 1168 |
|
|
@@ -1811,9 +1826,204 @@ def project_handle_form_submission(
|
|
| 1811 |
return generated_file
|
| 1812 |
|
| 1813 |
|
| 1814 |
-
|
| 1815 |
-
|
| 1816 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1817 |
|
| 1818 |
# ---------- 辅助:转义未被反斜杠保护的 % ----------
|
| 1819 |
_ESCAPE_PERCENT_RE = re.compile(r'(?<!\\)%') # 前面不是反斜杠的 %
|
|
@@ -2001,7 +2211,7 @@ def latex2txt_blocks(
|
|
| 2001 |
latex_txt_path = Path(latex_txt_path).expanduser().resolve()
|
| 2002 |
content = latex_txt_path.read_text(encoding="utf-8")
|
| 2003 |
doc_match = re.search(
|
| 2004 |
-
r"\\begin\{document}(.*?)\\end\{document
|
| 2005 |
content,
|
| 2006 |
flags=re.DOTALL | re.IGNORECASE,
|
| 2007 |
)
|
|
|
|
| 4 |
import tempfile
|
| 5 |
import shutil
|
| 6 |
import re
|
| 7 |
+
import warnings
|
| 8 |
+
import pandas as pd
|
| 9 |
from datetime import datetime
|
| 10 |
+
from typing import List, Dict, Optional, Iterable, Union
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
import json
|
| 13 |
|
| 14 |
|
| 15 |
# ==============================================================================
|
|
|
|
| 1168 |
LaTeX_gen_button = gr.Button("格式化")
|
| 1169 |
|
| 1170 |
LaTeX_gen_button.click(fn=latex2txt_blocks, inputs=[input_LaTeX_text], outputs=[output_LaTeX_txt])
|
| 1171 |
+
with gr.Tab("想学的很多"):
|
| 1172 |
+
input_excel = gr.File(label="输入excel文件")
|
| 1173 |
+
output_csv = gr.File(label="待Notion的csv文件")
|
| 1174 |
+
|
| 1175 |
+
free_button = gr.Button("免费队列")
|
| 1176 |
+
money_button = gr.Button("付费队列")
|
| 1177 |
+
|
| 1178 |
+
free_button.click(fn=build_free, inputs=[input_excel],outputs=[output_csv])
|
| 1179 |
+
money_button.click(fn=build_money, inputs=[input_excel], outputs=[output_csv])
|
| 1180 |
+
|
| 1181 |
|
| 1182 |
return demo
|
| 1183 |
|
|
|
|
| 1826 |
return generated_file
|
| 1827 |
|
| 1828 |
|
| 1829 |
+
def build_free(
    input_path: str,
    output_path: Optional[str] = None,
    default_status: str = "待处理",
    dedup_by_link: bool = True,
) -> str:
    """Convert a raw course-submission spreadsheet into a queue CSV.

    The output always has six columns, in this order:
    提交时间, 状态, 标题, 链接, 提交者, 备注.

    Rules:
      * 提交时间, 链接, 提交者 and 备注 are copied from the input as stripped
        text; missing cells become empty strings.
      * 状态 is ``default_status`` on every row.
      * 标题 is built as ``【org】author《cn - en》``:
          - the author is omitted when blank or a placeholder such as "无";
          - only the course names that are present are used, and the 《》
            part is omitted when neither is present;
          - a blank organisation is replaced by "未填写机构".
      * When ``dedup_by_link`` is true, a row whose link (compared
        case-insensitively) repeats an earlier row is dropped. Rows with an
        empty link are never treated as duplicates of each other.

    Accepted header names (the first one found in the sheet wins):
      time:      提交时间, 提交时间(自动)
      org:       机构/学校(必填), 机构/学校, 机构, 学校
      author:    作者信息(必填), 作者信息, 作者
      title_en:  课程名(原语言课程名), 课程名(原文课程名), 原语言课程名,
                 英文课程名, 课程名(英文)
      title_cn:  课程名(中文课程名), 中文课程名, 课程名(中文), 课程名-中文
      link:      课程链接(必填), 链接, URL
      remark:    备注, 说明
      submitter: 提交者(自动), 提交者, 提交人

    Args:
        input_path: Path of the spreadsheet. Excel is tried first; if that
            fails the file is retried as CSV with a few common encodings.
        output_path: Destination CSV. Defaults to ``<input stem>_队列.csv``
            next to the input file.
        default_status: Value written to the 状态 column.
        dedup_by_link: Whether to drop rows that repeat an earlier link.

    Returns:
        The path of the CSV that was written (encoded as utf-8-sig).

    Raises:
        RuntimeError: The input could be read neither as Excel nor as CSV.
        KeyError: None of the accepted header names for a field is present.
    """
    # NOTE(review): depending on the Gradio version the File component may
    # hand over a temp-file wrapper instead of a path string; use its
    # ``.name`` in that case -- confirm against the installed version.
    source = input_path
    if not isinstance(source, (str, Path)) and hasattr(source, "name"):
        source = source.name

    # --------- Load the table (Excel first, CSV as a fallback) ---------
    with warnings.catch_warnings():
        # Silence openpyxl's "no default style" warning.
        warnings.filterwarnings(
            "ignore",
            category=UserWarning,
            module=r"openpyxl\.styles\.stylesheet",
        )
        try:
            df = pd.read_excel(source)  # first worksheet by default
        except Exception as excel_error:
            # Best effort: the user may have uploaded a .csv by mistake.
            df = None
            for enc in ("utf-8", "utf-8-sig", "gb18030"):
                try:
                    df = pd.read_csv(source, encoding=enc)
                    break
                except Exception:
                    continue
            if df is None:
                # Keep the Excel failure attached so the root cause is visible.
                raise RuntimeError("无法读取为 Excel 或 CSV。") from excel_error

    # --------- Resolve column names (several spellings are accepted) ---------
    candidates: Dict[str, List[str]] = {
        "time": ["提交时间", "提交时间(自动)"],
        "org": ["机构/学校(必填)", "机构/学校", "机构", "学校"],
        "author": ["作者信息(必填)", "作者信息", "作者"],
        "title_en": ["课程名(原语言课程名)", "课程名(原文课程名)", "原语言课程名", "英文课程名", "课程名(英文)"],
        "title_cn": ["课程名(中文课程名)", "中文课程名", "课程名(中文)", "课程名-中文"],
        "link": ["课程链接(必填)", "链接", "URL"],
        "remark": ["备注", "说明"],
        "submitter": ["提交者(自动)", "提交者", "提交人"],
    }

    def resolve_col(name_list: Iterable[str]) -> str:
        for name in name_list:
            if name in df.columns:
                return name
        # List what was expected and what exists to make the failure actionable.
        raise KeyError(f"找不到列名(任一均可):{list(name_list)};现有列:{list(df.columns)}")

    cols = {key: resolve_col(names) for key, names in candidates.items()}

    # --------- Cell cleaning and title construction ---------
    no_author = {"", "无", "None", "none", "N/A", "n/a", "null", "-", "—", "——"}

    def _s(x) -> str:
        return "" if pd.isna(x) else str(x).strip()

    def build_title(org: str, author: str, cn: str, en: str) -> str:
        title = f"【{org or '未填写机构'}】"
        if author not in no_author:
            title += author
        inner = " - ".join(part for part in (cn, en) if part)
        if inner:
            title += f"《{inner}》"
        return title

    # Rows are assembled one by one so that a sheet with headers but no data
    # still yields a well-formed (header-only) CSV.
    field_order = ("time", "org", "author", "title_cn", "title_en", "link", "submitter", "remark")
    selected = df[[cols[key] for key in field_order]]

    rows = []
    seen_links = set()
    for record in selected.itertuples(index=False, name=None):
        when, org, author, cn, en, link, submitter, remark = (_s(v) for v in record)
        # Only real links take part in de-duplication; blank links are kept.
        if dedup_by_link and link:
            link_key = link.lower()
            if link_key in seen_links:
                continue
            seen_links.add(link_key)
        rows.append((when, default_status, build_title(org, author, cn, en), link, submitter, remark))

    out = pd.DataFrame(rows, columns=["提交时间", "状态", "标题", "链接", "提交者", "备注"])

    # --------- Write the CSV ---------
    if output_path is None:
        in_path = Path(source)
        output_path = str(in_path.with_name(in_path.stem + "_队列.csv"))

    out.to_csv(output_path, index=False, encoding="utf-8-sig")
    return output_path
|
| 1960 |
+
|
| 1961 |
+
|
| 1962 |
+
def build_money(input_excel_path: str) -> str:
    """Convert a submission spreadsheet into the paid-queue CSV.

    The output columns are 提交时间, 状态, 标题, 链接, 提交者, 备注. Every
    row gets the status "待处理"; rows are never de-duplicated. The title is
    ``【org】author《cn - en》``: the author is dropped when blank or a
    placeholder such as "无", only the course names that are present are
    used, and a blank organisation becomes "未填写机构".

    Unlike ``build_free`` this function requires the exact form headers
    listed in ``required_cols`` below.

    Args:
        input_excel_path: Path of the spreadsheet. Excel is tried first; if
            that fails the file is retried as CSV (utf-8, utf-8-sig,
            gb18030), matching what ``build_free`` accepts.

    Returns:
        Path of the written CSV, ``<input stem>_队列.csv`` next to the input
        file, encoded as utf-8-sig.

    Raises:
        ValueError: One or more required columns are missing.
        Exception: Whatever ``pandas.read_excel`` raised, when the file can
            be read neither as Excel nor as CSV.
    """
    # NOTE(review): depending on the Gradio version the File component may
    # hand over a temp-file wrapper instead of a path string; use its
    # ``.name`` in that case -- confirm against the installed version.
    source = input_excel_path
    if not isinstance(source, (str, Path)) and hasattr(source, "name"):
        source = source.name

    with warnings.catch_warnings():
        # Silence openpyxl's "no default style" warning.
        warnings.filterwarnings(
            "ignore",
            category=UserWarning,
            module=r"openpyxl\.styles\.stylesheet",
        )
        try:
            df = pd.read_excel(source)  # first worksheet by default
        except Exception:
            # Best effort: the user may have uploaded a .csv by mistake.
            df = None
            for enc in ("utf-8", "utf-8-sig", "gb18030"):
                try:
                    df = pd.read_csv(source, encoding=enc)
                    break
                except Exception:
                    continue
            if df is None:
                # Nothing worked: surface the original Excel error unchanged.
                raise

    col_submit_time = "提交时间(自动)"
    col_inst = "机构/学校(必填)"
    col_author = "作者信息(必填)"
    col_orig = "课程名(原语言课程名)"  # source of the "English" course name
    col_cn = "课程名(中文课程名)"
    col_link = "课程链接(必填)"
    col_remark = "备注"
    col_submitter = "提交者(自动)"

    required_cols = [
        col_submit_time,
        col_inst,
        col_author,
        col_orig,
        col_cn,
        col_link,
        col_remark,
        col_submitter,
    ]
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise ValueError(f"缺少必要列:{missing}")

    # Same placeholder list as build_free, so both queues format titles alike.
    no_author = {"", "无", "None", "none", "N/A", "n/a", "null", "-", "—", "——"}

    def _s(x) -> str:
        return "" if pd.isna(x) else str(x).strip()

    def _build_title(inst: str, author: str, cn: str, en: str) -> str:
        title = f"【{inst or '未填写机构'}】"
        if author not in no_author:
            title += author
        inner = " - ".join(part for part in (cn, en) if part)
        if inner:
            title += f"《{inner}》"
        return title

    # Rows are assembled one by one so that a sheet with headers but no data
    # still yields a well-formed (header-only) CSV.
    selected = df[[col_submit_time, col_inst, col_author, col_cn, col_orig, col_link, col_submitter, col_remark]]
    rows = []
    for record in selected.itertuples(index=False, name=None):
        when, inst, author, cn, en, link, submitter, remark = (_s(v) for v in record)
        rows.append((when, "待处理", _build_title(inst, author, cn, en), link, submitter, remark))

    out = pd.DataFrame(rows, columns=["提交时间", "状态", "标题", "链接", "提交者", "备注"])

    in_path = Path(source)
    csv_path = in_path.with_name(in_path.stem + "_队列.csv")
    out.to_csv(csv_path, index=False, encoding="utf-8-sig")
    return str(csv_path)
|
| 2027 |
|
| 2028 |
# ---------- 辅助:转义未被反斜杠保护的 % ----------
|
| 2029 |
_ESCAPE_PERCENT_RE = re.compile(r'(?<!\\)%') # 前面不是反斜杠的 %
|
|
|
|
| 2211 |
latex_txt_path = Path(latex_txt_path).expanduser().resolve()
|
| 2212 |
content = latex_txt_path.read_text(encoding="utf-8")
|
| 2213 |
doc_match = re.search(
|
| 2214 |
+
r"\\begin\{document}(.*?)\\end\{document}",
|
| 2215 |
content,
|
| 2216 |
flags=re.DOTALL | re.IGNORECASE,
|
| 2217 |
)
|
requirements.txt
CHANGED
|
@@ -4,4 +4,5 @@ charset-normalizer
|
|
| 4 |
google-genai
|
| 5 |
google-cloud-aiplatform
|
| 6 |
tqdm
|
| 7 |
-
sentencepiece
|
|
|
|
|
|
| 4 |
google-genai
|
| 5 |
google-cloud-aiplatform
|
| 6 |
tqdm
|
| 7 |
+
sentencepiece
|
| 8 |
+
openpyxl
|