JohnnyEudora committed on
Commit
e4670ae
·
1 Parent(s): ad68d34
Files changed (2) hide show
  1. app.py +214 -4
  2. requirements.txt +2 -1
app.py CHANGED
@@ -4,7 +4,12 @@ import subprocess
4
  import tempfile
5
  import shutil
6
  import re
 
 
7
  from datetime import datetime
 
 
 
8
 
9
 
10
  # ==============================================================================
@@ -1163,6 +1168,16 @@ def create_gradio_app():
1163
  LaTeX_gen_button = gr.Button("格式化")
1164
 
1165
  LaTeX_gen_button.click(fn=latex2txt_blocks, inputs=[input_LaTeX_text], outputs=[output_LaTeX_txt])
 
 
 
 
 
 
 
 
 
 
1166
 
1167
  return demo
1168
 
@@ -1811,9 +1826,204 @@ def project_handle_form_submission(
1811
  return generated_file
1812
 
1813
 
1814
- from pathlib import Path
1815
- import json
1816
- from typing import Union
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1817
 
1818
  # ---------- 辅助:转义未被反斜杠保护的 % ----------
1819
  _ESCAPE_PERCENT_RE = re.compile(r'(?<!\\)%') # 前面不是反斜杠的 %
@@ -2001,7 +2211,7 @@ def latex2txt_blocks(
2001
  latex_txt_path = Path(latex_txt_path).expanduser().resolve()
2002
  content = latex_txt_path.read_text(encoding="utf-8")
2003
  doc_match = re.search(
2004
- r"\\begin\{document}(.*?)\\end\{document\}",
2005
  content,
2006
  flags=re.DOTALL | re.IGNORECASE,
2007
  )
 
4
  import tempfile
5
  import shutil
6
  import re
7
+ import warnings
8
+ import pandas as pd
9
  from datetime import datetime
10
+ from typing import List, Dict, Optional, Iterable, Union
11
+ from pathlib import Path
12
+ import json
13
 
14
 
15
  # ==============================================================================
 
1168
  LaTeX_gen_button = gr.Button("格式化")
1169
 
1170
  LaTeX_gen_button.click(fn=latex2txt_blocks, inputs=[input_LaTeX_text], outputs=[output_LaTeX_txt])
1171
+ with gr.Tab("想学的很多"):
1172
+ input_excel = gr.File(label="输入excel文件")
1173
+ output_csv = gr.File(label="待Notion的csv文件")
1174
+
1175
+ free_button = gr.Button("免费队列")
1176
+ money_button = gr.Button("付费队列")
1177
+
1178
+ free_button.click(fn=build_free, inputs=[input_excel],outputs=[output_csv])
1179
+ money_button.click(fn=build_money, inputs=[input_excel], outputs=[output_csv])
1180
+
1181
 
1182
  return demo
1183
 
 
1826
  return generated_file
1827
 
1828
 
1829
def build_free(
    input_path: str,
    output_path: Optional[str] = None,
    default_status: str = "待处理",
    dedup_by_link: bool = True,
) -> str:
    """Convert a raw course-submission spreadsheet into a queue CSV.

    The output has six columns, in this order:
    提交时间, 状态, 标题, 链接, 提交者, 备注.

    Rules:
      * 提交时间 / 链接 / 提交者 / 备注 are copied from the input; each cell is
        converted with ``str()`` and stripped, and missing cells become "".
      * 状态 is ``default_status`` on every row.
      * 标题 is ``【institution】author《Chinese name - original name》``:
          - the author is omitted when blank or a placeholder such as "无";
          - when only one of the two course names exists it is used alone,
            without the " - " separator;
          - when both are blank the ``《》`` part is omitted;
          - a blank institution is replaced by "未填写机构".
      * When ``dedup_by_link`` is true, a row whose link (compared
        case-insensitively) was already seen is dropped and the first
        occurrence is kept. Rows with an empty link are never treated as
        duplicates of each other.

    Accepted header names (the first one present in the sheet wins):
        time:       "提交时间", "提交时间(自动)"
        institution:"机构/学校(必填)", "机构/学校", "机构", "学校"
        author:     "作者信息(必填)", "作者信息", "作者"
        original:   "课程名(原语言课程名)", "课程名(原文课程名)",
                    "原语言课程名", "英文课程名", "课程名(英文)"
        Chinese:    "课程名(中文课程名)", "中文课程名", "课程名(中文)",
                    "课程名-中文"
        link:       "课程链接(必填)", "链接", "URL"
        remark:     "备注", "说明"
        submitter:  "提交者(自动)", "提交者", "提交人"

    Args:
        input_path: Path of the Excel file (first sheet is read). A CSV file
            is accepted as a fallback. An object exposing ``.name`` (as some
            upload widgets pass) is also accepted.
        output_path: Destination CSV path. Defaults to ``<stem>_队列.csv``
            next to the input file.
        default_status: Value written into the 状态 column.
        dedup_by_link: Whether to drop rows with a repeated link.

    Returns:
        The path of the written CSV (encoded as utf-8-sig).

    Raises:
        RuntimeError: The input can be read neither as Excel nor as CSV.
        KeyError: None of the accepted header names of a field is present.
    """
    # NOTE(review): depending on the Gradio version, gr.File may hand over a
    # temp-file object instead of a path string -- use its ``.name`` then.
    if not isinstance(input_path, (str, Path)) and hasattr(input_path, "name"):
        input_path = input_path.name

    # --------- Load the table (Excel first, CSV as a fallback) ---------
    df = None
    with warnings.catch_warnings():
        # Silence openpyxl's "no default style" warning.
        warnings.filterwarnings(
            "ignore",
            category=UserWarning,
            module=r"openpyxl\.styles\.stylesheet",
        )
        try:
            df = pd.read_excel(input_path)  # first worksheet by default
        except Exception as excel_error:
            # Deliberately broad: read_excel fails with many unrelated
            # exception types (bad zip, missing engine, unknown format...).
            for enc in ("utf-8", "utf-8-sig", "gb18030"):
                try:
                    df = pd.read_csv(input_path, encoding=enc)
                    break
                except Exception:
                    continue
            if df is None:
                # Chain the Excel failure so the root cause stays visible.
                raise RuntimeError("无法读取为 Excel 或 CSV。") from excel_error

    # --------- Resolve header names (several spellings are accepted) ---------
    candidates: Dict[str, List[str]] = {
        "time": ["提交时间", "提交时间(自动)"],
        "org": ["机构/学校(必填)", "机构/学校", "机构", "学校"],
        "author": ["作者信息(必填)", "作者信息", "作者"],
        "title_en": ["课程名(原语言课程名)", "课程名(原文课程名)", "原语言课程名", "英文课程名", "课程名(英文)"],
        "title_cn": ["课程名(中文课程名)", "中文课程名", "课程名(中文)", "课程名-中文"],
        "link": ["课程链接(必填)", "链接", "URL"],
        "remark": ["备注", "说明"],
        "submitter": ["提交者(自动)", "提交者", "提交人"],
    }

    def resolve_col(name_list: Iterable[str]) -> str:
        """Return the first candidate header that exists in ``df``."""
        for name in name_list:
            if name in df.columns:
                return name
        # List both sides so a header mismatch is easy to diagnose.
        raise KeyError(f"找不到列名(任一均可):{list(name_list)};现有列:{list(df.columns)}")

    col_time = resolve_col(candidates["time"])
    col_org = resolve_col(candidates["org"])
    col_author = resolve_col(candidates["author"])
    col_title_en = resolve_col(candidates["title_en"])
    col_title_cn = resolve_col(candidates["title_cn"])
    col_link = resolve_col(candidates["link"])
    col_remark = resolve_col(candidates["remark"])
    col_submitter = resolve_col(candidates["submitter"])

    # --------- Cell cleaning and title construction ---------
    blank_authors = {"", "无", "None", "none", "N/A", "n/a", "null", "-", "—", "——"}

    def _s(x) -> str:
        """Return the cell as stripped text; missing values become ""."""
        if pd.isna(x):
            return ""
        return str(x).strip()

    def build_title(org, author, cn, en) -> str:
        """Assemble 【institution】author《cn - en》 from already-cleaned text."""
        title = f"【{org or '未填写机构'}】"
        if author not in blank_authors:
            title += author
        inner = " - ".join(p for p in (cn, en) if p)
        if inner:
            title += f"《{inner}》"
        return title

    # --------- Build the output rows (de-duplicating by link) ---------
    # Rows are assembled as plain tuples instead of DataFrame.apply(axis=1),
    # so a sheet that contains only a header row still yields a valid CSV.
    out_columns = ["提交时间", "状态", "标题", "链接", "提交者", "备注"]
    rows = []
    seen_links = set()
    for time_v, org_v, author_v, cn_v, en_v, link_v, submitter_v, remark_v in zip(
        df[col_time],
        df[col_org],
        df[col_author],
        df[col_title_cn],
        df[col_title_en],
        df[col_link],
        df[col_submitter],
        df[col_remark],
    ):
        link = _s(link_v)
        if dedup_by_link:
            key = link.lower()
            # An empty link carries no identity: never collapse such rows.
            if key:
                if key in seen_links:
                    continue
                seen_links.add(key)
        rows.append(
            (
                _s(time_v),
                default_status,
                build_title(_s(org_v), _s(author_v), _s(cn_v), _s(en_v)),
                link,
                _s(submitter_v),
                _s(remark_v),
            )
        )
    out = pd.DataFrame(rows, columns=out_columns)

    # --------- Write the CSV ---------
    if output_path is None:
        in_path = Path(input_path)
        output_path = str(in_path.with_name(in_path.stem + "_队列.csv"))

    # utf-8-sig so spreadsheet tools detect the encoding.
    out.to_csv(output_path, index=False, encoding="utf-8-sig")
    return output_path
1960
+
1961
+
1962
def build_money(input_excel_path: str) -> str:
    """Convert a course-submission spreadsheet into a queue CSV.

    The output has six columns, in this order:
    提交时间, 状态, 标题, 链接, 提交者, 备注.

    Unlike a header-tolerant reader, this function requires the exact form
    headers listed in ``required_cols`` below. 状态 is always "待处理" and
    rows are not de-duplicated. 标题 is
    ``【institution】author《Chinese name - original name》`` where a blank or
    placeholder author is omitted, a single available course name is used
    without the " - " separator, the ``《》`` part is omitted when both names
    are blank, and a blank institution becomes "未填写机构".

    Args:
        input_excel_path: Path of the Excel file (first sheet is read). If the
            file cannot be read as Excel, it is retried as CSV. An object
            exposing ``.name`` (as some upload widgets pass) is also accepted.

    Returns:
        Path of the written CSV, ``<stem>_队列.csv`` next to the input file,
        encoded as utf-8-sig.

    Raises:
        ValueError: One or more required columns are missing.
        Exception: Whatever ``pandas.read_excel`` raised, when the input can
            be read neither as Excel nor as CSV.
    """
    # NOTE(review): depending on the Gradio version, gr.File may hand over a
    # temp-file object instead of a path string -- use its ``.name`` then.
    if not isinstance(input_excel_path, (str, Path)) and hasattr(input_excel_path, "name"):
        input_excel_path = input_excel_path.name

    with warnings.catch_warnings():
        # Silence openpyxl's "no default style" warning.
        warnings.filterwarnings(
            "ignore",
            category=UserWarning,
            module=r"openpyxl\.styles\.stylesheet",
        )
        try:
            df = pd.read_excel(input_excel_path)  # first worksheet by default
        except Exception:
            # Fallback for a CSV upload. If that fails too, re-raise the
            # original Excel error so the failure type is unchanged.
            df = None
            for enc in ("utf-8", "utf-8-sig", "gb18030"):
                try:
                    df = pd.read_csv(input_excel_path, encoding=enc)
                    break
                except Exception:
                    continue
            if df is None:
                raise

    COL_SUBMIT_TIME = "提交时间(自动)"
    COL_INST = "机构/学校(必填)"
    COL_AUTHOR = "作者信息(必填)"
    COL_ORIG = "课程名(原语言课程名)"  # used as the "original/English" name
    COL_CN = "课程名(中文课程名)"
    COL_LINK = "课程链接(必填)"
    COL_REMARK = "备注"
    COL_SUBMITTER = "提交者(自动)"

    required_cols = [COL_SUBMIT_TIME, COL_INST, COL_AUTHOR, COL_ORIG, COL_CN, COL_LINK, COL_REMARK, COL_SUBMITTER]
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise ValueError(f"缺少必要列:{missing}")

    # Placeholders meaning "no author"; includes "n/a" and the dash variants
    # so that titles are built the same way for both queue buttons.
    blank_authors = {"", "无", "None", "none", "N/A", "n/a", "null", "-", "—", "——"}

    def _s(x) -> str:
        """Return the cell as stripped text; missing values become ""."""
        if pd.isna(x):
            return ""
        return str(x).strip()

    def _build_title(inst, author, cn, en) -> str:
        """Assemble 【institution】author《cn - en》 from already-cleaned text."""
        title = f"【{inst or '未填写机构'}】"
        if author not in blank_authors:
            title += author
        inner = " - ".join(p for p in (cn, en) if p)
        if inner:
            title += f"《{inner}》"
        return title

    # Rows are assembled as plain tuples instead of DataFrame.apply(axis=1),
    # so a sheet that contains only a header row still yields a valid CSV.
    rows = [
        (
            _s(time_v),
            "待处理",
            _build_title(_s(inst_v), _s(author_v), _s(cn_v), _s(en_v)),
            _s(link_v),
            _s(submitter_v),
            _s(remark_v),
        )
        for time_v, inst_v, author_v, cn_v, en_v, link_v, submitter_v, remark_v in zip(
            df[COL_SUBMIT_TIME],
            df[COL_INST],
            df[COL_AUTHOR],
            df[COL_CN],
            df[COL_ORIG],
            df[COL_LINK],
            df[COL_SUBMITTER],
            df[COL_REMARK],
        )
    ]
    out = pd.DataFrame(rows, columns=["提交时间", "状态", "标题", "链接", "提交者", "备注"])

    in_path = Path(input_excel_path)
    csv_path = in_path.with_name(in_path.stem + "_队列.csv")
    # utf-8-sig so spreadsheet tools detect the encoding.
    out.to_csv(csv_path, index=False, encoding="utf-8-sig")
    return str(csv_path)
2027
 
2028
  # ---------- 辅助:转义未被反斜杠保护的 % ----------
2029
  _ESCAPE_PERCENT_RE = re.compile(r'(?<!\\)%') # 前面不是反斜杠的 %
 
2211
  latex_txt_path = Path(latex_txt_path).expanduser().resolve()
2212
  content = latex_txt_path.read_text(encoding="utf-8")
2213
  doc_match = re.search(
2214
+ r"\\begin\{document}(.*?)\\end\{document}",
2215
  content,
2216
  flags=re.DOTALL | re.IGNORECASE,
2217
  )
requirements.txt CHANGED
@@ -4,4 +4,5 @@ charset-normalizer
4
  google-genai
5
  google-cloud-aiplatform
6
  tqdm
7
- sentencepiece
 
 
4
  google-genai
5
  google-cloud-aiplatform
6
  tqdm
7
+ sentencepiece
8
+ openpyxl