Spaces:

JohnnyEudora
/

quote

Sleeping

App Files Files Community

JohnnyEudora committed on Aug 14, 2025

Commit

e4670ae

1 Parent(s): ad68d34

update

Browse files

Files changed (2) hide show

app.py +214 -4
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -4,7 +4,12 @@ import subprocess
 import tempfile
 import shutil
 import re
 from datetime import datetime
 # ==============================================================================
@@ -1163,6 +1168,16 @@ def create_gradio_app():
                 LaTeX_gen_button = gr.Button("格式化")
                 LaTeX_gen_button.click(fn=latex2txt_blocks, inputs=[input_LaTeX_text], outputs=[output_LaTeX_txt])
     return demo
@@ -1811,9 +1826,204 @@ def project_handle_form_submission(
     return generated_file
-from pathlib import Path
-import json
-from typing import Union
 # ---------- 辅助：转义未被反斜杠保护的 % ----------
 _ESCAPE_PERCENT_RE = re.compile(r'(?<!\\)%')  # 前面不是反斜杠的 %
@@ -2001,7 +2211,7 @@ def latex2txt_blocks(
     latex_txt_path = Path(latex_txt_path).expanduser().resolve()
     content = latex_txt_path.read_text(encoding="utf-8")
     doc_match = re.search(
-        r"\\begin\{document}(.*?)\\end\{document\}",
         content,
         flags=re.DOTALL | re.IGNORECASE,
     )

 import tempfile
 import shutil
 import re
+import warnings
+import pandas as pd
 from datetime import datetime
+from typing import List, Dict, Optional, Iterable, Union
+from pathlib import Path
+import json
 # ==============================================================================
                 LaTeX_gen_button = gr.Button("格式化")
                 LaTeX_gen_button.click(fn=latex2txt_blocks, inputs=[input_LaTeX_text], outputs=[output_LaTeX_txt])
+        with gr.Tab("想学的很多"):
+            input_excel = gr.File(label="输入excel文件")
+            output_csv = gr.File(label="待Notion的csv文件")
+            free_button = gr.Button("免费队列")
+            money_button = gr.Button("付费队列")
+            free_button.click(fn=build_free, inputs=[input_excel],outputs=[output_csv])
+            money_button.click(fn=build_money, inputs=[input_excel], outputs=[output_csv])
     return demo
     return generated_file
def build_free(
    input_path: str,
    output_path: Optional[str] = None,
    default_status: str = "待处理",
    dedup_by_link: bool = True,
) -> str:
    """Convert a raw course-submission spreadsheet into a queue CSV.

    The output has six columns, in this order:
    提交时间, 状态, 标题, 链接, 提交者, 备注.

    Rules:
      * 提交时间 / 链接 / 提交者 / 备注 are copied from the input, stringified
        and stripped; missing cells become empty strings.
      * 状态 is ``default_status`` for every row.
      * 标题 is ``【org】author《cn - en》`` where
          - the author is omitted when blank or a placeholder such as
            "无", "none", "n/a", "null", "-", "—", "——" (case-insensitive);
          - only the course names that are present are used, joined by
            " - "; when both are blank the ``《》`` part is omitted;
          - a blank organisation is replaced by "未填写机构".
      * When ``dedup_by_link`` is true, rows whose link (compared
        case-insensitively) was already seen are dropped, keeping the first.
        Rows with an empty link are never treated as duplicates of each
        other, so submissions without a link are not lost.

    Accepted header names (first match wins) are listed in ``candidates``
    below.

    Args:
        input_path: Path to the Excel file (first sheet is read). If it
            cannot be read as Excel, CSV is tried with utf-8, utf-8-sig and
            gb18030 in turn.
        output_path: Where to write the CSV. Defaults to
            ``<input stem>_队列.csv`` next to the input file.
        default_status: Value written to the 状态 column.
        dedup_by_link: Whether to drop rows that repeat a non-empty link.

    Returns:
        The path of the written CSV (encoded as utf-8-sig).

    Raises:
        RuntimeError: The input could not be read as Excel or CSV; the
            original Excel error is attached as ``__cause__``.
        KeyError: None of the accepted header names for a field is present.
    """
    # --------- Load the table (Excel first; silence openpyxl's default-style warning) ---------
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            category=UserWarning,
            module=r"openpyxl\.styles\.stylesheet",
        )
        try:
            df = pd.read_excel(input_path)  # first worksheet by default
        except Exception as excel_err:
            # Best-effort fallback in case a .csv was uploaded by mistake.
            df = None
            for enc in ("utf-8", "utf-8-sig", "gb18030"):
                try:
                    df = pd.read_csv(input_path, encoding=enc)
                    break
                except Exception:
                    continue
            if df is None:
                # Keep the Excel failure visible instead of discarding it.
                raise RuntimeError("无法读取为 Excel 或 CSV。") from excel_err

    # --------- Resolve column names (several spellings are accepted) ---------
    candidates: Dict[str, List[str]] = {
        "time": ["提交时间", "提交时间（自动）"],
        "org": ["机构/学校（必填）", "机构/学校", "机构", "学校"],
        "author": ["作者信息（必填）", "作者信息", "作者"],
        "title_en": ["课程名（原语言课程名）", "课程名（原文课程名）", "原语言课程名", "英文课程名", "课程名（英文）"],
        "title_cn": ["课程名（中文课程名）", "中文课程名", "课程名（中文）", "课程名-中文"],
        "link": ["课程链接（必填）", "链接", "URL"],
        "remark": ["备注", "说明"],
        "submitter": ["提交者（自动）", "提交者", "提交人"],
    }

    def resolve_col(name_list: Iterable[str]) -> str:
        """Return the first candidate header that exists in ``df``."""
        for name in name_list:
            if name in df.columns:
                return name
        # List both what was wanted and what exists to make the mismatch obvious.
        raise KeyError(f"找不到列名（任一均可）：{list(name_list)}；现有列：{list(df.columns)}")

    col_time = resolve_col(candidates["time"])
    col_org = resolve_col(candidates["org"])
    col_author = resolve_col(candidates["author"])
    col_title_en = resolve_col(candidates["title_en"])
    col_title_cn = resolve_col(candidates["title_cn"])
    col_link = resolve_col(candidates["link"])
    col_remark = resolve_col(candidates["remark"])
    col_submitter = resolve_col(candidates["submitter"])

    # --------- Clean cells & build titles ---------
    # Compared after casefold(), so "None", "NULL", "N/A" ... are all covered.
    placeholder_authors = frozenset({"", "无", "none", "n/a", "null", "-", "—", "——"})

    def _s(x) -> str:
        """Stringify a cell, mapping NaN/None to an empty string."""
        if pd.isna(x):
            return ""
        return str(x).strip()

    def _author_ok(a) -> bool:
        """True when the author cell holds a real name, not a placeholder."""
        return _s(a).casefold() not in placeholder_authors

    def build_title(org, author, cn, en) -> str:
        """Compose 【org】author《cn - en》 from one row's raw cells."""
        inst = _s(org) or "未填写机构"
        author_text = _s(author)
        inner = " - ".join(p for p in (_s(cn), _s(en)) if p)
        title = f"【{inst}】"
        if _author_ok(author_text):
            title += author_text
        if inner:
            title += f"《{inner}》"
        return title

    # A plain zip is used instead of DataFrame.apply(axis=1): on a sheet with
    # zero rows, apply returns a DataFrame rather than a Series.
    titles = [
        build_title(org, author, cn, en)
        for org, author, cn, en in zip(
            df[col_org], df[col_author], df[col_title_cn], df[col_title_en]
        )
    ]

    out = pd.DataFrame(
        {
            "提交时间": df[col_time].map(_s),
            "状态": default_status,
            "标题": titles,
            "链接": df[col_link].map(_s),
            "提交者": df[col_submitter].map(_s),
            "备注": df[col_remark].map(_s),
        }
    )

    # --------- De-duplicate by link ---------
    if dedup_by_link:
        # Links are already stripped by _s; only case needs normalising.
        link_key = out["链接"].map(str.lower)
        # An empty link identifies nothing, so it must not count as a repeat.
        repeated = link_key.ne("") & link_key.duplicated(keep="first")
        out = out.loc[~repeated]

    # --------- Write the CSV ---------
    in_path = Path(input_path)
    if output_path is None:
        output_path = str(in_path.with_name(in_path.stem + "_队列.csv"))
    out.to_csv(output_path, index=False, encoding="utf-8-sig")
    return output_path
def build_money(input_excel_path: str) -> str:
    """Convert a submission spreadsheet with the fixed form headers into a queue CSV.

    The output has six columns, in this order:
    提交时间, 状态, 标题, 链接, 提交者, 备注. 状态 is always "待处理" and
    标题 is ``【org】author《cn - en》``: the author is omitted when blank or a
    placeholder ("无", "none", "n/a", "null", "-", "—", "——",
    case-insensitive), only the course names that are present are used, and
    a blank organisation becomes "未填写机构". No de-duplication is done.

    Args:
        input_excel_path: Path to the Excel file (first sheet is read). If
            it cannot be read as Excel, CSV is tried with utf-8, utf-8-sig
            and gb18030 in turn, matching ``build_free``.

    Returns:
        Path of the written CSV, ``<input stem>_队列.csv`` next to the input,
        encoded as utf-8-sig.

    Raises:
        ValueError: One or more of the required headers is missing.
        Exception: Whatever ``pandas.read_excel`` raised, when the CSV
            fallback could not read the file either.
    """
    # --- Silence openpyxl's "no default style" warning while loading ---
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            category=UserWarning,
            module=r"openpyxl\.styles\.stylesheet"
        )
        try:
            df = pd.read_excel(input_excel_path)  # first worksheet by default
        except Exception:
            # Best-effort fallback in case a .csv was uploaded by mistake.
            df = None
            for enc in ("utf-8", "utf-8-sig", "gb18030"):
                try:
                    df = pd.read_csv(input_excel_path, encoding=enc)
                    break
                except Exception:
                    continue
            if df is None:
                # Nothing worked: surface the original Excel error unchanged.
                raise

    COL_SUBMIT_TIME = "提交时间（自动）"
    COL_INST = "机构/学校（必填）"
    COL_AUTHOR = "作者信息（必填）"
    COL_ORIG = "课程名（原语言课程名）"     # used as the "English" course name
    COL_CN = "课程名（中文课程名）"
    COL_LINK = "课程链接（必填）"
    COL_REMARK = "备注"
    COL_SUBMITTER = "提交者（自动）"

    required_cols = [COL_SUBMIT_TIME, COL_INST, COL_AUTHOR, COL_ORIG, COL_CN, COL_LINK, COL_REMARK, COL_SUBMITTER]
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise ValueError(f"缺少必要列：{missing}")

    # Compared after casefold(); same placeholder set as build_free so both
    # queues treat "n/a", "—" and "——" as "no author".
    placeholder_authors = frozenset({"", "无", "none", "n/a", "null", "-", "—", "——"})

    def _s(x) -> str:
        """Stringify a cell, mapping NaN/None to an empty string."""
        if pd.isna(x):
            return ""
        return str(x).strip()

    def _author_ok(a) -> bool:
        """True when the author cell holds a real name, not a placeholder."""
        return _s(a).casefold() not in placeholder_authors

    def _build_title(org, author, cn, en) -> str:
        """Compose 【org】author《cn - en》 from one row's raw cells."""
        inst = _s(org) or "未填写机构"
        author_text = _s(author)
        inner = " - ".join(p for p in (_s(cn), _s(en)) if p)
        title = f"【{inst}】"
        if _author_ok(author_text):
            title += author_text
        if inner:
            title += f"《{inner}》"
        return title

    # A plain zip is used instead of DataFrame.apply(axis=1): on a sheet with
    # zero rows, apply returns a DataFrame rather than a Series.
    titles = [
        _build_title(org, author, cn, en)
        for org, author, cn, en in zip(
            df[COL_INST], df[COL_AUTHOR], df[COL_CN], df[COL_ORIG]
        )
    ]

    out = pd.DataFrame({
        "提交时间": df[COL_SUBMIT_TIME].map(_s),
        "状态": "待处理",
        "标题": titles,
        "链接": df[COL_LINK].map(_s),
        "提交者": df[COL_SUBMITTER].map(_s),
        "备注": df[COL_REMARK].map(_s),
    })

    in_path = Path(input_excel_path)
    csv_path = in_path.with_name(in_path.stem + "_队列.csv")
    out.to_csv(csv_path, index=False, encoding="utf-8-sig")
    return str(csv_path)
 # ---------- 辅助：转义未被反斜杠保护的 % ----------
 _ESCAPE_PERCENT_RE = re.compile(r'(?<!\\)%')  # 前面不是反斜杠的 %
     latex_txt_path = Path(latex_txt_path).expanduser().resolve()
     content = latex_txt_path.read_text(encoding="utf-8")
     doc_match = re.search(
+        r"\\begin\{document}(.*?)\\end\{document}",
         content,
         flags=re.DOTALL | re.IGNORECASE,
     )

requirements.txt CHANGED Viewed

@@ -4,4 +4,5 @@ charset-normalizer
 google-genai
 google-cloud-aiplatform
 tqdm
-sentencepiece

 google-genai
 google-cloud-aiplatform
 tqdm
+sentencepiece
+openpyxl