Spaces:
Sleeping
Sleeping
# app.py — startup diagnostics: report the interpreter version and the
# installed kiwipiepy package so Space build issues are visible in the logs.
import sys, subprocess

print("=== Python version ===")
print(sys.version)

print("=== pip show kiwipiepy ===")
try:
    print(subprocess.check_output([sys.executable, "-m", "pip", "show", "kiwipiepy"], text=True))
except Exception as e:
    # Best-effort diagnostics: never let a pip failure stop app startup.
    print("pip show failed:", e)

print("=== pip list (filter kiwi) ===")
try:
    listing = subprocess.check_output([sys.executable, "-m", "pip", "list"], text=True)
except Exception as e:
    print("pip list failed:", e)
else:
    for entry in listing.splitlines():
        if "kiwi" in entry.lower():
            print(entry)
# -*- coding: utf-8 -*-
import os
import re
import io
import pandas as pd
import gradio as gr
from PIL import Image, ImageDraw, ImageFont
from kiwipiepy import Kiwi
# Optional: generate images via the HF Inference API (recommended for the MVP)
from huggingface_hub import InferenceClient

# -----------------------
# Configuration
# -----------------------
VOCAB_PATH = "vocab.csv"
FONT_KR = "fonts/NotoSansKR-Regular.otf"
FONT_SC = "fonts/NotoSansSC-Regular.otf"
# Any text-to-image model you can access works here.
# Note: some models require clicking "Agree" on their HF license first.
IMAGE_MODEL_ID = os.getenv("IMAGE_MODEL_ID", "stabilityai/sdxl-turbo")
HF_TOKEN = os.getenv("HF_TOKEN", "")  # add under Spaces -> Settings -> Secrets

# Shared Kiwi morphological-analyzer instance, created once at import time.
kiwi = Kiwi()
def load_vocab():
    """Load the vocabulary CSV into a DataFrame.

    Returns:
        pd.DataFrame with columns ["lemma", "meaning_zh", "pos", "level"];
        an empty frame with the same columns when the file is missing, so
        downstream code can always rely on the schema.
    """
    columns = ["lemma", "meaning_zh", "pos", "level"]
    if not os.path.exists(VOCAB_PATH):
        return pd.DataFrame(columns=columns)
    df = pd.read_csv(VOCAB_PATH)
    # Normalize lookup keys (VOCAB_MAP is keyed on these values).
    df["lemma"] = df["lemma"].astype(str).str.strip()
    # Empty CSV cells read back as NaN; blank them so "nan" never leaks
    # into the rendered cards/tables.
    if "meaning_zh" in df.columns:
        df["meaning_zh"] = df["meaning_zh"].fillna("")
    return df
# Module-level vocabulary cache, populated once at startup.
VOCAB_DF = load_vocab()
# lemma -> row (pandas Series); duplicate lemmas keep the last occurrence.
VOCAB_MAP = {row["lemma"]: row for _, row in VOCAB_DF.iterrows()}
def safe_font(path, size):
    """Return a truetype font at *path*, falling back to PIL's default.

    The fallback keeps the app from crashing when the font file is missing
    or unreadable (though the default bitmap font may not render Korean or
    Chinese glyphs).
    """
    if os.path.exists(path):
        try:
            return ImageFont.truetype(path, size=size)
        except OSError:
            # File exists but is corrupt/unreadable — degrade, don't crash.
            pass
    return ImageFont.load_default()
def normalize_lemma(form, tag):
    """Crude "good-enough" lemmatization for a Kiwi token.

    - Verb/adjective family tags are normalized toward the dictionary form
      ending in '다'.
    - Everything else is returned (stripped) as-is.
    Kiwi has many tags; this only handles them coarsely.
    """
    s = form.strip()
    # Strip common sentence-final endings/particles left on the surface
    # form (highly simplified, intentionally lossy).
    s = re.sub(r"(요|니다|까|나|냐|지|죠|죠\?|야)$", "", s)
    # All verb/adjective family tags (VV, VA, VX, VCP, VCN, ...) start with
    # "V", so the original extra membership test was redundant.
    if tag.startswith("V") and not s.endswith("다"):
        s += "다"
    return s
def analyze_sentence(sentence: str):
    """Tokenize *sentence* with Kiwi and match each token against the vocab.

    Returns:
        (rows, highlighted) where rows is a list of dicts
        (surface/pos/lemma/meaning_zh/in_vocab/level) and highlighted is an
        HTML string coloring vocab hits green and misses orange.
        Empty/whitespace input yields ([], "").
    """
    sentence = sentence.strip()
    if not sentence:
        return [], ""
    # Kiwi analysis: take the top-ranked candidate.
    analyzed = kiwi.analyze(sentence)
    tokens = analyzed[0][0]  # list of Token
    rows = []
    for t in tokens:
        form = t.form
        tag = t.tag  # POS tag
        lemma = normalize_lemma(form, tag)
        # Vocab match: prefer the lemma, fall back to the surface form.
        # BUGFIX: the previous `get(lemma) or get(form)` truth-tested a
        # multi-element pandas Series, which raises
        # "ValueError: The truth value of a Series is ambiguous"
        # on every successful lemma lookup.
        hit = VOCAB_MAP.get(lemma)
        if hit is None:
            hit = VOCAB_MAP.get(form)
        rows.append({
            "surface": form,
            "pos": tag,
            "lemma": lemma,
            "meaning_zh": (hit["meaning_zh"] if hit is not None else ""),
            "in_vocab": (hit is not None),
            "level": (hit["level"] if hit is not None and "level" in hit else ""),
        })
    # Highlighted-sentence HTML: vocab hit = green, miss = orange.
    html_parts = []
    for r in rows:
        color = "#16a34a" if r["in_vocab"] else "#f59e0b"
        html_parts.append(f"<span style='color:{color}; font-weight:700'>{r['surface']}</span>")
    highlighted = " ".join(html_parts)
    return rows, highlighted
def ko_to_prompt(sentence: str, token_rows):
    """Turn an analyzed sentence into a "drawable" prompt (rule-based MVP).

    Future upgrade path: translate to English first, then craft the prompt.
    """
    # Pick up to five noun surfaces (Kiwi noun tags all start with "NN").
    nouns = [row["surface"] for row in token_rows if row["pos"].startswith("NN")][:5]
    # A "learning-flashcard illustration" flavored prompt.
    core = ", ".join(nouns) if nouns else "a person, daily life"
    return (
        f"cute flat illustration, simple background, centered subject, "
        f"learning flashcard style, {core}, no text"
    )
def generate_image(prompt: str):
    """Generate an illustration for *prompt* via the HF Inference API.

    Returns a gray placeholder image when HF_TOKEN is unset so the UI keeps
    working without credentials.
    """
    if not HF_TOKEN:
        # No token: show a placeholder instead of failing.
        img = Image.new("RGB", (768, 512), "lightgray")
        d = ImageDraw.Draw(img)
        d.text((20, 20), "No HF_TOKEN set.\nSet it in Spaces Secrets to generate images.", fill="black")
        return img
    client = InferenceClient(model=IMAGE_MODEL_ID, token=HF_TOKEN)
    # BUGFIX: InferenceClient.text_to_image returns a PIL.Image, not raw
    # bytes — the previous Image.open(io.BytesIO(result)) call would fail.
    # Keep a bytes branch for older huggingface_hub versions.
    result = client.text_to_image(prompt)
    if isinstance(result, (bytes, bytearray)):
        result = Image.open(io.BytesIO(result))
    return result.convert("RGB")
def compose_card(main_img: Image.Image, ko: str, zh: str, vocab_rows):
    """
    Compose a tall (1080x1920) flashcard: illustration on top, Korean
    sentence, Chinese translation, then a vocabulary-breakdown table.
    """
    W, H = 1080, 1920
    bg = Image.new("RGB", (W, H), (248, 235, 209))  # cream background
    draw = ImageDraw.Draw(bg)
    # White rounded panel framing the whole card
    panel_margin = 60
    panel = (panel_margin, 70, W - panel_margin, H - 70)
    draw.rounded_rectangle(panel, radius=35, fill="white", outline=(40, 40, 40), width=4)
    # Top image area: thumbnail preserves aspect ratio, then center-paste
    img_box = (panel_margin + 80, 150, W - panel_margin - 80, 650)
    img_w = img_box[2] - img_box[0]
    img_h = img_box[3] - img_box[1]
    main_img = main_img.copy()  # copy: thumbnail() mutates in place
    main_img.thumbnail((img_w, img_h))
    paste_x = img_box[0] + (img_w - main_img.size[0]) // 2
    paste_y = img_box[1] + (img_h - main_img.size[1]) // 2
    bg.paste(main_img, (paste_x, paste_y))
    # Fonts (safe_font falls back to PIL's default when files are missing)
    f_ko = safe_font(FONT_KR, 56)
    f_zh = safe_font(FONT_SC, 42)
    f_title = safe_font(FONT_SC, 48)
    f_table = safe_font(FONT_SC, 36)
    # Korean sentence with a faux outline: black at four offsets, yellow on top
    y_ko = 720
    for dx, dy in [(-2,0),(2,0),(0,-2),(0,2)]:
        draw.text((panel_margin+90+dx, y_ko+dy), ko, font=f_ko, fill="black")
    draw.text((panel_margin+90, y_ko), ko, font=f_ko, fill=(250, 204, 21))  # yellow
    # Chinese translation
    y_zh = 810
    draw.text((panel_margin+90, y_zh), zh, font=f_zh, fill=(40, 40, 40))
    # Section title for the vocabulary breakdown
    y_t = 930
    draw.text((panel_margin+90, y_t), "*词汇拆解", font=f_title, fill="black")
    # Table rows: surface / lemma / meaning
    y = 1030
    line_h = 60
    for r in vocab_rows:
        # Skip space/punctuation tokens (Kiwi SP/SF tags — presumably
        # spacing and sentence-final punctuation; verify against Kiwi docs)
        if r["pos"].startswith("SP") or r["pos"].startswith("SF"):
            continue
        surface = r["surface"]
        lemma = r["lemma"]
        meaning = r["meaning_zh"] if r["meaning_zh"] else "(未收录)"
        text = f"{surface} → {lemma} | {meaning}"
        draw.text((panel_margin+90, y), text, font=f_table, fill=(30, 30, 30))
        y += line_h
        if y > H - 140:  # stop before the table overflows the panel
            break
    return bg
def run(sentence, zh_translation):
    """Gradio click handler: analyze → build prompt → generate image → card.

    Returns (card_image, token_dataframe, highlighted_html, prompt).
    """
    # Gradio can hand back None for cleared textboxes; normalize to "".
    sentence = sentence or ""
    zh_translation = zh_translation or ""
    rows, highlighted = analyze_sentence(sentence)
    prompt = ko_to_prompt(sentence, rows)
    img = generate_image(prompt)
    card = compose_card(img, sentence.strip(), zh_translation.strip(), rows)
    cols = ["surface", "pos", "lemma", "meaning_zh", "in_vocab", "level"]
    # BUGFIX: with empty input rows == [], and pd.DataFrame([])[cols]
    # raises KeyError because the empty frame has no columns at all.
    # Passing columns= both selects/orders and covers the empty case.
    df = pd.DataFrame(rows, columns=cols)
    return card, df, highlighted, prompt
# Gradio UI: two text inputs -> "generate" button -> card image, token
# table, highlighted sentence, and the editable image prompt.
with gr.Blocks() as demo:
    gr.Markdown("## 韩语句子 → 自动拆词 → 自动配图 → 学习卡片(Hugging Face MVP)")
    with gr.Row():
        sentence = gr.Textbox(label="输入韩语句子", placeholder="例如:잠을 자도 피로가 안 풀리나?", lines=2)
    with gr.Row():
        zh = gr.Textbox(label="中文翻译(先手动填;后续可接自动翻译模型)", placeholder="例如:怎么睡了觉还是觉得很累?", lines=2)
    btn = gr.Button("生成卡片")
    with gr.Row():
        out_img = gr.Image(label="生成的学习卡片", type="pil")
    with gr.Row():
        out_df = gr.Dataframe(label="词汇识别结果(可复制)", interactive=False)
    with gr.Row():
        out_html = gr.HTML(label="高亮句子(词库命中=绿,未命中=橙)")
    with gr.Row():
        out_prompt = gr.Textbox(label="用于生成图片的 prompt(你可以自己改)")
    # Wire the button to run(); outputs map 1:1 to run()'s return tuple.
    btn.click(fn=run, inputs=[sentence, zh], outputs=[out_img, out_df, out_html, out_prompt])

demo.launch()