Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app.py
# -*- coding: utf-8 -*-
import html
import io
import os
import re

import gradio as gr
import pandas as pd
from PIL import Image, ImageDraw, ImageFont

from kiwipiepy import Kiwi

# Optional: use the HF Inference API for image generation (recommended for an MVP)
from huggingface_hub import InferenceClient
|
| 14 |
+
|
| 15 |
+
# -----------------------
# Configuration
# -----------------------
VOCAB_PATH = "vocab.csv"  # vocabulary CSV: lemma / meaning_zh / pos / level
FONT_KR = "fonts/NotoSansKR-Regular.otf"  # Korean-capable font for the card
FONT_SC = "fonts/NotoSansSC-Regular.otf"  # Simplified-Chinese font for the card

# You can swap in any text-to-image model you have access to.
# Note: some models require accepting ("Agree") a license on HF first.
IMAGE_MODEL_ID = os.getenv("IMAGE_MODEL_ID", "stabilityai/sdxl-turbo")

HF_TOKEN = os.getenv("HF_TOKEN", "")  # add under Spaces -> Settings -> Secrets

# Shared Kiwi morphological-analyzer instance, created once at import time.
kiwi = Kiwi()
|
| 29 |
+
|
| 30 |
+
def load_vocab(path=None):
    """Load the vocabulary CSV into a DataFrame.

    Parameters
    ----------
    path : str | None
        Optional override of the CSV location; defaults to ``VOCAB_PATH``.

    Returns
    -------
    pandas.DataFrame
        The vocab table with whitespace-stripped ``lemma`` values. When the
        file is missing, an empty frame with the expected columns is returned
        so the rest of the app keeps working.
    """
    if path is None:
        path = VOCAB_PATH
    if not os.path.exists(path):
        return pd.DataFrame(columns=["lemma", "meaning_zh", "pos", "level"])
    df = pd.read_csv(path)
    # Rows with a missing lemma would otherwise become the literal string
    # "nan" after astype(str) and pollute the lookup map — drop them first.
    df = df.dropna(subset=["lemma"])
    df["lemma"] = df["lemma"].astype(str).str.strip()
    return df
|
| 36 |
+
|
| 37 |
+
# Load the vocabulary once at startup and build a lemma -> row lookup.
# Values are pandas Series (one per CSV row); duplicate lemmas keep the
# last row encountered.
VOCAB_DF = load_vocab()
VOCAB_MAP = {row["lemma"]: row for _, row in VOCAB_DF.iterrows()}
|
| 39 |
+
|
| 40 |
+
def safe_font(path, size):
    """Load a TrueType font from *path* at *size*, never raising.

    Falls back to PIL's built-in bitmap font when the file is absent — the
    fallback may not render Korean glyphs, but the app keeps running.
    """
    if not os.path.exists(path):
        return ImageFont.load_default()
    return ImageFont.truetype(path, size=size)
|
| 45 |
+
|
| 46 |
+
def normalize_lemma(form, tag):
    """Crude "good enough" lemmatizer for a single token.

    - Verb/adjective-like tags (anything starting with ``V``: VV, VA, VX,
      VCP, VCN, ...) are padded with the dictionary suffix '다'.
    - Other tokens are returned unchanged apart from ending stripping.

    Kiwi has many POS tags; this handles them only coarsely.

    Parameters
    ----------
    form : str
        Token surface form.
    tag : str
        Kiwi POS tag.

    Returns
    -------
    str
        The normalized (dictionary-ish) form.
    """
    s = form.strip()

    # Strip common sentence-final endings / particle leftovers (very simplified).
    s = re.sub(r"(요|니다|까|나|냐|지|죠|죠\?|야)$", "", s)

    # Verb/adjective tags: pad to the dictionary form ending in '다'.
    # NOTE: the original also tested `tag in ("VA", "VV", "VX", "VCP", "VCN")`,
    # but every one of those starts with "V", so startswith() alone suffices.
    if tag.startswith("V"):
        if not s.endswith("다"):
            s = s + "다"
    return s
|
| 63 |
+
|
| 64 |
+
def analyze_sentence(sentence: str):
    """Tokenize *sentence* with Kiwi and match each token against the vocab.

    Returns
    -------
    (rows, highlighted) : tuple[list[dict], str]
        ``rows`` holds one dict per token (surface/pos/lemma/meaning_zh/
        in_vocab/level); ``highlighted`` is an HTML rendering of the sentence
        with in-vocab tokens in green and misses in orange. An empty/blank
        sentence yields ``([], "")``.
    """
    sentence = sentence.strip()
    if not sentence:
        return [], ""

    # Kiwi analysis: take the top-ranked candidate.
    analyzed = kiwi.analyze(sentence)
    tokens = analyzed[0][0]  # list of Token

    rows = []
    for t in tokens:
        form = t.form
        tag = t.tag  # POS tag
        lemma = normalize_lemma(form, tag)

        # Vocab match: try the lemma first, then fall back to the surface form.
        # BUG FIX: the original used `VOCAB_MAP.get(lemma) or VOCAB_MAP.get(form)`,
        # but the map values are pandas Series, whose truth value is ambiguous —
        # any successful lemma hit raised ValueError. Compare against None instead.
        hit = VOCAB_MAP.get(lemma)
        if hit is None:
            hit = VOCAB_MAP.get(form)

        rows.append({
            "surface": form,
            "pos": tag,
            "lemma": lemma,
            "meaning_zh": (hit["meaning_zh"] if hit is not None else ""),
            "in_vocab": (hit is not None),
            "level": (hit["level"] if hit is not None and "level" in hit else "")
        })

    # Highlighted-sentence HTML: vocab hits green, misses orange.
    html_parts = []
    for r in rows:
        color = "#16a34a" if r["in_vocab"] else "#f59e0b"
        # Escape the token text so it cannot break (or inject into) the markup.
        surface = html.escape(r["surface"])
        html_parts.append(f"<span style='color:{color}; font-weight:700'>{surface}</span>")
    highlighted = " ".join(html_parts)

    return rows, highlighted
|
| 99 |
+
|
| 100 |
+
def ko_to_prompt(sentence: str, token_rows):
    """Turn an analyzed sentence into a paintable text-to-image prompt.

    MVP rule-based approach: collect up to five noun tokens (Kiwi ``NN*``
    tags) and drop them into a fixed "learning-flashcard illustration"
    template. A later upgrade could translate to English first and write a
    richer prompt.
    """
    # Noun-ish surfaces only (NNG/NNP/...), capped at five.
    noun_surfaces = [row["surface"] for row in token_rows if row["pos"].startswith("NN")][:5]

    if noun_surfaces:
        core = ", ".join(noun_surfaces)
    else:
        core = "a person, daily life"

    return (
        f"cute flat illustration, simple background, centered subject, "
        f"learning flashcard style, {core}, no text"
    )
|
| 116 |
+
|
| 117 |
+
def generate_image(prompt: str):
    """Generate an illustration for *prompt* via the HF Inference API.

    When ``HF_TOKEN`` is unset, returns a gray placeholder image instead so
    the UI stays functional without secrets configured.

    Returns
    -------
    PIL.Image.Image
        An RGB image (768x512 placeholder, or the model's output size).
    """
    if not HF_TOKEN:
        # No token: render a placeholder explaining how to enable generation.
        img = Image.new("RGB", (768, 512), "lightgray")
        d = ImageDraw.Draw(img)
        d.text((20, 20), "No HF_TOKEN set.\nSet it in Spaces Secrets to generate images.", fill="black")
        return img

    client = InferenceClient(model=IMAGE_MODEL_ID, token=HF_TOKEN)
    # BUG FIX: InferenceClient.text_to_image returns a PIL.Image.Image, not
    # raw bytes — the original Image.open(io.BytesIO(img_bytes)) call would
    # raise TypeError. Use the returned image directly.
    img = client.text_to_image(prompt)
    return img.convert("RGB")
|
| 132 |
+
|
| 133 |
+
def compose_card(main_img: Image.Image, ko: str, zh: str, vocab_rows):
    """
    Compose a tall (1080x1920) study-card image: illustration on top, the
    Korean sentence, its Chinese translation, then a vocab-breakdown list.
    *vocab_rows* is the row-dict list produced by analyze_sentence().
    """
    W, H = 1080, 1920
    bg = Image.new("RGB", (W, H), (248, 235, 209))  # beige background
    draw = ImageDraw.Draw(bg)

    # White rounded panel framing the whole card.
    panel_margin = 60
    panel = (panel_margin, 70, W - panel_margin, H - 70)
    draw.rounded_rectangle(panel, radius=35, fill="white", outline=(40, 40, 40), width=4)

    # Top image area: shrink the illustration to fit and paste it centered.
    img_box = (panel_margin + 80, 150, W - panel_margin - 80, 650)
    img_w = img_box[2] - img_box[0]
    img_h = img_box[3] - img_box[1]
    main_img = main_img.copy()
    main_img.thumbnail((img_w, img_h))
    paste_x = img_box[0] + (img_w - main_img.size[0]) // 2
    paste_y = img_box[1] + (img_h - main_img.size[1]) // 2
    bg.paste(main_img, (paste_x, paste_y))

    # Fonts (fall back to PIL's default when the font files are absent).
    f_ko = safe_font(FONT_KR, 56)
    f_zh = safe_font(FONT_SC, 42)
    f_title = safe_font(FONT_SC, 48)
    f_table = safe_font(FONT_SC, 36)

    # Korean sentence with a cheap outline effect: draw the text four times
    # in black at +-2px offsets, then once in yellow on top.
    y_ko = 720
    for dx, dy in [(-2,0),(2,0),(0,-2),(0,2)]:
        draw.text((panel_margin+90+dx, y_ko+dy), ko, font=f_ko, fill="black")
    draw.text((panel_margin+90, y_ko), ko, font=f_ko, fill=(250, 204, 21))  # yellow

    # Chinese translation line.
    y_zh = 810
    draw.text((panel_margin+90, y_zh), zh, font=f_zh, fill=(40, 40, 40))

    # "Vocabulary breakdown" section title.
    y_t = 930
    draw.text((panel_margin+90, y_t), "*词汇拆解", font=f_title, fill="black")

    # Token table, one line each: surface -> lemma | meaning.
    y = 1030
    line_h = 60
    # Columns: surface / lemma / meaning
    for r in vocab_rows:
        # Skip punctuation/space tags (Kiwi SP*/SF*).
        if r["pos"].startswith("SP") or r["pos"].startswith("SF"):
            continue
        surface = r["surface"]
        lemma = r["lemma"]
        meaning = r["meaning_zh"] if r["meaning_zh"] else "(未收录)"
        text = f"{surface} → {lemma} | {meaning}"
        draw.text((panel_margin+90, y), text, font=f_table, fill=(30, 30, 30))
        y += line_h
        # Stop before overflowing the bottom of the panel.
        if y > H - 140:
            break

    return bg
|
| 193 |
+
|
| 194 |
+
def run(sentence, zh_translation):
    """Gradio callback: analyze the sentence, build a prompt, generate the
    illustration, and compose the flashcard.

    Returns (card image, token DataFrame, highlighted HTML, image prompt).
    """
    rows, highlighted = analyze_sentence(sentence)
    prompt = ko_to_prompt(sentence, rows)
    img = generate_image(prompt)
    card = compose_card(img, sentence.strip(), zh_translation.strip(), rows)

    columns = ["surface", "pos", "lemma", "meaning_zh", "in_vocab", "level"]
    # BUG FIX: an empty sentence yields rows == [], and pd.DataFrame([]) has
    # no columns, so the original [...] column selection raised KeyError.
    # Passing columns= to the constructor handles both cases.
    df = pd.DataFrame(rows, columns=columns)
    return card, df, highlighted, prompt
|
| 202 |
+
|
| 203 |
+
# --- Gradio UI: two text inputs, one button, four outputs ---
with gr.Blocks() as demo:
    gr.Markdown("## 韩语句子 → 自动拆词 → 自动配图 → 学习卡片(Hugging Face MVP)")

    with gr.Row():
        sentence = gr.Textbox(label="输入韩语句子", placeholder="例如:잠을 자도 피로가 안 풀리나?", lines=2)
    with gr.Row():
        zh = gr.Textbox(label="中文翻译(先手动填;后续可接自动翻译模型)", placeholder="例如:怎么睡了觉还是觉得很累?", lines=2)

    btn = gr.Button("生成卡片")

    with gr.Row():
        out_img = gr.Image(label="生成的学习卡片", type="pil")
    with gr.Row():
        out_df = gr.Dataframe(label="词汇识别结果(可复制)", interactive=False)
    with gr.Row():
        out_html = gr.HTML(label="高亮句子(词库命中=绿,未命中=橙)")
    with gr.Row():
        out_prompt = gr.Textbox(label="用于生成图片的 prompt(你可以自己改)")

    # run() returns (card image, token DataFrame, highlighted HTML, prompt),
    # matching the outputs list below in order.
    btn.click(fn=run, inputs=[sentence, zh], outputs=[out_img, out_df, out_html, out_prompt])

demo.launch()
|