# app.py
# Startup diagnostics: dump the interpreter version and the installed
# kiwipiepy package details so the Space logs show exactly which
# environment this app is running in.
import sys, subprocess

print("=== Python version ===")
print(sys.version)

print("=== pip show kiwipiepy ===")
try:
    info = subprocess.check_output(
        [sys.executable, "-m", "pip", "show", "kiwipiepy"], text=True)
    print(info)
except Exception as e:
    # Best-effort: diagnostics must never prevent the app from starting.
    print("pip show failed:", e)

print("=== pip list (filter kiwi) ===")
try:
    listing = subprocess.check_output(
        [sys.executable, "-m", "pip", "list"], text=True)
    for row in listing.splitlines():
        if "kiwi" in row.lower():
            print(row)
except Exception as e:
    print("pip list failed:", e)
# -*- coding: utf-8 -*-
import os
import re
import io
import pandas as pd
import gradio as gr
from PIL import Image, ImageDraw, ImageFont
from kiwipiepy import Kiwi
# 可选:用 HF Inference API 生成图(推荐 MVP)
from huggingface_hub import InferenceClient
# -----------------------
# Configuration
# -----------------------
VOCAB_PATH = "vocab.csv"  # vocabulary CSV: columns lemma, meaning_zh, pos, level
FONT_KR = "fonts/NotoSansKR-Regular.otf"  # Korean font used on the card
FONT_SC = "fonts/NotoSansSC-Regular.otf"  # Simplified-Chinese font used on the card
# You can swap in any text-to-image model you have access to.
# NOTE: some models require clicking "Agree" on their HF model page first.
IMAGE_MODEL_ID = os.getenv("IMAGE_MODEL_ID", "stabilityai/sdxl-turbo")
HF_TOKEN = os.getenv("HF_TOKEN", "")  # set in Spaces -> Settings -> Secrets
# Single shared Kiwi morphological analyzer, created once at import time.
kiwi = Kiwi()
def load_vocab(path=None):
    """Load the vocabulary CSV into a DataFrame.

    Args:
        path: CSV file to read; defaults to the module-level ``VOCAB_PATH``.

    Returns:
        DataFrame containing at least the columns
        ``lemma, meaning_zh, pos, level``.  If the file does not exist, an
        empty frame with those columns is returned so the rest of the app
        keeps working without a vocab file.
    """
    if path is None:
        path = VOCAB_PATH
    columns = ["lemma", "meaning_zh", "pos", "level"]
    if not os.path.exists(path):
        return pd.DataFrame(columns=columns)
    df = pd.read_csv(path)
    # Guarantee every expected column exists even if the CSV omits some,
    # so downstream lookups (meaning_zh / level) never KeyError.
    for col in columns:
        if col not in df.columns:
            df[col] = ""
    # Lookup keys must not carry stray whitespace.
    df["lemma"] = df["lemma"].astype(str).str.strip()
    return df
# Vocabulary is loaded once at import time; VOCAB_MAP gives O(1)
# lemma -> row (pandas Series) lookups during sentence analysis.
VOCAB_DF = load_vocab()
VOCAB_MAP = {row["lemma"]: row for _, row in VOCAB_DF.iterrows()}
def safe_font(path, size):
    """Return a truetype font loaded from *path* at the given size.

    Falls back to Pillow's built-in default font when the file is missing,
    so the app never crashes — though the default font may not render
    Korean/Chinese glyphs.
    """
    if not os.path.exists(path):
        return ImageFont.load_default()
    return ImageFont.truetype(path, size=size)
def normalize_lemma(form, tag):
    """Very rough lemmatization — "good enough" for vocab lookups.

    - Verb/adjective-like tags (anything starting with ``V``: VV, VA, VX,
      VCP, VCN, ...) are coerced to a dictionary form ending in '다'.
    - Everything else is returned stripped but otherwise unchanged.

    Kiwi has many POS tags; this only handles them coarsely, and the
    suffix-stripping regex below is a heavy simplification.

    Args:
        form: surface form of the token.
        tag: Kiwi POS tag for the token.

    Returns:
        The (approximate) lemma string.
    """
    s = form.strip()
    # Strip a few common sentence-final endings / particle residue.
    s = re.sub(r"(요|니다|까|나|냐|지|죠|죠\?|야)$", "", s)
    # Verb/adjective classes: force the dictionary form ('-다').
    # All relevant tags (VV, VA, VX, VCP, VCN, ...) start with "V", so a
    # single prefix check suffices — the old explicit tuple test was dead code.
    if tag.startswith("V"):
        if not s.endswith("다"):
            s = s + "다"
    return s
def analyze_sentence(sentence: str):
    """Tokenize a Korean sentence with Kiwi and match tokens to the vocab.

    Args:
        sentence: raw Korean input; may be empty or whitespace-only.

    Returns:
        ``(rows, highlighted)`` where ``rows`` is a list of dicts with keys
        surface/pos/lemma/meaning_zh/in_vocab/level, and ``highlighted`` is
        an HTML string of the sentence with vocab hits in green and misses
        in orange.
    """
    import html  # stdlib; local import keeps the file's import block untouched

    sentence = sentence.strip()
    if not sentence:
        return [], ""
    # Kiwi analysis: take the best-scoring result.
    analyzed = kiwi.analyze(sentence)
    tokens = analyzed[0][0]  # list of Token
    rows = []
    for t in tokens:
        form = t.form
        tag = t.tag  # POS tag
        lemma = normalize_lemma(form, tag)
        # Vocab match: try the lemma first, then fall back to the surface
        # form.  BUGFIX: VOCAB_MAP values are pandas Series, whose truth
        # value is ambiguous — the old ``get(lemma) or get(form)`` raised
        # ValueError on every hit, so test ``is None`` explicitly.
        hit = VOCAB_MAP.get(lemma)
        if hit is None:
            hit = VOCAB_MAP.get(form)
        rows.append({
            "surface": form,
            "pos": tag,
            "lemma": lemma,
            "meaning_zh": (hit["meaning_zh"] if hit is not None else ""),
            "in_vocab": (hit is not None),
            "level": (hit["level"] if hit is not None and "level" in hit else "")
        })
    # Highlighted-sentence HTML: green = in vocab, orange = not in vocab.
    html_parts = []
    for r in rows:
        color = "#16a34a" if r["in_vocab"] else "#f59e0b"
        # User-provided text is injected into HTML — escape it.
        surface = html.escape(r["surface"])
        html_parts.append(f"<span style='color:{color}; font-weight:700'>{surface}</span>")
    highlighted = " ".join(html_parts)
    return rows, highlighted
def ko_to_prompt(sentence: str, token_rows):
    """Build a "drawable" text-to-image prompt from the analyzed tokens.

    MVP rule-based approach; a later version could translate the sentence
    into English first and write a richer prompt.
    """
    # Collect up to five noun surfaces (Kiwi noun tags start with "NN",
    # e.g. NNG/NNP).
    nouns = [row["surface"] for row in token_rows if row["pos"].startswith("NN")][:5]
    core = ", ".join(nouns) if nouns else "a person, daily life"
    # Styled as a "learning flashcard" illustration.
    return (
        f"cute flat illustration, simple background, centered subject, "
        f"learning flashcard style, {core}, no text"
    )
def generate_image(prompt: str):
    """Generate an illustration for *prompt* via the HF Inference API.

    Args:
        prompt: English text-to-image prompt.

    Returns:
        A PIL RGB image.  When HF_TOKEN is unset, a gray placeholder is
        returned so the UI keeps working without image generation.
    """
    if not HF_TOKEN:
        # No token: placeholder image instead of crashing the UI.
        img = Image.new("RGB", (768, 512), "lightgray")
        d = ImageDraw.Draw(img)
        d.text((20, 20), "No HF_TOKEN set.\nSet it in Spaces Secrets to generate images.", fill="black")
        return img
    client = InferenceClient(model=IMAGE_MODEL_ID, token=HF_TOKEN)
    # BUGFIX: InferenceClient.text_to_image returns a PIL.Image, not raw
    # bytes — wrapping it in io.BytesIO would fail.  Accept bytes too for
    # robustness against alternative client behavior.
    result = client.text_to_image(prompt)
    if isinstance(result, (bytes, bytearray)):
        result = Image.open(io.BytesIO(result))
    return result.convert("RGB")
def compose_card(main_img: Image.Image, ko: str, zh: str, vocab_rows):
    """
    Compose one tall (1080x1920) flashcard image, close to the target style.

    Layout, top to bottom, inside a rounded white panel on a beige
    background: generated illustration, Korean sentence (yellow fill with a
    black fringe), Chinese translation, then a vocabulary-breakdown list.

    Args:
        main_img: generated illustration for the top image area.
        ko: Korean sentence line.
        zh: Chinese translation line.
        vocab_rows: dicts from analyze_sentence(); punctuation rows
            (POS starting with SP/SF) are skipped in the breakdown.

    Returns:
        The finished PIL RGB card image.
    """
    W, H = 1080, 1920
    bg = Image.new("RGB", (W, H), (248, 235, 209))  # beige background
    draw = ImageDraw.Draw(bg)
    # White rounded panel framing all content.
    panel_margin = 60
    panel = (panel_margin, 70, W - panel_margin, H - 70)
    draw.rounded_rectangle(panel, radius=35, fill="white", outline=(40, 40, 40), width=4)
    # Top illustration area: fit the image inside, centered, keeping aspect.
    img_box = (panel_margin + 80, 150, W - panel_margin - 80, 650)
    img_w = img_box[2] - img_box[0]
    img_h = img_box[3] - img_box[1]
    main_img = main_img.copy()
    main_img.thumbnail((img_w, img_h))
    paste_x = img_box[0] + (img_w - main_img.size[0]) // 2
    paste_y = img_box[1] + (img_h - main_img.size[1]) // 2
    bg.paste(main_img, (paste_x, paste_y))
    # Fonts (safe_font falls back to Pillow's default if files are missing).
    f_ko = safe_font(FONT_KR, 56)
    f_zh = safe_font(FONT_SC, 42)
    f_title = safe_font(FONT_SC, 48)
    f_table = safe_font(FONT_SC, 36)
    # Korean sentence with a fake outline: draw black copies offset in four
    # directions, then the yellow fill on top.
    y_ko = 720
    for dx, dy in [(-2,0),(2,0),(0,-2),(0,2)]:
        draw.text((panel_margin+90+dx, y_ko+dy), ko, font=f_ko, fill="black")
    draw.text((panel_margin+90, y_ko), ko, font=f_ko, fill=(250, 204, 21))  # yellow
    # Chinese translation line.
    y_zh = 810
    draw.text((panel_margin+90, y_zh), zh, font=f_zh, fill=(40, 40, 40))
    # Vocabulary-breakdown section title.
    y_t = 930
    draw.text((panel_margin+90, y_t), "*词汇拆解", font=f_title, fill="black")
    # Breakdown table: one line per token — surface / lemma / meaning.
    y = 1030
    line_h = 60
    for r in vocab_rows:
        # Skip punctuation tokens (Kiwi SP*/SF* tags).
        if r["pos"].startswith("SP") or r["pos"].startswith("SF"):
            continue
        surface = r["surface"]
        lemma = r["lemma"]
        meaning = r["meaning_zh"] if r["meaning_zh"] else "(未收录)"
        text = f"{surface} → {lemma} | {meaning}"
        draw.text((panel_margin+90, y), text, font=f_table, fill=(30, 30, 30))
        y += line_h
        # Stop before running off the bottom of the panel.
        if y > H - 140:
            break
    return bg
def run(sentence, zh_translation):
    """Gradio callback: analyze the sentence, build a prompt, generate an
    illustration, and compose the final flashcard.

    Returns (card image, token DataFrame, highlighted HTML, image prompt).
    """
    token_rows, highlighted_html = analyze_sentence(sentence)
    prompt = ko_to_prompt(sentence, token_rows)
    illustration = generate_image(prompt)
    card = compose_card(illustration, sentence.strip(), zh_translation.strip(), token_rows)
    columns = ["surface", "pos", "lemma", "meaning_zh", "in_vocab", "level"]
    table = pd.DataFrame(token_rows)[columns]
    return card, table, highlighted_html, prompt
# Gradio UI: one Korean sentence + manual Chinese translation in;
# card image, token table, highlighted sentence, and the image prompt out.
with gr.Blocks() as demo:
    gr.Markdown("## 韩语句子 → 自动拆词 → 自动配图 → 学习卡片(Hugging Face MVP)")
    with gr.Row():
        sentence = gr.Textbox(label="输入韩语句子", placeholder="例如:잠을 자도 피로가 안 풀리나?", lines=2)
    with gr.Row():
        zh = gr.Textbox(label="中文翻译(先手动填;后续可接自动翻译模型)", placeholder="例如:怎么睡了觉还是觉得很累?", lines=2)
    btn = gr.Button("生成卡片")
    with gr.Row():
        out_img = gr.Image(label="生成的学习卡片", type="pil")
    with gr.Row():
        out_df = gr.Dataframe(label="词汇识别结果(可复制)", interactive=False)
    with gr.Row():
        out_html = gr.HTML(label="高亮句子(词库命中=绿,未命中=橙)")
    with gr.Row():
        out_prompt = gr.Textbox(label="用于生成图片的 prompt(你可以自己改)")
    # Wire the button to the pipeline; outputs map 1:1 to run()'s returns.
    btn.click(fn=run, inputs=[sentence, zh], outputs=[out_img, out_df, out_html, out_prompt])

demo.launch()