File size: 8,270 Bytes
95585a9
f1e06db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95585a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
# app.py
# Startup diagnostics: print the interpreter version and kiwipiepy install
# info so Spaces build logs show exactly which environment is running.
import sys
import subprocess

print("=== Python version ===")
print(sys.version)

print("=== pip show kiwipiepy ===")
try:
    show_output = subprocess.check_output(
        [sys.executable, "-m", "pip", "show", "kiwipiepy"], text=True
    )
    print(show_output)
except Exception as e:
    print("pip show failed:", e)

print("=== pip list (filter kiwi) ===")
try:
    list_output = subprocess.check_output(
        [sys.executable, "-m", "pip", "list"], text=True
    )
    for pkg_line in list_output.splitlines():
        if "kiwi" in pkg_line.lower():
            print(pkg_line)
except Exception as e:
    print("pip list failed:", e)
# -*- coding: utf-8 -*-
import os
import re
import io
import pandas as pd
import gradio as gr
from PIL import Image, ImageDraw, ImageFont

from kiwipiepy import Kiwi

# Optional: generate illustrations via the HF Inference API (recommended for the MVP)
from huggingface_hub import InferenceClient

# -----------------------
# Configuration
# -----------------------
# Vocabulary CSV (expected columns: lemma, meaning_zh, pos, level).
VOCAB_PATH = "vocab.csv"
# Fonts for Korean and Simplified Chinese text on the card.
FONT_KR = "fonts/NotoSansKR-Regular.otf"
FONT_SC = "fonts/NotoSansSC-Regular.otf"

# Swap in any text-to-image model you have access to.
# Note: some models require accepting a license ("Agree") on HF first.
IMAGE_MODEL_ID = os.getenv("IMAGE_MODEL_ID", "stabilityai/sdxl-turbo")

HF_TOKEN = os.getenv("HF_TOKEN", "")  # set in Spaces -> Settings -> Secrets

# Shared Korean morphological analyzer instance.
kiwi = Kiwi()

def load_vocab(path=None):
    """Load the vocabulary CSV into a DataFrame.

    Parameters:
        path: CSV file path; defaults to the module-level VOCAB_PATH
              (backward-compatible — existing callers pass nothing).

    Returns:
        DataFrame guaranteed to carry the columns
        lemma / meaning_zh / pos / level, with lemma stripped of
        surrounding whitespace. A missing file yields an empty frame and
        missing columns are filled with "" so downstream dict lookups
        (VOCAB_MAP, meaning/level access) never KeyError.
    """
    if path is None:
        path = VOCAB_PATH
    columns = ["lemma", "meaning_zh", "pos", "level"]
    if not os.path.exists(path):
        return pd.DataFrame(columns=columns)
    df = pd.read_csv(path)
    # Guard the schema: a hand-edited CSV may omit optional columns.
    for col in columns:
        if col not in df.columns:
            df[col] = ""
    df["lemma"] = df["lemma"].astype(str).str.strip()
    return df

# Module-level vocabulary singletons: the raw table plus a lemma -> row
# lookup used during sentence analysis.
VOCAB_DF = load_vocab()
VOCAB_MAP = {}
for _, vocab_row in VOCAB_DF.iterrows():
    VOCAB_MAP[vocab_row["lemma"]] = vocab_row

def safe_font(path, size):
    """Return a truetype font at *size*, falling back to PIL's default.

    Never raises: a missing font file — or one that exists but cannot be
    parsed (corrupt download, wrong format) — degrades to the built-in
    bitmap font instead of crashing card rendering. Note the default
    font may not render Korean/Chinese glyphs.
    """
    if os.path.exists(path):
        try:
            return ImageFont.truetype(path, size=size)
        except OSError:
            # File exists but is not a usable font; fall through to default.
            pass
    return ImageFont.load_default()

def normalize_lemma(form, tag):
    """Roughly normalize a surface form to a dictionary lemma.

    A "good enough" lemmatizer for vocab lookup:
    - strips a few common sentence-final endings/particles (very naive);
    - for verb/adjective classes appends the citation ending "다" when absent;
    - everything else is returned as-is (minus stripped endings).

    Kiwi's tagset is much richer than what is handled here; this is a
    heuristic, not real morphology.
    """
    s = form.strip()

    # Drop common final endings / particle residue (highly simplified).
    s = re.sub(r"(요|니다|까|나|냐|지|죠|죠\?|야)$", "", s)

    # Verb/adjective classes: ensure the dictionary form ends in "다".
    # Every relevant tag (VV, VA, VX, VCP, VCN, ...) starts with "V", so
    # the prefix test alone suffices — the original's explicit tuple
    # membership check was redundant.
    if tag.startswith("V"):
        if not s.endswith("다"):
            s = s + "다"
    return s

def analyze_sentence(sentence: str):
    """Tokenize a Korean sentence with Kiwi and match tokens against the vocab.

    Returns:
        (rows, highlighted): rows is a list of per-token dicts with keys
        surface / pos / lemma / meaning_zh / in_vocab / level; highlighted
        is an HTML string coloring vocab hits green and misses orange.
        An empty/whitespace sentence yields ([], "").
    """
    sentence = sentence.strip()
    if not sentence:
        return [], ""

    # Kiwi analysis: take the top-ranked candidate.
    analyzed = kiwi.analyze(sentence)
    tokens = analyzed[0][0]  # list of Token

    rows = []
    for t in tokens:
        form = t.form
        tag = t.tag  # POS tag
        lemma = normalize_lemma(form, tag)

        # Vocab match: try the lemma first, then fall back to the surface
        # form. BUGFIX: the values are pandas Series, so the previous
        # `get(lemma) or get(form)` raised "truth value of a Series is
        # ambiguous" on every successful lemma hit — test for None instead.
        hit = VOCAB_MAP.get(lemma)
        if hit is None:
            hit = VOCAB_MAP.get(form)

        rows.append({
            "surface": form,
            "pos": tag,
            "lemma": lemma,
            "meaning_zh": (hit["meaning_zh"] if hit is not None else ""),
            "in_vocab": (hit is not None),
            "level": (hit["level"] if hit is not None and "level" in hit else "")
        })

    # Build a highlighted-sentence HTML: vocab hits green, misses orange.
    html_parts = []
    for r in rows:
        color = "#16a34a" if r["in_vocab"] else "#f59e0b"
        html_parts.append(f"<span style='color:{color}; font-weight:700'>{r['surface']}</span>")
    highlighted = " ".join(html_parts)

    return rows, highlighted

def ko_to_prompt(sentence: str, token_rows):
    """Build a text-to-image prompt from analyzed tokens (rule-based MVP).

    Takes up to five noun surfaces (Kiwi NN* tags) as the subject and
    falls back to a generic daily-life scene when no nouns are present.
    A later upgrade could translate the sentence to English first and
    write the prompt from that.
    """
    # Collect noun-like tokens (NNG/NNP/...), capped at five.
    noun_surfaces = []
    for row in token_rows:
        if row["pos"].startswith("NN"):
            noun_surfaces.append(row["surface"])
            if len(noun_surfaces) == 5:
                break

    if noun_surfaces:
        core = ", ".join(noun_surfaces)
    else:
        core = "a person, daily life"

    # "Learning flashcard illustration" style prompt.
    return (
        f"cute flat illustration, simple background, centered subject, "
        f"learning flashcard style, {core}, no text"
    )

def generate_image(prompt: str):
    """Generate an illustration for *prompt* via the HF Inference API.

    Falls back to a gray placeholder image when HF_TOKEN is not set, so
    the UI keeps working without credentials.
    """
    if not HF_TOKEN:
        # No token: return a placeholder so the UI still renders a card.
        img = Image.new("RGB", (768, 512), "lightgray")
        d = ImageDraw.Draw(img)
        d.text((20, 20), "No HF_TOKEN set.\nSet it in Spaces Secrets to generate images.", fill="black")
        return img

    client = InferenceClient(model=IMAGE_MODEL_ID, token=HF_TOKEN)
    # BUGFIX: InferenceClient.text_to_image returns a PIL.Image.Image in
    # current huggingface_hub, not raw bytes — the old
    # Image.open(io.BytesIO(...)) call failed on it. Accept both forms.
    result = client.text_to_image(prompt)
    if isinstance(result, Image.Image):
        return result.convert("RGB")
    return Image.open(io.BytesIO(result)).convert("RGB")

def compose_card(main_img: Image.Image, ko: str, zh: str, vocab_rows):
    """
    Compose a tall (1080x1920) study-card image: illustration on top,
    Korean sentence, Chinese translation, then a vocabulary breakdown
    table — approximating the reference card layout.
    """
    W, H = 1080, 1920
    bg = Image.new("RGB", (W, H), (248, 235, 209))  # beige background
    draw = ImageDraw.Draw(bg)

    # White rounded panel framing the whole card
    panel_margin = 60
    panel = (panel_margin, 70, W - panel_margin, H - 70)
    draw.rounded_rectangle(panel, radius=35, fill="white", outline=(40, 40, 40), width=4)

    # Top image area: shrink the illustration to fit, then center it in the box
    img_box = (panel_margin + 80, 150, W - panel_margin - 80, 650)
    img_w = img_box[2] - img_box[0]
    img_h = img_box[3] - img_box[1]
    main_img = main_img.copy()
    main_img.thumbnail((img_w, img_h))
    paste_x = img_box[0] + (img_w - main_img.size[0]) // 2
    paste_y = img_box[1] + (img_h - main_img.size[1]) // 2
    bg.paste(main_img, (paste_x, paste_y))

    # Fonts (safe_font falls back to PIL's default when files are missing)
    f_ko = safe_font(FONT_KR, 56)
    f_zh = safe_font(FONT_SC, 42)
    f_title = safe_font(FONT_SC, 48)
    f_table = safe_font(FONT_SC, 36)

    # Korean sentence with a faux outline: draw black at four offsets,
    # then the yellow fill on top
    y_ko = 720
    for dx, dy in [(-2,0),(2,0),(0,-2),(0,2)]:
        draw.text((panel_margin+90+dx, y_ko+dy), ko, font=f_ko, fill="black")
    draw.text((panel_margin+90, y_ko), ko, font=f_ko, fill=(250, 204, 21))  # yellow

    # Chinese translation
    y_zh = 810
    draw.text((panel_margin+90, y_zh), zh, font=f_zh, fill=(40, 40, 40))

    # Vocabulary-breakdown section title
    y_t = 930
    draw.text((panel_margin+90, y_t), "*词汇拆解", font=f_title, fill="black")

    # Vocabulary table
    y = 1030
    line_h = 60
    # Columns: surface / lemma / meaning
    for r in vocab_rows:
        # Skip punctuation tokens (Kiwi SP*/SF* tags)
        if r["pos"].startswith("SP") or r["pos"].startswith("SF"):
            continue
        surface = r["surface"]
        lemma = r["lemma"]
        meaning = r["meaning_zh"] if r["meaning_zh"] else "(未收录)"
        text = f"{surface}{lemma}  |  {meaning}"
        draw.text((panel_margin+90, y), text, font=f_table, fill=(30, 30, 30))
        y += line_h
        # Stop before the table overflows the bottom of the panel
        if y > H - 140:
            break

    return bg

def run(sentence, zh_translation):
    """Full pipeline: analyze -> build prompt -> generate image -> compose card.

    Returns (card image, token DataFrame, highlighted HTML, prompt text)
    in the order the Gradio outputs expect.
    """
    token_rows, highlighted_html = analyze_sentence(sentence)
    image_prompt = ko_to_prompt(sentence, token_rows)
    illustration = generate_image(image_prompt)
    card_img = compose_card(
        illustration, sentence.strip(), zh_translation.strip(), token_rows
    )

    column_order = ["surface", "pos", "lemma", "meaning_zh", "in_vocab", "level"]
    token_df = pd.DataFrame(token_rows)[column_order]
    return card_img, token_df, highlighted_html, image_prompt

# --- Gradio UI: sentence + manual translation in; card, token table,
# highlighted sentence, and the generated prompt out ---
with gr.Blocks() as demo:
    gr.Markdown("## 韩语句子 → 自动拆词 → 自动配图 → 学习卡片(Hugging Face MVP)")

    with gr.Row():
        sentence = gr.Textbox(label="输入韩语句子", placeholder="例如:잠을 자도 피로가 안 풀리나?", lines=2)
    with gr.Row():
        zh = gr.Textbox(label="中文翻译(先手动填;后续可接自动翻译模型)", placeholder="例如:怎么睡了觉还是觉得很累?", lines=2)

    btn = gr.Button("生成卡片")

    with gr.Row():
        out_img = gr.Image(label="生成的学习卡片", type="pil")
    with gr.Row():
        out_df = gr.Dataframe(label="词汇识别结果(可复制)", interactive=False)
    with gr.Row():
        out_html = gr.HTML(label="高亮句子(词库命中=绿,未命中=橙)")
    with gr.Row():
        out_prompt = gr.Textbox(label="用于生成图片的 prompt(你可以自己改)")

    # Wire the button to the pipeline; output order must match run()'s return.
    btn.click(fn=run, inputs=[sentence, zh], outputs=[out_img, out_df, out_html, out_prompt])

demo.launch()