WENior committed on
Commit
95585a9
·
verified ·
1 Parent(s): 27fef6e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +224 -0
app.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# app.py
# -*- coding: utf-8 -*-
import os
import re
import io
import pandas as pd
import gradio as gr
from PIL import Image, ImageDraw, ImageFont

from kiwipiepy import Kiwi

# Optional: use the HF Inference API to generate images (recommended for an MVP)
from huggingface_hub import InferenceClient

# -----------------------
# Configuration
# -----------------------
VOCAB_PATH = "vocab.csv"  # vocabulary CSV; expected columns: lemma, meaning_zh, pos, level
FONT_KR = "fonts/NotoSansKR-Regular.otf"  # Korean font used when rendering the card
FONT_SC = "fonts/NotoSansSC-Regular.otf"  # Simplified-Chinese font used when rendering the card

# You can swap in any text-to-image model you have access to.
# Note: some models require accepting their license ("Agree") on HF first.
IMAGE_MODEL_ID = os.getenv("IMAGE_MODEL_ID", "stabilityai/sdxl-turbo")

HF_TOKEN = os.getenv("HF_TOKEN", "")  # add it under Spaces -> Settings -> Secrets

# Korean morphological analyzer; constructed once at import time (module-level singleton).
kiwi = Kiwi()
30
def load_vocab():
    """Load the vocabulary CSV into a DataFrame.

    Returns an empty frame with the expected columns when the file is absent,
    so downstream code can rely on the schema either way.
    """
    if os.path.exists(VOCAB_PATH):
        frame = pd.read_csv(VOCAB_PATH)
        # Normalize lemmas: force string type and trim surrounding whitespace.
        frame["lemma"] = frame["lemma"].astype(str).str.strip()
        return frame
    return pd.DataFrame(columns=["lemma", "meaning_zh", "pos", "level"])
36
+
37
VOCAB_DF = load_vocab()
# lemma -> row (pandas Series) lookup table for constant-time matching during analysis
VOCAB_MAP = {row["lemma"]: row for _, row in VOCAB_DF.iterrows()}
39
+
40
def safe_font(path, size):
    """Return a truetype font at *size*, falling back to Pillow's built-in font.

    The fallback keeps the app from crashing when the font files are not
    deployed, but the built-in font may not render Korean/Chinese glyphs.

    Args:
        path: filesystem path to a .ttf/.otf font file.
        size: point size for the font.
    """
    if os.path.exists(path):
        return ImageFont.truetype(path, size=size)
    try:
        # Pillow >= 10.1 can scale the built-in font to the requested size;
        # the original fallback ignored `size`, producing tiny text.
        return ImageFont.load_default(size=size)
    except TypeError:
        # Older Pillow: load_default() takes no size argument.
        return ImageFont.load_default()
45
+
46
def normalize_lemma(form, tag):
    """Best-effort ("good enough") lemmatization.

    - verb/adjective tags (V*) are normalized to end in '다' (dictionary form)
    - all other words are returned as-is
    Kiwi has many tags; this only handles them coarsely.

    Args:
        form: surface form of the token.
        tag: Kiwi POS tag for the token.

    Returns:
        The approximate dictionary form of the token.
    """
    s = form.strip()

    # Strip common sentence-final endings / particle residue (very simplified).
    s = re.sub(r"(요|니다|까|나|냐|지|죠|죠\?|야)$", "", s)

    # Verb/adjective-like tags (VV, VA, VX, VCP, VCN, ...) all begin with "V",
    # so a single prefix check suffices (the original also listed them in a
    # redundant tuple); pad to the dictionary '다' form.
    if tag.startswith("V"):
        if not s.endswith("다"):
            s = s + "다"
    return s
63
+
64
def analyze_sentence(sentence: str):
    """Tokenize *sentence* with Kiwi and match each token against the vocab.

    Returns:
        (rows, highlighted): `rows` is a list of per-token dicts with keys
        surface/pos/lemma/meaning_zh/in_vocab/level; `highlighted` is an HTML
        rendering of the sentence with vocab hits in green, misses in orange.
        An empty/blank sentence yields ([], "").
    """
    sentence = sentence.strip()
    if not sentence:
        return [], ""

    # Kiwi analysis: take the top-ranked analysis result.
    analyzed = kiwi.analyze(sentence)
    tokens = analyzed[0][0]  # list of Token

    rows = []
    for t in tokens:
        form = t.form
        tag = t.tag  # POS tag
        lemma = normalize_lemma(form, tag)

        # Vocab match: try the lemma first, then fall back to the surface form.
        # BUGFIX: VOCAB_MAP values are pandas Series, so the original
        # `get(lemma) or get(form)` called bool() on a multi-element Series and
        # raised "truth value of a Series is ambiguous" on every hit.
        hit = VOCAB_MAP.get(lemma)
        if hit is None:
            hit = VOCAB_MAP.get(form)

        rows.append({
            "surface": form,
            "pos": tag,
            "lemma": lemma,
            "meaning_zh": (hit["meaning_zh"] if hit is not None else ""),
            "in_vocab": (hit is not None),
            "level": (hit["level"] if hit is not None and "level" in hit else "")
        })

    # Build a highlighted-sentence HTML string: hits green, misses orange.
    html_parts = []
    for r in rows:
        color = "#16a34a" if r["in_vocab"] else "#f59e0b"
        html_parts.append(f"<span style='color:{color}; font-weight:700'>{r['surface']}</span>")
    highlighted = " ".join(html_parts)

    return rows, highlighted
99
+
100
def ko_to_prompt(sentence: str, token_rows):
    """Turn the sentence into a more "drawable" prompt (MVP: rule-based).

    Upgrade path for later: translate to English first, then craft the prompt.

    Args:
        sentence: the original Korean sentence (currently unused by the rules).
        token_rows: per-token dicts from analyze_sentence (needs "surface"/"pos").

    Returns:
        A text-to-image prompt string in a flashcard-illustration style.
    """
    # Extract noun-like tokens (Kiwi tags NNG/NNP/...), capped at five.
    nouns = [row["surface"] for row in token_rows if row["pos"].startswith("NN")][:5]

    # A prompt biased toward "learning-card illustration" style.
    core = ", ".join(nouns) if nouns else "a person, daily life"
    return (
        f"cute flat illustration, simple background, centered subject, "
        f"learning flashcard style, {core}, no text"
    )
116
+
117
def generate_image(prompt: str):
    """Generate an illustration for *prompt* via the HF Inference API.

    Without HF_TOKEN, a gray placeholder image is returned so the UI remains
    usable end-to-end.

    Returns:
        A PIL RGB Image.
    """
    if not HF_TOKEN:
        # No token: return a placeholder so the rest of the pipeline still works.
        img = Image.new("RGB", (768, 512), "lightgray")
        d = ImageDraw.Draw(img)
        d.text((20, 20), "No HF_TOKEN set.\nSet it in Spaces Secrets to generate images.", fill="black")
        return img

    client = InferenceClient(model=IMAGE_MODEL_ID, token=HF_TOKEN)
    # BUGFIX: recent huggingface_hub versions return a PIL.Image from
    # text_to_image (older ones returned raw bytes); the original always
    # treated the result as bytes and would crash on an Image. Accept both.
    result = client.text_to_image(prompt)
    if isinstance(result, Image.Image):
        return result.convert("RGB")
    return Image.open(io.BytesIO(result)).convert("RGB")
132
+
133
def compose_card(main_img: Image.Image, ko: str, zh: str, vocab_rows):
    """
    Compose a tall (1080x1920) flashcard image: illustration on top, the
    Korean sentence, its Chinese translation, then a vocabulary breakdown.

    Args:
        main_img: generated illustration to place at the top of the card.
        ko: the Korean sentence.
        zh: the Chinese translation.
        vocab_rows: per-token dicts (surface/pos/lemma/meaning_zh/...) to list.

    Returns:
        The composed PIL RGB Image.
    """
    W, H = 1080, 1920
    bg = Image.new("RGB", (W, H), (248, 235, 209))  # cream-yellow background
    draw = ImageDraw.Draw(bg)

    # White rounded panel framing the whole card
    panel_margin = 60
    panel = (panel_margin, 70, W - panel_margin, H - 70)
    draw.rounded_rectangle(panel, radius=35, fill="white", outline=(40, 40, 40), width=4)

    # Top image area: center the illustration inside this box
    img_box = (panel_margin + 80, 150, W - panel_margin - 80, 650)
    img_w = img_box[2] - img_box[0]
    img_h = img_box[3] - img_box[1]
    main_img = main_img.copy()
    main_img.thumbnail((img_w, img_h))  # fit within the box, preserving aspect ratio
    paste_x = img_box[0] + (img_w - main_img.size[0]) // 2
    paste_y = img_box[1] + (img_h - main_img.size[1]) // 2
    bg.paste(main_img, (paste_x, paste_y))

    # Fonts (fall back gracefully if the font files are missing)
    f_ko = safe_font(FONT_KR, 56)
    f_zh = safe_font(FONT_SC, 42)
    f_title = safe_font(FONT_SC, 48)
    f_table = safe_font(FONT_SC, 36)

    # Korean sentence with a faux outline: draw the text in black four times,
    # offset by 2px in each direction, then once in yellow on top.
    y_ko = 720
    for dx, dy in [(-2,0),(2,0),(0,-2),(0,2)]:
        draw.text((panel_margin+90+dx, y_ko+dy), ko, font=f_ko, fill="black")
    draw.text((panel_margin+90, y_ko), ko, font=f_ko, fill=(250, 204, 21))  # yellow

    # Chinese translation
    y_zh = 810
    draw.text((panel_margin+90, y_zh), zh, font=f_zh, fill=(40, 40, 40))

    # Vocabulary-breakdown heading (label text is user-facing, kept as-is)
    y_t = 930
    draw.text((panel_margin+90, y_t), "*词汇拆解", font=f_title, fill="black")

    # Table of tokens
    y = 1030
    line_h = 60
    # Columns rendered as one line: surface / lemma / meaning
    for r in vocab_rows:
        # Skip punctuation tokens (Kiwi SP*/SF* tags)
        if r["pos"].startswith("SP") or r["pos"].startswith("SF"):
            continue
        surface = r["surface"]
        lemma = r["lemma"]
        meaning = r["meaning_zh"] if r["meaning_zh"] else "(未收录)"
        text = f"{surface} → {lemma} | {meaning}"
        draw.text((panel_margin+90, y), text, font=f_table, fill=(30, 30, 30))
        y += line_h
        if y > H - 140:  # stop before running off the bottom of the panel
            break

    return bg
193
+
194
def run(sentence, zh_translation):
    """Gradio callback: analyze the sentence, generate an image, compose the card.

    Args:
        sentence: Korean input sentence.
        zh_translation: manually supplied Chinese translation.

    Returns:
        (card image, token DataFrame, highlighted-sentence HTML, image prompt).
    """
    rows, highlighted = analyze_sentence(sentence)
    prompt = ko_to_prompt(sentence, rows)
    img = generate_image(prompt)
    card = compose_card(img, sentence.strip(), zh_translation.strip(), rows)

    columns = ["surface", "pos", "lemma", "meaning_zh", "in_vocab", "level"]
    # BUGFIX: a blank sentence yields rows == [], and pd.DataFrame([]) has no
    # columns, so the original column selection raised KeyError. Passing the
    # column list to the constructor guarantees the schema exists.
    df = pd.DataFrame(rows, columns=columns)[columns]
    return card, df, highlighted, prompt
202
+
203
# ---- Gradio UI (labels intentionally left in Chinese for the target audience) ----
with gr.Blocks() as demo:
    gr.Markdown("## 韩语句子 → 自动拆词 → 自动配图 → 学习卡片(Hugging Face MVP)")

    # Inputs: the Korean sentence and its (manually supplied) Chinese translation
    with gr.Row():
        sentence = gr.Textbox(label="输入韩语句子", placeholder="例如:잠을 자도 피로가 안 풀리나?", lines=2)
    with gr.Row():
        zh = gr.Textbox(label="中文翻译(先手动填;后续可接自动翻译模型)", placeholder="例如:怎么睡了觉还是觉得很累?", lines=2)

    btn = gr.Button("生成卡片")

    # Outputs: composed card, token table, highlighted sentence, and the
    # editable text-to-image prompt that produced the illustration
    with gr.Row():
        out_img = gr.Image(label="生成的学习卡片", type="pil")
    with gr.Row():
        out_df = gr.Dataframe(label="词汇识别结果(可复制)", interactive=False)
    with gr.Row():
        out_html = gr.HTML(label="高亮句子(词库命中=绿,未命中=橙)")
    with gr.Row():
        out_prompt = gr.Textbox(label="用于生成图片的 prompt(你可以自己改)")

    # Wire the button to the full pipeline.
    btn.click(fn=run, inputs=[sentence, zh], outputs=[out_img, out_df, out_html, out_prompt])

demo.launch()