# KoreanProcess / app.py
# (Hugging Face Space page residue, kept as comments so the file parses:)
# WENior's picture
# Update app.py
# f1e06db verified
# app.py
# Startup diagnostics: log the interpreter version and whether kiwipiepy is
# installed, so dependency problems are visible in the Space build logs.
import sys, subprocess

print("=== Python version ===")
print(sys.version)

print("=== pip show kiwipiepy ===")
try:
    shown = subprocess.check_output(
        [sys.executable, "-m", "pip", "show", "kiwipiepy"], text=True
    )
    print(shown)
except Exception as err:
    print("pip show failed:", err)

print("=== pip list (filter kiwi) ===")
try:
    listing = subprocess.check_output(
        [sys.executable, "-m", "pip", "list"], text=True
    )
    for entry in listing.splitlines():
        if "kiwi" in entry.lower():
            print(entry)
except Exception as err:
    print("pip list failed:", err)
# -*- coding: utf-8 -*-
import os
import re
import io
import pandas as pd
import gradio as gr
from PIL import Image, ImageDraw, ImageFont
from kiwipiepy import Kiwi
# 可选:用 HF Inference API 生成图(推荐 MVP)
from huggingface_hub import InferenceClient
# -----------------------
# Configuration
# -----------------------
VOCAB_PATH = "vocab.csv"  # vocabulary table: lemma -> Chinese meaning
FONT_KR = "fonts/NotoSansKR-Regular.otf"  # Korean-capable font for the card
FONT_SC = "fonts/NotoSansSC-Regular.otf"  # Simplified-Chinese font for the card
# Text-to-image model used through the HF Inference API; override via the
# IMAGE_MODEL_ID env var. NOTE: some models require accepting a license on HF.
IMAGE_MODEL_ID = os.getenv("IMAGE_MODEL_ID", "stabilityai/sdxl-turbo")
HF_TOKEN = os.getenv("HF_TOKEN", "")  # set in Spaces -> Settings -> Secrets
kiwi = Kiwi()  # Korean morphological analyzer, loaded once at startup
def load_vocab():
    """Load the vocabulary table from VOCAB_PATH into a DataFrame.

    Returns a DataFrame that always has the columns
    ["lemma", "meaning_zh", "pos", "level"]. If the file is missing, an
    empty frame with those columns is returned so the app still starts.

    Fixes vs. original: missing cells are normalized to "" (pandas reads
    empty CSV cells as float NaN, which later breaks string formatting),
    and a partial CSV no longer raises when a column is absent.
    """
    columns = ["lemma", "meaning_zh", "pos", "level"]
    if not os.path.exists(VOCAB_PATH):
        return pd.DataFrame(columns=columns)
    df = pd.read_csv(VOCAB_PATH)
    # Guarantee every expected column exists even if the CSV is partial.
    for col in columns:
        if col not in df.columns:
            df[col] = ""
    # Normalize NaN cells to empty strings for safe downstream formatting.
    df = df.fillna("")
    df["lemma"] = df["lemma"].astype(str).str.strip()
    return df
# Module-level vocabulary cache: the full DataFrame, plus a
# lemma -> row (pandas Series) map used for per-token lookups.
VOCAB_DF = load_vocab()
VOCAB_MAP = {row["lemma"]: row for _, row in VOCAB_DF.iterrows()}
def safe_font(path, size):
    """Load a TrueType font at *size*, falling back to PIL's built-in default.

    The fallback keeps the app from crashing when the font file is absent,
    though the default font may not render Korean/Chinese glyphs.
    """
    if not os.path.exists(path):
        return ImageFont.load_default()
    return ImageFont.truetype(path, size=size)
def normalize_lemma(form, tag):
    """Return a rough dictionary form ("lemma") for a surface token.

    A "good enough" lemmatizer for vocab lookup:
      - strips a few common sentence-final endings/particles, then
      - for verb/adjective-class tags (anything starting with "V": VV, VA,
        VX, VCP, VCN, ...) appends the citation ending "다" if absent.
    Other word classes are returned with only the ending stripped.

    Kiwi has many POS tags; this is intentionally coarse. The original
    condition `tag.startswith("V") or tag in ("VA","VV","VX","VCP","VCN")`
    had a redundant membership test (all those tags start with "V"), which
    is dropped here with identical behavior.
    """
    s = form.strip()
    # Strip leftover common final endings/particles (very simplified).
    s = re.sub(r"(요|니다|까|나|냐|지|죠|죠\?|야)$", "", s)
    # Verb/adjective classes: normalize to the dictionary form ending in 다.
    if tag.startswith("V") and not s.endswith("다"):
        s = s + "다"
    return s
def analyze_sentence(sentence: str):
    """Tokenize a Korean sentence and match tokens against the vocab table.

    Returns (rows, highlighted_html):
      rows: one dict per token with keys
            surface / pos / lemma / meaning_zh / in_vocab / level
      highlighted_html: the sentence as HTML spans, vocab hits in green,
            misses in orange.
    Empty/whitespace input yields ([], "").
    """
    sentence = sentence.strip()
    if not sentence:
        return [], ""
    # Kiwi analysis: take the best-scoring candidate.
    analyzed = kiwi.analyze(sentence)
    tokens = analyzed[0][0]  # list of Token
    rows = []
    for t in tokens:
        form = t.form
        tag = t.tag  # POS tag
        lemma = normalize_lemma(form, tag)
        # Vocab match: lemma first, then fall back to the surface form.
        # BUG FIX: VOCAB_MAP values are pandas Series, whose truth value is
        # ambiguous -- `get(lemma) or get(form)` raised ValueError on any
        # hit, so test explicitly against None instead.
        hit = VOCAB_MAP.get(lemma)
        if hit is None:
            hit = VOCAB_MAP.get(form)
        rows.append({
            "surface": form,
            "pos": tag,
            "lemma": lemma,
            "meaning_zh": (hit["meaning_zh"] if hit is not None else ""),
            "in_vocab": (hit is not None),
            "level": (hit["level"] if hit is not None and "level" in hit else "")
        })
    # Highlighted-sentence HTML: green for vocab hits, orange otherwise.
    html_parts = []
    for r in rows:
        color = "#16a34a" if r["in_vocab"] else "#f59e0b"
        html_parts.append(f"<span style='color:{color}; font-weight:700'>{r['surface']}</span>")
    highlighted = " ".join(html_parts)
    return rows, highlighted
def ko_to_prompt(sentence: str, token_rows):
    """Build a text-to-image prompt from the analyzed tokens (rule-based MVP).

    Uses up to five noun surfaces (Kiwi NN* tags) as the subject; when none
    are found, falls back to a generic daily-life subject. A later version
    could translate the sentence to English first and write a richer prompt.
    """
    noun_surfaces = [
        row["surface"] for row in token_rows if row["pos"].startswith("NN")
    ][:5]
    core = ", ".join(noun_surfaces) if noun_surfaces else "a person, daily life"
    # Flashcard-illustration style prompt, no embedded text.
    return (
        f"cute flat illustration, simple background, centered subject, "
        f"learning flashcard style, {core}, no text"
    )
def generate_image(prompt: str):
    """Generate an illustration for *prompt* via the HF Inference API.

    Returns a PIL RGB image. Without HF_TOKEN, returns a gray placeholder
    so the UI keeps working.
    """
    if not HF_TOKEN:
        # No token: placeholder image instead of crashing the pipeline.
        img = Image.new("RGB", (768, 512), "lightgray")
        d = ImageDraw.Draw(img)
        d.text((20, 20), "No HF_TOKEN set.\nSet it in Spaces Secrets to generate images.", fill="black")
        return img
    client = InferenceClient(model=IMAGE_MODEL_ID, token=HF_TOKEN)
    # BUG FIX: InferenceClient.text_to_image returns a PIL.Image.Image,
    # not raw bytes -- the original io.BytesIO round-trip raised TypeError.
    img = client.text_to_image(prompt)
    return img.convert("RGB")
def compose_card(main_img: Image.Image, ko: str, zh: str, vocab_rows):
    """
    Compose one tall study-card image (1080x1920): illustration on top,
    then the Korean sentence, its Chinese translation, and a vocabulary
    breakdown table. Returns the finished PIL RGB image.
    """
    W, H = 1080, 1920
    bg = Image.new("RGB", (W, H), (248, 235, 209))  # beige background
    draw = ImageDraw.Draw(bg)
    # White rounded panel framing the card content.
    panel_margin = 60
    panel = (panel_margin, 70, W - panel_margin, H - 70)
    draw.rounded_rectangle(panel, radius=35, fill="white", outline=(40, 40, 40), width=4)
    # Top image area: fit the illustration into this box, centered.
    img_box = (panel_margin + 80, 150, W - panel_margin - 80, 650)
    img_w = img_box[2] - img_box[0]
    img_h = img_box[3] - img_box[1]
    main_img = main_img.copy()
    main_img.thumbnail((img_w, img_h))  # scale down, preserving aspect ratio
    paste_x = img_box[0] + (img_w - main_img.size[0]) // 2
    paste_y = img_box[1] + (img_h - main_img.size[1]) // 2
    bg.paste(main_img, (paste_x, paste_y))
    # Fonts (safe_font falls back to PIL's default when files are missing).
    f_ko = safe_font(FONT_KR, 56)
    f_zh = safe_font(FONT_SC, 42)
    f_title = safe_font(FONT_SC, 48)
    f_table = safe_font(FONT_SC, 36)
    # Korean sentence with a faux outline: draw a black layer offset in four
    # directions, then the yellow text on top.
    y_ko = 720
    for dx, dy in [(-2,0),(2,0),(0,-2),(0,2)]:
        draw.text((panel_margin+90+dx, y_ko+dy), ko, font=f_ko, fill="black")
    draw.text((panel_margin+90, y_ko), ko, font=f_ko, fill=(250, 204, 21))  # yellow
    # Chinese translation.
    y_zh = 810
    draw.text((panel_margin+90, y_zh), zh, font=f_zh, fill=(40, 40, 40))
    # Section title for the vocabulary breakdown.
    y_t = 930
    draw.text((panel_margin+90, y_t), "*词汇拆解", font=f_title, fill="black")
    # Vocab table: one line per token (surface / lemma / meaning).
    y = 1030
    line_h = 60
    for r in vocab_rows:
        # Skip punctuation tokens (Kiwi SP*/SF* tags).
        if r["pos"].startswith("SP") or r["pos"].startswith("SF"):
            continue
        surface = r["surface"]
        lemma = r["lemma"]
        meaning = r["meaning_zh"] if r["meaning_zh"] else "(未收录)"
        # NOTE(review): surface and lemma are concatenated with no separator
        # (e.g. "먹어먹다 | ..."); a "surface -> lemma" separator may have
        # been intended -- confirm against the desired card layout.
        text = f"{surface}{lemma} | {meaning}"
        draw.text((panel_margin+90, y), text, font=f_table, fill=(30, 30, 30))
        y += line_h
        # Stop before overflowing the bottom of the panel.
        if y > H - 140:
            break
    return bg
def run(sentence, zh_translation):
    """Gradio callback: analyze, generate the image, and compose the card.

    Returns (card_image, token_dataframe, highlighted_html, image_prompt),
    matching the four UI outputs.
    """
    rows, highlighted = analyze_sentence(sentence)
    prompt = ko_to_prompt(sentence, rows)
    img = generate_image(prompt)
    card = compose_card(img, sentence.strip(), zh_translation.strip(), rows)
    # BUG FIX: pass columns= explicitly -- with an empty sentence rows == []
    # and the original df[[...]] column selection raised a KeyError.
    df = pd.DataFrame(
        rows,
        columns=["surface", "pos", "lemma", "meaning_zh", "in_vocab", "level"],
    )
    return card, df, highlighted, prompt
# -----------------------
# Gradio UI: input textboxes -> button -> four outputs wired to run().
# -----------------------
with gr.Blocks() as demo:
    gr.Markdown("## 韩语句子 → 自动拆词 → 自动配图 → 学习卡片(Hugging Face MVP)")
    with gr.Row():
        # Korean sentence input.
        sentence = gr.Textbox(label="输入韩语句子", placeholder="例如:잠을 자도 피로가 안 풀리나?", lines=2)
    with gr.Row():
        # Chinese translation, entered manually for now.
        zh = gr.Textbox(label="中文翻译(先手动填;后续可接自动翻译模型)", placeholder="例如:怎么睡了觉还是觉得很累?", lines=2)
    btn = gr.Button("生成卡片")
    with gr.Row():
        out_img = gr.Image(label="生成的学习卡片", type="pil")
    with gr.Row():
        out_df = gr.Dataframe(label="词汇识别结果(可复制)", interactive=False)
    with gr.Row():
        out_html = gr.HTML(label="高亮句子(词库命中=绿,未命中=橙)")
    with gr.Row():
        out_prompt = gr.Textbox(label="用于生成图片的 prompt(你可以自己改)")
    # Wire the button to the full pipeline; outputs match run()'s 4-tuple.
    btn.click(fn=run, inputs=[sentence, zh], outputs=[out_img, out_df, out_html, out_prompt])

demo.launch()