"""Multimodal Language Flashcard Generator.

Gradio app that captions an uploaded image (BLIP) and detects objects in it
(DETR), extracts the vocabulary terms it knows from both outputs, and shows
Japanese or Korean flashcards together with a comparison of the two methods.
"""

import os
import re
import time
from typing import List, Dict, Any

import torch
import gradio as gr
from PIL import Image
from transformers import pipeline
from datasets import load_dataset

# Vocabulary dictionary covering Office-Home dataset classes + common COCO
# household/office items DETR emits. Single-word keys are matched per-token in
# captions and detection labels; multi-word keys (e.g. "dining table") are
# matched as phrases.
#
# Every key is a lower-case English term. Every value is a dict with exactly
# four string fields:
#   "japanese"     - the Japanese word
#   "romaji"       - its Latin-script reading (long vowels written doubled)
#   "korean"       - the Korean word
#   "romanization" - its Latin-script reading
VOCAB_DICT = {
    # --- Furniture ---
    "chair": {"japanese": "いす", "romaji": "isu", "korean": "의자", "romanization": "uija"},
    "table": {"japanese": "テーブル", "romaji": "teeburu", "korean": "테이블", "romanization": "teibeul"},
    "dining table": {"japanese": "ダイニングテーブル", "romaji": "dainingu teeburu", "korean": "식탁", "romanization": "siktak"},
    "desk": {"japanese": "机", "romaji": "tsukue", "korean": "책상", "romanization": "chaeksang"},
    "bed": {"japanese": "ベッド", "romaji": "beddo", "korean": "침대", "romanization": "chimdae"},
    "couch": {"japanese": "ソファ", "romaji": "sofa", "korean": "소파", "romanization": "sopa"},
    "sofa": {"japanese": "ソファ", "romaji": "sofa", "korean": "소파", "romanization": "sopa"},
    "shelf": {"japanese": "棚", "romaji": "tana", "korean": "선반", "romanization": "seonban"},
    "curtain": {"japanese": "カーテン", "romaji": "kaaten", "korean": "커튼", "romanization": "keoteun"},
    "file cabinet": {"japanese": "ファイルキャビネット", "romaji": "fairu kyabinetto", "korean": "파일 캐비닛", "romanization": "pail kaebinit"},

    # --- Lighting / electrical ---
    "lamp": {"japanese": "ランプ", "romaji": "ranpu", "korean": "램프", "romanization": "raempeu"},
    "desk lamp": {"japanese": "デスクランプ", "romaji": "desuku ranpu", "korean": "책상 램프", "romanization": "chaeksang raempeu"},
    "lamp shade": {"japanese": "ランプシェード", "romaji": "ranpu sheedo", "korean": "램프 갓", "romanization": "raempeu gat"},
    "fan": {"japanese": "扇風機", "romaji": "senpuuki", "korean": "선풍기", "romanization": "seonpunggi"},
    "battery": {"japanese": "電池", "romaji": "denchi", "korean": "배터리", "romanization": "baeteori"},
    "candle": {"japanese": "ろうそく", "romaji": "rousoku", "korean": "양초", "romanization": "yangcho"},

    # --- Computing / electronics ---
    "laptop": {"japanese": "ノートパソコン", "romaji": "nooto pasokon", "korean": "노트북", "romanization": "noteubuk"},
    "computer": {"japanese": "コンピュータ", "romaji": "konpyuuta", "korean": "컴퓨터", "romanization": "keompyuteo"},
    "monitor": {"japanese": "モニター", "romaji": "monitaa", "korean": "모니터", "romanization": "moniteo"},
    # Long "o" written doubled, matching "kurippu boodo", "nooto", "fooku" below.
    "keyboard": {"japanese": "キーボード", "romaji": "kiiboodo", "korean": "키보드", "romanization": "kibodeu"},
    "mouse": {"japanese": "マウス", "romaji": "mausu", "korean": "마우스", "romanization": "mauseu"},
    "printer": {"japanese": "プリンター", "romaji": "purintaa", "korean": "프린터", "romanization": "peurinteo"},
    "webcam": {"japanese": "ウェブカメラ", "romaji": "webu kamera", "korean": "웹캠", "romanization": "wepkaem"},
    "speaker": {"japanese": "スピーカー", "romaji": "supiikaa", "korean": "스피커", "romanization": "seupikeo"},
    "tv": {"japanese": "テレビ", "romaji": "terebi", "korean": "텔레비전", "romanization": "tellebijeon"},
    "television": {"japanese": "テレビ", "romaji": "terebi", "korean": "텔레비전", "romanization": "tellebijeon"},
    "remote": {"japanese": "リモコン", "romaji": "rimokon", "korean": "리모컨", "romanization": "rimokeon"},
    "radio": {"japanese": "ラジオ", "romaji": "rajio", "korean": "라디오", "romanization": "radio"},
    "phone": {"japanese": "電話", "romaji": "denwa", "korean": "전화", "romanization": "jeonhwa"},
    "telephone": {"japanese": "電話", "romaji": "denwa", "korean": "전화", "romanization": "jeonhwa"},
    "cell phone": {"japanese": "携帯電話", "romaji": "keitai denwa", "korean": "휴대폰", "romanization": "hyudaepon"},
    "calculator": {"japanese": "電卓", "romaji": "dentaku", "korean": "계산기", "romanization": "gyesangi"},
    "clock": {"japanese": "時計", "romaji": "tokei", "korean": "시계", "romanization": "sigye"},
    "alarm clock": {"japanese": "目覚まし時計", "romaji": "mezamashi dokei", "korean": "알람 시계", "romanization": "allam sigye"},

    # --- Stationery / office supplies ---
    "pen": {"japanese": "ペン", "romaji": "pen", "korean": "펜", "romanization": "pen"},
    "pencil": {"japanese": "鉛筆", "romaji": "enpitsu", "korean": "연필", "romanization": "yeonpil"},
    "marker": {"japanese": "マーカー", "romaji": "maakaa", "korean": "마커", "romanization": "makeo"},
    "eraser": {"japanese": "消しゴム", "romaji": "keshigomu", "korean": "지우개", "romanization": "jiugae"},
    "ruler": {"japanese": "定規", "romaji": "jougi", "korean": "자", "romanization": "ja"},
    "scissors": {"japanese": "はさみ", "romaji": "hasami", "korean": "가위", "romanization": "gawi"},
    "notebook": {"japanese": "ノート", "romaji": "nooto", "korean": "공책", "romanization": "gongchaek"},
    "book": {"japanese": "本", "romaji": "hon", "korean": "책", "romanization": "chaek"},
    "folder": {"japanese": "フォルダ", "romaji": "foruda", "korean": "폴더", "romanization": "poldeo"},
    "clipboard": {"japanese": "クリップボード", "romaji": "kurippu boodo", "korean": "클립보드", "romanization": "keullipbodeu"},
    "calendar": {"japanese": "カレンダー", "romaji": "karendaa", "korean": "달력", "romanization": "dallyeok"},
    "paper clip": {"japanese": "クリップ", "romaji": "kurippu", "korean": "종이 클립", "romanization": "jongi keullip"},
    "push pin": {"japanese": "画びょう", "romaji": "gabyou", "korean": "압정", "romanization": "apjeong"},
    "exit sign": {"japanese": "出口表示", "romaji": "deguchi hyouji", "korean": "출구 표지", "romanization": "chulgu pyoji"},

    # --- Kitchen / dining ---
    "mug": {"japanese": "マグカップ", "romaji": "magu kappu", "korean": "머그컵", "romanization": "meogeukeop"},
    "cup": {"japanese": "カップ", "romaji": "kappu", "korean": "컵", "romanization": "keop"},
    "wine glass": {"japanese": "ワイングラス", "romaji": "wain gurasu", "korean": "와인 잔", "romanization": "wain jan"},
    "bottle": {"japanese": "ボトル", "romaji": "botoru", "korean": "병", "romanization": "byeong"},
    "bowl": {"japanese": "ボウル", "romaji": "bouru", "korean": "그릇", "romanization": "geureut"},
    "fork": {"japanese": "フォーク", "romaji": "fooku", "korean": "포크", "romanization": "pokeu"},
    "spoon": {"japanese": "スプーン", "romaji": "supuun", "korean": "숟가락", "romanization": "sutgarak"},
    "knife": {"japanese": "ナイフ", "romaji": "naifu", "korean": "칼", "romanization": "kal"},
    "kettle": {"japanese": "やかん", "romaji": "yakan", "korean": "주전자", "romanization": "jujeonja"},
    "pan": {"japanese": "フライパン", "romaji": "furaipan", "korean": "팬", "romanization": "paen"},
    "oven": {"japanese": "オーブン", "romaji": "oobun", "korean": "오븐", "romanization": "obeun"},
    "microwave": {"japanese": "電子レンジ", "romaji": "denshi renji", "korean": "전자레인지", "romanization": "jeonjareinji"},
    "toaster": {"japanese": "トースター", "romaji": "toosutaa", "korean": "토스터", "romanization": "toseuteo"},
    "refrigerator": {"japanese": "冷蔵庫", "romaji": "reizouko", "korean": "냉장고", "romanization": "naengjanggo"},
    "sink": {"japanese": "流し", "romaji": "nagashi", "korean": "싱크대", "romanization": "singkeudae"},
    "soda": {"japanese": "ソーダ", "romaji": "sooda", "korean": "탄산음료", "romanization": "tansaneumnyo"},

    # --- Bathroom ---
    "toothbrush": {"japanese": "歯ブラシ", "romaji": "ha burashi", "korean": "칫솔", "romanization": "chitsol"},
    "toilet": {"japanese": "トイレ", "romaji": "toire", "korean": "화장실", "romanization": "hwajangsil"},

    # --- Tools / hardware ---
    "hammer": {"japanese": "ハンマー", "romaji": "hanmaa", "korean": "망치", "romanization": "mangchi"},
    "drill": {"japanese": "ドリル", "romaji": "doriru", "korean": "드릴", "romanization": "deuril"},
    "screwdriver": {"japanese": "ドライバー", "romaji": "doraibaa", "korean": "드라이버", "romanization": "deuraibeo"},
    "bucket": {"japanese": "バケツ", "romaji": "baketsu", "korean": "양동이", "romanization": "yangdongi"},
    "mop": {"japanese": "モップ", "romaji": "moppu", "korean": "대걸레", "romanization": "daegeolle"},
    "trash can": {"japanese": "ゴミ箱", "romaji": "gomibako", "korean": "쓰레기통", "romanization": "sseuregitong"},

    # --- Personal items / clothing ---
    "backpack": {"japanese": "リュックサック", "romaji": "ryukku sakku", "korean": "백팩", "romanization": "baekpaek"},
    "handbag": {"japanese": "ハンドバッグ", "romaji": "hando baggu", "korean": "핸드백", "romanization": "haendeubaek"},
    "suitcase": {"japanese": "スーツケース", "romaji": "suutsu keesu", "korean": "여행 가방", "romanization": "yeohaeng gabang"},
    "umbrella": {"japanese": "傘", "romaji": "kasa", "korean": "우산", "romanization": "usan"},
    "glasses": {"japanese": "眼鏡", "romaji": "megane", "korean": "안경", "romanization": "angyeong"},
    "tie": {"japanese": "ネクタイ", "romaji": "nekutai", "korean": "넥타이", "romanization": "nektai"},
    "helmet": {"japanese": "ヘルメット", "romaji": "herumetto", "korean": "헬멧", "romanization": "helmet"},
    "sneakers": {"japanese": "スニーカー", "romaji": "suniikaa", "korean": "운동화", "romanization": "undonghwa"},
    "flipflops": {"japanese": "ビーチサンダル", "romaji": "biichi sandaru", "korean": "슬리퍼", "romanization": "seullipeo"},
    "bike": {"japanese": "自転車", "romaji": "jitensha", "korean": "자전거", "romanization": "jajeongeo"},

    # --- Decor / misc ---
    "flower": {"japanese": "花", "romaji": "hana", "korean": "꽃", "romanization": "kkot"},
    "plant": {"japanese": "植物", "romaji": "shokubutsu", "korean": "식물", "romanization": "singmul"},
    "potted plant": {"japanese": "鉢植え", "romaji": "hachi-ue", "korean": "화분", "romanization": "hwabun"},
    "vase": {"japanese": "花瓶", "romaji": "kabin", "korean": "꽃병", "romanization": "kkotbyeong"},
    "toy": {"japanese": "おもちゃ", "romaji": "omocha", "korean": "장난감", "romanization": "jangnangam"},
    "teddy bear": {"japanese": "テディベア", "romaji": "tedi bea", "korean": "곰인형", "romanization": "gominhyeong"},
    "postit": {"japanese": "付箋", "romaji": "fusen", "korean": "포스트잇", "romanization": "poseuteuit"},
    "hairdryer": {"japanese": "ドライヤー", "romaji": "doraiyaa", "korean": "드라이어", "romanization": "deuraieo"},
}

# Partition the vocabulary keys once at import time so the matchers do not have
# to re-scan VOCAB_DICT: keys without a space go into a set (token lookup), keys
# containing a space go into a list (phrase search, in dictionary order).
_SINGLE_WORD_KEYS = {key for key in VOCAB_DICT if key.count(" ") == 0}
_MULTI_WORD_KEYS = [key for key in VOCAB_DICT if key.count(" ") > 0]

# Device setup: pick the pipeline device index and tensor dtype from CUDA
# availability (GPU -> index 0 / float16, otherwise -1 / no dtype override).
USE_GPU = torch.cuda.is_available()
if USE_GPU:
    DEVICE, TORCH_DTYPE = 0, torch.float16
else:
    DEVICE, TORCH_DTYPE = -1, None

# Load models globally as pipelines
# The captioning model is created once at import time and reused by
# generate_caption() for every request; it is placed on DEVICE.
caption_pipeline = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-base",
    device=DEVICE,
)

def generate_caption(image: Image.Image) -> str:
    """Return the BLIP caption for ``image``, or ``""`` when none is produced.

    The captioning pipeline is asked for at most 50 new tokens. The text of the
    first result is returned when the pipeline output is a non-empty list whose
    first item carries a ``"generated_text"`` entry; any other output shape
    yields an empty string.
    """
    result = caption_pipeline(image, max_new_tokens=50)
    if not isinstance(result, list) or not result:
        return ""
    first = result[0]
    if "generated_text" not in first:
        return ""
    return first["generated_text"]

# Object-detection model, also created once at import time and placed on
# DEVICE; process_image() calls it directly on the uploaded image.
detection_pipeline = pipeline(
    "object-detection",
    model="facebook/detr-resnet-50",
    device=DEVICE,
)

# Load up to 10 sample images from flwrlabs/office-home for one-click testing.
# Filter to Office-Home classes whose label matches a key in VOCAB_DICT, so the
# samples are guaranteed to produce vocab the app can actually translate. Dedupe
# by class to maximize variety. Streaming mode avoids downloading the full dataset.
SAMPLE_DIR = "sample_images"
MAX_STREAM_SCAN = 2000  # safety cap so we don't iterate forever


def load_sample_images(n: int = 10) -> List[str]:
    """Save up to ``n`` sample images (one per class) and return their paths.

    Streams the ``train`` split of ``flwrlabs/office-home``, keeps the first
    example of every class whose label matches a ``VOCAB_DICT`` key, and writes
    it as a JPEG into ``SAMPLE_DIR``. At most ``MAX_STREAM_SCAN`` examples are
    inspected.

    Args:
        n: Maximum number of images to save. Values <= 0 return ``[]`` without
            touching the filesystem or the network.

    Returns:
        Paths of the files written, in the order they were found. This is a
        best-effort helper: any failure (no network, unexpected dataset schema,
        unwritable directory, ...) is printed and whatever was saved up to that
        point is returned instead of raising.
    """
    paths: List[str] = []
    if n <= 0:
        return paths
    try:
        os.makedirs(SAMPLE_DIR, exist_ok=True)
        ds = load_dataset("flwrlabs/office-home", split="train", streaming=True)

        # Resolve the class names once. Without them no label can be matched,
        # so bail out before streaming (and downloading) any example.
        features = ds.features or {}
        label_feature = features["label"] if "label" in features else None
        class_names = getattr(label_feature, "names", None) or []
        if not class_names:
            print("flwrlabs/office-home exposes no class names; no sample images loaded.")
            return paths

        seen_classes: set = set()
        for i, example in enumerate(ds):
            if len(paths) >= n or i >= MAX_STREAM_SCAN:
                break
            img = example.get("image")
            label_idx = example.get("label")
            if img is None or label_idx is None:
                continue
            raw_label = class_names[label_idx]
            if raw_label in seen_classes:
                continue
            # Labels separate words with underscores (e.g. "Trash_Can"). Compare
            # both the spaced form, so multi-word keys such as "trash can" can
            # match, and the compact form, so every label accepted before is
            # still accepted.
            lowered = raw_label.lower()
            spaced = lowered.replace("_", " ")
            compact = lowered.replace("_", "")
            if not any(key in spaced or key in compact for key in VOCAB_DICT):
                continue
            seen_classes.add(raw_label)
            path = os.path.join(SAMPLE_DIR, f"sample_{len(paths):02d}_{raw_label}.jpg")
            img.convert("RGB").save(path, "JPEG")
            paths.append(path)
    except Exception as e:
        print(f"Could not load sample images from flwrlabs/office-home: {e}")
    return paths


SAMPLE_PATHS = load_sample_images(10)


def clean_text(text: str) -> str:
    """Lower-case ``text`` and keep only ASCII letters and whitespace.

    Digits, punctuation and every other character are removed (not replaced by
    a space). Leading and trailing whitespace is trimmed; interior whitespace
    is left as it is.
    """
    lowered = text.lower()
    letters_and_spaces = re.sub(r"[^a-zA-Z\s]", "", lowered)
    return letters_and_spaces.strip()


def extract_vocab_from_caption(caption: str) -> List[str]:
    """Return the vocabulary keys that occur in ``caption``.

    The caption is normalised with ``clean_text`` first. Single-word keys must
    equal one of the whitespace-separated tokens; multi-word keys only need to
    appear as a substring of the normalised caption. Each key is reported at
    most once and the order of the result is unspecified.
    """
    normalized = clean_text(caption)
    words = set(normalized.split())

    found = set()
    for key in _SINGLE_WORD_KEYS:
        if key in words:
            found.add(key)
    for phrase in _MULTI_WORD_KEYS:
        if phrase in normalized:
            found.add(phrase)
    return list(found)


def extract_vocab_from_detection(detection_results: List[Dict]) -> List[str]:
    """Return the vocabulary keys named by sufficiently confident detections.

    Detections whose ``"score"`` is missing or at most 0.5 are ignored. A
    lower-cased label that is itself a ``VOCAB_DICT`` key (this is how
    multi-word labels such as 'dining table' match) is taken whole; otherwise
    each of its words is checked against the single-word keys. Each key is
    reported at most once and the order of the result is unspecified.
    """
    found = set()
    for detection in detection_results:
        score = detection.get("score", 0)
        if not (score <= 0.5):
            name = detection.get("label", "").lower()
            if name in VOCAB_DICT:
                found.add(name)
            else:
                for word in name.split():
                    if word in _SINGLE_WORD_KEYS:
                        found.add(word)
    return list(found)


def translate_term(term: str, lang: str) -> Dict[str, str]:
    """Look up ``term`` in ``VOCAB_DICT`` for the requested language.

    Returns a dict with ``"translation"`` and ``"romanization"`` entries:
    the Japanese word and romaji for ``"Japanese"``, the Korean word and its
    romanization for ``"Korean"``. An unknown term yields a
    "translation unavailable" placeholder; any other language echoes the
    English term back with ``"N/A"`` as romanization.
    """
    if term in VOCAB_DICT:
        record = VOCAB_DICT[term]
        if lang == "Japanese":
            word_field, reading_field = "japanese", "romaji"
        elif lang == "Korean":
            word_field, reading_field = "korean", "romanization"
        else:
            return {"translation": term, "romanization": "N/A"}
        return {
            "translation": record[word_field],
            "romanization": record[reading_field],
        }
    return {"translation": "translation unavailable", "romanization": "N/A"}


def generate_flashcard_table(vocab_list: List[str], lang: str) -> List[List[str]]:
    """Build the flashcard rows for ``vocab_list`` in language ``lang``.

    The first row is a header (English, "<lang> Translation", Romanization,
    Source); it is followed by one row per term, in input order, holding the
    term, its translation, its romanization and the fixed source tag
    "extracted".
    """
    header = ["English", f"{lang} Translation", "Romanization", "Source"]
    translated = ((term, translate_term(term, lang)) for term in vocab_list)
    cards = [
        [term, entry["translation"], entry["romanization"], "extracted"]
        for term, entry in translated
    ]
    return [header, *cards]


def compute_comparison_stats(
    caption_vocab: List[str],
    detection_vocab: List[str],
    caption_time: float,
    detection_time: float,
    detection_results: List[Dict],
) -> str:
    """Summarise how captioning and detection compare for one image.

    Args:
        caption_vocab: Vocabulary terms extracted from the caption.
        detection_vocab: Vocabulary terms extracted from the detections.
        caption_time: Seconds spent captioning.
        detection_time: Seconds spent detecting.
        detection_results: Raw detection dicts; only their ``"score"`` entry is
            read, and a missing score counts as 0.0.

    Returns:
        A newline-separated report with nine ``Name: value`` lines: term counts,
        overlap size, output lengths (characters of the space-joined terms, for
        both methods), mean detection confidence (0.00 without detections),
        both timings and a conclusion naming the method that produced more
        terms, or stating that both produced the same number.
    """
    caption_count = len(caption_vocab)
    detection_count = len(detection_vocab)
    overlap = set(caption_vocab) & set(detection_vocab)

    # Use .get so a detection without a score cannot abort the whole report.
    scores = [result.get("score", 0.0) for result in detection_results]
    avg_conf = sum(scores) / len(scores) if scores else 0.0

    # A tie must not be attributed to either method.
    if caption_count > detection_count:
        conclusion = "Captioning provided more vocabulary terms."
    elif detection_count > caption_count:
        conclusion = "Detection provided more vocabulary terms."
    else:
        conclusion = "Both methods provided the same number of vocabulary terms."

    lines = [
        f"Captioning Vocab Terms: {caption_count}",
        f"Detection Vocab Terms: {detection_count}",
        f"Overlapping Terms: {len(overlap)}",
        # Both "Output Length" figures measure the same thing: characters in
        # the space-joined term list.
        f"Caption Output Length: {len(' '.join(caption_vocab))}",
        f"Detection Output Length: {len(' '.join(detection_vocab))}",
        f"Average Detection Confidence: {avg_conf:.2f}",
        f"Captioning Time: {caption_time:.2f}s",
        f"Detection Time: {detection_time:.2f}s",
        f"Conclusion: {conclusion}",
    ]
    return "\n".join(lines)


def _detections_to_rows(detection_results: List[Dict]) -> List[List[Any]]:
    """Flatten detection dicts into ``[label, score, box]`` rows for display."""
    rows: List[List[Any]] = []
    for result in detection_results:
        box = result.get("box") or {}
        if isinstance(box, dict):
            box_text = ", ".join(f"{name}={value}" for name, value in box.items())
        else:
            box_text = str(box)
        score = round(float(result.get("score", 0.0)), 3)
        rows.append([result.get("label", ""), score, box_text])
    return rows


def process_image(image: Image.Image, language: str):
    """Run captioning and detection on ``image`` and build all UI outputs.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.
        language: Target language passed on to the flashcard table.

    Returns:
        A 4-tuple ``(caption, detection_rows, flashcard_table, stats)``:
        the caption text (or a "Captioning failed: ..." message), one
        ``[label, score, box]`` row per detection for the three-column results
        table, the flashcard rows including their header row, and the
        comparison report. Without an image the placeholders
        ``("No image uploaded.", [], [], "No image.")`` are returned.

    Neither model failure raises: a captioning error is reported in the
    caption field, a detection error is printed and treated as "no detections".
    """
    if image is None:
        return "No image uploaded.", [], [], "No image."

    # Algorithm 1: Captioning
    start = time.perf_counter()
    try:
        caption = generate_caption(image)
        caption_ok = True
    except Exception as e:
        caption = f"Captioning failed: {e}"
        caption_ok = False
    caption_time = time.perf_counter() - start

    # Algorithm 2: Detection
    start = time.perf_counter()
    try:
        detection_results = detection_pipeline(image)
    except Exception as e:
        # Best-effort: keep the app responsive, but do not hide the failure.
        print(f"Object detection failed: {e}")
        detection_results = []
    detection_time = time.perf_counter() - start

    # NLP: Extract vocab. An error message is not a caption, so it must not be
    # mined for vocabulary.
    caption_vocab = extract_vocab_from_caption(caption) if caption_ok else []
    detection_vocab = extract_vocab_from_detection(detection_results)
    # Sorted so the flashcards appear in the same order on every run.
    all_vocab = sorted(set(caption_vocab) | set(detection_vocab))

    # Flashcard table
    flashcard_table = generate_flashcard_table(all_vocab, language)

    # Comparison stats
    stats = compute_comparison_stats(caption_vocab, detection_vocab, caption_time, detection_time, detection_results)

    # The results table has Label / Score / Box columns, so hand it rows rather
    # than the raw detection dicts.
    return caption, _detections_to_rows(detection_results), flashcard_table, stats


# Gradio Interface
# Layout, top to bottom: title and description, an input row (image + target
# language), optional sample images, the trigger button, then the outputs.
with gr.Blocks(title="Multimodal Language Flashcard Generator") as demo:
    gr.Markdown("# Multimodal Language Flashcard Generator")
    gr.Markdown("Upload an image, select a language, and generate flashcards with captioning and object detection.")
    
    # Inputs: the image is handed to process_image as a PIL image.
    with gr.Row():
        image_input = gr.Image(type="pil", label="Upload Image")
        lang_input = gr.Dropdown(["Japanese", "Korean"], label="Target Language", value="Japanese")

    # Only offer examples when load_sample_images() managed to save some.
    if SAMPLE_PATHS:
        gr.Examples(
            examples=[[p] for p in SAMPLE_PATHS],
            inputs=[image_input],
            label="Sample images from flwrlabs/office-home (click one to load)",
        )

    generate_btn = gr.Button("Generate Flashcards")
    
    # Outputs, in the same order as the tuple returned by process_image.
    with gr.Row():
        caption_output = gr.Textbox(label="Image Caption", lines=2)
        detection_output = gr.Dataframe(label="Object Detection Results", headers=["Label", "Score", "Box"])
    
    flashcard_output = gr.Dataframe(label="Flashcard Table", headers=["English", "Translation", "Romanization", "Source"])
    stats_output = gr.Textbox(label="Comparison Statistics", lines=8)

    # Clicking the button runs the whole pipeline on the current inputs.
    generate_btn.click(
        fn=process_image,
        inputs=[image_input, lang_input],
        outputs=[caption_output, detection_output, flashcard_output, stats_output],
    )


# Start the web app only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()