# Hugging Face Space: ProfRom/TestSpace (status: Sleeping)
# File size: 19,083 bytes

import os
import re
import time
from typing import List, Dict, Any

import torch
import gradio as gr
from PIL import Image
from transformers import pipeline
from datasets import load_dataset

# Vocabulary dictionary covering Office-Home dataset classes + common COCO
# household/office items DETR emits. Single-word keys are matched per-token in
# captions and detection labels; multi-word keys (e.g. "dining table") are
# matched as phrases.
VOCAB_DICT = {
    # --- Furniture ---
    "chair": {"japanese": "いす", "romaji": "isu", "korean": "의자", "romanization": "uija"},
    "table": {"japanese": "テーブル", "romaji": "teeburu", "korean": "테이블", "romanization": "teibeul"},
    "dining table": {"japanese": "ダイニングテーブル", "romaji": "dainingu teeburu", "korean": "식탁", "romanization": "siktak"},
    "desk": {"japanese": "机", "romaji": "tsukue", "korean": "책상", "romanization": "chaeksang"},
    "bed": {"japanese": "ベッド", "romaji": "beddo", "korean": "침대", "romanization": "chimdae"},
    "couch": {"japanese": "ソファ", "romaji": "sofa", "korean": "소파", "romanization": "sopa"},
    "sofa": {"japanese": "ソファ", "romaji": "sofa", "korean": "소파", "romanization": "sopa"},
    "shelf": {"japanese": "棚", "romaji": "tana", "korean": "선반", "romanization": "seonban"},
    "curtain": {"japanese": "カーテン", "romaji": "kaaten", "korean": "커튼", "romanization": "keoteun"},
    "file cabinet": {"japanese": "ファイルキャビネット", "romaji": "fairu kyabinetto", "korean": "파일 캐비닛", "romanization": "pail kaebinit"},

    # --- Lighting / electrical ---
    "lamp": {"japanese": "ランプ", "romaji": "ranpu", "korean": "램프", "romanization": "raempeu"},
    "desk lamp": {"japanese": "デスクランプ", "romaji": "desuku ranpu", "korean": "책상 램프", "romanization": "chaeksang raempeu"},
    "lamp shade": {"japanese": "ランプシェード", "romaji": "ranpu sheedo", "korean": "램프 갓", "romanization": "raempeu gat"},
    "fan": {"japanese": "扇風機", "romaji": "senpuuki", "korean": "선풍기", "romanization": "seonpunggi"},
    "battery": {"japanese": "電池", "romaji": "denchi", "korean": "배터리", "romanization": "baeteori"},
    "candle": {"japanese": "ろうそく", "romaji": "rousoku", "korean": "양초", "romanization": "yangcho"},

    # --- Computing / electronics ---
    "laptop": {"japanese": "ノートパソコン", "romaji": "nooto pasokon", "korean": "노트북", "romanization": "noteubuk"},
    "computer": {"japanese": "コンピュータ", "romaji": "konpyuuta", "korean": "컴퓨터", "romanization": "keompyuteo"},
    "monitor": {"japanese": "モニター", "romaji": "monitaa", "korean": "모니터", "romanization": "moniteo"},
    "keyboard": {"japanese": "キーボード", "romaji": "kiibodo", "korean": "키보드", "romanization": "kibodeu"},
    "mouse": {"japanese": "マウス", "romaji": "mausu", "korean": "마우스", "romanization": "mauseu"},
    "printer": {"japanese": "プリンター", "romaji": "purintaa", "korean": "프린터", "romanization": "peurinteo"},
    "webcam": {"japanese": "ウェブカメラ", "romaji": "webu kamera", "korean": "웹캠", "romanization": "wepkaem"},
    "speaker": {"japanese": "スピーカー", "romaji": "supiikaa", "korean": "스피커", "romanization": "seupikeo"},
    "tv": {"japanese": "テレビ", "romaji": "terebi", "korean": "텔레비전", "romanization": "tellebijeon"},
    "television": {"japanese": "テレビ", "romaji": "terebi", "korean": "텔레비전", "romanization": "tellebijeon"},
    "remote": {"japanese": "リモコン", "romaji": "rimokon", "korean": "리모컨", "romanization": "rimokeon"},
    "radio": {"japanese": "ラジオ", "romaji": "rajio", "korean": "라디오", "romanization": "radio"},
    "phone": {"japanese": "電話", "romaji": "denwa", "korean": "전화", "romanization": "jeonhwa"},
    "telephone": {"japanese": "電話", "romaji": "denwa", "korean": "전화", "romanization": "jeonhwa"},
    "cell phone": {"japanese": "携帯電話", "romaji": "keitai denwa", "korean": "휴대폰", "romanization": "hyudaepon"},
    "calculator": {"japanese": "電卓", "romaji": "dentaku", "korean": "계산기", "romanization": "gyesangi"},
    "clock": {"japanese": "時計", "romaji": "tokei", "korean": "시계", "romanization": "sigye"},
    "alarm clock": {"japanese": "目覚まし時計", "romaji": "mezamashi dokei", "korean": "알람 시계", "romanization": "allam sigye"},

    # --- Stationery / office supplies ---
    "pen": {"japanese": "ペン", "romaji": "pen", "korean": "펜", "romanization": "pen"},
    "pencil": {"japanese": "鉛筆", "romaji": "enpitsu", "korean": "연필", "romanization": "yeonpil"},
    "marker": {"japanese": "マーカー", "romaji": "maakaa", "korean": "마커", "romanization": "makeo"},
    "eraser": {"japanese": "消しゴム", "romaji": "keshigomu", "korean": "지우개", "romanization": "jiugae"},
    "ruler": {"japanese": "定規", "romaji": "jougi", "korean": "자", "romanization": "ja"},
    "scissors": {"japanese": "はさみ", "romaji": "hasami", "korean": "가위", "romanization": "gawi"},
    "notebook": {"japanese": "ノート", "romaji": "nooto", "korean": "공책", "romanization": "gongchaek"},
    "book": {"japanese": "本", "romaji": "hon", "korean": "책", "romanization": "chaek"},
    "folder": {"japanese": "フォルダ", "romaji": "foruda", "korean": "폴더", "romanization": "poldeo"},
    "clipboard": {"japanese": "クリップボード", "romaji": "kurippu boodo", "korean": "클립보드", "romanization": "keullipbodeu"},
    "calendar": {"japanese": "カレンダー", "romaji": "karendaa", "korean": "달력", "romanization": "dallyeok"},
    "paper clip": {"japanese": "クリップ", "romaji": "kurippu", "korean": "종이 클립", "romanization": "jongi keullip"},
    "push pin": {"japanese": "画びょう", "romaji": "gabyou", "korean": "압정", "romanization": "apjeong"},
    "exit sign": {"japanese": "出口表示", "romaji": "deguchi hyouji", "korean": "출구 표지", "romanization": "chulgu pyoji"},

    # --- Kitchen / dining ---
    "mug": {"japanese": "マグカップ", "romaji": "magu kappu", "korean": "머그컵", "romanization": "meogeukeop"},
    "cup": {"japanese": "カップ", "romaji": "kappu", "korean": "컵", "romanization": "keop"},
    "wine glass": {"japanese": "ワイングラス", "romaji": "wain gurasu", "korean": "와인 잔", "romanization": "wain jan"},
    "bottle": {"japanese": "ボトル", "romaji": "botoru", "korean": "병", "romanization": "byeong"},
    "bowl": {"japanese": "ボウル", "romaji": "bouru", "korean": "그릇", "romanization": "geureut"},
    "fork": {"japanese": "フォーク", "romaji": "fooku", "korean": "포크", "romanization": "pokeu"},
    "spoon": {"japanese": "スプーン", "romaji": "supuun", "korean": "숟가락", "romanization": "sutgarak"},
    "knife": {"japanese": "ナイフ", "romaji": "naifu", "korean": "칼", "romanization": "kal"},
    "kettle": {"japanese": "やかん", "romaji": "yakan", "korean": "주전자", "romanization": "jujeonja"},
    "pan": {"japanese": "フライパン", "romaji": "furaipan", "korean": "팬", "romanization": "paen"},
    "oven": {"japanese": "オーブン", "romaji": "oobun", "korean": "오븐", "romanization": "obeun"},
    "microwave": {"japanese": "電子レンジ", "romaji": "denshi renji", "korean": "전자레인지", "romanization": "jeonjareinji"},
    "toaster": {"japanese": "トースター", "romaji": "toosutaa", "korean": "토스터", "romanization": "toseuteo"},
    "refrigerator": {"japanese": "冷蔵庫", "romaji": "reizouko", "korean": "냉장고", "romanization": "naengjanggo"},
    "sink": {"japanese": "流し", "romaji": "nagashi", "korean": "싱크대", "romanization": "singkeudae"},
    "soda": {"japanese": "ソーダ", "romaji": "sooda", "korean": "탄산음료", "romanization": "tansaneumnyo"},

    # --- Bathroom ---
    "toothbrush": {"japanese": "歯ブラシ", "romaji": "ha burashi", "korean": "칫솔", "romanization": "chitsol"},
    "toilet": {"japanese": "トイレ", "romaji": "toire", "korean": "화장실", "romanization": "hwajangsil"},

    # --- Tools / hardware ---
    "hammer": {"japanese": "ハンマー", "romaji": "hanmaa", "korean": "망치", "romanization": "mangchi"},
    "drill": {"japanese": "ドリル", "romaji": "doriru", "korean": "드릴", "romanization": "deuril"},
    "screwdriver": {"japanese": "ドライバー", "romaji": "doraibaa", "korean": "드라이버", "romanization": "deuraibeo"},
    "bucket": {"japanese": "バケツ", "romaji": "baketsu", "korean": "양동이", "romanization": "yangdongi"},
    "mop": {"japanese": "モップ", "romaji": "moppu", "korean": "대걸레", "romanization": "daegeolle"},
    "trash can": {"japanese": "ゴミ箱", "romaji": "gomibako", "korean": "쓰레기통", "romanization": "sseuregitong"},

    # --- Personal items / clothing ---
    "backpack": {"japanese": "リュックサック", "romaji": "ryukku sakku", "korean": "백팩", "romanization": "baekpaek"},
    "handbag": {"japanese": "ハンドバッグ", "romaji": "hando baggu", "korean": "핸드백", "romanization": "haendeubaek"},
    "suitcase": {"japanese": "スーツケース", "romaji": "suutsu keesu", "korean": "여행 가방", "romanization": "yeohaeng gabang"},
    "umbrella": {"japanese": "傘", "romaji": "kasa", "korean": "우산", "romanization": "usan"},
    "glasses": {"japanese": "眼鏡", "romaji": "megane", "korean": "안경", "romanization": "angyeong"},
    "tie": {"japanese": "ネクタイ", "romaji": "nekutai", "korean": "넥타이", "romanization": "nektai"},
    "helmet": {"japanese": "ヘルメット", "romaji": "herumetto", "korean": "헬멧", "romanization": "helmet"},
    "sneakers": {"japanese": "スニーカー", "romaji": "suniikaa", "korean": "운동화", "romanization": "undonghwa"},
    "flipflops": {"japanese": "ビーチサンダル", "romaji": "biichi sandaru", "korean": "슬리퍼", "romanization": "seullipeo"},
    "bike": {"japanese": "自転車", "romaji": "jitensha", "korean": "자전거", "romanization": "jajeongeo"},

    # --- Decor / misc ---
    "flower": {"japanese": "花", "romaji": "hana", "korean": "꽃", "romanization": "kkot"},
    "plant": {"japanese": "植物", "romaji": "shokubutsu", "korean": "식물", "romanization": "singmul"},
    "potted plant": {"japanese": "鉢植え", "romaji": "hachi-ue", "korean": "화분", "romanization": "hwabun"},
    "vase": {"japanese": "花瓶", "romaji": "kabin", "korean": "꽃병", "romanization": "kkotbyeong"},
    "toy": {"japanese": "おもちゃ", "romaji": "omocha", "korean": "장난감", "romanization": "jangnangam"},
    "teddy bear": {"japanese": "テディベア", "romaji": "tedi bea", "korean": "곰인형", "romanization": "gominhyeong"},
    "postit": {"japanese": "付箋", "romaji": "fusen", "korean": "포스트잇", "romanization": "poseuteuit"},
    "hairdryer": {"japanese": "ドライヤー", "romaji": "doraiyaa", "korean": "드라이어", "romanization": "deuraieo"},
}

# Partition the vocabulary keys once at import time so the matchers don't
# re-scan VOCAB_DICT on every call: keys containing a space are phrases
# (kept in dict order), every other key is a single token.
_MULTI_WORD_KEYS = [key for key in VOCAB_DICT if " " in key]
_SINGLE_WORD_KEYS = set(VOCAB_DICT).difference(_MULTI_WORD_KEYS)

# Device setup: use CUDA device index 0 with float16 when a GPU is visible to
# torch, otherwise device -1 with no explicit dtype. DEVICE is passed as
# `device=` to the pipelines in this file; TORCH_DTYPE is not referenced
# anywhere else in the visible part of this file.
USE_GPU = torch.cuda.is_available()
if USE_GPU:
    DEVICE, TORCH_DTYPE = 0, torch.float16
else:
    DEVICE, TORCH_DTYPE = -1, None

# Load models globally as pipelines
# The captioning model is instantiated once at import time and reused by every
# request (see generate_caption). It runs on DEVICE.
caption_pipeline = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-base",
    device=DEVICE,
)

def generate_caption(image: Image.Image) -> str:
    """Return the caption the BLIP image-to-text pipeline produces for *image*.

    The pipeline is asked for at most 50 new tokens. An empty string is
    returned when the pipeline output is not a non-empty list whose first
    element carries a "generated_text" entry.
    """
    results = caption_pipeline(image, max_new_tokens=50)
    if not isinstance(results, list) or not results:
        return ""
    first = results[0]
    if "generated_text" not in first:
        return ""
    return first["generated_text"]

# Object detector, also created once at import time on DEVICE. process_image
# calls it directly with the uploaded image.
detection_pipeline = pipeline(
    "object-detection",
    model="facebook/detr-resnet-50",
    device=DEVICE,
)

# Load up to 10 sample images from flwrlabs/office-home for one-click testing.
# Only Office-Home classes whose normalized label contains a key of VOCAB_DICT
# are kept, and at most one image per class is saved to maximize variety.
# Streaming mode avoids downloading the full dataset.
# NOTE(review): the label filter makes translatable vocab likely, not
# guaranteed -- what the app translates depends on what the caption and
# detection models actually emit for the image.
SAMPLE_DIR = "sample_images"  # directory the sample JPEGs are written to
MAX_STREAM_SCAN = 2000  # safety cap so we don't iterate forever

def load_sample_images(n: int = 10) -> List[str]:
    """Save up to *n* Office-Home sample images locally and return their paths.

    The dataset is streamed and scanned for at most MAX_STREAM_SCAN examples.
    An example is kept when its class label (lower-cased, underscores removed)
    contains some VOCAB_DICT key and no image of that class was kept before.
    Kept images are written as JPEG files into SAMPLE_DIR.

    This is best-effort: any failure (network, dataset schema, disk) is
    printed and whatever was saved so far is returned, possibly an empty list.

    Args:
        n: Maximum number of sample images to save. Non-positive values
            return an empty list without touching the dataset.

    Returns:
        Paths of the saved sample images, in the order they were found.
    """
    paths: List[str] = []
    if n <= 0:
        return paths
    try:
        os.makedirs(SAMPLE_DIR, exist_ok=True)
        ds = load_dataset("flwrlabs/office-home", split="train", streaming=True)
        # A streaming dataset can report ``features`` as None when the schema
        # is not known up front, so guard before the membership test.
        features = ds.features or {}
        class_names = features["label"].names if "label" in features else []
        if not class_names:
            # Without class names no example can pass the vocab filter, so
            # scanning (and downloading) the stream would be wasted work.
            print(
                "Could not load sample images from flwrlabs/office-home: "
                "no class label names available"
            )
            return paths
        seen_classes: set = set()
        for i, example in enumerate(ds):
            if i >= MAX_STREAM_SCAN:
                break
            img = example.get("image")
            label_idx = example.get("label")
            if img is None or label_idx is None:
                continue
            raw_label = class_names[label_idx]
            if raw_label in seen_classes:
                continue
            # Substring match on the squashed label, e.g. "Alarm_Clock" ->
            # "alarmclock" matches the "clock" key.
            normalized = raw_label.lower().replace("_", "")
            if not any(vocab_key in normalized for vocab_key in VOCAB_DICT):
                continue
            seen_classes.add(raw_label)
            path = os.path.join(SAMPLE_DIR, f"sample_{len(paths):02d}_{raw_label}.jpg")
            img.convert("RGB").save(path, "JPEG")
            paths.append(path)
            # Stop right away so no further example is pulled from the stream.
            if len(paths) >= n:
                break
    except Exception as e:
        print(f"Could not load sample images from flwrlabs/office-home: {e}")
    return paths

# Resolved once at import time. An empty list (e.g. when the dataset could not
# be loaded) makes the UI below skip the examples gallery.
SAMPLE_PATHS = load_sample_images(10)


def clean_text(text: str) -> str:
    """Lower-case *text*, delete every character that is neither an ASCII
    letter nor whitespace, and trim leading/trailing whitespace.

    Removed characters are dropped, not replaced by a space, so "desk-lamp"
    becomes "desklamp".
    """
    lowered = text.lower()
    letters_only = re.sub(r"[^a-zA-Z\s]", "", lowered)
    return letters_only.strip()


def extract_vocab_from_caption(caption: str) -> List[str]:
    """Return the vocabulary keys found in *caption*, sorted alphabetically.

    Single-word keys must equal a whole token of the cleaned caption.
    Multi-word keys are matched as substrings of the cleaned caption after
    whitespace runs are collapsed to single spaces, so a phrase split by a
    newline, tab or doubled space is still found.

    Args:
        caption: Free-form caption text.

    Returns:
        Matching keys without duplicates. The list is sorted so the result
        does not depend on set iteration order, which varies between runs.
    """
    tokens = clean_text(caption).split()
    # Re-join on single spaces: clean_text can leave doubled or non-space
    # whitespace behind, which would defeat the phrase substring test.
    normalized = " ".join(tokens)
    matches = _SINGLE_WORD_KEYS.intersection(tokens)
    matches.update(k for k in _MULTI_WORD_KEYS if k in normalized)
    return sorted(matches)


def extract_vocab_from_detection(
    detection_results: List[Dict],
    min_score: float = 0.5,
) -> List[str]:
    """Return the vocabulary keys found in detection labels, sorted.

    A detection is considered only when its score is strictly greater than
    *min_score*. Its lower-cased label is used as-is when it is a VOCAB_DICT
    key (labels are often multi-word, e.g. 'dining table'); otherwise each
    whitespace-separated token of the label is checked against the
    single-word keys.

    Args:
        detection_results: Dicts with "label" and "score" entries. Missing or
            None values are treated as an empty label / zero score.
        min_score: Exclusive lower bound on the detection score.

    Returns:
        Matching keys without duplicates, sorted so the order is stable
        across runs.
    """
    matches = set()
    for res in detection_results:
        score = res.get("score") or 0
        if score <= min_score:
            continue
        label = (res.get("label") or "").lower()
        if label in VOCAB_DICT:
            matches.add(label)
            continue
        # Fall back to per-token matching for labels that are not keys
        # themselves.
        matches.update(token for token in label.split() if token in _SINGLE_WORD_KEYS)
    return sorted(matches)


def translate_term(term: str, lang: str) -> Dict[str, str]:
    """Look up *term* in VOCAB_DICT and return its rendering in *lang*.

    Returns a dict with "translation" and "romanization" entries:
    - unknown term: a "translation unavailable" placeholder with "N/A";
    - "Japanese" / "Korean": the matching translation and reading;
    - any other language: the English term itself with "N/A".
    """
    entry = VOCAB_DICT.get(term)
    if entry is None:
        return {"translation": "translation unavailable", "romanization": "N/A"}

    # (language name, translation field, reading field)
    supported = (
        ("Japanese", "japanese", "romaji"),
        ("Korean", "korean", "romanization"),
    )
    for name, translation_key, reading_key in supported:
        if lang == name:
            return {"translation": entry[translation_key], "romanization": entry[reading_key]}
    return {"translation": term, "romanization": "N/A"}


def generate_flashcard_table(vocab_list: List[str], lang: str) -> List[List[str]]:
    """Build the flashcard rows for *vocab_list* in language *lang*.

    The first row is a header naming the columns; each following row holds
    the English term, its translation, its romanization and the fixed source
    tag "extracted", in the order of *vocab_list*.
    """
    header = ["English", f"{lang} Translation", "Romanization", "Source"]

    def _row(term: str) -> List[str]:
        translated = translate_term(term, lang)
        return [term, translated["translation"], translated["romanization"], "extracted"]

    return [header] + [_row(term) for term in vocab_list]


def compute_comparison_stats(
    caption_vocab: List[str],
    detection_vocab: List[str],
    caption_time: float,
    detection_time: float,
    detection_results: List[Dict],
) -> str:
    """Summarize how captioning and detection compare as vocabulary sources.

    Args:
        caption_vocab: Terms extracted from the caption.
        detection_vocab: Terms extracted from the detection labels.
        caption_time: Seconds spent on captioning.
        detection_time: Seconds spent on detection.
        detection_results: Raw detection dicts; their "score" entries are
            averaged (a missing or None score counts as 0.0).

    Returns:
        A multi-line report, one "Name: value" pair per line, ending with a
        conclusion line that names the method with more terms or states that
        both produced the same number.
    """
    caption_count = len(caption_vocab)
    detection_count = len(detection_vocab)
    overlap = set(caption_vocab) & set(detection_vocab)

    # Tolerate detections without a usable score instead of raising KeyError.
    scores = [r.get("score") or 0.0 for r in (detection_results or [])]
    avg_conf = sum(scores) / len(scores) if scores else 0.0

    # A tie must not be credited to either method.
    if caption_count > detection_count:
        conclusion = "Captioning provided more vocabulary terms."
    elif detection_count > caption_count:
        conclusion = "Detection provided more vocabulary terms."
    else:
        conclusion = "Captioning and Detection provided the same number of vocabulary terms."

    # NOTE(review): "Caption Output Length" is the character count of the
    # joined caption terms while "Detection Output Length" is a term count;
    # kept as-is, confirm whether the two were meant to be comparable.
    lines = [
        f"Captioning Vocab Terms: {caption_count}",
        f"Detection Vocab Terms: {detection_count}",
        f"Overlapping Terms: {len(overlap)}",
        f"Caption Output Length: {len(' '.join(caption_vocab))}",
        f"Detection Output Length: {detection_count}",
        f"Average Detection Confidence: {avg_conf:.2f}",
        f"Captioning Time: {caption_time:.2f}s",
        f"Detection Time: {detection_time:.2f}s",
        f"Conclusion: {conclusion}",
    ]
    return "\n".join(lines)


def _detections_to_rows(detection_results: List[Dict]) -> List[List[Any]]:
    """Flatten detection dicts into [label, score, box] rows for tabular display."""
    rows: List[List[Any]] = []
    for det in detection_results:
        box = det.get("box") or {}
        if isinstance(box, dict):
            box_text = ", ".join(f"{key}={value}" for key, value in box.items())
        else:
            box_text = str(box)
        score = round(float(det.get("score") or 0.0), 3)
        rows.append([det.get("label", ""), score, box_text])
    return rows


def process_image(image: Image.Image, language: str):
    """Run captioning and object detection on *image* and build the UI outputs.

    Args:
        image: The uploaded image, or None when nothing was uploaded.
        language: Target language name handed to the flashcard generator.

    Returns:
        A 4-tuple matching the click handler's outputs:
        (caption text, detection rows as [label, score, box],
        flashcard table, comparison statistics text).
        When *image* is None, placeholder messages and empty tables are
        returned. If captioning fails, the caption text carries the error
        message and contributes no vocabulary; if detection fails, the error
        is printed and detection contributes nothing.
    """
    if image is None:
        return "No image uploaded.", [], [], "No image."

    # Algorithm 1: Captioning
    start = time.perf_counter()
    caption_ok = True
    try:
        caption = generate_caption(image)
    except Exception as e:
        caption = f"Captioning failed: {e}"
        caption_ok = False
    caption_time = time.perf_counter() - start

    # Algorithm 2: Detection
    start = time.perf_counter()
    try:
        detection_results = detection_pipeline(image)
    except Exception as e:
        # Best-effort: keep going with captioning only, but leave a trace.
        print(f"Object detection failed: {e}")
        detection_results = []
    detection_time = time.perf_counter() - start

    # NLP: Extract vocab. An error message is not a caption, so it must not
    # be mined for vocabulary.
    caption_vocab = extract_vocab_from_caption(caption) if caption_ok else []
    detection_vocab = extract_vocab_from_detection(detection_results)
    # Sorted union keeps the flashcard order stable between runs.
    all_vocab = sorted(set(caption_vocab) | set(detection_vocab))

    # Flashcard table
    flashcard_table = generate_flashcard_table(all_vocab, language)

    # Comparison stats
    stats = compute_comparison_stats(caption_vocab, detection_vocab, caption_time, detection_time, detection_results)

    # The detection table declares Label / Score / Box columns, so hand it
    # one row per detection rather than the raw list of dicts.
    return caption, _detections_to_rows(detection_results), flashcard_table, stats


# Gradio Interface
# Layout: image + language inputs, an optional gallery of sample images, a
# trigger button, then caption / detection / flashcard / stats outputs that
# are all filled by a single process_image call.
with gr.Blocks(title="Multimodal Language Flashcard Generator") as demo:
    gr.Markdown("# Multimodal Language Flashcard Generator")
    gr.Markdown("Upload an image, select a language, and generate flashcards with captioning and object detection.")
    
    with gr.Row():
        # type="pil" so process_image receives a PIL image (or None).
        image_input = gr.Image(type="pil", label="Upload Image")
        lang_input = gr.Dropdown(["Japanese", "Korean"], label="Target Language", value="Japanese")

    # The gallery is only shown when sample images were saved at import time.
    if SAMPLE_PATHS:
        gr.Examples(
            examples=[[p] for p in SAMPLE_PATHS],
            inputs=[image_input],
            label="Sample images from flwrlabs/office-home (click one to load)",
        )

    generate_btn = gr.Button("Generate Flashcards")
    
    with gr.Row():
        caption_output = gr.Textbox(label="Image Caption", lines=2)
        detection_output = gr.Dataframe(label="Object Detection Results", headers=["Label", "Score", "Box"])
    
    # NOTE(review): generate_flashcard_table also emits a header row as its
    # first row, so the header may show up twice here -- confirm in the UI.
    flashcard_output = gr.Dataframe(label="Flashcard Table", headers=["English", "Translation", "Romanization", "Source"])
    stats_output = gr.Textbox(label="Comparison Statistics", lines=8)

    # The outputs order must match the tuple returned by process_image.
    generate_btn.click(
        fn=process_image,
        inputs=[image_input, lang_input],
        outputs=[caption_output, detection_output, flashcard_output, stats_output],
    )


# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()