Spaces:

yongyeol
/

Spellcheck

Sleeping

App Files Files Community

yongyeol committed on Jul 6, 2025

Commit

cb2bc8c

verified ·

1 Parent(s): fd7bb60

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -87

app.py CHANGED Viewed

@@ -1,87 +1,58 @@
-import gradio as gr
-import logging
-from PIL import Image
-from transformers import (
-    BlipProcessor,
-    BlipForConditionalGeneration,
-    pipeline,
-    AutoTokenizer,
-    VitsModel
-)
-import torch
-# ─────────────── 로깅 설정 ───────────────
-logging.basicConfig(level=logging.INFO)
-# ─────────────── 1. BLIP 이미지 캡셔닝 (영어 생성) ───────────────
-processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
-blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
-# ─────────────── 2. 영어→한국어 번역: NLLB 파이프라인 ───────────────
-translation_pipeline = pipeline(
-    "translation",
-    model="facebook/nllb-200-distilled-600M",
-    src_lang="eng_Latn",
-    tgt_lang="kor_Hang",
-    max_length=200
-)
-# ─────────────── 3. 한국어 TTS: VITS 직접 로딩 방식 ───────────────
-tts_model = VitsModel.from_pretrained("facebook/mms-tts-kor")
-tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kor")
-tts_model.to("cuda" if torch.cuda.is_available() else "cpu")
-def synthesize_tts(text: str):
-    inputs = tts_tokenizer(text, return_tensors="pt").to(tts_model.device)
-    with torch.no_grad():
-        output = tts_model(**inputs)
-    waveform = output.waveform.squeeze().cpu().numpy()
-    return (tts_model.config.sampling_rate, waveform)
-# ─────────────── 4. 이미지 → 캡션 + 번역 + 음성 출력 ───────────────
-def describe_and_speak(img: Image.Image):
-    logging.info("[DEBUG] describe_and_speak 함수 호출됨")
-    # ① 영어 캡션 생성
-    pixel_values = processor(images=img, return_tensors="pt").pixel_values
-    generated_ids = blip_model.generate(pixel_values, max_length=64)
-    caption_en = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
-    logging.info(f"[DEBUG] caption_en: {caption_en}")
-    print(f"[DEBUG] caption_en: {caption_en}")
-    # ② 번역
-    try:
-        result = translation_pipeline(caption_en)
-        caption_ko = result[0]['translation_text'].strip()
-    except Exception as e:
-        logging.error(f"[ERROR] 번역 오류: {e}")
-        caption_ko = ""
-    logging.info(f"[DEBUG] caption_ko: {caption_ko}")
-    print(f"[DEBUG] caption_ko: {caption_ko}")
-    if not caption_ko:
-        return "이미지에 대한 설명을 생성할 수 없습니다.", None
-    # ③ TTS 합성
-    try:
-        sr, wav = synthesize_tts(caption_ko)
-        return caption_ko, (sr, wav)
-    except Exception as e:
-        logging.error(f"[ERROR] TTS 에러: {e}")
-        return caption_ko, None
-# ─────────────── 5. Gradio 인터페이스 ───────────────
-demo = gr.Interface(
-    fn=describe_and_speak,
-    inputs=gr.Image(type="pil", label="입력 이미지"),
-    outputs=[
-        gr.Textbox(label="한글 캡션"),
-        gr.Audio(label="음성 재생", type="numpy")
-    ],
-    title="이미지 → 한글 캡션 & 음성 변환",
-    description="BLIP으로 영어 캡션 생성 → NLLB로 한국어 번역 → VITS로 음성 생성"
-)
-if __name__ == "__main__":
-    demo.launch(debug=True)

+import gradio as gr import logging from PIL import Image from transformers import ( BlipProcessor, BlipForConditionalGeneration, pipeline, AutoTokenizer, VitsModel ) import torch
+─────────────── 로깅 설정 ───────────────
+logging.basicConfig(level=logging.INFO)
+─────────────── 1. BLIP 이미지 캡셔닝 (영어 생성) ───────────────
+processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large") blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
+─────────────── 2. 영어→한국어 번역: NLLB 파이프라인 ───────────────
+translation_pipeline = pipeline( "translation", model="facebook/nllb-200-distilled-600M", src_lang="eng_Latn", tgt_lang="kor_Hang", max_length=200 )
+─────────────── 3. 한국어 TTS: VITS 직접 로딩 방식 ───────────────
+tts_model = VitsModel.from_pretrained("facebook/mms-tts-kor") tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-kor") tts_model.to("cuda" if torch.cuda.is_available() else "cpu")
+def synthesize_tts(text: str): inputs = tts_tokenizer(text, return_tensors="pt") input_ids = inputs["input_ids"].to(tts_model.device)  # ⚠ fix: use LongTensor only with torch.no_grad(): output = tts_model(input_ids=input_ids) waveform = output.waveform.squeeze().cpu().numpy() return (tts_model.config.sampling_rate, waveform)
+─────────────── 4. 이미지 → 캡션 + 번역 + 음성 출력 ───────────────
+def describe_and_speak(img: Image.Image): logging.info("[DEBUG] describe_and_speak 함수 호출됨")
+# ① 영어 캡션 생성
+pixel_values = processor(images=img, return_tensors="pt").pixel_values
+generated_ids = blip_model.generate(pixel_values, max_length=64)
+caption_en = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+logging.info(f"[DEBUG] caption_en: {caption_en}")
+print(f"[DEBUG] caption_en: {caption_en}")
+# ② 번역
+try:
+    result = translation_pipeline(caption_en)
+    caption_ko = result[0]['translation_text'].strip()
+except Exception as e:
+    logging.error(f"[ERROR] 번역 오류: {e}")
+    caption_ko = ""
+logging.info(f"[DEBUG] caption_ko: {caption_ko}")
+print(f"[DEBUG] caption_ko: {caption_ko}")
+if not caption_ko:
+    return "이미지에 대한 설명을 생성할 수 없습니다.", None
+# ③ TTS 합성
+try:
+    sr, wav = synthesize_tts(caption_ko)
+    return caption_ko, (sr, wav)
+except Exception as e:
+    logging.error(f"[ERROR] TTS 에러: {e}")
+    return caption_ko, None
+─────────────── 5. Gradio 인터페이스 ───────────────
+demo = gr.Interface( fn=describe_and_speak, inputs=gr.Image(type="pil", sources=["upload", "camera"], label="입력 이미지"), outputs=[ gr.Textbox(label="한글 캡션"), gr.Audio(label="음성 재생", type="numpy") ], title="이미지 → 한글 캡션 & 음성 변환", description="BLIP으로 영어 캡션 생성 → NLLB로 한국어 번역 → VITS로 음성 생성" )
+if name == "main": demo.launch(debug=True)