# speakingking / app.py — Hugging Face Space by englissi
# Update app.py (commit f85f196, verified)
import gradio as gr
from huggingface_hub import InferenceClient
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import os
from gtts import gTTS
# Gradio UI theme: soft preset with blue/cyan accents.
theme = gr.themes.Soft(primary_hue="blue", secondary_hue="cyan")
# 1. Vision model (runs locally): BLIP image captioning, downloaded/loaded at import time.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
vision_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
# 2. Text model (hosted inference endpoint). HF_TOKEN comes from the environment;
#    it may be None, in which case the client falls back to unauthenticated access.
hf_token = os.environ.get("HF_TOKEN")
text_client = InferenceClient(
    model="Qwen/Qwen2.5-7B-Instruct",
    token=hf_token
)
def _extract_description(full_text):
    """Return the [Photo Description] section of the model output.

    Falls back to a short fixed sentence when the model did not follow the
    requested two-section format (missing markers raise IndexError).
    """
    try:
        return full_text.split("[Photo Description]")[1].split("[Useful Phrases]")[0].strip()
    except IndexError:
        return "I can describe this photo for you."


def describe_photo_like_toeic(image_path):
    """Generate a kid-style TOEIC-Speaking photo description with audio.

    Pipeline:
        1. Caption the photo locally with BLIP.
        2. Expand the caption into a structured, kid-friendly TOEIC
           Speaking Part 2 description via the hosted Qwen chat model.
        3. Voice the description section (only) with gTTS.

    Args:
        image_path: Filesystem path to the uploaded photo (Gradio
            ``type="filepath"``), or a falsy value when nothing was uploaded.

    Returns:
        Tuple of ``(status_markdown, full_script_text, audio_path_or_None)``.
        On error the first element carries the error message.
    """
    if not image_path:
        return "โš ๏ธ Please upload a photo!", "", None
    try:
        # Step 1: local vision model produces a short English caption.
        raw_image = Image.open(image_path).convert('RGB')
        inputs = processor(raw_image, return_tensors="pt")
        out = vision_model.generate(**inputs, max_new_tokens=50)
        basic_caption = processor.decode(out[0], skip_special_tokens=True)

        # Step 2: TOEIC-speaking-style prompt in an elementary-school voice;
        # steers the model toward positional language (middle / left / background).
        prompt = f"""
Act as an American elementary school student taking a speaking test.
Describe this image: '{basic_caption}'
Task:
1. Start with "This is a picture of..."
2. Describe the main subject in the middle.
3. Describe what's happening on the left, right, or background.
4. Use simple kid-friendly words but follow the TOEIC Speaking Part 2 structure.
5. Finish with a feeling like "Overall, it looks..."
6. Provide 5 useful expression phrases with Korean meanings.
Format:
[Photo Description]
(Your description here)
[Useful Phrases]
1. Phrase - ๋œป
2. Phrase - ๋œป
"""
        response = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=600
        )
        full_text = response.choices[0].message.content

        # Step 3: TTS for the description section only. Best-effort: if the
        # TTS service fails (e.g. network), return the text without audio
        # rather than failing the whole request. (The original used a bare
        # ``except:`` here, which also swallowed KeyboardInterrupt/SystemExit.)
        description_part = _extract_description(full_text)
        audio_path = "description_audio.mp3"
        try:
            tts = gTTS(text=description_part, lang='en')
            tts.save(audio_path)
        except Exception:
            audio_path = None

        return f"**๐Ÿ“ท ์‚ฌ์ง„์˜ ์ฒซ์ธ์ƒ:** {basic_caption}", full_text, audio_path
    except Exception as e:
        # Top-level boundary for the Gradio callback: surface a readable error.
        return f"โŒ Error: {str(e)}", "Please check your API token or connection.", None
# ---- Gradio interface ---------------------------------------------------
with gr.Blocks(theme=theme) as demo:
    gr.Markdown("# ๐ŸŽค ์ดˆ๋“ฑํ•™์ƒ ๋ฒ„์ „ 'ํ† ์ต ์Šคํ”ผํ‚น' ์‚ฌ์ง„ ๋ฌ˜์‚ฌํ•˜๊ธฐ")
    gr.Markdown("### ์‚ฌ์ง„์„ ์˜ฌ๋ฆฌ๋ฉด AI ์–ด๋ฆฐ์ด๊ฐ€ ์‹œํ—˜์„ ๋ณด๋“ฏ ์กฐ๋ฆฌ ์žˆ๊ฒŒ ์„ค๋ช…ํ•ด ์ค๋‹ˆ๋‹ค!")

    with gr.Row():
        # Left column: photo upload plus the trigger button.
        with gr.Column():
            photo_input = gr.Image(type="filepath", label="๐Ÿ–ผ๏ธ ๋ฌ˜์‚ฌํ•  ์‚ฌ์ง„")
            describe_button = gr.Button("๐Ÿ“ข ์‚ฌ์ง„ ๋ฌ˜์‚ฌ ์‹œ์ž‘!", variant="primary")
        # Right column: status line, auto-playing narration, and the script.
        with gr.Column():
            status_box = gr.Markdown("์‚ฌ์ง„์„ ๋ถ„์„ํ•˜๋ฉด ์˜์–ด ์„ค๋ช…์ด ๋‚˜์˜ต๋‹ˆ๋‹ค.")
            audio_player = gr.Audio(label="๐Ÿ”Š AI์˜ ์„ค๋ช… ๋“ฃ๊ธฐ", type="filepath", autoplay=True)
            script_box = gr.Textbox(label="๐Ÿ“ ์‚ฌ์ง„ ๋ฌ˜์‚ฌ ์Šคํฌ๋ฆฝํŠธ", lines=12)

    # Wire the button to the pipeline; outputs map to (status, script, audio).
    describe_button.click(
        describe_photo_like_toeic,
        inputs=photo_input,
        outputs=[status_box, script_box, audio_player],
    )

if __name__ == "__main__":
    demo.launch()