# Hugging Face Space app — TOEIC-speaking photo-description demo
# Third-party dependencies: Gradio (UI), huggingface_hub (remote text model),
# transformers (local BLIP captioner), Pillow (image loading), gTTS (speech).
import gradio as gr
from huggingface_hub import InferenceClient
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import os
from gtts import gTTS

# Theme settings for the Gradio UI.  (Original comment was Korean.)
theme = gr.themes.Soft(primary_hue="blue", secondary_hue="cyan")

# 1. Vision model (runs locally): BLIP base image-captioning checkpoint.
#    Downloaded from the HF Hub on first run.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
vision_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# 2. Text model via the HF Inference API.  HF_TOKEN is read from the
#    environment; InferenceClient accepts token=None (anonymous, rate-limited).
hf_token = os.environ.get("HF_TOKEN")
text_client = InferenceClient(
    model="Qwen/Qwen2.5-7B-Instruct",
    token=hf_token
)
def describe_photo_like_toeic(image_path):
    """Generate a TOEIC-Speaking-style English description of an uploaded photo.

    Pipeline:
      1. Caption the image locally with BLIP.
      2. Expand the caption into a kid-friendly TOEIC Part-2 script via the
         remote chat model (``text_client``).
      3. Synthesize the description section to MP3 with gTTS.

    Args:
        image_path: Filesystem path to the uploaded image (Gradio
            ``type="filepath"``), or None/"" when nothing was uploaded.

    Returns:
        A 3-tuple ``(status_markdown, full_script_text, audio_path_or_None)``
        matching the three Gradio output components.  On failure the first
        element carries the error text and the audio path is None.
    """
    if not image_path:
        return "โ ๏ธ Please upload a photo!", "", None
    try:
        # Step 1: local captioning — BLIP produces a short English caption.
        raw_image = Image.open(image_path).convert('RGB')
        inputs = processor(raw_image, return_tensors="pt")
        out = vision_model.generate(**inputs, max_new_tokens=50)
        basic_caption = processor.decode(out[0], skip_special_tokens=True)

        # Step 2: prompt the chat model for an elementary-school-style
        # TOEIC Speaking Part 2 script built around the caption.  The
        # bracketed section headers below are also used as split markers
        # for the TTS step, so they must stay in sync with step 3.
        prompt = f"""
Act as an American elementary school student taking a speaking test.
Describe this image: '{basic_caption}'
Task:
1. Start with "This is a picture of..."
2. Describe the main subject in the middle.
3. Describe what's happening on the left, right, or background.
4. Use simple kid-friendly words but follow the TOEIC Speaking Part 2 structure.
5. Finish with a feeling like "Overall, it looks..."
6. Provide 5 useful expression phrases with Korean meanings.
Format:
[Photo Description]
(Your description here)
[Useful Phrases]
1. Phrase - ๋ป
2. Phrase - ๋ป
"""
        response = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=600
        )
        full_text = response.choices[0].message.content

        # Step 3: TTS — speak only the [Photo Description] section.
        audio_path = "description_audio.mp3"
        try:
            description_part = full_text.split("[Photo Description]")[1].split("[Useful Phrases]")[0].strip()
            tts = gTTS(text=description_part, lang='en')
            tts.save(audio_path)
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt/
            # SystemExit.  Narrowed to Exception; the best-effort fallback
            # audio (e.g. when the model ignored the section format, an
            # IndexError) is preserved.
            tts = gTTS(text="I can describe this photo for you.", lang='en')
            tts.save(audio_path)

        return f"**๐ท ์ฌ์ง์ ์ฒซ์ธ์:** {basic_caption}", full_text, audio_path
    except Exception as e:
        # Top-level boundary for the Gradio callback: surface the error in
        # the status component instead of crashing the UI.
        return f"โ Error: {str(e)}", "Please check your API token or connection.", None
# UI layout (Gradio Blocks).  NOTE: component-creation order inside each
# context manager determines on-screen order, so the statement sequence
# below is load-bearing and must not be reordered.
with gr.Blocks(theme=theme) as demo:
    # Title and subtitle (Korean text preserved as-is from the source).
    gr.Markdown("# ๐ค ์ด๋ฑํ์ ๋ฒ์ 'ํ ์ต ์คํผํน' ์ฌ์ง ๋ฌ์ฌํ๊ธฐ")
    gr.Markdown("### ์ฌ์ง์ ์ฌ๋ฆฌ๋ฉด AI ์ด๋ฆฐ์ด๊ฐ ์ํ์ ๋ณด๋ฏ ์กฐ๋ฆฌ ์๊ฒ ์ค๋ช ํด ์ค๋๋ค!")
    with gr.Row():
        # Left column: image upload + trigger button.
        with gr.Column():
            img = gr.Image(type="filepath", label="๐ผ๏ธ ๋ฌ์ฌํ ์ฌ์ง")
            btn = gr.Button("๐ข ์ฌ์ง ๋ฌ์ฌ ์์!", variant="primary")
        # Right column: status line, auto-playing audio, and the full script.
        with gr.Column():
            status = gr.Markdown("์ฌ์ง์ ๋ถ์ํ๋ฉด ์์ด ์ค๋ช ์ด ๋์ต๋๋ค.")
            audio = gr.Audio(label="๐ AI์ ์ค๋ช ๋ฃ๊ธฐ", type="filepath", autoplay=True)
            result = gr.Textbox(label="๐ ์ฌ์ง ๋ฌ์ฌ ์คํฌ๋ฆฝํธ", lines=12)
    # Wire the button to the pipeline: one input (image path), three outputs
    # matching describe_photo_like_toeic's return tuple.
    btn.click(describe_photo_like_toeic, inputs=img, outputs=[status, result, audio])

if __name__ == "__main__":
    demo.launch()