# speakingking / app.py — Hugging Face Space by englissi
# Update app.py (commit f85f196, verified)
import gradio as gr
from huggingface_hub import InferenceClient
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import os
from gtts import gTTS
# Gradio UI theme: soft preset with blue/cyan accents.
theme = gr.themes.Soft(primary_hue="blue", secondary_hue="cyan")
# 1. Vision model (runs locally): BLIP image captioning, downloaded/loaded at import time.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
vision_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
# 2. Text model (hosted inference endpoint). HF_TOKEN comes from the environment;
#    it may be None, in which case the client falls back to unauthenticated access.
hf_token = os.environ.get("HF_TOKEN")
text_client = InferenceClient(
    model="Qwen/Qwen2.5-7B-Instruct",
    token=hf_token
)
def _extract_description(full_text):
    """Return the [Photo Description] section of the model output.

    Falls back to a short fixed sentence when the model did not follow the
    requested two-section format (missing markers raise IndexError).
    """
    try:
        return full_text.split("[Photo Description]")[1].split("[Useful Phrases]")[0].strip()
    except IndexError:
        return "I can describe this photo for you."


def describe_photo_like_toeic(image_path):
    """Generate a kid-style TOEIC-Speaking photo description with audio.

    Pipeline:
        1. Caption the photo locally with BLIP.
        2. Expand the caption into a structured, kid-friendly TOEIC
           Speaking Part 2 description via the hosted Qwen chat model.
        3. Voice the description section (only) with gTTS.

    Args:
        image_path: Filesystem path to the uploaded photo (Gradio
            ``type="filepath"``), or a falsy value when nothing was uploaded.

    Returns:
        Tuple of ``(status_markdown, full_script_text, audio_path_or_None)``.
        On error the first element carries the error message.
    """
    if not image_path:
        return "โš ๏ธ Please upload a photo!", "", None
    try:
        # Step 1: local vision model produces a short English caption.
        raw_image = Image.open(image_path).convert('RGB')
        inputs = processor(raw_image, return_tensors="pt")
        out = vision_model.generate(**inputs, max_new_tokens=50)
        basic_caption = processor.decode(out[0], skip_special_tokens=True)

        # Step 2: TOEIC-speaking-style prompt in an elementary-school voice;
        # steers the model toward positional language (middle / left / background).
        prompt = f"""
Act as an American elementary school student taking a speaking test.
Describe this image: '{basic_caption}'
Task:
1. Start with "This is a picture of..."
2. Describe the main subject in the middle.
3. Describe what's happening on the left, right, or background.
4. Use simple kid-friendly words but follow the TOEIC Speaking Part 2 structure.
5. Finish with a feeling like "Overall, it looks..."
6. Provide 5 useful expression phrases with Korean meanings.
Format:
[Photo Description]
(Your description here)
[Useful Phrases]
1. Phrase - ๋œป
2. Phrase - ๋œป
"""
        response = text_client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=600
        )
        full_text = response.choices[0].message.content

        # Step 3: TTS for the description section only. Best-effort: if the
        # TTS service fails (e.g. network), return the text without audio
        # rather than failing the whole request. (The original used a bare
        # ``except:`` here, which also swallowed KeyboardInterrupt/SystemExit.)
        description_part = _extract_description(full_text)
        audio_path = "description_audio.mp3"
        try:
            tts = gTTS(text=description_part, lang='en')
            tts.save(audio_path)
        except Exception:
            audio_path = None

        return f"**๐Ÿ“ท ์‚ฌ์ง„์˜ ์ฒซ์ธ์ƒ:** {basic_caption}", full_text, audio_path
    except Exception as e:
        # Top-level boundary for the Gradio callback: surface a readable error.
        return f"โŒ Error: {str(e)}", "Please check your API token or connection.", None
# ---- Gradio interface ---------------------------------------------------
with gr.Blocks(theme=theme) as demo:
    gr.Markdown("# ๐ŸŽค ์ดˆ๋“ฑํ•™์ƒ ๋ฒ„์ „ 'ํ† ์ต ์Šคํ”ผํ‚น' ์‚ฌ์ง„ ๋ฌ˜์‚ฌํ•˜๊ธฐ")
    gr.Markdown("### ์‚ฌ์ง„์„ ์˜ฌ๋ฆฌ๋ฉด AI ์–ด๋ฆฐ์ด๊ฐ€ ์‹œํ—˜์„ ๋ณด๋“ฏ ์กฐ๋ฆฌ ์žˆ๊ฒŒ ์„ค๋ช…ํ•ด ์ค๋‹ˆ๋‹ค!")

    with gr.Row():
        # Left column: photo upload plus the trigger button.
        with gr.Column():
            photo_input = gr.Image(type="filepath", label="๐Ÿ–ผ๏ธ ๋ฌ˜์‚ฌํ•  ์‚ฌ์ง„")
            describe_button = gr.Button("๐Ÿ“ข ์‚ฌ์ง„ ๋ฌ˜์‚ฌ ์‹œ์ž‘!", variant="primary")
        # Right column: status line, auto-playing narration, and the script.
        with gr.Column():
            status_box = gr.Markdown("์‚ฌ์ง„์„ ๋ถ„์„ํ•˜๋ฉด ์˜์–ด ์„ค๋ช…์ด ๋‚˜์˜ต๋‹ˆ๋‹ค.")
            audio_player = gr.Audio(label="๐Ÿ”Š AI์˜ ์„ค๋ช… ๋“ฃ๊ธฐ", type="filepath", autoplay=True)
            script_box = gr.Textbox(label="๐Ÿ“ ์‚ฌ์ง„ ๋ฌ˜์‚ฌ ์Šคํฌ๋ฆฝํŠธ", lines=12)

    # Wire the button to the pipeline; outputs map to (status, script, audio).
    describe_button.click(
        describe_photo_like_toeic,
        inputs=photo_input,
        outputs=[status_box, script_box, audio_player],
    )

if __name__ == "__main__":
    demo.launch()