Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from transformers import BlipForConditionalGeneration, BlipProcessor | |
| import torch | |
| import tempfile | |
| from gtts import gTTS | |
# Load the BLIP processor and captioning model once at import time.
# Both are created from the same pretrained checkpoint identifier.
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-large"

device = "cpu"
processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
model_image_captioning = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)
model_image_captioning = model_image_captioning.to(device)
def generate_caption_tts(image):
    """Caption an image with BLIP and synthesize the caption as speech.

    Args:
        image: The image delivered by the Gradio image input; it is passed
            unchanged to the BLIP processor.

    Returns:
        A ``(caption, audio_path)`` tuple. ``caption`` is the decoded text
        of the first generated sequence. ``audio_path`` is the path of a
        temporary ``.mp3`` file written by gTTS, or ``None`` when the
        caption is empty and there is nothing to speak.
    """
    # Keep the tensors on the same device as the model.
    inputs = processor(images=image, return_tensors="pt").to(device)

    # Generation settings are passed as keyword arguments rather than being
    # stored inside the processor's output mapping.
    outputs = model_image_captioning.generate(
        **inputs,
        max_length=20,
        num_beams=5,
    )
    caption = processor.batch_decode(outputs, skip_special_tokens=True)[0]

    # gTTS rejects empty text, so skip speech synthesis in that case.
    if not caption.strip():
        return (caption, None)

    speech = gTTS(caption, lang="en")

    # Reserve a file name and close the handle immediately: mkstemp()[1]
    # dropped the open descriptor and leaked one per request. The file is
    # deliberately not deleted here because the caller reads it after this
    # function returns.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
        audio_path = tmp.name
    speech.save(audio_path)

    return (caption, audio_path)
# Heading (HTML markup) and descriptive blurb handed to the Gradio interface.
title = "<span style='font-style: italic; font-weight: bold; color: darkred;'>模多多AI科技大语言模型</span> - PTI多模态健康交互机器人"
description = "BLPM模型:引导性语言图像预训练以实现统一视觉语言理解和生成。 请上传您的图像(或自动感知您的状况)"

# NOTE(review): gr.inputs is the legacy component namespace; newer Gradio
# releases expose gr.Image directly. Confirm the pinned Gradio version before
# migrating this call.
_image_input = gr.inputs.Image(shape=(224, 224))

# One image in; the caption text and the spoken-caption audio out.
iface = gr.Interface(
    fn=generate_caption_tts,
    inputs=_image_input,
    outputs=["text", "audio"],
    title=title,
    description=description,
)

# Alternative kept for debugging runs: iface.launch(share=True, debug=True)
iface.launch()