VISPA2 / app.py
Sabari231024's picture
Update app.py
6e41706
import gradio as gr
import requests
import gtts as gt
from PIL import Image
from gradio_client import Client
from googletrans import Translator
import cv2
import numpy as np
import tempfile
import base64
from io import BytesIO
def trans(text, lang='ta'):
    """Translate *text* into *lang* and synthesize it to an MP3 file.

    Args:
        text: The text to translate and speak.
        lang: Language code used both as the translation target and as
            the text-to-speech language. Defaults to ``'ta'``.

    Returns:
        Path of a temporary ``.mp3`` file holding the spoken translation.
        The file is created with ``delete=False``, so it is not removed
        automatically; cleanup is left to the caller.
    """
    translator = Translator()
    translated = translator.translate(text, dest=lang)
    tts = gt.gTTS(text=translated.text, lang=lang)

    # Only reserve a unique path here and close the handle right away:
    # keeping it open leaks a file descriptor per call and prevents the
    # file from being reopened for writing on some platforms.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio_file:
        audio_path = temp_audio_file.name

    tts.save(audio_path)
    return audio_path
def object_recognition(image_array, lang):
    """Caption an image with the hosted BLIP model and speak the caption.

    Args:
        image_array: Image data as a NumPy array, as accepted by
            ``PIL.Image.fromarray``.
        lang: Language code passed on to ``trans`` for the spoken output.

    Returns:
        Path of the audio file produced by ``trans`` for the caption.

    Raises:
        requests.HTTPError: If the inference endpoint answers with an
            HTTP error status.
        RuntimeError: If the response body does not have the expected
            ``[{"generated_text": ...}]`` shape.
    """
    import os

    API_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large"
    # NOTE(review): a credential committed to source control should be
    # revoked and supplied through the environment only. The literal is
    # kept as a fallback so existing deployments keep working.
    token = os.environ.get("HF_TOKEN", "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")
    headers = {"Authorization": f"Bearer {token}"}

    image = Image.fromarray(image_array)
    # JPEG cannot store alpha or palette modes; normalise before encoding.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Encode in memory instead of via a fixed "temp_image.jpg" on disk, so
    # concurrent requests cannot overwrite each other's image.
    buffer = BytesIO()
    image.save(buffer, format="JPEG")

    response = requests.post(
        API_URL,
        headers=headers,
        data=buffer.getvalue(),
        timeout=60,  # do not hang the UI forever on a stalled connection
    )
    response.raise_for_status()
    output = response.json()

    try:
        result = output[0]['generated_text']
    except (KeyError, IndexError, TypeError) as err:
        raise RuntimeError(
            f"Unexpected response from captioning API: {output!r}"
        ) from err

    return trans(result, lang)
def ocr_detection(image_array, lang):
    """Extract text from an image via the remote OCR service and speak it.

    The image is PNG-encoded, sent as a base64 data URL to the
    ``pragnakalp-ocr-image-to-text`` prediction endpoint with the
    ``"PaddleOCR"`` option, and the returned text is passed to ``trans``.

    Args:
        image_array: Image data as a NumPy array, as accepted by
            ``PIL.Image.fromarray``.
        lang: Language code passed on to ``trans`` for the spoken output.

    Returns:
        Path of the audio file produced by ``trans``.

    Raises:
        requests.HTTPError: If the OCR endpoint answers with an HTTP
            error status.
    """
    image = Image.fromarray(image_array)
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    image_base64 = base64.b64encode(buffered.getvalue()).decode()

    response = requests.post(
        "https://pragnakalp-ocr-image-to-text.hf.space/run/predict",
        json={
            "data": [
                "PaddleOCR",
                f"data:image/png;base64,{image_base64}",
            ]
        },
        timeout=60,  # do not hang the UI forever on a stalled connection
    )
    response.raise_for_status()
    payload = response.json()

    # "data" may be missing or null; skip empty entries and coerce the
    # rest to str so the join cannot fail on non-string items.
    data = payload.get("data") or []
    text = " ".join(str(item) for item in data if item).strip()

    if not text:
        # Nothing was recognised: speak a short notice rather than handing
        # an empty string to the speech synthesizer.
        text = "No text was detected in the image."

    return trans(text, lang)
def operator(image_array, value, lang):
    """Route a request to the selected image operation.

    Args:
        image_array: Image data forwarded to the selected operation.
        value: Operation selector; ``"1"`` runs object recognition and
            ``"2"`` runs OCR. Anything else yields a spoken apology.
        lang: Language code for the spoken result. Blank input falls back
            to ``"ta"``, the same default ``trans`` declares.

    Returns:
        Path of the audio file produced by the selected operation.
    """
    # Both values come from free-form text boxes, so tolerate surrounding
    # whitespace and blank input instead of comparing the raw strings.
    choice = str(value or "").strip()
    lang = str(lang or "").strip() or "ta"

    if choice == "1":
        return object_recognition(image_array, lang)
    if choice == "2":
        return ocr_detection(image_array, lang)
    return trans("Sorry, I can't perform this operation.", lang)
# Build the web UI: one image input and two text inputs are passed to
# `operator`, whose returned audio file is played back. Then start the
# server with public sharing requested.
iface = gr.Interface(
    fn=operator,
    inputs=["image", "text", "text"],
    outputs="audio",
)
iface.launch(share=True)