Spaces:
Sleeping
Sleeping
File size: 2,277 Bytes
b46c7c0 c9b1147 6e41706 b46c7c0 c9b1147 b46c7c0 c9b1147 b46c7c0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
import base64
import os
import tempfile
from io import BytesIO

import cv2
import gradio as gr
import gtts as gt
import numpy as np
import requests
from googletrans import Translator
from gradio_client import Client
from PIL import Image
def trans(text, lang='ta'):
    """Translate *text* into *lang* and synthesize the translation as speech.

    Args:
        text: Text to translate. No source language is passed to the
            translator.
        lang: Target language code, used both as the translator's ``dest``
            and as the gTTS ``lang``. Defaults to ``'ta'``.

    Returns:
        Path of an ``.mp3`` file holding the spoken translation. The file is
        created with ``delete=False``, so it is not removed automatically.

    Raises:
        ValueError: If *text* is empty or contains only whitespace.
    """
    # Fail fast with a clear message instead of making a network round-trip
    # for input that cannot produce any speech.
    if not text or not text.strip():
        raise ValueError("No text to translate.")

    translated = Translator().translate(text, dest=lang)
    speech = gt.gTTS(text=translated.text, lang=lang)

    # Reserve a unique path, then close our own handle before gTTS writes to
    # the file by name. Keeping the handle open leaks a file descriptor per
    # call, and an open NamedTemporaryFile cannot be reopened by name on
    # Windows.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
        audio_path = tmp.name

    try:
        speech.save(audio_path)
    except Exception:
        # Do not leave an empty orphan file behind when synthesis fails.
        os.unlink(audio_path)
        raise
    return audio_path
def object_recognition(image_array, lang):
    """Caption an image with the hosted BLIP model and speak the caption.

    The image is JPEG-encoded in memory, posted to the Hugging Face
    Inference API, and the returned caption is handed to ``trans``.

    Args:
        image_array: Image as a NumPy array accepted by
            ``PIL.Image.fromarray``.
        lang: Target language code forwarded to ``trans``.

    Returns:
        The value returned by ``trans`` for the caption (the path of the
        generated audio file).

    Raises:
        requests.HTTPError: If the API responds with an error status.
        RuntimeError: If the response JSON does not contain a caption.
    """
    API_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large"
    REQUEST_TIMEOUT = 60  # seconds; without a timeout a stalled API hangs the request

    # NOTE(review): the bearer token used to be hard-coded on this line, which
    # exposes the credential to anyone who can read the file. It is now read
    # from the HF_API_TOKEN environment variable; the previously committed
    # token should be revoked. Without the variable the request is sent
    # unauthenticated.
    token = os.environ.get("HF_API_TOKEN")
    headers = {"Authorization": f"Bearer {token}"} if token else {}

    image = Image.fromarray(image_array)
    # JPEG cannot store an alpha channel or a palette; normalize such images
    # so that saving does not fail.
    if image.mode not in ("RGB", "L"):
        image = image.convert("RGB")

    # Encode in memory instead of through a fixed "temp_image.jpg" in the
    # working directory: concurrent requests would overwrite each other's
    # file, and the file was never cleaned up.
    buffer = BytesIO()
    image.save(buffer, format="JPEG")

    response = requests.post(
        API_URL,
        headers=headers,
        data=buffer.getvalue(),
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    output = response.json()

    try:
        caption = output[0]["generated_text"]
    except (KeyError, IndexError, TypeError) as err:
        # Surface the payload instead of an opaque KeyError/IndexError.
        raise RuntimeError(f"Unexpected captioning response: {output!r}") from err

    return trans(caption, lang)
def ocr_detection(image_array, lang):
    """Extract text from an image via the hosted OCR service and speak it.

    The image is PNG-encoded in memory, sent as a base64 data URL to the
    OCR Space's ``/run/predict`` endpoint with the ``"PaddleOCR"`` option,
    and the entries of the response's ``"data"`` list are joined with spaces
    and handed to ``trans``.

    Args:
        image_array: Image as a NumPy array accepted by
            ``PIL.Image.fromarray``.
        lang: Target language code forwarded to ``trans``.

    Returns:
        The value returned by ``trans`` for the recognized text (the path of
        the generated audio file).

    Raises:
        requests.HTTPError: If the OCR service responds with an error status.
    """
    REQUEST_TIMEOUT = 60  # seconds; without a timeout a stalled service hangs the request

    image = Image.fromarray(image_array)
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    image_base64 = base64.b64encode(buffered.getvalue()).decode()

    response = requests.post(
        "https://pragnakalp-ocr-image-to-text.hf.space/run/predict",
        json={
            "data": [
                "PaddleOCR",
                f"data:image/png;base64,{image_base64}",
            ]
        },
        timeout=REQUEST_TIMEOUT,
    )
    # Fail on HTTP errors here rather than on a confusing JSON/parse error.
    response.raise_for_status()

    data = response.json().get("data", [])
    # str.join raises TypeError on non-string items; coerce defensively and
    # drop missing entries.
    text = " ".join(str(item) for item in data if item is not None)
    return trans(text, lang)
def operator(image_array, value, lang):
    """Dispatch to the operation selected by *value* and return its audio.

    Args:
        image_array: Image passed through to the selected operation.
        value: Operation code: ``"1"`` runs ``object_recognition``, ``"2"``
            runs ``ocr_detection``.
        lang: Target language code forwarded to the selected operation.

    Returns:
        The audio file produced by the selected operation. For any other
        *value*, the result of ``trans`` for a fixed apology message.
    """
    if value == "1":
        return object_recognition(image_array, lang)
    if value == "2":
        return ocr_detection(image_array, lang)
    # Unknown operation code: answer with a spoken apology.
    return trans("Sorry, I can't perform this operation.", lang)
# Build the Gradio UI around `operator`: one image input and two text inputs
# (operation code and language code), with audio as the output.
iface = gr.Interface(
    fn=operator,
    inputs=["image", "text", "text"],
    outputs="audio",
)
# Start the app; share=True requests a public share link from Gradio.
iface.launch(share=True)
|