from datasets import load_dataset
from transformers import pipeline, MarianMTModel, MarianTokenizer
import soundfile as sf
import torch
import gradio as gr
import numpy as np
import sentencepiece  # required by MarianTokenizer
from PIL import Image  # Pillow backs gr.Image(type="pil")


def predict_image(image):
    # Note: the pipeline is re-created on every call; loading it once at module
    # level would make the app more responsive.
    pipe = pipeline("image-classification", model="google/vit-base-patch16-224")
    classified_image = pipe(image)
    result = classified_image[0]['label']
    return result


def translate_to_arabic(text):
    # A pipeline("translation", model="Helsinki-NLP/opus-mt-en-ar") one-liner works as well;
    # here the MarianTokenizer and MarianMTModel are used explicitly.
    model_name = "Helsinki-NLP/opus-mt-en-ar"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    inputs = tokenizer(text, return_tensors="pt", padding=True)
    outputs = model.generate(**inputs, max_length=100)
    translated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated


def text_to_speech(text):
    pipe = pipeline("text-to-speech", model="MBZUAI/speecht5_tts_clartts_ar")
    # These x-vector embeddings represent a speaker's voice characteristics.
    embedding_dataset = load_dataset("herwoww/arabic_xvector_embeddings", split="validation")
    # unsqueeze(0) turns the embedding into a 2-D tensor (a batch of one speaker).
    speaker_embedding = torch.tensor(embedding_dataset[100]['speaker_embeddings']).unsqueeze(0)
    speech = pipe(text, forward_params={'speaker_embeddings': speaker_embedding})
    return (speech['sampling_rate'], np.array(speech['audio'], dtype=np.float32))


with gr.Blocks() as app:
    gr.Markdown("Image Classification, Arabic Translation, TTS")
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload an image to classify")
            classify_image = gr.Button("Classify the Image")
            pred = gr.Textbox(label="Classification Result")
            classify_image.click(fn=predict_image, inputs=image_input, outputs=pred)
    with gr.Row():
        translated_output = gr.Textbox(label="Translated Text")
        translate_btn = gr.Button("Translate to Arabic")
        translate_btn.click(fn=translate_to_arabic, inputs=pred, outputs=translated_output)
    with gr.Row():
        tts_btn = gr.Button("Convert to Speech")
        audio_output = gr.Audio(label="Audio Output")
        tts_btn.click(fn=text_to_speech, inputs=translated_output, outputs=audio_output)

app.launch()
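
# --- Usage note (a sketch, assuming a fresh environment; adjust to your setup) ---
# The imports above imply roughly these packages; exact versions are not pinned here:
#   pip install transformers datasets torch gradio numpy sentencepiece soundfile pillow
# Calling app.launch(share=True) instead would additionally expose a temporary public Gradio link.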