hebaadel commited on
Commit
4bd3163
·
verified ·
1 Parent(s): 118c8f5

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -0
app.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ from transformers import pipeline
3
+ import soundfile as sf
4
+ import torch
5
+ import gradio as gr
6
+ import numpy as np
7
+
8
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
9
+
10
# Translation backbone: Marian MT model for English -> Arabic.
# Loaded once at import time so the translation functions below can reuse it.
model_name = "Helsinki-NLP/opus-mt-en-ar"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
13
+
14
def translate_to_arabic(text):
    """Translate English *text* to Arabic with the module-level Marian model.

    NOTE(review): this function is redefined twice further down the file;
    only the last definition is the one Gradio actually binds to.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True)
    outputs = model.generate(**inputs, max_length=100)
    translated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated
19
+
20
# Test it
# NOTE(review): this runs at import time — it performs a full model inference
# on every app startup; consider guarding with `if __name__ == "__main__":`.
print(translate_to_arabic("Hello, how are you?"))
22
+
23
def predict_image(image):
    """Return the top ImageNet label for *image* (a PIL image from gr.Image).

    Fix: the original rebuilt (and potentially re-downloaded) the ViT
    classification pipeline on every button click. The pipeline is now
    created lazily on first use and cached on the function object.
    """
    classifier = getattr(predict_image, "_pipe", None)
    if classifier is None:
        classifier = pipeline("image-classification", model="google/vit-base-patch16-224")
        predict_image._pipe = classifier  # cache for subsequent calls
    predictions = classifier(image)
    # The pipeline returns candidates sorted by score; keep only the best label.
    return predictions[0]["label"]
29
+
30
def translate_to_arabic(text):
    """Translate English *text* to Arabic via a transformers pipeline.

    NOTE(review): dead code — this shadows the first definition and is itself
    shadowed by the third definition below, so it is never the version in
    effect. It also rebuilds the pipeline on every call, which is slow.
    """
    pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ar")
    result=pipe(text , max_length=100)
    return result[0]['translation_text']
34
+
35
+ # Use a pipeline as a high-level helper.
+ # Warning: the "translation" pipeline type is no longer supported in transformers v5.
+ # You must load the model directly (see below) or downgrade to v4.x with:
+ #   pip install "transformers<5.0.0"
39
+ from transformers import pipeline
40
+
41
# NOTE(review): this module-level pipeline is created but never used — every
# function above/below builds its own translator or calls tokenizer/model directly.
pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ar")
42
+
43
def translate_to_arabic(text):
    """Translate English *text* into Arabic using the module-level Marian model.

    Tokenizes the input, generates up to 100 tokens, and decodes the first
    (and only) output sequence back into a plain string.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True)
    generated = model.generate(**encoded, max_length=100)
    return tokenizer.decode(generated[0], skip_special_tokens=True)
48
+
49
+ """✅ Because SpeechT5 requires a speaker embedding
50
+
51
+ The model MBZUAI/speecht5_tts_clartts_ar is based on SpeechT5, and SpeechT5 needs a speaker embedding to generate speech.
52
+
53
+ Think of it like this:
54
+
55
+ The text tells the model what to say.
56
+
57
+ The speaker embedding tells the model what the voice should sound like.
58
+
59
+ Without a speaker embedding, the model does not know what voice to use → and the pipeline will fail or produce wrong audio.
60
+
61
+ What is the dataset?
62
+
63
+ herwoww/arabic_xvector_embeddings contains pre-computed speaker embeddings (x-vectors) for different Arabic speakers.
64
+
65
+ Each embedding is like a "voice fingerprint".
66
+
67
+ You pick one of them to generate speech in that voice.
68
+
69
+ In your code, you choose speaker number 100:
70
+ """
71
+
72
def text_to_speech(text):
    """Synthesize Arabic speech for *text*.

    Returns a ``(sampling_rate, waveform)`` tuple in the format ``gr.Audio``
    expects (waveform as a float32 numpy array).

    SpeechT5 needs a speaker embedding (a voice "fingerprint") in addition to
    the text, so speaker #100 from a dataset of pre-computed Arabic x-vectors
    is used.  Fix: the original re-created the TTS pipeline AND re-downloaded
    the embeddings dataset on every call; both are now loaded once and cached
    on the function object.
    """
    cache = getattr(text_to_speech, "_cache", None)
    if cache is None:
        tts = pipeline("text-to-speech", model="MBZUAI/speecht5_tts_clartts_ar")
        # These embeddings represent a speaker's voice characteristics.
        embeddings = load_dataset("herwoww/arabic_xvector_embeddings", split="validation")
        # unsqueeze(0): the model expects a batched (1, dim) tensor, not a 1-D vector.
        speaker_embedding = torch.tensor(embeddings[100]['speaker_embeddings']).unsqueeze(0)
        cache = (tts, speaker_embedding)
        text_to_speech._cache = cache
    tts, speaker_embedding = cache
    speech = tts(text, forward_params={'speaker_embeddings': speaker_embedding})
    return (speech['sampling_rate'], np.array(speech['audio'], dtype=np.float32))
85
+
86
# NOTE(review): PIL.Image is imported but never referenced below; gr.Image
# with type="pil" handles the conversion itself. Kept to avoid breaking any use
# outside this view.
from PIL import Image

with gr.Blocks() as app:
    gr.Markdown("Image Classification, Arabic Translation, TTS")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Upload the Image to classify it")
            classify_image = gr.Button("Classify the Image")
            # Fix: label previously read "Classifcation Result" (typo).
            pred = gr.Textbox(label="Classification Result")

    classify_image.click(fn=predict_image, inputs=image_input, outputs=pred)

    with gr.Row():
        translated_output = gr.Textbox(label="Translated Text")
        translate_btn = gr.Button("Translate to Arabic")

    # The classification result feeds the translator.
    translate_btn.click(fn=translate_to_arabic, inputs=pred, outputs=translated_output)

    with gr.Row():
        tts_btn = gr.Button("Convert to Speech")
        audio_output = gr.Audio(label="Audio Output")

    # The translated Arabic text feeds the TTS step.
    tts_btn.click(fn=text_to_speech, inputs=translated_output, outputs=audio_output)

app.launch()