SandraCLV committed on
Commit
9e26359
·
1 Parent(s): 1cca5e2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -17
app.py CHANGED
@@ -1,27 +1,35 @@
1
  import gradio as gr
2
- from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
3
  import torch
4
 
5
- # Cargar el modelo y el procesador
6
- model = Wav2Vec2ForCTC.from_pretrained("openai/whisper-large-v2")
7
- processor = Wav2Vec2Processor.from_pretrained("openai/whisper-large-v2")
8
 
9
- def asr(audio_file_path):
10
- # Cargar archivo de audio
11
- input_audio, _ = librosa.load(audio_file_path, sr=16000)
12
 
13
- # Preprocesar audio
14
- input_values = processor(input_audio, return_tensors="pt", sampling_rate=16000).input_values
 
 
15
 
16
- # Realizar inferencia
17
- logits = model(input_values).logits
18
 
19
- # Decodificar los logits a texto
20
- predicted_ids = torch.argmax(logits, dim=-1)
21
- transcription = processor.decode(predicted_ids[0])
22
 
23
- return transcription
 
 
 
 
 
 
 
 
24
 
 
 
25
  # Crear interfaz de Gradio
26
- iface = gr.Interface(fn=asr, inputs=gr.inputs.Audio(source="microphone", type="file"), outputs="text")
27
- iface.launch()
 
1
  import gradio as gr
2
+ from transformers import pipeline
3
  import torch
4
 
5
+ # Cargar el modelo que convierte imagen a texto
6
+ image_to_text_model = pipeline("image-classification")
 
7
 
8
+ # Cargar el modelo que genera audio a partir de texto
9
+ text_to_audio_model = pipeline("text-to-speech")
 
10
 
11
+ # Función para la interfaz de Gradio
12
+ def image_to_audio(input_image):
13
+ # Convertir la imagen a texto
14
+ text_output = image_to_text_model(input_image)[0]['label']
15
 
16
+ # Generar audio a partir del texto
17
+ audio_output = text_to_audio_model(text_output)[0]['audio']
18
 
19
+ return audio_output
 
 
20
 
21
+ # Interfaz Gradio
22
+ iface = gr.Interface(
23
+ fn=image_to_audio,
24
+ inputs=gr.Image(),
25
+ outputs="audio",
26
+ live=True,
27
+ interpretation="default",
28
+ capture_session=True
29
+ )
30
 
31
+ # Ejecutar la interfaz
32
+ iface.launch()
33
  # Crear interfaz de Gradio
34
+ #iface = gr.Interface(fn=asr, inputs=gr.inputs.Audio(source="microphone", type="file"), outputs="text")
35
+ #iface.launch()