import gradio as gr
from transformers import pipeline
import logging
import time
import uuid
import soundfile as sf
from model import get_pretrained_model, language_to_models

# text-to-speech code from https://huggingface.co/spaces/k2-fsa/text-to-speech/blob/main/app.py
image_to_text_model = pipeline("image-classification", model="microsoft/beit-base-patch16-224-pt22k-ft22k")
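# The image-classification pipeline returns a list of {"label": ..., "score": ...}
# dicts sorted by descending score; image_to_text below keeps only the top label.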
def build_html_output(s: str, style: str = "result_item_success"):
    return f"""
    <div class='result'>
      <div class='result_item {style}'>
        {s}
      </div>
    </div>
    """
def image_to_text(input_image):
    # Convert the image to text: take the top-ranked classification label.
    text_output = image_to_text_model(input_image)[0]["label"]
    print(text_output)
    return text_output
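# Usage sketch (hypothetical image file; the label text depends on the model's
# ImageNet-22k class names):
#   from PIL import Image
#   image_to_text(Image.open("cat.jpg"))  # -> e.g. "tabby, tabby cat"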
def text_to_speech(language: str, repo_id: str, text: str, sid: str, speed: float):
    logging.info(f"Input text: {text}. sid: {sid}, speed: {speed}")
    sid = int(sid)
    tts = get_pretrained_model(repo_id, speed)

    start = time.time()
    audio = tts.generate(text, sid=sid)
    end = time.time()

    if len(audio.samples) == 0:
        raise ValueError(
            "Error in generating audios. Please read previous error messages."
        )

    duration = len(audio.samples) / audio.sample_rate
    elapsed_seconds = end - start
    rtf = elapsed_seconds / duration

    info = f"""
    Wave duration  : {duration:.3f} s <br/>
    Processing time: {elapsed_seconds:.3f} s <br/>
    RTF: {elapsed_seconds:.3f}/{duration:.3f} = {rtf:.3f} <br/>
    """
    logging.info(info)
    logging.info(f"\nrepo_id: {repo_id}\ntext: {text}\nsid: {sid}\nspeed: {speed}")

    filename = f"{uuid.uuid4()}.wav"
    sf.write(
        filename,
        audio.samples,
        samplerate=audio.sample_rate,
        subtype="PCM_16",
    )
    return filename, build_html_output(info)
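# Standalone usage sketch (assumes "English" is a key in language_to_models;
# adjust to whatever languages model.py actually provides):
#   wav_path, html = text_to_speech(
#       "English", language_to_models["English"][0], "hello world", sid="0", speed=1.0
#   )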
# Wire everything together: classify the uploaded image, then speak the label.
language_choices = list(language_to_models.keys())

def image_to_audio(input_image, sid: str, speed: float):
    txt = image_to_text(input_image)
    # Use the first available language/model pair from model.py for synthesis.
    language = language_choices[0]
    repo_id = language_to_models[language][0]
    filename, info = text_to_speech(language, repo_id, txt, sid, speed)
    return txt, filename, info

inputs_img = gr.Image(type="pil")
input_sid = gr.Textbox(
    label="Speaker ID",
    info="Speaker ID",
    lines=1,
    max_lines=1,
    value="0",
    placeholder="Speaker ID. Valid only for a multi-speaker model",
)
input_speed = gr.Slider(
    minimum=0.1,
    maximum=10,
    value=1,
    step=0.1,
    label="Speed (larger -> faster; smaller -> slower)",
)
output_txt = gr.Textbox(label="Interpretation", lines=1, max_lines=1, placeholder="Interpretation")
output_audio = gr.Audio(label="Output")
output_info = gr.HTML(label="Info")

gr.Interface(
    fn=image_to_audio,
    title="Image to Text Interpretation",
    inputs=[inputs_img, input_sid, input_speed],
    outputs=[output_txt, output_audio, output_info],
    description="image to audio demo",
    article="",
).launch()