Spaces:

SandraCLV
/

injectModel1intoModel2

Sleeping

App Files Files Community

SandraCLV committed on Nov 1, 2023

Commit

7807f29

1 Parent(s): 6a3e4b6

Update app.py

Browse files

Files changed (1) hide show

app.py +86 -6

app.py CHANGED Viewed

@@ -5,9 +5,23 @@ import librosa
 import datasets
 from transformers.pipelines.pt_utils import KeyDataset
 from tqdm.auto import tqdm
 image_to_text_model = pipeline("image-classification",model="microsoft/beit-base-patch16-224-pt22k-ft22k")
 def image_to_text(input_image):
     # Convert the image to text
@@ -16,10 +30,76 @@ def image_to_text(input_image):
     #texts = transcriber(text_output)
     return text_output
-gr.Interface(fn=image_to_text,
-                           title="Image to Text",
-                           inputs=gr.Image(type='pil'),
-                           outputs=[gr.Textbox(label="Output")],
-                           description="Object Recognition using Microsoft BEIT",
-                           article = "Author: <a href=\"https://huggingface.co/rowel\">Rowel Atienza</a>",
                            ).launch()

 import datasets
 from transformers.pipelines.pt_utils import KeyDataset
 from tqdm.auto import tqdm
+import logging
+import time
+import uuid
+import soundfile as sf
+from model import get_pretrained_model, language_to_models
+#text to speech code from https://huggingface.co/spaces/k2-fsa/text-to-speech/blob/main/app.py
 image_to_text_model = pipeline("image-classification",model="microsoft/beit-base-patch16-224-pt22k-ft22k")
+def build_html_output(s: str, style: str = "result_item_success"):
+    return f"""
+    <div class='result'>
+        <div class='result_item {style}'>
+          {s}
+        </div>
+    </div>
+    """
 def image_to_text(input_image):
     # Convert the image to text
     #texts = transcriber(text_output)
     return text_output
+def text_to_speech(language: str, repo_id: str, text: str, sid: str, speed: float):
+    logging.info(f"Input text: {text}. sid: {sid}, speed: {speed}")
+    sid = int(sid)
+    tts = get_pretrained_model(repo_id, speed)
+    start = time.time()
+    audio = tts.generate(text, sid=sid)
+    end = time.time()
+    if len(audio.samples) == 0:
+        raise ValueError(
+            "Error in generating audios. Please read previous error messages."
+        )
+    duration = len(audio.samples) / audio.sample_rate
+    elapsed_seconds = end - start
+    rtf = elapsed_seconds / duration
+    info = f"""
+    Wave duration  : {duration:.3f} s <br/>
+    Processing time: {elapsed_seconds:.3f} s <br/>
+    RTF: {elapsed_seconds:.3f}/{duration:.3f} = {rtf:.3f} <br/>
+    """
+    logging.info(info)
+    logging.info(f"\nrepo_id: {repo_id}\ntext: {text}\nsid: {sid}\nspeed: {speed}")
+    filename = str(uuid.uuid4())
+    filename = f"{filename}.wav"
+    sf.write(
+        filename,
+        audio.samples,
+        samplerate=audio.sample_rate,
+        subtype="PCM_16",
+    )
+    return filename, build_html_output(info)
+demo = gr.Blocks()
+with demo:
+    language_choices = list(language_to_models.keys())
+    inputsImg=gr.Image(type='pil')
+    idx=0
+    for txt in image_to_text(inputsImg)
+        output_txt[idx] = gr.Textbox(label=txt,lines=1,max_lines=1,value=txt,placeholder="Interpretation")
+        input_sid = gr.Textbox(
+                label="Speaker ID",
+                info="Speaker ID",
+                lines=1,
+                max_lines=1,
+                value="0",
+                placeholder="Speaker ID. Valid only for mult-speaker model",
+            )
+        input_speed = gr.Slider(
+                minimum=0.1,
+                maximum=10,
+                value=1,
+                step=0.1,
+                label="Speed (larger->faster; smaller->slower)",input_sid
+            )
+        text_to_speech(language_choices[0],language_to_models[language_choices[0]][0],txt,input_sid, input_speed)
+        output_audio[idx] = gr.Audio(label="Output")
+        output_info[idx] = gr.HTML(label="Info")
+        idx=idx+1
+    gr.Interface(fn=image_to_text,
+                           title="Image to Text Interpretation",
+                           inputs=inputsImg,
+                           outputs=[output_txt,output_audio,input_sid,input_speed],
+                           description="image to audio demo",
+                           article = "",
                            ).launch()