| from pathlib import Path |
|
|
| import gradio as gr |
| import pickle |
| import torchaudio |
| import torch |
| from speechbrain.inference.speaker import EncoderClassifier |
| from silero_vad import load_silero_vad, read_audio, get_speech_timestamps, collect_chunks |
|
|
# Pretrained ECAPA-TDNN speaker-embedding model (SpeechBrain, trained on VoxCeleb).
# Downloads/caches the model on first use.
classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")


# NOTE(review): pickle.load is only safe on trusted artifacts — these files must
# ship with the app; never load user-supplied pickles.
# Gender classifier over speaker embeddings; label 1 = male, 0 = female
# (mapping established by the "Male" branch in estimate_height below).
with open("gender_classifier.pickle", "rb") as file:
    gender_clf = pickle.load(file)


# Height regressor used for male speakers (suffix _1 matches gender label 1).
with open("height_estimator_1.pickle", "rb") as file:
    male_clf = pickle.load(file)


# Height regressor used for female speakers (suffix _0 matches gender label 0).
with open("height_estimator_0.pickle", "rb") as file:
    female_clf = pickle.load(file)


# Markdown file rendered as the Gradio "article" section.
article_md = Path("Description.md")
# Shown when no file was uploaded or VAD found no usable speech.
error_message = "No speech detected or signal too short!"
|
|
|
|
def read_markdown_file(file_path):
    """Return the contents of *file_path* decoded as UTF-8.

    Args:
        file_path: Path to a text/markdown file, as ``str`` or ``pathlib.Path``.

    Returns:
        The entire file contents as a single string.
    """
    # pathlib handles both str and Path inputs and closes the file itself.
    return Path(file_path).read_text(encoding="utf-8")
|
|
|
|
def metric_to_imperial(height):
    """Format a height in centimetres as a feet-and-inches string, e.g. ``5'7"``."""
    # 1 inch = 2.54 cm; round to the nearest whole inch before splitting.
    total_inches = round(height / 2.54)
    feet, inches = divmod(total_inches, 12)
    return f"{feet}'{inches}\""
|
|
|
|
def get_speech(wav):
    """Run Silero VAD over *wav* and return only the detected speech audio.

    Returns an empty result when no speech segments are found.
    """
    # NOTE(review): the VAD model is re-loaded on every call; could be cached
    # at module level if this becomes a bottleneck.
    vad_model = load_silero_vad()
    timestamps = get_speech_timestamps(wav, vad_model)
    return collect_chunks(timestamps, wav)
|
|
|
|
def estimate_height(gender, vad, filepath, imperial):
    """Estimate a speaker's gender (optionally) and height from an audio file.

    Args:
        gender: ``"Detect"``, ``"Male"``, ``"Female"`` or ``None``. With
            ``"Detect"``/``None`` the gender classifier is run on the
            speaker embedding.
        vad: If truthy, strip non-speech regions with Silero VAD first.
        filepath: Path to the uploaded audio file, or ``None`` when nothing
            was uploaded.
        imperial: If truthy, format the height in feet/inches instead of cm.

    Returns:
        A string such as ``Male 5'11"`` or ``Female 165 cm``, or the module's
        ``error_message`` when no usable audio/speech is available.
    """
    if filepath is None:
        return error_message

    signal = read_audio(filepath)
    if vad:
        signal = get_speech(signal)
    if len(signal) < 1:
        # Empty upload, or VAD removed everything.
        return error_message

    # ECAPA speaker embedding; drop the batch dimension added by encode_batch.
    embedding = torch.squeeze(classifier.encode_batch(signal), 0)

    if gender == "Detect" or gender is None:
        # Bug fix: predict() returns an array, and the original code later used
        # it as a bare truth value ("male_clf if gender else ...") — extract the
        # scalar label explicitly instead.
        gender = int(gender_clf.predict(embedding)[0])
    else:
        gender = 1 if gender == "Male" else 0

    # Height regressors are gender-specific (label 1 = male, 0 = female).
    height_estimator = male_clf if gender else female_clf
    height = height_estimator.predict(embedding)[0]

    if imperial:
        height = metric_to_imperial(height)
    else:
        height = str(round(height)) + " cm"

    return f"{'Male' if gender else 'Female'} {height}"
|
|
|
|
# Build and launch the Gradio UI.
theme = gr.themes.Glass()

with gr.Blocks(theme=theme) as demo:
    gr.Interface(
        fn=estimate_height,
        inputs=[
            gr.Radio(["Detect", "Male", "Female"], label="Gender of a speaker", value="Detect"),
            gr.Checkbox(label="VAD", info="If there is a lot of silence in your audio, maybe try using VAD"),
            gr.Audio(label="Audio", type="filepath"),
            gr.Checkbox(label="Imperial units"),
        ],
        outputs=[gr.Label(label="Prediction")],
        title="Speaker height estimator",
        description="Demo of estimator trained using [HeightCeleb](https://github.com/stachu86/HeightCeleb) dataset",
        allow_flagging="never",
        article=read_markdown_file(article_md),
    )

# The original passed False positionally, which lands on launch()'s first
# parameter (`inline`) — make that explicit so the intent is unambiguous.
demo.launch(inline=False, debug=True)