## 1. Setup & Installation

In [None]:
%%writefile requirements.txt
git+https://github.com/openai/whisper.git@8cf36f3508c9acd341a45eb2364239a3d81458b9

In [None]:
!pip install -r requirements.txt --upgrade

In [40]:
import subprocess
import numpy as np

def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
    """
    Helper function to read an audio file through ffmpeg.
    """
    ar = f"{sampling_rate}"
    ac = "1"
    format_for_conversion = "s16le" # was "f32le"
    acodec = "pcm_s16le"
    ffmpeg_command = [
        "ffmpeg",
        "-i",
        "pipe:0",
        "-vn",
        "-acodec",
        "pcm_s16le",
        "-f",
        "s16le",
        "-ar",
        str(ar),
        "-ac",
        "1",
        "pipe:1",
        ]
    
    try:
        with subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) as ffmpeg_process:
            output_stream = ffmpeg_process.communicate(bpayload)
    except FileNotFoundError as error:
        raise ValueError("ffmpeg was not found but is required to load audio files from filename") from error
    print(output_stream[0])
    out_bytes = output_stream[0]
    audio = np.frombuffer(out_bytes, np.float32)
    print("-------------------------------------------")
    print(audio)
    if audio.shape[0] == 0:
        raise ValueError(
            "Soundfile is either not in the correct format or is malformed. Ensure that the soundfile has "
            "a valid audio file extension (e.g. wav, flac or mp3) and is not corrupted. If reading from a remote "
            "URL, ensure that the URL is the full address to **download** the audio file."
        )
    return audio

# file reader
with open("actu.m4a", "rb") as f:
# with open("sample1.flac", "rb") as f:
  data = f.read()


# test the handler
print("-------------------------------------------")
pred = ffmpeg_read(data, 16000)
print("-------------------------------------------")


-------------------------------------------
b''
-------------------------------------------
[]


ffmpeg version 6.0 Copyright (c) 2000-2023 the FFmpeg developers
  built with Apple clang version 15.0.0 (clang-1500.0.40.1)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/6.0_1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --enable-libsoxr --enable-libzmq --enabl

ValueError: Soundfile is either not in the correct format or is malformed. Ensure that the soundfile has a valid audio file extension (e.g. wav, flac or mp3) and is not corrupted. If reading from a remote URL, ensure that the URL is the full address to **download** the audio file.

## 2. Test model

In [1]:
!wget https://cdn-media.huggingface.co/speech_samples/sample1.flac

--2023-11-16 17:26:16--  https://cdn-media.huggingface.co/speech_samples/sample1.flac
Résolution de cdn-media.huggingface.co (cdn-media.huggingface.co)… 2600:9000:2450:200:19:6fb8:2ac0:93a1, 2600:9000:2450:fa00:19:6fb8:2ac0:93a1, 2600:9000:2450:8c00:19:6fb8:2ac0:93a1, ...
Connexion à cdn-media.huggingface.co (cdn-media.huggingface.co)|2600:9000:2450:200:19:6fb8:2ac0:93a1|:443… connecté.
requête HTTP transmise, en attente de la réponse… 200 OK
Taille : 282378 (276K) [audio/flac]
Sauvegarde en : « sample1.flac »


2023-11-16 17:26:16 (1,77 MB/s) — « sample1.flac » sauvegardé [282378/282378]



In [8]:
import whisperx
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16" if device == "cuda" else "int8"
model = whisperx.load_model("large", device=device, compute_type=compute_type)


result = model.transcribe("sample1.flac")
print(result)

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.1.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/torch/whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.0.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.0. Bad things might happen unless you revert torch to 1.x.
Detected language: en (1.00) in first 30s of audio...
{'segments': [{'text': " going along slushy country roads and speaking to damp audiences in draughty schoolrooms day after day for a fortnight. He'll have to put in an appearance at some place of worship on Sunday morning, and he can come to us immediately afterwards.", 'start': 0.009, 'end': 13.695}], 'language': 'en'}


In [13]:
import whisperx
import gc
import time
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16" if device == "cuda" else "int8" 
 
# get the start time
st = time.time()

print(f"Processing file sample1.flac")

batch_size = 16 # reduce if low on GPU mem
compute_type = "int8" # change to "int8" instead of float16 if low on GPU mem (may reduce accuracy)

# 1. Transcribe with original whisper (batched)
print("-----------TRANSCRIPTION------------")
model = whisperx.load_model("large-v2", device, compute_type=compute_type)

audio = whisperx.load_audio("sample1.flac")
result = model.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model

# get the end time
et = time.time()

# get the execution time
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')


# 2. Align whisper output
# model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
# result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

# print(result["segments"]) # after alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model_a

# 3. Assign speaker labels
print("-----------DIARIZE------------")
diarize_model = whisperx.DiarizationPipeline("pyannote/speaker-diarization-3.0", use_auth_token="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", device=device)

# add min/max number of speakers if known
diarize_segments = diarize_model(audio)
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"]) # segments are now assigned speaker IDs

# get the end time
et = time.time()

# get the execution time
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

Processing file sample1.flac
-----------TRANSCRIPTION------------


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.1.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/torch/whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.0.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.0. Bad things might happen unless you revert torch to 1.x.
Detected language: en (1.00) in first 30s of audio...
[{'text': " going along slushy country roads and speaking to damp audiences in draughty schoolrooms day after day for a fortnight. He'll have to put in an appearance at some place of worship on Sunday morning, and he can come to us immediately afterwards.", 'start': 0.009, 'end': 13.695}]
Execution time: 23.329508066177368 seconds
-----------DIARIZE------------
                             segment  label     speaker     start        end  \
0  [ 00:00:00.008 -->  00:00:07.020]      0  SPEAKER_00  0.008489   7.020374   
1  [ 00:00:07.105 -->  00:00:13.998]      0  SPEAKER_00  7.105263  13.99830

In [21]:
from transformers.pipelines.audio_utils import ffmpeg_read

SAMPLE_RATE = 16000

def whisper_config():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    whisper_model = "large-v2"
    batch_size = 16  # reduce if low on GPU mem
    # change to "int8" if low on GPU mem (may reduce accuracy)
    compute_type = "float16" if device == "cuda" else "int8"
    return device, batch_size, compute_type, whisper_model


device, batch_size, compute_type, whisper_model = whisper_config()
model = whisperx.load_model(whisper_model, device=device, compute_type=compute_type)

# file reader
with open("sample1.flac", "rb") as f:
  data = f.read()
  
# 1. process input
with open("sample1.flac", "rb") as f:
  inputs = f.read()
audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
# audio_tensor= torch.from_numpy(audio_nparray)

results = []

# 2. transcribe
device, batch_size, compute_type, whisper_model = whisper_config()
result = model.transcribe(audio_nparray, batch_size=batch_size)
results.append({"transcribe": result["segments"]})
print(result["segments"])

# 3. align
# model_a, metadata = whisperx.load_align_model(
#     language_code=result["language"], device=device)
# result = whisperx.align(
#     result["segments"], model_a, metadata, audio_nparray, device, return_char_alignments=False)
# results.append({"aligned_transcribe": result["segments"]})
# print(result["segments"])

# 4. Assign speaker labels
# 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 ERIC
# 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 OLD
diarize_model = whisperx.DiarizationPipeline( 
    "pyannote/speaker-diarization-3.0",
    use_auth_token="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", device=device)

# add min/max number of speakers if known
diarize_segments = diarize_model(audio_nparray)
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"])  # segments are now assigned speaker IDs
results.append({"diarize_transcribe": result["segments"]})


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.1.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/torch/whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.0.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.0. Bad things might happen unless you revert torch to 1.x.
Detected language: en (1.00) in first 30s of audio...
[{'text': " going along slushy country roads and speaking to damp audiences in draughty schoolrooms day after day for a fortnight. He'll have to put in an appearance at some place of worship on Sunday morning, and he can come to us immediately afterwards.", 'start': 0.009, 'end': 13.695}]
                             segment  label     speaker     start        end  \
0  [ 00:00:00.008 -->  00:00:07.020]      0  SPEAKER_00  0.008489   7.020374   
1  [ 00:00:07.105 -->  00:00:13.998]      0  SPEAKER_00  7.105263  13.998302   

   intersection      union  
0      7.011374  13.686511  
1      6.5

## 3. Test Custom Handler for Inference Endpoints

### Locally

In [3]:
from handler import EndpointHandler

# init handler
my_handler = EndpointHandler(path=".")

  from .autonotebook import tqdm as notebook_tqdm
torchvision is not available - cannot save figures


No language specified, language will be first be detected for each audio file (increases inference time).


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.1.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.0.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.0. Bad things might happen unless you revert torch to 1.x.


In [4]:

import base64
import json

# file reader
with open("actu.m4a", "rb") as f:
    audio_data = f.read()

# Encode the audio data as base64
audio_base64 = base64.b64encode(audio_data).decode("utf-8")

request = {"inputs": audio_base64 }

payload = {
        'inputs': audio_base64,
        'parameters': {
            'language': "en",
            'sampling_rate': 16000,
            }
    }

# test the handler
pred = my_handler(payload)

key: inputs, value: AAAAHGZ0eXBNNEEgAAAAAE00QSBpc29tbXA0MgAAAAFtZGF0AAAAAAAWOUYA0EAHAPKd7v8X7Cf43ecIKrn+JlV9cStLvlDmwLxcAAAAAAR4v0kn5k8GgAAAABLzp7MJ+KPO4AAAAAQ8OejCJvIYAAAAAT869uJcB66AAAAAEPLHikjjbQAAAAAAAAAAE4gidZ5DE5whJAAAAAAAAAAABCtR+SIYOqThxgAAAAAAAAAAIVMSQjTyGXuk6+NAAAAAAAAAAAAAAAAzs4mUhCfIJ1cdRUUmcQAAAAAAAAAAAAAAABCVOusxBjCbmEJVeoWgAAAAAAAAAAAAAAAYIqQ8AgE6NTBId1vAAAAAAAAAAAAAAAAAAAAAPsZOTDICWTCogBOdz50KTEYAAAAAAAAAAAAAAAAAAAAAIBNk8uQBEwEyYPO5cqDyeMAAAAAAAAAAAAAAAAAAAAA+E4z+udJ+maS4cTDDIwrRLzFuifVvTxLKSiPBPPUunJ4mEQjwbThcVVigm3QEsHiyUaMRm1f1pPQZC7rxGDYu+MRj0ZnmE6mRI4xBFyyAQkSqJxnZ1jko6yUUar6NU6KJSSKHzC1352FbpPr368lJVmnon7VsjO4voftJCAKix7v25RmuSSIUe7ewAvjFAJva7w5PhUAQkMXRViAds6v13d8TsroLZkqD33PwbMBWBPiCQyvnl+eKlDz5+L4QEqE3FLqsQlFX572uojWsiVyEQj8sJCORGnk3IQaiDHHplFpp7WHbfKmPQ/CfSa0Bm2w8GDdEKTlwSKHE0CxHAB/ToNVJSgmqAuVyT6/iDcWkzcbr3u4wcnfpyGCe+eI/m2qxo8U+9FUmBGrc5wiZCLAHOVCmDPec7gEC2K7Uew01iQGQvtXWGPXxqN5clUdRqSsuSkyR+uVVSwIgnkMnXJwcxKMAiuASrbShN2RxGXIa22R2OqI6jC1liSGEyxCc3AR31dgOfIFj

In [5]:
pred

[{'diarized_transcription': [{'text': ' The editing was done by the Israeli army. But MP Mathieu Lefebvre defends himself by explaining that it is by no means to justify the Israeli response. It should be noted that here at the Assembly, in the face of the extreme violence of these images, firefighters, doctors or psychologists are present in numbers in case some could not bear these images.',
    'start': 0.009,
    'end': 27.159,
    'speaker': 'SPEAKER_00'},
   {'text': " Thank you very much, Sylvain Rousseau, from the National Assembly for France 24. In Mali, the people are exultant. They claim to have recaptured Kidal, this large city in the north of the country, a crossroads with Algeria, a bastion of the Tuareg Rebellion. Violent fighting has taken place between the two camps last weekend. Serge Daniel's information.",
    'start': 27.381,
    'end': 47.09,
    'speaker': 'SPEAKER_01'}]}]

### remote

In [1]:
import requests
import base64
import json

API_URL = "https://w4a3d6hyuuiyrznb.eu-west-1.aws.endpoints.huggingface.cloud"
headers = {
	"Authorization": "Bearer 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",
	"Content-Type": "application/json"
}

audio_data = None

def query(filename):
    with open(filename, "rb") as f:
        audio_data = f.read()
        # f.close()
    
    
    # Encode the audio data as base64
    audio_base64 = base64.b64encode(audio_data).decode("utf-8")
    
    # Construct the JSON payload
    payload = {
        'inputs': audio_base64,
        'parameters': {
            'language': "en",
            'sampling_rate': 16000,
            }
    }
    
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json(), payload

# output = query("sample1.flac")
output = query("sample1.flac")


ConnectionError: HTTPSConnectionPool(host='w4a3d6hyuuiyrznb.eu-west-1.aws.endpoints.huggingface.cloud', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x106ce7b20>: Failed to resolve 'w4a3d6hyuuiyrznb.eu-west-1.aws.endpoints.huggingface.cloud' ([Errno 8] nodename nor servname provided, or not known)"))

In [8]:
output

([{'diarized_transcription': [{'text': " Allant sur les routes du pays et parlant à des spectateurs dans des chambres de l'école, jour après jour, pour un fortnite. Il va devoir faire une apparition à un endroit de la prière, le matin de lundi, et il peut venir nous voir immédiatement après.",
     'start': 0.009,
     'end': 13.695,
     'speaker': 'SPEAKER_00'}]}],
 {'inputs': 'ZkxhQwAAACIQABAAAAhUABfvA+gA8AADV6AH4KsLAGLf55WnLE8wwDFnAwAAJAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAnAAAAAAAAADDSMQAAQAACggAAAAcmVmZXJlbmNlIGxpYkZMQUMgMS4yLjEgMjAwNzA5MTcAAAAAgQAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA

In [9]:
import requests

API_URL = "https://w4a3d6hyuuiyrznb.eu-west-1.aws.endpoints.huggingface.cloud"
headers = {"Authorization": "Bearer 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",
           "Content-Type": "application/json"}

def query(filename):
    with open(filename, "rb") as f:
        data = f.read()
        
    audio_base64 = base64.b64encode(audio_data).decode("utf-8")
    # Construct the JSON payload
    payload = {
        'audio_encoded':audio_base64,
        'language': "en",
        'sampling_rate': 16000,
        'pipeline': "automatic-speech-recognition",
        'model': "facebook/wav2vec2-base-960h"
    }
    
    payload = {"inputs": {"audio_encoded": audio_base64}}
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()

output = query("actu.m4a")
print(output)

[{'transcription': [{'text': " Allant sur les routes du pays et parlant à des spectateurs dans des chambres de l'école, jour après jour, pour un fortnite. Il va devoir faire une apparition à un endroit de la prière, le matin de lundi, et il peut venir nous voir immédiatement après.", 'start': 0.009, 'end': 13.695, 'speaker': 'SPEAKER_00'}]}, {'diarize_transcription': [{'text': " Allant sur les routes du pays et parlant à des spectateurs dans des chambres de l'école, jour après jour, pour un fortnite. Il va devoir faire une apparition à un endroit de la prière, le matin de lundi, et il peut venir nous voir immédiatement après.", 'start': 0.009, 'end': 13.695, 'speaker': 'SPEAKER_00'}]}]


In [31]:
output

{'error': 'unexpected character: line 1 column 1 (char 0)'}

In [36]:
output

{'error': 'unexpected character: line 1 column 1 (char 0)'}

In [11]:
str(audio_data)

'b\'fLaC\\x00\\x00\\x00"\\x10\\x00\\x10\\x00\\x00\\x08T\\x00\\x17\\xef\\x03\\xe8\\x00\\xf0\\x00\\x03W\\xa0\\x07\\xe0\\xab\\x0b\\x00b\\xdf\\xe7\\x95\\xa7,O0\\xc01g\\x03\\x00\\x00$\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x10\\x00\\x00\\x00\\x00\\x00\\x00\\x02p\\x00\\x00\\x00\\x00\\x00\\x00\\x03\\r#\\x10\\x00\\x04\\x00\\x00( \\x00\\x00\\x00reference libFLAC 1.2.1 20070917\\x00\\x00\\x00\\x00\\x81\\x00 \\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\

In [26]:
import requests

API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v3"
headers = {"Authorization": "Bearer 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"}

def query(filename):
    with open(filename, "rb") as f:
        data = f.read()
    response = requests.post(API_URL, headers=headers, data=data)
    return response.json()

output = query("sample1.flac")
print(output)

{'text': " going along slushy country roads and speaking to damp audiences in draughty schoolrooms day after day for a fortnight he'll have to put in an appearance at some place of worship on sunday morning and he can come to us immediately afterwards"}


In [24]:
import requests

API_URL = "https://api-inference.huggingface.co/models/philschmid/pyannote-speaker-diarization-endpoint"
headers = {"Authorization": "Bearer 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"}

def query(filename):
    with open(filename, "rb") as f:
        data = f.read()
    response = requests.post(API_URL, headers=headers, data=data)
    return response.json()

output = query("sample1.flac")
print(output)

{'error': 'HfApiJson(Deserialize(Error("unknown variant `voice-activity-detection`, expected one of `audio-classification`, `audio-to-audio`, `audio-source-separation`, `automatic-speech-recognition`, `feature-extraction`, `text-classification`, `token-classification`, `question-answering`, `translation`, `summarization`, `text-generation`, `text2text-generation`, `fill-mask`, `zero-shot-classification`, `zero-shot-image-classification`, `conversational`, `table-question-answering`, `image-classification`, `image-segmentation`, `image-to-text`, `text-to-speech`, `text-to-audio`, `sentence-similarity`, `speech-segmentation`, `tabular-classification`, `tabular-regression`, `text-to-image`, `object-detection`, `visual-question-answering`, `video-classification`, `document-question-answering`, `image-to-image`", line: 1, column: 352)))'}


In [27]:
import numpy as np
import subprocess

def ffmpeg_read_origin(bpayload: bytes, sampling_rate: int) -> np.array:
    """
    Helper function to read an audio file through ffmpeg.
    """
    ar = f"{sampling_rate}"
    ac = "1"
    format_for_conversion = "s16le"
    ffmpeg_command = [
        "ffmpeg",
        "-y",
        "-i",
        "pipe:0",
        "-acodec",
        "pcm_s16le",
        "-f",
        format_for_conversion,
        "-ac",
        ac,
        "-ar",
        ar,
        #"-hide_banner",
        "-loglevel",
        "info",
        "pipe:1",
    ]

    try:
        with subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) as ffmpeg_process:
            output_stream = ffmpeg_process.communicate(bpayload)
    except FileNotFoundError as error:
        raise ValueError("ffmpeg was not found but is required to load audio files from filename") from error
    out_bytes = output_stream[0]
    audio = np.frombuffer(out_bytes, np.float32)
    if audio.shape[0] == 0:
        raise ValueError(
            "Soundfile is either not in the correct format or is malformed. Ensure that the soundfile has "
            "a valid audio file extension (e.g. wav, flac or mp3) and is not corrupted. If reading from a remote "
            "URL, ensure that the URL is the full address to **download** the audio file."
        )
    return audio

with open("actu.m4a", "rb") as f:
    bpayload = f.read()

audio = ffmpeg_read_origin(bpayload, 16000)

print(audio.shape)
print(audio)


ffmpeg version 6.0 Copyright (c) 2000-2023 the FFmpeg developers
  built with Apple clang version 15.0.0 (clang-1500.0.40.1)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/6.0_1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --enable-libsoxr --enable-libzmq --enabl

ValueError: Soundfile is either not in the correct format or is malformed. Ensure that the soundfile has a valid audio file extension (e.g. wav, flac or mp3) and is not corrupted. If reading from a remote URL, ensure that the URL is the full address to **download** the audio file.

In [26]:
audio

array([ 8.4857024e-38,  6.7227765e-38, -5.9815072e+37, ...,
        9.9183149e-39,  7.3475403e-40,  7.8979592e-39], dtype=float32)

In [51]:
import numpy as np
import subprocess as sp
import os
DEVNULL = open(os.devnull, 'w')

# load_audio can not detect the input type
def ffmpeg_load_audio(filename, sr=44100, mono=False, normalize=True, in_type=np.int16, out_type=np.float32):
    channels = 1 if mono else 2
    format_strings = {
        np.float64: 'f64le',
        np.float32: 'f32le',
        np.int16: 's16le',
        np.int32: 's32le',
        np.uint32: 'u32le'
    }
    format_string = format_strings[in_type]
    command = [
        'ffmpeg',
        '-i', filename,
        '-f', format_string,
        '-acodec', 'pcm_' + format_string,
        '-ar', str(sr),
        '-ac', str(channels),
        '-']
    p = sp.Popen(command, stdout=sp.PIPE, stderr=DEVNULL, bufsize=4096)
    bytes_per_sample = np.dtype(in_type).itemsize
    frame_size = bytes_per_sample * channels
    chunk_size = frame_size * sr # read in 1-second chunks
    raw = b''
    with p.stdout as stdout:
        while True:
            data = stdout.read(chunk_size)
            if data:
                raw += data
            else:
                break
    audio = np.fromstring(raw, dtype=in_type).astype(out_type)
    if channels > 1:
        audio = audio.reshape((-1, channels)).transpose()
    if audio.size == 0:
        return audio, sr
    if issubclass(out_type, np.floating):
        if normalize:
            peak = np.abs(audio).max()
            if peak > 0:
                audio /= peak
        elif issubclass(in_type, np.integer):
            audio /= np.iinfo(in_type).max
    return audio, sr

In [41]:
audio, sr = ffmpeg_load_audio("actu.m4a", sr=16000, out_type=np.int16)

ValueError: embedded null byte

In [67]:
with open("sample1.flac", 'rb') as f:
    audio_data = f.read()

DEVNULL = open(os.devnull, 'wb')

audio_base64 = base64.b64encode(audio_data).decode("utf-8")

inputs = base64.b64decode(audio_base64)

with open('/tmp/myfile.tmp', 'wb') as w:
    w.write(inputs)

audio, sr = ffmpeg_load_audio('/tmp/myfile.tmp', sr=16000, out_type=np.int16)

In [68]:
audio

array([[129, 344, 527, ...,   6,  54,  61],
       [129, 344, 527, ...,   6,  54,  61]], dtype=int16)

In [44]:
import subprocess as sp
from io import BytesIO

# Build synthetic image tmp.gif for testing

bytes_io = open('actu.m4a', "rb")
bytes_io.seek(0)

command = [
        'ffmpeg',
        '-i', 'pipe:0',
        '-f', 's16le',
        '-acodec', 'pcm_' + 's16le',
        '-ar', '16000',
        '-ac', '1',
        'pipe:1']

proc = sp.Popen(command, stdout=sp.PIPE, stdin=sp.PIPE)
out = proc.communicate(input=bytes_io.read())[0]

proc.wait()

bytes_io_png = BytesIO(out)


ffmpeg version 6.0 Copyright (c) 2000-2023 the FFmpeg developers
  built with Apple clang version 15.0.0 (clang-1500.0.40.1)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/6.0_1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --enable-libsoxr --enable-libzmq --enabl