Spaces:
Runtime error
Runtime error
Commit ·
d30ef31
1
Parent(s): 6b318e2
Upload 5 files
Browse files- __init__.py +0 -0
- download.py +72 -0
- segments.py +55 -0
- utils.py +115 -0
- vad.py +468 -0
__init__.py
ADDED
|
File without changes
|
download.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from tempfile import mkdtemp
|
| 2 |
+
from typing import List
|
| 3 |
+
from yt_dlp import YoutubeDL
|
| 4 |
+
|
| 5 |
+
import yt_dlp
|
| 6 |
+
from yt_dlp.postprocessor import PostProcessor
|
| 7 |
+
|
| 8 |
+
class FilenameCollectorPP(PostProcessor):
    """Post-processor that records the file path of every processed download."""

    def __init__(self):
        # No downloader is attached at construction time.
        super().__init__(None)
        self.filenames = []

    def run(self, information):
        """Remember ``information["filepath"]`` and pass the info dict through unchanged."""
        path = information["filepath"]
        self.filenames.append(path)
        # First element is the (empty) list returned alongside the info dict.
        return [], information
|
| 16 |
+
|
| 17 |
+
def download_url(url: str, maxDuration: int = None, destinationDirectory: str = None, playlistItems: str = "1") -> List[str]:
    """
    Download the audio behind `url` and return the downloaded file paths.

    Parameters
    ----------
    url: str
        The URL to download.
    maxDuration: int
        Forwarded to `_perform_download`, which rejects media at or above this duration.
    destinationDirectory: str
        Directory to download into, or None to let `_perform_download` create a temporary one.
    playlistItems: str
        Forwarded to `_perform_download` as the playlist item selection.

    Returns
    -------
    The list of downloaded file paths.

    Raises
    ------
    yt_dlp.utils.DownloadError
        If the download fails for any reason other than an over-long file name.
    """
    try:
        return _perform_download(url, maxDuration=maxDuration, outputTemplate=None,
                                 destinationDirectory=destinationDirectory, playlistItems=playlistItems)
    except yt_dlp.utils.DownloadError as e:
        # In case of an OS "file name too long" error, try again with a shortened output template,
        # keeping the caller's destination directory and playlist selection.
        if e.msg and "[Errno 36] File name too long" in e.msg:
            return _perform_download(url, maxDuration=maxDuration, outputTemplate="%(title).10s %(id)s.%(ext)s",
                                     destinationDirectory=destinationDirectory, playlistItems=playlistItems)
        # Any other download error must reach the caller instead of silently becoming None.
        raise
|
| 25 |
+
|
| 26 |
+
def _perform_download(url: str, maxDuration: int = None, outputTemplate: str = None, destinationDirectory: str = None, playlistItems: str = "1"):
    """
    Run a single yt-dlp download of `url` and return the collected file paths.

    Raises ExceededMaximumDuration when `maxDuration` is positive and the reported
    duration is greater than or equal to it, and a plain Exception when no file was produced.
    """
    # Fall back to a fresh temporary directory when the caller did not name one
    if destinationDirectory is None:
        destinationDirectory = mkdtemp()

    options = {
        "format": "bestaudio/best",
        'paths': {'home': destinationDirectory},
    }

    # Optional settings are only added when they carry a value
    if playlistItems:
        options['playlist_items'] = playlistItems
    if outputTemplate:
        options['outtmpl'] = outputTemplate

    collector = FilenameCollectorPP()

    with YoutubeDL(options) as ydl:
        if maxDuration and maxDuration > 0:
            # Look up the metadata first so over-long media is rejected before downloading
            duration = ydl.extract_info(url, download=False)['duration']

            if duration >= maxDuration:
                raise ExceededMaximumDuration(videoDuration=duration, maxDuration=maxDuration, message="Video is too long")

        ydl.add_post_processor(collector)
        ydl.download([url])

    if not collector.filenames:
        raise Exception("Cannot download " + url)

    downloaded = list(collector.filenames)

    for path in downloaded:
        print("Downloaded " + path)

    return downloaded
|
| 67 |
+
|
| 68 |
+
class ExceededMaximumDuration(Exception):
    """Raised when the media duration is at or above the permitted maximum."""

    def __init__(self, videoDuration, maxDuration, message):
        super().__init__(message)
        # Keep both values so callers can report them
        self.maxDuration = maxDuration
        self.videoDuration = videoDuration
|
segments.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Any, Dict, List
|
| 2 |
+
|
| 3 |
+
import copy
|
| 4 |
+
|
| 5 |
+
def merge_timestamps(timestamps: List[Dict[str, Any]], merge_window: float = 5, max_merge_size: float = 30, padding_left: float = 1, padding_right: float = 1):
    """
    Merge neighbouring timestamp entries into larger, padded segments.

    An entry is merged into the running segment unless the gap since the last processed
    end exceeds `merge_window` (when given) or the merged span would exceed `max_merge_size`.
    Finished segments get `padding_right` added to their end and new segments get up to
    `padding_left` subtracted from their start; when the gap between two segments is smaller
    than both paddings combined, the right padding is limited to half the gap.

    Returns a new list of (deep-copied) segments. An empty input yields an empty list, and
    `max_merge_size=None` returns `timestamps` itself, unmerged and unpadded.
    """
    merged = []

    if len(timestamps) == 0:
        return merged
    if max_merge_size is None:
        return timestamps

    # A missing padding means "no padding"
    padding_left = 0 if padding_left is None else padding_left
    padding_right = 0 if padding_right is None else padding_right

    last_end = 0
    active = None

    for candidate in timestamps:
        gap = candidate['start'] - last_end

        # Note that segments can still be longer than the max merge size, they just won't be merged in that case
        starts_new_segment = (
            active is None
            or (merge_window is not None and gap > merge_window)
            or candidate['end'] - active['start'] > max_merge_size
        )

        if not starts_new_segment:
            # Absorb the candidate into the running segment
            active['end'] = candidate['end']
            last_end = active['end']
            continue

        if active is not None:
            # Close the running segment, sharing a tight gap between the two paddings
            if gap < padding_left + padding_right:
                tail_padding = min(padding_right, gap / 2)
            else:
                tail_padding = padding_right

            active['end'] += tail_padding
            gap -= tail_padding
            merged.append(active)

        # Open a new segment; the copy keeps the caller's dictionaries untouched
        active = copy.deepcopy(candidate)
        active['start'] = active['start'] - min(padding_left, gap)
        last_end = active['end']

    # Flush the segment that was still open when the input ran out
    if active is not None:
        active['end'] += padding_right
        merged.append(active)

    return merged
|
utils.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import textwrap
|
| 2 |
+
import unicodedata
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
import zlib
|
| 6 |
+
from typing import Iterator, TextIO
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def exact_div(x, y):
    """Return ``x // y``, asserting first that `y` divides `x` without remainder."""
    remainder = x % y
    assert remainder == 0
    return x // y
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def str2bool(string):
    """Map the exact strings "True" and "False" to booleans; raise ValueError for anything else."""
    str2val = {"True": True, "False": False}

    if string not in str2val:
        raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}")

    return str2val[string]
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def optional_int(string):
    """Parse `string` as an int, treating the literal "None" as a missing value."""
    if string == "None":
        return None
    return int(string)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def optional_float(string):
    """Parse `string` as a float, treating the literal "None" as a missing value."""
    if string == "None":
        return None
    return float(string)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def compression_ratio(text) -> float:
    """
    Return the ratio between the UTF-8 size of `text` and its zlib-compressed size.

    Both sides of the ratio are measured in bytes. Measuring the numerator in characters
    (as `len(text)` does) under-reports the ratio for any text containing multi-byte characters.
    """
    text_bytes = text.encode("utf-8")
    return len(text_bytes) / len(zlib.compress(text_bytes))
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def format_timestamp(seconds: float, always_include_hours: bool = False, fractionalSeperator: str = '.'):
    """
    Format a non-negative number of seconds as ``[HH:]MM:SS<sep>mmm``.

    The hours field is written when `always_include_hours` is set or the value reaches
    one hour. `fractionalSeperator` is placed between the seconds and the milliseconds.
    """
    assert seconds >= 0, "non-negative timestamp expected"

    # Work in whole milliseconds and peel off one unit at a time
    remaining_ms = round(seconds * 1000.0)
    hours, remaining_ms = divmod(remaining_ms, 3_600_000)
    minutes, remaining_ms = divmod(remaining_ms, 60_000)
    whole_seconds, milliseconds = divmod(remaining_ms, 1_000)

    if always_include_hours or hours > 0:
        hours_marker = f"{hours:02d}:"
    else:
        hours_marker = ""

    return f"{hours_marker}{minutes:02d}:{whole_seconds:02d}{fractionalSeperator}{milliseconds:03d}"
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def write_txt(transcript: Iterator[dict], file: TextIO):
    """Write the stripped text of every segment to `file`, one segment per line."""
    for entry in transcript:
        line = entry['text'].strip()
        print(line, file=file, flush=True)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def write_vtt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
    """
    Write a transcript to `file` in WebVTT format.

    Each segment becomes one cue: a "start --> end" timing line followed by the segment text,
    wrapped by `process_text` according to `maxLineWidth`. The text is stripped first, as
    `write_srt` does, so cues do not begin or end with stray whitespace. Any "-->" inside
    the text is replaced by "->" so it cannot be mistaken for a timing line.
    """
    print("WEBVTT\n", file=file)

    for segment in transcript:
        text = process_text(segment['text'].strip(), maxLineWidth).replace('-->', '->')

        print(
            f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
            f"{text}\n",
            file=file,
            flush=True,
        )
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def write_srt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
    """
    Write a transcript to `file` in SRT format.

    Cues are numbered from 1. Each cue consists of its number, a timing line using
    always-present hours and a comma as the fractional separator, and the stripped
    segment text wrapped by `process_text` according to `maxLineWidth`.

    Example usage:
        with open("audio.srt", "w", encoding="utf-8") as srt:
            write_srt(result["segments"], file=srt)
    """
    def srt_stamp(value):
        # SRT timestamps always carry hours and use ',' before the milliseconds
        return format_timestamp(value, always_include_hours=True, fractionalSeperator=',')

    for index, segment in enumerate(transcript, start=1):
        text = process_text(segment['text'].strip(), maxLineWidth).replace('-->', '->')
        begin = srt_stamp(segment['start'])
        finish = srt_stamp(segment['end'])

        # write srt lines
        print(f"{index}\n{begin} --> {finish}\n{text}\n", file=file, flush=True)
|
| 93 |
+
|
| 94 |
+
def process_text(text: str, maxLineWidth=None):
    """
    Wrap `text` so that no line is longer than `maxLineWidth` characters.

    Wrapping is disabled, and `text` returned unchanged, when `maxLineWidth` is None,
    zero or negative. Zero is treated as "disabled" because `textwrap.wrap` rejects a
    width that is not greater than zero.
    """
    if maxLineWidth is None or maxLineWidth <= 0:
        return text

    lines = textwrap.wrap(text, width=maxLineWidth, tabsize=4)
    return '\n'.join(lines)
|
| 100 |
+
|
| 101 |
+
def slugify(value, allow_unicode=False):
    """
    Turn `value` into a slug.

    Taken from https://github.com/django/django/blob/master/django/utils/text.py

    The value is converted to a string and normalised: to ASCII unless `allow_unicode`
    is set. It is then lowercased, stripped of every character that is not a word
    character, whitespace or a hyphen, has runs of whitespace and hyphens collapsed
    into a single hyphen, and finally loses leading and trailing hyphens and underscores.
    """
    text = str(value)

    if allow_unicode:
        text = unicodedata.normalize('NFKC', text)
    else:
        # Decompose accented characters, then drop whatever is not plain ASCII
        decomposed = unicodedata.normalize('NFKD', text)
        text = decomposed.encode('ascii', 'ignore').decode('ascii')

    cleaned = re.sub(r'[^\w\s-]', '', text.lower())
    dashed = re.sub(r'[-\s]+', '-', cleaned)
    return dashed.strip('-_')
|
vad.py
ADDED
|
@@ -0,0 +1,468 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import ABC, abstractmethod
|
| 2 |
+
from collections import Counter, deque
|
| 3 |
+
|
| 4 |
+
from typing import Any, Deque, Iterator, List, Dict
|
| 5 |
+
|
| 6 |
+
from pprint import pprint
|
| 7 |
+
|
| 8 |
+
from src.segments import merge_timestamps
|
| 9 |
+
|
| 10 |
+
# Workaround for https://github.com/tensorflow/tensorflow/issues/48797
|
| 11 |
+
try:
|
| 12 |
+
import tensorflow as tf
|
| 13 |
+
except ModuleNotFoundError:
|
| 14 |
+
# Error handling
|
| 15 |
+
pass
|
| 16 |
+
|
| 17 |
+
import torch
|
| 18 |
+
|
| 19 |
+
import ffmpeg
|
| 20 |
+
import numpy as np
|
| 21 |
+
|
| 22 |
+
from src.utils import format_timestamp
|
| 23 |
+
from enum import Enum
|
| 24 |
+
|
| 25 |
+
class NonSpeechStrategy(Enum):
    """How the audio between detected speech segments is handled."""

    # Ignore non-speech segments.
    SKIP = 1

    # Just treat non-speech segments as speech.
    CREATE_SEGMENT = 2

    # Expand speech segments into subsequent non-speech segments.
    EXPAND_SEGMENT = 3
|
| 38 |
+
|
| 39 |
+
# Defaults for Silero
|
| 40 |
+
# Value passed as `threshold` to Silero's get_speech_timestamps in VadSileroTranscription.
SPEECH_TRESHOLD = 0.3
|
| 41 |
+
|
| 42 |
+
# Minimum size of segments to process
|
| 43 |
+
# AbstractTranscription.transcribe skips merged segments whose duration is below this value.
MIN_SEGMENT_DURATION = 1
|
| 44 |
+
|
| 45 |
+
# The maximum time for texts from old segments to be used in the next segment
|
| 46 |
+
# NOTE(review): not referenced in the visible code; presumably the intended default for
# TranscriptionConfig.max_prompt_window - confirm against the callers.
MAX_PROMPT_WINDOW = 0 # seconds (0 = disabled)
|
| 47 |
+
# Compared against each segment's 'no_speech_prob' when the prompt window is updated.
PROMPT_NO_SPEECH_PROB = 0.1 # Do not pass the text from segments with a no speech probability higher than this
|
| 48 |
+
|
| 49 |
+
# Upper bound on the length of each chunk that VadSileroTranscription processes at once.
VAD_MAX_PROCESSING_CHUNK = 60 * 60 # 60 minutes of audio
|
| 50 |
+
|
| 51 |
+
class TranscriptionConfig(ABC):
    """
    Settings read by AbstractTranscription.transcribe.

    non_speech_strategy selects how gaps between speech segments are handled.
    segment_padding_left / segment_padding_right, max_silent_period and max_merge_size
    are handed to merge_timestamps (max_silent_period as its merge window).
    max_prompt_window limits how far back transcribed segments are kept as a prompt.
    """

    def __init__(self, non_speech_strategy: NonSpeechStrategy = NonSpeechStrategy.SKIP,
                       segment_padding_left: float = None, segment_padding_right = None, max_silent_period: float = None,
                       max_merge_size: float = None, max_prompt_window: float = None):
        # Gap handling
        self.non_speech_strategy = non_speech_strategy

        # Padding applied around merged segments
        self.segment_padding_left, self.segment_padding_right = segment_padding_left, segment_padding_right

        # Merge limits
        self.max_silent_period = max_silent_period
        self.max_merge_size = max_merge_size

        # Prompt handling
        self.max_prompt_window = max_prompt_window
|
| 61 |
+
|
| 62 |
+
class PeriodicTranscriptionConfig(TranscriptionConfig):
    """TranscriptionConfig that also carries the period used by VadPeriodicTranscription."""

    def __init__(self, periodic_duration: float, non_speech_strategy: NonSpeechStrategy = NonSpeechStrategy.SKIP,
                       segment_padding_left: float = None, segment_padding_right = None, max_silent_period: float = None,
                       max_merge_size: float = None, max_prompt_window: float = None):
        super().__init__(
            non_speech_strategy=non_speech_strategy,
            segment_padding_left=segment_padding_left,
            segment_padding_right=segment_padding_right,
            max_silent_period=max_silent_period,
            max_merge_size=max_merge_size,
            max_prompt_window=max_prompt_window,
        )
        # Length of each generated timestamp
        self.periodic_duration = periodic_duration
|
| 68 |
+
|
| 69 |
+
class AbstractTranscription(ABC):
    """
    Base class for transcription driven by voice activity detection (VAD).

    Subclasses implement `get_transcribe_timestamps` to decide which sections of the audio
    are transcribed; `transcribe` then merges those sections, optionally covers the gaps
    between them, and invokes Whisper on each resulting segment.
    """

    def __init__(self, sampling_rate: int = 16000):
        self.sampling_rate = sampling_rate

    def get_audio_segment(self, str, start_time: str = None, duration: str = None):
        """Load (part of) the audio file `str` at this instance's sampling rate via `load_audio`."""
        # NOTE: the first parameter keeps its original name even though it shadows the builtin.
        return load_audio(str, self.sampling_rate, start_time, duration)

    @abstractmethod
    def get_transcribe_timestamps(self, audio: str, config: TranscriptionConfig):
        """
        Get the start and end timestamps of the sections that should be transcribed by this VAD method.

        Parameters
        ----------
        audio: str
            The audio file.
        config: TranscriptionConfig
            The transcription configuration.

        Returns
        -------
        A list of start and end timestamps, in fractional seconds.
        """
        return

    def transcribe(self, audio: str, whisperCallable, config: TranscriptionConfig):
        """
        Transcribe the given audio file.

        Parameters
        ----------
        audio: str
            The audio file.
        whisperCallable:
            The callback used to invoke Whisper. It is called with four positional arguments:
            the audio buffer of the segment, the index of the segment, an optional text prompt
            (or None), and the currently detected language (or None). It must return a dictionary
            with at least the keys 'text', 'segments' and 'language'.
        config: TranscriptionConfig
            The transcription configuration.

        Returns
        -------
        A dictionary with the concatenated 'text', the list of 'segments' (with timestamps
        relative to the start of the audio file) and the most frequently detected 'language'
        (an empty string if no language was detected).
        """
        # Get speech timestamps from the full audio file
        seconds_timestamps = self.get_transcribe_timestamps(audio, config)

        merged = merge_timestamps(seconds_timestamps, config.max_silent_period, config.max_merge_size,
                                  config.segment_padding_left, config.segment_padding_right)

        # A deque of transcribed segments that is passed to the next segment as a prompt
        prompt_window = deque()

        print("Timestamps:")
        pprint(merged)

        if config.non_speech_strategy != NonSpeechStrategy.SKIP:
            max_audio_duration = get_audio_duration(audio)

            # Expand segments to include the gaps between them
            if config.non_speech_strategy == NonSpeechStrategy.CREATE_SEGMENT:
                # When we have a prompt window, we create speech segments between each segment if we exceed the merge size
                merged = self.fill_gaps(merged, total_duration=max_audio_duration, max_expand_size=config.max_merge_size)
            elif config.non_speech_strategy == NonSpeechStrategy.EXPAND_SEGMENT:
                # With no prompt window, it is better to just expand the segments (this effectively passes the prompt to the next segment)
                merged = self.expand_gaps(merged, total_duration=max_audio_duration)
            else:
                raise Exception("Unknown non-speech strategy: " + str(config.non_speech_strategy))

            print("Transcribing non-speech:")
            pprint(merged)

        result = {
            'text': "",
            'segments': [],
            'language': ""
        }
        languageCounter = Counter()
        detected_language = None

        # For each time segment, run whisper
        for segment_index, segment in enumerate(merged):
            segment_start = segment['start']
            segment_end = segment['end']
            segment_expand_amount = segment.get('expand_amount', 0)
            segment_gap = segment.get('gap', False)

            segment_duration = segment_end - segment_start

            if segment_duration < MIN_SEGMENT_DURATION:
                continue

            # Audio to run on Whisper
            segment_audio = self.get_audio_segment(audio, start_time = str(segment_start), duration = str(segment_duration))
            # Previous segments to use as a prompt
            segment_prompt = ' '.join([entry['text'] for entry in prompt_window]) if len(prompt_window) > 0 else None

            # Language detected so far (majority vote over the previous speech segments)
            detected_language = languageCounter.most_common(1)[0][0] if len(languageCounter) > 0 else None

            print("Running whisper from ", format_timestamp(segment_start), " to ", format_timestamp(segment_end), ", duration: ",
                  segment_duration, "expanded: ", segment_expand_amount, "prompt: ", segment_prompt, "language: ", detected_language)
            segment_result = whisperCallable(segment_audio, segment_index, segment_prompt, detected_language)

            adjusted_segments = self.adjust_timestamp(segment_result["segments"], adjust_seconds=segment_start, max_source_time=segment_duration)

            # Propagate expand amount to the segments
            if segment_expand_amount > 0:
                # The adjusted segments use absolute times, so the end of the unexpanded
                # part must be expressed as an absolute time as well.
                unexpanded_end = segment_end - segment_expand_amount

                for adjusted_segment in adjusted_segments:
                    overshoot = adjusted_segment['end'] - unexpanded_end

                    # Add expand amount if the segment reaches into the expanded part
                    if overshoot > 0:
                        adjusted_segment["expand_amount"] = overshoot

            # Append to output
            result['text'] += segment_result['text']
            result['segments'].extend(adjusted_segments)

            # Increment detected language
            if not segment_gap:
                languageCounter[segment_result['language']] += 1

            # Update prompt window
            self.__update_prompt_window(prompt_window, adjusted_segments, segment_end, segment_gap, config)

        # Report the majority language over ALL transcribed speech segments, including the last one
        if len(languageCounter) > 0:
            result['language'] = languageCounter.most_common(1)[0][0]

        return result

    def __update_prompt_window(self, prompt_window: Deque, adjusted_segments: List, segment_end: float, segment_gap: bool, config: TranscriptionConfig):
        """
        Add the newly transcribed segments to `prompt_window` and drop entries that are too old.

        Nothing happens unless `config.max_prompt_window` is a positive number. Segments from a
        gap, or with a 'no_speech_prob' above PROMPT_NO_SPEECH_PROB, are not added.
        """
        if config.max_prompt_window is not None and config.max_prompt_window > 0:
            # Add segments to the current prompt window (unless it is a speech gap)
            if not segment_gap:
                for segment in adjusted_segments:
                    if segment.get('no_speech_prob', 0) <= PROMPT_NO_SPEECH_PROB:
                        prompt_window.append(segment)

            while len(prompt_window) > 0:
                first_end_time = prompt_window[0].get('end', 0)
                # Time expanded in the segments should be discounted from the prompt window
                first_expand_time = prompt_window[0].get('expand_amount', 0)

                if first_end_time - first_expand_time < segment_end - config.max_prompt_window:
                    prompt_window.popleft()
                else:
                    break

    def include_gaps(self, segments: Iterator[dict], min_gap_length: float, total_duration: float):
        """
        Return `segments` with an explicit `{'gap': True}` entry inserted for every uncovered stretch.

        A gap is only inserted when `min_gap_length` is None or the gap is at least that long.
        When `total_duration` is given, the stretch after the last segment is treated the same
        way; with no segments at all, the whole duration becomes one gap.
        """
        result = []
        last_end_time = 0

        for segment in segments:
            segment_start = float(segment['start'])
            segment_end = float(segment['end'])

            if last_end_time != segment_start:
                delta = segment_start - last_end_time

                if min_gap_length is None or delta >= min_gap_length:
                    result.append( { 'start': last_end_time, 'end': segment_start, 'gap': True } )

            last_end_time = segment_end
            result.append(segment)

        # Also include total duration if specified
        if total_duration is not None and last_end_time < total_duration:
            # The trailing gap starts where the last segment ended (0 if there were no segments)
            delta = total_duration - last_end_time

            if min_gap_length is None or delta >= min_gap_length:
                result.append( { 'start': last_end_time, 'end': total_duration, 'gap': True } )

        return result

    # Expand the end time of each segment to the start of the next segment
    def expand_gaps(self, segments: List[Dict[str, Any]], total_duration: float):
        """
        Stretch every segment so that it ends where the next one starts.

        A gap entry is added in front of the first segment if it does not start at 0, and the
        last segment is stretched to `total_duration` when that is given. Stretched segments
        are copies carrying the added time as 'expand_amount'; the input is not modified.
        """
        result = []

        if len(segments) == 0:
            return result

        # Add gap at the beginning if needed
        if segments[0]['start'] > 0:
            result.append({ 'start': 0, 'end': segments[0]['start'], 'gap': True } )

        for current_segment, next_segment in zip(segments, segments[1:]):
            delta = next_segment['start'] - current_segment['end']

            # Expand if the gap actually exists
            if delta >= 0:
                current_segment = current_segment.copy()
                current_segment['expand_amount'] = delta
                current_segment['end'] = next_segment['start']

            result.append(current_segment)

        # Add last segment
        last_segment = segments[-1]
        result.append(last_segment)

        # Also include total duration if specified
        if total_duration is not None:
            last_segment = result[-1]

            if last_segment['end'] < total_duration:
                last_segment = last_segment.copy()
                last_segment['end'] = total_duration
                result[-1] = last_segment

        return result

    def fill_gaps(self, segments: List[Dict[str, Any]], total_duration: float, max_expand_size: float = None):
        """
        Cover the stretches between segments, either by expanding a segment or by adding a gap entry.

        When `max_expand_size` is given and the distance to the next segment does not exceed it,
        the current segment is expanded (as a copy carrying 'expand_amount'). Otherwise a separate
        `{'gap': True}` entry is added. The stretches before the first segment and, if
        `total_duration` is given, after the last segment are handled as well.
        """
        result = []

        if len(segments) == 0:
            return result

        # Add gap at the beginning if needed
        if segments[0]['start'] > 0:
            result.append({ 'start': 0, 'end': segments[0]['start'], 'gap': True } )

        for current_segment, next_segment in zip(segments, segments[1:]):
            expanded = False
            delta = next_segment['start'] - current_segment['end']

            if max_expand_size is not None and delta <= max_expand_size:
                # Just expand the current segment
                current_segment = current_segment.copy()
                current_segment['expand_amount'] = delta
                current_segment['end'] = next_segment['start']
                expanded = True

            result.append(current_segment)

            # Add a gap to the next segment if needed
            if delta >= 0 and not expanded:
                result.append({ 'start': current_segment['end'], 'end': next_segment['start'], 'gap': True } )

        # Add last segment
        last_segment = segments[-1]
        result.append(last_segment)

        # Also include total duration if specified
        if total_duration is not None:
            last_segment = result[-1]

            delta = total_duration - last_segment['end']

            if delta > 0:
                if max_expand_size is not None and delta <= max_expand_size:
                    # Expand the last segment
                    last_segment = last_segment.copy()
                    last_segment['expand_amount'] = delta
                    last_segment['end'] = total_duration
                    result[-1] = last_segment
                else:
                    result.append({ 'start': last_segment['end'], 'end': total_duration, 'gap': True } )

        return result

    def adjust_timestamp(self, segments: Iterator[dict], adjust_seconds: float, max_source_time: float = None):
        """
        Return copies of `segments` with `adjust_seconds` added to their start and end.

        `max_source_time` is expressed in the segments' own (unadjusted) time: segments starting
        after it are dropped, and end times beyond it are clamped to it before the shift.
        """
        result = []

        for segment in segments:
            segment_start = float(segment['start'])
            segment_end = float(segment['end'])

            # Filter segments?
            if max_source_time is not None:
                if segment_start > max_source_time:
                    continue
                segment_end = min(max_source_time, segment_end)

            new_segment = segment.copy()

            # Add to start and end
            new_segment['start'] = segment_start + adjust_seconds
            new_segment['end'] = segment_end + adjust_seconds
            result.append(new_segment)
        return result

    def multiply_timestamps(self, timestamps: List[Dict[str, Any]], factor: float):
        """Return new entries holding only 'start' and 'end', each multiplied by `factor`."""
        return [
            {
                'start': entry['start'] * factor,
                'end': entry['end'] * factor
            }
            for entry in timestamps
        ]
|
| 370 |
+
|
| 371 |
+
class VadSileroTranscription(AbstractTranscription):
    """Transcription whose speech sections are detected by the Silero VAD model."""

    def __init__(self, sampling_rate: int = 16000):
        super().__init__(sampling_rate=sampling_rate)

        # Loads the 'silero_vad' model from the 'snakers4/silero-vad' hub repository.
        self.model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
        # Only the first of the five returned utilities is used here.
        (self.get_speech_timestamps, _, _, _, _) = utils

    def get_transcribe_timestamps(self, audio: str, config: TranscriptionConfig):
        """
        Detect the speech sections of `audio`.

        The audio is processed in chunks of at most VAD_MAX_PROCESSING_CHUNK seconds.

        Returns
        -------
        A list of dictionaries with 'start' and 'end' in fractional seconds, relative to the
        start of the audio file.
        """
        audio_duration = get_audio_duration(audio)
        result = []

        # Divide processing of audio into chunks
        chunk_start = 0.0

        while chunk_start < audio_duration:
            chunk_duration = min(audio_duration - chunk_start, VAD_MAX_PROCESSING_CHUNK)

            print("Processing VAD in chunk from {} to {}".format(format_timestamp(chunk_start), format_timestamp(chunk_start + chunk_duration)))
            wav = self.get_audio_segment(audio, str(chunk_start), str(chunk_duration))

            sample_timestamps = self.get_speech_timestamps(wav, self.model, sampling_rate=self.sampling_rate, threshold=SPEECH_TRESHOLD)
            # Convert sample offsets into seconds
            seconds_timestamps = self.multiply_timestamps(sample_timestamps, factor=1 / self.sampling_rate)
            # The detected timestamps are relative to the chunk, and adjust_timestamp applies
            # max_source_time BEFORE shifting, so the limit must be the chunk duration itself.
            adjusted = self.adjust_timestamp(seconds_timestamps, adjust_seconds=chunk_start, max_source_time=chunk_duration)

            result.extend(adjusted)
            chunk_start += chunk_duration

        return result
|
| 402 |
+
|
| 403 |
+
# A very simple VAD that just marks every N seconds as speech
|
| 404 |
+
class VadPeriodicTranscription(AbstractTranscription):
    """Transcription that marks consecutive fixed-length periods of the audio as speech."""

    def __init__(self, sampling_rate: int = 16000):
        super().__init__(sampling_rate=sampling_rate)

    def get_transcribe_timestamps(self, audio: str, config: PeriodicTranscriptionConfig):
        """
        Generate one timestamp for every `config.periodic_duration` seconds of `audio`.

        Returns
        -------
        A list of dictionaries with 'start' and 'end' in seconds. A final period shorter
        than one second is left out.

        Raises
        ------
        ValueError
            If `config.periodic_duration` is missing or not positive, since the loop below
            could then never reach the end of the audio.
        """
        period = config.periodic_duration

        if period is None or period <= 0:
            raise ValueError(f"periodic_duration must be a positive number, got {period}")

        # Get duration in seconds
        audio_duration = get_audio_duration(audio)
        result = []

        # Generate a timestamp every N seconds
        start_timestamp = 0

        while start_timestamp < audio_duration:
            end_timestamp = min(start_timestamp + period, audio_duration)
            segment_duration = end_timestamp - start_timestamp

            # Minimum duration is 1 second
            if segment_duration >= 1:
                result.append( { 'start': start_timestamp, 'end': end_timestamp } )

            start_timestamp = end_timestamp

        return result
|
| 427 |
+
|
| 428 |
+
def get_audio_duration(file: str):
    """Return the duration that ffmpeg.probe reports for `file`, as a float."""
    format_info = ffmpeg.probe(file)["format"]
    return float(format_info["duration"])
|
| 430 |
+
|
| 431 |
+
def load_audio(file: str, sample_rate: int = 16000,
               start_time: str = None, duration: str = None):
    """
    Open an audio file and read it as a mono waveform, resampling as necessary.

    Parameters
    ----------
    file: str
        The audio file to open
    sample_rate: int
        The sample rate to resample the audio to if necessary
    start_time: str
        The start time, using the standard FFMPEG time duration syntax, or None to disable.
    duration: str
        The duration, using the standard FFMPEG time duration syntax, or None to disable.

    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.

    Raises
    ------
    RuntimeError
        If ffmpeg fails; the message carries ffmpeg's stderr output.
    """
    try:
        input_options = {'threads': 0}

        # Seeking and length limiting are only requested when the caller asked for them
        if start_time is not None:
            input_options['ss'] = start_time
        if duration is not None:
            input_options['t'] = duration

        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
        stream = ffmpeg.input(file, **input_options)
        stream = stream.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sample_rate)
        out, _ = stream.run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True)
    except ffmpeg.Error as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}")

    # Scale the signed 16-bit samples by 1/32768 into float32
    samples = np.frombuffer(out, np.int16).flatten()
    return samples.astype(np.float32) / 32768.0
|