|
|
import io
import sys
from typing import BinaryIO, Optional

from markitdown._exceptions import MissingDependencyException
|
|
|
|
|
|
|
|
|
|
|
# Holds the sys.exc_info() triple of a failed optional import, or None when
# the imports below succeeded. transcribe_audio() checks this on each call so
# the failure is reported at use time rather than at module import time.
_dependency_exc_info = None
try:
    # warnings is imported here, inside the guarded block, so the filters
    # below can be installed before the third-party imports run.
    import warnings

    # Filters added inside catch_warnings() are undone when the block exits,
    # so they apply only while these two libraries are being imported.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        warnings.filterwarnings("ignore", category=SyntaxWarning)
        import speech_recognition as sr
        import pydub
except ImportError:
    # Remember the failure (type, value, traceback) so it can be chained onto
    # the MissingDependencyException raised later.
    _dependency_exc_info = sys.exc_info()
|
|
|
|
|
|
|
|
def transcribe_audio(
    file_stream: BinaryIO,
    *,
    audio_format: str = "wav",
    language: Optional[str] = None,
) -> str:
    """Transcribe the speech contained in an audio stream to text.

    Args:
        file_stream: Binary stream holding the audio data.
        audio_format: Container/codec of ``file_stream``. ``"wav"``,
            ``"aiff"`` and ``"flac"`` are handed to the recognizer as-is;
            ``"mp3"``, ``"mp4"`` and ``"webm"`` are first converted to WAV
            in memory with pydub.
        language: Language tag such as ``"en-US"``. ``None`` or an empty
            string selects ``"en-US"``. The local whisper recognizer receives
            only the part before the first ``"-"``; the Google recognizer
            receives the full tag.

    Returns:
        The stripped transcript, or ``"[No speech detected]"`` when the
        recognizer returns an empty string.

    Raises:
        MissingDependencyException: If the optional audio-transcription
            dependencies could not be imported.
        ValueError: If ``audio_format`` is not one of the supported formats.
    """
    # Report a failed optional import now, chained to the original error.
    if _dependency_exc_info is not None:
        raise MissingDependencyException(
            "Speech transcription requires installing MarkItdown with the [audio-transcription] optional dependencies. E.g., `pip install markitdown[audio-transcription]` or `pip install markitdown[all]`"
        ) from _dependency_exc_info[1].with_traceback(_dependency_exc_info[2])

    if audio_format in ("wav", "aiff", "flac"):
        audio_source = file_stream
    elif audio_format in ("mp3", "mp4", "webm"):
        # Not readable by sr.AudioFile directly: re-encode to WAV in memory.
        audio_segment = pydub.AudioSegment.from_file(file_stream, format=audio_format)
        audio_source = io.BytesIO()
        audio_segment.export(audio_source, format="wav")
        audio_source.seek(0)
    else:
        raise ValueError(f"Unsupported audio format: {audio_format}")

    # Treat an empty string like None instead of passing "" to the recognizers.
    if not language:
        language = "en-US"

    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_source) as source:
        audio = recognizer.record(source)

        transcript = None
        # Probe for the method instead of calling it blindly: a
        # SpeechRecognition build without recognize_faster_whisper would raise
        # AttributeError, which the ImportError fallback below does not cover.
        local_whisper = getattr(recognizer, "recognize_faster_whisper", None)
        if local_whisper is not None:
            try:
                transcript = local_whisper(audio, language=language.split("-")[0])
            except ImportError:
                # The whisper backend is not importable; use the fallback below.
                transcript = None

        if transcript is None:
            transcript = recognizer.recognize_google(audio, language=language)

        transcript = transcript.strip()
        return transcript if transcript else "[No speech detected]"
|
|
|