Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
-
|
| 2 |
import streamlit as st
|
| 3 |
import openai
|
| 4 |
import os
|
| 5 |
from pydub import AudioSegment
|
|
|
|
| 6 |
from dotenv import load_dotenv
|
| 7 |
from tempfile import NamedTemporaryFile
|
| 8 |
import math
|
|
@@ -14,54 +14,37 @@ load_dotenv()
|
|
| 14 |
# Set your OpenAI API key
|
| 15 |
openai.api_key = os.getenv("OPENAI_API_KEY")
|
| 16 |
|
| 17 |
-
def
|
| 18 |
-
"""
|
| 19 |
-
Calculate the length of each chunk in milliseconds to create chunks of approximately target_size_mb.
|
| 20 |
-
|
| 21 |
-
Args:
|
| 22 |
-
file_path (str): Path to the audio file.
|
| 23 |
-
target_size_mb (int): Target size of each chunk in megabytes.
|
| 24 |
-
|
| 25 |
-
Returns:
|
| 26 |
-
int: Chunk length in milliseconds.
|
| 27 |
-
"""
|
| 28 |
-
audio = AudioSegment.from_file(file_path)
|
| 29 |
-
file_size_bytes = os.path.getsize(file_path)
|
| 30 |
-
duration_ms = len(audio)
|
| 31 |
-
|
| 32 |
-
# Calculate the approximate duration per byte
|
| 33 |
-
duration_per_byte = duration_ms / file_size_bytes
|
| 34 |
-
|
| 35 |
-
# Calculate the chunk length in milliseconds for the target size
|
| 36 |
-
chunk_length_ms = target_size_mb * 1024 * 1024 * duration_per_byte
|
| 37 |
-
return math.floor(chunk_length_ms)
|
| 38 |
-
|
| 39 |
-
def split_audio(audio_file_path, chunk_length_ms):
|
| 40 |
"""
|
| 41 |
-
Split an audio file into chunks
|
| 42 |
-
|
| 43 |
Args:
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
| 47 |
Returns:
|
| 48 |
-
|
| 49 |
"""
|
| 50 |
audio = AudioSegment.from_file(audio_file_path)
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
| 54 |
return chunks
|
| 55 |
|
| 56 |
def transcribe(audio_file):
|
| 57 |
"""
|
| 58 |
-
Transcribe an audio file using OpenAI Whisper model.
|
| 59 |
|
| 60 |
Args:
|
| 61 |
-
|
| 62 |
|
| 63 |
Returns:
|
| 64 |
-
|
| 65 |
"""
|
| 66 |
with open(audio_file, "rb") as audio:
|
| 67 |
response = openai.audio.transcriptions.create(
|
|
@@ -77,10 +60,10 @@ def process_audio_chunks(audio_chunks):
|
|
| 77 |
Process and transcribe each audio chunk.
|
| 78 |
|
| 79 |
Args:
|
| 80 |
-
|
| 81 |
|
| 82 |
Returns:
|
| 83 |
-
|
| 84 |
"""
|
| 85 |
transcriptions = []
|
| 86 |
min_length_ms = 100 # Minimum length required by OpenAI API (0.1 seconds)
|
|
@@ -107,11 +90,11 @@ def save_transcription_to_docx(transcription, audio_file_path):
|
|
| 107 |
Save the transcription as a .docx file.
|
| 108 |
|
| 109 |
Args:
|
| 110 |
-
|
| 111 |
-
|
| 112 |
|
| 113 |
Returns:
|
| 114 |
-
|
| 115 |
"""
|
| 116 |
# Extract the base name of the audio file (without extension)
|
| 117 |
base_name = os.path.splitext(os.path.basename(audio_file_path))[0]
|
|
@@ -132,10 +115,9 @@ def save_transcription_to_docx(transcription, audio_file_path):
|
|
| 132 |
|
| 133 |
st.title("Audio Transcription with OpenAI's Whisper")
|
| 134 |
|
| 135 |
-
#
|
| 136 |
uploaded_file = st.file_uploader("Upload an audio or video file", type=["wav", "mp3", "ogg", "m4a", "mp4", "mov"])
|
| 137 |
|
| 138 |
-
|
| 139 |
if 'transcription' not in st.session_state:
|
| 140 |
st.session_state.transcription = None
|
| 141 |
|
|
@@ -144,15 +126,19 @@ if uploaded_file is not None and st.session_state.transcription is None:
|
|
| 144 |
|
| 145 |
# Save uploaded file temporarily
|
| 146 |
file_extension = uploaded_file.name.split(".")[-1]
|
| 147 |
-
original_file_name = uploaded_file.name.rsplit('.', 1)[0] # Get
|
| 148 |
temp_audio_file = f"temp_audio_file.{file_extension}"
|
| 149 |
with open(temp_audio_file, "wb") as f:
|
| 150 |
f.write(uploaded_file.getbuffer())
|
| 151 |
|
| 152 |
-
# Split and process audio
|
| 153 |
with st.spinner('Transcribing...'):
|
| 154 |
-
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
transcription = process_audio_chunks(audio_chunks)
|
| 157 |
if transcription:
|
| 158 |
st.session_state.transcription = transcription
|
|
@@ -177,4 +163,3 @@ if st.session_state.transcription:
|
|
| 177 |
file_name=st.session_state.output_docx_file,
|
| 178 |
mime='application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
| 179 |
)
|
| 180 |
-
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import openai
|
| 3 |
import os
|
| 4 |
from pydub import AudioSegment
|
| 5 |
+
from pydub.silence import split_on_silence
|
| 6 |
from dotenv import load_dotenv
|
| 7 |
from tempfile import NamedTemporaryFile
|
| 8 |
import math
|
|
|
|
| 14 |
# Set your OpenAI API key
# (read from the environment; presumably populated by the load_dotenv() call
# earlier in the file from a local .env — TODO confirm deployment provides it)
openai.api_key = os.getenv("OPENAI_API_KEY")
|
| 16 |
|
| 17 |
+
def split_audio_on_silence(audio_file_path, min_silence_len=500, silence_thresh=-40, keep_silence=250):
    """Split an audio file into chunks at points of detected silence.

    Args:
        audio_file_path (str): Path to the audio file.
        min_silence_len (int): Minimum silence duration (in ms) that qualifies
            as a split point.
        silence_thresh (int): Volume threshold (in dBFS) below which audio is
            treated as silence.
        keep_silence (int): Amount of silence (in ms) retained at the start and
            end of every chunk.

    Returns:
        list: AudioSegment chunks produced by the split.
    """
    source_audio = AudioSegment.from_file(audio_file_path)
    # Delegate the actual silence detection and slicing to pydub.
    return split_on_silence(
        source_audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        keep_silence=keep_silence,
    )
|
| 38 |
|
| 39 |
def transcribe(audio_file):
|
| 40 |
"""
|
| 41 |
+
Transcribe an audio file using the OpenAI Whisper model.
|
| 42 |
|
| 43 |
Args:
|
| 44 |
+
audio_file (str): Path to the audio file.
|
| 45 |
|
| 46 |
Returns:
|
| 47 |
+
str: Transcribed text.
|
| 48 |
"""
|
| 49 |
with open(audio_file, "rb") as audio:
|
| 50 |
response = openai.audio.transcriptions.create(
|
|
|
|
| 60 |
Process and transcribe each audio chunk.
|
| 61 |
|
| 62 |
Args:
|
| 63 |
+
audio_chunks (list): List of AudioSegment chunks.
|
| 64 |
|
| 65 |
Returns:
|
| 66 |
+
str: Combined transcription from all chunks.
|
| 67 |
"""
|
| 68 |
transcriptions = []
|
| 69 |
min_length_ms = 100 # Minimum length required by OpenAI API (0.1 seconds)
|
|
|
|
| 90 |
Save the transcription as a .docx file.
|
| 91 |
|
| 92 |
Args:
|
| 93 |
+
transcription (str): Transcribed text.
|
| 94 |
+
audio_file_path (str): Path to the original audio file for naming purposes.
|
| 95 |
|
| 96 |
Returns:
|
| 97 |
+
str: Path to the saved .docx file.
|
| 98 |
"""
|
| 99 |
# Extract the base name of the audio file (without extension)
|
| 100 |
base_name = os.path.splitext(os.path.basename(audio_file_path))[0]
|
|
|
|
| 115 |
|
| 116 |
st.title("Audio Transcription with OpenAI's Whisper")
|
| 117 |
|
| 118 |
+
# Allow uploading of audio or video files
|
| 119 |
uploaded_file = st.file_uploader("Upload an audio or video file", type=["wav", "mp3", "ogg", "m4a", "mp4", "mov"])
|
| 120 |
|
|
|
|
| 121 |
# Initialise the session-state slot once so the transcription result
# survives Streamlit's script reruns (None means "nothing transcribed yet").
if 'transcription' not in st.session_state:
    st.session_state.transcription = None
|
| 123 |
|
|
|
|
| 126 |
|
| 127 |
# Save uploaded file temporarily
|
| 128 |
file_extension = uploaded_file.name.split(".")[-1]
|
| 129 |
+
original_file_name = uploaded_file.name.rsplit('.', 1)[0] # Get original file name without extension
|
| 130 |
temp_audio_file = f"temp_audio_file.{file_extension}"
|
| 131 |
with open(temp_audio_file, "wb") as f:
|
| 132 |
f.write(uploaded_file.getbuffer())
|
| 133 |
|
| 134 |
+
# Split and process audio using silence detection
|
| 135 |
with st.spinner('Transcribing...'):
|
| 136 |
+
audio_chunks = split_audio_on_silence(
|
| 137 |
+
temp_audio_file,
|
| 138 |
+
min_silence_len=500, # adjust based on your audio characteristics
|
| 139 |
+
silence_thresh=-40, # adjust based on the ambient noise level
|
| 140 |
+
keep_silence=250 # optional: keeps a bit of silence at the edges
|
| 141 |
+
)
|
| 142 |
transcription = process_audio_chunks(audio_chunks)
|
| 143 |
if transcription:
|
| 144 |
st.session_state.transcription = transcription
|
|
|
|
| 163 |
file_name=st.session_state.output_docx_file,
|
| 164 |
mime='application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
| 165 |
)
|
|
|