Spaces:

7H4M3R
/

Audio

Sleeping

App Files Files Community

7H4M3R committed on May 22, 2025

Commit

61bf2df

verified ·

1 Parent(s): 7cf2ff9

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +46 -97

src/streamlit_app.py CHANGED Viewed

@@ -1,165 +1,114 @@
 import streamlit as st
 import os
-import numpy as np # linear algebra
-import pandas as pd # data processing
 # from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
 # from utils import download_video, extract_audio, accent_classify
-import whisper
 from transformers import pipeline
 import yt_dlp
 import torchaudio
-import yt_dlp
 import ffmpeg
-from transformers.utils import logging
 logging.set_verbosity_info()
-# Define the resampling rate in Hertz (Hz) for audio data
 RATE_HZ = 16000
-# Define the maximum audio interval length to consider in seconds
 MAX_SECONDS = 1
-# Calculate the maximum audio interval length in samples by multiplying the rate and seconds
 MAX_LENGTH = RATE_HZ * MAX_SECONDS
-def download_video(url, output_dir="/app/tmp"):
-    os.makedirs(output_dir, exist_ok=True)
     ydl_opts = {
-            'format': 'worstvideo[ext=mp4]+bestaudio[ext=m4a]/bestaudio',
-            "outtmpl": os.path.join(output_dir, "video.%(ext)s"),
-            "quiet": True,
-            'merge_output_format': 'mp4',
-            'quiet': True,
-            'noplaylist': True,
-            'nocheckcertificate': True,
-            'retries': 3,
-            }
     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
         ydl.download([url])
-    return os.path.join(output_dir, "video.mp4")
-def extract_audio(input_path, output_dir="/app/tmp"):
-    os.makedirs(output_dir, exist_ok=True)
-    output_path = os.path.join(output_dir, "audio.mp3")
     (
-            ffmpeg
-            .input(input_path)
-            .output(output_path, format='mp3', acodec='libmp3lame', audio_bitrate='192k')
-            .overwrite_output()
-            .run(quiet=True)
-            )
     return output_path
-# Split files by chunks with == MAX_LENGTH size
 def split_audio(file):
     try:
-        # Load the audio file using torchaudio and get its sample rate.
         audio, rate = torchaudio.load(str(file))
-        # Calculate the number of segments based on the MAX_LENGTH
         num_segments = (len(audio[0]) // MAX_LENGTH)  # Floor division to get segments
-        # Create an empty list to store segmented audio data
         segmented_audio = []
-        # Split the audio into segments
         for i in range(num_segments):
             start = i * MAX_LENGTH
             end = min((i + 1) * MAX_LENGTH, len(audio[0]))
             segment = audio[0][start:end]
-            # Create a transformation to resample the audio to a specified sample rate (RATE_HZ).
             transform = torchaudio.transforms.Resample(rate, RATE_HZ)
             segment = transform(segment).squeeze(0).numpy().reshape(-1)
             segmented_audio.append(segment)
-        # Create a DataFrame from the segmented audio
         df_segments = pd.DataFrame({'audio': segmented_audio})
         return df_segments
     except Exception as e:
-        # If an exception occurs (e.g., file not found), return nothing
         print(f"Error processing file: {e}")
         return None
 def accent_classify(pipe, audio_path):
     audio_df = split_audio(audio_path)
-    return pipe(np.concatenate(audio_df["audio"][:50].to_list()))[0]
-st.set_page_config(page_title="Accent Classifier", layout="centered")
 st.title("🎙️ English Accent Classifier")
 st.markdown("Upload a video link and get the English accent with confidence.")
-st.subheader("1. Upload a Video File")
-uploaded_file = st.file_uploader("Choose a video file", type=["mp4", "mov", "avi"])
-st.subheader("2. Or Enter a Video URL")
 video_url = st.text_input("Paste a public video URL (Loom, or MP4):")
 if st.button("Analyze"):
-    video_path = None
-    output_dir="/app/tmp"
-    os.makedirs(output_dir, exist_ok=True)
-    if uploaded_file:
-        video_path = os.path.join(output_dir, "video.mp4")
-        with open(video_path, "wb") as f:
-            f.write(uploaded_file.read())
-            st.success("✅ Video uploaded successfully.")
-    elif video_url.strip():
-        with st.spinner("Downloading video from URL..."):
-            try:
-                video_path = download_video(video_url)
-            except Exception as e:
-                st.error(f"❌ Failed to download video: {e}")
-            else:
-                st.success(f"✅ Video downloaded: {video_path}")
     else:
-        st.warning("⚠️ Please upload a video file or enter a valid URL.")
-    if video_path and os.path.exists(video_path):
-        st.write("Exists:", os.path.exists(video_path))
         with st.spinner("Extracting audio..."):
             audio_path = extract_audio(video_path)
-            st.write("Audio saved at:", audio_path)
-            st.write("Exists:", os.path.exists(audio_path))
-        # with st.spinner("Transcribing with Whisper..."):
-        #     whisper_model = whisper.load_model("base")
-        #     result = whisper_model.transcribe(audio_path)
-        #     transcription = result['text']
-        #     transcription = "Hello There"
-        #     pass
-        with st.spinner("Extracting waves..."):
-            audio_df = split_audio(audio_path)
-            # print(np.concatenate(audio_df["audio"][:50].to_list()))
-            waves = f"{np.concatenate(audio_df["audio"][:5].to_list())}"
-            st.markdown("**Audio waves:**")
-            st.text_area("Audio waves", waves, height=200)
-        with st.spinner("Classifying accent..."):
             model_name = "dima806/english_accents_classification"
             pipe = pipeline('audio-classification', model=model_name, device=0)
             accent_data = accent_classify(pipe, audio_path)
-            # accent_data = {"label": "American", "score": 0.9}
-            accent = accent_data.get("label", "American")
-            confidence = accent_data.get("score", 0.0)
-            # pass
         st.success("Analysis Complete!")
         st.markdown(f"**Accent:** {accent}")
         st.markdown(f"**Confidence Score:** {confidence:.2f}%")
         # st.markdown("**Transcription:**")
         # st.text_area("Transcript", transcription, height=200)
         # Cleanup
         os.remove(video_path)
-        os.remove(audio_path)

 import streamlit as st
 import os
 # from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
 # from utils import download_video, extract_audio, accent_classify
+# import whisper
 from transformers import pipeline
+from transformers.utils import logging
+import numpy as np
+import pandas as pd
 import yt_dlp
 import torchaudio
 import ffmpeg
 logging.set_verbosity_info()
 RATE_HZ = 16000
 MAX_SECONDS = 1
 MAX_LENGTH = RATE_HZ * MAX_SECONDS
+def download_video(url, output_path="video.mp4"):
     ydl_opts = {
+        'format': 'worstvideo[ext=mp4]+bestaudio[ext=m4a]/bestaudio',
+        'outtmpl': output_path,
+        'merge_output_format': 'mp4',
+        'quiet': True,
+        'noplaylist': True,
+        'nocheckcertificate': True,
+        'retries': 3,
+    }
     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
         ydl.download([url])
+    return output_path
+def extract_audio(input_path, output_path="audio.mp3"):
     (
+        ffmpeg
+        .input(input_path)
+        .output(output_path, format='mp3', acodec='libmp3lame', audio_bitrate='192k')
+        .overwrite_output()
+        .run(quiet=True)
+    )
     return output_path
 def split_audio(file):
     """Load an audio file and cut its first row of samples into fixed-size chunks.
     Each chunk holds MAX_LENGTH samples taken at the file's native sample
     rate and is then resampled to RATE_HZ. A trailing partial chunk is
     dropped because the chunk count uses floor division.
     NOTE(review): chunks are cut before resampling, so one chunk lasts
     MAX_LENGTH / rate seconds; that equals MAX_SECONDS only when the file
     is already sampled at RATE_HZ. Confirm whether this is intended.
     Args:
         file: Path (or path-like object) of the audio file; passed to
             torchaudio.load after conversion with str().
     Returns:
         A pandas DataFrame with a single 'audio' column holding one flat
         NumPy array per chunk, or None when anything in the process raises
         (the error is printed). Callers must check for None.
     """
     try:
         audio, rate = torchaudio.load(str(file))
         # Only audio[0] is used (presumably the first channel - verify
         # against the torchaudio.load layout).
         samples = audio[0]
         # Floor division: the end index below can never pass len(samples),
         # so no clamping is needed.
         num_segments = len(samples) // MAX_LENGTH
         # The resampler depends only on the file's rate, so build it once
         # instead of once per chunk.
         transform = torchaudio.transforms.Resample(rate, RATE_HZ)
         segmented_audio = [
             transform(samples[i * MAX_LENGTH:(i + 1) * MAX_LENGTH])
             .squeeze(0)
             .numpy()
             .reshape(-1)
             for i in range(num_segments)
         ]
         df_segments = pd.DataFrame({'audio': segmented_audio})
         return df_segments
     except Exception as e:
         # Best-effort contract kept from the original: report and return None.
         print(f"Error processing file: {e}")
         return None
 def accent_classify(pipe, audio_path):
     audio_df = split_audio(audio_path)
+    return pipe(np.concatenate(audio_df["audio"][:250].to_list()))[0]
+accent_mapping = {
+    'us': 'American',
+    'canada': 'Canadian',
+    'england': 'British',
+    'indian': 'Indian',
+    'australia': 'Australian',
+}
+st.set_page_config(page_title="Accent Classifier", layout="centered")
 st.title("🎙️ English Accent Classifier")
 st.markdown("Upload a video link and get the English accent with confidence.")
 video_url = st.text_input("Paste a public video URL (Loom, or MP4):")
 if st.button("Analyze"):
+    if not video_url.strip():
+        st.warning("Please enter a valid URL.")
     else:
+        with st.spinner("Downloading video..."):
+            video_path = download_video(video_url)
         with st.spinner("Extracting audio..."):
             audio_path = extract_audio(video_path)
+#         with st.spinner("Transcribing with Whisper..."):
+#             whisper_model = whisper.load_model("base")
+#             result = whisper_model.transcribe(audio_path)
+#             transcription = result['text']
+#             # pass
+        with st.spinner("Classifying accent..."):
             model_name = "dima806/english_accents_classification"
             pipe = pipeline('audio-classification', model=model_name, device=0)
             accent_data = accent_classify(pipe, audio_path)
+            accent = accent_mapping.get(accent_data.get("label", "us"))
+            confidence = accent_data.get("score", 0)
         st.success("Analysis Complete!")
         st.markdown(f"**Accent:** {accent}")
         st.markdown(f"**Confidence Score:** {confidence:.2f}%")
         # st.markdown("**Transcription:**")
         # st.text_area("Transcript", transcription, height=200)
         # Cleanup
         os.remove(video_path)
+        os.remove(audio_path)