7H4M3R committed on
Commit
0381907
·
verified ·
1 Parent(s): b528c0e

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +182 -38
src/streamlit_app.py CHANGED
@@ -1,40 +1,184 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
1
+ # import altair as alt
2
+ # import numpy as np
3
+ # import pandas as pd
4
+ # import streamlit as st
5
+
6
+ # """
7
+ # # Welcome to Streamlit!
8
+
9
+ # Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
+ # If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
+ # forums](https://discuss.streamlit.io).
12
+
13
+ # In the meantime, below is an example of what you can do with just a few lines of code:
14
+ # """
15
+
16
+ # num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
+ # num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
+
19
+ # indices = np.linspace(0, 1, num_points)
20
+ # theta = 2 * np.pi * num_turns * indices
21
+ # radius = indices
22
+
23
+ # x = radius * np.cos(theta)
24
+ # y = radius * np.sin(theta)
25
+
26
+ # df = pd.DataFrame({
27
+ # "x": x,
28
+ # "y": y,
29
+ # "idx": indices,
30
+ # "rand": np.random.randn(num_points),
31
+ # })
32
+
33
+ # st.altair_chart(alt.Chart(df, height=700, width=700)
34
+ # .mark_point(filled=True)
35
+ # .encode(
36
+ # x=alt.X("x", axis=None),
37
+ # y=alt.Y("y", axis=None),
38
+ # color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
+ # size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
+ # ))
41
+
42
+
43
# Standard library
import os

# Third-party (duplicate `import yt_dlp` removed; groups sorted)
import ffmpeg
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import streamlit as st
import torchaudio
import whisper
import yt_dlp
from transformers import pipeline

# from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
# from utils import download_video, extract_audio, accent_classify
55
+
56
# Audio preprocessing parameters.
# Target sample rate (Hz) that every segment is resampled to.
RATE_HZ = 16000
# Length of one classification window, in seconds.
MAX_SECONDS = 1
# The same window length expressed in samples at the target rate.
MAX_LENGTH = RATE_HZ * MAX_SECONDS
62
+
63
+
64
def download_video(url, output_path="video.mp4"):
    """Download a remote video with yt-dlp and save it to *output_path*.

    Selects the smallest mp4 video stream plus the best m4a audio stream
    (falling back to best audio-only), merged into an mp4 container.
    Playlists are refused and downloads are retried up to three times.
    Returns the path the file was written to.
    """
    options = {
        'format': 'worstvideo[ext=mp4]+bestaudio[ext=m4a]/bestaudio',
        'outtmpl': output_path,
        'merge_output_format': 'mp4',
        'quiet': True,
        'noplaylist': True,
        'nocheckcertificate': True,
        'retries': 3,
    }
    downloader = yt_dlp.YoutubeDL(options)
    with downloader as ydl:
        ydl.download([url])
    return output_path
78
+
79
def extract_audio(input_path, output_path="audio.mp3"):
    """Transcode the audio track of *input_path* into a 192 kbps mp3.

    Overwrites *output_path* if it already exists and returns its path.
    """
    stream = ffmpeg.input(input_path)
    stream = stream.output(output_path, format='mp3', acodec='libmp3lame', audio_bitrate='192k')
    stream.overwrite_output().run(quiet=True)
    return output_path
88
+
89
# Split files by chunks with == MAX_LENGTH size
def split_audio(file):
    """Load *file* with torchaudio and cut it into MAX_LENGTH-sample chunks.

    Only the first channel is used; a trailing partial chunk shorter than
    MAX_LENGTH is discarded (floor division below). Each chunk is resampled
    to RATE_HZ and stored as a flat numpy array.

    Returns a single-column DataFrame ('audio') of chunk arrays, or None
    if loading/processing fails.
    """
    try:
        # Load the audio file and its native sample rate.
        audio, rate = torchaudio.load(str(file))

        # First channel only.
        # NOTE(review): assumes channel 0 is representative for
        # multi-channel files — confirm against expected inputs.
        channel = audio[0]

        # The resampler depends only on (rate, RATE_HZ), so build it once;
        # the original rebuilt it on every loop iteration.
        resample = torchaudio.transforms.Resample(rate, RATE_HZ)

        # Floor division: any tail shorter than MAX_LENGTH is dropped, so
        # every slice below is exactly MAX_LENGTH samples (no min() needed).
        num_segments = len(channel) // MAX_LENGTH

        segmented_audio = []
        for i in range(num_segments):
            start = i * MAX_LENGTH
            segment = channel[start:start + MAX_LENGTH]
            segmented_audio.append(resample(segment).squeeze(0).numpy().reshape(-1))

        return pd.DataFrame({'audio': segmented_audio})

    except Exception as e:
        # Deliberate best-effort: a bad or missing file yields None rather
        # than crashing the app; the error is only printed (no logger here).
        print(f"Error processing file: {e}")
        return None
122
+
123
def accent_classify(pipe, audio_path):
    """Classify the accent of *audio_path* with the given HF pipeline.

    Concatenates (at most) the first 50 chunks produced by split_audio
    into one waveform and returns the pipeline's top-ranked result.
    """
    chunks = split_audio(audio_path)
    waveform = np.concatenate(chunks["audio"][:50].to_list())
    return pipe(waveform)[0]
126
+
127
# Load HF pipeline model (audio classification)
@st.cache_resource
def load_audio_classifier():
    """Build and cache the accent-classification pipeline.

    Runs on GPU when one is visible to torch, otherwise CPU. The original
    hard-coded device=0 (GPU), which raises on CPU-only hosts such as
    free Spaces hardware.
    """
    import torch  # local import: only needed for the device probe
    model_name = "dima806/english_accents_classification"
    device = 0 if torch.cuda.is_available() else -1  # GPU (0) or CPU (-1)
    return pipeline('audio-classification', model=model_name, device=device)
132
+
133
# Load Whisper model
@st.cache_resource
def load_whisper_model():
    """Load the Whisper 'base' model once per session (Streamlit resource cache)."""
    model = whisper.load_model("base")
    return model
137
+
138
# Load models once (both loaders are cached via st.cache_resource).
pipe = load_audio_classifier()
whisper_model = load_whisper_model()

st.set_page_config(page_title="Accent Classifier", layout="centered")

st.title("🎙️ English Accent Classifier")
st.markdown("Upload a video link and get the English accent with confidence.")

video_url = st.text_input("Paste a public video URL (YouTube, Loom, or MP4):")

if st.button("Analyze"):
    if not video_url.strip():
        st.warning("Please enter a valid URL.")
    else:
        video_path = None
        audio_path = None
        try:
            with st.spinner("Downloading video..."):
                video_path = download_video(video_url)

            with st.spinner("Extracting audio..."):
                audio_path = extract_audio(video_path)

            with st.spinner("Transcribing with Whisper..."):
                result = whisper_model.transcribe(audio_path)
                transcription = result['text']

            with st.spinner("Classifying accent..."):
                accent_data = accent_classify(pipe, audio_path)
                accent = accent_data.get("label", "us")
                confidence = accent_data.get("score", 0)

            st.success("Analysis Complete!")
            st.markdown(f"**Accent:** {accent}")
            # The pipeline score is a 0-1 fraction; scale it before adding
            # the percent sign (the original printed the raw fraction).
            st.markdown(f"**Confidence Score:** {confidence * 100:.2f}%")
            st.markdown("**Transcription:**")
            st.text_area("Transcript", transcription, height=200)
        except Exception as e:
            # Surface failures (bad URL, download/transcode/model errors)
            # in the UI instead of crashing the app with a traceback.
            st.error(f"Analysis failed: {e}")
        finally:
            # Cleanup: only remove files that were actually created.
            for path in (video_path, audio_path):
                if path and os.path.exists(path):
                    os.remove(path)