Raj Jayendrakumar Muchhala committed on
Commit
78ca458
Β·
1 Parent(s): debb8ec

support transcription

Browse files
Files changed (2) hide show
  1. app.py +81 -21
  2. requirements.txt +4 -1
app.py CHANGED
@@ -4,6 +4,9 @@ from clipper_prompts import CLIPPER_SYSTEM_MESSAGE, CLIPPER_USER_MESSAGE
4
  from prompts import SYSTEM_MESSAGE, USER_MESSAGE
5
  import json
6
  import os
 
 
 
7
 
8
  # Set Streamlit layout to wide mode
9
  st.set_page_config(layout="wide")
@@ -49,32 +52,89 @@ col_transcript, col_output = st.columns([1, 1])
49
 
50
  # Left Column: Transcript Input
51
  with col_transcript:
52
- st.subheader("πŸ“ Paste Your Transcript")
53
- transcript = st.text_area("Enter the transcript here:", height=400)
54
-
55
- # Add reference link below the transcript text box
56
- st.markdown("---")
57
- st.markdown(
58
- """
59
- <div style="font-size:18px; font-weight:bold; margin-top:10px;">
60
- Need a transcript? Use <a href="https://huggingface.co/spaces/openai/whisper" target="_blank" style="color:#007bff; text-decoration:none;">
61
- OpenAI Whisper on Hugging Face</a> to generate one from your audio or video.
62
- </div>
63
- """,
64
- unsafe_allow_html=True
65
- )
66
-
67
- st.markdown("---")
68
- st.subheader("πŸŽ₯ Video/Audio Upload & Playback")
69
-
70
- media_file = st.file_uploader("Upload a video or audio file", type=["mp4", "mov", "avi", "mp3", "wav", "ogg"])
71
- if media_file is not None:
72
- # Detect media type and play accordingly
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  if media_file.type.startswith("video"):
74
  st.video(media_file)
75
  elif media_file.type.startswith("audio"):
76
  st.audio(media_file)
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  # Right Column: Clip Plan Generation and Extraction
80
  with col_output:
 
4
  from prompts import SYSTEM_MESSAGE, USER_MESSAGE
5
  import json
6
  import os
7
+ import yt_dlp
8
+ import ffmpeg
9
+ from tempfile import NamedTemporaryFile
10
 
11
  # Set Streamlit layout to wide mode
12
  st.set_page_config(layout="wide")
 
52
 
53
  # Left Column: Transcript Input
54
  with col_transcript:
55
+ st.subheader("πŸ“ Enter Video Source")
56
+ youtube_url = st.text_input("Enter YouTube Video URL")
57
+ media_file = st.file_uploader("Or upload a video/audio file", type=["mp4", "mov", "avi", "mp3", "wav", "ogg"])
58
+ transcript = ""
59
+
60
def download_youtube_audio(url):
    """Download the audio track of a YouTube video as an mp3 file.

    Uses yt-dlp's FFmpegExtractAudio post-processor to convert the best
    available audio stream to mp3, then hands the result to
    compress_audio() so the file fits under the Whisper upload limit.

    Returns the path of the (possibly compressed) mp3 file.
    """
    ydl_opts = {
        "format": "bestaudio/best",
        # yt-dlp does not honor the legacy youtube-dl keys
        # "extractaudio"/"audioformat"; a post-processor is required to
        # actually produce an mp3 on disk.
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "mp3",
            }
        ],
        # The original template "% (id)s.%(ext)s" contained a stray space,
        # so the "%(id)s" field was never expanded.
        "outtmpl": "%(id)s.%(ext)s",
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        # After FFmpegExtractAudio runs, the file on disk has an .mp3
        # extension regardless of the source container (.webm, .m4a, ...).
        base, _ = os.path.splitext(ydl.prepare_filename(info))
        filename = base + ".mp3"
    return compress_audio(filename)
71
+
72
def compress_audio(input_path, target_size_mb=25):
    """Re-encode an audio file so it fits under ``target_size_mb`` megabytes.

    Files already at or below the limit are returned untouched. Otherwise
    the target bitrate is computed from the clip duration (with a ~10%
    safety buffer), clamped to a 32 kbps floor, and ffmpeg re-encodes the
    file to a sibling "*_compressed" path.

    Returns the path of the file to use (original or compressed copy).
    """
    file_size_mb = os.path.getsize(input_path) / (1024 * 1024)
    if file_size_mb <= target_size_mb:
        # Already small enough -- skip the lossy re-encode entirely.
        return input_path

    # splitext (rather than str.replace(".mp3", ...)) guarantees the output
    # path differs from the input even for non-mp3 files, so ffmpeg never
    # tries to overwrite its own input in place.
    base, ext = os.path.splitext(input_path)
    output_path = f"{base}_compressed{ext or '.mp3'}"

    # A meaningful bitrate needs the clip duration: bits / seconds.  The
    # original formula divided target bits by the file size in MB, which
    # yields absurd "kbps" values and never actually shrinks the file.
    duration_s = float(ffmpeg.probe(input_path)["format"]["duration"])
    target_bits = target_size_mb * 1024 * 1024 * 8
    # The 1.1 divisor leaves a ~10% buffer so the result lands safely
    # under the target size; 32 kbps floor prevents unusable quality.
    target_bitrate_kbps = int(target_bits / (duration_s * 1000 * 1.1))
    target_bitrate_kbps = max(target_bitrate_kbps, 32)

    ffmpeg.input(input_path).output(
        output_path, audio_bitrate=f"{target_bitrate_kbps}k"
    ).run(overwrite_output=True)
    return output_path
88
+
89
+
90
def transcribe_audio(file_path):
    """Transcribe an audio file with OpenAI Whisper and return plain text.

    Requests word-level timestamps (verbose_json) and joins the individual
    words into a single space-separated transcript string.

    NOTE(review): OpenAI and OPENAI_API_KEY are defined elsewhere in this
    file; a fresh client is built per call against the public endpoint.
    """
    whisper_client = OpenAI(api_key=OPENAI_API_KEY, base_url="https://api.openai.com/v1")
    transcription_args = {
        "model": "whisper-1",
        "response_format": "verbose_json",
        "timestamp_granularities": ["word"],
        "timeout": 360,
        "prompt": "The audio may not contain speech, do not make up words.",
    }
    with open(file_path, "rb") as audio_file:
        transcript_response = whisper_client.audio.transcriptions.create(
            file=audio_file, **transcription_args
        )

    # With verbose_json the SDK returns TranscriptionWord objects
    # (attribute access), so word["word"] raised TypeError; support both
    # shapes in case of SDK version drift.  words can also be None for
    # audio with no detected speech.
    transcript_words = transcript_response.words or []
    return " ".join(
        w["word"] if isinstance(w, dict) else w.word for w in transcript_words
    )
107
+
108
+
109
# Keep the transcript across Streamlit reruns: every widget interaction
# re-executes this script top to bottom, so a plain local variable would
# reset to "" and the generated transcript would vanish after the next
# click anywhere in the app.
if "transcript" not in st.session_state:
    st.session_state["transcript"] = ""

# Preview whichever source the user supplied (URL takes precedence).
if youtube_url:
    st.video(youtube_url)
elif media_file:
    if media_file.type.startswith("video"):
        st.video(media_file)
    elif media_file.type.startswith("audio"):
        st.audio(media_file)

if st.button("Transcribe Video"):
    with st.spinner("Processing... This may take a few minutes."):
        try:
            if youtube_url:
                audio_path = download_youtube_audio(youtube_url)
                st.session_state["transcript"] = transcribe_audio(audio_path)
            elif media_file:
                # Persist the upload to disk: compress_audio/ffmpeg and
                # the Whisper client both need a real file path.  The
                # context manager closes the file; the explicit close()
                # the original added inside the with-block was redundant.
                with NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
                    temp_audio.write(media_file.read())
                audio_path = compress_audio(temp_audio.name)
                st.session_state["transcript"] = transcribe_audio(audio_path)
            else:
                st.error("❌ Please provide a YouTube link or upload a file.")
        except Exception as e:
            # Surface download/ffmpeg/API failures to the user instead of
            # crashing the app.
            st.error(f"Error: {str(e)}")

# Display the extracted transcript (editable, so downstream steps can use
# a hand-corrected version).
st.subheader("📝 Transcript")
transcript = st.text_area("Generated Transcript", st.session_state["transcript"], height=300)
137
+
138
 
139
  # Right Column: Clip Plan Generation and Extraction
140
  with col_output:
requirements.txt CHANGED
@@ -1 +1,4 @@
1
- openai
 
 
 
 
1
+ openai
2
+ yt-dlp
3
+ pydub
4
+ ffmpeg-python