Spaces:

francozanardi
/

pycaps

Paused

App Files Files Community

Franco Zanardi committed on Jun 30, 2025

Commit

073f329

1 Parent(s): ba9737f

use google stt instead of whisper to get better performance

Browse files

Files changed (3) hide show

requirements.txt +1 -0
src/config.py +17 -0
src/ui/step1_upload.py +121 -28

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 streamlit
 git+https://github.com/francozanardi/pycaps.git
 openai

 streamlit
 git+https://github.com/francozanardi/pycaps.git
 openai
+google-cloud-speech

src/config.py CHANGED Viewed

@@ -7,6 +7,7 @@ os.makedirs(LOCK_DIR, exist_ok=True)
 MAX_VIDEO_SIZE = 50 * 1024 * 1024
 LOCK_TTL_SECONDS = 20 * 60
 SESSION_TTL_SECONDS = 60 * 60
 TEMPLATES_INFO = [
     {"name": "classic", "ai_features": []},
@@ -19,3 +20,19 @@ TEMPLATES_INFO = [
     {"name": "vibrant", "ai_features": []},
 ]
 TEMPLATE_NAMES = [t["name"] for t in TEMPLATES_INFO]

 MAX_VIDEO_SIZE = 50 * 1024 * 1024
 LOCK_TTL_SECONDS = 20 * 60
 SESSION_TTL_SECONDS = 60 * 60
+MAX_VIDEO_DURATION = 90
 TEMPLATES_INFO = [
     {"name": "classic", "ai_features": []},
     {"name": "vibrant", "ai_features": []},
 ]
 TEMPLATE_NAMES = [t["name"] for t in TEMPLATES_INFO]
+SUPPORTED_LANGUAGES = {
+    "English (US)": ("en-US", "en"),
+    "Spanish": ("es-ES", "es"),
+    "French": ("fr-FR", "fr"),
+    "German": ("de-DE", "de"),
+    "Italian": ("it-IT", "it"),
+    "Portuguese": ("pt-BR", "pt"),
+    "Dutch": ("nl-NL", "nl"),
+    "Russian": ("ru-RU", "ru"),
+    "Japanese": ("ja-JP", "ja"),
+    "Korean": ("ko-KR", "ko"),
+    "Chinese (Mandarin)": ("cmn-CN", "zh"),
+    "Hindi": ("hi-IN", "hi"),
+    "Arabic": ("ar-SA", "ar"),
+}

src/ui/step1_upload.py CHANGED Viewed

@@ -3,18 +3,73 @@ import os
 import tempfile
 import shutil
 from pathlib import Path
-from file_manager import get_path
 import pycaps.video.render.audio_utils as audio_utils
-from pycaps import WhisperAudioTranscriber
 from utils import go_to_step, acquire_lock_slot, handle_unexpected_exception
-from config import MAX_VIDEO_SIZE, MAX_CONCURRENT_JOBS
 def render_step1():
     st.header("Upload Your Video")
     if st.session_state.active_jobs >= MAX_CONCURRENT_JOBS:
         st.warning("🚧 All our processing slots are currently busy. Please check back in a few minutes.")
-        st.info("Tip: You can also duplicate this space and get your own private and free, full-speed version instantly!")
         st.progress(1.0)
         if st.button("Refresh Status"):
             st.rerun()
@@ -22,12 +77,24 @@ def render_step1():
     if 'audio_being_analyzed' not in st.session_state:
         st.session_state['audio_being_analyzed'] = False
-    uploaded_file = st.file_uploader(
-        f"Select a video file (max {MAX_VIDEO_SIZE // (1024*1024)}MB)",
-        type=["mp4", "mov"],
-        key=f"uploader_{st.session_state.session_id}"
-    )
     if not uploaded_file:
         return
@@ -36,36 +103,62 @@ def render_step1():
         st.error(f"File is too large ({uploaded_file.size / (1024*1024):.1f}MB). Max is {MAX_VIDEO_SIZE // (1024*1024)}MB.")
         return
-    if st.button("Start", type="primary", disabled=st.session_state.audio_being_analyzed):
         lock_file = acquire_lock_slot()
         if not lock_file:
             st.error("Sorry, all slots were taken just now. Please try again.")
             st.rerun()
         st.session_state.lock_file_path = lock_file
         st.session_state.audio_being_analyzed = True
         st.rerun()
     if st.session_state.audio_being_analyzed:
         try:
-            with tempfile.TemporaryDirectory() as temp_dir:
-                video_path = Path(temp_dir) / uploaded_file.name
-                with open(video_path, "wb") as f:
-                    f.write(uploaded_file.getbuffer())
-                with st.spinner("Analyzing audio... 🎧"):
-                    audio_path = os.path.join(temp_dir, "audio.wav")
-                    audio_utils.extract_audio_for_whisper(str(video_path), audio_path)
-                    transcriber = WhisperAudioTranscriber(model_size="base")
-                    document = transcriber.transcribe(audio_path)
-                    st.session_state.transcribed_doc = document.to_dict()
-                    persisted_path = get_path(f"input.mp4")
-                    shutil.copy(video_path, persisted_path)
-                    st.session_state.video_path = persisted_path
-                    st.session_state.audio_being_analyzed = False
-                    go_to_step(2)
-                    st.rerun()
         except Exception as e:
             handle_unexpected_exception(e)

 import tempfile
 import shutil
 from pathlib import Path
+import subprocess
+import json
+from file_manager import get_path, get_session_dir
 import pycaps.video.render.audio_utils as audio_utils
+from pycaps import WhisperAudioTranscriber, GoogleAudioTranscriber
 from utils import go_to_step, acquire_lock_slot, handle_unexpected_exception
+from config import MAX_VIDEO_SIZE, MAX_VIDEO_DURATION, MAX_CONCURRENT_JOBS, SUPPORTED_LANGUAGES
def get_video_duration(video_path: str) -> float:
    """Return the duration of a media file in seconds, probed with ffprobe.

    Args:
        video_path: Path of the file to inspect; it is passed through
            ``str()`` before being handed to ffprobe.

    Returns:
        The duration reported in ffprobe's ``format.duration`` field, in
        seconds. When the file cannot be probed, an error is shown with
        ``st.error`` and ``-1.0`` is returned, so callers can test
        ``duration < 0``.
    """
    # Upper bound on the probe so a malformed upload cannot block the request
    # indefinitely; reading container metadata normally takes far less.
    probe_timeout_seconds = 30

    cmd = [
        "ffprobe",
        "-v", "quiet",
        "-print_format", "json",
        "-show_format",
        str(video_path),
    ]
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            check=True,
            timeout=probe_timeout_seconds,
        )
        data = json.loads(result.stdout)
        return float(data["format"]["duration"])
    except (
        subprocess.CalledProcessError,  # ffprobe exited with a non-zero status
        subprocess.TimeoutExpired,      # ffprobe did not finish in time
        FileNotFoundError,              # ffprobe binary is not installed
        KeyError,                       # no "format" / "duration" in the output
        TypeError,                      # output JSON does not have the expected shape
        ValueError,                     # invalid JSON (JSONDecodeError) or non-numeric duration
    ) as e:
        st.error(f"Could not analyze video file to get duration. Error: {e}")
        return -1.0
def setup_google_credentials():
    """Expose Google credentials through ``GOOGLE_APPLICATION_CREDENTIALS``.

    The credentials JSON is read from the ``GOOGLE_JSON_CREDENTIALS``
    environment variable, written to a ``.json`` file inside the directory
    returned by ``get_session_dir()``, and ``GOOGLE_APPLICATION_CREDENTIALS``
    is pointed at that file.

    Returns:
        bool: ``False`` when ``GOOGLE_JSON_CREDENTIALS`` is unset or empty;
        ``True`` once ``GOOGLE_APPLICATION_CREDENTIALS`` refers to an
        existing file.
    """
    credentials_json = os.environ.get("GOOGLE_JSON_CREDENTIALS")
    if not credentials_json:
        # Unset or empty: nothing usable to write out.
        return False

    configured_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    if configured_path and os.path.isfile(configured_path):
        # Already configured and the file is still on disk.
        return True

    # Either nothing is configured yet, or the configured file has vanished.
    # NOTE(review): a previously written file lives in a session directory,
    # which presumably can be cleaned up when that session ends - confirm.
    with tempfile.NamedTemporaryFile(
        mode="w",
        delete=False,  # must outlive this call; the Google client reads it later
        suffix=".json",
        encoding="utf-8",
        dir=get_session_dir(),
    ) as temp_file:
        temp_file.write(credentials_json)

    # Publish the path only after the file has been flushed and closed, so a
    # reader never sees a partially written credentials file.
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_file.name
    return True
def get_transcriber_instance(language_key: str):
    """Select the transcriber to use for the chosen language.

    Google Speech-to-Text is preferred when its credentials can be set up
    and the transcriber can be initialised; otherwise Whisper's "base" model
    is used. The name of the selected backend is stored in
    ``st.session_state.transcriber_used`` and a notice is shown to the user.

    Args:
        language_key: A key of ``SUPPORTED_LANGUAGES``; its value is unpacked
            as ``(google_language_code, whisper_language_code)``.

    Returns:
        A ``GoogleAudioTranscriber`` or a ``WhisperAudioTranscriber``.

    Raises:
        KeyError: If ``language_key`` is not a key of ``SUPPORTED_LANGUAGES``.
    """
    import logging

    logger = logging.getLogger(__name__)
    google_lang_code, whisper_lang_code = SUPPORTED_LANGUAGES[language_key]

    google_transcriber = None
    try:
        if setup_google_credentials():
            candidate = GoogleAudioTranscriber(language=google_lang_code)
            # Called up front so a failure is handled here rather than in the
            # middle of a transcription.
            # NOTE(review): private pycaps method; presumably it builds the
            # API client - confirm.
            candidate._get_client()
            google_transcriber = candidate
        else:
            logger.info("Google credentials are not configured; using Whisper.")
    except Exception:
        # Deliberate best-effort: any Google-side failure falls back to
        # Whisper, but the cause is logged so misconfiguration can be found.
        logger.exception("Google Speech-to-Text initialisation failed; using Whisper.")

    if google_transcriber is not None:
        st.warning(
            "**Note:** This demo uses Google's faster transcriber. For the highest accuracy "
            "with Whisper, please check out the [GitHub repository](https://github.com/francozanardi/pycaps)."
        )
        st.session_state.transcriber_used = "Google Speech-to-Text V1"
        return google_transcriber

    st.warning("Google Speech-to-Text not available, falling back to Whisper. Processing may be slower.")
    st.session_state.transcriber_used = "Whisper (base model)"
    return WhisperAudioTranscriber(model_size="base", language=whisper_lang_code)
 def render_step1():
     st.header("Upload Your Video")
     if st.session_state.active_jobs >= MAX_CONCURRENT_JOBS:
         st.warning("🚧 All our processing slots are currently busy. Please check back in a few minutes.")
+        st.info("Tip: You can also duplicate this space to get your own private and free, full-speed version instantly!")
         st.progress(1.0)
         if st.button("Refresh Status"):
             st.rerun()
     if 'audio_being_analyzed' not in st.session_state:
         st.session_state['audio_being_analyzed'] = False
+    st.info(f"For this demo, please upload a video shorter than **{MAX_VIDEO_DURATION} seconds**.")
+    col1, col2 = st.columns([2, 1])
+    with col1:
+        uploaded_file = st.file_uploader(
+            f"Select a video file (max {MAX_VIDEO_SIZE // (1024*1024)}MB)",
+            type=["mp4", "mov"],
+            key=f"uploader_{st.session_state.session_id}"
+        )
+    with col2:
+        selected_language_key = st.selectbox(
+            "Select Audio Language",
+            options=list(SUPPORTED_LANGUAGES.keys()),
+            key="language_selector"
+        )
     if not uploaded_file:
         return
         st.error(f"File is too large ({uploaded_file.size / (1024*1024):.1f}MB). Max is {MAX_VIDEO_SIZE // (1024*1024)}MB.")
         return
+    with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp_file:
+        tmp_file.write(uploaded_file.getvalue())
+        temp_video_path = tmp_file.name
+    duration = get_video_duration(temp_video_path)
+    if duration < 0:
+        os.remove(temp_video_path)
+        return
+    if duration > MAX_VIDEO_DURATION:
+        st.error(f"Video is too long ({duration:.1f}s). Max duration for the demo is {MAX_VIDEO_DURATION} seconds.")
+        os.remove(temp_video_path)
+        return
+    # If everything checks out, show the button
+    if st.button("Start Transcription", type="primary", disabled=st.session_state.audio_being_analyzed):
         lock_file = acquire_lock_slot()
         if not lock_file:
             st.error("Sorry, all slots were taken just now. Please try again.")
+            os.remove(temp_video_path)
             st.rerun()
         st.session_state.lock_file_path = lock_file
+        st.session_state.temp_video_path = temp_video_path
+        st.session_state.selected_language = selected_language_key
         st.session_state.audio_being_analyzed = True
         st.rerun()
     if st.session_state.audio_being_analyzed:
         try:
+            video_path = Path(st.session_state.temp_video_path)
+            language_key = st.session_state.selected_language
+            transcriber = get_transcriber_instance(language_key)
+            with st.spinner(f"Transcribing audio with {st.session_state.transcriber_used}... 🎧"):
+                with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio:
+                    audio_path = tmp_audio.name
+                audio_utils.extract_audio_for_whisper(str(video_path), audio_path)
+                document = transcriber.transcribe(audio_path)
+                st.session_state.transcribed_doc = document.to_dict()
+                persisted_path = get_path("input.mp4")
+                shutil.copy(video_path, persisted_path)
+                st.session_state.video_path = persisted_path
+                os.remove(video_path)
+                os.remove(audio_path)
+                del st.session_state.temp_video_path
+                del st.session_state.selected_language
+                st.session_state.audio_being_analyzed = False
+                go_to_step(2)
+                st.rerun()
         except Exception as e:
+            if "temp_video_path" in st.session_state and os.path.exists(st.session_state.temp_video_path):
+                os.remove(st.session_state.temp_video_path)
             handle_unexpected_exception(e)