Franco Zanardi committed on
Commit
073f329
·
1 Parent(s): ba9737f

use google stt instead of whisper to get better performance

Browse files
Files changed (3) hide show
  1. requirements.txt +1 -0
  2. src/config.py +17 -0
  3. src/ui/step1_upload.py +121 -28
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
  streamlit
2
  git+https://github.com/francozanardi/pycaps.git
3
  openai
 
 
1
  streamlit
2
  git+https://github.com/francozanardi/pycaps.git
3
  openai
4
+ google-cloud-speech
src/config.py CHANGED
@@ -7,6 +7,7 @@ os.makedirs(LOCK_DIR, exist_ok=True)
7
  MAX_VIDEO_SIZE = 50 * 1024 * 1024
8
  LOCK_TTL_SECONDS = 20 * 60
9
  SESSION_TTL_SECONDS = 60 * 60
 
10
 
11
  TEMPLATES_INFO = [
12
  {"name": "classic", "ai_features": []},
@@ -19,3 +20,19 @@ TEMPLATES_INFO = [
19
  {"name": "vibrant", "ai_features": []},
20
  ]
21
  TEMPLATE_NAMES = [t["name"] for t in TEMPLATES_INFO]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  MAX_VIDEO_SIZE = 50 * 1024 * 1024
8
  LOCK_TTL_SECONDS = 20 * 60
9
  SESSION_TTL_SECONDS = 60 * 60
10
+ MAX_VIDEO_DURATION = 90
11
 
12
  TEMPLATES_INFO = [
13
  {"name": "classic", "ai_features": []},
 
20
  {"name": "vibrant", "ai_features": []},
21
  ]
22
  TEMPLATE_NAMES = [t["name"] for t in TEMPLATES_INFO]
23
+
24
# Languages offered in the upload step's language selector.
#
# Each key is the label shown to the user; each value is a pair
# (google_lang_code, whisper_lang_code): the first is handed to
# GoogleAudioTranscriber, the second to WhisperAudioTranscriber.
#
# NOTE(review): Google's documentation lists Mandarin (Simplified, China) as
# "cmn-Hans-CN" -- confirm that "cmn-CN" is accepted by the API.
SUPPORTED_LANGUAGES = {
    "English (US)": ("en-US", "en"),
    "Spanish": ("es-ES", "es"),
    "French": ("fr-FR", "fr"),
    "German": ("de-DE", "de"),
    "Italian": ("it-IT", "it"),
    "Portuguese": ("pt-BR", "pt"),
    "Dutch": ("nl-NL", "nl"),
    "Russian": ("ru-RU", "ru"),
    "Japanese": ("ja-JP", "ja"),
    "Korean": ("ko-KR", "ko"),
    "Chinese (Mandarin)": ("cmn-CN", "zh"),
    "Hindi": ("hi-IN", "hi"),
    "Arabic": ("ar-SA", "ar"),
}
src/ui/step1_upload.py CHANGED
@@ -3,18 +3,73 @@ import os
3
  import tempfile
4
  import shutil
5
  from pathlib import Path
6
- from file_manager import get_path
 
 
7
  import pycaps.video.render.audio_utils as audio_utils
8
- from pycaps import WhisperAudioTranscriber
9
  from utils import go_to_step, acquire_lock_slot, handle_unexpected_exception
10
- from config import MAX_VIDEO_SIZE, MAX_CONCURRENT_JOBS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  def render_step1():
13
  st.header("Upload Your Video")
14
 
15
  if st.session_state.active_jobs >= MAX_CONCURRENT_JOBS:
16
  st.warning("🚧 All our processing slots are currently busy. Please check back in a few minutes.")
17
- st.info("Tip: You can also duplicate this space and get your own private and free, full-speed version instantly!")
18
  st.progress(1.0)
19
  if st.button("Refresh Status"):
20
  st.rerun()
@@ -22,12 +77,24 @@ def render_step1():
22
 
23
  if 'audio_being_analyzed' not in st.session_state:
24
  st.session_state['audio_being_analyzed'] = False
 
 
 
 
25
 
26
- uploaded_file = st.file_uploader(
27
- f"Select a video file (max {MAX_VIDEO_SIZE // (1024*1024)}MB)",
28
- type=["mp4", "mov"],
29
- key=f"uploader_{st.session_state.session_id}"
30
- )
 
 
 
 
 
 
 
 
31
 
32
  if not uploaded_file:
33
  return
@@ -36,36 +103,62 @@ def render_step1():
36
  st.error(f"File is too large ({uploaded_file.size / (1024*1024):.1f}MB). Max is {MAX_VIDEO_SIZE // (1024*1024)}MB.")
37
  return
38
 
39
- if st.button("Start", type="primary", disabled=st.session_state.audio_being_analyzed):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  lock_file = acquire_lock_slot()
41
  if not lock_file:
42
  st.error("Sorry, all slots were taken just now. Please try again.")
 
43
  st.rerun()
44
 
45
  st.session_state.lock_file_path = lock_file
 
 
46
  st.session_state.audio_being_analyzed = True
47
  st.rerun()
48
 
49
  if st.session_state.audio_being_analyzed:
50
  try:
51
- with tempfile.TemporaryDirectory() as temp_dir:
52
- video_path = Path(temp_dir) / uploaded_file.name
53
- with open(video_path, "wb") as f:
54
- f.write(uploaded_file.getbuffer())
55
-
56
- with st.spinner("Analyzing audio... 🎧"):
57
- audio_path = os.path.join(temp_dir, "audio.wav")
58
- audio_utils.extract_audio_for_whisper(str(video_path), audio_path)
59
- transcriber = WhisperAudioTranscriber(model_size="base")
60
- document = transcriber.transcribe(audio_path)
 
 
 
 
 
 
 
 
 
 
61
 
62
- st.session_state.transcribed_doc = document.to_dict()
63
- persisted_path = get_path(f"input.mp4")
64
- shutil.copy(video_path, persisted_path)
65
- st.session_state.video_path = persisted_path
66
-
67
- st.session_state.audio_being_analyzed = False
68
- go_to_step(2)
69
- st.rerun()
70
  except Exception as e:
 
 
71
  handle_unexpected_exception(e)
 
3
  import tempfile
4
  import shutil
5
  from pathlib import Path
6
+ import subprocess
7
+ import json
8
+ from file_manager import get_path, get_session_dir
9
  import pycaps.video.render.audio_utils as audio_utils
10
+ from pycaps import WhisperAudioTranscriber, GoogleAudioTranscriber
11
  from utils import go_to_step, acquire_lock_slot, handle_unexpected_exception
12
+ from config import MAX_VIDEO_SIZE, MAX_VIDEO_DURATION, MAX_CONCURRENT_JOBS, SUPPORTED_LANGUAGES
13
+
14
def get_video_duration(video_path: str) -> float:
    """Return the duration of a video in seconds, as reported by ffprobe.

    Runs ``ffprobe -show_format`` with JSON output and reads the
    ``format.duration`` field from the result.

    Args:
        video_path: Path of the video file to inspect.

    Returns:
        The duration in seconds, or ``-1`` when it could not be determined
        (ffprobe missing or exiting with an error, unparsable output, or a
        missing / non-numeric duration field). In the failure case an error
        message is also shown through ``st.error``.
    """
    cmd = [
        "ffprobe",
        "-v", "quiet",
        "-print_format", "json",
        "-show_format",
        str(video_path),
    ]
    try:
        result = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True,
        )
        data = json.loads(result.stdout)
        return float(data["format"]["duration"])
    except (
        subprocess.CalledProcessError,
        FileNotFoundError,
        KeyError,
        json.JSONDecodeError,
        # float() raises ValueError for a non-numeric duration string, and an
        # unexpected JSON shape (e.g. "format" not being an object) raises
        # TypeError; both must map to the -1 result instead of escaping.
        ValueError,
        TypeError,
    ) as e:
        st.error(f"Could not analyze video file to get duration. Error: {e}")
        return -1
30
+
31
def setup_google_credentials():
    """Make the Google service-account credentials available through the environment.

    The JSON credentials are read from the ``GOOGLE_JSON_CREDENTIALS``
    environment variable, written to a ``.json`` file inside the directory
    returned by ``get_session_dir()``, and the path of that file is stored in
    ``GOOGLE_APPLICATION_CREDENTIALS``.

    Returns:
        ``False`` when ``GOOGLE_JSON_CREDENTIALS`` is not set; ``True`` once
        ``GOOGLE_APPLICATION_CREDENTIALS`` points at an existing file.
    """
    if "GOOGLE_JSON_CREDENTIALS" not in os.environ:
        return False

    existing_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    if existing_path and os.path.isfile(existing_path):
        # Already configured, and the credentials file is still present.
        return True

    # Nothing configured yet, or the configured file has disappeared.
    # The environment variable is process-wide while the file is written into
    # a session directory, so a previously generated file may be gone
    # (NOTE(review): presumably when that session directory is cleaned up --
    # confirm against file_manager). Regenerate it rather than keep a
    # dangling path.
    with tempfile.NamedTemporaryFile(
        mode="w",
        delete=False,
        suffix=".json",
        encoding="utf-8",
        dir=get_session_dir(),
    ) as temp_file:
        temp_file.write(os.environ["GOOGLE_JSON_CREDENTIALS"])
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = temp_file.name
    return True
41
+
42
def get_transcriber_instance(language_key: str):
    """Select the transcriber to use for the given language.

    Google Speech-to-Text is preferred; if its credentials cannot be set up
    or creating the transcriber/client raises, a Whisper transcriber
    (``base`` model) is returned instead.

    Side effects: shows a ``st.warning`` describing which transcriber is in
    use and stores its display name in ``st.session_state.transcriber_used``.

    Args:
        language_key: A key of ``SUPPORTED_LANGUAGES``.

    Returns:
        A ``GoogleAudioTranscriber`` or a ``WhisperAudioTranscriber``.

    Raises:
        KeyError: If ``language_key`` is not in ``SUPPORTED_LANGUAGES``.
    """
    import logging

    google_lang_code, whisper_lang_code = SUPPORTED_LANGUAGES[language_key]
    try:
        if not setup_google_credentials():
            raise RuntimeError("Unable to setup google credentials")
        transcriber = GoogleAudioTranscriber(language=google_lang_code)
        # NOTE(review): presumably forces client creation so credential
        # problems surface here and trigger the fallback -- confirm in pycaps.
        transcriber._get_client()
        st.warning(
            "**Note:** This demo uses Google's faster transcriber. For the highest accuracy "
            "with Whisper, please check out the [GitHub repository](https://github.com/francozanardi/pycaps)."
        )
        st.session_state.transcriber_used = "Google Speech-to-Text V1"
        return transcriber
    except Exception:
        # Deliberate best-effort fallback, but keep the reason in the logs so
        # a misconfigured Google setup can be diagnosed.
        logging.getLogger(__name__).warning(
            "Google Speech-to-Text unavailable; falling back to Whisper",
            exc_info=True,
        )
        st.warning("Google Speech-to-Text not available, falling back to Whisper. Processing may be slower.")
        st.session_state.transcriber_used = "Whisper (base model)"
        return WhisperAudioTranscriber(model_size="base", language=whisper_lang_code)
65
+
66
 
67
  def render_step1():
68
  st.header("Upload Your Video")
69
 
70
  if st.session_state.active_jobs >= MAX_CONCURRENT_JOBS:
71
  st.warning("🚧 All our processing slots are currently busy. Please check back in a few minutes.")
72
+ st.info("Tip: You can also duplicate this space to get your own private and free, full-speed version instantly!")
73
  st.progress(1.0)
74
  if st.button("Refresh Status"):
75
  st.rerun()
 
77
 
78
  if 'audio_being_analyzed' not in st.session_state:
79
  st.session_state['audio_being_analyzed'] = False
80
+
81
+ st.info(f"For this demo, please upload a video shorter than **{MAX_VIDEO_DURATION} seconds**.")
82
+
83
+ col1, col2 = st.columns([2, 1])
84
 
85
+ with col1:
86
+ uploaded_file = st.file_uploader(
87
+ f"Select a video file (max {MAX_VIDEO_SIZE // (1024*1024)}MB)",
88
+ type=["mp4", "mov"],
89
+ key=f"uploader_{st.session_state.session_id}"
90
+ )
91
+
92
+ with col2:
93
+ selected_language_key = st.selectbox(
94
+ "Select Audio Language",
95
+ options=list(SUPPORTED_LANGUAGES.keys()),
96
+ key="language_selector"
97
+ )
98
 
99
  if not uploaded_file:
100
  return
 
103
  st.error(f"File is too large ({uploaded_file.size / (1024*1024):.1f}MB). Max is {MAX_VIDEO_SIZE // (1024*1024)}MB.")
104
  return
105
 
106
+ with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp_file:
107
+ tmp_file.write(uploaded_file.getvalue())
108
+ temp_video_path = tmp_file.name
109
+
110
+ duration = get_video_duration(temp_video_path)
111
+ if duration < 0:
112
+ os.remove(temp_video_path)
113
+ return
114
+
115
+ if duration > MAX_VIDEO_DURATION:
116
+ st.error(f"Video is too long ({duration:.1f}s). Max duration for the demo is {MAX_VIDEO_DURATION} seconds.")
117
+ os.remove(temp_video_path)
118
+ return
119
+
120
+ # If everything checks out, show the button
121
+ if st.button("Start Transcription", type="primary", disabled=st.session_state.audio_being_analyzed):
122
  lock_file = acquire_lock_slot()
123
  if not lock_file:
124
  st.error("Sorry, all slots were taken just now. Please try again.")
125
+ os.remove(temp_video_path)
126
  st.rerun()
127
 
128
  st.session_state.lock_file_path = lock_file
129
+ st.session_state.temp_video_path = temp_video_path
130
+ st.session_state.selected_language = selected_language_key
131
  st.session_state.audio_being_analyzed = True
132
  st.rerun()
133
 
134
  if st.session_state.audio_being_analyzed:
135
  try:
136
+ video_path = Path(st.session_state.temp_video_path)
137
+ language_key = st.session_state.selected_language
138
+ transcriber = get_transcriber_instance(language_key)
139
+
140
+ with st.spinner(f"Transcribing audio with {st.session_state.transcriber_used}... 🎧"):
141
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_audio:
142
+ audio_path = tmp_audio.name
143
+
144
+ audio_utils.extract_audio_for_whisper(str(video_path), audio_path)
145
+ document = transcriber.transcribe(audio_path)
146
+
147
+ st.session_state.transcribed_doc = document.to_dict()
148
+ persisted_path = get_path("input.mp4")
149
+ shutil.copy(video_path, persisted_path)
150
+ st.session_state.video_path = persisted_path
151
+
152
+ os.remove(video_path)
153
+ os.remove(audio_path)
154
+ del st.session_state.temp_video_path
155
+ del st.session_state.selected_language
156
 
157
+ st.session_state.audio_being_analyzed = False
158
+ go_to_step(2)
159
+ st.rerun()
160
+
 
 
 
 
161
  except Exception as e:
162
+ if "temp_video_path" in st.session_state and os.path.exists(st.session_state.temp_video_path):
163
+ os.remove(st.session_state.temp_video_path)
164
  handle_unexpected_exception(e)