aseelflihan committed on
Commit
126577b
·
verified ·
1 Parent(s): 9f7de57

Upload 15 files

Browse files
Files changed (15) hide show
  1. .env +1 -0
  2. .gitattributes +35 -35
  3. .gitignore +12 -0
  4. Dockerfile +34 -0
  5. README.md +20 -0
  6. app.py +386 -0
  7. audio_processor.py +253 -0
  8. mp3_embedder.py +323 -0
  9. package-lock.json +12 -0
  10. package.json +10 -0
  11. packages.txt +5 -0
  12. pyproject.toml +15 -0
  13. requirements.txt +9 -0
  14. utils.py +355 -0
  15. video_generator.py +33 -0
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ GEMINI_API_KEY=<your-gemini-api-key>
.gitattributes CHANGED
@@ -1,35 +1,35 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ignore environment files
2
+ .env
3
+
4
+ # Python
5
+ __pycache__/
6
+ *.py[cod]
7
+
8
+ # Virtual environments
9
+ .venv/
10
+
11
+ # Other
12
+ .DS_Store
Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#
# -- Dockerfile for Streamlit app --
#

# Base image
FROM python:3.9-slim

# Set working directory
WORKDIR /app

# Install system dependencies:
#   - build-essential: compiler toolchain for Python packages that build native code
#   - curl: required by the HEALTHCHECK instruction below
#   - ffmpeg: audio decoding/encoding used by the app
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements file first so the dependency layer is cached
# independently of application code changes
COPY requirements.txt ./requirements.txt

# Install Python dependencies
RUN pip install --no-cache-dir --upgrade pip
RUN pip install --no-cache-dir -r requirements.txt

# Copy the entire app
# NOTE(review): this also copies any local .env file into the image; consider
# a .dockerignore entry and supplying GEMINI_API_KEY at runtime instead.
COPY . .

# Expose the port that Streamlit runs on
EXPOSE 8501

# Add a health check; map any curl failure to exit status 1 (unhealthy)
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health || exit 1

# Command to run the app
ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: SyncMaster
3
+ emoji: 🚀
4
+ colorFrom: red
5
+ colorTo: red
6
+ sdk: docker
7
+ app_port: 8501
8
+ tags:
9
+ - streamlit
10
+ pinned: false
11
+ short_description: AI audio-text synchronization
12
+ license: mit
13
+ ---
14
+
15
+ # Welcome to Streamlit!
16
+
17
+ Edit `app.py` to customize this app to your heart's desire. :heart:
18
+
19
+ If you have any questions, check out our [documentation](https://docs.streamlit.io) and [community
20
+ forums](https://discuss.streamlit.io).
app.py ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import tempfile
4
+ import json
5
+ from pathlib import Path
6
+ import time
7
+ import traceback
8
+ import streamlit.components.v1 as components
9
+
10
# Import AudioProcessor defensively: if the import fails for any reason the
# app still starts, and main() shows the captured traceback to the user
# instead of crashing at import time.
AUDIO_PROCESSOR_CLASS = None
IMPORT_ERROR_TRACEBACK = None
try:
    from audio_processor import AudioProcessor
except Exception:
    # Keep the formatted traceback so it can be rendered in the UI later.
    IMPORT_ERROR_TRACEBACK = traceback.format_exc()
else:
    AUDIO_PROCESSOR_CLASS = AudioProcessor

from video_generator import VideoGenerator
from mp3_embedder import MP3Embedder
from utils import format_timestamp, validate_audio_file, get_audio_info

# Page configuration (title, icon and wide layout for the Streamlit page)
_PAGE_SETTINGS = {
    "page_title": "SyncMaster - AI Audio-Text Synchronization",
    "page_icon": "🎵",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)
28
+
29
+ # --- Function to log messages to the browser console ---
30
def log_to_browser_console(messages):
    """Inject a <script> block that prints backend messages in the browser console.

    Messages starting with '--- ERROR' go to console.error, messages starting
    with '--- WARNING' go to console.warn, and everything else to console.log.

    Args:
        messages: A single string, or an iterable of JSON-serialisable
            messages (normally strings).
    """
    if isinstance(messages, str):
        messages = [messages]

    # json.dumps produces a valid JavaScript literal (quotes, backslashes and
    # newlines are escaped). It does NOT neutralise "</script>", which would
    # terminate the surrounding <script> element early, so "</" is rewritten
    # to the equivalent JavaScript escape "<\/".
    js_literals = [json.dumps(msg).replace("</", "<\\/") for msg in messages]
    logs_array = ", ".join(js_literals)

    js_code = f"""
    <script>
    (function() {{
        const logs = [{logs_array}];
        console.group("Backend Logs from SyncMaster");
        logs.forEach(log => {{
            if (typeof log === 'string' && log.startsWith('--- ERROR')) {{
                console.error(log);
            }} else if (typeof log === 'string' && log.startsWith('--- WARNING')) {{
                console.warn(log);
            }} else {{
                console.log(log);
            }}
        }});
        console.groupEnd();
    }})();
    </script>
    """
    # height=0 keeps the helper iframe invisible; it exists only to run the script.
    components.html(js_code, height=0)
60
+
61
# Initialize session state: seed every key the wizard relies on, but only
# when it is missing so values survive Streamlit reruns. Factories are used
# so each session gets its own fresh default objects.
for _state_key, _default_factory in (
    ('step', lambda: 1),
    ('audio_file', lambda: None),
    ('transcription_data', lambda: None),
    ('edited_text', lambda: ""),
    ('video_style', lambda: {
        'animation_style': 'Karaoke Style',
        'text_color': '#FFFFFF',
        'highlight_color': '#FFD700',
        'background_color': '#000000',
        'font_family': 'Arial',
        'font_size': 48,
    }),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _default_factory()
79
+
80
# --- Compatibility shims for older Streamlit releases ---
import inspect as _st_inspect


def _without_unsupported_kwargs(widget, optional_kwargs):
    """Return *widget*, wrapped if needed so unsupported keyword args are dropped.

    Each name in *optional_kwargs* is checked independently against the
    widget's signature, so a release that accepts one of them but not another
    is handled correctly. A widget that was already wrapped is returned as-is,
    which prevents wrappers from stacking up each time the script reruns.
    """
    if getattr(widget, "_syncmaster_kwargs_filtered", False):
        return widget
    try:
        declared = _st_inspect.signature(widget).parameters
    except (TypeError, ValueError):
        # Signature not introspectable: leave the widget untouched.
        return widget
    unsupported = tuple(name for name in optional_kwargs if name not in declared)
    if not unsupported:
        return widget

    def _patched(*args, **kwargs):
        for name in unsupported:
            kwargs.pop(name, None)
        return widget(*args, **kwargs)

    _patched._syncmaster_kwargs_filtered = True
    return _patched


# st.divider is emulated with a markdown horizontal rule when it is missing.
if not hasattr(st, "divider"):
    def _divider():
        st.markdown("---")
    st.divider = _divider

# Patch st.button for Streamlit versions that don't support the 'type'
# and/or 'use_container_width' arguments.
st.button = _without_unsupported_kwargs(st.button, ("type", "use_container_width"))

# Fall back to the experimental rerun API when st.rerun is not available.
if not hasattr(st, "rerun") and hasattr(st, "experimental_rerun"):
    st.rerun = st.experimental_rerun

# Same treatment for st.download_button and 'use_container_width'.
if hasattr(st, "download_button"):
    st.download_button = _without_unsupported_kwargs(
        st.download_button, ("use_container_width",)
    )
111
+
112
def main():
    """Render the page header, the three-step progress banner and the active step.

    If AudioProcessor could not be imported, the captured traceback is shown
    and the script is stopped before any step is rendered.
    """
    st.title("🎵 SyncMaster")
    st.markdown("### The Intelligent Audio-Text Synchronization Platform")
    st.markdown("Transform your audio files into mobile-compatible MP3s with synchronized lyrics and animated MP4 videos.")

    upload_col, review_col, export_col = st.columns(3)
    with upload_col:
        step_1_banner = st.success if st.session_state.step >= 1 else st.info
        step_1_banner("Step 1: Upload & Process")
    with review_col:
        if st.session_state.step >= 2:
            st.success("Step 2: Review & Customize")
        elif st.session_state.step == 1:
            st.info("Step 2: Review & Customize")
    with export_col:
        if st.session_state.step >= 3:
            st.success("Step 3: Export")
        elif st.session_state.step >= 2:
            st.info("Step 3: Export")

    st.divider()

    # Without a working AudioProcessor nothing below can run: explain and stop.
    if AUDIO_PROCESSOR_CLASS is None:
        st.error("فشل حاسم: لم يتمكن التطبيق من بدء التشغيل بشكل صحيح.")
        st.subheader("حدث خطأ أثناء محاولة استيراد `AudioProcessor`:")
        st.code(IMPORT_ERROR_TRACEBACK, language="python")
        st.warning("السبب المحتمل: خطأ في الكود في ملف `audio_processor.py` أو مشكلة في الاتصال بـ Google Gemini.")
        st.stop()

    # Dispatch to the renderer of the current wizard step (unknown steps render nothing).
    step_renderers = {
        1: step_1_upload_and_process,
        2: step_2_review_and_customize,
        3: step_3_export,
    }
    render_step = step_renderers.get(st.session_state.step)
    if render_step is not None:
        render_step()
149
+
150
def step_1_upload_and_process():
    """Render step 1: the audio uploader, a preview and the processing trigger."""
    st.header("Step 1: Upload Your Audio File")

    upload = st.file_uploader(
        "Choose an audio file",
        type=['mp3', 'wav', 'm4a'],
        help="Supported formats: MP3, WAV, M4A",
    )

    if upload is not None:
        # Remember the upload so later steps can reach it.
        st.session_state.audio_file = upload
        st.success(f"File uploaded: {upload.name}")
        size_in_mb = upload.size / 1024 / 1024
        st.info(f"File size: {size_in_mb:.2f} MB")
        st.audio(upload)

        start_requested = st.button("🚀 Start AI Processing", type="primary", use_container_width=True)
        if start_requested:
            process_audio()

    # Nothing stored yet: there is no file to replace.
    if st.session_state.audio_file is None:
        return

    if st.button("🔄 Upload Different File"):
        reset_session()
        st.rerun()
172
+
173
def process_audio():
    """Transcribe the uploaded audio, collect word timestamps and advance to step 2.

    The upload is written to a temporary file (kept on success because the
    export step reads it via transcription_data['audio_path']). On any
    failure the temporary file is removed and the error is shown in the UI.
    """
    tmp_file_path = None
    try:
        suffix = Path(st.session_state.audio_file.name).suffix
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
            # Record the path before writing so a failed write is still cleaned up.
            tmp_file_path = tmp_file.name
            tmp_file.write(st.session_state.audio_file.getvalue())

        processor = AUDIO_PROCESSOR_CLASS()
        with st.spinner("🎤 Transcribing audio with AI..."):
            transcription_result = processor.transcribe_audio(tmp_file_path)

        # Check for an empty/None result first: the substring test would raise
        # TypeError on None.
        if not transcription_result or "Error:" in transcription_result:
            st.error(f"Transcription failed: {transcription_result}")
            os.unlink(tmp_file_path)
            return

        word_timestamps = []
        if hasattr(processor, 'get_word_timestamps'):
            try:
                with st.spinner("🔍 Extracting word timestamps..."):
                    word_timestamps = processor.get_word_timestamps(tmp_file_path)
                # Inspect the word_timestamps content and show it to the user
                st.write("word_timestamps sample:", word_timestamps[:3])
                if not word_timestamps:
                    st.warning("No word timestamps extracted! SYLT embedding will not work.")
            except Exception as e:
                # Timestamps are optional: continue with plain text only.
                st.warning(f"Could not extract word timestamps: {e}")

        st.session_state.transcription_data = {
            'text': transcription_result,
            'word_timestamps': word_timestamps,
            'audio_path': tmp_file_path
        }
        st.session_state.edited_text = transcription_result
        st.session_state.step = 2
    except Exception as e:
        st.error("An error occurred during processing!")
        st.exception(e)
        if tmp_file_path is not None and os.path.exists(tmp_file_path):
            os.unlink(tmp_file_path)
    else:
        # Runs only when processing finished without an early return. The
        # rerun is issued outside the try block so its control-flow signal can
        # never be treated as a processing failure (which would delete the
        # audio file the next steps depend on).
        st.success("🎉 Audio processing complete! Moving to customization...")
        time.sleep(1)
        st.rerun()
211
+
212
def step_2_review_and_customize():
    """Render step 2: the transcript editor and the video style controls.

    Edits are written straight back into st.session_state so they are
    available to the export step.
    """
    st.header("Step 2: Review & Customize")

    if st.session_state.transcription_data is None:
        st.error("No transcription data found. Please go back to Step 1.")
        if st.button("← Back to Step 1"):
            st.session_state.step = 1
            st.rerun()
        return

    col1, col2 = st.columns([3, 2])

    with col1:
        st.subheader("📝 Text Editor")
        edited_text = st.text_area(
            "Transcribed Text",
            value=st.session_state.edited_text,
            height=300
        )
        st.session_state.edited_text = edited_text
        st.caption(f"Word count: {len(edited_text.split())}")

    with col2:
        st.subheader("🎨 Video Style Customization")
        video_style = st.session_state.video_style
        animation_options = ["Karaoke Style", "Pop-up Word"]
        # Start from the previously saved choice (as the colour pickers do);
        # otherwise the selection silently reverts to the first option every
        # time the user comes back to this step.
        saved_animation = video_style.get('animation_style')
        saved_index = (
            animation_options.index(saved_animation)
            if saved_animation in animation_options
            else 0
        )
        video_style['animation_style'] = st.selectbox(
            "Animation Style", animation_options, index=saved_index
        )
        video_style['text_color'] = st.color_picker("Text Color", video_style['text_color'])
        video_style['highlight_color'] = st.color_picker("Highlight Color", video_style['highlight_color'])

    col1, col2, col3 = st.columns([1, 2, 1])
    with col1:
        if st.button("← Back to Upload"):
            st.session_state.step = 1
            st.rerun()
    with col3:
        if st.button("Continue to Export →", type="primary"):
            st.session_state.step = 3
            st.rerun()
249
+
250
def step_3_export():
    """Render step 3: the MP3 / MP4 export actions and the navigation buttons."""
    st.header("Step 3: Export Your Synchronized Media")

    # Guard: exporting requires a finished transcription.
    if st.session_state.transcription_data is None:
        st.error("No data found. Please go back to Step 1.")
        if st.button("← Back to Step 1"):
            st.session_state.step = 1
            st.rerun()
        return

    mp3_col, mp4_col = st.columns(2)

    with mp3_col:
        st.subheader("🎵 MP3 Export")
        st.markdown("Export MP3 with embedded synchronized lyrics (SYLT).")
        mp3_requested = st.button("📱 Export MP3 with Lyrics", type="primary", use_container_width=True)
        if mp3_requested:
            export_mp3()

    with mp4_col:
        st.subheader("🎬 MP4 Video Export")
        st.markdown("Create an animated video with synchronized text.")
        mp4_requested = st.button("🎥 Generate Video Summary", type="primary", use_container_width=True)
        if mp4_requested:
            export_mp4()

    st.divider()

    # Navigation row; the wide middle column is only a spacer.
    back_col, _spacer_col, restart_col = st.columns([1, 2, 1])
    with back_col:
        if st.button("← Back to Customize"):
            st.session_state.step = 2
            st.rerun()
    with restart_col:
        if st.button("🔄 Start Over"):
            reset_session()
            st.rerun()
284
+
285
def export_mp3():
    """Export MP3 file and log diagnostics to the browser console and Streamlit UI.

    Embeds the edited lyrics into the processed audio via MP3Embedder, shows
    the embedder's log messages, inspects the resulting ID3 tags and offers
    the file for download. Any exception is reported in the UI and its
    traceback is forwarded to the browser console.
    """
    try:
        with st.spinner("Embedding lyrics into MP3..."):
            embedder = MP3Embedder()
            word_timestamps = st.session_state.transcription_data['word_timestamps']
            audio_path = st.session_state.transcription_data['audio_path']
            output_filename = f"synced_{Path(st.session_state.audio_file.name).stem}.mp3"
            st.info("🔄 بدء عملية دمج النصوص...")
            output_path, log_messages = embedder.embed_sylt_lyrics(
                audio_path,
                word_timestamps,
                st.session_state.edited_text,
                output_filename
            )

        log_to_browser_console(log_messages)
        # Show the logs in Streamlit
        st.subheader("📝 تفاصيل العملية:")
        for entry in log_messages:
            if "ERROR" in entry:
                st.error(entry)
            elif "WARNING" in entry:
                st.warning(entry)
            else:
                st.info(entry)

        st.subheader("✅ Export Complete")

        if os.path.exists(output_path):
            with open(output_path, 'rb') as audio_file:
                audio_bytes = audio_file.read()
            st.audio(audio_bytes, format='audio/mp3')

            # --- Inspect the tags right after embedding ---
            from mutagen.mp3 import MP3
            from mutagen.id3 import ID3
            audio_file_obj = MP3(output_path, ID3=ID3)
            sylt_frames = audio_file_obj.tags.getall('SYLT') if audio_file_obj.tags else []
            uslt_frames = audio_file_obj.tags.getall('USLT') if audio_file_obj.tags else []
            st.write(f"SYLT frames after export: {len(sylt_frames)}")
            st.write(f"USLT frames after export: {len(uslt_frames)}")
            if sylt_frames:
                st.write("SYLT frame sample:", sylt_frames[0])
            if uslt_frames:
                st.write("USLT frame sample:", uslt_frames[0])
            # --- End of inspection ---

            verification = embedder.verify_sylt_embedding(output_path)
            st.json(verification)
            if verification['has_sylt']:
                st.success(f"Successfully embedded {verification['sylt_entries']} synchronized words!")
            else:
                st.warning("Warning: Could not verify SYLT embedding. The lyrics may not be synchronized.")
            st.download_button(
                label="Download Synced MP3",
                data=audio_bytes,
                file_name=output_filename,
                mime="audio/mpeg",
                use_container_width=True
            )
        else:
            st.error("Failed to create the MP3 file. Check the browser console for logs.")

    except Exception as e:
        st.error(f"An error occurred during MP3 export: {e}")
        log_to_browser_console([f"--- FATAL ERROR in export_mp3: {traceback.format_exc()} ---"])
351
+
352
def export_mp4():
    """Placeholder for MP4 export: only informs the user that it is not available yet."""
    not_implemented_notice = "MP4 export functionality is not yet implemented with console logging."
    st.info(not_implemented_notice)
354
+
355
def get_audio_duration_seconds(audio_path: str) -> float:
    """Return the audio duration in seconds, or 0 when it cannot be determined.

    Args:
        audio_path: Path of the audio file passed on to get_audio_info.

    Returns:
        The 'duration' entry reported by get_audio_info; 0 if that entry is
        missing or if reading the audio information fails.
    """
    try:
        audio_info = get_audio_info(audio_path)
        return audio_info.get('duration', 0)
    except Exception:
        # Best effort: any failure means "unknown duration". Unlike a bare
        # except, this lets KeyboardInterrupt/SystemExit propagate.
        return 0
361
+
362
def get_audio_duration_formatted(audio_path: str) -> str:
    """Return the audio duration rendered as 'M:SS' (seconds are zero-padded)."""
    # divmod on the raw value yields the same floor-quotient/remainder pair as
    # the separate // and % operations.
    whole_minutes, leftover_seconds = divmod(get_audio_duration_seconds(audio_path), 60)
    return f"{int(whole_minutes)}:{int(leftover_seconds):02d}"
367
+
368
def reset_session():
    """Wipe the Streamlit session and restore every wizard default."""
    # Collect the keys first: the mapping must not change size while it is iterated.
    stale_keys = [name for name in st.session_state.keys() if name != 'step']
    for name in stale_keys:
        del st.session_state[name]

    default_video_style = {
        'animation_style': 'Karaoke Style',
        'text_color': '#FFFFFF',
        'highlight_color': '#FFD700',
        'background_color': '#000000',
        'font_family': 'Arial',
        'font_size': 48,
    }

    st.session_state.step = 1
    st.session_state.audio_file = None
    st.session_state.transcription_data = None
    st.session_state.edited_text = ""
    st.session_state.video_style = default_video_style
384
+
385
+ if __name__ == "__main__":
386
+ main()
audio_processor.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ import tempfile
4
+ from typing import List, Dict, Optional
5
+ import json
6
+ import librosa
7
+ import numpy as np
8
+ from google import genai
9
+ from google.genai import types
10
+
11
class AudioProcessor:
    """Handles audio transcription and word-level timestamp extraction using Gemini AI"""

    # Silence reserved at the start and at the end of the audio when words
    # are spread across its duration (seconds).
    _EDGE_MARGIN_SECONDS = 0.5
    # Small pause inserted before every word except the first (seconds).
    _WORD_GAP_SECONDS = 0.05

    def __init__(self):
        """Initialize the audio processor with Gemini client"""
        self.client = None
        self._initialize_gemini()

    def _initialize_gemini(self):
        """Create the Gemini client from GEMINI_API_KEY.

        On any failure a warning is printed and self.client stays None, which
        makes transcribe_audio fall back to placeholder text.
        """
        try:
            # Load environment variables from a .env file if present
            load_dotenv()

            # Obtain API key from environment variables
            api_key = os.getenv("GEMINI_API_KEY")

            if not api_key:
                raise ValueError("GEMINI_API_KEY not found in environment variables. Please set it in a .env file.")

            self.client = genai.Client(api_key=api_key)
        except Exception as e:
            print(f"Warning: Failed to initialize Gemini client: {str(e)}")
            self.client = None

    def transcribe_audio(self, audio_file_path: str) -> Optional[str]:
        """
        Transcribe audio file to text using Gemini AI

        Args:
            audio_file_path: Path to the audio file

        Returns:
            The transcribed text. When the Gemini client is unavailable, the
            response is empty, or any error occurs (including a missing
            file), a placeholder sentence asking the user to edit the text
            is returned instead; errors are printed, not raised.
        """
        try:
            if not os.path.exists(audio_file_path):
                raise FileNotFoundError(f"Audio file not found: {audio_file_path}")

            if not self.client:
                # Fallback to sample text if Gemini is not available
                return "Please edit this text to match your audio content. Gemini transcription is not available."

            # Read audio file as bytes
            with open(audio_file_path, 'rb') as f:
                audio_bytes = f.read()

            # Determine MIME type based on file extension (default: audio/mpeg)
            file_ext = os.path.splitext(audio_file_path)[1].lower()
            mime_type_map = {
                '.mp3': 'audio/mpeg',
                '.wav': 'audio/wav',
                '.m4a': 'audio/mp4',
                '.flac': 'audio/flac',
                '.ogg': 'audio/ogg'
            }
            mime_type = mime_type_map.get(file_ext, 'audio/mpeg')

            # Transcribe with Gemini
            response = self.client.models.generate_content(
                model="gemini-2.5-flash",
                contents=[
                    types.Part.from_bytes(
                        data=audio_bytes,
                        mime_type=mime_type,
                    ),
                    "Please transcribe this audio file accurately. Provide only the spoken text without any additional commentary, formatting, or explanations. Just return the pure transcribed text."
                ],
            )

            if response and response.text:
                return response.text.strip()
            return "Please edit this text to match your audio content. Transcription failed."

        except Exception as e:
            print(f"Error transcribing audio: {str(e)}")
            return "Please edit this text to match your audio content. An error occurred during transcription."

    def get_word_timestamps(self, audio_file_path: str,
                            transcription: Optional[str] = None) -> List[Dict]:
        """
        Create word-level timestamps from transcribed text and audio duration

        The timestamps are estimated, not measured: words are distributed
        evenly across the audio duration, leaving a margin of silence at both
        ends and a small gap before every word after the first.

        Args:
            audio_file_path: Path to the audio file
            transcription: Optional text to use instead of transcribing the
                file again. When None, transcribe_audio is called.

        Returns:
            List of dictionaries with word, start, and end timestamps
            (seconds, rounded to milliseconds). Empty list when there is no
            text, the duration cannot be determined, or an error occurs.
        """
        try:
            if not os.path.exists(audio_file_path):
                raise FileNotFoundError(f"Audio file not found: {audio_file_path}")

            # Get the transcription unless the caller already has one
            if transcription is None:
                transcription = self.transcribe_audio(audio_file_path)
            if not transcription:
                return []

            # Get audio duration
            audio_duration = self.get_audio_duration(audio_file_path)
            if audio_duration <= 0:
                return []

            # Split transcription into words
            words = transcription.split()
            if not words:
                return []

            total_words = len(words)

            # Leave some silence at the beginning and end. For clips too short
            # to hold both full margins, shrink them so the usable window
            # stays positive instead of producing inverted timestamps.
            full_margin = self._EDGE_MARGIN_SECONDS
            if audio_duration > 2 * full_margin:
                margin = full_margin
            else:
                margin = audio_duration / 4
            usable_duration = audio_duration - margin - margin
            word_duration = usable_duration / total_words

            # The gap must stay shorter than a word slot, otherwise a word
            # would start after it ends.
            if word_duration > self._WORD_GAP_SECONDS:
                gap = self._WORD_GAP_SECONDS
            else:
                gap = word_duration / 2

            word_timestamps = []
            for i, word in enumerate(words):
                if total_words == 1:
                    start_time = margin
                    end_time = audio_duration - margin
                else:
                    start_time = margin + (i * word_duration)
                    end_time = margin + ((i + 1) * word_duration)
                    if i > 0:
                        # Small gap between words
                        start_time += gap

                word_timestamps.append({
                    'word': word.strip(),
                    'start': round(start_time, 3),
                    'end': round(end_time, 3)
                })

            return word_timestamps

        except Exception as e:
            print(f"Error creating word timestamps: {str(e)}")
            return []

    def get_audio_duration(self, audio_file_path: str) -> float:
        """
        Get the duration of the audio file in seconds

        Args:
            audio_file_path: Path to the audio file

        Returns:
            Duration in seconds, or 0.0 if the file cannot be loaded
        """
        try:
            audio_data, sample_rate = librosa.load(audio_file_path)
            return len(audio_data) / sample_rate
        except Exception as e:
            print(f"Error getting audio duration: {str(e)}")
            return 0.0

    def validate_timestamps(self, word_timestamps: List[Dict], audio_duration: float) -> List[Dict]:
        """
        Validate and clean word timestamps

        Start times are clamped to be non-negative, end times are capped at
        the audio duration, an end that is not after its start is replaced by
        start + 0.1, and entries whose word is empty after stripping are
        dropped.

        Args:
            word_timestamps: List of word timestamp dictionaries
            audio_duration: Total duration of audio in seconds

        Returns:
            Cleaned list of word timestamps
        """
        cleaned_timestamps = []

        for word_data in word_timestamps:
            # Ensure start and end times are valid
            start_time = max(0, word_data.get('start', 0))
            end_time = min(audio_duration, word_data.get('end', start_time + 0.1))

            # Ensure end time is after start time
            if end_time <= start_time:
                end_time = start_time + 0.1

            cleaned_word = {
                'word': word_data.get('word', '').strip(),
                'start': round(start_time, 3),
                'end': round(end_time, 3)
            }

            if cleaned_word['word']:
                cleaned_timestamps.append(cleaned_word)

        return cleaned_timestamps

    @staticmethod
    def _build_sentence(sentence_words: List[Dict]) -> Dict:
        """Build one sentence/line entry from a non-empty list of word entries."""
        return {
            'text': ' '.join(w.get('word', '') for w in sentence_words).strip(),
            'start': sentence_words[0].get('start', 0),
            'end': sentence_words[-1].get('end', 0),
            # Copy so later changes to the caller's buffer do not leak in.
            'words': list(sentence_words)
        }

    def create_sentence_timestamps(self, word_timestamps: List[Dict], max_words_per_line: int = 8) -> List[Dict]:
        """
        Group words into sentences/lines for better video display

        A line is closed when it reaches max_words_per_line words or when a
        word ends with '.', '!' or '?'.

        Args:
            word_timestamps: List of word timestamp dictionaries
            max_words_per_line: Maximum words per line

        Returns:
            List of sentence/line dictionaries with 'text', 'start', 'end'
            and 'words' keys
        """
        if not word_timestamps:
            return []

        sentences = []
        current_sentence = []

        for word_data in word_timestamps:
            current_sentence.append(word_data)

            # Check if we should end this sentence
            word = word_data.get('word', '')
            line_is_full = len(current_sentence) >= max_words_per_line
            if line_is_full or word.endswith(('.', '!', '?')):
                sentences.append(self._build_sentence(current_sentence))
                current_sentence = []

        # Add remaining words as final sentence
        if current_sentence:
            sentences.append(self._build_sentence(current_sentence))

        return sentences
mp3_embedder.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mutagen.mp3 import MP3
2
+ from mutagen.id3 import ID3, SYLT, USLT, Encoding
3
+ import os
4
+ import tempfile
5
+ import shutil
6
+ import subprocess
7
+ from typing import List, Dict, Tuple
8
+
9
+ # --- Helper function to check for ffmpeg ---
10
def is_ffmpeg_available():
    """Report whether an ``ffmpeg`` executable can be located on the PATH."""
    ffmpeg_location = shutil.which("ffmpeg")
    return ffmpeg_location is not None
13
+
14
class MP3Embedder:
    """Embed synchronized (SYLT) and unsynchronized (USLT) lyrics into MP3 files.

    Every instance works inside its own scratch directory (``self.temp_dir``),
    which is removed again when the instance is garbage-collected.
    """

    # Number of words grouped into one line of an exported LRC file.
    _LRC_WORDS_PER_LINE = 8

    def __init__(self):
        """Create the scratch directory and record whether ffmpeg is usable."""
        # A unique directory per instance: with a single fixed path, the
        # __del__ of one instance would delete files another instance had
        # just produced.
        self.temp_dir = tempfile.mkdtemp(prefix="audio_sync_")

        self.ffmpeg_available = is_ffmpeg_available()

    def embed_sylt_lyrics(self, audio_path: str, word_timestamps: List[Dict],
                          text: str, output_filename: str) -> Tuple[str, List[str]]:
        """
        Embeds SYLT synchronized lyrics into an MP3 file and returns logs.

        The input is first copied (or converted with ffmpeg when it is not an
        MP3) into the scratch directory; the tags are written to that copy.
        Failures are logged rather than raised, so the returned path may point
        to an untagged (or, if the copy itself failed, missing) file.

        Args:
            audio_path: Path to the source audio file
            word_timestamps: List of word timestamp dictionaries
            text: Full lyrics text stored in the USLT frame
            output_filename: File name for the result inside the scratch
                directory (only its final path component is used)

        Returns:
            A tuple containing:
            - The path to the output MP3 file.
            - A list of log messages detailing the process.
        """
        log_messages = []

        def log_and_print(message):
            log_messages.append(message)
            print(f"MP3_EMBEDDER: {message}")

        log_and_print(f"--- MP3Embedder initialized. ffmpeg available: {self.ffmpeg_available} ---")
        log_and_print(f"--- Starting SYLT embedding for: {os.path.basename(audio_path)} ---")
        # Only the final path component is used so the output stays in temp_dir.
        output_path = os.path.join(self.temp_dir, os.path.basename(output_filename))
        try:
            # The scratch directory may have been removed since __init__ ran.
            os.makedirs(self.temp_dir, exist_ok=True)

            # --- Step 1: Ensure the file is in MP3 format ---
            self._prepare_mp3_copy(audio_path, output_path, log_and_print)

            # --- Step 2: Create SYLT data ---
            log_and_print("--- Creating SYLT data from timestamps... ---")
            sylt_data = self._create_sylt_data(word_timestamps)
            if not sylt_data:
                log_and_print("--- WARNING: No SYLT data could be created. Skipping embedding. ---")
                return output_path, log_messages

            log_and_print(f"--- Created {len(sylt_data)} SYLT entries. ---")

            # --- Step 3: Embed data into the MP3 file ---
            try:
                log_and_print("--- Loading MP3 file with mutagen... ---")
                audio_file = MP3(output_path, ID3=ID3)

                if audio_file.tags is None:
                    log_and_print("--- No ID3 tags found. Creating new ones. ---")
                    audio_file.add_tags()

                # --- Embed SYLT (Synchronized Lyrics) ---
                log_and_print("--- Creating and adding SYLT frame... ---")
                sylt_frame = SYLT(
                    encoding=Encoding.UTF8,
                    lang='eng',
                    format=2,  # timestamps are expressed in milliseconds
                    type=1,    # content type: lyrics
                    text=sylt_data
                )
                audio_file.tags.delall('SYLT')
                audio_file.tags.add(sylt_frame)

                # --- Embed USLT (Unsynchronized Lyrics) as a fallback ---
                log_and_print("--- Creating and adding USLT frame... ---")
                uslt_frame = USLT(
                    encoding=Encoding.UTF8,
                    lang='eng',
                    desc='',
                    text=text
                )
                audio_file.tags.delall('USLT')
                audio_file.tags.add(uslt_frame)

                audio_file.save()
                log_and_print("--- Successfully embedded SYLT and USLT frames. ---")

            except Exception as e:
                # Deliberately best-effort: the caller still gets the audio copy.
                log_and_print(f"--- ERROR: Failed to embed SYLT/USLT: {e} ---")
            return output_path, log_messages

        except Exception as e:
            log_and_print(f"--- ERROR: Unexpected error in embed_sylt_lyrics: {e} ---")
            return output_path, log_messages

    def _prepare_mp3_copy(self, audio_path: str, output_path: str, log) -> None:
        """Place an MP3 version of ``audio_path`` at ``output_path``.

        MP3 inputs are copied. Other inputs are converted with ffmpeg when it
        is available; if ffmpeg is missing or fails, the original file is
        copied unchanged as a fallback.
        """
        if audio_path.lower().endswith('.mp3'):
            log("--- Audio is already MP3. Copying to temporary location. ---")
            shutil.copy2(audio_path, output_path)
            return

        if not self.ffmpeg_available:
            log("--- WARNING: ffmpeg is not available. Cannot convert non-MP3 file. Copying directly. ---")
            shutil.copy2(audio_path, output_path)
            return

        log(f"'{os.path.basename(audio_path)}' is not an MP3. Converting with ffmpeg...")
        try:
            subprocess.run(
                # '-y' overwrites an existing output and stdin is closed, so
                # ffmpeg can never block waiting on an interactive prompt.
                ['ffmpeg', '-y', '-i', audio_path, '-codec:a', 'libmp3lame', '-q:a', '2', output_path],
                check=True, capture_output=True, text=True, stdin=subprocess.DEVNULL
            )
            log("--- ffmpeg conversion successful. ---")
        except subprocess.CalledProcessError as e:
            log("--- ERROR: ffmpeg conversion failed. ---")
            log(f"--- ffmpeg stderr: {e.stderr} ---")
            log("--- Fallback: Copying original file without conversion. ---")
            shutil.copy2(audio_path, output_path)

    @staticmethod
    def _to_milliseconds(seconds) -> int:
        """Convert seconds to a non-negative whole number of milliseconds.

        The value is rounded rather than truncated so that a float landing
        just below a whole millisecond is not shifted one millisecond early.
        Raises TypeError/ValueError/OverflowError for non-numeric, NaN or
        infinite input.
        """
        return max(0, int(round(float(seconds) * 1000)))

    @staticmethod
    def _group_words(word_timestamps: List[Dict], words_per_line: int) -> List[List[Dict]]:
        """Split the word list into consecutive groups of ``words_per_line``."""
        words = list(word_timestamps or [])
        step = max(1, words_per_line)
        return [words[i:i + step] for i in range(0, len(words), step)]

    def _create_sylt_data(self, word_timestamps: List[Dict]) -> List[tuple]:
        """
        Create SYLT data format from word timestamps

        Entries with an empty word are dropped. A malformed entry (missing or
        non-numeric start time, non-dict item) is reported and skipped instead
        of discarding the whole transcript.

        Args:
            word_timestamps: List of word timestamp dictionaries

        Returns:
            List of tuples (text, timestamp_in_milliseconds)
        """
        sylt_data = []

        for word_data in word_timestamps or []:
            try:
                word = str(word_data.get('word', '') or '').strip()
                timestamp_ms = self._to_milliseconds(word_data.get('start', 0))
            except (AttributeError, TypeError, ValueError, OverflowError) as e:
                print(f"Error creating SYLT data: {str(e)}")
                continue

            if word:
                sylt_data.append((word, timestamp_ms))

        return sylt_data

    def _create_line_based_sylt_data(self, word_timestamps: List[Dict], max_words_per_line: int = 6) -> List[tuple]:
        """
        Create line-based SYLT data (alternative approach)

        Words are grouped into consecutive lines of ``max_words_per_line``
        words; each line is stamped with the start time of its first word.

        Args:
            word_timestamps: List of word timestamp dictionaries
            max_words_per_line: Maximum words per line

        Returns:
            List of tuples (line_text, timestamp_in_milliseconds)
        """
        sylt_data = []

        for group in self._group_words(word_timestamps, max_words_per_line):
            try:
                line_text = ' '.join([w.get('word', '') for w in group]).strip()
                timestamp_ms = self._to_milliseconds(group[0].get('start', 0))
            except (AttributeError, TypeError, ValueError, OverflowError) as e:
                print(f"Error creating line-based SYLT data: {str(e)}")
                continue

            if line_text:
                sylt_data.append((line_text, timestamp_ms))

        return sylt_data

    def verify_sylt_embedding(self, mp3_path: str) -> Dict:
        """
        Verify that SYLT lyrics are properly embedded

        Args:
            mp3_path: Path to the MP3 file

        Returns:
            Dictionary with the keys 'has_sylt', 'has_uslt', 'sylt_entries'
            (entry count of the first SYLT frame) and 'error' (None, or the
            message of the exception raised while reading the file)
        """
        result = {
            'has_sylt': False,
            'has_uslt': False,
            'sylt_entries': 0,
            'error': None
        }
        try:
            audio_file = MP3(mp3_path)

            if audio_file.tags:
                # Check for SYLT
                sylt_frames = audio_file.tags.getall('SYLT')
                if sylt_frames:
                    result['has_sylt'] = True
                    result['sylt_entries'] = len(sylt_frames[0].text) if sylt_frames[0].text else 0

                # Check for USLT (fallback)
                if audio_file.tags.getall('USLT'):
                    result['has_uslt'] = True

            return result

        except Exception as e:
            return {
                'has_sylt': False,
                'has_uslt': False,
                'sylt_entries': 0,
                'error': str(e)
            }

    def extract_sylt_lyrics(self, mp3_path: str) -> List[Dict]:
        """
        Extract SYLT lyrics from an MP3 file (for debugging)

        Args:
            mp3_path: Path to the MP3 file

        Returns:
            List of dictionaries with 'text' and 'timestamp' (in seconds);
            empty when the file cannot be read or has no SYLT frames
        """
        try:
            audio_file = MP3(mp3_path)
            lyrics_data = []

            if audio_file.tags:
                for frame in audio_file.tags.getall('SYLT'):
                    if not frame.text:
                        continue
                    for text, timestamp_ms in frame.text:
                        lyrics_data.append({
                            'text': text,
                            'timestamp': timestamp_ms / 1000.0  # Convert to seconds
                        })

            return lyrics_data

        except Exception as e:
            print(f"Error extracting SYLT lyrics: {str(e)}")
            return []

    @staticmethod
    def _format_lrc_timestamp(seconds) -> str:
        """Format seconds as an LRC ``[mm:ss.xx]`` tag.

        The value is rounded to whole centiseconds first and then split into
        minutes and seconds, so 59.996 becomes ``[01:00.00]`` and never the
        invalid ``[00:60.00]``.
        """
        total_centiseconds = max(0, int(round(float(seconds) * 100)))
        minutes, centiseconds = divmod(total_centiseconds, 6000)
        return f"[{minutes:02d}:{centiseconds // 100:02d}.{centiseconds % 100:02d}]"

    def create_lrc_file(self, word_timestamps: List[Dict], output_path: str) -> str:
        """
        Create an LRC (lyrics) file as an additional export option

        Words are grouped eight to a line; each line is prefixed with the
        start time of its first word.

        Args:
            word_timestamps: List of word timestamp dictionaries
            output_path: Path for the output LRC file

        Returns:
            Path to the created LRC file

        Raises:
            Exception: If the timestamps are malformed or the file cannot be
                written (the original error is chained as the cause)
        """
        try:
            lrc_lines = []

            for group in self._group_words(word_timestamps, self._LRC_WORDS_PER_LINE):
                line_text = ' '.join([w.get('word', '') for w in group])
                timestamp_str = self._format_lrc_timestamp(group[0].get('start', 0))
                lrc_lines.append(f"{timestamp_str}{line_text}")

            # Write LRC file
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write('\n'.join(lrc_lines))

            return output_path

        except Exception as e:
            raise Exception(f"Error creating LRC file: {str(e)}") from e

    def __del__(self):
        """Clean up temporary files"""
        temp_dir = getattr(self, 'temp_dir', None)
        try:
            if temp_dir and os.path.isdir(temp_dir):
                shutil.rmtree(temp_dir, ignore_errors=True)
        except Exception:
            # During interpreter shutdown os/shutil may already be torn down;
            # cleanup is best-effort and must never raise from a finalizer.
            pass
package-lock.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "syncmaster",
3
+ "version": "0.1.0",
4
+ "lockfileVersion": 3,
5
+ "requires": true,
6
+ "packages": {
7
+ "": {
8
+ "name": "syncmaster",
9
+ "version": "0.1.0"
10
+ }
11
+ }
12
+ }
package.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "syncmaster",
3
+ "version": "0.1.0",
4
+ "private": true,
5
+ "description": "AI Audio-Text Synchronization Platform – convenience wrapper for Streamlit dev server",
6
+ "scripts": {
7
+ "dev": "streamlit run app.py --server.port 5050 --server.address localhost"
8
+ }
9
+
10
+ }
packages.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ ffmpeg
2
+ libavcodec-extra
3
+ libavformat-dev
4
+ libavutil-dev
5
+ libmp3lame0
pyproject.toml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "repl-nix-workspace"
3
+ version = "0.1.0"
4
+ description = "AI Audio-Text Synchronization Platform"
5
+ requires-python = ">=3.11"
6
+ dependencies = [
7
+ "google-genai>=1.23.0",
8
+ "librosa>=0.11.0",
9
+ "moviepy>=2.2.1",
10
+ "mutagen>=1.47.0",
11
+ "numpy>=2.2.6",
12
+ "openai>=1.93.0",
13
+ "sift-stack-py>=0.7.0",
14
+ "streamlit>=1.46.1",
15
+ ]
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ google-genai==1.23.0
2
+ librosa==0.11.0
3
+ moviepy==2.2.1
4
+ mutagen==1.47.0
5
+ numpy==1.26.4
6
+ openai==1.93.0
7
+ streamlit==1.39.0
8
+ altair==5.0.1
9
+ python-dotenv==1.0.1
utils.py ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import mimetypes
3
+ import tempfile
4
+ from pathlib import Path
5
+ from typing import Optional, List, Dict
6
+ import librosa
7
+ import numpy as np
8
+
9
def format_timestamp(seconds: float) -> str:
    """
    Format seconds into MM:SS.mmm format

    The value is rounded to whole milliseconds before it is split into
    minutes and seconds, so a value such as 59.9996 is rendered as
    "01:00.000" rather than the invalid "00:60.000".

    Args:
        seconds: Time in seconds

    Returns:
        Formatted timestamp string
    """
    total_ms = int(round(seconds * 1000))
    minutes, remaining_ms = divmod(total_ms, 60_000)
    return f"{minutes:02d}:{remaining_ms // 1000:02d}.{remaining_ms % 1000:03d}"
22
+
23
def validate_audio_file(file_path: str) -> bool:
    """
    Validate if the file is a supported audio format

    The cheap checks run first (existence, extension, guessed MIME type);
    only then is the first second decoded with librosa to confirm the file
    really contains audio.

    Args:
        file_path: Path to the audio file

    Returns:
        True if valid, False otherwise
    """
    supported_extensions = {'.mp3', '.wav', '.m4a', '.flac', '.ogg'}

    try:
        if not os.path.isfile(file_path):
            return False

        # Check file extension
        if Path(file_path).suffix.lower() not in supported_extensions:
            return False

        # Check MIME type (an unknown type is not treated as a failure)
        mime_type, _ = mimetypes.guess_type(file_path)
        if mime_type and not mime_type.startswith('audio/'):
            return False

        # Decode just 1 second to verify it's a valid audio file
        librosa.load(file_path, duration=1.0)
        return True

    except Exception:
        # Any failure means "not a usable audio file". Unlike a bare
        # `except:`, this lets KeyboardInterrupt/SystemExit propagate.
        return False
58
+
59
def get_audio_info(file_path: str) -> Dict:
    """
    Get information about the audio file

    The file is loaded at its native sample rate with all channels kept
    (``sr=None, mono=False``). With librosa's defaults the audio would be
    resampled to 22050 Hz and mixed down to mono, so 'sample_rate' and
    'channels' would never describe the actual file.

    Args:
        file_path: Path to the audio file

    Returns:
        Dictionary with 'duration' (seconds), 'sample_rate', 'channels',
        'file_size' (bytes) and 'format' (lower-cased extension). On failure
        the numeric fields are 0, 'format' is 'unknown' and 'error' holds
        the exception message.
    """
    try:
        # Load audio file without resampling or down-mixing
        y, sr = librosa.load(file_path, sr=None, mono=False)

        # Samples are on the last axis for both mono (n,) and multi-channel
        # (channels, n) arrays.
        duration = y.shape[-1] / sr

        return {
            'duration': duration,
            'sample_rate': sr,
            'channels': 1 if y.ndim == 1 else y.shape[0],
            'file_size': os.path.getsize(file_path),
            'format': Path(file_path).suffix.lower()
        }

    except Exception as e:
        return {
            'error': str(e),
            'duration': 0,
            'sample_rate': 0,
            'channels': 0,
            'file_size': 0,
            'format': 'unknown'
        }
92
+
93
def clean_text(text: str) -> str:
    """
    Normalize whitespace and strip common transcription markers

    Args:
        text: Input text

    Returns:
        Cleaned text ("" for empty or missing input)
    """
    if not text:
        return ""

    # Markers are removed in this fixed order, matched case-sensitively.
    artifacts = (
        '[Music]',
        '[Applause]',
        '[Laughter]',
        '(Music)',
        '(Applause)',
        '(Laughter)',
    )

    # Collapse whitespace runs before looking for markers.
    cleaned = ' '.join(text.split())

    for marker in artifacts:
        cleaned = cleaned.replace(marker, '')

    # Removing a marker can leave a double space behind; collapse again.
    cleaned = ' '.join(cleaned.split())

    return cleaned.strip()
121
+
122
def split_text_into_chunks(text: str, max_chars_per_chunk: int = 100) -> List[str]:
    """
    Split text into chunks suitable for video display

    Words are packed greedily: a chunk is closed as soon as adding the next
    word (plus its separating space) would exceed ``max_chars_per_chunk``.
    A single word longer than the limit is kept whole in its own chunk.

    Args:
        text: Input text
        max_chars_per_chunk: Maximum characters per chunk

    Returns:
        List of text chunks
    """
    if not text:
        return []

    chunks = []
    current_chunk = []
    current_length = 0  # length of ' '.join(current_chunk)

    for word in text.split():
        # A separating space is only needed when the chunk already has a word;
        # charging it to the first word would under-fill the first chunk by one.
        separator = 1 if current_chunk else 0
        candidate_length = current_length + separator + len(word)

        if current_chunk and candidate_length > max_chars_per_chunk:
            # Add current chunk and start new one
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)
            current_length = candidate_length

    # Add final chunk
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
158
+
159
def convert_color_hex_to_rgb(hex_color: str) -> tuple:
    """
    Convert hex color to RGB tuple

    Accepts six-digit values ('#FF0000', 'ff0000') and the three-digit
    shorthand ('#F00'). Anything else, including non-string input, yields
    the default white.

    Args:
        hex_color: Hex color string (e.g., '#FF0000')

    Returns:
        RGB tuple (r, g, b)
    """
    default = (255, 255, 255)  # Default to white

    if not isinstance(hex_color, str):
        return default

    digits = hex_color.strip().lstrip('#')

    # Expand shorthand: 'f80' -> 'ff8800'
    if len(digits) == 3:
        digits = ''.join(ch * 2 for ch in digits)

    # Check the characters explicitly: int(x, 16) alone also accepts signs
    # and surrounding whitespace, which would let '-1' through as a channel.
    if len(digits) != 6 or any(ch not in '0123456789abcdefABCDEF' for ch in digits):
        return default

    return tuple(int(digits[i:i + 2], 16) for i in (0, 2, 4))
181
+
182
def convert_rgb_to_hex(r: int, g: int, b: int) -> str:
    """
    Convert RGB values to hex color string

    Each channel is truncated to an integer and clamped to 0-255, so the
    result is always a well-formed '#rrggbb' string.

    Args:
        r, g, b: RGB color values (0-255)

    Returns:
        Hex color string
    """
    def clamp(channel):
        return max(0, min(255, int(channel)))

    return f"#{clamp(r):02x}{clamp(g):02x}{clamp(b):02x}"
193
+
194
def estimate_video_file_size(duration: float, resolution: tuple = (1280, 720),
                           bitrate_kbps: int = 2000) -> int:
    """
    Give a rough size estimate for a video of the given length

    The estimate depends only on bitrate and duration; ``resolution`` is
    accepted but does not enter the calculation.

    Args:
        duration: Video duration in seconds
        resolution: Video resolution tuple (width, height)
        bitrate_kbps: Video bitrate in kbps

    Returns:
        Estimated file size in bytes
    """
    bits_per_second = bitrate_kbps * 1000
    # Eight bits per byte; the fractional part of the result is dropped.
    total_bytes = (bits_per_second * duration) / 8
    return int(total_bytes)
210
+
211
def create_safe_filename(filename: str) -> str:
    """
    Turn an arbitrary string into a filename without reserved characters

    Args:
        filename: Original filename

    Returns:
        Safe filename ("output" when nothing usable is left)
    """
    import re

    # Swap the characters < > : " / \ | ? * for underscores.
    without_reserved = re.sub(r'[<>:"/\\|?*]', '_', filename)

    # Squash each run of underscores/whitespace into one underscore,
    # then drop underscores at either end.
    collapsed = re.sub(r'[_\s]+', '_', without_reserved).strip('_')

    return collapsed if collapsed else "output"
237
+
238
def format_file_size(size_bytes: int) -> str:
    """
    Format file size in human-readable format

    Uses binary multiples (1 KB = 1024 B) and two-decimal rounding. Sizes
    beyond the largest unit are expressed in that unit instead of raising,
    and negative sizes are rendered with a leading minus sign.

    Args:
        size_bytes: File size in bytes

    Returns:
        Formatted file size string
    """
    if size_bytes == 0:
        return "0 B"

    size_names = ("B", "KB", "MB", "GB", "TB", "PB")
    sign = "-" if size_bytes < 0 else ""
    magnitude = float(abs(size_bytes))

    # Repeated division avoids floating-point logarithms, which can land a
    # hair below an exact power of 1024 and select the wrong unit.
    unit_index = 0
    while magnitude >= 1024 and unit_index < len(size_names) - 1:
        magnitude /= 1024
        unit_index += 1

    return f"{sign}{round(magnitude, 2)} {size_names[unit_index]}"
257
+
258
def validate_word_timestamps(word_timestamps: List[Dict]) -> List[Dict]:
    """
    Validate and clean word timestamps data

    Entries are dropped when they are not dictionaries, when the word is
    missing, empty or not a string, or when a timestamp is not a finite
    number. Negative start times are clamped to 0, and an end time that
    does not lie after the start is set to start + 0.1 seconds.

    Args:
        word_timestamps: List of word timestamp dictionaries

    Returns:
        Cleaned and validated word timestamps (times rounded to 3 decimals)
    """
    import math

    validated_timestamps = []

    for word_data in word_timestamps or []:
        # Ensure required fields exist
        if not isinstance(word_data, dict):
            continue

        raw_word = word_data.get('word', '')
        # A non-string word (e.g. None) would crash on .strip(); skip it.
        if not isinstance(raw_word, str):
            continue

        word = raw_word.strip()

        # Skip empty words
        if not word:
            continue

        # Ensure numeric timestamps
        try:
            start = float(word_data.get('start', 0))
            end = float(word_data.get('end', 0))
        except (ValueError, TypeError):
            continue

        # NaN/inf convert to float without error but are unusable as times.
        if not (math.isfinite(start) and math.isfinite(end)):
            continue

        # Ensure logical timestamp order
        start = max(start, 0.0)
        if end <= start:
            end = start + 0.1  # Minimum duration

        validated_timestamps.append({
            'word': word,
            'start': round(start, 3),
            'end': round(end, 3)
        })

    return validated_timestamps
303
+
304
def merge_overlapping_timestamps(word_timestamps: List[Dict],
                               overlap_threshold: float = 0.05) -> List[Dict]:
    """
    Join words whose timestamps overlap or nearly touch

    A word is attached to the running group when the gap between its start
    and the previous word's end is at most ``overlap_threshold``. A group of
    several words becomes one entry whose text is the words joined by
    spaces; a group of one word is passed through as the same dictionary.

    Args:
        word_timestamps: List of word timestamp dictionaries
        overlap_threshold: Threshold for merging close timestamps (seconds)

    Returns:
        List with merged timestamps
    """
    if not word_timestamps:
        return []

    def collapse(group):
        # A lone word is returned untouched, not copied.
        if len(group) == 1:
            return group[0]
        return {
            'word': ' '.join(item['word'] for item in group),
            'start': group[0]['start'],
            'end': group[-1]['end']
        }

    merged = []
    group = [word_timestamps[0]]

    for entry in word_timestamps[1:]:
        previous_end = group[-1]['end']
        gap = entry['start'] - previous_end

        if gap <= overlap_threshold:
            # Close enough to the previous word: extend the running group.
            group.append(entry)
            continue

        merged.append(collapse(group))
        group = [entry]

    # The last group is still open once the loop finishes.
    merged.append(collapse(group))

    return merged
video_generator.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # START OF video_generator.py
2
+ import os
3
+ import tempfile
4
+ import shutil
5
+ from typing import List, Dict
6
+
7
class VideoGenerator:
    """A simplified and safe video generator.

    No video is rendered: the "video" produced is a copy of the input audio
    placed in a per-instance temporary directory, which is removed when the
    instance is garbage-collected.
    """

    def __init__(self):
        """Create the temporary directory that receives the output files."""
        self.temp_dir = tempfile.mkdtemp()

    def create_synchronized_video(self, audio_path: str, word_timestamps: List[Dict],
                                  text: str, style_config: Dict, output_filename: str) -> str:
        """
        This is a fallback function. Instead of creating a video,
        it copies the audio file to a .m4a name to indicate a processed file.
        This avoids using ffmpeg and external fonts, which can cause errors.

        The bytes are copied unchanged; only the file name changes, so the
        result is in whatever format ``audio_path`` already was.
        ``word_timestamps``, ``text`` and ``style_config`` are accepted for
        interface compatibility and are not used.

        Args:
            audio_path: Path to the source audio file
            word_timestamps: List of word timestamp dictionaries (unused)
            text: Full transcript text (unused)
            style_config: Styling options (unused)
            output_filename: Requested output name; a trailing '.mp4'
                extension is replaced by '.m4a'

        Returns:
            Path to the copied audio file inside the temporary directory

        Raises:
            Exception: Any error from copying the file is printed and re-raised
        """
        try:
            # Only the final path component is used so the copy stays inside
            # temp_dir, and only the extension is swapped: str.replace would
            # also rewrite a '.mp4' in the middle of the name.
            target_name = os.path.basename(output_filename)
            stem, extension = os.path.splitext(target_name)
            if extension.lower() == '.mp4':
                target_name = stem + '.m4a'

            # The safest operation is to just provide the audio back under a different name
            output_path = os.path.join(self.temp_dir, target_name)
            shutil.copy2(audio_path, output_path)
            print(f"Fallback successful: Created audio file at {output_path}")
            return output_path
        except Exception as e:
            print(f"Critical error in fallback video generation: {e}")
            raise

    def __del__(self):
        """Remove the temporary directory and everything in it."""
        if hasattr(self, 'temp_dir') and os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir, ignore_errors=True)
33
+ # END OF video_generator.py