Spaces:

LPX55
/

deepgram-srt-generation

Sleeping

App Files Files Community

LPX55 commited on 17 days ago

Commit

b8edc35

verified ·

1 Parent(s): 1838e8c

Upload folder using huggingface_hub

Browse files

Files changed (5) hide show

.gitignore +2 -0
README.md +72 -6
app.py +484 -0
main.py +348 -0
requirements.txt +39 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ .env
2	+ .venv

README.md CHANGED Viewed

@@ -1,13 +1,79 @@
 ---
-title: Deepgram Srt Generation
-emoji: 👁
 colorFrom: indigo
-colorTo: yellow
 sdk: gradio
-sdk_version: 6.18.0
-python_version: '3.13'
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Deepgram SRT Generator
+emoji: 🎙️
 colorFrom: indigo
+colorTo: purple
 sdk: gradio
+sdk_version: 4.44.0
 app_file: app.py
 pinned: false
+license: mit
 ---
+# Deepgram SRT Generation
+This project extracts audio from a video file, sends it to Deepgram for transcription, and generates an SRT file with captions.
+## Setup
+1. Clone the repository:
+    ```sh
+    git clone https://github.com/bradcypert/deepgram-srt-generation.git
+    cd deepgram-srt-generation
+    ```
+2. Install the required dependencies:
+    ```sh
+    pip install -r requirements.txt
+    ```
+3. Set your Deepgram API key as an environment variable:
+    ```sh
+    export DEEPGRAM_API_KEY=your_deepgram_api_key
+    ```
+## Usage
+Run the script with the path to your video or audio file as an argument:
+```sh
+python main.py path/to/your/file.mp4
+```
+This will generate an SRT file with captions in the same directory as your media file, named `<media file name>-captions.srt` (for example `file.mp4-captions.srt`).
+### CLI Options
+You can customize the transcription with the following flags:
+*   `-m` / `--model`: Deepgram model to use (default: `nova-3`).
+*   `-l` / `--language`: Set the language tag (e.g. `ko`, `en`, `es`), or pass `auto` / `detect` to enable automatic language detection.
+*   `--no-diarize`: Disable speaker diarization.
+*   `-t` / `--translate-to`: Translate the generated subtitles using DeepL.
+Example transcribing Korean audio and translating it to English:
+```sh
+export DEEPL_API_KEY=your_deepl_api_key
+python main.py path/to/your/korean_audio.mp3 -l ko -t en
+```
+### Translate-Only Mode
+If you already have an SRT file and want to translate it to another language without transcribing again, simply pass the `.srt` file and target language:
+```sh
+python main.py path/to/your/subtitles.srt -t ja
+```
+## Dependencies
+- `httpx`
+- `moviepy`
+- `deepgram-sdk`
+- `deepgram-captions`
+- `deepl`
+- `python-dotenv`
+- `gradio` (only needed for the `app.py` web interface)
+Make sure to install these dependencies using `pip` if they are not already installed.
+## License
+This project is licensed under the MIT License.

app.py ADDED Viewed

	@@ -0,0 +1,484 @@

+import os
+import tempfile
+import re
+import httpx
+from datetime import datetime
+import gradio as gr
+from dotenv import load_dotenv
+# Load local environment variables
+load_dotenv()
+# Import core logic from main.py
+from main import cleanup_srt_punctuation, translate_srt_content
+from deepgram import DeepgramClient, PrerecordedOptions
+from deepgram_captions import DeepgramConverter, srt
+from moviepy.video.io.VideoFileClip import VideoFileClip
+# CSS styling for a premium glassmorphism dark-mode look
+custom_css = """
+@import url('https://fonts.googleapis.com/css2?family=Outfit:wght@400;600;800&family=Inter:wght@400;500;600&display=swap');
+body, .gradio-container {
+    font-family: 'Inter', sans-serif !important;
+    background: #0b0f19 !important;
+}
+/* Main card styling */
+.glass-container {
+    background: rgba(17, 24, 39, 0.7) !important;
+    backdrop-filter: blur(16px);
+    -webkit-backdrop-filter: blur(16px);
+    border: 1px solid rgba(255, 255, 255, 0.08) !important;
+    border-radius: 20px !important;
+    padding: 30px !important;
+    box-shadow: 0 10px 40px 0 rgba(0, 0, 0, 0.5) !important;
+}
+/* Glowing text title */
+.glow-title {
+    background: linear-gradient(135deg, #a5b4fc 0%, #c084fc 50%, #818cf8 100%);
+    -webkit-background-clip: text;
+    -webkit-text-fill-color: transparent;
+    font-weight: 800;
+    text-align: center;
+    font-size: 2.8rem;
+    margin-bottom: 8px;
+    font-family: 'Outfit', sans-serif;
+    letter-spacing: -0.5px;
+}
+.sub-title {
+    color: #9ca3af;
+    text-align: center;
+    font-size: 1.15rem;
+    margin-bottom: 30px;
+    font-family: 'Inter', sans-serif;
+}
+/* Styled primary action button */
+.action-btn {
+    background: linear-gradient(90deg, #6366f1 0%, #8b5cf6 100%) !important;
+    color: white !important;
+    border: none !important;
+    font-weight: 600 !important;
+    border-radius: 12px !important;
+    padding: 12px 24px !important;
+    transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
+    box-shadow: 0 4px 20px rgba(99, 102, 241, 0.3) !important;
+}
+.action-btn:hover {
+    transform: translateY(-2px);
+    box-shadow: 0 6px 25px rgba(99, 102, 241, 0.5) !important;
+    opacity: 0.95;
+}
+.action-btn:active {
+    transform: translateY(1px);
+}
+/* Inputs styling */
+input, textarea, select {
+    background: #1f2937 !important;
+    border: 1px solid #374151 !important;
+    border-radius: 8px !important;
+    color: #f3f4f6 !important;
+}
+input:focus, textarea:focus, select:focus {
+    border-color: #818cf8 !important;
+}
+/* Tab styling */
+.tabs {
+    border-bottom: 2px solid #1f2937 !important;
+    margin-bottom: 20px;
+}
+.tab-nav button {
+    font-family: 'Outfit', sans-serif;
+    font-size: 1.05rem !important;
+    color: #9ca3af !important;
+    padding: 10px 20px !important;
+}
+.tab-nav button.selected {
+    color: #818cf8 !important;
+    border-bottom: 2px solid #818cf8 !important;
+}
+/* Footer styling */
+.footer-text {
+    text-align: center;
+    color: #4b5563;
+    font-size: 0.85rem;
+    margin-top: 40px;
+}
+"""
+def extract_audio(video_path):
+    """Extract audio track from video file using MoviePy."""
+    temp_dir = tempfile.gettempdir()
+    audio_path = os.path.join(temp_dir, f"extracted_{os.path.basename(video_path)}.mp3")
+    try:
+        with VideoFileClip(video_path) as video_clip:
+            audio_clip = video_clip.audio
+            # Write audio without verbose logging
+            audio_clip.write_audiofile(audio_path, logger=None)
+        return audio_path
+    except Exception as e:
+        raise gr.Error(f"Failed to extract audio from video: {str(e)}")
+def process_transcribe(
+    file_path,
+    model,
+    language,
+    diarize,
+    translate_to,
+    dg_key_override,
+    dl_key_override
+):
+    """Core transcription and translation pipeline for audio/video input."""
+    if not file_path:
+        raise gr.Error("Please upload a file first.")
+    # Resolve Deepgram API Key
+    dg_key = dg_key_override.strip() if dg_key_override else os.getenv("DEEPGRAM_API_KEY")
+    if not dg_key:
+        raise gr.Error("Deepgram API Key is required. Please provide it in the UI or environment.")
+    # Resolve DeepL API Key (if translation requested)
+    dl_key = None
+    if translate_to:
+        dl_key = dl_key_override.strip() if dl_key_override else (os.getenv("DEEPL_API_KEY") or os.getenv("DEEPL_AUTH_KEY"))
+        if not dl_key:
+            raise gr.Error("DeepL API Key is required for translation. Please provide it in the UI or environment.")
+    # Check extension to determine if audio extraction is needed
+    _, ext = os.path.splitext(file_path.lower())
+    is_audio = ext in {'.mp3', '.wav', '.m4a', '.flac', '.ogg', '.aac', '.wma', '.opus', '.webm', '.m4b', '.mp4a', '.aiff', '.aif', '.mp2'}
+    audio_filepath = file_path
+    temp_audio_to_cleanup = None
+    if not is_audio:
+        gr.Info("Video file detected. Extracting audio track...")
+        audio_filepath = extract_audio(file_path)
+        temp_audio_to_cleanup = audio_filepath
+    try:
+        # Read the file data
+        with open(audio_filepath, "rb") as file:
+            buffer_data = file.read()
+        payload = {"buffer": buffer_data}
+        # Configure Deepgram options
+        options_dict = {
+            "model": model,
+            "smart_format": True,
+            "utterances": True,
+            "punctuate": True,
+            "diarize": diarize,
+        }
+        if language:
+            if language.lower() in {"auto", "detect"}:
+                options_dict["detect_language"] = True
+            else:
+                options_dict["language"] = language
+        options = PrerecordedOptions(**options_dict)
+        deepgram = DeepgramClient(dg_key)
+        gr.Info("Transcribing audio via Deepgram...")
+        response = deepgram.listen.rest.v("1").transcribe_file(
+            payload, options, timeout=httpx.Timeout(30000.0, connect=10.0)
+        )
+        # Process words check
+        has_words = False
+        try:
+            if hasattr(response, 'results') and response.results:
+                if response.results.channels and response.results.channels[0].alternatives:
+                    if response.results.channels[0].alternatives[0].words:
+                        has_words = True
+        except Exception:
+            pass
+        if not has_words:
+            original_srt = ""
+            gr.Warning("No speech detected in the audio file.")
+        else:
+            transcription = DeepgramConverter(response)
+            original_srt = srt(transcription)
+            original_srt = cleanup_srt_punctuation(original_srt)
+        # Write original SRT to temp file
+        temp_dir = tempfile.gettempdir()
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        orig_file_path = os.path.join(temp_dir, f"transcription_{timestamp}.srt")
+        with open(orig_file_path, "w", encoding="utf-8") as f:
+            f.write(original_srt)
+        translated_srt = ""
+        trans_file_path = None
+        # Handle translation if requested
+        if translate_to and original_srt:
+            gr.Info(f"Translating subtitles to {translate_to} using DeepL...")
+            target_lang = translate_to.upper()
+            if target_lang == "EN":
+                target_lang = "EN-US"
+            elif target_lang == "PT":
+                target_lang = "PT-BR"
+            translated_srt = translate_srt_content(original_srt, dl_key, target_lang)
+            translated_srt = cleanup_srt_punctuation(translated_srt)
+            trans_file_path = os.path.join(temp_dir, f"transcription_{timestamp}.{translate_to.lower()}.srt")
+            with open(trans_file_path, "w", encoding="utf-8") as f:
+                f.write(translated_srt)
+        return original_srt, orig_file_path, translated_srt, trans_file_path
+    except Exception as e:
+        raise gr.Error(f"An error occurred: {str(e)}")
+    finally:
+        # Cleanup temporary extracted audio
+        if temp_audio_to_cleanup and os.path.exists(temp_audio_to_cleanup):
+            try:
+                os.remove(temp_audio_to_cleanup)
+            except Exception:
+                pass
+def process_translate_srt(srt_file, translate_to, dl_key_override):
+    """Translate an existing SRT file."""
+    if not srt_file:
+        raise gr.Error("Please upload an SRT file.")
+    dl_key = dl_key_override.strip() if dl_key_override else (os.getenv("DEEPL_API_KEY") or os.getenv("DEEPL_AUTH_KEY"))
+    if not dl_key:
+        raise gr.Error("DeepL API Key is required. Please provide it in the UI or environment.")
+    try:
+        with open(srt_file.name, "r", encoding="utf-8") as f:
+            original_content = f.read()
+        target_lang = translate_to.upper()
+        if target_lang == "EN":
+            target_lang = "EN-US"
+        elif target_lang == "PT":
+            target_lang = "PT-BR"
+        gr.Info(f"Translating SRT file to {translate_to} using DeepL...")
+        translated_content = translate_srt_content(original_content, dl_key, target_lang)
+        cleaned_content = cleanup_srt_punctuation(translated_content)
+        # Write to temp file
+        temp_dir = tempfile.gettempdir()
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        translated_path = os.path.join(temp_dir, f"translated_{timestamp}.srt")
+        with open(translated_path, "w", encoding="utf-8") as f:
+            f.write(cleaned_content)
+        return cleaned_content, translated_path
+    except Exception as e:
+        raise gr.Error(f"Translation error: {str(e)}")
+# ------------------ Build Interface ------------------
+# Supported languages list
+# Each entry is a (label, value) pair used as Dropdown choices below.
+# "auto" makes process_transcribe request language detection instead of
+# sending a fixed language.
+language_choices = [
+    ("Auto Detect", "auto"),
+    ("English", "en"),
+    ("Korean", "ko"),
+    ("Spanish", "es"),
+    ("French", "fr"),
+    ("German", "de"),
+    ("Italian", "it"),
+    ("Japanese", "ja"),
+    ("Chinese", "zh"),
+    ("Portuguese", "pt"),
+]
+# Target languages for DeepL. The empty value of "None" is falsy, which
+# process_transcribe treats as "do not translate"; the SRT-only tab slices
+# that first entry off because a target language is mandatory there.
+translation_choices = [
+    ("None", ""),
+    ("Korean", "ko"),
+    ("English", "en"),
+    ("Japanese", "ja"),
+    ("Spanish", "es"),
+    ("French", "fr"),
+    ("German", "de"),
+    ("Italian", "it"),
+    ("Chinese", "zh"),
+    ("Portuguese", "pt"),
+]
+# Deepgram models offered in the UI; the value is passed as the "model" option.
+model_choices = [
+    ("Nova-3 (Latest / Recommended)", "nova-3"),
+    ("Nova-2 (Fast & Accurate)", "nova-2"),
+    ("Enhanced", "enhanced"),
+    ("Base", "base"),
+]
+# Assemble the three-tab interface. The video and audio tabs are wired to the
+# same handler (process_transcribe); the third tab translates an existing SRT.
+with gr.Blocks(css=custom_css, title="Deepgram SRT Generator & Translator") as demo:
+    with gr.Column(elem_classes="glass-container"):
+        gr.HTML("<h1 class='glow-title'>Deepgram SRT Subtitles</h1>")
+        gr.HTML("<p class='sub-title'>Generate and translate SRT subtitles with state-of-the-art accuracy</p>")
+        # API Keys Accordion (Collapsible for cleaner layout)
+        # These two textboxes are shared by every tab's click handler below.
+        with gr.Accordion("🔑 API Credentials (Optional Override)", open=False):
+            with gr.Row():
+                dg_key_input = gr.Textbox(
+                    label="Deepgram API Key",
+                    placeholder="Enter key to override DEEPGRAM_API_KEY environment variable",
+                    type="password"
+                )
+                dl_key_input = gr.Textbox(
+                    label="DeepL API Key",
+                    placeholder="Enter key to override DEEPL_API_KEY environment variable",
+                    type="password"
+                )
+        with gr.Tabs(elem_classes="tabs"):
+            # --- Tab 1: Video Transcription ---
+            with gr.TabItem("🎥 Transcribe Video"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        video_input = gr.Video(label="Upload Video", sources=["upload"])
+                        with gr.Row():
+                            video_model = gr.Dropdown(
+                                choices=model_choices, value="nova-3", label="Deepgram Model"
+                            )
+                            video_lang = gr.Dropdown(
+                                choices=language_choices, value="auto", label="Audio Language", allow_custom_value=True
+                            )
+                        with gr.Row():
+                            video_diarize = gr.Checkbox(label="Speaker Diarization", value=True)
+                            video_trans = gr.Dropdown(
+                                choices=translation_choices, value="", label="Translate Subtitles to (DeepL)"
+                            )
+                        video_btn = gr.Button("Generate Subtitles", elem_classes="action-btn")
+                    with gr.Column(scale=1):
+                        with gr.Tabs():
+                            with gr.TabItem("Original Subtitles"):
+                                video_original_srt = gr.Textbox(label="SRT Output", show_copy_button=True, lines=15)
+                                video_original_file = gr.File(label="Download original SRT")
+                            with gr.TabItem("Translated Subtitles"):
+                                video_translated_srt = gr.Textbox(label="Translated SRT Output", show_copy_button=True, lines=15)
+                                video_translated_file = gr.File(label="Download translated SRT")
+                # Inputs are listed in the order of process_transcribe's
+                # parameters; outputs in the order of its returned tuple.
+                video_btn.click(
+                    fn=process_transcribe,
+                    inputs=[
+                        video_input,
+                        video_model,
+                        video_lang,
+                        video_diarize,
+                        video_trans,
+                        dg_key_input,
+                        dl_key_input
+                    ],
+                    outputs=[
+                        video_original_srt,
+                        video_original_file,
+                        video_translated_srt,
+                        video_translated_file
+                    ],
+                    api_name="transcribe_video"
+                )
+            # --- Tab 2: Audio Transcription ---
+            with gr.TabItem("🎵 Transcribe Audio"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        audio_input = gr.Audio(label="Upload Audio", type="filepath", sources=["upload"])
+                        with gr.Row():
+                            audio_model = gr.Dropdown(
+                                choices=model_choices, value="nova-3", label="Deepgram Model"
+                            )
+                            audio_lang = gr.Dropdown(
+                                choices=language_choices, value="auto", label="Audio Language", allow_custom_value=True
+                            )
+                        with gr.Row():
+                            audio_diarize = gr.Checkbox(label="Speaker Diarization", value=True)
+                            audio_trans = gr.Dropdown(
+                                choices=translation_choices, value="", label="Translate Subtitles to (DeepL)"
+                            )
+                        audio_btn = gr.Button("Generate Subtitles", elem_classes="action-btn")
+                    with gr.Column(scale=1):
+                        with gr.Tabs():
+                            with gr.TabItem("Original Subtitles"):
+                                audio_original_srt = gr.Textbox(label="SRT Output", show_copy_button=True, lines=15)
+                                audio_original_file = gr.File(label="Download original SRT")
+                            with gr.TabItem("Translated Subtitles"):
+                                audio_translated_srt = gr.Textbox(label="Translated SRT Output", show_copy_button=True, lines=15)
+                                audio_translated_file = gr.File(label="Download translated SRT")
+                # Same handler and argument order as the video tab.
+                audio_btn.click(
+                    fn=process_transcribe,
+                    inputs=[
+                        audio_input,
+                        audio_model,
+                        audio_lang,
+                        audio_diarize,
+                        audio_trans,
+                        dg_key_input,
+                        dl_key_input
+                    ],
+                    outputs=[
+                        audio_original_srt,
+                        audio_original_file,
+                        audio_translated_srt,
+                        audio_translated_file
+                    ],
+                    api_name="transcribe_audio"
+                )
+            # --- Tab 3: SRT Translation ---
+            with gr.TabItem("📄 Translate SRT File"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        srt_input = gr.File(label="Upload SRT File", file_types=[".srt"])
+                        # [1:] drops the leading ("None", "") entry: this tab
+                        # always needs a target language.
+                        srt_trans_lang = gr.Dropdown(
+                            choices=translation_choices[1:], value="ko", label="Translate Subtitles to (DeepL)"
+                        )
+                        srt_btn = gr.Button("Translate File", elem_classes="action-btn")
+                    with gr.Column(scale=1):
+                        srt_output_text = gr.Textbox(label="Translated SRT Output", show_copy_button=True, lines=15)
+                        srt_output_file = gr.File(label="Download translated SRT")
+                srt_btn.click(
+                    fn=process_translate_srt,
+                    inputs=[
+                        srt_input,
+                        srt_trans_lang,
+                        dl_key_input
+                    ],
+                    outputs=[
+                        srt_output_text,
+                        srt_output_file
+                    ],
+                    api_name="translate_srt"
+                )
+        gr.HTML("<div class='footer-text'>Deepgram SRT Subtitle Tool • Powered by Deepgram & DeepL</div>")
+# Only start the server when run as a script; importing the module just builds `demo`.
+if __name__ == "__main__":
+    demo.queue()
+    # 0.0.0.0 listens on all network interfaces, on port 7860.
+    demo.launch(server_name="0.0.0.0", server_port=7860)

main.py ADDED Viewed

	@@ -0,0 +1,348 @@

+import sys
+import httpx
+import os
+import argparse
+from datetime import datetime
+from dotenv import load_dotenv
+from deepgram import DeepgramClient, PrerecordedOptions
+from deepgram_captions import DeepgramConverter, srt
+from moviepy.video.io.VideoFileClip import VideoFileClip
+from moviepy.audio.io.AudioFileClip import AudioFileClip
+import deepl
+import re
+load_dotenv()
+def cleanup_srt_punctuation(srt_content):
+    # Split the SRT content into blocks
+    blocks = re.split(r'\n\s*\n', srt_content.strip())
+    parsed_blocks = []
+    for block in blocks:
+        lines = block.split('\n')
+        if len(lines) >= 2:
+            index = lines[0]
+            timecode = lines[1]
+            text = "\n".join(lines[2:]) if len(lines) > 2 else ""
+            parsed_blocks.append({
+                "index": index,
+                "timecode": timecode,
+                "text": text
+            })
+    # Rule 1: Clean up spaces before punctuation within each block
+    for block in parsed_blocks:
+        if "text" in block:
+            block["text"] = re.sub(r'\s+([.,!?~:;。、])', r'\1', block["text"])
+    # Rule 2 & 3: Handle leading punctuation and punctuation-only blocks
+    for i in range(len(parsed_blocks)):
+        block = parsed_blocks[i]
+        if "text" not in block:
+            continue
+        text = block["text"].strip()
+        # Check if the block is only punctuation
+        if text and all(c in ".,!?~:;。、" or c.isspace() for c in text):
+            for j in range(i - 1, -1, -1):
+                prev_block = parsed_blocks[j]
+                if "text" in prev_block and prev_block["text"].strip():
+                    prev_block["text"] = prev_block["text"].rstrip() + " " + text
+                    prev_block["text"] = re.sub(r'\s+([.,!?~:;。、])', r'\1', prev_block["text"])
+                    break
+            block["text"] = ""
+            continue
+        # Check if the block starts with leading punctuation (e.g. ", text")
+        match = re.match(r'^([.,!?~:;。、\s]+)(.*)', block["text"])
+        if match:
+            lead_punct = match.group(1).strip()
+            remaining_text = match.group(2)
+            if lead_punct:
+                for j in range(i - 1, -1, -1):
+                    prev_block = parsed_blocks[j]
+                    if "text" in prev_block and prev_block["text"].strip():
+                        prev_block["text"] = prev_block["text"].rstrip() + " " + lead_punct
+                        prev_block["text"] = re.sub(r'\s+([.,!?~:;。、])', r'\1', prev_block["text"])
+                        break
+                block["text"] = remaining_text
+    # Reconstruct and re-index the SRT string, filtering out empty blocks
+    reconstructed = []
+    entry = 1
+    for block in parsed_blocks:
+        text = block["text"].strip()
+        if text:
+            reconstructed.append(f"{entry}\n{block['timecode']}\n{text}")
+            entry += 1
+    return "\n\n".join(reconstructed) + "\n"
+def translate_srt_content(srt_content, deepl_api_key, target_lang):
+    import deepl
+    # Split the SRT content into blocks
+    blocks = re.split(r'\n\s*\n', srt_content.strip())
+    parsed_blocks = []
+    text_list = []
+    for block in blocks:
+        lines = block.split('\n')
+        if len(lines) >= 2:
+            index = lines[0]
+            timecode = lines[1]
+            text = "\n".join(lines[2:]) if len(lines) > 2 else ""
+            # Extract speaker tag if any (e.g. "[speaker 0] Hello" or "[Speaker 1]")
+            tag = ""
+            clean_text = text
+            match = re.match(r'^(\[speaker \d+\]\s*)(.*)', text, re.IGNORECASE)
+            if match:
+                tag = match.group(1)
+                clean_text = match.group(2)
+            parsed_blocks.append({
+                "index": index,
+                "timecode": timecode,
+                "tag": tag,
+                "clean_text": clean_text
+            })
+            if clean_text.strip():
+                text_list.append(clean_text)
+        else:
+            parsed_blocks.append({
+                "raw": block
+            })
+    # Translate clean texts using DeepL text translation
+    translator = deepl.Translator(deepl_api_key)
+    translated_texts = []
+    # Chunk text requests to avoid hitting DeepL payload size limits
+    chunk_size = 50
+    for i in range(0, len(text_list), chunk_size):
+        chunk = text_list[i:i + chunk_size]
+        try:
+            results = translator.translate_text(chunk, target_lang=target_lang)
+            translated_texts.extend([r.text for r in results])
+        except Exception as e:
+            print(f"Error translating chunk: {e}")
+            translated_texts.extend(chunk)
+    # Reassemble the parsed blocks
+    text_idx = 0
+    reconstructed = []
+    entry = 1
+    for block in parsed_blocks:
+        if "raw" in block:
+            reconstructed.append(block["raw"])
+        else:
+            clean_text = block["clean_text"]
+            tag = block["tag"]
+            if clean_text.strip():
+                translated_text = translated_texts[text_idx] if text_idx < len(translated_texts) else clean_text
+                text_idx += 1
+                full_text = tag + translated_text
+            else:
+                full_text = tag + clean_text
+            # Filter out empty blocks after translation and re-index sequentially
+            stripped_text = full_text.strip()
+            if stripped_text:
+                reconstructed.append(f"{entry}\n{block['timecode']}\n{stripped_text}")
+                entry += 1
+    return "\n\n".join(reconstructed) + "\n"
+def main():
+    parser = argparse.ArgumentParser(description="Transcribe video/audio to SRT subtitles using Deepgram.")
+    parser.add_argument("filepath", type=str, help="Path to the audio or video file to transcribe.")
+    parser.add_argument("-m", "--model", type=str, default="nova-3", help="Deepgram model to use (default: %(default)s).")
+    parser.add_argument("-l", "--language", type=str, default=None, help="BCP-47 language tag (e.g. 'en', 'es', 'fr'), or 'auto'/'detect' to enable automatic language detection.")
+    parser.add_argument("--no-diarize", dest="diarize", action="store_false", help="Disable speaker diarization.")
+    parser.add_argument("-t", "--translate-to", type=str, default=None, help="Translate the generated subtitles to this BCP-47 language tag (e.g. 'ko', 'en', 'ja') using DeepL.")
+    parser.set_defaults(diarize=True)
+    args = parser.parse_args()
+    filepath = args.filepath
+    # Resolve filepath. If it doesn't exist directly but exists in 'media/', use it from there.
+    if not os.path.exists(filepath):
+        media_fallback = os.path.join("media", filepath)
+        if os.path.exists(media_fallback):
+            filepath = media_fallback
+    if not os.path.exists(filepath):
+        print(f"Error: File '{filepath}' not found.")
+        print("Please check the path or place the file in the 'media' directory.")
+        return
+    _, ext = os.path.splitext(filepath.lower())
+    if ext == '.srt':
+        if not args.translate_to:
+            print("Error: When passing an .srt file, you must specify a target language using -t or --translate-to.")
+            return
+        deepl_api_key = os.getenv("DEEPL_API_KEY") or os.getenv("DEEPL_AUTH_KEY")
+        if not deepl_api_key:
+            print("Error: DEEPL_API_KEY or DEEPL_AUTH_KEY environment variable is not set.")
+            print("Please set it in your environment or add it to your .env file to use translation.")
+            return
+        try:
+            target_lang = args.translate_to.upper()
+            if target_lang == "EN":
+                target_lang = "EN-US"
+            elif target_lang == "PT":
+                target_lang = "PT-BR"
+            base, _ = os.path.splitext(filepath)
+            translated_srt_path = f"{base}.{args.translate_to.lower()}.srt"
+            print(f"Translating {filepath} to {args.translate_to} using DeepL...")
+            with open(filepath, "r", encoding="utf-8") as f:
+                original_content = f.read()
+            translated_content = translate_srt_content(original_content, deepl_api_key, target_lang)
+            cleaned_content = cleanup_srt_punctuation(translated_content)
+            with open(translated_srt_path, "w", encoding="utf-8") as f:
+                f.write(cleaned_content)
+            print(f"Successfully translated subtitles. Saved to: {translated_srt_path}")
+        except Exception as translate_err:
+            print(f"An error occurred during translation: {translate_err}")
+        return
+    api_key = os.getenv("DEEPGRAM_API_KEY")
+    if not api_key:
+        print("Error: DEEPGRAM_API_KEY environment variable is not set.")
+        print("Please set it in your environment or add it to a .env file in the project directory.")
+        return
+    try:
+        deepgram = DeepgramClient(api_key)
+        is_audio = ext in {'.mp3', '.wav', '.m4a', '.flac', '.ogg', '.aac', '.wma', '.opus', '.webm', '.m4b', '.mp4a', '.aiff', '.aif', '.mp2'}
+        audio_filepath = filepath
+        should_remove_audio = False
+        if not is_audio:
+            audio_filepath = f"{filepath}-audio.mp3"
+            should_remove_audio = False
+            audio_exists = False
+            if os.path.exists(audio_filepath) and os.path.getsize(audio_filepath) > 0:
+                try:
+                    with VideoFileClip(filepath) as video_clip:
+                        video_duration = video_clip.duration
+                    with AudioFileClip(audio_filepath) as audio_clip:
+                        audio_duration = audio_clip.duration
+                    if abs(video_duration - audio_duration) < 1.0:
+                        audio_exists = True
+                        print(f"Found existing audio file '{audio_filepath}' with matching duration. Skipping extraction.")
+                except Exception as check_err:
+                    print(f"Could not verify existing audio file: {check_err}. Re-extracting...")
+            if not audio_exists:
+                try:
+                    with VideoFileClip(filepath) as video_clip:
+                        audio_clip = video_clip.audio
+                        audio_clip.write_audiofile(audio_filepath)
+                except Exception as e:
+                    print(f"An error occurred extracting audio from video: {e}")
+                    return
+        with open(audio_filepath, "rb") as file:
+            buffer_data = file.read()
+        payload = {"buffer": buffer_data}
+        options_dict = {
+            "model": args.model,
+            "smart_format": True,
+            "utterances": True,
+            "punctuate": True,
+            "diarize": args.diarize,
+        }
+        if args.language:
+            if args.language.lower() in {"auto", "detect"}:
+                options_dict["detect_language"] = True
+            else:
+                options_dict["language"] = args.language
+        options = PrerecordedOptions(**options_dict)
+        print("Making request to deepgram")
+        before = datetime.now()
+        response = deepgram.listen.rest.v("1").transcribe_file(
+            payload, options, timeout=httpx.Timeout(30000.0, connect=10.0)
+        )
+        after = datetime.now()
+        print("Got response from deepgram")
+        print(response.to_json(indent=4))
+        # Check if the transcription contains words to avoid IndexError on silent audio files
+        has_words = False
+        try:
+            if hasattr(response, 'results') and response.results:
+                if response.results.channels and response.results.channels[0].alternatives:
+                    if response.results.channels[0].alternatives[0].words:
+                        has_words = True
+        except Exception:
+            pass
+        if not has_words:
+            print("No speech or words detected in the audio file. Generating empty subtitle file.")
+            captions = ""
+        else:
+            transcription = DeepgramConverter(response)
+            captions = srt(transcription)
+        original_srt_path = f"{filepath}-captions.srt"
+        cleaned_captions = cleanup_srt_punctuation(captions)
+        with open(original_srt_path, "a", encoding="utf-8") as f:
+            f.write(cleaned_captions)
+        if args.translate_to:
+            print(f"Translating subtitles to {args.translate_to} using DeepL...")
+            deepl_api_key = os.getenv("DEEPL_API_KEY") or os.getenv("DEEPL_AUTH_KEY")
+            if not deepl_api_key:
+                print("Error: DEEPL_API_KEY or DEEPL_AUTH_KEY environment variable is not set.")
+                print("Please set it in your environment or add it to your .env file to use translation.")
+            else:
+                try:
+                    target_lang = args.translate_to.upper()
+                    # DeepL-specific target language code overrides
+                    if target_lang == "EN":
+                        target_lang = "EN-US"
+                    elif target_lang == "PT":
+                        target_lang = "PT-BR"
+                    translated_srt_path = f"{filepath}-captions.{args.translate_to.lower()}.srt"
+                    # Translate and post-process
+                    translated_content = translate_srt_content(cleaned_captions, deepl_api_key, target_lang)
+                    cleaned_content = cleanup_srt_punctuation(translated_content)
+                    with open(translated_srt_path, "w", encoding="utf-8") as f:
+                        f.write(cleaned_content)
+                    print(f"Successfully translated subtitles. Saved to: {translated_srt_path}")
+                except Exception as translate_err:
+                    print(f"An error occurred during translation: {translate_err}")
+        if should_remove_audio:
+            os.remove(audio_filepath)
+    except Exception as e:
+        print(f"Exception: {e}")
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,39 @@

+aenum==3.1.15
+aiofiles==24.1.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.11.18
+aiosignal==1.3.2
+anyio==4.9.0
+attrs==25.3.0
+certifi==2025.1.31
+dataclasses-json==0.6.7
+decorator==5.2.1
+deepgram-captions==1.2.0
+deepgram-sdk==3.11.0
+deprecation==2.1.0
+frozenlist==1.6.0
+h11==0.14.0
+httpcore==1.0.8
+httpx==0.28.1
+idna==3.10
+imageio==2.37.0
+imageio-ffmpeg==0.6.0
+marshmallow==3.26.1
+moviepy==2.1.2
+multidict==6.4.3
+mypy-extensions==1.1.0
+numpy==2.2.5
+packaging==25.0
+pillow==10.4.0
+proglog==0.1.11
+propcache==0.3.1
+python-dotenv==1.1.0
+sniffio==1.3.1
+tqdm==4.67.1
+typing-extensions==4.13.2
+typing-inspect==0.9.0
+websockets==15.0.1
+yarl==1.20.0
+deepl==1.30.0
+gradio>=4.44.0