Spaces:

staraks
/

transcribemulti

Sleeping

App Files Files Community

staraks committed on Nov 11, 2025

Commit

8208346

verified ·

1 Parent(s): daf797c

Upload 6 files

Browse files

Files changed (6) hide show

LICENSE_Version6.txt +7 -0
README_Version6.md +47 -0
apt_Version4.txt +1 -0
gitignore_Version6.txt +5 -0
main_Version7.py +449 -0
requirements_Version5.txt +6 -0

LICENSE_Version6.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+MIT License
+Copyright (c) 2025 staraks486
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction...

README_Version6.md ADDED Viewed

	@@ -0,0 +1,47 @@

+```markdown
+# Whisper Transcription Tool — Hugging Face Space (enhanced)
+This Space-ready Gradio app transcribes audio files using multiple backends and exports TXT, SRT, merged DOCX, and ZIP outputs.
+What's included:
+- main.py: Gradio app with backend selection (openai-whisper, faster-whisper, openai-api), chunking, SRT export, merged DOCX, ZIP creation.
+- requirements.txt: Python dependencies.
+- apt.txt: system dependency (ffmpeg).
+Quick deployment steps (CLI method)
+1. Install the Hugging Face CLI:
+   pip install huggingface-hub
+2. Login to the Hub:
+   huggingface-cli login
+   (enter your token from https://huggingface.co/settings/tokens)
+3. Create a new Space (replace <space-name> with your chosen name):
+   huggingface-cli repo create YOUR_USERNAME/<space-name> --type space --space-sdk gradio
+4. Clone the new Space repo:
+   git clone https://huggingface.co/spaces/YOUR_USERNAME/<space-name>
+   cd <space-name>
+5. Copy main.py, requirements.txt, apt.txt, README.md into the repo directory, then:
+   git add -A
+   git commit -m "Initial commit: Whisper Transcription Space"
+   git push
+6. In the Space settings (web UI):
+   - If you plan to run medium/large models or faster-whisper, choose "Hardware accelerator: GPU".
+   - Add OPENAI_API_KEY as a Space secret if you want to use the openai-api backend securely.
+7. Wait for build to finish. The app will be available at:
+   https://huggingface.co/spaces/YOUR_USERNAME/<space-name>
+Notes & tips
+- Use smaller models (tiny/base) on CPU Spaces to avoid OOM. For better GPU performance use faster-whisper.
+- If build fails due to memory or dependency issues, try removing faster-whisper from requirements.txt (or select CPU/GPU appropriately).
+- You can paste an OpenAI API key into the UI for quick tests, but prefer saving it as a Space secret named OPENAI_API_KEY.
+Next improvements:
+- Streaming transcripts to the UI while running.
+- Add VTT/JSON timestamp exports.
+- Integrate remote storage backends (S3, Google Drive).
+```

apt_Version4.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ ffmpeg

gitignore_Version6.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+outputs/
+*.zip
+*.pyc
+__pycache__/
+.env

main_Version7.py ADDED Viewed

	@@ -0,0 +1,449 @@

+#!/usr/bin/env python3
+"""
+Whisper Transcription Tool (Gradio) — Spaces-ready with backend selection, chunking, SRT export, and OpenAI API option.
+Features:
+- Backend selection: "openai-whisper" (local), "faster-whisper" (local, faster on GPU), "openai-api" (hosted whisper-1).
+- Optional audio chunking (split long files with ffmpeg) to avoid OOM and speed up processing.
+- SRT export (from segments) and per-file .txt exports.
+- Merged Word (.docx) export.
+- Zipped download containing all generated transcripts (TXT + SRT) if requested.
+- Gradio UI updated to select backend and options, and to accept an OpenAI API key (when using openai-api).
+Notes:
+- On Hugging Face Spaces, select GPU in Space settings to run medium/large models or faster-whisper efficiently.
+- Store your OpenAI API key in the Space "Secrets" as OPENAI_API_KEY to avoid exposing it in the UI.
+"""
+import os
+import sys
+import tempfile
+import shutil
+import subprocess
+import traceback
+from typing import List, Optional, Tuple
+import json
+import time
+import zipfile
+import gradio as gr
+from docx import Document
+import pyzipper
+# optional imports
+try:
+    import whisper  # openai-whisper
+except Exception:
+    whisper = None
+try:
+    from faster_whisper import WhisperModel  # faster-whisper
+except Exception:
+    WhisperModel = None
+try:
+    import openai  # openai API
+except Exception:
+    openai = None
+# Lower-case file extensions (with leading dot) accepted as audio; both zip members and
+# uploaded files are filtered by comparing their lower-cased extension against this set.
+AUDIO_EXTS = {".mp3", ".wav", ".m4a", ".flac", ".aac", ".ogg", ".webm", ".dat", ".dct"}
+# Initial value of the "Chunk length (seconds)" slider in the UI.
+DEFAULT_CHUNK_SECONDS = 15 * 60  # 15 minutes
+# -------------------------
+# Utilities
+# -------------------------
+def save_as_word(text: str, filename: str = "merged_transcripts.docx") -> str:
+    document = Document()
+    document.add_paragraph(text)
+    document.save(filename)
+    return os.path.abspath(filename)
+def srt_time(sec: float) -> str:
+    """Convert seconds to SRT timecode 'HH:MM:SS,mmm'."""
+    ms = int((sec - int(sec)) * 1000)
+    h = int(sec // 3600)
+    m = int((sec % 3600) // 60)
+    s = int(sec % 60)
+    return f"{h:02}:{m:02}:{s:02},{ms:03}"
+def segments_to_srt(segments: List[dict]) -> str:
+    """Convert segments (with start, end, text) to SRT string."""
+    lines = []
+    for i, seg in enumerate(segments, start=1):
+        start = srt_time(seg.get("start", 0.0))
+        end = srt_time(seg.get("end", seg.get("start", 0.0) + 1.0))
+        text = seg.get("text", "").strip()
+        lines.append(f"{i}")
+        lines.append(f"{start} --> {end}")
+        lines.append(text)
+        lines.append("")  # blank line
+    return "\n".join(lines)
+def safe_mkdir(path: str):
+    os.makedirs(path, exist_ok=True)
+def chunk_audio_ffmpeg(input_path: str, out_dir: str, chunk_seconds: int) -> List[str]:
+    """
+    Split input audio into chunks using ffmpeg segment muxer (copy codec).
+    Returns list of chunk file paths.
+    """
+    safe_mkdir(out_dir)
+    _, ext = os.path.splitext(input_path)
+    pattern = os.path.join(out_dir, "chunk_%04d" + ext)
+    cmd = [
+        "ffmpeg", "-y", "-i", input_path,
+        "-f", "segment",
+        "-segment_time", str(chunk_seconds),
+        "-c", "copy",
+        pattern
+    ]
+    try:
+        subprocess.check_output(cmd, stderr=subprocess.STDOUT)
+    except subprocess.CalledProcessError as e:
+        try:
+            pattern = os.path.join(out_dir, "chunk_%04d.wav")
+            cmd2 = [
+                "ffmpeg", "-y", "-i", input_path,
+                "-f", "segment",
+                "-segment_time", str(chunk_seconds),
+                "-ar", "16000", "-ac", "1",
+                pattern
+            ]
+            subprocess.check_output(cmd2, stderr=subprocess.STDOUT)
+        except Exception as e2:
+            raise RuntimeError(f"ffmpeg chunking failed: {e}\nFallback failed: {e2}")
+    created = sorted([os.path.join(out_dir, f) for f in os.listdir(out_dir)])
+    return created
+def _extract_audio_from_zip(zip_path: str, password: Optional[str], extract_dir: str, logs: List[str]) -> List[str]:
+    extracted_paths = []
+    try:
+        with pyzipper.ZipFile(zip_path, "r") as zf:
+            if password:
+                try:
+                    zf.setpassword(password.encode())
+                except RuntimeError:
+                    logs.append("Error: Incorrect password for the zip file.")
+                    return []
+            os.makedirs(extract_dir, exist_ok=True)
+            for info in zf.infolist():
+                if info.is_dir():
+                    continue
+                _, ext = os.path.splitext(info.filename)
+                if ext.lower() in AUDIO_EXTS:
+                    try:
+                        zf.extract(info, path=extract_dir)
+                        extracted_path = os.path.abspath(os.path.join(extract_dir, info.filename))
+                        extracted_paths.append(extracted_path)
+                        logs.append(f"Extracted: {info.filename}")
+                    except Exception as e:
+                        logs.append(f"Error extracting {info.filename}: {e}")
+    except pyzipper.BadZipFile:
+        logs.append("Error: Invalid zip file format.")
+    except FileNotFoundError:
+        logs.append("Error: Zip file not found.")
+    except Exception as e:
+        logs.append(f"Unexpected error while extracting zip: {e}\n{traceback.format_exc()}")
+    return extracted_paths
+# -------------------------
+# Backend wrappers
+# -------------------------
+def transcribe_with_openai_whisper(model, audio_path: str, **kwargs) -> Tuple[str, List[dict]]:
+    res = model.transcribe(audio_path, **kwargs)
+    text = res.get("text", "")
+    segments = res.get("segments", []) or []
+    return text, segments
+def transcribe_with_faster_whisper(model_obj, audio_path: str, **kwargs) -> Tuple[str, List[dict]]:
+    segments = []
+    text_parts = []
+    try:
+        result = model_obj.transcribe(audio_path, **kwargs)
+        if isinstance(result, dict):
+            text = result.get("text", "")
+            segments = result.get("segments", []) or []
+            return text, segments
+        elif isinstance(result, tuple) and len(result) == 2:
+            segs, info = result
+            for s in segs:
+                segments.append({"start": s.start, "end": s.end, "text": s.text})
+            text = " ".join([s.text for s in segments])
+            return text, segments
+        else:
+            for seg in result:
+                segments.append({"start": seg.start, "end": seg.end, "text": seg.text})
+            text = " ".join([s["text"] for s in segments])
+            return text, segments
+    except Exception as e:
+        raise
+def transcribe_with_openai_api(api_key: str, audio_path: str, model_name: str = "whisper-1") -> Tuple[str, List[dict]]:
+    if openai is None:
+        raise RuntimeError("openai package not installed")
+    openai.api_key = api_key
+    with open(audio_path, "rb") as f:
+        try:
+            resp = openai.Audio.transcribe(model_name, f)
+            text = ""
+            segments = []
+            if isinstance(resp, dict):
+                text = resp.get("text", "")
+                segments = resp.get("segments", []) or []
+            else:
+                text = getattr(resp, "text", "") or ""
+            return text, segments
+        except Exception as e:
+            raise
+# -------------------------
+# Main processing function
+# -------------------------
+def process_files(
+    audio_list: Optional[List[str]],
+    backend: str,
+    model_name: str,
+    use_chunks: bool,
+    chunk_seconds: int,
+    export_srt: bool,
+    merge_docx: bool,
+    zip_outputs: bool,
+    zip_file: Optional[str],
+    zip_password: Optional[str],
+    openai_api_key_input: Optional[str],
+):
+    logs: List[str] = []
+    transcript_outputs: List[str] = []
+    created_files: List[str] = []
+    temp_dirs: List[str] = []
+    try:
+        temp_extract_dir = None
+        extracted_paths = []
+        if zip_file:
+            logs.append(f"Processing zip file: {zip_file}")
+            temp_extract_dir = tempfile.mkdtemp(prefix="extracted_audio_")
+            temp_dirs.append(temp_extract_dir)
+            extracted_paths = _extract_audio_from_zip(zip_file, zip_password, temp_extract_dir, logs)
+        all_audio_files: List[str] = []
+        if audio_list:
+            all_audio_files.extend([os.path.abspath(p) for p in audio_list if p])
+        if extracted_paths:
+            all_audio_files.extend(extracted_paths)
+        if not all_audio_files:
+            logs.append("No audio files provided.")
+            return ("\n".join(logs), "", gr.update(value=None, visible=False), gr.update(value=None, visible=False))
+        model_local = None
+        faster_model = None
+        api_key = openai_api_key_input or os.environ.get("OPENAI_API_KEY")
+        if backend == "openai-whisper":
+            if whisper is None:
+                logs.append("openai-whisper package not installed.")
+                return ("\n".join(logs), "", gr.update(value=None, visible=False), gr.update(value=None, visible=False))
+            logs.append(f"Loading openai-whisper model: {model_name}")
+            model_local = whisper.load_model(model_name)
+            logs.append("Model loaded.")
+        elif backend == "faster-whisper":
+            if WhisperModel is None:
+                logs.append("faster-whisper package not installed.")
+                return ("\n".join(logs), "", gr.update(value=None, visible=False), gr.update(value=None, visible=False))
+            logs.append(f"Loading faster-whisper model: {model_name}")
+            device = "cuda" if (os.environ.get("CUDA_VISIBLE_DEVICES") or os.path.exists('/usr/local/cuda')) else "cpu"
+            faster_model = WhisperModel(model_name, device=device)
+            logs.append("Faster-Whisper model loaded.")
+        elif backend == "openai-api":
+            if openai is None:
+                logs.append("openai package not installed.")
+                return ("\n".join(logs), "", gr.update(value=None, visible=False), gr.update(value=None, visible=False))
+            if not api_key:
+                logs.append("OpenAI API key not provided (use Space secret OPENAI_API_KEY or enter in UI).")
+                return ("\n".join(logs), "", gr.update(value=None, visible=False), gr.update(value=None, visible=False))
+            logs.append("Using OpenAI hosted Whisper (whisper-1).")
+        out_dir = os.path.abspath("outputs")
+        safe_mkdir(out_dir)
+        for audio_path in all_audio_files:
+            try:
+                if not os.path.exists(audio_path):
+                    logs.append(f"Missing file, skipping: {audio_path}")
+                    transcript_outputs.append(f"Could not transcribe {os.path.basename(audio_path)} — missing.")
+                    continue
+                _, ext = os.path.splitext(audio_path)
+                if ext.lower() not in AUDIO_EXTS:
+                    logs.append(f"Skipping unsupported file type: {audio_path}")
+                    transcript_outputs.append(f"Skipped unsupported {os.path.basename(audio_path)}.")
+                    continue
+                logs.append(f"Processing: {os.path.basename(audio_path)}")
+                to_transcribe_paths = [audio_path]
+                if use_chunks and chunk_seconds > 0:
+                    try:
+                        cmd = ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", audio_path]
+                        out = subprocess.check_output(cmd, stderr=subprocess.DEVNULL).decode().strip()
+                        duration = float(out)
+                    except Exception:
+                        duration = 0.0
+                    if duration > chunk_seconds and duration > 0:
+                        logs.append(f"Chunking {os.path.basename(audio_path)} ({int(duration)}s) into {chunk_seconds}s parts...")
+                        chunk_dir = tempfile.mkdtemp(prefix="chunks_")
+                        temp_dirs.append(chunk_dir)
+                        try:
+                            chunks = chunk_audio_ffmpeg(audio_path, chunk_dir, chunk_seconds)
+                            if chunks:
+                                to_transcribe_paths = chunks
+                                logs.append(f"Created {len(chunks)} chunks.")
+                            else:
+                                logs.append("No chunks created, using original file.")
+                        except Exception as e:
+                            logs.append(f"Chunking failed, will use original file. Error: {e}")
+                combined_texts = []
+                combined_segments = []
+                for piece in to_transcribe_paths:
+                    try:
+                        if backend == "openai-whisper":
+                            text, segments = transcribe_with_openai_whisper(model_local, piece)
+                        elif backend == "faster-whisper":
+                            text, segments = transcribe_with_faster_whisper(faster_model, piece)
+                        elif backend == "openai-api":
+                            text, segments = transcribe_with_openai_api(api_key, piece, model_name="whisper-1")
+                        else:
+                            raise RuntimeError("Unknown backend")
+                        if text:
+                            combined_texts.append(text.strip())
+                        if segments:
+                            combined_segments.extend(segments)
+                        logs.append(f"Transcribed: {os.path.basename(piece)}")
+                    except Exception as e:
+                        logs.append(f"Error transcribing {os.path.basename(piece)}: {e}\n{traceback.format_exc()}")
+                        combined_texts.append(f"[Error transcribing {os.path.basename(piece)}]")
+                final_text = "\n".join(combined_texts).strip()
+                transcript_outputs.append(f"Transcript for {os.path.basename(audio_path)}:\n{final_text}")
+                base = os.path.splitext(os.path.basename(audio_path))[0]
+                txt_path = os.path.join(out_dir, f"{base}.txt")
+                with open(txt_path, "w", encoding="utf-8") as f:
+                    f.write(final_text)
+                created_files.append(txt_path)
+                logs.append(f"Saved TXT: {txt_path}")
+                if export_srt and combined_segments:
+                    srt_text = segments_to_srt(combined_segments)
+                    srt_path = os.path.join(out_dir, f"{base}.srt")
+                    with open(srt_path, "w", encoding="utf-8") as f:
+                        f.write(srt_text)
+                    created_files.append(srt_path)
+                    logs.append(f"Saved SRT: {srt_path}")
+            except Exception as e:
+                logs.append(f"Fatal error while processing {audio_path}: {e}\n{traceback.format_exc()}")
+                transcript_outputs.append(f"Could not transcribe {os.path.basename(audio_path)} due to an error.")
+        merged_docx_path = None
+        if merge_docx:
+            combined_all = "\n\n---\n\n".join(transcript_outputs)
+            if combined_all.strip():
+                merged_docx_path = save_as_word(combined_all, filename=os.path.abspath("merged_transcripts.docx"))
+                created_files.append(merged_docx_path)
+                logs.append(f"Saved merged DOCX: {merged_docx_path}")
+        zip_path = None
+        if zip_outputs and created_files:
+            zip_path = os.path.abspath("transcripts_outputs.zip")
+            with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
+                for fpath in created_files:
+                    zf.write(fpath, arcname=os.path.basename(fpath))
+            logs.append(f"Created outputs ZIP: {zip_path}")
+        for d in temp_dirs:
+            try:
+                shutil.rmtree(d)
+            except Exception:
+                pass
+        docx_update = gr.update(value=merged_docx_path, visible=bool(merged_docx_path))
+        zip_update = gr.update(value=zip_path, visible=bool(zip_path))
+        return ("\n".join(logs), "\n\n".join(transcript_outputs), docx_update, zip_update)
+    except Exception as e:
+        logs.append(f"Unhandled error: {e}\n{traceback.format_exc()}")
+        return ("\n".join(logs), "", gr.update(value=None, visible=False), gr.update(value=None, visible=False))
+# -------------------------
+# Gradio UI
+# -------------------------
+def build_ui():
+    with gr.Blocks() as demo:
+        gr.Markdown("## Whisper Transcription Tool — Spaces-ready\nSelect backend, upload audio files or a ZIP, and choose options like chunking, SRT export, and merged DOCX/ZIP outputs.")
+        with gr.Row():
+            backend_dropdown = gr.Dropdown(choices=["openai-whisper", "faster-whisper", "openai-api"], value="openai-whisper", label="Backend")
+            model_dropdown = gr.Dropdown(
+                choices=["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large"],
+                value="base",
+                label="Model"
+            )
+        with gr.Row():
+            audio_input = gr.File(file_count="multiple", type="filepath", label="Upload Audio Files (Optional)")
+            zip_input = gr.File(file_count="single", type="filepath", label="Upload Zip of Audio (Optional)")
+            zip_password = gr.Textbox(label="Zip Password (Optional)", type="password")
+        with gr.Row():
+            use_chunks = gr.Checkbox(label="Enable chunking for long files (recommended for large files)", value=True)
+            chunk_seconds = gr.Slider(minimum=60, maximum=3600, value=DEFAULT_CHUNK_SECONDS, step=60, label="Chunk length (seconds)")
+        with gr.Row():
+            export_srt = gr.Checkbox(label="Export SRT files (timestamped subtitles)", value=True)
+            merge_docx = gr.Checkbox(label="Merge transcripts into one DOCX", value=False)
+            zip_outputs = gr.Checkbox(label="Produce ZIP with all outputs (TXT/SRT/DOCX)", value=True)
+        openai_key = gr.Textbox(label="OpenAI API Key (only needed for openai-api backend)", type="password", value=os.environ.get("OPENAI_API_KEY", ""))
+        transcribe_btn = gr.Button("Start Transcription")
+        log_output = gr.Textbox(label="Log Output", lines=12)
+        transcript_output = gr.Textbox(label="Transcripts", lines=20)
+        docx_file_output = gr.File(label="Download Merged Transcript (.docx)", visible=False)
+        zip_file_output = gr.File(label="Download Outputs (.zip)", visible=False)
+        def toggle_openai_key(backend_choice):
+            return gr.update(visible=(backend_choice == "openai-api"))
+        backend_dropdown.change(
+            toggle_openai_key,
+            inputs=[backend_dropdown],
+            outputs=[openai_key]
+        )
+        transcribe_btn.click(
+            process_files,
+            inputs=[audio_input, backend_dropdown, model_dropdown, use_chunks, chunk_seconds, export_srt, merge_docx, zip_outputs, zip_input, zip_password, openai_key],
+            outputs=[log_output, transcript_output, docx_file_output, zip_file_output],
+        )
+        return demo
+if __name__ == "__main__":
+    app = build_ui()
+    port = int(os.environ.get("PORT", 7860))
+    app.launch(server_name="0.0.0.0", server_port=port, enable_queue=True)

requirements_Version5.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+gradio>=3.0
+openai-whisper>=20230314
+faster-whisper>=0.7.0
+openai>=0.27.0
+pyzipper>=0.3.6
+python-docx>=0.8.11