audio-to-srt

Running

App Files Files Community

Nav3005 committed on Jan 24

Commit

0536406

verified ·

1 Parent(s): 73672ed

Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

.gitattributes +1 -0
.gitignore +35 -0
README.md +2 -8
app.py +196 -0
audio.srt +40 -0
audio.wav +3 -0
requirements.txt +96 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+audio.wav filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,35 @@

+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+env/
+venv/
+ENV/
+.venv/
+build/
+dist/
+*.egg-info/
+.DS_Store
+.ipynb_checkpoints
+.snapshots
+.gradio
+*.static/
+templates/
+uploads/
+outputs/
+# VSCode
+.vscode/
+.idea/
+*.code-workspace
+# MacOS
+.DS_Store
+# Logs
+logs/
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+dump/

README.md CHANGED Viewed

@@ -1,12 +1,6 @@
 ---
-title: Audio To Srt
-emoji: 🐨
-colorFrom: yellow
-colorTo: indigo
 sdk: gradio
 sdk_version: 6.4.0
-app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: audio-to-srt
+app_file: app.py
 sdk: gradio
 sdk_version: 6.4.0
 ---

app.py ADDED Viewed

	@@ -0,0 +1,196 @@

+import os
+os.environ["OMP_NUM_THREADS"] = "1"
+os.environ["MKL_NUM_THREADS"] = "1"
+import gradio as gr
+import pysrt
+import requests
+import tempfile
+from faster_whisper import WhisperModel
+from datetime import timedelta
+from urllib.parse import urlparse
+# -----------------------------
+# Core subtitle generator
+# -----------------------------
+class FibonacciSubtitleGenerator:
+    def __init__(self, model_size="base"):
+        self.model = WhisperModel(
+            model_size,
+            device="cpu",
+            compute_type="int8"
+        )
+    def transcribe(self, audio_path):
+        segments, _ = self.model.transcribe(
+            audio_path,
+            word_timestamps=True,
+            vad_filter=True
+        )
+        return segments
+    def extract_words(self, segments):
+        words = []
+        for segment in segments:
+            if not segment.words:
+                continue
+            for w in segment.words:
+                if w.start is None or w.end is None:
+                    continue
+                words.append({
+                    "word": w.word.strip(),
+                    "start": float(w.start),
+                    "end": float(w.end)
+                })
+        return words
+    def create_fibonacci_subtitles(self, words):
+        subs = pysrt.SubRipFile()
+        total_words = len(words)
+        index = 0
+        subtitle_index = 1
+        prev_size = 1
+        curr_size = 1
+        while index < total_words:
+            planned_size = curr_size
+            remaining = total_words - (index + planned_size)
+            next_size = prev_size + curr_size
+            # absorb leftovers
+            if remaining > 0 and remaining < next_size:
+                planned_size += remaining
+            subtitle_words = []
+            start_time = None
+            end_time = None
+            for _ in range(planned_size):
+                if index >= total_words:
+                    break
+                w = words[index]
+                subtitle_words.append(w["word"])
+                if start_time is None:
+                    start_time = w["start"]
+                end_time = w["end"]
+                index += 1
+            subs.append(
+                pysrt.SubRipItem(
+                    index=subtitle_index,
+                    start=self._to_time(start_time),
+                    end=self._to_time(end_time),
+                    text=" ".join(subtitle_words)
+                )
+            )
+            subtitle_index += 1
+            if planned_size == curr_size:
+                prev_size, curr_size = curr_size, next_size
+            else:
+                break
+        return subs
+    def _to_time(self, seconds):
+        td = timedelta(seconds=seconds)
+        return pysrt.SubRipTime(
+            hours=td.seconds // 3600,
+            minutes=(td.seconds % 3600) // 60,
+            seconds=td.seconds % 60,
+            milliseconds=td.microseconds // 1000
+        )
+# -----------------------------
+# Helper: download audio from URL
+# -----------------------------
+def download_audio(url: str) -> str:
+    parsed = urlparse(url)
+    if parsed.scheme not in ("http", "https"):
+        raise ValueError("Invalid URL scheme")
+    response = requests.get(url, stream=True, timeout=30)
+    response.raise_for_status()
+    suffix = os.path.splitext(parsed.path)[1] or ".wav"
+    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
+    for chunk in response.iter_content(chunk_size=8192):
+        tmp.write(chunk)
+    tmp.close()
+    return tmp.name
+# -----------------------------
+# Gradio callable function
+# -----------------------------
+def generate_srt(audio_file, audio_url, model_size):
+    # Validation: exactly one input must be provided
+    if bool(audio_file) == bool(audio_url):
+        raise gr.Error(
+            "Please provide EITHER an audio file OR an audio URL (not both)."
+        )
+    # Resolve audio path
+    if audio_url:
+        audio_path = download_audio(audio_url)
+    else:
+        audio_path = audio_file
+    generator = FibonacciSubtitleGenerator(model_size)
+    segments = generator.transcribe(audio_path)
+    words = generator.extract_words(segments)
+    subs = generator.create_fibonacci_subtitles(words)
+    out = tempfile.NamedTemporaryFile(delete=False, suffix=".srt")
+    subs.save(out.name, encoding="utf-8")
+    return out.name
+# -----------------------------
+# Gradio UI
+# -----------------------------
+# Layout: a row with the two alternative audio inputs, then the model picker,
+# the trigger button and the download slot for the generated file.
+with gr.Blocks(title="Fibonacci Subtitle Generator") as demo:
+    gr.Markdown(
+        """
+        # srt generator
+        """
+    )
+    with gr.Row():
+        # type="filepath" hands generate_srt a path on disk.
+        audio_file = gr.Audio(
+            label="Upload Audio File",
+            type="filepath"
+        )
+        audio_url = gr.Textbox(
+            label="Audio URL (http/https)",
+            placeholder="https://example.com/audio.wav"
+        )
+    model_choice = gr.Dropdown(
+        choices=["tiny", "base", "small", "medium"],
+        value="base",
+        label="Whisper Model"
+    )
+    generate_btn = gr.Button("Generate SRT")
+    output_file = gr.File(label="Download SRT")
+    # Inputs are passed positionally in the order generate_srt declares them.
+    generate_btn.click(
+        fn=generate_srt,
+        inputs=[audio_file, audio_url, model_choice],
+        outputs=output_file
+    )
+if __name__ == "__main__":
+    # NOTE(review): share=True asks Gradio for a public share link; confirm
+    # this is wanted when the app is already hosted as a Space.
+    demo.launch(share=True)

audio.srt ADDED Viewed

	@@ -0,0 +1,40 @@

+1
+00:00:00,000 --> 00:00:00,180
+I
+2
+00:00:00,180 --> 00:00:00,920
+spotted buried
+3
+00:00:00,920 --> 00:00:01,700
+in the appendix
+4
+00:00:01,700 --> 00:00:03,460
+proofs at Minotrew, a verb
+5
+00:00:03,460 --> 00:00:06,340
+that leans backward, were, for three seconds it
+6
+00:00:06,340 --> 00:00:10,140
+glows on my screen before the correction order coughs through the telescreen. The
+7
+00:00:10,140 --> 00:00:16,600
+chute yawns like a mouth, my finger hovers over send, the page updates, the past is repaired, but the syllable burrows.
+8
+00:00:16,940 --> 00:00:25,440
+In new speed there is no room to lean only to stand. I sign out, drift past the two minutes and slip into the pearl quarter where the air smells of soap and rain.
+9
+00:00:25,440 --> 00:00:41,460
+In a junk shop window, a small glass round as an eye, a paperweight trapping a curl of pale paper and a bubble of air. I buy it for nothing I can afford. Back home I write it tiny on a torn scrap, were. I tilt the paperweight and the word distort swims, multiplies a
+10
+00:00:41,460 --> 00:01:18,000
+whole tense blooms like a reef. Bells ring somewhere far off, names I almost remember, and for a moment the room brightens with the light without edges. Then the telescreen clears its throat and the memory hole exhales. I let the scrap go. Smoke eats it. The glass stays cool in my palm. Years later, in a quiet building that needs no slogans, a curator dusts a cracked paperweight labeled. Relic, airstrip one, ministry of truth, inside clings a browned curl of paper where an ink blot suggests a letter. The curator squints guesses a word that leans backward. She whispers it, an ordinary word, and the city's bells answer as if they always had.

audio.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ec0fc316ff0d84bced1d9fcfb68fb0fd556746cc52477c303615ad6ae4e2d8f3
+size 5005356

requirements.txt ADDED Viewed

	@@ -0,0 +1,96 @@

+aiofiles==24.1.0
+annotated-doc==0.0.4
+annotated-types==0.7.0
+anyio==4.12.1
+attrs==25.4.0
+av==16.1.0
+blinker==1.9.0
+brotli==1.2.0
+certifi==2026.1.4
+cffi==2.0.0
+chardet==5.2.0
+charset-normalizer==3.4.4
+click==8.3.1
+coloredlogs==15.0.1
+cryptography==46.0.3
+ctranslate2==4.6.3
+fastapi==0.128.0
+faster-whisper==1.2.1
+ffmpeg-python==0.2.0
+ffmpy==1.0.0
+filelock==3.20.3
+Flask==3.1.2
+flatbuffers==25.12.19
+fsspec==2026.1.0
+future==1.0.0
+gradio==6.4.0
+gradio_client==2.0.3
+groovy==0.1.2
+h11==0.16.0
+hf-xet==1.2.0
+httpcore==1.0.9
+httpx==0.28.1
+httpx-sse==0.4.3
+huggingface_hub==1.3.3
+humanfriendly==10.0
+idna==3.11
+itsdangerous==2.2.0
+Jinja2==3.1.6
+jsonschema==4.26.0
+jsonschema-specifications==2025.9.1
+llvmlite==0.46.0
+markdown-it-py==4.0.0
+MarkupSafe==3.0.3
+mcp==1.25.0
+mdurl==0.1.2
+more-itertools==10.8.0
+mpmath==1.3.0
+networkx==3.6.1
+numba==0.63.1
+numpy==2.3.5
+onnxruntime==1.23.2
+openai-whisper==20250625
+orjson==3.11.5
+packaging==26.0
+pandas==2.3.3
+pillow==12.1.0
+protobuf==6.33.4
+pycparser==3.0
+pydantic==2.12.5
+pydantic-settings==2.12.0
+pydantic_core==2.41.5
+pydub==0.25.1
+Pygments==2.19.2
+PyJWT==2.10.1
+pysrt==1.1.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.2.1
+python-multipart==0.0.21
+pytz==2025.2
+PyYAML==6.0.3
+referencing==0.37.0
+regex==2026.1.15
+requests==2.32.5
+rich==14.2.0
+rpds-py==0.30.0
+safehttpx==0.1.7
+semantic-version==2.10.0
+setuptools==80.10.1
+shellingham==1.5.4
+six==1.17.0
+sse-starlette==3.2.0
+starlette==0.50.0
+sympy==1.14.0
+tiktoken==0.12.0
+tokenizers==0.22.2
+tomlkit==0.13.3
+torch==2.10.0
+tqdm==4.67.1
+typer==0.21.1
+typer-slim==0.21.1
+typing-inspection==0.4.2
+typing_extensions==4.15.0
+tzdata==2025.3
+urllib3==2.6.3
+uvicorn==0.40.0
+Werkzeug==3.1.5