abidlabs (HF Staff) committed
Commit 20292f7 · 1 Parent(s): 10e4317
Files changed (3):
  1. README.md +37 -4
  2. app.py +434 -4
  3. requirements.txt +8 -0
README.md CHANGED
@@ -1,12 +1,45 @@
 ---
 title: TextCut
-emoji: 📉
-colorFrom: green
-colorTo: yellow
 sdk: gradio
 sdk_version: 6.3.0
 app_file: app.py
 pinned: false
 ---

-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 ---
 title: TextCut
+emoji: ✂️
+colorFrom: blue
+colorTo: purple
 sdk: gradio
 sdk_version: 6.3.0
 app_file: app.py
 pinned: false
 ---

+# TextCut
+
+Edit videos by simply editing their transcript. Upload a video, get an automatic transcription with timestamps using VibeVoice-ASR, then delete lines from the transcript to cut those parts from your video.
+
+## Features
+
+- **Automatic Transcription**: Uses Microsoft's VibeVoice-ASR model for accurate speech-to-text with timestamps
+- **Real-time Highlighting**: The current sentence is highlighted (uppercased) as the video plays
+- **Simple Editing**: Delete lines from the transcript to mark segments for removal
+- **Video Cutting**: Automatically cuts the video based on the deleted transcript segments using FFmpeg
+
+## Usage
+
+1. **Upload**: Upload a video file (mp4, mov, etc.)
+2. **Transcribe**: Click "Transcribe" to generate the transcript with timestamps
+3. **Edit**: Delete the transcript lines you want to cut from the video
+4. **Apply Cuts**: Click "Apply Cuts" to generate the edited video
+
+## Requirements
+
+- Python 3.10+
+- FFmpeg installed on the system
+- CUDA-capable GPU (for transcription)
+
+## Local Development
+
+```bash
+pip install -r requirements.txt
+python app.py
+```
+
+## Hugging Face Spaces
+
+This app is designed to run on Hugging Face Spaces with ZeroGPU support for the transcription model.
app.py CHANGED
@@ -1,7 +1,437 @@
 import gradio as gr

-def greet(name):
-    return "Hello " + name + "!!"

-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()
+import os
+import tempfile
+import subprocess
+import json
+import re
+from typing import List, Dict, Optional, Tuple, Generator
 import gradio as gr

+try:
+    import spaces
+    HAS_SPACES = True
+except ImportError:
+    HAS_SPACES = False

+import torch
+import numpy as np
+
+
+MODEL_PATH = "microsoft/VibeVoice-ASR"
+model = None
+processor = None
+
+
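+# The model is loaded lazily on first use and cached in module globals, so
+# the weights are loaded once per process rather than at import time.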
+def get_model():
+    global model, processor
+    if model is None:
+        from vibevoice.modular.modeling_vibevoice_asr import VibeVoiceASRForConditionalGeneration
+        from vibevoice.processor.vibevoice_asr_processor import VibeVoiceASRProcessor
+
+        processor = VibeVoiceASRProcessor.from_pretrained(MODEL_PATH)
+        model = VibeVoiceASRForConditionalGeneration.from_pretrained(
+            MODEL_PATH,
+            dtype=torch.bfloat16,
+            device_map="auto",
+            trust_remote_code=True
+        )
+        model.eval()
+    return model, processor
+
+
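+# Greedy decoding (do_sample=False, num_beams=1) keeps the timestamped
+# transcript deterministic for a given audio file.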
+def transcribe_audio_inner(audio_path: str) -> List[Dict]:
+    model, processor = get_model()
+    device = next(model.parameters()).device
+
+    inputs = processor(
+        audio=audio_path,
+        sampling_rate=16000,
+        return_tensors="pt",
+        add_generation_prompt=True,
+    )
+
+    inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
+
+    with torch.no_grad():
+        output_ids = model.generate(
+            **inputs,
+            max_new_tokens=8192,
+            temperature=None,
+            do_sample=False,
+            num_beams=1,
+            pad_token_id=processor.pad_id,
+            eos_token_id=processor.tokenizer.eos_token_id,
+        )
+
+    generated_ids = output_ids[0, inputs['input_ids'].shape[1]:]
+    generated_text = processor.decode(generated_ids, skip_special_tokens=True)
+
+    try:
+        segments = processor.post_process_transcription(generated_text)
+    except Exception:
+        segments = parse_raw_transcript(generated_text)
+
+    return segments
+
+
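+# Fallback parser for when the processor's post-processing fails: it pulls
+# "[start - end] [Speaker] text" spans out of the raw generated string, and if
+# no timestamps are found at all, assigns a flat 3.0 s per sentence.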
+def parse_raw_transcript(text: str) -> List[Dict]:
+    segments = []
+    pattern = r'\[(\d+\.?\d*)\s*-\s*(\d+\.?\d*)\]\s*(?:\[([^\]]*)\])?\s*(.+?)(?=\[\d+\.?\d*\s*-|\Z)'
+    matches = re.findall(pattern, text, re.DOTALL)
+
+    for match in matches:
+        start, end, speaker, content = match
+        segments.append({
+            'start': float(start),
+            'end': float(end),
+            'speaker': speaker.strip() if speaker else 'Speaker',
+            'text': content.strip()
+        })
+
+    if not segments and text.strip():
+        sentences = re.split(r'(?<=[.!?])\s+', text.strip())
+        duration_per_sentence = 3.0
+        for i, sentence in enumerate(sentences):
+            if sentence.strip():
+                segments.append({
+                    'start': i * duration_per_sentence,
+                    'end': (i + 1) * duration_per_sentence,
+                    'speaker': 'Speaker',
+                    'text': sentence.strip()
+                })
+
+    return segments
+
+
+if HAS_SPACES:
+    @spaces.GPU(duration=120)
+    def transcribe_with_gpu(audio_path: str) -> List[Dict]:
+        return transcribe_audio_inner(audio_path)
+else:
+    def transcribe_with_gpu(audio_path: str) -> List[Dict]:
+        return transcribe_audio_inner(audio_path)
+
+
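+# ffmpeg extracts the audio track as 16 kHz mono PCM WAV, matching the
+# sampling_rate=16000 passed to the ASR processor.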
+def extract_audio(video_path: str) -> str:
+    audio_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
+    cmd = [
+        "ffmpeg", "-y", "-i", video_path,
+        "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
+        audio_path
+    ]
+    subprocess.run(cmd, capture_output=True, check=True)
+    return audio_path
+
+
+def get_video_duration(video_path: str) -> float:
+    cmd = [
+        "ffprobe", "-v", "error",
+        "-show_entries", "format=duration",
+        "-of", "json", video_path
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+    data = json.loads(result.stdout)
+    return float(data["format"]["duration"])
+
+
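+# The editable transcript is plain text, one segment per line in the form
+# "[start-end] text", e.g. "[0.00-3.52] Hello and welcome."
+# parse_transcript_to_segments() below inverts this formatting.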
+def segments_to_transcript(segments: List[Dict]) -> str:
+    lines = []
+    for seg in segments:
+        start = seg['start']
+        end = seg['end']
+        text = seg['text']
+        lines.append(f"[{start:.2f}-{end:.2f}] {text}")
+    return "\n".join(lines)
+
+
+def parse_transcript_to_segments(transcript: str) -> List[Dict]:
+    segments = []
+    pattern = r'\[(\d+\.?\d*)-(\d+\.?\d*)\]\s*(.+)'
+
+    for line in transcript.strip().split("\n"):
+        line = line.strip()
+        if not line:
+            continue
+        match = re.match(pattern, line)
+        if match:
+            start, end, text = match.groups()
+            segments.append({
+                'start': float(start),
+                'end': float(end),
+                'text': text.strip()
+            })
+
+    return segments
+
+
+def find_current_segment_index(segments: List[Dict], current_time: float) -> int:
+    for i, seg in enumerate(segments):
+        if seg['start'] <= current_time < seg['end']:
+            return i
+    return -1
+
+
+def format_transcript_with_highlight(segments: List[Dict], current_index: int) -> str:
+    lines = []
+    for i, seg in enumerate(segments):
+        start = seg['start']
+        end = seg['end']
+        text = seg['text']
+        line = f"[{start:.2f}-{end:.2f}] {text}"
+        if i == current_index:
+            line = line.upper()
+        lines.append(line)
+    return "\n".join(lines)
+
+
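+# Each kept segment is re-encoded into its own clip (stream-copy cuts would
+# snap to keyframes), then the clips are joined losslessly with ffmpeg's
+# concat demuxer and "-c copy".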
+def cut_video_segments(video_path: str, segments_to_keep: List[Dict]) -> Optional[str]:
+    if not segments_to_keep:
+        return None
+
+    segments_to_keep = sorted(segments_to_keep, key=lambda x: x['start'])
+
+    temp_dir = tempfile.mkdtemp()
+    clip_files = []
+
+    for i, seg in enumerate(segments_to_keep):
+        clip_path = os.path.join(temp_dir, f"clip_{i:04d}.mp4")
+        cmd = [
+            "ffmpeg", "-y", "-i", video_path,
+            "-ss", str(seg['start']),
+            "-to", str(seg['end']),
+            "-c:v", "libx264", "-c:a", "aac",
+            "-avoid_negative_ts", "make_zero",
+            clip_path
+        ]
+        subprocess.run(cmd, capture_output=True, check=True)
+        clip_files.append(clip_path)
+
+    list_file = os.path.join(temp_dir, "list.txt")
+    with open(list_file, "w") as f:
+        for clip in clip_files:
+            f.write(f"file '{clip}'\n")
+
+    output_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
+    cmd = [
+        "ffmpeg", "-y", "-f", "concat", "-safe", "0",
+        "-i", list_file,
+        "-c", "copy",
+        output_path
+    ]
+    subprocess.run(cmd, capture_output=True, check=True)
+
+    for clip in clip_files:
+        os.remove(clip)
+    os.remove(list_file)
+    os.rmdir(temp_dir)
+
+    return output_path
+
+
+def process_upload(video_file):
+    if video_file is None:
+        return None, "", [], "Please upload a video file."
+
+    video_path = video_file
+    return video_path, "", [], "Video uploaded. Click 'Transcribe' to start transcription."
+
+
+def run_transcription(video_path, progress=gr.Progress()):
+    if video_path is None:
+        return "", [], "No video uploaded."
+
+    progress(0.1, desc="Extracting audio...")
+
+    try:
+        audio_path = extract_audio(video_path)
+    except Exception as e:
+        return "", [], f"Error extracting audio: {str(e)}"
+
+    progress(0.3, desc="Running transcription (this may take a while)...")
+
+    try:
+        segments = transcribe_with_gpu(audio_path)
+    except Exception as e:
+        return "", [], f"Error during transcription: {str(e)}"
+    finally:
+        if os.path.exists(audio_path):
+            os.remove(audio_path)
+
+    progress(0.9, desc="Formatting transcript...")
+
+    transcript = segments_to_transcript(segments)
+
+    progress(1.0, desc="Done!")
+
+    return transcript, segments, f"Transcription complete! {len(segments)} segments found."
+
+
+def update_highlight(video_path, original_segments, current_time):
+    if not original_segments:
+        return ""
+
+    current_index = find_current_segment_index(original_segments, current_time)
+    return format_transcript_with_highlight(original_segments, current_index)
+
+
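+# Deleted lines are detected by matching segment text (stripped, lowercased)
+# against the edited transcript; surviving originals keep their timestamps.
+# Note: segments with identical text cannot be cut independently of each other.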
+def apply_cuts(video_path, edited_transcript, original_segments):
+    if video_path is None:
+        return None, "No video to process."
+
+    if not original_segments:
+        return None, "No transcript available. Please transcribe first."
+
+    edited_segments = parse_transcript_to_segments(edited_transcript)
+
+    original_texts = {seg['text'].strip().lower() for seg in original_segments}
+    edited_texts = {seg['text'].strip().lower() for seg in edited_segments}
+
+    segments_to_keep = []
+    for seg in original_segments:
+        if seg['text'].strip().lower() in edited_texts:
+            segments_to_keep.append(seg)
+
+    if not segments_to_keep:
+        return None, "All segments were removed. Cannot create empty video."
+
+    deleted_count = len(original_segments) - len(segments_to_keep)
+
+    if deleted_count == 0:
+        return video_path, "No changes detected. Original video returned."
+
+    try:
+        output_path = cut_video_segments(video_path, segments_to_keep)
+        if output_path:
+            return output_path, f"Video edited! Removed {deleted_count} segment(s)."
+        else:
+            return None, "Error creating edited video."
+    except Exception as e:
+        return None, f"Error cutting video: {str(e)}"
+
+
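+# JS_CODE is injected via gr.HTML below: it finds the page's <video> element,
+# listens for 'timeupdate' (throttled to one write per 500 ms), and copies
+# video.currentTime into the "Current Video Time" Number input
+# (elem_id="current-time-input"), whose change event re-renders the highlight.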
+JS_CODE = """
+<script>
+(function() {
+    let lastUpdate = 0;
+    const updateInterval = 500;
+
+    function findVideoElement() {
+        const videos = document.querySelectorAll('video');
+        for (const video of videos) {
+            if (video.src && !video.src.includes('blob:')) {
+                return video;
+            }
+        }
+        return videos[0];
+    }
+
+    function setupVideoListener() {
+        const video = findVideoElement();
+        if (!video) {
+            setTimeout(setupVideoListener, 1000);
+            return;
+        }
+
+        video.addEventListener('timeupdate', function() {
+            const now = Date.now();
+            if (now - lastUpdate < updateInterval) return;
+            lastUpdate = now;
+
+            const timeInput = document.querySelector('#current-time-input input');
+            if (timeInput) {
+                timeInput.value = video.currentTime.toFixed(2);
+                timeInput.dispatchEvent(new Event('input', { bubbles: true }));
+            }
+        });
+    }
+
+    if (document.readyState === 'loading') {
+        document.addEventListener('DOMContentLoaded', setupVideoListener);
+    } else {
+        setupVideoListener();
+    }
+
+    const observer = new MutationObserver(function(mutations) {
+        setupVideoListener();
+    });
+    observer.observe(document.body, { childList: true, subtree: true });
+})();
+</script>
+"""
+
+
+with gr.Blocks(title="TextCut - Edit Videos by Editing Transcripts") as demo:
+    gr.Markdown("# TextCut")
+    gr.Markdown("Edit videos by simply editing their transcript. Upload a video, transcribe it, then delete lines to cut those parts from the video.")
+    gr.HTML(JS_CODE)
+
+    original_segments = gr.State([])
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("### Transcript")
+            transcript_box = gr.Textbox(
+                label="Transcript (delete lines to cut those parts)",
+                lines=15,
+                interactive=True,
+                placeholder="Transcript will appear here after transcription..."
+            )
+
+            current_time = gr.Number(
+                label="Current Video Time (seconds)",
+                value=0,
+                visible=True,
+                elem_id="current-time-input"
+            )
+
+            highlight_btn = gr.Button("Update Highlight", size="sm")
+
+        with gr.Column(scale=1):
+            gr.Markdown("### Video")
+            video_input = gr.Video(
+                label="Upload Video",
+                sources=["upload"],
+                interactive=True
+            )
+
+            with gr.Row():
+                transcribe_btn = gr.Button("Transcribe", variant="primary")
+                cut_btn = gr.Button("Apply Cuts", variant="secondary")
+
+            status_text = gr.Textbox(label="Status", interactive=False, lines=2)
+
+    gr.Markdown("### Edited Video Output")
+    video_output = gr.Video(label="Edited Video")
+
+    video_input.change(
+        fn=process_upload,
+        inputs=[video_input],
+        outputs=[video_input, transcript_box, original_segments, status_text]
+    )
+
+    transcribe_btn.click(
+        fn=run_transcription,
+        inputs=[video_input],
+        outputs=[transcript_box, original_segments, status_text]
+    )
+
+    highlight_btn.click(
+        fn=update_highlight,
+        inputs=[video_input, original_segments, current_time],
+        outputs=[transcript_box]
+    )
+
+    current_time.change(
+        fn=update_highlight,
+        inputs=[video_input, original_segments, current_time],
+        outputs=[transcript_box]
+    )
+
+    cut_btn.click(
+        fn=apply_cuts,
+        inputs=[video_input, transcript_box, original_segments],
+        outputs=[video_output, status_text]
+    )
+
+
+if __name__ == "__main__":
+    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
+gradio>=6.0.0
+torch>=2.0.0
+transformers>=4.40.0
+soundfile
+numpy
+spaces
+vibevoice @ git+https://github.com/microsoft/VibeVoice.git
+