abidlabs (HF Staff) committed
Commit 20292f7 · 1 Parent(s): 10e4317
Files changed (3):
  1. README.md +37 -4
  2. app.py +434 -4
  3. requirements.txt +8 -0
README.md CHANGED
@@ -1,12 +1,45 @@
 ---
 title: TextCut
-emoji: 📉
-colorFrom: green
-colorTo: yellow
 sdk: gradio
 sdk_version: 6.3.0
 app_file: app.py
 pinned: false
 ---

-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 ---
 title: TextCut
+emoji: ✂️
+colorFrom: blue
+colorTo: purple
 sdk: gradio
 sdk_version: 6.3.0
 app_file: app.py
 pinned: false
 ---

+# TextCut
+
+Edit videos by simply editing their transcript. Upload a video, get an automatic transcription with timestamps using VibeVoice-ASR, then delete lines from the transcript to cut those parts from your video.
+
+## Features
+
+- **Automatic Transcription**: Uses Microsoft's VibeVoice-ASR model for accurate speech-to-text with timestamps
+- **Real-time Highlighting**: The current sentence is highlighted (uppercased) as the video plays
+- **Simple Editing**: Delete lines from the transcript to mark segments for removal
+- **Video Cutting**: Automatically cuts the video based on the deleted transcript segments using FFmpeg
+
+## Usage
+
+1. **Upload**: Upload a video file (mp4, mov, etc.)
+2. **Transcribe**: Click "Transcribe" to generate the transcript with timestamps
+3. **Edit**: Delete the transcript lines you want to cut from the video
+4. **Apply Cuts**: Click "Apply Cuts" to generate the edited video
+
+## Requirements
+
+- Python 3.10+
+- FFmpeg installed on the system
+- CUDA-capable GPU (for transcription)
+
+## Local Development
+
+```bash
+pip install -r requirements.txt
+python app.py
+```
+
+## Hugging Face Spaces
+
+This app is designed to run on Hugging Face Spaces with ZeroGPU support for the transcription model.
app.py CHANGED
@@ -1,7 +1,437 @@
 import gradio as gr

-def greet(name):
-    return "Hello " + name + "!!"

-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()
+import os
+import tempfile
+import subprocess
+import json
+import re
+from typing import List, Dict, Optional, Tuple, Generator
 import gradio as gr

+try:
+    import spaces
+    HAS_SPACES = True
+except ImportError:
+    HAS_SPACES = False

+import torch
+import numpy as np
+
+
+MODEL_PATH = "microsoft/VibeVoice-ASR"
+model = None
+processor = None
+
+
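+# The model is loaded lazily on first use and cached in module globals, so
+# the weights are loaded once per process rather than at import time.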
+def get_model():
+    global model, processor
+    if model is None:
+        from vibevoice.modular.modeling_vibevoice_asr import VibeVoiceASRForConditionalGeneration
+        from vibevoice.processor.vibevoice_asr_processor import VibeVoiceASRProcessor
+
+        processor = VibeVoiceASRProcessor.from_pretrained(MODEL_PATH)
+        model = VibeVoiceASRForConditionalGeneration.from_pretrained(
+            MODEL_PATH,
+            dtype=torch.bfloat16,
+            device_map="auto",
+            trust_remote_code=True
+        )
+        model.eval()
+    return model, processor
+
+
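+# Greedy decoding (do_sample=False, num_beams=1) keeps the timestamped
+# transcript deterministic for a given audio file.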
+def transcribe_audio_inner(audio_path: str) -> List[Dict]:
+    model, processor = get_model()
+    device = next(model.parameters()).device
+
+    inputs = processor(
+        audio=audio_path,
+        sampling_rate=16000,
+        return_tensors="pt",
+        add_generation_prompt=True,
+    )
+
+    inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
+
+    with torch.no_grad():
+        output_ids = model.generate(
+            **inputs,
+            max_new_tokens=8192,
+            temperature=None,
+            do_sample=False,
+            num_beams=1,
+            pad_token_id=processor.pad_id,
+            eos_token_id=processor.tokenizer.eos_token_id,
+        )
+
+    generated_ids = output_ids[0, inputs['input_ids'].shape[1]:]
+    generated_text = processor.decode(generated_ids, skip_special_tokens=True)
+
+    try:
+        segments = processor.post_process_transcription(generated_text)
+    except Exception:
+        segments = parse_raw_transcript(generated_text)
+
+    return segments
+
+
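+# Fallback parser for when the processor's post-processing fails: it pulls
+# "[start - end] [Speaker] text" spans out of the raw generated string, and if
+# no timestamps are found at all, assigns a flat 3.0 s per sentence.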
+def parse_raw_transcript(text: str) -> List[Dict]:
+    segments = []
+    pattern = r'\[(\d+\.?\d*)\s*-\s*(\d+\.?\d*)\]\s*(?:\[([^\]]*)\])?\s*(.+?)(?=\[\d+\.?\d*\s*-|\Z)'
+    matches = re.findall(pattern, text, re.DOTALL)
+
+    for match in matches:
+        start, end, speaker, content = match
+        segments.append({
+            'start': float(start),
+            'end': float(end),
+            'speaker': speaker.strip() if speaker else 'Speaker',
+            'text': content.strip()
+        })
+
+    if not segments and text.strip():
+        sentences = re.split(r'(?<=[.!?])\s+', text.strip())
+        duration_per_sentence = 3.0
+        for i, sentence in enumerate(sentences):
+            if sentence.strip():
+                segments.append({
+                    'start': i * duration_per_sentence,
+                    'end': (i + 1) * duration_per_sentence,
+                    'speaker': 'Speaker',
+                    'text': sentence.strip()
+                })
+
+    return segments
+
+
+if HAS_SPACES:
+    @spaces.GPU(duration=120)
+    def transcribe_with_gpu(audio_path: str) -> List[Dict]:
+        return transcribe_audio_inner(audio_path)
+else:
+    def transcribe_with_gpu(audio_path: str) -> List[Dict]:
+        return transcribe_audio_inner(audio_path)
+
+
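+# ffmpeg extracts the audio track as 16 kHz mono PCM WAV, matching the
+# sampling_rate=16000 passed to the ASR processor.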
+def extract_audio(video_path: str) -> str:
+    audio_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
+    cmd = [
+        "ffmpeg", "-y", "-i", video_path,
+        "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
+        audio_path
+    ]
+    subprocess.run(cmd, capture_output=True, check=True)
+    return audio_path
+
+
+def get_video_duration(video_path: str) -> float:
+    cmd = [
+        "ffprobe", "-v", "error",
+        "-show_entries", "format=duration",
+        "-of", "json", video_path
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+    data = json.loads(result.stdout)
+    return float(data["format"]["duration"])
+
+
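+# The editable transcript is plain text, one segment per line in the form
+# "[start-end] text", e.g. "[0.00-3.52] Hello and welcome."
+# parse_transcript_to_segments() below inverts this formatting.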
+def segments_to_transcript(segments: List[Dict]) -> str:
+    lines = []
+    for seg in segments:
+        start = seg['start']
+        end = seg['end']
+        text = seg['text']
+        lines.append(f"[{start:.2f}-{end:.2f}] {text}")
+    return "\n".join(lines)
+
+
+def parse_transcript_to_segments(transcript: str) -> List[Dict]:
+    segments = []
+    pattern = r'\[(\d+\.?\d*)-(\d+\.?\d*)\]\s*(.+)'
+
+    for line in transcript.strip().split("\n"):
+        line = line.strip()
+        if not line:
+            continue
+        match = re.match(pattern, line)
+        if match:
+            start, end, text = match.groups()
+            segments.append({
+                'start': float(start),
+                'end': float(end),
+                'text': text.strip()
+            })
+
+    return segments
+
+
+def find_current_segment_index(segments: List[Dict], current_time: float) -> int:
+    for i, seg in enumerate(segments):
+        if seg['start'] <= current_time < seg['end']:
+            return i
+    return -1
+
+
+def format_transcript_with_highlight(segments: List[Dict], current_index: int) -> str:
+    lines = []
+    for i, seg in enumerate(segments):
+        start = seg['start']
+        end = seg['end']
+        text = seg['text']
+        line = f"[{start:.2f}-{end:.2f}] {text}"
+        if i == current_index:
+            line = line.upper()
+        lines.append(line)
+    return "\n".join(lines)
+
+
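+# Each kept segment is re-encoded into its own clip (stream-copy cuts would
+# snap to keyframes), then the clips are joined losslessly with ffmpeg's
+# concat demuxer and "-c copy".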
+def cut_video_segments(video_path: str, segments_to_keep: List[Dict]) -> Optional[str]:
+    if not segments_to_keep:
+        return None
+
+    segments_to_keep = sorted(segments_to_keep, key=lambda x: x['start'])
+
+    temp_dir = tempfile.mkdtemp()
+    clip_files = []
+
+    for i, seg in enumerate(segments_to_keep):
+        clip_path = os.path.join(temp_dir, f"clip_{i:04d}.mp4")
+        cmd = [
+            "ffmpeg", "-y", "-i", video_path,
+            "-ss", str(seg['start']),
+            "-to", str(seg['end']),
+            "-c:v", "libx264", "-c:a", "aac",
+            "-avoid_negative_ts", "make_zero",
+            clip_path
+        ]
+        subprocess.run(cmd, capture_output=True, check=True)
+        clip_files.append(clip_path)
+
+    list_file = os.path.join(temp_dir, "list.txt")
+    with open(list_file, "w") as f:
+        for clip in clip_files:
+            f.write(f"file '{clip}'\n")
+
+    output_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
+    cmd = [
+        "ffmpeg", "-y", "-f", "concat", "-safe", "0",
+        "-i", list_file,
+        "-c", "copy",
+        output_path
+    ]
+    subprocess.run(cmd, capture_output=True, check=True)
+
+    for clip in clip_files:
+        os.remove(clip)
+    os.remove(list_file)
+    os.rmdir(temp_dir)
+
+    return output_path
+
+
+def process_upload(video_file):
+    if video_file is None:
+        return None, "", [], "Please upload a video file."
+
+    video_path = video_file
+    return video_path, "", [], "Video uploaded. Click 'Transcribe' to start transcription."
+
+
+def run_transcription(video_path, progress=gr.Progress()):
+    if video_path is None:
+        return "", [], "No video uploaded."
+
+    progress(0.1, desc="Extracting audio...")
+
+    try:
+        audio_path = extract_audio(video_path)
+    except Exception as e:
+        return "", [], f"Error extracting audio: {str(e)}"
+
+    progress(0.3, desc="Running transcription (this may take a while)...")
+
+    try:
+        segments = transcribe_with_gpu(audio_path)
+    except Exception as e:
+        return "", [], f"Error during transcription: {str(e)}"
+    finally:
+        if os.path.exists(audio_path):
+            os.remove(audio_path)
+
+    progress(0.9, desc="Formatting transcript...")
+
+    transcript = segments_to_transcript(segments)
+
+    progress(1.0, desc="Done!")
+
+    return transcript, segments, f"Transcription complete! {len(segments)} segments found."
+
+
+def update_highlight(video_path, original_segments, current_time):
+    if not original_segments:
+        return ""
+
+    current_index = find_current_segment_index(original_segments, current_time)
+    return format_transcript_with_highlight(original_segments, current_index)
+
+
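+# Deleted lines are detected by matching segment text (stripped, lowercased)
+# against the edited transcript; surviving originals keep their timestamps.
+# Note: segments with identical text cannot be cut independently of each other.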
+def apply_cuts(video_path, edited_transcript, original_segments):
+    if video_path is None:
+        return None, "No video to process."
+
+    if not original_segments:
+        return None, "No transcript available. Please transcribe first."
+
+    edited_segments = parse_transcript_to_segments(edited_transcript)
+
+    original_texts = {seg['text'].strip().lower() for seg in original_segments}
+    edited_texts = {seg['text'].strip().lower() for seg in edited_segments}
+
+    segments_to_keep = []
+    for seg in original_segments:
+        if seg['text'].strip().lower() in edited_texts:
+            segments_to_keep.append(seg)
+
+    if not segments_to_keep:
+        return None, "All segments were removed. Cannot create empty video."
+
+    deleted_count = len(original_segments) - len(segments_to_keep)
+
+    if deleted_count == 0:
+        return video_path, "No changes detected. Original video returned."
+
+    try:
+        output_path = cut_video_segments(video_path, segments_to_keep)
+        if output_path:
+            return output_path, f"Video edited! Removed {deleted_count} segment(s)."
+        else:
+            return None, "Error creating edited video."
+    except Exception as e:
+        return None, f"Error cutting video: {str(e)}"
+
+
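+# JS_CODE is injected via gr.HTML below: it finds the page's <video> element,
+# listens for 'timeupdate' (throttled to one write per 500 ms), and copies
+# video.currentTime into the "Current Video Time" Number input
+# (elem_id="current-time-input"), whose change event re-renders the highlight.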
+JS_CODE = """
+<script>
+(function() {
+    let lastUpdate = 0;
+    const updateInterval = 500;
+
+    function findVideoElement() {
+        const videos = document.querySelectorAll('video');
+        for (const video of videos) {
+            if (video.src && !video.src.includes('blob:')) {
+                return video;
+            }
+        }
+        return videos[0];
+    }
+
+    function setupVideoListener() {
+        const video = findVideoElement();
+        if (!video) {
+            setTimeout(setupVideoListener, 1000);
+            return;
+        }
+
+        video.addEventListener('timeupdate', function() {
+            const now = Date.now();
+            if (now - lastUpdate < updateInterval) return;
+            lastUpdate = now;
+
+            const timeInput = document.querySelector('#current-time-input input');
+            if (timeInput) {
+                timeInput.value = video.currentTime.toFixed(2);
+                timeInput.dispatchEvent(new Event('input', { bubbles: true }));
+            }
+        });
+    }
+
+    if (document.readyState === 'loading') {
+        document.addEventListener('DOMContentLoaded', setupVideoListener);
+    } else {
+        setupVideoListener();
+    }
+
+    const observer = new MutationObserver(function(mutations) {
+        setupVideoListener();
+    });
+    observer.observe(document.body, { childList: true, subtree: true });
+})();
+</script>
+"""
+
+
+with gr.Blocks(title="TextCut - Edit Videos by Editing Transcripts") as demo:
+    gr.Markdown("# TextCut")
+    gr.Markdown("Edit videos by simply editing their transcript. Upload a video, transcribe it, then delete lines to cut those parts from the video.")
+    gr.HTML(JS_CODE)
+
+    original_segments = gr.State([])
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.Markdown("### Transcript")
+            transcript_box = gr.Textbox(
+                label="Transcript (delete lines to cut those parts)",
+                lines=15,
+                interactive=True,
+                placeholder="Transcript will appear here after transcription..."
+            )
+
+            current_time = gr.Number(
+                label="Current Video Time (seconds)",
+                value=0,
+                visible=True,
+                elem_id="current-time-input"
+            )
+
+            highlight_btn = gr.Button("Update Highlight", size="sm")
+
+        with gr.Column(scale=1):
+            gr.Markdown("### Video")
+            video_input = gr.Video(
+                label="Upload Video",
+                sources=["upload"],
+                interactive=True
+            )
+
+            with gr.Row():
+                transcribe_btn = gr.Button("Transcribe", variant="primary")
+                cut_btn = gr.Button("Apply Cuts", variant="secondary")
+
+            status_text = gr.Textbox(label="Status", interactive=False, lines=2)
+
+    gr.Markdown("### Edited Video Output")
+    video_output = gr.Video(label="Edited Video")
+
+    video_input.change(
+        fn=process_upload,
+        inputs=[video_input],
+        outputs=[video_input, transcript_box, original_segments, status_text]
+    )
+
+    transcribe_btn.click(
+        fn=run_transcription,
+        inputs=[video_input],
+        outputs=[transcript_box, original_segments, status_text]
+    )
+
+    highlight_btn.click(
+        fn=update_highlight,
+        inputs=[video_input, original_segments, current_time],
+        outputs=[transcript_box]
+    )
+
+    current_time.change(
+        fn=update_highlight,
+        inputs=[video_input, original_segments, current_time],
+        outputs=[transcript_box]
+    )
+
+    cut_btn.click(
+        fn=apply_cuts,
+        inputs=[video_input, transcript_box, original_segments],
+        outputs=[video_output, status_text]
+    )
+
+
+if __name__ == "__main__":
+    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
+gradio>=6.0.0
+torch>=2.0.0
+transformers>=4.40.0
+soundfile
+numpy
+spaces
+vibevoice @ git+https://github.com/microsoft/VibeVoice.git
+