Spaces:

FunAudioLLM
/

FunClip

Runtime error

Zhifu Gao

feat: initial FunClip demo - AI video clipping with FunASR

a9f639a about 1 month ago

6.79 kB

	import os
	import json
	import tempfile
	import subprocess
	import gradio as gr
	import numpy as np
	import torch

	from funasr import AutoModel

	# Identifier of the speech-recognition model handed to FunASR's AutoModel.
	_ASR_MODEL_ID = "iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"

	# The recogniser is created once, at import time, and shared by every request.
	# It is pinned to the CPU.
	# NOTE(review): both `hub` and `model_hub` are passed with the same value;
	# confirm which of the two the installed FunASR version actually reads.
	model = AutoModel(model=_ASR_MODEL_ID, hub="hf", model_hub="hf", device="cpu")


	def extract_audio(video_path):
	    """Extract the audio track of a video into a temporary WAV file.

	    FFmpeg is asked for 16 kHz, mono, 16-bit PCM output.

	    Args:
	        video_path: Path of the input video file.

	    Returns:
	        Path of the temporary ``.wav`` file. When FFmpeg exits with an error
	        the file is removed again, so callers can detect failure by checking
	        that the returned path does not exist. The caller owns the file and
	        must delete it after use.

	    Raises:
	        OSError: If the ``ffmpeg`` executable cannot be started.
	    """
	    # mkstemp creates the file atomically; mktemp only reserves a name and is
	    # open to a race between choosing the name and FFmpeg creating the file.
	    fd, audio_path = tempfile.mkstemp(suffix=".wav")
	    os.close(fd)

	    cmd = [
	        "ffmpeg", "-i", video_path, "-vn", "-acodec", "pcm_s16le",
	        "-ar", "16000", "-ac", "1", "-y", audio_path,
	    ]
	    try:
	        result = subprocess.run(cmd, capture_output=True)
	    except OSError:
	        # FFmpeg never ran: do not leave the empty placeholder behind.
	        if os.path.exists(audio_path):
	            os.unlink(audio_path)
	        raise

	    if result.returncode != 0 and os.path.exists(audio_path):
	        # Drop the placeholder / partial output so a failed extraction is
	        # never mistaken for a usable audio file.
	        os.unlink(audio_path)
	    return audio_path


	def transcribe_video(video_path, progress=gr.Progress()):
	    """Transcribe the speech of a video into timestamped sentences.

	    Args:
	        video_path: Path of the uploaded video, or None when nothing was
	            uploaded.
	        progress: Gradio progress tracker used to report the current stage.

	    Returns:
	        A 3-tuple ``(display_text, sentences, sentences_json)``:

	        * ``display_text`` - one ``[start - end] text`` line per sentence, or
	          an error message / the plain transcript when no sentence timing is
	          available.
	        * ``sentences`` - list of ``{"start", "end", "text"}`` dicts with the
	          times divided by 1000 (presumably milliseconds converted to
	          seconds); empty on failure.
	        * ``sentences_json`` - the same list serialized as JSON, or None when
	          there are no sentences.
	    """
	    if video_path is None:
	        return "Please upload a video file.", [], None

	    progress(0.1, desc="Extracting audio...")
	    audio_path = extract_audio(video_path)

	    if not os.path.exists(audio_path):
	        return "Failed to extract audio from video. Make sure it contains an audio track.", [], None

	    progress(0.3, desc="Transcribing speech...")
	    try:
	        # The code below relies on per-sentence timing ("sentence_info").
	        # NOTE(review): FunASR appears to attach that field only when
	        # sentence_timestamp=True is requested (and may additionally need
	        # VAD/punctuation models configured) - confirm against the installed
	        # FunASR version.
	        res = model.generate(
	            input=audio_path,
	            batch_size_s=300,
	            sentence_timestamp=True,
	        )
	    except Exception as e:
	        return f"Transcription error: {e}", [], None
	    finally:
	        # The temporary WAV is no longer needed whether or not decoding worked.
	        if os.path.exists(audio_path):
	            os.unlink(audio_path)

	    if not res or not res[0].get("sentence_info"):
	        # No sentence timing: fall back to the plain transcript, if any.
	        text = res[0].get("text", "") if res else ""
	        return text, [], None

	    progress(0.8, desc="Processing timestamps...")
	    sentences = [
	        {
	            "start": sent["start"] / 1000.0,
	            "end": sent["end"] / 1000.0,
	            "text": sent["text"],
	        }
	        for sent in res[0]["sentence_info"]
	    ]

	    full_text = "\n".join(
	        f"[{s['start']:.1f}s - {s['end']:.1f}s] {s['text']}" for s in sentences
	    )

	    progress(1.0, desc="Done!")
	    return full_text, sentences, json.dumps(sentences, ensure_ascii=False)


	def _selection_index(item):
	    """Return the sentence index encoded in one selector value, or None.

	    Accepts plain integers, numeric strings, and the ``"<index>: ..."``
	    labels that build_selector generates for the checkbox group.
	    """
	    if not isinstance(item, str):
	        try:
	            return int(item)
	        except (TypeError, ValueError):
	            return None
	    try:
	        return int(item.partition(":")[0].strip())
	    except ValueError:
	        return None


	def _merge_close_spans(spans, max_gap=0.5):
	    """Merge (start, end) spans that overlap or lie closer than max_gap seconds."""
	    merged = []
	    for start, end in sorted(spans):
	        if merged and start - merged[-1][1] < max_gap:
	            # max() keeps a span from shrinking when a later one ends earlier.
	            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
	        else:
	            merged.append((start, end))
	    return merged


	def _build_filter_graph(spans):
	    """Return an FFmpeg filter graph that trims every span and joins them."""
	    chains = []
	    concat_inputs = []
	    for i, (start, end) in enumerate(spans):
	        chains.append(
	            f"[0:v]trim=start={start:.3f}:end={end:.3f},setpts=PTS-STARTPTS[v{i}]"
	        )
	        chains.append(
	            f"[0:a]atrim=start={start:.3f}:end={end:.3f},asetpts=PTS-STARTPTS[a{i}]"
	        )
	        # concat expects its inputs segment by segment (video, then audio),
	        # not all video pads followed by all audio pads.
	        concat_inputs.append(f"[v{i}][a{i}]")
	    pads = "".join(concat_inputs)
	    chains.append(f"{pads}concat=n={len(spans)}:v=1:a=1[outv][outa]")
	    return ";".join(chains)


	def clip_video(video_path, sentences_json, selected_indices):
	    """Cut the selected sentences out of a video and join them into one clip.

	    Args:
	        video_path: Path of the source video.
	        sentences_json: JSON list of ``{"start", "end", "text"}`` dicts as
	            produced by transcribe_video (times in seconds).
	        selected_indices: Selected sentences, given as integers, numeric
	            strings, or ``"<index>: ..."`` checkbox labels. Values that do
	            not encode an in-range index are ignored.

	    Returns:
	        ``(output_path, message)``. ``output_path`` is the path of the
	        generated ``.mp4`` or None on failure; ``message`` describes the
	        result or the error.

	    Raises:
	        OSError: If the ``ffmpeg`` executable cannot be started.
	    """
	    if not video_path or not sentences_json or not selected_indices:
	        return None, "Please transcribe a video first, then select segments to clip."

	    sentences = json.loads(sentences_json)

	    # A set removes duplicate selections before the spans are collected.
	    chosen = {_selection_index(item) for item in selected_indices}
	    spans = [
	        (sentences[idx]["start"], sentences[idx]["end"])
	        for idx in sorted(i for i in chosen if i is not None)
	        if 0 <= idx < len(sentences)
	    ]
	    if not spans:
	        return None, "Invalid selection."

	    merged = _merge_close_spans(spans)

	    # mkstemp creates the file atomically; FFmpeg overwrites it because of -y.
	    fd, output_path = tempfile.mkstemp(suffix=".mp4")
	    os.close(fd)

	    cmd = [
	        "ffmpeg", "-i", video_path, "-filter_complex", _build_filter_graph(merged),
	        "-map", "[outv]", "-map", "[outa]", "-y", output_path,
	    ]

	    try:
	        result = subprocess.run(cmd, capture_output=True, text=True)
	    except OSError:
	        if os.path.exists(output_path):
	            os.unlink(output_path)
	        raise

	    if result.returncode != 0:
	        # Do not leave an empty or partial output file behind.
	        if os.path.exists(output_path):
	            os.unlink(output_path)
	        return None, f"FFmpeg error: {result.stderr[-500:]}"

	    total_duration = sum(end - start for start, end in merged)
	    return output_path, f"Clipped {len(merged)} segment(s), total {total_duration:.1f}s"


	# HTML banner (title, tagline and project links) that launch() renders at
	# the top of the page through gr.HTML.
	description_html = """
	<div style="text-align: center; max-width: 850px; margin: 0 auto;">
	<h1 style="font-size: 2.2em; margin-bottom: 0.1em;">✂️ FunClip</h1>
	<p style="font-size: 1.3em; color: #444;">AI Video Clipping — Speak to Clip</p>
	<p style="font-size: 1em; color: #666;">
	Upload a video → Auto-transcribe with timestamps → Select text segments → Export precise clips
	</p>
	<p style="font-size: 0.9em; margin-top: 0.8em;">
	<a href="https://github.com/modelscope/FunClip" target="_blank">⭐ GitHub (5.6k+ stars)</a> ·
	<a href="https://github.com/modelscope/FunASR" target="_blank">🛠️ FunASR</a> ·
	<a href="https://github.com/FunAudioLLM/Fun-ASR" target="_blank">🚀 Fun-ASR</a>
	</p>
	</div>
	"""

	# Markdown usage instructions that launch() renders through gr.Markdown.
	how_it_works = """
	### How It Works
	1. Upload a video (any format with audio)
	2. Transcribe — FunASR extracts speech with precise timestamps
	3. Select the sentences you want to keep (by index)
	4. Clip — FFmpeg cuts and concatenates the selected segments

	For the full experience with LLM-assisted smart clipping, install [FunClip](https://github.com/modelscope/FunClip) locally.
	"""


	def build_selector(sentences_json):
	    """Rebuild the segment checkbox choices from the serialized sentences.

	    Args:
	        sentences_json: JSON list of ``{"start", "end", "text"}`` dicts, or a
	            falsy value when no transcript is available.

	    Returns:
	        A ``gr.update`` that replaces the choices with one
	        ``"<index>: [<start>s-<end>s] <text>"`` label per sentence and clears
	        the current selection.
	    """
	    labels = []
	    if sentences_json:
	        for position, entry in enumerate(json.loads(sentences_json)):
	            span = f"[{entry['start']:.1f}s-{entry['end']:.1f}s]"
	            labels.append(f"{position}: {span} {entry['text']}")
	    return gr.update(choices=labels, value=[])


	def launch():
	"""Build the Gradio Blocks interface, wire its callbacks and start the app."""
	# NOTE(review): the nested indentation of this function was lost in this
	# copy of the file, so which components sit inside each gr.Row / gr.Tab
	# cannot be read off here - restore it from the original before editing.
	with gr.Blocks(theme=gr.themes.Soft(), title="FunClip - AI Video Clipping") as demo:
	gr.HTML(description_html)

	# Holds the JSON-serialized sentence list shared by the two callbacks.
	sentences_state = gr.State("")

	with gr.Tab("1. Transcribe"):
	with gr.Row():
	video_input = gr.Video(label="Upload Video")
	transcribe_btn = gr.Button("🎙️ Transcribe Speech", variant="primary", size="lg")
	transcript_output = gr.Textbox(label="Transcription with Timestamps", lines=12, show_copy_button=True)

	with gr.Tab("2. Clip"):
	# Starts empty; build_selector fills in the choices after transcription.
	segment_selector = gr.CheckboxGroup(
	label="Select segments to clip",
	choices=[],
	)
	clip_btn = gr.Button("✂️ Generate Clip", variant="primary", size="lg")
	with gr.Row():
	clip_output = gr.Video(label="Output Clip")
	clip_info = gr.Textbox(label="Info", lines=2)

	# transcribe_video returns three values; the second one (the sentence list)
	# is sent to a throwaway gr.State() that nothing else reads.
	transcribe_btn.click(
	transcribe_video,
	inputs=[video_input],
	outputs=[transcript_output, gr.State(), sentences_state],
	).then(
	# Chained step: refresh the checkbox choices from the new state.
	build_selector,
	inputs=[sentences_state],
	outputs=[segment_selector],
	)

	clip_btn.click(
	clip_video,
	inputs=[video_input, sentences_state, segment_selector],
	outputs=[clip_output, clip_info],
	)

	gr.Markdown(how_it_works)

	demo.launch()


	# Start the demo only when this file is executed as a script.
	if __name__ == "__main__":
	launch()