# Hugging Face Space app (status badge residue removed) — Flask photo+audio → subtitled video generator.
from flask import Flask, render_template_string, request, jsonify, send_from_directory, abort
import os
import uuid
import subprocess
from PIL import Image, ImageDraw, ImageFont
from werkzeug.utils import secure_filename
from faster_whisper import WhisperModel

app = Flask(__name__)

# Anchor all data directories to this file's location so the app works
# regardless of the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
UPLOAD_FOLDER = os.path.join(BASE_DIR, "uploads")  # raw uploaded image/audio files
OUTPUT_FOLDER = os.path.join(BASE_DIR, "static", "videos")  # rendered MP4 outputs
SUBTITLE_FOLDER = os.path.join(BASE_DIR, "subtitles")  # per-job subtitle PNG overlays
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
os.makedirs(SUBTITLE_FOLDER, exist_ok=True)

# Fast CPU model
# "tiny" + int8 quantization keeps transcription fast on CPU-only hosts
# at the cost of accuracy.
model = WhisperModel(
    "tiny",
    device="cpu",
    compute_type="int8"
)

# Output frame size: 9:16 portrait video (e.g. Shorts/Reels format).
FRAME_W = 1080
FRAME_H = 1920
# Single-page front-end served at "/": upload form, progress indicator,
# inline <video> preview and a download link. The script POSTs FormData to
# /generate and expects JSON with {video_url} or {error, details}.
HTML = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Photo + Audio To Video</title>
<style>
*{
margin:0;
padding:0;
box-sizing:border-box;
font-family:Arial;
}
body{
background:#0f0f0f;
color:white;
min-height:100vh;
display:flex;
justify-content:center;
align-items:center;
padding:20px;
}
.container{
width:100%;
max-width:500px;
background:#1b1b1b;
border-radius:20px;
padding:25px;
box-shadow:0 0 20px rgba(0,0,0,0.4);
}
h1{
text-align:center;
margin-bottom:25px;
font-size:28px;
}
.upload-box{
border:2px dashed #444;
padding:20px;
border-radius:15px;
margin-bottom:20px;
}
label{
display:block;
margin-bottom:8px;
color:#ccc;
}
input{
width:100%;
padding:12px;
background:#2a2a2a;
border:none;
border-radius:10px;
color:white;
margin-bottom:15px;
}
button{
width:100%;
padding:15px;
border:none;
border-radius:12px;
background:#00aaff;
color:white;
font-size:18px;
cursor:pointer;
transition:0.3s;
}
button:hover{
opacity:0.9;
}
#loading{
display:none;
text-align:center;
margin-top:20px;
}
video{
width:100%;
margin-top:20px;
border-radius:15px;
display:none;
aspect-ratio:9/16;
background:#000;
object-fit:cover;
}
.download-btn{
display:none;
margin-top:15px;
text-align:center;
}
.download-btn a{
display:inline-block;
background:#22c55e;
color:white;
text-decoration:none;
padding:12px 20px;
border-radius:10px;
}
.preview{
margin-top:15px;
width:100%;
border-radius:15px;
display:none;
}
</style>
</head>
<body>
<div class="container">
<h1>Photo + Audio → Video</h1>
<form id="form">
<div class="upload-box">
<label>Select Photo</label>
<input type="file" id="image" name="image" accept="image/*" required>
<img id="preview" class="preview">
<label>Select Audio (mp3/wav)</label>
<input type="file" name="audio" accept="audio/*" required>
</div>
<button type="submit">Generate Video</button>
</form>
<div id="loading">Generating Video...</div>
<video id="video" controls playsinline></video>
<div class="download-btn" id="downloadDiv">
<a id="downloadBtn" download>Download Video</a>
</div>
</div>
<script>
const form = document.getElementById("form");
const loading = document.getElementById("loading");
const video = document.getElementById("video");
const downloadBtn = document.getElementById("downloadBtn");
const downloadDiv = document.getElementById("downloadDiv");
const preview = document.getElementById("preview");
document.getElementById("image").addEventListener("change", function(e){
const file = e.target.files[0];
if(file){
preview.src = URL.createObjectURL(file);
preview.style.display = "block";
}
});
form.addEventListener("submit", async (e)=>{
e.preventDefault();
loading.style.display = "block";
video.style.display = "none";
downloadDiv.style.display = "none";
const formData = new FormData(form);
try{
const response = await fetch("/generate", {
method:"POST",
body:formData
});
const data = await response.json();
loading.style.display = "none";
if(data.video_url){
video.src = data.video_url + "?t=" + new Date().getTime();
video.style.display = "block";
downloadBtn.href = data.video_url;
downloadDiv.style.display = "block";
}else{
alert(data.error || "Failed");
console.log(data.details || "");
}
}catch(err){
loading.style.display = "none";
alert("Server Error");
console.error(err);
}
});
</script>
</body>
</html>
"""
def find_font_path():
    """Return the first existing TTF font from known Linux font locations.

    Returns None when none of the candidate paths exist (the caller then
    falls back to PIL's built-in bitmap font).
    """
    known_fonts = (
        "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        "/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf",
        "/usr/share/fonts/truetype/liberation2/LiberationSans-Regular.ttf",
        "/usr/share/fonts/truetype/freefont/FreeSansBold.ttf",
        "/usr/share/fonts/truetype/freefont/FreeSans.ttf",
    )
    return next((candidate for candidate in known_fonts if os.path.exists(candidate)), None)


# Resolved once at import time; None means no system TTF font is available.
FONT_PATH = find_font_path()
def ass_time(seconds: float) -> str:
    """Format a time offset as an ASS/SSA timestamp: ``H:MM:SS.cc``.

    Negative inputs are clamped to zero. The value is rounded to whole
    centiseconds *before* being split into fields, so that e.g. 3599.999
    yields "1:00:00.00" rather than the invalid "0:59:60.00" that naive
    modulo-then-format arithmetic produces.
    """
    centis = max(0, round(seconds * 100))
    h, rem = divmod(centis, 360_000)  # 360_000 centiseconds per hour
    m, rem = divmod(rem, 6_000)       # 6_000 centiseconds per minute
    return f"{h}:{m:02d}:{rem / 100:05.2f}"
def measure_text_width(font, text: str) -> int:
    """Pixel width of *text* rendered with *font*, from its bounding box."""
    left, _, right, _ = font.getbbox(text)
    return right - left
def measure_text_height(font, text: str) -> int:
    """Pixel height of *text* rendered with *font*, from its bounding box."""
    _, top, _, bottom = font.getbbox(text)
    return bottom - top
def clean_text(text: str) -> str:
    """Collapse all whitespace runs to single spaces and trim both ends."""
    # str.split() with no argument already discards leading/trailing whitespace.
    return " ".join(text.split())
def wrap_text_by_pixels(text: str, font, max_width_px: int, max_lines: int = 4) -> list[str]:
    """Greedily word-wrap *text* so each line fits within *max_width_px*.

    Words wider than the limit are broken into character chunks first.
    If the wrap produces more than *max_lines* lines, the overflow is
    merged into the final line (which may then exceed the pixel limit).
    Returns [] for empty/whitespace-only input.
    """
    normalized = clean_text(text)
    if not normalized:
        return []

    def explode(word: str) -> list[str]:
        # Split one over-wide word into chunks that each fit the width.
        if measure_text_width(font, word) <= max_width_px:
            return [word]
        chunks: list[str] = []
        buf = ""
        for ch in word:
            if measure_text_width(font, buf + ch) <= max_width_px:
                buf += ch
            else:
                if buf:
                    chunks.append(buf)
                buf = ch
        if buf:
            chunks.append(buf)
        return chunks

    pieces = [chunk for word in normalized.split(" ") for chunk in explode(word)]

    wrapped: list[str] = []
    line = ""
    for piece in pieces:
        candidate = f"{line} {piece}" if line else piece
        if measure_text_width(font, candidate) <= max_width_px:
            line = candidate
            continue
        if line:
            wrapped.append(line)
        line = piece
    if line:
        wrapped.append(line)

    # Enforce the line cap by folding any overflow into the last kept line.
    if len(wrapped) > max_lines:
        wrapped = wrapped[:max_lines - 1] + [" ".join(wrapped[max_lines - 1:])]
    return wrapped
# Shared caption-box geometry (pixels) used by every layout variant.
_SUB_MAX_BOX_W = 940
_SUB_PAD_X = 36
_SUB_PAD_Y = 22
_SUB_LINE_SPACING = 10
_SUB_BOTTOM_MARGIN = 230
_SUB_RADIUS = 20


def _box_size(font, lines, max_box_width, padding_x, padding_y, line_spacing):
    """Return (box_w, box_h) for *lines* rendered with *font* in a padded box.

    Empty *lines* falls back to zero-width/zero-height measurements so the
    caller still gets a (degenerate) box, matching historical behavior.
    """
    widths = [measure_text_width(font, line) for line in lines] or [0]
    heights = [measure_text_height(font, line) for line in lines] or [0]
    box_w = min(max_box_width, max(widths) + padding_x * 2)
    box_h = sum(heights) + line_spacing * (len(lines) - 1) + padding_y * 2
    return box_w, box_h


def _layout_dict(font, font_size, lines, box_w, box_h):
    """Assemble the layout dict consumed by render_subtitle_frame()."""
    return {
        "font": font,
        "font_size": font_size,
        "lines": lines,
        "box_w": box_w,
        "box_h": box_h,
        "padding_x": _SUB_PAD_X,
        "padding_y": _SUB_PAD_Y,
        "line_spacing": _SUB_LINE_SPACING,
        "bottom_margin": _SUB_BOTTOM_MARGIN,
        "radius": _SUB_RADIUS,
    }


def pick_layout(text: str):
    """Choose font, wrapped lines and box geometry for one subtitle frame.

    Tries decreasing TrueType font sizes until the caption box height fits
    within 520 px; if none fit (or the text wraps to nothing), falls back
    to the smallest size unconditionally. When no system TTF is available
    (FONT_PATH is None), uses PIL's built-in bitmap font with a narrower
    wrap and fewer lines.

    Returns a dict with keys: font, font_size, lines, box_w, box_h,
    padding_x, padding_y, line_spacing, bottom_margin, radius.
    """
    if FONT_PATH and os.path.exists(FONT_PATH):
        usable_w = _SUB_MAX_BOX_W - _SUB_PAD_X * 2
        for font_size in (42, 40, 38, 36, 34, 32):
            font = ImageFont.truetype(FONT_PATH, font_size)
            lines = wrap_text_by_pixels(
                text=text,
                font=font,
                max_width_px=usable_w,
                max_lines=5
            )
            if not lines:
                continue
            box_w, box_h = _box_size(
                font, lines, _SUB_MAX_BOX_W, _SUB_PAD_X, _SUB_PAD_Y, _SUB_LINE_SPACING
            )
            if box_h <= 520:  # keep the caption comfortably inside the 1920px frame
                return _layout_dict(font, font_size, lines, box_w, box_h)

        # Nothing fit (or the text was empty): use the smallest size anyway.
        font = ImageFont.truetype(FONT_PATH, 32)
        lines = wrap_text_by_pixels(
            text=text,
            font=font,
            max_width_px=usable_w,
            max_lines=5
        )
        box_w, box_h = _box_size(
            font, lines, _SUB_MAX_BOX_W, _SUB_PAD_X, _SUB_PAD_Y, _SUB_LINE_SPACING
        )
        return _layout_dict(font, 32, lines, box_w, box_h)

    # No system TTF found: bitmap-font fallback (narrower wrap, max 4 lines).
    font = ImageFont.load_default()
    lines = wrap_text_by_pixels(text=text, font=font, max_width_px=900, max_lines=4)
    box_w, box_h = _box_size(
        font, lines, _SUB_MAX_BOX_W, _SUB_PAD_X, _SUB_PAD_Y, _SUB_LINE_SPACING
    )
    return _layout_dict(font, 16, lines, box_w, box_h)
def render_subtitle_frame(text: str, image_path: str):
    """Render one caption onto a transparent 1080x1920 frame and save it.

    Draws a solid black rounded rectangle near the bottom of the frame
    with the wrapped caption text centred inside it, then writes the RGBA
    image to *image_path* for later use as an ffmpeg overlay input.
    """
    layout = pick_layout(text)
    font = layout["font"]

    frame = Image.new("RGBA", (FRAME_W, FRAME_H), (0, 0, 0, 0))
    painter = ImageDraw.Draw(frame)

    # Box is horizontally centred; its bottom sits bottom_margin px above
    # the bottom edge of the frame.
    box_left = int((FRAME_W - layout["box_w"]) / 2)
    box_top = int(FRAME_H - layout["bottom_margin"] - layout["box_h"])
    # Solid black background box
    painter.rounded_rectangle(
        [box_left, box_top, box_left + layout["box_w"], box_top + layout["box_h"]],
        radius=layout["radius"],
        fill=(0, 0, 0, 255)
    )

    # Each line is centred independently across the full frame width.
    cursor_y = box_top + layout["padding_y"]
    for line in layout["lines"]:
        line_w = measure_text_width(font, line)
        line_h = measure_text_height(font, line)
        painter.text(
            (int((FRAME_W - line_w) / 2), cursor_y),
            line,
            font=font,
            fill=(255, 255, 255, 255)
        )
        cursor_y += line_h + layout["line_spacing"]

    frame.save(image_path)
def build_subtitle_overlays(transcript, job_dir):
    """Render one subtitle PNG per transcript segment; return overlay specs.

    Each spec is {"path", "start", "end"} describing one timed ffmpeg
    overlay input. Segments whose text is empty after stripping are
    skipped (their index is still consumed, so filenames may have gaps).
    """
    specs = []
    for index, segment in enumerate(transcript):
        caption = segment["text"].strip()
        if not caption:
            continue
        frame_path = os.path.join(job_dir, f"sub_{index:03d}.png")
        render_subtitle_frame(caption, frame_path)
        specs.append({
            "path": frame_path,
            "start": float(segment["start"]),
            "end": float(segment["end"]),
        })
    return specs
# NOTE(review): the @app.route decorators appear to have been lost when this
# file was extracted; restored from the front-end, which is served here and
# POSTs to /generate.
@app.route("/")
def home():
    """Serve the single-page upload UI."""
    return render_template_string(HTML)
# NOTE(review): route decorator restored — the front-end loads /video/<uuid>.mp4.
@app.route("/video/<filename>")
def serve_video(filename):
    """Stream a rendered MP4 from OUTPUT_FOLDER.

    Responds 404 when the file does not exist. `conditional=True` enables
    Range/conditional responses so the <video> element can seek, and
    `Cache-Control: no-store` ensures a regenerated video with the same
    name is always re-fetched. send_from_directory itself rejects path
    traversal in *filename*.
    """
    file_path = os.path.join(OUTPUT_FOLDER, filename)
    if not os.path.exists(file_path):
        abort(404)
    response = send_from_directory(
        OUTPUT_FOLDER,
        filename,
        as_attachment=False,
        conditional=True
    )
    response.headers["Cache-Control"] = "no-store"
    return response
def _transcribe(audio_path):
    """Transcribe *audio_path* with faster-whisper.

    Returns (transcript, info): transcript is a list of
    {"start", "end", "text"} dicts with times rounded to 2 decimals and
    empty segments dropped; info is the model's TranscriptionInfo.
    """
    segments_iter, info = model.transcribe(
        audio_path,
        beam_size=1,
        vad_filter=True
    )
    transcript = []
    for segment in segments_iter:
        text = segment.text.strip()
        if not text:
            continue
        transcript.append({
            "start": round(segment.start, 2),
            "end": round(segment.end, 2),
            "text": text,
        })
    return transcript, info


def _build_ffmpeg_command(image_path, audio_path, overlay_specs, output_path):
    """Build the ffmpeg argv for a subtitled 1080x1920 H.264/AAC MP4.

    Input order: 0 = looping photo, 1..n = subtitle PNG overlays,
    n+1 = audio. Each overlay is enabled only during its segment's
    [start, end] window; -shortest trims the looping image to the
    audio's duration.
    """
    cmd = ["ffmpeg", "-y", "-loop", "1", "-framerate", "1", "-i", image_path]
    for spec in overlay_specs:
        cmd.extend(["-loop", "1", "-framerate", "1", "-i", spec["path"]])
    cmd.extend(["-i", audio_path])

    # Base video: cover-scale the photo to exactly 1080x1920 (crop overflow).
    scale = "[0:v]scale=1080:1920:force_original_aspect_ratio=increase,crop=1080:1920"
    if overlay_specs:
        parts = [scale + "[base]"]
        last_label = "[base]"
        for idx, spec in enumerate(overlay_specs):
            next_label = f"[v{idx}]"
            parts.append(
                f"{last_label}[{idx + 1}:v]overlay=0:0:"
                f"enable='between(t,{spec['start']:.2f},{spec['end']:.2f})'{next_label}"
            )
            last_label = next_label
        filter_complex = ";".join(parts)
        final_video_label = last_label
    else:
        filter_complex = scale + "[vout]"
        final_video_label = "[vout]"

    cmd.extend([
        "-filter_complex", filter_complex,
        "-map", final_video_label,
        "-map", f"{len(overlay_specs) + 1}:a:0",
        "-c:v", "libx264",
        "-preset", "ultrafast",
        "-crf", "20",
        "-pix_fmt", "yuv420p",
        "-r", "24",
        "-c:a", "aac",
        "-b:a", "128k",
        "-movflags", "+faststart",
        "-shortest",
        output_path,
    ])
    return cmd


# NOTE(review): route decorator restored — the front-end POSTs FormData here.
@app.route("/generate", methods=["POST"])
def generate():
    """Handle an image+audio upload: transcribe, burn subtitles, encode video.

    Returns JSON {video_url, transcript, full_text, language} on success,
    or {error, details} with HTTP 400 (bad input) / 500 (processing failure).
    """
    if "image" not in request.files or "audio" not in request.files:
        return jsonify({"error": "Missing files"}), 400
    image = request.files["image"]
    audio = request.files["audio"]
    if not image.filename or not audio.filename:
        return jsonify({"error": "Please upload both image and audio"}), 400

    # One UUID namespaces every artifact of this request: uploads, subtitle
    # PNGs and the output video.
    uid = str(uuid.uuid4())
    image_path = os.path.join(UPLOAD_FOLDER, f"{uid}_{secure_filename(image.filename)}")
    audio_path = os.path.join(UPLOAD_FOLDER, f"{uid}_{secure_filename(audio.filename)}")
    output_filename = f"{uid}.mp4"
    output_path = os.path.join(OUTPUT_FOLDER, output_filename)
    job_subtitle_dir = os.path.join(SUBTITLE_FOLDER, uid)
    os.makedirs(job_subtitle_dir, exist_ok=True)
    image.save(image_path)
    audio.save(audio_path)
    # TODO(review): uploaded files and subtitle PNGs are never deleted —
    # consider cleaning them up after a successful render.
    try:
        transcript, info = _transcribe(audio_path)
        overlay_specs = build_subtitle_overlays(transcript, job_subtitle_dir)
        cmd = _build_ffmpeg_command(image_path, audio_path, overlay_specs, output_path)
        subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True
        )
        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
            return jsonify({
                "error": "Video file not created",
                "details": "FFmpeg ran but output file is missing or empty."
            }), 500
        return jsonify({
            "video_url": f"/video/{output_filename}",
            "transcript": transcript,
            "full_text": " ".join(seg["text"] for seg in transcript).strip(),
            "language": getattr(info, "language", None)
        })
    except subprocess.CalledProcessError as e:
        return jsonify({
            "error": "FFmpeg failed",
            "details": e.stderr.decode("utf-8", errors="ignore")
        }), 500
    except Exception as e:
        return jsonify({
            "error": "Processing failed",
            "details": str(e)
        }), 500
if __name__ == "__main__":
    # NOTE(review): debug=True enables the Werkzeug debugger/reloader —
    # acceptable for a demo Space, but disable it for production use.
    app.run(host="0.0.0.0", port=7860, debug=True)