# test-ffmpeg / app.py
# ulduldp's picture
# Update app.py
# 45fc8b1 verified
import os
import shutil
import subprocess
import uuid

from faster_whisper import WhisperModel
from flask import Flask, render_template_string, request, jsonify, send_from_directory, abort
from PIL import Image, ImageDraw, ImageFont
from werkzeug.utils import secure_filename
# Flask application object; the routes below are registered on it.
app = Flask(__name__)

# Working directories, all resolved relative to this file.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
UPLOAD_FOLDER = os.path.join(BASE_DIR, "uploads")
OUTPUT_FOLDER = os.path.join(BASE_DIR, "static", "videos")
SUBTITLE_FOLDER = os.path.join(BASE_DIR, "subtitles")

# Make sure every working directory exists before the first request.
for _folder in (UPLOAD_FOLDER, OUTPUT_FOLDER, SUBTITLE_FOLDER):
    os.makedirs(_folder, exist_ok=True)

# Fast CPU model
model = WhisperModel("tiny", device="cpu", compute_type="int8")

# Size in pixels of the rendered video frame and of each subtitle overlay.
FRAME_W, FRAME_H = 1080, 1920
# Single-page UI returned by the "/" route: an upload form for one photo and
# one audio file, plus inline JavaScript that POSTs the form to /generate and
# then shows the returned video together with a download link.
HTML = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Photo + Audio To Video</title>
<style>
*{
margin:0;
padding:0;
box-sizing:border-box;
font-family:Arial;
}
body{
background:#0f0f0f;
color:white;
min-height:100vh;
display:flex;
justify-content:center;
align-items:center;
padding:20px;
}
.container{
width:100%;
max-width:500px;
background:#1b1b1b;
border-radius:20px;
padding:25px;
box-shadow:0 0 20px rgba(0,0,0,0.4);
}
h1{
text-align:center;
margin-bottom:25px;
font-size:28px;
}
.upload-box{
border:2px dashed #444;
padding:20px;
border-radius:15px;
margin-bottom:20px;
}
label{
display:block;
margin-bottom:8px;
color:#ccc;
}
input{
width:100%;
padding:12px;
background:#2a2a2a;
border:none;
border-radius:10px;
color:white;
margin-bottom:15px;
}
button{
width:100%;
padding:15px;
border:none;
border-radius:12px;
background:#00aaff;
color:white;
font-size:18px;
cursor:pointer;
transition:0.3s;
}
button:hover{
opacity:0.9;
}
#loading{
display:none;
text-align:center;
margin-top:20px;
}
video{
width:100%;
margin-top:20px;
border-radius:15px;
display:none;
aspect-ratio:9/16;
background:#000;
object-fit:cover;
}
.download-btn{
display:none;
margin-top:15px;
text-align:center;
}
.download-btn a{
display:inline-block;
background:#22c55e;
color:white;
text-decoration:none;
padding:12px 20px;
border-radius:10px;
}
.preview{
margin-top:15px;
width:100%;
border-radius:15px;
display:none;
}
</style>
</head>
<body>
<div class="container">
<h1>Photo + Audio → Video</h1>
<form id="form">
<div class="upload-box">
<label>Select Photo</label>
<input type="file" id="image" name="image" accept="image/*" required>
<img id="preview" class="preview">
<label>Select Audio (mp3/wav)</label>
<input type="file" name="audio" accept="audio/*" required>
</div>
<button type="submit">Generate Video</button>
</form>
<div id="loading">Generating Video...</div>
<video id="video" controls playsinline></video>
<div class="download-btn" id="downloadDiv">
<a id="downloadBtn" download>Download Video</a>
</div>
</div>
<script>
const form = document.getElementById("form");
const loading = document.getElementById("loading");
const video = document.getElementById("video");
const downloadBtn = document.getElementById("downloadBtn");
const downloadDiv = document.getElementById("downloadDiv");
const preview = document.getElementById("preview");
document.getElementById("image").addEventListener("change", function(e){
const file = e.target.files[0];
if(file){
preview.src = URL.createObjectURL(file);
preview.style.display = "block";
}
});
form.addEventListener("submit", async (e)=>{
e.preventDefault();
loading.style.display = "block";
video.style.display = "none";
downloadDiv.style.display = "none";
const formData = new FormData(form);
try{
const response = await fetch("/generate", {
method:"POST",
body:formData
});
const data = await response.json();
loading.style.display = "none";
if(data.video_url){
video.src = data.video_url + "?t=" + new Date().getTime();
video.style.display = "block";
downloadBtn.href = data.video_url;
downloadDiv.style.display = "block";
}else{
alert(data.error || "Failed");
console.log(data.details || "");
}
}catch(err){
loading.style.display = "none";
alert("Server Error");
console.error(err);
}
});
</script>
</body>
</html>
"""
def find_font_path():
    """Return the first font file from a fixed candidate list that exists.

    The candidates are DejaVu, Liberation and FreeSans TrueType files under
    ``/usr/share/fonts/truetype``, bold variant before regular for each
    family. Returns ``None`` when none of them is present.
    """
    known_locations = (
        "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        "/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf",
        "/usr/share/fonts/truetype/liberation2/LiberationSans-Regular.ttf",
        "/usr/share/fonts/truetype/freefont/FreeSansBold.ttf",
        "/usr/share/fonts/truetype/freefont/FreeSans.ttf",
    )
    return next(
        (location for location in known_locations if os.path.exists(location)),
        None,
    )


# Resolved once at import time; None means no candidate font was found.
FONT_PATH = find_font_path()
def ass_time(seconds: float) -> str:
    """Format a duration in seconds as ``H:MM:SS.cc`` (centisecond precision).

    Negative inputs are clamped to zero. The value is rounded to whole
    centiseconds *before* it is split into hours/minutes/seconds, so a value
    such as 59.999 carries into the minutes field (``0:01:00.00``) instead of
    producing an out-of-range seconds field (``0:00:60.00``).

    Args:
        seconds: Duration in seconds.

    Returns:
        The formatted timestamp string.
    """
    total_cs = int(round(max(seconds, 0) * 100))
    hours, remainder = divmod(total_cs, 360_000)
    minutes, remainder = divmod(remainder, 6_000)
    secs, centis = divmod(remainder, 100)
    return f"{hours}:{minutes:02d}:{secs:02d}.{centis:02d}"
def measure_text_width(font, text: str) -> int:
    """Return the horizontal extent of ``text`` as reported by ``font.getbbox``."""
    left, _top, right, _bottom = font.getbbox(text)
    return right - left
def measure_text_height(font, text: str) -> int:
    """Return the vertical extent of ``text`` as reported by ``font.getbbox``."""
    _left, top, _right, bottom = font.getbbox(text)
    return bottom - top
def clean_text(text: str) -> str:
    """Collapse every run of whitespace to one space and trim both ends."""
    # str.split() with no separator already discards leading/trailing
    # whitespace, so no separate strip is required.
    words = text.split()
    return " ".join(words)
def wrap_text_by_pixels(text: str, font, max_width_px: int, max_lines: int = 4) -> list[str]:
    """Greedily wrap ``text`` into lines at most ``max_width_px`` pixels wide.

    Whitespace is normalised first. A word that is too wide on its own is
    broken into character chunks that each fit. If wrapping produces more than
    ``max_lines`` lines, the surplus lines are joined (space-separated) into
    the last kept line, so that final line may be wider than ``max_width_px``.

    Returns:
        The wrapped lines, or an empty list when ``text`` is blank.
    """
    normalized = clean_text(text)
    if not normalized:
        return []

    def fits(candidate: str) -> bool:
        return measure_text_width(font, candidate) <= max_width_px

    def break_word(word: str) -> list[str]:
        # Words that already fit pass through untouched.
        if fits(word):
            return [word]
        pieces = []
        piece = ""
        for char in word:
            if fits(piece + char):
                piece += char
                continue
            # Adding this character would overflow: close the current piece
            # (if there is one) and start the next piece with the character.
            if piece:
                pieces.append(piece)
            piece = char
        if piece:
            pieces.append(piece)
        return pieces

    units = [piece for word in normalized.split(" ") for piece in break_word(word)]

    wrapped = []
    line = ""
    for unit in units:
        extended = f"{line} {unit}" if line else unit
        if fits(extended):
            line = extended
            continue
        if line:
            wrapped.append(line)
        line = unit
    if line:
        wrapped.append(line)

    if len(wrapped) > max_lines:
        # Keep the text complete: fold everything past the limit into the
        # final allowed line rather than dropping it.
        head = wrapped[:max_lines - 1]
        tail = " ".join(wrapped[max_lines - 1:])
        wrapped = head + [tail]
    return wrapped
def _build_layout(text, font, font_size, wrap_width, max_lines,
                  max_box_width=940, padding_x=36, padding_y=22,
                  line_spacing=10, bottom_margin=230, radius=20):
    """Wrap ``text`` with ``font`` and compute the caption box geometry."""
    lines = wrap_text_by_pixels(
        text=text,
        font=font,
        max_width_px=wrap_width,
        max_lines=max_lines,
    )
    widest = max((measure_text_width(font, line) for line in lines), default=0)
    text_height = sum(measure_text_height(font, line) for line in lines)
    # There is one gap between each pair of adjacent lines; never negative,
    # so an empty caption still gets a box of exactly twice the padding.
    gap_count = max(len(lines) - 1, 0)
    return {
        "font": font,
        "font_size": font_size,
        "lines": lines,
        "box_w": min(max_box_width, widest + padding_x * 2),
        "box_h": text_height + line_spacing * gap_count + padding_y * 2,
        "padding_x": padding_x,
        "padding_y": padding_y,
        "line_spacing": line_spacing,
        "bottom_margin": bottom_margin,
        "radius": radius,
    }


def pick_layout(text: str):
    """
    Try a few font sizes and pick one that fits nicely.

    When ``FONT_PATH`` points at an existing font file, sizes 42 down to 32
    are tried in order and the first layout whose box is at most 520 px tall
    is returned; if none qualifies, the size-32 layout is returned anyway.
    Without a usable font file, Pillow's default font is used (reported as
    ``font_size`` 16, at most 4 lines).

    In both paths the text is wrapped to the box width minus the horizontal
    padding, so the reported ``padding_x`` is honoured whenever the wrapped
    lines fit.

    Returns:
        A dict with keys ``font``, ``font_size``, ``lines``, ``box_w``,
        ``box_h``, ``padding_x``, ``padding_y``, ``line_spacing``,
        ``bottom_margin`` and ``radius``.
    """
    max_box_width = 940
    padding_x = 36
    max_box_height = 520
    wrap_width = max_box_width - padding_x * 2

    if FONT_PATH and os.path.exists(FONT_PATH):
        layout = None
        for font_size in (42, 40, 38, 36, 34, 32):
            font = ImageFont.truetype(FONT_PATH, font_size)
            layout = _build_layout(text, font, font_size, wrap_width, max_lines=5)
            if layout["lines"] and layout["box_h"] <= max_box_height:
                return layout
        # Nothing fit (or the text was blank): fall back to the smallest size,
        # which is the layout computed on the last iteration.
        return layout

    font = ImageFont.load_default()
    return _build_layout(text, font, 16, wrap_width, max_lines=4)
def render_subtitle_frame(text: str, image_path: str):
    """Render ``text`` as a caption box on a transparent frame and save it.

    A ``FRAME_W`` x ``FRAME_H`` RGBA image is created, a solid black rounded
    box sized by ``pick_layout`` is drawn horizontally centred above the
    bottom margin, and each wrapped line is drawn in white, centred, inside
    the box.

    Args:
        text: Caption text to render.
        image_path: Destination path the image is written to.
    """
    layout = pick_layout(text)
    font = layout["font"]
    box_w = layout["box_w"]
    box_h = layout["box_h"]

    frame = Image.new("RGBA", (FRAME_W, FRAME_H), (0, 0, 0, 0))
    draw = ImageDraw.Draw(frame)

    x0 = int((FRAME_W - box_w) / 2)
    y0 = int(FRAME_H - layout["bottom_margin"] - box_h)

    # Solid black background box
    draw.rounded_rectangle(
        [x0, y0, x0 + box_w, y0 + box_h],
        radius=layout["radius"],
        fill=(0, 0, 0, 255),
    )

    y = y0 + layout["padding_y"]
    for line in layout["lines"]:
        # getbbox() reports the ink rectangle relative to the drawing anchor,
        # and its left/top are generally not zero. The layout was sized from
        # (right - left) x (bottom - top), so the anchor has to be shifted by
        # (-left, -top) for the ink to land on the rectangle that was
        # measured; otherwise the text sits too low and slightly off-centre.
        left, top, right, bottom = font.getbbox(line)
        line_w = right - left
        line_h = bottom - top
        tx = int((FRAME_W - line_w) / 2)
        draw.text(
            (tx - left, y - top),
            line,
            font=font,
            fill=(255, 255, 255, 255),
        )
        y += line_h + layout["line_spacing"]

    frame.save(image_path)
def build_subtitle_overlays(transcript, job_dir):
    """Render one subtitle PNG per non-blank transcript segment.

    Each segment is read through its ``"text"``, ``"start"`` and ``"end"``
    keys. Images are written to ``job_dir`` as ``sub_NNN.png``, where NNN is
    the segment's index in ``transcript``; blank segments are skipped.

    Returns:
        A list of dicts with ``path``, ``start`` and ``end`` (floats), in
        transcript order.
    """
    specs = []
    for position, segment in enumerate(transcript):
        caption = segment["text"].strip()
        if caption:
            target = os.path.join(job_dir, f"sub_{position:03d}.png")
            render_subtitle_frame(caption, target)
            specs.append({
                "path": target,
                "start": float(segment["start"]),
                "end": float(segment["end"]),
            })
    return specs
@app.route("/")
def home():
    """Serve the upload page rendered from the module-level ``HTML`` template."""
    page = render_template_string(HTML)
    return page
@app.route("/video/<path:filename>")
def serve_video(filename):
    """Stream a generated video from ``OUTPUT_FOLDER`` with caching disabled.

    Responds with 404 when the requested path does not exist.
    """
    if not os.path.exists(os.path.join(OUTPUT_FOLDER, filename)):
        abort(404)

    resp = send_from_directory(
        OUTPUT_FOLDER, filename, as_attachment=False, conditional=True
    )
    # Ask clients not to keep a cached copy of the video.
    resp.headers["Cache-Control"] = "no-store"
    return resp
def _remove_quietly(path):
    """Best-effort delete of a single file; a missing file is not an error."""
    try:
        os.remove(path)
    except OSError:
        # Cleanup must never mask the real outcome of the request.
        pass


def _transcribe(audio_path):
    """Transcribe ``audio_path`` and return ``(transcript, info)``.

    ``transcript`` is a list of ``{"start", "end", "text"}`` dicts (times
    rounded to two decimals) for every segment with non-blank text; ``info``
    is the second value returned by ``model.transcribe``.
    """
    segments_iter, info = model.transcribe(
        audio_path,
        beam_size=1,
        vad_filter=True,
    )
    transcript = []
    for segment in segments_iter:
        text = segment.text.strip()
        if text:
            transcript.append({
                "start": round(segment.start, 2),
                "end": round(segment.end, 2),
                "text": text,
            })
    return transcript, info


def _build_ffmpeg_command(image_path, audio_path, overlay_specs, output_path, fps=24):
    """Build the ffmpeg argument list for one render job.

    Inputs:
        0     = image
        1..n  = subtitle PNG overlays
        last  = audio
    """
    cmd = ["ffmpeg", "-y", "-loop", "1", "-framerate", "1", "-i", image_path]
    for spec in overlay_specs:
        cmd.extend(["-loop", "1", "-framerate", "1", "-i", spec["path"]])
    cmd.extend(["-i", audio_path])

    # The still image is read at one frame per second, and overlay's
    # enable= expression is evaluated once per frame of its main input.
    # Raising the frame rate *before* the overlays lets subtitles switch at
    # frame precision; otherwise they can only change on whole-second frames
    # and a segment that falls between two of them is never shown.
    filter_parts = [
        f"[0:v]scale={FRAME_W}:{FRAME_H}:force_original_aspect_ratio=increase,"
        f"crop={FRAME_W}:{FRAME_H},fps={fps}[base]"
    ]
    last_label = "[base]"
    for idx, spec in enumerate(overlay_specs):
        next_label = f"[v{idx}]"
        start = spec["start"]
        end = spec["end"]
        filter_parts.append(
            f"{last_label}[{idx + 1}:v]overlay=0:0:"
            f"enable='between(t,{start:.2f},{end:.2f})'{next_label}"
        )
        last_label = next_label

    audio_input_index = len(overlay_specs) + 1
    cmd.extend([
        "-filter_complex", ";".join(filter_parts),
        "-map", last_label,
        "-map", f"{audio_input_index}:a:0",
        "-c:v", "libx264",
        "-preset", "ultrafast",
        "-crf", "20",
        "-pix_fmt", "yuv420p",
        "-r", str(fps),
        "-c:a", "aac",
        "-b:a", "128k",
        "-movflags", "+faststart",
        "-shortest",
        output_path,
    ])
    return cmd


@app.route("/generate", methods=["POST"])
def generate():
    """Turn an uploaded image and audio file into a subtitled video.

    Expects multipart form fields ``image`` and ``audio``. The audio is
    transcribed, one subtitle overlay PNG is rendered per segment, and ffmpeg
    composes the final ``.mp4`` in ``OUTPUT_FOLDER``.

    Returns:
        On success, JSON with ``video_url``, ``transcript``, ``full_text`` and
        ``language``. On failure, JSON with ``error`` (and usually
        ``details``) and status 400 for bad input or 500 for processing
        errors.

    The uploaded files and the per-job subtitle images are intermediate
    artefacts and are removed when the request finishes, whatever the outcome.
    """
    image = request.files.get("image")
    audio = request.files.get("audio")
    if image is None or audio is None:
        return jsonify({"error": "Missing files"}), 400
    if not image.filename or not audio.filename:
        return jsonify({"error": "Please upload both image and audio"}), 400

    uid = str(uuid.uuid4())
    image_path = os.path.join(UPLOAD_FOLDER, f"{uid}_{secure_filename(image.filename)}")
    audio_path = os.path.join(UPLOAD_FOLDER, f"{uid}_{secure_filename(audio.filename)}")
    output_filename = f"{uid}.mp4"
    output_path = os.path.join(OUTPUT_FOLDER, output_filename)
    job_subtitle_dir = os.path.join(SUBTITLE_FOLDER, uid)

    try:
        # Saving happens inside the try so that a disk error is reported as
        # JSON (which the page expects) and still triggers the cleanup below.
        os.makedirs(job_subtitle_dir, exist_ok=True)
        image.save(image_path)
        audio.save(audio_path)

        transcript, info = _transcribe(audio_path)
        overlay_specs = build_subtitle_overlays(transcript, job_subtitle_dir)
        cmd = _build_ffmpeg_command(image_path, audio_path, overlay_specs, output_path)

        subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
        )

        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
            return jsonify({
                "error": "Video file not created",
                "details": "FFmpeg ran but output file is missing or empty."
            }), 500

        return jsonify({
            "video_url": f"/video/{output_filename}",
            "transcript": transcript,
            "full_text": " ".join(seg["text"] for seg in transcript).strip(),
            "language": getattr(info, "language", None)
        })
    except subprocess.CalledProcessError as e:
        # Do not leave a partially written video behind.
        _remove_quietly(output_path)
        return jsonify({
            "error": "FFmpeg failed",
            "details": e.stderr.decode("utf-8", errors="ignore")
        }), 500
    except Exception as e:
        return jsonify({
            "error": "Processing failed",
            "details": str(e)
        }), 500
    finally:
        _remove_quietly(image_path)
        _remove_quietly(audio_path)
        shutil.rmtree(job_subtitle_dir, ignore_errors=True)
if __name__ == "__main__":
    # The interactive debugger that Flask's debug mode enables lets anyone who
    # can reach the server run arbitrary Python, so it must not be on by
    # default while listening on every interface. Opt in explicitly for local
    # development, e.g. FLASK_DEBUG=1.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in {"1", "true", "yes", "on"}
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)