import os
import shutil
import subprocess
import uuid

from faster_whisper import WhisperModel
from flask import Flask, render_template_string, request, jsonify, send_from_directory, abort
from PIL import Image, ImageDraw, ImageFont
from werkzeug.utils import secure_filename
# Flask application object; the route handlers in this file register on it.
app = Flask(__name__)
# Working directories, all resolved relative to the directory of this file.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Where the uploaded image and audio files are written by generate().
UPLOAD_FOLDER = os.path.join(BASE_DIR, "uploads")
# Where the rendered .mp4 files are written and served from.
OUTPUT_FOLDER = os.path.join(BASE_DIR, "static", "videos")
# Parent directory of the per-job folders holding subtitle overlay PNGs.
SUBTITLE_FOLDER = os.path.join(BASE_DIR, "subtitles")
# Created at import time so the request handlers can write without checking.
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
os.makedirs(SUBTITLE_FOLDER, exist_ok=True)
# Fast CPU model: the "tiny" Whisper checkpoint, run on CPU with int8 compute.
# It is constructed once at import time and shared by every request.
model = WhisperModel(
"tiny",
device="cpu",
compute_type="int8"
)
# Size in pixels of the subtitle overlay canvas (width x height, portrait).
FRAME_W = 1080
FRAME_H = 1920
# Template rendered by the "/" route.
# NOTE(review): the template holds only plain text and no markup or form; it
# looks like the HTML tags were lost at some point - confirm against the
# intended page before relying on it.
HTML = """
Photo + Audio To Video
Photo + Audio → Video
Generating Video...
"""
def find_font_path():
    """Return the first known TrueType font path present on disk, else None.

    The search order prefers bold DejaVu, then Liberation, then FreeSans,
    each time trying the bold face before the regular one.
    """
    known_fonts = (
        "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        "/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf",
        "/usr/share/fonts/truetype/liberation2/LiberationSans-Regular.ttf",
        "/usr/share/fonts/truetype/freefont/FreeSansBold.ttf",
        "/usr/share/fonts/truetype/freefont/FreeSans.ttf",
    )
    return next(
        (font_file for font_file in known_fonts if os.path.exists(font_file)),
        None,
    )


# Resolved once at import time; None means no candidate font was found.
FONT_PATH = find_font_path()
def ass_time(seconds: float) -> str:
    """Format a duration in seconds as ``H:MM:SS.cc`` (centisecond precision).

    The name suggests this is the timestamp layout used by ASS subtitles.
    Negative inputs are clamped to zero.

    The value is rounded to whole centiseconds *before* it is split into
    fields, so a rounding carry propagates into minutes and hours. Formatting
    the seconds field on its own would turn 59.999 into the invalid
    ``0:00:60.00``.
    """
    total_centis = max(0, round(seconds * 100))
    total_seconds, centis = divmod(total_centis, 100)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours}:{minutes:02d}:{secs:02d}.{centis:02d}"
def measure_text_width(font, text: str) -> int:
    """Return the horizontal extent of the box ``font.getbbox(text)`` reports."""
    left, _top, right, _bottom = font.getbbox(text)
    return right - left
def measure_text_height(font, text: str) -> int:
    """Return the vertical extent of the box ``font.getbbox(text)`` reports."""
    _left, top, _right, bottom = font.getbbox(text)
    return bottom - top
def clean_text(text: str) -> str:
    """Collapse every whitespace run to one space and trim both ends."""
    # str.split() with no separator already ignores leading and trailing
    # whitespace, so rejoining the words is all that is needed.
    words = text.split()
    return " ".join(words)
def wrap_text_by_pixels(text: str, font, max_width_px: int, max_lines: int = 4) -> list[str]:
    """Greedily wrap *text* into lines no wider than *max_width_px*.

    Whitespace is normalised first. Words are packed onto a line for as long
    as the measured width fits; a single word wider than the limit is broken
    into character chunks that each fit. If more than *max_lines* lines
    result, the overflow is joined onto the last kept line, which may then
    exceed *max_width_px*.

    Returns an empty list when *text* has no visible content.
    """
    normalized = clean_text(text)
    if not normalized:
        return []

    def fits(candidate: str) -> bool:
        return measure_text_width(font, candidate) <= max_width_px

    def pieces_of(word: str) -> list[str]:
        # A word that already fits is kept whole.
        if fits(word):
            return [word]
        pieces = []
        buffer = ""
        for char in word:
            if fits(buffer + char):
                buffer += char
                continue
            if buffer:
                pieces.append(buffer)
            buffer = char
        if buffer:
            pieces.append(buffer)
        return pieces

    wrapped = []
    pending = ""
    for word in normalized.split(" "):
        for piece in pieces_of(word):
            candidate = f"{pending} {piece}" if pending else piece
            if fits(candidate):
                pending = candidate
            else:
                if pending:
                    wrapped.append(pending)
                pending = piece
    if pending:
        wrapped.append(pending)

    if len(wrapped) <= max_lines:
        return wrapped

    # Too many lines: keep the first max_lines - 1 and merge the rest.
    head = wrapped[:max_lines - 1]
    head.append(" ".join(wrapped[max_lines - 1:]))
    return head
# Box geometry shared by every layout that pick_layout() returns.
_LAYOUT_STYLE = {
    "padding_x": 36,
    "padding_y": 22,
    "line_spacing": 10,
    "bottom_margin": 230,
    "radius": 20,
}
_MAX_BOX_WIDTH = 940
_MAX_BOX_HEIGHT = 520
# Font sizes to try, largest first; the last entry is the fallback size.
_FONT_SIZES = (42, 40, 38, 36, 34, 32)


def _build_layout(text: str, font, font_size: int, wrap_width: int, max_lines: int):
    """Wrap *text* in *font* and compute the subtitle box around it.

    Returns ``(layout, fits)``. ``layout`` is the dict consumed by
    ``render_subtitle_frame``. ``fits`` is True when every wrapped line is
    within *wrap_width* and the box is no taller than ``_MAX_BOX_HEIGHT``.
    """
    lines = wrap_text_by_pixels(
        text=text,
        font=font,
        max_width_px=wrap_width,
        max_lines=max_lines,
    )
    widths = [measure_text_width(font, line) for line in lines] or [0]
    heights = [measure_text_height(font, line) for line in lines]
    widest = max(widths)
    padding_x = _LAYOUT_STYLE["padding_x"]
    padding_y = _LAYOUT_STYLE["padding_y"]
    # max(..., 0) keeps the spacing term from going negative with no lines.
    gaps = max(len(lines) - 1, 0)
    box_w = min(_MAX_BOX_WIDTH, widest + padding_x * 2)
    box_h = sum(heights) + _LAYOUT_STYLE["line_spacing"] * gaps + padding_y * 2
    layout = {
        "font": font,
        "font_size": font_size,
        "lines": lines,
        "box_w": box_w,
        "box_h": box_h,
        **_LAYOUT_STYLE,
    }
    fits = widest <= wrap_width and box_h <= _MAX_BOX_HEIGHT
    return layout, fits


def pick_layout(text: str):
    """Choose a font size and wrapped lines for a subtitle box.

    With a TrueType font available, sizes are tried from largest to smallest
    and the first one whose wrapped text fits is used. "Fits" covers width as
    well as height: when wrapping overflows the line limit, the merged last
    line is wider than the box, and a height-only check would never shrink
    the font. If no size fits, the smallest size is returned as a best
    effort. Without a TrueType font, Pillow's default font is used.

    Returns a dict with keys ``font``, ``font_size``, ``lines``, ``box_w``,
    ``box_h``, ``padding_x``, ``padding_y``, ``line_spacing``,
    ``bottom_margin`` and ``radius``.
    """
    if FONT_PATH and os.path.exists(FONT_PATH):
        wrap_width = _MAX_BOX_WIDTH - _LAYOUT_STYLE["padding_x"] * 2
        layout = None
        for font_size in _FONT_SIZES:
            font = ImageFont.truetype(FONT_PATH, font_size)
            layout, fits = _build_layout(text, font, font_size, wrap_width, 5)
            if layout["lines"] and fits:
                return layout
        # Nothing fit (or the text was empty): use the smallest size tried.
        return layout

    # No TrueType font found: the default font has one fixed nominal size.
    layout, _fits = _build_layout(text, ImageFont.load_default(), 16, 900, 4)
    return layout
def render_subtitle_frame(text: str, image_path: str):
    """Render *text* as a subtitle overlay PNG and save it to *image_path*.

    The image is a fully transparent ``FRAME_W`` x ``FRAME_H`` RGBA canvas
    with a solid black rounded box, horizontally centred and placed
    ``bottom_margin`` pixels above the bottom edge, containing the wrapped
    text in white. Geometry comes from ``pick_layout``.
    """
    layout = pick_layout(text)
    font = layout["font"]
    box_w = layout["box_w"]
    box_h = layout["box_h"]

    canvas = Image.new("RGBA", (FRAME_W, FRAME_H), (0, 0, 0, 0))
    draw = ImageDraw.Draw(canvas)

    x0 = (FRAME_W - box_w) // 2
    y0 = FRAME_H - layout["bottom_margin"] - box_h

    # Solid black background box
    draw.rounded_rectangle(
        [x0, y0, x0 + box_w, y0 + box_h],
        radius=layout["radius"],
        fill=(0, 0, 0, 255),
    )

    y = y0 + layout["padding_y"]
    for line in layout["lines"]:
        left, top, right, bottom = font.getbbox(line)
        tx = (FRAME_W - (right - left)) // 2
        # The layout is measured from bounding-box extents, but draw.text()
        # positions the text origin, not the box corner. Subtracting the
        # box's left/top offsets puts the glyphs in the slot that was
        # measured, so top and bottom padding come out equal.
        draw.text(
            (tx - left, y - top),
            line,
            font=font,
            fill=(255, 255, 255, 255),
        )
        y += (bottom - top) + layout["line_spacing"]

    canvas.save(image_path)
def build_subtitle_overlays(transcript, job_dir):
    """Render one overlay PNG per non-blank transcript segment.

    Each segment is a mapping with ``text``, ``start`` and ``end``. Files are
    written to *job_dir* as ``sub_NNN.png``, numbered by the segment's
    position in *transcript*, so skipped blank segments leave gaps.

    Returns a list of dicts with ``path``, ``start`` and ``end`` (floats).
    """
    specs = []
    for position, segment in enumerate(transcript):
        caption = segment["text"].strip()
        if caption:
            target = os.path.join(job_dir, f"sub_{position:03d}.png")
            render_subtitle_frame(caption, target)
            specs.append(
                dict(
                    path=target,
                    start=float(segment["start"]),
                    end=float(segment["end"]),
                )
            )
    return specs
@app.route("/")
def home():
    """Serve the landing page by rendering the module-level ``HTML`` template."""
    page = render_template_string(HTML)
    return page
@app.route("/video/<filename>")
def serve_video(filename):
    """Stream a generated video from ``OUTPUT_FOLDER`` with caching disabled.

    The route needs the ``<filename>`` URL variable: the view takes a
    ``filename`` argument and ``generate`` hands out ``/video/<name>`` URLs.

    Missing or unsafe names are rejected with a 404 by
    ``send_from_directory`` itself, so there is no separate existence check
    that would join the untrusted name onto the folder unchecked.
    """
    response = send_from_directory(
        OUTPUT_FOLDER,
        filename,
        as_attachment=False,
        conditional=True,
    )
    response.headers["Cache-Control"] = "no-store"
    return response
def _transcribe_audio(audio_path):
    """Transcribe *audio_path* and return ``(transcript, language)``.

    ``transcript`` is a list of ``{"start", "end", "text"}`` dicts, one per
    non-blank segment, with times rounded to two decimals. ``language`` is
    the ``language`` attribute of the info object, or None if it has none.
    """
    segments_iter, info = model.transcribe(
        audio_path,
        beam_size=1,
        vad_filter=True,
    )
    transcript = []
    for segment in segments_iter:
        text = segment.text.strip()
        if not text:
            continue
        transcript.append({
            "start": round(segment.start, 2),
            "end": round(segment.end, 2),
            "text": text,
        })
    return transcript, getattr(info, "language", None)


def _build_ffmpeg_command(image_path, audio_path, overlay_specs, output_path, fps=24):
    """Build the ffmpeg argument list for one render.

    Inputs are ordered: 0 = background image, 1..n = subtitle PNG overlays,
    last = audio.
    """
    cmd = ["ffmpeg", "-y", "-loop", "1", "-framerate", "1", "-i", image_path]
    for spec in overlay_specs:
        cmd.extend(["-loop", "1", "-framerate", "1", "-i", spec["path"]])
    cmd.extend(["-i", audio_path])

    # The image is read at 1 fps, so the base stream is converted to the
    # output rate before the overlays. The overlay "enable" expression is
    # evaluated once per base frame; at 1 fps subtitles could only switch on
    # whole seconds, and segments between two frames would never show.
    filter_parts = [
        f"[0:v]scale={FRAME_W}:{FRAME_H}:force_original_aspect_ratio=increase,"
        f"crop={FRAME_W}:{FRAME_H},fps={fps}[base]"
    ]
    last_label = "[base]"
    for idx, spec in enumerate(overlay_specs):
        next_label = f"[v{idx}]"
        start = spec["start"]
        end = spec["end"]
        filter_parts.append(
            f"{last_label}[{idx + 1}:v]overlay=0:0:"
            f"enable='between(t,{start:.2f},{end:.2f})'{next_label}"
        )
        last_label = next_label

    audio_input_index = len(overlay_specs) + 1
    cmd.extend([
        "-filter_complex", ";".join(filter_parts),
        "-map", last_label,
        "-map", f"{audio_input_index}:a:0",
        "-c:v", "libx264",
        "-preset", "ultrafast",
        "-crf", "20",
        "-pix_fmt", "yuv420p",
        "-r", str(fps),
        "-c:a", "aac",
        "-b:a", "128k",
        "-movflags", "+faststart",
        "-shortest",
        output_path,
    ])
    return cmd


def _cleanup_job_files(file_paths, subtitle_dir):
    """Delete a job's temporary files and its subtitle directory."""
    for path in file_paths:
        try:
            os.remove(path)
        except OSError:
            # Best effort: the file may never have been written, and a failed
            # delete must not mask the response being returned.
            pass
    shutil.rmtree(subtitle_dir, ignore_errors=True)


@app.route("/generate", methods=["POST"])
def generate():
    """Turn an uploaded image and audio file into a subtitled video.

    Expects multipart form fields ``image`` and ``audio``. The audio is
    transcribed, each segment is rendered as an overlay PNG, and ffmpeg
    combines image, overlays and audio into ``<uuid>.mp4`` in
    ``OUTPUT_FOLDER``.

    Returns JSON with ``video_url``, ``transcript``, ``full_text`` and
    ``language`` on success; a 400 JSON error when a file is missing; a 500
    JSON error with ``details`` when processing or ffmpeg fails.

    The uploads and overlay PNGs are deleted once the request finishes, and
    a partial output file is deleted when the render does not succeed.
    """
    if "image" not in request.files or "audio" not in request.files:
        return jsonify({"error": "Missing files"}), 400
    image = request.files["image"]
    audio = request.files["audio"]
    if not image.filename or not audio.filename:
        return jsonify({"error": "Please upload both image and audio"}), 400

    uid = str(uuid.uuid4())
    image_name = secure_filename(image.filename)
    audio_name = secure_filename(audio.filename)
    image_path = os.path.join(UPLOAD_FOLDER, f"{uid}_{image_name}")
    audio_path = os.path.join(UPLOAD_FOLDER, f"{uid}_{audio_name}")
    output_filename = f"{uid}.mp4"
    output_path = os.path.join(OUTPUT_FOLDER, output_filename)
    job_subtitle_dir = os.path.join(SUBTITLE_FOLDER, uid)

    succeeded = False
    try:
        os.makedirs(job_subtitle_dir, exist_ok=True)
        image.save(image_path)
        audio.save(audio_path)

        transcript, language = _transcribe_audio(audio_path)
        overlay_specs = build_subtitle_overlays(transcript, job_subtitle_dir)
        cmd = _build_ffmpeg_command(image_path, audio_path, overlay_specs, output_path)

        # stderr is captured so it can be reported if ffmpeg fails.
        subprocess.run(
            cmd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            check=True,
        )
        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
            return jsonify({
                "error": "Video file not created",
                "details": "FFmpeg ran but output file is missing or empty."
            }), 500

        succeeded = True
        return jsonify({
            "video_url": f"/video/{output_filename}",
            "transcript": transcript,
            "full_text": " ".join(seg["text"] for seg in transcript).strip(),
            "language": language
        })
    except subprocess.CalledProcessError as e:
        return jsonify({
            "error": "FFmpeg failed",
            "details": e.stderr.decode("utf-8", errors="ignore")
        }), 500
    except Exception as e:
        return jsonify({
            "error": "Processing failed",
            "details": str(e)
        }), 500
    finally:
        leftovers = [image_path, audio_path]
        if not succeeded:
            # Do not leave a truncated video in the served directory.
            leftovers.append(output_path)
        _cleanup_job_files(leftovers, job_subtitle_dir)
if __name__ == "__main__":
    # The server listens on all interfaces, so the interactive debugger must
    # not be on by default: it lets anyone who can reach the port run code.
    # Set FLASK_DEBUG=1 to opt in during local development.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)