"""Flask app that turns one still image plus an audio track into a vertical video.

The audio is transcribed with faster-whisper, each transcript segment is
rendered to a transparent PNG containing a black subtitle box, and FFmpeg
composites the PNGs over the (scaled and cropped) image for the duration of
each segment.
"""

import os
import shutil
import subprocess
import uuid

from faster_whisper import WhisperModel
from flask import Flask, render_template_string, request, jsonify, send_from_directory, abort
from PIL import Image, ImageDraw, ImageFont
from werkzeug.utils import secure_filename

app = Flask(__name__)

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
UPLOAD_FOLDER = os.path.join(BASE_DIR, "uploads")
OUTPUT_FOLDER = os.path.join(BASE_DIR, "static", "videos")
SUBTITLE_FOLDER = os.path.join(BASE_DIR, "subtitles")

os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
os.makedirs(SUBTITLE_FOLDER, exist_ok=True)

# Fast CPU model
model = WhisperModel(
    "tiny",
    device="cpu",
    compute_type="int8"
)

FRAME_W = 1080
FRAME_H = 1920

# Subtitle box geometry shared by every layout that pick_layout() produces.
_MAX_BOX_WIDTH = 940
_MAX_BOX_HEIGHT = 520
_PADDING_X = 36
_PADDING_Y = 22
_LINE_SPACING = 10
_BOTTOM_MARGIN = 230
_BOX_RADIUS = 20
# TrueType sizes tried from largest to smallest; the last one is the fallback.
_FONT_SIZES = (42, 40, 38, 36, 34, 32)

# NOTE(review): this template contains only bare text (no form, no script).
# It looks like the markup was lost somewhere upstream -- confirm against the
# original template before shipping. The literal is left untouched here.
HTML = """ Photo + Audio To Video

Photo + Audio → Video

Generating Video...
Download Video
"""


def find_font_path():
    """Return the first existing font file from a list of common Linux paths.

    Returns:
        The path of the first candidate that exists, or ``None`` if none do.
    """
    candidates = [
        "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        "/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf",
        "/usr/share/fonts/truetype/liberation2/LiberationSans-Regular.ttf",
        "/usr/share/fonts/truetype/freefont/FreeSansBold.ttf",
        "/usr/share/fonts/truetype/freefont/FreeSans.ttf",
    ]
    for path in candidates:
        if os.path.exists(path):
            return path
    return None


FONT_PATH = find_font_path()


def ass_time(seconds: float) -> str:
    """Format a duration in seconds as ``H:MM:SS.cc``.

    Negative inputs are clamped to zero. The value is rounded to whole
    centiseconds *before* being split into fields, so a value such as 59.999
    carries into the minutes (``0:01:00.00``) instead of producing an invalid
    ``0:00:60.00``.
    """
    total_cs = max(0, round(seconds * 100))
    total_seconds, cs = divmod(total_cs, 100)
    total_minutes, s = divmod(total_seconds, 60)
    h, m = divmod(total_minutes, 60)
    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"


def measure_text_width(font, text: str) -> int:
    """Return the width of ``font.getbbox(text)`` (right minus left)."""
    bbox = font.getbbox(text)
    return bbox[2] - bbox[0]


def measure_text_height(font, text: str) -> int:
    """Return the height of ``font.getbbox(text)`` (bottom minus top)."""
    bbox = font.getbbox(text)
    return bbox[3] - bbox[1]


def clean_text(text: str) -> str:
    """Strip the text and collapse every whitespace run to a single space."""
    return " ".join(text.strip().split())


def wrap_text_by_pixels(text: str, font, max_width_px: int, max_lines: int = 4) -> list[str]:
    """Greedily wrap ``text`` so each line measures at most ``max_width_px``.

    Words wider than ``max_width_px`` are split character by character. If
    wrapping yields more than ``max_lines`` lines, the overflow lines are
    joined onto the last kept line, so that final line may be wider than
    ``max_width_px``.

    Returns:
        The wrapped lines, or an empty list when the cleaned text is empty.
    """
    text = clean_text(text)
    if not text:
        return []

    def split_long_word(word: str) -> list[str]:
        # Break a single over-wide word into chunks that each fit.
        if measure_text_width(font, word) <= max_width_px:
            return [word]

        parts = []
        chunk = ""
        for ch in word:
            trial = chunk + ch
            if measure_text_width(font, trial) <= max_width_px:
                chunk = trial
            else:
                if chunk:
                    parts.append(chunk)
                chunk = ch
        if chunk:
            parts.append(chunk)
        return parts

    tokens = []
    for word in text.split(" "):
        tokens.extend(split_long_word(word))

    lines = []
    current = ""
    for token in tokens:
        trial = token if not current else f"{current} {token}"
        if measure_text_width(font, trial) <= max_width_px:
            current = trial
        else:
            if current:
                lines.append(current)
            current = token
    if current:
        lines.append(current)

    if len(lines) > max_lines:
        kept = lines[:max_lines - 1]
        kept.append(" ".join(lines[max_lines - 1:]))
        lines = kept

    return lines


def _layout_for_font(text: str, font, font_size: int, max_text_width: int, max_lines: int) -> dict:
    """Wrap ``text`` with ``font`` and compute the resulting subtitle box size."""
    lines = wrap_text_by_pixels(
        text=text,
        font=font,
        max_width_px=max_text_width,
        max_lines=max_lines,
    )
    # Fall back to [0] so max()/sum() stay defined when nothing was wrapped.
    widths = [measure_text_width(font, line) for line in lines] or [0]
    heights = [measure_text_height(font, line) for line in lines] or [0]

    return {
        "font": font,
        "font_size": font_size,
        "lines": lines,
        "box_w": min(_MAX_BOX_WIDTH, max(widths) + _PADDING_X * 2),
        "box_h": sum(heights) + _LINE_SPACING * (len(lines) - 1) + _PADDING_Y * 2,
        "padding_x": _PADDING_X,
        "padding_y": _PADDING_Y,
        "line_spacing": _LINE_SPACING,
        "bottom_margin": _BOTTOM_MARGIN,
        "radius": _BOX_RADIUS,
    }


def pick_layout(text: str):
    """Choose a font size and line wrapping for one subtitle.

    With a TrueType font available, sizes in ``_FONT_SIZES`` are tried from
    largest to smallest and the first layout whose box height is at most
    ``_MAX_BOX_HEIGHT`` wins; if none fits (or the text wraps to nothing), the
    layout for the smallest size is returned. Without a TrueType font, Pillow's
    default font is used.

    Returns:
        A dict with keys ``font``, ``font_size``, ``lines``, ``box_w``,
        ``box_h``, ``padding_x``, ``padding_y``, ``line_spacing``,
        ``bottom_margin`` and ``radius``.
    """
    if not (FONT_PATH and os.path.exists(FONT_PATH)):
        return _layout_for_font(text, ImageFont.load_default(), 16, 900, 4)

    max_text_width = _MAX_BOX_WIDTH - _PADDING_X * 2
    layout = None
    for font_size in _FONT_SIZES:
        font = ImageFont.truetype(FONT_PATH, font_size)
        layout = _layout_for_font(text, font, font_size, max_text_width, 5)
        if layout["lines"] and layout["box_h"] <= _MAX_BOX_HEIGHT:
            return layout

    # Nothing fit: the last iteration already holds the smallest-size layout.
    return layout


def render_subtitle_frame(text: str, image_path: str):
    """Render ``text`` as a full-frame transparent PNG and save it.

    The image is ``FRAME_W`` x ``FRAME_H`` with a solid black rounded box,
    horizontally centred and placed ``bottom_margin`` pixels above the bottom
    edge, containing the wrapped text in white.
    """
    layout = pick_layout(text)
    font = layout["font"]
    box_w = layout["box_w"]
    box_h = layout["box_h"]
    line_spacing = layout["line_spacing"]

    img = Image.new("RGBA", (FRAME_W, FRAME_H), (0, 0, 0, 0))
    draw = ImageDraw.Draw(img)

    x0 = int((FRAME_W - box_w) / 2)
    y0 = int(FRAME_H - layout["bottom_margin"] - box_h)
    x1 = x0 + box_w
    y1 = y0 + box_h

    # Solid black background box
    draw.rounded_rectangle(
        [x0, y0, x1, y1],
        radius=layout["radius"],
        fill=(0, 0, 0, 255)
    )

    y = y0 + layout["padding_y"]
    for line in layout["lines"]:
        line_w = measure_text_width(font, line)
        line_h = measure_text_height(font, line)
        tx = int((FRAME_W - line_w) / 2)
        draw.text(
            (tx, y),
            line,
            font=font,
            fill=(255, 255, 255, 255)
        )
        y += line_h + line_spacing

    img.save(image_path)


def build_subtitle_overlays(transcript, job_dir):
    """Render one subtitle PNG per non-empty transcript segment.

    Args:
        transcript: Iterable of dicts with ``text``, ``start`` and ``end`` keys.
        job_dir: Existing directory the PNG files are written into.

    Returns:
        A list of dicts with ``path``, ``start`` and ``end`` (floats), in
        transcript order.
    """
    overlay_specs = []

    for idx, seg in enumerate(transcript):
        text = seg["text"].strip()
        if not text:
            continue

        png_path = os.path.join(job_dir, f"sub_{idx:03d}.png")
        render_subtitle_frame(text, png_path)

        overlay_specs.append({
            "path": png_path,
            "start": float(seg["start"]),
            "end": float(seg["end"]),
        })

    return overlay_specs


def _remove_quietly(path: str) -> None:
    """Delete ``path`` if possible; cleanup is best-effort and never raises."""
    try:
        os.remove(path)
    except OSError:
        # Missing file or permission problem: nothing useful to do during cleanup.
        pass


def _transcribe(audio_path: str):
    """Transcribe ``audio_path`` and return ``(transcript, info)``.

    ``transcript`` is a list of ``{"start", "end", "text"}`` dicts for every
    segment with non-empty text; times are rounded to two decimals.
    """
    segments_iter, info = model.transcribe(
        audio_path,
        beam_size=1,
        vad_filter=True
    )

    transcript = []
    for segment in segments_iter:
        text = segment.text.strip()
        if not text:
            continue
        transcript.append({
            "start": round(segment.start, 2),
            "end": round(segment.end, 2),
            "text": text
        })

    return transcript, info


def _build_ffmpeg_command(image_path: str, audio_path: str, overlay_specs, output_path: str) -> list[str]:
    """Build the FFmpeg argument list that composites subtitles over the image.

    Inputs are ordered: 0 = image, 1..n = subtitle PNG overlays, last = audio.
    """
    cmd = [
        "ffmpeg",
        "-y",
        "-loop", "1",
        "-framerate", "1",
        "-i", image_path,
    ]
    for spec in overlay_specs:
        cmd.extend([
            "-loop", "1",
            "-framerate", "1",
            "-i", spec["path"]
        ])
    cmd.extend(["-i", audio_path])

    filter_parts = [
        "[0:v]scale=1080:1920:force_original_aspect_ratio=increase,crop=1080:1920[base]"
    ]
    # Chain one overlay per subtitle; with no subtitles the base stream is mapped directly.
    last_label = "[base]"
    for idx, spec in enumerate(overlay_specs):
        next_label = f"[v{idx}]"
        filter_parts.append(
            f"{last_label}[{idx + 1}:v]overlay=0:0:"
            f"enable='between(t,{spec['start']:.2f},{spec['end']:.2f})'{next_label}"
        )
        last_label = next_label

    audio_input_index = len(overlay_specs) + 1

    cmd.extend([
        "-filter_complex", ";".join(filter_parts),
        "-map", last_label,
        "-map", f"{audio_input_index}:a:0",
        "-c:v", "libx264",
        "-preset", "ultrafast",
        "-crf", "20",
        "-pix_fmt", "yuv420p",
        "-r", "24",
        "-c:a", "aac",
        "-b:a", "128k",
        "-movflags", "+faststart",
        "-shortest",
        output_path
    ])
    return cmd


@app.route("/")
def home():
    """Serve the upload page."""
    return render_template_string(HTML)


# The <filename> URL variable is required: the view takes a ``filename``
# argument and /generate returns URLs of the form /video/<uuid>.mp4.
@app.route("/video/<filename>")
def serve_video(filename):
    """Serve a generated video from ``OUTPUT_FOLDER`` with caching disabled."""
    file_path = os.path.join(OUTPUT_FOLDER, filename)
    if not os.path.isfile(file_path):
        abort(404)

    response = send_from_directory(
        OUTPUT_FOLDER,
        filename,
        as_attachment=False,
        conditional=True
    )
    response.headers["Cache-Control"] = "no-store"
    return response


@app.route("/generate", methods=["POST"])
def generate():
    """Create a subtitled video from the uploaded ``image`` and ``audio`` files.

    Returns JSON with ``video_url``, ``transcript``, ``full_text`` and
    ``language`` on success; JSON with ``error`` (and usually ``details``) and
    status 400 or 500 on failure. Uploaded inputs and the per-job subtitle PNGs
    are deleted once the request finishes, whatever the outcome.
    """
    if "image" not in request.files or "audio" not in request.files:
        return jsonify({"error": "Missing files"}), 400

    image = request.files["image"]
    audio = request.files["audio"]

    if not image.filename or not audio.filename:
        return jsonify({"error": "Please upload both image and audio"}), 400

    uid = str(uuid.uuid4())

    # secure_filename() can return "" (e.g. for names with no ASCII characters);
    # the role prefixes keep the two upload paths distinct in that case.
    image_name = secure_filename(image.filename)
    audio_name = secure_filename(audio.filename)
    image_path = os.path.join(UPLOAD_FOLDER, f"{uid}_image_{image_name}")
    audio_path = os.path.join(UPLOAD_FOLDER, f"{uid}_audio_{audio_name}")

    output_filename = f"{uid}.mp4"
    output_path = os.path.join(OUTPUT_FOLDER, output_filename)
    job_subtitle_dir = os.path.join(SUBTITLE_FOLDER, uid)

    try:
        os.makedirs(job_subtitle_dir, exist_ok=True)
        image.save(image_path)
        audio.save(audio_path)

        transcript, info = _transcribe(audio_path)
        overlay_specs = build_subtitle_overlays(transcript, job_subtitle_dir)
        cmd = _build_ffmpeg_command(image_path, audio_path, overlay_specs, output_path)

        subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True
        )

        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
            _remove_quietly(output_path)
            return jsonify({
                "error": "Video file not created",
                "details": "FFmpeg ran but output file is missing or empty."
            }), 500

        return jsonify({
            "video_url": f"/video/{output_filename}",
            "transcript": transcript,
            "full_text": " ".join(seg["text"] for seg in transcript).strip(),
            "language": getattr(info, "language", None)
        })

    except subprocess.CalledProcessError as e:
        _remove_quietly(output_path)  # do not leave a truncated video behind
        return jsonify({
            "error": "FFmpeg failed",
            "details": e.stderr.decode("utf-8", errors="ignore")
        }), 500
    except Exception as e:
        _remove_quietly(output_path)
        return jsonify({
            "error": "Processing failed",
            "details": str(e)
        }), 500
    finally:
        # The inputs and intermediate PNGs are only needed while FFmpeg runs.
        _remove_quietly(image_path)
        _remove_quietly(audio_path)
        shutil.rmtree(job_subtitle_dir, ignore_errors=True)


if __name__ == "__main__":
    # The Werkzeug debugger allows arbitrary code execution, so it must not be
    # on by default while listening on all interfaces; opt in via FLASK_DEBUG.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes"}
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)