# Hugging Face Space app (status badge residue removed) — Flask photo+audio → subtitled video generator.
from flask import Flask, render_template_string, request, jsonify, send_from_directory, abort
import os
import uuid
import subprocess
from PIL import Image, ImageDraw, ImageFont
from werkzeug.utils import secure_filename
from faster_whisper import WhisperModel

app = Flask(__name__)

# Anchor all data directories to this file's location so the app works
# regardless of the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
UPLOAD_FOLDER = os.path.join(BASE_DIR, "uploads")  # raw uploaded image/audio files
OUTPUT_FOLDER = os.path.join(BASE_DIR, "static", "videos")  # rendered MP4 outputs
SUBTITLE_FOLDER = os.path.join(BASE_DIR, "subtitles")  # per-job subtitle PNG overlays
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
os.makedirs(SUBTITLE_FOLDER, exist_ok=True)

# Fast CPU model
# "tiny" + int8 quantization keeps transcription fast on CPU-only hosts
# at the cost of accuracy.
model = WhisperModel(
    "tiny",
    device="cpu",
    compute_type="int8"
)

# Output frame size: 9:16 portrait video (e.g. Shorts/Reels format).
FRAME_W = 1080
FRAME_H = 1920
# Single-page front-end served at "/": upload form, progress indicator,
# inline <video> preview and a download link. The script POSTs FormData to
# /generate and expects JSON with {video_url} or {error, details}.
HTML = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Photo + Audio To Video</title>
<style>
*{
margin:0;
padding:0;
box-sizing:border-box;
font-family:Arial;
}
body{
background:#0f0f0f;
color:white;
min-height:100vh;
display:flex;
justify-content:center;
align-items:center;
padding:20px;
}
.container{
width:100%;
max-width:500px;
background:#1b1b1b;
border-radius:20px;
padding:25px;
box-shadow:0 0 20px rgba(0,0,0,0.4);
}
h1{
text-align:center;
margin-bottom:25px;
font-size:28px;
}
.upload-box{
border:2px dashed #444;
padding:20px;
border-radius:15px;
margin-bottom:20px;
}
label{
display:block;
margin-bottom:8px;
color:#ccc;
}
input{
width:100%;
padding:12px;
background:#2a2a2a;
border:none;
border-radius:10px;
color:white;
margin-bottom:15px;
}
button{
width:100%;
padding:15px;
border:none;
border-radius:12px;
background:#00aaff;
color:white;
font-size:18px;
cursor:pointer;
transition:0.3s;
}
button:hover{
opacity:0.9;
}
#loading{
display:none;
text-align:center;
margin-top:20px;
}
video{
width:100%;
margin-top:20px;
border-radius:15px;
display:none;
aspect-ratio:9/16;
background:#000;
object-fit:cover;
}
.download-btn{
display:none;
margin-top:15px;
text-align:center;
}
.download-btn a{
display:inline-block;
background:#22c55e;
color:white;
text-decoration:none;
padding:12px 20px;
border-radius:10px;
}
.preview{
margin-top:15px;
width:100%;
border-radius:15px;
display:none;
}
</style>
</head>
<body>
<div class="container">
<h1>Photo + Audio → Video</h1>
<form id="form">
<div class="upload-box">
<label>Select Photo</label>
<input type="file" id="image" name="image" accept="image/*" required>
<img id="preview" class="preview">
<label>Select Audio (mp3/wav)</label>
<input type="file" name="audio" accept="audio/*" required>
</div>
<button type="submit">Generate Video</button>
</form>
<div id="loading">Generating Video...</div>
<video id="video" controls playsinline></video>
<div class="download-btn" id="downloadDiv">
<a id="downloadBtn" download>Download Video</a>
</div>
</div>
<script>
const form = document.getElementById("form");
const loading = document.getElementById("loading");
const video = document.getElementById("video");
const downloadBtn = document.getElementById("downloadBtn");
const downloadDiv = document.getElementById("downloadDiv");
const preview = document.getElementById("preview");
document.getElementById("image").addEventListener("change", function(e){
const file = e.target.files[0];
if(file){
preview.src = URL.createObjectURL(file);
preview.style.display = "block";
}
});
form.addEventListener("submit", async (e)=>{
e.preventDefault();
loading.style.display = "block";
video.style.display = "none";
downloadDiv.style.display = "none";
const formData = new FormData(form);
try{
const response = await fetch("/generate", {
method:"POST",
body:formData
});
const data = await response.json();
loading.style.display = "none";
if(data.video_url){
video.src = data.video_url + "?t=" + new Date().getTime();
video.style.display = "block";
downloadBtn.href = data.video_url;
downloadDiv.style.display = "block";
}else{
alert(data.error || "Failed");
console.log(data.details || "");
}
}catch(err){
loading.style.display = "none";
alert("Server Error");
console.error(err);
}
});
</script>
</body>
</html>
"""
def find_font_path():
    """Return the first existing TTF font from known Linux font locations.

    Returns None when none of the candidate paths exist (the caller then
    falls back to PIL's built-in bitmap font).
    """
    known_fonts = (
        "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        "/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf",
        "/usr/share/fonts/truetype/liberation2/LiberationSans-Regular.ttf",
        "/usr/share/fonts/truetype/freefont/FreeSansBold.ttf",
        "/usr/share/fonts/truetype/freefont/FreeSans.ttf",
    )
    return next((candidate for candidate in known_fonts if os.path.exists(candidate)), None)


# Resolved once at import time; None means no system TTF font is available.
FONT_PATH = find_font_path()
def ass_time(seconds: float) -> str:
    """Format a time offset as an ASS/SSA timestamp: ``H:MM:SS.cc``.

    Negative inputs are clamped to zero. The value is rounded to whole
    centiseconds *before* being split into fields, so that e.g. 3599.999
    yields "1:00:00.00" rather than the invalid "0:59:60.00" that naive
    modulo-then-format arithmetic produces.
    """
    centis = max(0, round(seconds * 100))
    h, rem = divmod(centis, 360_000)  # 360_000 centiseconds per hour
    m, rem = divmod(rem, 6_000)       # 6_000 centiseconds per minute
    return f"{h}:{m:02d}:{rem / 100:05.2f}"
def measure_text_width(font, text: str) -> int:
    """Pixel width of *text* rendered with *font*, from its bounding box."""
    left, _, right, _ = font.getbbox(text)
    return right - left
def measure_text_height(font, text: str) -> int:
    """Pixel height of *text* rendered with *font*, from its bounding box."""
    _, top, _, bottom = font.getbbox(text)
    return bottom - top
def clean_text(text: str) -> str:
    """Collapse all whitespace runs to single spaces and trim both ends."""
    # str.split() with no argument already discards leading/trailing whitespace.
    return " ".join(text.split())
def wrap_text_by_pixels(text: str, font, max_width_px: int, max_lines: int = 4) -> list[str]:
    """Greedily word-wrap *text* so each line fits within *max_width_px*.

    Words wider than the limit are broken into character chunks first.
    If the wrap produces more than *max_lines* lines, the overflow is
    merged into the final line (which may then exceed the pixel limit).
    Returns [] for empty/whitespace-only input.
    """
    normalized = clean_text(text)
    if not normalized:
        return []

    def explode(word: str) -> list[str]:
        # Split one over-wide word into chunks that each fit the width.
        if measure_text_width(font, word) <= max_width_px:
            return [word]
        chunks: list[str] = []
        buf = ""
        for ch in word:
            if measure_text_width(font, buf + ch) <= max_width_px:
                buf += ch
            else:
                if buf:
                    chunks.append(buf)
                buf = ch
        if buf:
            chunks.append(buf)
        return chunks

    pieces = [chunk for word in normalized.split(" ") for chunk in explode(word)]

    wrapped: list[str] = []
    line = ""
    for piece in pieces:
        candidate = f"{line} {piece}" if line else piece
        if measure_text_width(font, candidate) <= max_width_px:
            line = candidate
            continue
        if line:
            wrapped.append(line)
        line = piece
    if line:
        wrapped.append(line)

    # Enforce the line cap by folding any overflow into the last kept line.
    if len(wrapped) > max_lines:
        wrapped = wrapped[:max_lines - 1] + [" ".join(wrapped[max_lines - 1:])]
    return wrapped
# Shared caption-box geometry (pixels) used by every layout variant.
_SUB_MAX_BOX_W = 940
_SUB_PAD_X = 36
_SUB_PAD_Y = 22
_SUB_LINE_SPACING = 10
_SUB_BOTTOM_MARGIN = 230
_SUB_RADIUS = 20


def _box_size(font, lines, max_box_width, padding_x, padding_y, line_spacing):
    """Return (box_w, box_h) for *lines* rendered with *font* in a padded box.

    Empty *lines* falls back to zero-width/zero-height measurements so the
    caller still gets a (degenerate) box, matching historical behavior.
    """
    widths = [measure_text_width(font, line) for line in lines] or [0]
    heights = [measure_text_height(font, line) for line in lines] or [0]
    box_w = min(max_box_width, max(widths) + padding_x * 2)
    box_h = sum(heights) + line_spacing * (len(lines) - 1) + padding_y * 2
    return box_w, box_h


def _layout_dict(font, font_size, lines, box_w, box_h):
    """Assemble the layout dict consumed by render_subtitle_frame()."""
    return {
        "font": font,
        "font_size": font_size,
        "lines": lines,
        "box_w": box_w,
        "box_h": box_h,
        "padding_x": _SUB_PAD_X,
        "padding_y": _SUB_PAD_Y,
        "line_spacing": _SUB_LINE_SPACING,
        "bottom_margin": _SUB_BOTTOM_MARGIN,
        "radius": _SUB_RADIUS,
    }


def pick_layout(text: str):
    """Choose font, wrapped lines and box geometry for one subtitle frame.

    Tries decreasing TrueType font sizes until the caption box height fits
    within 520 px; if none fit (or the text wraps to nothing), falls back
    to the smallest size unconditionally. When no system TTF is available
    (FONT_PATH is None), uses PIL's built-in bitmap font with a narrower
    wrap and fewer lines.

    Returns a dict with keys: font, font_size, lines, box_w, box_h,
    padding_x, padding_y, line_spacing, bottom_margin, radius.
    """
    if FONT_PATH and os.path.exists(FONT_PATH):
        usable_w = _SUB_MAX_BOX_W - _SUB_PAD_X * 2
        for font_size in (42, 40, 38, 36, 34, 32):
            font = ImageFont.truetype(FONT_PATH, font_size)
            lines = wrap_text_by_pixels(
                text=text,
                font=font,
                max_width_px=usable_w,
                max_lines=5
            )
            if not lines:
                continue
            box_w, box_h = _box_size(
                font, lines, _SUB_MAX_BOX_W, _SUB_PAD_X, _SUB_PAD_Y, _SUB_LINE_SPACING
            )
            if box_h <= 520:  # keep the caption comfortably inside the 1920px frame
                return _layout_dict(font, font_size, lines, box_w, box_h)

        # Nothing fit (or the text was empty): use the smallest size anyway.
        font = ImageFont.truetype(FONT_PATH, 32)
        lines = wrap_text_by_pixels(
            text=text,
            font=font,
            max_width_px=usable_w,
            max_lines=5
        )
        box_w, box_h = _box_size(
            font, lines, _SUB_MAX_BOX_W, _SUB_PAD_X, _SUB_PAD_Y, _SUB_LINE_SPACING
        )
        return _layout_dict(font, 32, lines, box_w, box_h)

    # No system TTF found: bitmap-font fallback (narrower wrap, max 4 lines).
    font = ImageFont.load_default()
    lines = wrap_text_by_pixels(text=text, font=font, max_width_px=900, max_lines=4)
    box_w, box_h = _box_size(
        font, lines, _SUB_MAX_BOX_W, _SUB_PAD_X, _SUB_PAD_Y, _SUB_LINE_SPACING
    )
    return _layout_dict(font, 16, lines, box_w, box_h)
def render_subtitle_frame(text: str, image_path: str):
    """Render one caption onto a transparent 1080x1920 frame and save it.

    Draws a solid black rounded rectangle near the bottom of the frame
    with the wrapped caption text centred inside it, then writes the RGBA
    image to *image_path* for later use as an ffmpeg overlay input.
    """
    layout = pick_layout(text)
    font = layout["font"]

    frame = Image.new("RGBA", (FRAME_W, FRAME_H), (0, 0, 0, 0))
    painter = ImageDraw.Draw(frame)

    # Box is horizontally centred; its bottom sits bottom_margin px above
    # the bottom edge of the frame.
    box_left = int((FRAME_W - layout["box_w"]) / 2)
    box_top = int(FRAME_H - layout["bottom_margin"] - layout["box_h"])
    # Solid black background box
    painter.rounded_rectangle(
        [box_left, box_top, box_left + layout["box_w"], box_top + layout["box_h"]],
        radius=layout["radius"],
        fill=(0, 0, 0, 255)
    )

    # Each line is centred independently across the full frame width.
    cursor_y = box_top + layout["padding_y"]
    for line in layout["lines"]:
        line_w = measure_text_width(font, line)
        line_h = measure_text_height(font, line)
        painter.text(
            (int((FRAME_W - line_w) / 2), cursor_y),
            line,
            font=font,
            fill=(255, 255, 255, 255)
        )
        cursor_y += line_h + layout["line_spacing"]

    frame.save(image_path)
def build_subtitle_overlays(transcript, job_dir):
    """Render one subtitle PNG per transcript segment; return overlay specs.

    Each spec is {"path", "start", "end"} describing one timed ffmpeg
    overlay input. Segments whose text is empty after stripping are
    skipped (their index is still consumed, so filenames may have gaps).
    """
    specs = []
    for index, segment in enumerate(transcript):
        caption = segment["text"].strip()
        if not caption:
            continue
        frame_path = os.path.join(job_dir, f"sub_{index:03d}.png")
        render_subtitle_frame(caption, frame_path)
        specs.append({
            "path": frame_path,
            "start": float(segment["start"]),
            "end": float(segment["end"]),
        })
    return specs
# NOTE(review): the @app.route decorators appear to have been lost when this
# file was extracted; restored from the front-end, which is served here and
# POSTs to /generate.
@app.route("/")
def home():
    """Serve the single-page upload UI."""
    return render_template_string(HTML)
# NOTE(review): route decorator restored — the front-end loads /video/<uuid>.mp4.
@app.route("/video/<filename>")
def serve_video(filename):
    """Stream a rendered MP4 from OUTPUT_FOLDER.

    Responds 404 when the file does not exist. `conditional=True` enables
    Range/conditional responses so the <video> element can seek, and
    `Cache-Control: no-store` ensures a regenerated video with the same
    name is always re-fetched. send_from_directory itself rejects path
    traversal in *filename*.
    """
    file_path = os.path.join(OUTPUT_FOLDER, filename)
    if not os.path.exists(file_path):
        abort(404)
    response = send_from_directory(
        OUTPUT_FOLDER,
        filename,
        as_attachment=False,
        conditional=True
    )
    response.headers["Cache-Control"] = "no-store"
    return response
def _transcribe(audio_path):
    """Transcribe *audio_path* with faster-whisper.

    Returns (transcript, info): transcript is a list of
    {"start", "end", "text"} dicts with times rounded to 2 decimals and
    empty segments dropped; info is the model's TranscriptionInfo.
    """
    segments_iter, info = model.transcribe(
        audio_path,
        beam_size=1,
        vad_filter=True
    )
    transcript = []
    for segment in segments_iter:
        text = segment.text.strip()
        if not text:
            continue
        transcript.append({
            "start": round(segment.start, 2),
            "end": round(segment.end, 2),
            "text": text,
        })
    return transcript, info


def _build_ffmpeg_command(image_path, audio_path, overlay_specs, output_path):
    """Build the ffmpeg argv for a subtitled 1080x1920 H.264/AAC MP4.

    Input order: 0 = looping photo, 1..n = subtitle PNG overlays,
    n+1 = audio. Each overlay is enabled only during its segment's
    [start, end] window; -shortest trims the looping image to the
    audio's duration.
    """
    cmd = ["ffmpeg", "-y", "-loop", "1", "-framerate", "1", "-i", image_path]
    for spec in overlay_specs:
        cmd.extend(["-loop", "1", "-framerate", "1", "-i", spec["path"]])
    cmd.extend(["-i", audio_path])

    # Base video: cover-scale the photo to exactly 1080x1920 (crop overflow).
    scale = "[0:v]scale=1080:1920:force_original_aspect_ratio=increase,crop=1080:1920"
    if overlay_specs:
        parts = [scale + "[base]"]
        last_label = "[base]"
        for idx, spec in enumerate(overlay_specs):
            next_label = f"[v{idx}]"
            parts.append(
                f"{last_label}[{idx + 1}:v]overlay=0:0:"
                f"enable='between(t,{spec['start']:.2f},{spec['end']:.2f})'{next_label}"
            )
            last_label = next_label
        filter_complex = ";".join(parts)
        final_video_label = last_label
    else:
        filter_complex = scale + "[vout]"
        final_video_label = "[vout]"

    cmd.extend([
        "-filter_complex", filter_complex,
        "-map", final_video_label,
        "-map", f"{len(overlay_specs) + 1}:a:0",
        "-c:v", "libx264",
        "-preset", "ultrafast",
        "-crf", "20",
        "-pix_fmt", "yuv420p",
        "-r", "24",
        "-c:a", "aac",
        "-b:a", "128k",
        "-movflags", "+faststart",
        "-shortest",
        output_path,
    ])
    return cmd


# NOTE(review): route decorator restored — the front-end POSTs FormData here.
@app.route("/generate", methods=["POST"])
def generate():
    """Handle an image+audio upload: transcribe, burn subtitles, encode video.

    Returns JSON {video_url, transcript, full_text, language} on success,
    or {error, details} with HTTP 400 (bad input) / 500 (processing failure).
    """
    if "image" not in request.files or "audio" not in request.files:
        return jsonify({"error": "Missing files"}), 400
    image = request.files["image"]
    audio = request.files["audio"]
    if not image.filename or not audio.filename:
        return jsonify({"error": "Please upload both image and audio"}), 400

    # One UUID namespaces every artifact of this request: uploads, subtitle
    # PNGs and the output video.
    uid = str(uuid.uuid4())
    image_path = os.path.join(UPLOAD_FOLDER, f"{uid}_{secure_filename(image.filename)}")
    audio_path = os.path.join(UPLOAD_FOLDER, f"{uid}_{secure_filename(audio.filename)}")
    output_filename = f"{uid}.mp4"
    output_path = os.path.join(OUTPUT_FOLDER, output_filename)
    job_subtitle_dir = os.path.join(SUBTITLE_FOLDER, uid)
    os.makedirs(job_subtitle_dir, exist_ok=True)
    image.save(image_path)
    audio.save(audio_path)
    # TODO(review): uploaded files and subtitle PNGs are never deleted —
    # consider cleaning them up after a successful render.
    try:
        transcript, info = _transcribe(audio_path)
        overlay_specs = build_subtitle_overlays(transcript, job_subtitle_dir)
        cmd = _build_ffmpeg_command(image_path, audio_path, overlay_specs, output_path)
        subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True
        )
        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
            return jsonify({
                "error": "Video file not created",
                "details": "FFmpeg ran but output file is missing or empty."
            }), 500
        return jsonify({
            "video_url": f"/video/{output_filename}",
            "transcript": transcript,
            "full_text": " ".join(seg["text"] for seg in transcript).strip(),
            "language": getattr(info, "language", None)
        })
    except subprocess.CalledProcessError as e:
        return jsonify({
            "error": "FFmpeg failed",
            "details": e.stderr.decode("utf-8", errors="ignore")
        }), 500
    except Exception as e:
        return jsonify({
            "error": "Processing failed",
            "details": str(e)
        }), 500
if __name__ == "__main__":
    # NOTE(review): debug=True enables the Werkzeug debugger/reloader —
    # acceptable for a demo Space, but disable it for production use.
    app.run(host="0.0.0.0", port=7860, debug=True)