Spaces:

ulduldp
/

test-ffmpeg

Running

App Files Files Community

ulduldp committed on 15 days ago

Commit

62e66d3

verified ·

1 Parent(s): 9ca7553

Update app.py

Browse files

Files changed (1) hide show

app.py +173 -110

app.py CHANGED Viewed

@@ -2,7 +2,6 @@ from flask import Flask, render_template_string, request, jsonify
 import os
 import uuid
 import subprocess
-import textwrap
 import tempfile
 from werkzeug.utils import secure_filename
 from faster_whisper import WhisperModel
@@ -12,11 +11,9 @@ app = Flask(__name__)
 UPLOAD_FOLDER = "uploads"
 OUTPUT_FOLDER = "static/videos"
-SUBTITLE_FOLDER = "subtitles"
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 os.makedirs(OUTPUT_FOLDER, exist_ok=True)
-os.makedirs(SUBTITLE_FOLDER, exist_ok=True)
 # Load Whisper once
 model = WhisperModel(
@@ -105,7 +102,7 @@ video{
     margin-top:20px;
     border-radius:15px;
     display:none;
-    aspect-ratio: 9 / 16;
     background:#000;
     object-fit:cover;
 }
@@ -133,22 +130,30 @@ video{
 <body>
 <div class="container">
     <h1>Photo + Audio → Video</h1>
     <form id="form">
         <div class="upload-box">
             <label>Select Photo</label>
             <input type="file" id="image" name="image" accept="image/*" required>
             <img id="preview" class="preview">
             <label>Select Audio (mp3/wav)</label>
             <input type="file" name="audio" accept="audio/*" required>
         </div>
         <button type="submit">Generate Video</button>
     </form>
     <div id="loading">Generating Video...</div>
     <video id="video" controls></video>
     <div class="download-btn" id="downloadDiv">
         <a id="downloadBtn" download>Download Video</a>
     </div>
 </div>
 <script>
 const form = document.getElementById("form");
 const loading = document.getElementById("loading");
@@ -156,34 +161,45 @@ const video = document.getElementById("video");
 const downloadBtn = document.getElementById("downloadBtn");
 const downloadDiv = document.getElementById("downloadDiv");
 const preview = document.getElementById("preview");
 document.getElementById("image").addEventListener("change", function(e){
     const file = e.target.files[0];
     if(file){
         preview.src = URL.createObjectURL(file);
         preview.style.display = "block";
     }
 });
 form.addEventListener("submit", async (e)=>{
     e.preventDefault();
     loading.style.display = "block";
     video.style.display = "none";
     downloadDiv.style.display = "none";
     const formData = new FormData(form);
     try{
         const response = await fetch("/generate", {
             method: "POST",
             body: formData
         });
         const data = await response.json();
         loading.style.display = "none";
         if(data.video_url){
             video.src = data.video_url + "?t=" + new Date().getTime();
             video.style.display = "block";
             downloadBtn.href = data.video_url;
             downloadDiv.style.display = "block";
         }else{
             alert(data.error || "Failed");
         }
     }catch(err){
         loading.style.display = "none";
         alert("Server Error");
@@ -194,100 +210,97 @@ form.addEventListener("submit", async (e)=>{
 </html>
 """
-# Reel canvas size
 VIDEO_W = 1080
 VIDEO_H = 1920
-# Caption box settings
-BOX_MARGIN_BOTTOM = 180
-BOX_MAX_MARGIN_X = 80
-BOX_PADDING_X = 42
-BOX_PADDING_Y = 24
-BOX_RADIUS = 28
-FONT_SIZE = 54
-def find_font() -> str:
     candidates = [
         "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
         "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
-        "/System/Library/Fonts/Supplemental/Arial Bold.ttf",
         "C:\\Windows\\Fonts\\arialbd.ttf",
         "C:\\Windows\\Fonts\\arial.ttf",
     ]
     for p in candidates:
         if os.path.exists(p):
             return p
-    raise FileNotFoundError("No usable font found.")
-def wrap_text_by_width(text: str, font: ImageFont.FreeTypeFont, max_width: int) -> list[str]:
-    words = text.split()
-    if not words:
-        return [""]
     dummy = Image.new("RGBA", (10, 10))
     draw = ImageDraw.Draw(dummy)
     lines = []
     current = words[0]
     for word in words[1:]:
-        trial = current + " " + word
-        bbox = draw.textbbox((0, 0), trial, font=font)
         width = bbox[2] - bbox[0]
         if width <= max_width:
-            current = trial
         else:
             lines.append(current)
             current = word
     lines.append(current)
-    return lines
-def draw_rounded_rect(draw: ImageDraw.ImageDraw, xy, radius, fill):
-    # Pillow has rounded_rectangle in modern versions.
-    # This fallback keeps it safe if needed.
-    if hasattr(draw, "rounded_rectangle"):
-        draw.rounded_rectangle(xy, radius=radius, fill=fill)
-        return
-    x1, y1, x2, y2 = xy
-    draw.rectangle([x1 + radius, y1, x2 - radius, y2], fill=fill)
-    draw.rectangle([x1, y1 + radius, x2, y2 - radius], fill=fill)
-    draw.pieslice([x1, y1, x1 + 2 * radius, y1 + 2 * radius], 180, 270, fill=fill)
-    draw.pieslice([x2 - 2 * radius, y1, x2, y1 + 2 * radius], 270, 360, fill=fill)
-    draw.pieslice([x1, y2 - 2 * radius, x1 + 2 * radius, y2], 90, 180, fill=fill)
-    draw.pieslice([x2 - 2 * radius, y2 - 2 * radius, x2, y2], 0, 90, fill=fill)
-def make_caption_png(text: str, out_path: str):
     font = ImageFont.truetype(find_font(), FONT_SIZE)
-    max_text_width = VIDEO_W - (2 * BOX_MAX_MARGIN_X) - (2 * BOX_PADDING_X)
-    lines = wrap_text_by_width(text, font, max_text_width)
     measure_img = Image.new("RGBA", (10, 10))
     measure_draw = ImageDraw.Draw(measure_img)
-    line_metrics = []
     for line in lines:
         bbox = measure_draw.textbbox((0, 0), line, font=font)
         line_w = bbox[2] - bbox[0]
         line_h = bbox[3] - bbox[1]
-        line_metrics.append((line_w, line_h))
-    text_w = max((w for w, h in line_metrics), default=0)
-    text_h = sum(h for w, h in line_metrics)
-    line_gap = max(10, FONT_SIZE // 5)
-    if len(lines) > 1:
-        text_h += line_gap * (len(lines) - 1)
-    box_w = min(VIDEO_W - 2 * BOX_MAX_MARGIN_X, text_w + 2 * BOX_PADDING_X)
-    box_h = text_h + 2 * BOX_PADDING_Y
-    img = Image.new("RGBA", (VIDEO_W, VIDEO_H), (0, 0, 0, 0))
     draw = ImageDraw.Draw(img)
     x1 = (VIDEO_W - box_w) // 2
@@ -295,44 +308,64 @@ def make_caption_png(text: str, out_path: str):
     y1 = y2 - box_h
     x2 = x1 + box_w
-    # Black rounded background
-    draw_rounded_rect(draw, (x1, y1, x2, y2), BOX_RADIUS, (0, 0, 0, 215))
     # Glow layer
-    glow = Image.new("RGBA", (VIDEO_W, VIDEO_H), (0, 0, 0, 0))
     glow_draw = ImageDraw.Draw(glow)
     current_y = y1 + BOX_PADDING_Y
-    for i, line in enumerate(lines):
-        bbox = measure_draw.textbbox((0, 0), line, font=font)
-        line_w = bbox[2] - bbox[0]
-        line_h = bbox[3] - bbox[1]
         tx = (VIDEO_W - line_w) // 2
-        # soft glow
-        for dx, dy in [(-2, 0), (2, 0), (0, -2), (0, 2), (-2, -2), (-2, 2), (2, -2), (2, 2)]:
-            glow_draw.text((tx + dx, current_y + dy), line, font=font, fill=(255, 255, 255, 90))
         current_y += line_h + line_gap
     glow = glow.filter(ImageFilter.GaussianBlur(4))
     img = Image.alpha_composite(img, glow)
-    # Crisp white text over glow
     current_y = y1 + BOX_PADDING_Y
-    for i, line in enumerate(lines):
-        bbox = measure_draw.textbbox((0, 0), line, font=font)
-        line_w = bbox[2] - bbox[0]
-        line_h = bbox[3] - bbox[1]
         tx = (VIDEO_W - line_w) // 2
-        draw.text((tx, current_y), line, font=font, fill=(255, 255, 255, 255))
         current_y += line_h + line_gap
     img.save(out_path)
-def build_filter_complex(num_caption_inputs: int, transcript):
-    # Base vertical reel pipeline unchanged
     base = (
         "[0:v]"
         "scale=1080:1920:force_original_aspect_ratio=increase,"
@@ -345,20 +378,21 @@ def build_filter_complex(num_caption_inputs: int, transcript):
         "[base]"
     )
-    if num_caption_inputs == 0:
-        return base, "[base]"
     parts = [base]
     last = "[base]"
     for idx, seg in enumerate(transcript, start=2):
         start = f"{seg['start']:.2f}"
         end = f"{seg['end']:.2f}"
-        out_label = f"[v{idx}]"
         parts.append(
-            f"{last}[{idx}:v]overlay=0:0:enable='between(t,{start},{end})'{out_label}"
         )
-        last = out_label
     return ";".join(parts), last
@@ -370,6 +404,7 @@ def home():
 @app.route("/generate", methods=["POST"])
 def generate():
     if "image" not in request.files or "audio" not in request.files:
         return jsonify({"error": "Missing files"})
@@ -384,16 +419,28 @@ def generate():
     image_name = secure_filename(image.filename)
     audio_name = secure_filename(audio.filename)
-    image_path = os.path.join(UPLOAD_FOLDER, f"{uid}_{image_name}")
-    audio_path = os.path.join(UPLOAD_FOLDER, f"{uid}_{audio_name}")
     output_filename = f"{uid}.mp4"
-    output_path = os.path.join(OUTPUT_FOLDER, output_filename)
     image.save(image_path)
     audio.save(audio_path)
     try:
-        # Transcribe audio
         segments_iter, info = model.transcribe(
             audio_path,
             beam_size=5,
@@ -405,6 +452,7 @@ def generate():
         for segment in segments_iter:
             text = segment.text.strip()
             if not text:
                 continue
@@ -413,57 +461,69 @@ def generate():
                 "end": round(segment.end, 2),
                 "text": text
             })
             full_text_parts.append(text)
         with tempfile.TemporaryDirectory() as tmpdir:
             caption_paths = []
             for i, seg in enumerate(transcript, start=1):
-                cap_path = os.path.join(tmpdir, f"caption_{i:04d}.png")
-                make_caption_png(seg["text"], cap_path)
-                caption_paths.append(cap_path)
             cmd = [
                 "ffmpeg",
                 "-y",
                 "-loop", "1",
                 "-i", image_path,
-                "-i", audio_path,
             ]
             for p in caption_paths:
-                cmd += ["-loop", "1", "-i", p]
-            filter_complex, last_video_label = build_filter_complex(len(caption_paths), transcript)
-            if len(caption_paths) > 0:
-                filter_file = os.path.join(tmpdir, "filter_complex.txt")
-                with open(filter_file, "w", encoding="utf-8") as f:
-                    f.write(filter_complex)
-                cmd += [
-                    "-filter_complex_script", filter_file,
-                    "-map", last_video_label,
-                    "-map", "1:a?",
-                    "-c:v", "libx264",
-                    "-pix_fmt", "yuv420p",
-                    "-c:a", "aac",
-                    "-b:a", "192k",
-                    "-shortest",
-                    output_path
-                ]
-            else:
-                # No transcript found: still create the video without captions
-                cmd += [
-                    "-vf",
-                    "scale=1080:1920:force_original_aspect_ratio=increase,crop=1080:1920,zoompan=z='min(zoom+0.0008,1.10)':x='iw/2-(iw/zoom/2)':y='ih/2-(ih/zoom/2)':d=999999:s=1080x1920:fps=30",
-                    "-c:v", "libx264",
-                    "-pix_fmt", "yuv420p",
-                    "-c:a", "aac",
-                    "-b:a", "192k",
-                    "-shortest",
-                    output_path
-                ]
             subprocess.run(
                 cmd,
@@ -480,11 +540,14 @@ def generate():
         })
     except subprocess.CalledProcessError as e:
         return jsonify({
             "error": "FFmpeg failed",
             "details": e.stderr.decode("utf-8", errors="ignore")
         })
     except Exception as e:
         return jsonify({
             "error": "Processing failed",
             "details": str(e)

 import os
 import uuid
 import subprocess
 import tempfile
 from werkzeug.utils import secure_filename
 from faster_whisper import WhisperModel
 UPLOAD_FOLDER = "uploads"
 OUTPUT_FOLDER = "static/videos"
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 os.makedirs(OUTPUT_FOLDER, exist_ok=True)
 # Load Whisper once
 model = WhisperModel(
     margin-top:20px;
     border-radius:15px;
     display:none;
+    aspect-ratio:9/16;
     background:#000;
     object-fit:cover;
 }
 <body>
 <div class="container">
     <h1>Photo + Audio → Video</h1>
     <form id="form">
         <div class="upload-box">
             <label>Select Photo</label>
             <input type="file" id="image" name="image" accept="image/*" required>
             <img id="preview" class="preview">
             <label>Select Audio (mp3/wav)</label>
             <input type="file" name="audio" accept="audio/*" required>
         </div>
         <button type="submit">Generate Video</button>
     </form>
     <div id="loading">Generating Video...</div>
     <video id="video" controls></video>
     <div class="download-btn" id="downloadDiv">
         <a id="downloadBtn" download>Download Video</a>
     </div>
 </div>
 <script>
 const form = document.getElementById("form");
 const loading = document.getElementById("loading");
 const downloadBtn = document.getElementById("downloadBtn");
 const downloadDiv = document.getElementById("downloadDiv");
 const preview = document.getElementById("preview");
 document.getElementById("image").addEventListener("change", function(e){
     const file = e.target.files[0];
     if(file){
         preview.src = URL.createObjectURL(file);
         preview.style.display = "block";
     }
 });
 form.addEventListener("submit", async (e)=>{
     e.preventDefault();
     loading.style.display = "block";
     video.style.display = "none";
     downloadDiv.style.display = "none";
     const formData = new FormData(form);
     try{
         const response = await fetch("/generate", {
             method: "POST",
             body: formData
         });
         const data = await response.json();
         loading.style.display = "none";
         if(data.video_url){
             video.src = data.video_url + "?t=" + new Date().getTime();
             video.style.display = "block";
             downloadBtn.href = data.video_url;
             downloadDiv.style.display = "block";
         }else{
             alert(data.error || "Failed");
         }
     }catch(err){
         loading.style.display = "none";
         alert("Server Error");
 </html>
 """
+# Reel resolution
 VIDEO_W = 1080
 VIDEO_H = 1920
+# Caption styling
+FONT_SIZE = 58
+BOX_RADIUS = 32
+BOX_PADDING_X = 45
+BOX_PADDING_Y = 28
+BOX_MARGIN_BOTTOM = 190
+BOX_MARGIN_X = 80
+def find_font():
     candidates = [
         "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
         "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
         "C:\\Windows\\Fonts\\arialbd.ttf",
         "C:\\Windows\\Fonts\\arial.ttf",
     ]
     for p in candidates:
         if os.path.exists(p):
             return p
+    raise FileNotFoundError("No font found")
+def wrap_text(text, font, max_width):
     dummy = Image.new("RGBA", (10, 10))
     draw = ImageDraw.Draw(dummy)
+    words = text.split()
     lines = []
+    if not words:
+        return [""]
     current = words[0]
     for word in words[1:]:
+        test = current + " " + word
+        bbox = draw.textbbox((0, 0), test, font=font)
         width = bbox[2] - bbox[0]
         if width <= max_width:
+            current = test
         else:
             lines.append(current)
             current = word
     lines.append(current)
+    return lines
+def draw_rounded_rect(draw, xy, radius, fill):
+    draw.rounded_rectangle(xy, radius=radius, fill=fill)
+def make_caption_png(text, out_path):
     font = ImageFont.truetype(find_font(), FONT_SIZE)
+    max_text_width = VIDEO_W - (2 * BOX_MARGIN_X) - (2 * BOX_PADDING_X)
+    lines = wrap_text(text, font, max_text_width)
     measure_img = Image.new("RGBA", (10, 10))
     measure_draw = ImageDraw.Draw(measure_img)
+    line_data = []
     for line in lines:
         bbox = measure_draw.textbbox((0, 0), line, font=font)
         line_w = bbox[2] - bbox[0]
         line_h = bbox[3] - bbox[1]
+        line_data.append((line, line_w, line_h))
+    text_w = max([x[1] for x in line_data]) if line_data else 0
+    line_gap = 12
+    text_h = sum([x[2] for x in line_data]) + line_gap * (len(line_data)-1)
+    box_w = text_w + (BOX_PADDING_X * 2)
+    box_h = text_h + (BOX_PADDING_Y * 2)
+    img = Image.new("RGBA", (VIDEO_W, VIDEO_H), (0,0,0,0))
     draw = ImageDraw.Draw(img)
     x1 = (VIDEO_W - box_w) // 2
     y1 = y2 - box_h
     x2 = x1 + box_w
+    # Solid black rounded background
+    draw_rounded_rect(
+        draw,
+        (x1, y1, x2, y2),
+        BOX_RADIUS,
+        (0,0,0,240)
+    )
     # Glow layer
+    glow = Image.new("RGBA", (VIDEO_W, VIDEO_H), (0,0,0,0))
     glow_draw = ImageDraw.Draw(glow)
     current_y = y1 + BOX_PADDING_Y
+    for line, line_w, line_h in line_data:
         tx = (VIDEO_W - line_w) // 2
+        # glow
+        for dx, dy in [
+            (-3,0),(3,0),(0,-3),(0,3),
+            (-2,-2),(-2,2),(2,-2),(2,2)
+        ]:
+            glow_draw.text(
+                (tx+dx, current_y+dy),
+                line,
+                font=font,
+                fill=(255,255,255,90)
+            )
         current_y += line_h + line_gap
     glow = glow.filter(ImageFilter.GaussianBlur(4))
     img = Image.alpha_composite(img, glow)
+    # Main crisp white text
+    draw = ImageDraw.Draw(img)
     current_y = y1 + BOX_PADDING_Y
+    for line, line_w, line_h in line_data:
         tx = (VIDEO_W - line_w) // 2
+        draw.text(
+            (tx, current_y),
+            line,
+            font=font,
+            fill=(255,255,255,255),
+            stroke_width=4,
+            stroke_fill=(0,0,0,255)
+        )
         current_y += line_h + line_gap
     img.save(out_path)
+def build_filter_complex(transcript):
     base = (
         "[0:v]"
         "scale=1080:1920:force_original_aspect_ratio=increase,"
         "[base]"
     )
     parts = [base]
     last = "[base]"
     for idx, seg in enumerate(transcript, start=2):
         start = f"{seg['start']:.2f}"
         end = f"{seg['end']:.2f}"
+        out = f"[v{idx}]"
         parts.append(
+            f"{last}[{idx}:v]overlay=0:0:enable='between(t,{start},{end})'{out}"
         )
+        last = out
     return ";".join(parts), last
 @app.route("/generate", methods=["POST"])
 def generate():
     if "image" not in request.files or "audio" not in request.files:
         return jsonify({"error": "Missing files"})
     image_name = secure_filename(image.filename)
     audio_name = secure_filename(audio.filename)
+    image_path = os.path.join(
+        UPLOAD_FOLDER,
+        f"{uid}_{image_name}"
+    )
+    audio_path = os.path.join(
+        UPLOAD_FOLDER,
+        f"{uid}_{audio_name}"
+    )
     output_filename = f"{uid}.mp4"
+    output_path = os.path.join(
+        OUTPUT_FOLDER,
+        output_filename
+    )
     image.save(image_path)
     audio.save(audio_path)
     try:
+        # Transcribe
         segments_iter, info = model.transcribe(
             audio_path,
             beam_size=5,
         for segment in segments_iter:
             text = segment.text.strip()
             if not text:
                 continue
                 "end": round(segment.end, 2),
                 "text": text
             })
             full_text_parts.append(text)
         with tempfile.TemporaryDirectory() as tmpdir:
             caption_paths = []
             for i, seg in enumerate(transcript, start=1):
+                caption_path = os.path.join(
+                    tmpdir,
+                    f"caption_{i:04d}.png"
+                )
+                make_caption_png(
+                    seg["text"],
+                    caption_path
+                )
+                caption_paths.append(caption_path)
             cmd = [
                 "ffmpeg",
                 "-y",
                 "-loop", "1",
                 "-i", image_path,
+                "-i", audio_path
             ]
             for p in caption_paths:
+                cmd += [
+                    "-loop", "1",
+                    "-i", p
+                ]
+            filter_complex, last_video = build_filter_complex(transcript)
+            filter_script = os.path.join(
+                tmpdir,
+                "filter.txt"
+            )
+            with open(filter_script, "w", encoding="utf-8") as f:
+                f.write(filter_complex)
+            cmd += [
+                "-filter_complex_script", filter_script,
+                "-map", last_video,
+                "-map", "1:a?",
+                "-c:v", "libx264",
+                "-pix_fmt", "yuv420p",
+                "-c:a", "aac",
+                "-b:a", "192k",
+                "-shortest",
+                output_path
+            ]
             subprocess.run(
                 cmd,
         })
     except subprocess.CalledProcessError as e:
         return jsonify({
             "error": "FFmpeg failed",
             "details": e.stderr.decode("utf-8", errors="ignore")
         })
     except Exception as e:
         return jsonify({
             "error": "Processing failed",
             "details": str(e)