Spaces:

ulduldp
/

test-ffmpeg

Running

App Files Files Community

ulduldp committed 8 days ago

Commit

da9ee87

verified ·

1 Parent(s): 62e66d3

Update app.py

Browse files

Files changed (1) hide show

app.py +136 -230

app.py CHANGED Viewed

@@ -2,22 +2,23 @@ from flask import Flask, render_template_string, request, jsonify
 import os
 import uuid
 import subprocess
-import tempfile
 from werkzeug.utils import secure_filename
 from faster_whisper import WhisperModel
-from PIL import Image, ImageDraw, ImageFont, ImageFilter
 app = Flask(__name__)
 UPLOAD_FOLDER = "uploads"
 OUTPUT_FOLDER = "static/videos"
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 os.makedirs(OUTPUT_FOLDER, exist_ok=True)
-# Load Whisper once
 model = WhisperModel(
-    "base",
     device="cpu",
     compute_type="int8"
 )
@@ -29,6 +30,7 @@ HTML = """
 <meta charset="UTF-8">
 <meta name="viewport" content="width=device-width, initial-scale=1.0">
 <title>Photo + Audio To Video</title>
 <style>
 *{
     margin:0;
@@ -36,6 +38,7 @@ HTML = """
     box-sizing:border-box;
     font-family:Arial;
 }
 body{
     background:#0f0f0f;
     color:white;
@@ -45,6 +48,7 @@ body{
     align-items:center;
     padding:20px;
 }
 .container{
     width:100%;
     max-width:500px;
@@ -53,22 +57,26 @@ body{
     padding:25px;
     box-shadow:0 0 20px rgba(0,0,0,0.4);
 }
 h1{
     text-align:center;
     margin-bottom:25px;
     font-size:28px;
 }
 .upload-box{
     border:2px dashed #444;
     padding:20px;
     border-radius:15px;
     margin-bottom:20px;
 }
 label{
     display:block;
     margin-bottom:8px;
     color:#ccc;
 }
 input{
     width:100%;
     padding:12px;
@@ -78,6 +86,7 @@ input{
     color:white;
     margin-bottom:15px;
 }
 button{
     width:100%;
     padding:15px;
@@ -89,14 +98,17 @@ button{
     cursor:pointer;
     transition:0.3s;
 }
 button:hover{
     opacity:0.9;
 }
 #loading{
     display:none;
     text-align:center;
     margin-top:20px;
 }
 video{
     width:100%;
     margin-top:20px;
@@ -106,11 +118,13 @@ video{
     background:#000;
     object-fit:cover;
 }
 .download-btn{
     display:none;
     margin-top:15px;
     text-align:center;
 }
 .download-btn a{
     display:inline-block;
     background:#22c55e;
@@ -119,6 +133,7 @@ video{
     padding:12px 20px;
     border-radius:10px;
 }
 .preview{
     margin-top:15px;
     width:100%;
@@ -127,12 +142,17 @@ video{
 }
 </style>
 </head>
 <body>
 <div class="container">
     <h1>Photo + Audio → Video</h1>
     <form id="form">
         <div class="upload-box">
             <label>Select Photo</label>
             <input type="file" id="image" name="image" accept="image/*" required>
@@ -140,9 +160,11 @@ video{
             <label>Select Audio (mp3/wav)</label>
             <input type="file" name="audio" accept="audio/*" required>
         </div>
         <button type="submit">Generate Video</button>
     </form>
     <div id="loading">Generating Video...</div>
@@ -152,6 +174,7 @@ video{
     <div class="download-btn" id="downloadDiv">
         <a id="downloadBtn" download>Download Video</a>
     </div>
 </div>
 <script>
@@ -163,6 +186,7 @@ const downloadDiv = document.getElementById("downloadDiv");
 const preview = document.getElementById("preview");
 document.getElementById("image").addEventListener("change", function(e){
     const file = e.target.files[0];
     if(file){
@@ -172,6 +196,7 @@ document.getElementById("image").addEventListener("change", function(e){
 });
 form.addEventListener("submit", async (e)=>{
     e.preventDefault();
     loading.style.display = "block";
@@ -181,9 +206,10 @@ form.addEventListener("submit", async (e)=>{
     const formData = new FormData(form);
     try{
         const response = await fetch("/generate", {
-            method: "POST",
-            body: formData
         });
         const data = await response.json();
@@ -191,228 +217,122 @@ form.addEventListener("submit", async (e)=>{
         loading.style.display = "none";
         if(data.video_url){
             video.src = data.video_url + "?t=" + new Date().getTime();
             video.style.display = "block";
             downloadBtn.href = data.video_url;
             downloadDiv.style.display = "block";
         }else{
             alert(data.error || "Failed");
         }
     }catch(err){
         loading.style.display = "none";
         alert("Server Error");
     }
 });
 </script>
 </body>
 </html>
 """
-# Reel resolution
-VIDEO_W = 1080
-VIDEO_H = 1920
-# Caption styling
-FONT_SIZE = 58
-BOX_RADIUS = 32
-BOX_PADDING_X = 45
-BOX_PADDING_Y = 28
-BOX_MARGIN_BOTTOM = 190
-BOX_MARGIN_X = 80
def find_font():
    """Return the path of the first known TrueType font present on disk.

    The candidates are probed in priority order (DejaVu Sans Bold, DejaVu
    Sans, then the Windows Arial variants).

    Raises:
        FileNotFoundError: if none of the candidate paths exists.
    """
    search_order = (
        "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        "C:\\Windows\\Fonts\\arialbd.ttf",
        "C:\\Windows\\Fonts\\arial.ttf",
    )
    # next() with a default gives "first hit or None" without a manual loop.
    match = next(
        (path for path in search_order if os.path.exists(path)),
        None,
    )
    if match is None:
        raise FileNotFoundError("No font found")
    return match
def wrap_text(text, font, max_width):
    """Greedily split *text* into lines no wider than *max_width* pixels.

    Words are appended to the current line while the rendered width,
    measured with ``ImageDraw.textbbox`` for *font*, still fits. A word is
    never split, so a single word wider than *max_width* gets its own line.

    Returns:
        A list of line strings; ``[""]`` when *text* contains no words.
    """
    # A throwaway canvas is only needed as a measuring surface.
    probe = ImageDraw.Draw(Image.new("RGBA", (10, 10)))

    tokens = text.split()
    if not tokens:
        return [""]

    wrapped = []
    pending = tokens[0]
    for token in tokens[1:]:
        candidate = f"{pending} {token}"
        left, _top, right, _bottom = probe.textbbox(
            (0, 0), candidate, font=font
        )
        fits = (right - left) <= max_width
        if fits:
            pending = candidate
            continue
        # The candidate overflows: close the current line, start a new one.
        wrapped.append(pending)
        pending = token

    wrapped.append(pending)
    return wrapped
def draw_rounded_rect(draw, xy, radius, fill):
    """Draw a filled rounded rectangle on *draw*.

    Thin wrapper that forwards to ``draw.rounded_rectangle`` with *xy*
    positional and *radius* / *fill* as keyword arguments.
    """
    shape_options = {"radius": radius, "fill": fill}
    draw.rounded_rectangle(xy, **shape_options)
def make_caption_png(text, out_path):
    """Render *text* as a caption overlay and save it to *out_path*.

    The result is a transparent VIDEO_W x VIDEO_H RGBA image. It holds a
    near-opaque black rounded box, horizontally centred with its bottom
    edge BOX_MARGIN_BOTTOM pixels above the bottom of the frame. Inside
    the box is the wrapped text: a blurred white glow pass first, then
    crisp white text with a black stroke on top.
    """
    line_gap = 12
    glow_offsets = (
        (-3, 0), (3, 0), (0, -3), (0, 3),
        (-2, -2), (-2, 2), (2, -2), (2, 2),
    )
    canvas_size = (VIDEO_W, VIDEO_H)
    transparent = (0, 0, 0, 0)

    font = ImageFont.truetype(find_font(), FONT_SIZE)
    usable_width = VIDEO_W - (2 * BOX_MARGIN_X) - (2 * BOX_PADDING_X)

    # Measure every wrapped line once; both drawing passes reuse the sizes.
    ruler = ImageDraw.Draw(Image.new("RGBA", (10, 10)))
    measured = []
    for row in wrap_text(text, font, usable_width):
        left, top, right, bottom = ruler.textbbox((0, 0), row, font=font)
        measured.append((row, right - left, bottom - top))

    widest = max((row_w for _, row_w, _ in measured), default=0)
    stacked = (
        sum(row_h for _, _, row_h in measured)
        + line_gap * (len(measured) - 1)
    )
    box_w = widest + (BOX_PADDING_X * 2)
    box_h = stacked + (BOX_PADDING_Y * 2)

    # Box geometry: centred horizontally, anchored from the bottom margin.
    box_left = (VIDEO_W - box_w) // 2
    box_bottom = VIDEO_H - BOX_MARGIN_BOTTOM
    box_top = box_bottom - box_h
    box_right = box_left + box_w

    # Pre-compute where each line goes so glow and text share positions.
    placements = []
    cursor_y = box_top + BOX_PADDING_Y
    for row, row_w, row_h in measured:
        placements.append((row, (VIDEO_W - row_w) // 2, cursor_y))
        cursor_y += row_h + line_gap

    # Solid black rounded background.
    frame = Image.new("RGBA", canvas_size, transparent)
    draw_rounded_rect(
        ImageDraw.Draw(frame),
        (box_left, box_top, box_right, box_bottom),
        BOX_RADIUS,
        (0, 0, 0, 240),
    )

    # Glow layer: offset copies of the text, blurred, then composited.
    halo = Image.new("RGBA", canvas_size, transparent)
    halo_pen = ImageDraw.Draw(halo)
    for row, x, y in placements:
        for dx, dy in glow_offsets:
            halo_pen.text(
                (x + dx, y + dy),
                row,
                font=font,
                fill=(255, 255, 255, 90),
            )
    halo = halo.filter(ImageFilter.GaussianBlur(4))
    frame = Image.alpha_composite(frame, halo)

    # Main crisp white text on top of the glow.
    pen = ImageDraw.Draw(frame)
    for row, x, y in placements:
        pen.text(
            (x, y),
            row,
            font=font,
            fill=(255, 255, 255, 255),
            stroke_width=4,
            stroke_fill=(0, 0, 0, 255),
        )

    frame.save(out_path)
def build_filter_complex(transcript):
    """Build the FFmpeg filter graph that overlays timed caption inputs.

    The first chain scales and crops input 0 to 1080x1920, applies a slow
    zoom and labels the result ``[base]``. Each transcript segment then
    adds one ``overlay`` stage that is enabled only between the segment's
    ``start`` and ``end`` times. Caption inputs are numbered from 2
    upwards, in transcript order.

    Args:
        transcript: iterable of mappings with numeric ``start`` and
            ``end`` keys (seconds).

    Returns:
        ``(graph, last_label)``: the chains joined by ``;`` and the label
        of the final video output (``"[base]"`` when there are no
        segments).
    """
    background = "".join(
        [
            "[0:v]",
            "scale=1080:1920:force_original_aspect_ratio=increase,",
            "crop=1080:1920,",
            "zoompan=z='min(zoom+0.0008,1.10)':",
            "x='iw/2-(iw/zoom/2)':",
            "y='ih/2-(ih/zoom/2)':",
            "d=999999:s=1080x1920:fps=30,",
            "format=rgba",
            "[base]",
        ]
    )

    chain = [background]
    prev_label = "[base]"
    for input_no, cue in enumerate(transcript, start=2):
        next_label = f"[v{input_no}]"
        window = f"between(t,{cue['start']:.2f},{cue['end']:.2f})"
        chain.append(
            f"{prev_label}[{input_no}:v]overlay=0:0:enable='{window}'{next_label}"
        )
        # Each overlay feeds the next one, forming a linear chain.
        prev_label = next_label

    return ";".join(chain), prev_label
 @app.route("/")
 def home():
     """Serve the upload page rendered from the inline HTML template."""
     page = render_template_string(HTML)
     return page
 @app.route("/generate", methods=["POST"])
 def generate():
     if "image" not in request.files or "audio" not in request.files:
-        return jsonify({"error": "Missing files"})
     image = request.files["image"]
     audio = request.files["audio"]
     if not image.filename or not audio.filename:
-        return jsonify({"error": "Please upload both image and audio"})
     uid = str(uuid.uuid4())
@@ -436,14 +356,20 @@ def generate():
         output_filename
     )
     image.save(image_path)
     audio.save(audio_path)
     try:
-        # Transcribe
         segments_iter, info = model.transcribe(
             audio_path,
-            beam_size=5,
             vad_filter=True
         )
@@ -451,6 +377,7 @@ def generate():
         full_text_parts = []
         for segment in segments_iter:
             text = segment.text.strip()
             if not text:
@@ -464,73 +391,50 @@ def generate():
             full_text_parts.append(text)
-        with tempfile.TemporaryDirectory() as tmpdir:
-            caption_paths = []
-            for i, seg in enumerate(transcript, start=1):
-                caption_path = os.path.join(
-                    tmpdir,
-                    f"caption_{i:04d}.png"
-                )
-                make_caption_png(
-                    seg["text"],
-                    caption_path
-                )
-                caption_paths.append(caption_path)
-            cmd = [
-                "ffmpeg",
-                "-y",
-                "-loop", "1",
-                "-i", image_path,
-                "-i", audio_path
-            ]
-            for p in caption_paths:
-                cmd += [
-                    "-loop", "1",
-                    "-i", p
-                ]
-            filter_complex, last_video = build_filter_complex(transcript)
-            filter_script = os.path.join(
-                tmpdir,
-                "filter.txt"
-            )
-            with open(filter_script, "w", encoding="utf-8") as f:
-                f.write(filter_complex)
-            cmd += [
-                "-filter_complex_script", filter_script,
-                "-map", last_video,
-                "-map", "1:a?",
-                "-c:v", "libx264",
-                "-pix_fmt", "yuv420p",
-                "-c:a", "aac",
-                "-b:a", "192k",
-                "-shortest",
-                output_path
-            ]
-            subprocess.run(
-                cmd,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                check=True
-            )
         return jsonify({
             "video_url": f"/static/videos/{output_filename}",
@@ -542,17 +446,19 @@ def generate():
     except subprocess.CalledProcessError as e:
         return jsonify({
-            "error": "FFmpeg failed",
-            "details": e.stderr.decode("utf-8", errors="ignore")
         })
     except Exception as e:
         return jsonify({
-            "error": "Processing failed",
             "details": str(e)
         })
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=7860)

 import os
 import uuid
 import subprocess
+import textwrap
 from werkzeug.utils import secure_filename
 from faster_whisper import WhisperModel
 app = Flask(__name__)
 UPLOAD_FOLDER = "uploads"
 OUTPUT_FOLDER = "static/videos"
+SUBTITLE_FOLDER = "subtitles"
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 os.makedirs(OUTPUT_FOLDER, exist_ok=True)
+os.makedirs(SUBTITLE_FOLDER, exist_ok=True)
+# Smallest & fastest Whisper model
 model = WhisperModel(
+    "tiny",
     device="cpu",
     compute_type="int8"
 )
 <meta charset="UTF-8">
 <meta name="viewport" content="width=device-width, initial-scale=1.0">
 <title>Photo + Audio To Video</title>
 <style>
 *{
     margin:0;
     box-sizing:border-box;
     font-family:Arial;
 }
 body{
     background:#0f0f0f;
     color:white;
     align-items:center;
     padding:20px;
 }
 .container{
     width:100%;
     max-width:500px;
     padding:25px;
     box-shadow:0 0 20px rgba(0,0,0,0.4);
 }
 h1{
     text-align:center;
     margin-bottom:25px;
     font-size:28px;
 }
 .upload-box{
     border:2px dashed #444;
     padding:20px;
     border-radius:15px;
     margin-bottom:20px;
 }
 label{
     display:block;
     margin-bottom:8px;
     color:#ccc;
 }
 input{
     width:100%;
     padding:12px;
     color:white;
     margin-bottom:15px;
 }
 button{
     width:100%;
     padding:15px;
     cursor:pointer;
     transition:0.3s;
 }
 button:hover{
     opacity:0.9;
 }
 #loading{
     display:none;
     text-align:center;
     margin-top:20px;
 }
 video{
     width:100%;
     margin-top:20px;
     background:#000;
     object-fit:cover;
 }
 .download-btn{
     display:none;
     margin-top:15px;
     text-align:center;
 }
 .download-btn a{
     display:inline-block;
     background:#22c55e;
     padding:12px 20px;
     border-radius:10px;
 }
 .preview{
     margin-top:15px;
     width:100%;
 }
 </style>
 </head>
 <body>
 <div class="container">
     <h1>Photo + Audio → Video</h1>
     <form id="form">
         <div class="upload-box">
             <label>Select Photo</label>
             <input type="file" id="image" name="image" accept="image/*" required>
             <label>Select Audio (mp3/wav)</label>
             <input type="file" name="audio" accept="audio/*" required>
         </div>
         <button type="submit">Generate Video</button>
     </form>
     <div id="loading">Generating Video...</div>
     <div class="download-btn" id="downloadDiv">
         <a id="downloadBtn" download>Download Video</a>
     </div>
 </div>
 <script>
 const preview = document.getElementById("preview");
 document.getElementById("image").addEventListener("change", function(e){
     const file = e.target.files[0];
     if(file){
 });
 form.addEventListener("submit", async (e)=>{
     e.preventDefault();
     loading.style.display = "block";
     const formData = new FormData(form);
     try{
         const response = await fetch("/generate", {
+            method:"POST",
+            body:formData
         });
         const data = await response.json();
         loading.style.display = "none";
         if(data.video_url){
             video.src = data.video_url + "?t=" + new Date().getTime();
             video.style.display = "block";
             downloadBtn.href = data.video_url;
             downloadDiv.style.display = "block";
         }else{
             alert(data.error || "Failed");
         }
     }catch(err){
         loading.style.display = "none";
         alert("Server Error");
     }
 });
 </script>
 </body>
 </html>
 """
def ass_time(seconds: float) -> str:
    """Format a time offset in seconds as an ASS timestamp ``H:MM:SS.cc``.

    Negative inputs are clamped to zero.

    The value is rounded to whole centiseconds *before* it is split into
    hours, minutes and seconds. Formatting the seconds field on its own
    would let a value such as 59.999 round up to "60.00" without carrying
    into the minutes, which yields an out-of-range timestamp.

    Args:
        seconds: offset from the start of the media, in seconds.

    Returns:
        The timestamp string, e.g. ``"0:01:00.00"``.
    """
    total_cs = max(0, int(round(seconds * 100)))
    total_seconds, cs = divmod(total_cs, 100)
    total_minutes, s = divmod(total_seconds, 60)
    h, m = divmod(total_minutes, 60)
    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"
def ass_escape(text: str) -> str:
    """Escape *text* for use as the text of an ASS Dialogue line.

    Backslashes are doubled, curly braces are prefixed with a backslash,
    and newline characters are replaced by a single space.
    """
    # One translate pass is equivalent to the chained replacements: no
    # replacement output is the input of a later replacement.
    replacements = {
        "\\": "\\\\",
        "{": "\\{",
        "}": "\\}",
        "\n": " ",
    }
    return text.translate(str.maketrans(replacements))
def escape_ffmpeg_path(path: str) -> str:
    """Backslash-escape *path* for embedding in an FFmpeg filter argument.

    Backslashes are doubled, and colons and single quotes are each
    prefixed with a backslash. All other characters pass through
    unchanged.
    """
    escaped = {
        "\\": "\\\\",
        ":": "\\:",
        "'": "\\'",
    }
    return "".join(escaped.get(ch, ch) for ch in path)
def make_ass_subtitles(segments, ass_path):
    """Write an ASS subtitle file with one Dialogue event per segment.

    The script is authored for a 1080x1920 canvas and uses a single
    ``Default`` style. Segments whose text is empty after stripping are
    skipped.

    Each caption is wrapped to at most 22 characters per line and the
    lines are joined with the ASS hard line break. Every line is escaped
    individually *before* joining. Escaping the already-wrapped string
    would turn the newlines into spaces and silently discard the wrap,
    and the header's ``WrapStyle: 2`` means the renderer does not wrap
    lines on its own.

    Args:
        segments: iterable of mappings with ``start`` and ``end``
            (seconds) and ``text`` keys.
        ass_path: destination path; written as UTF-8.
    """
    header = """[Script Info]
ScriptType: v4.00+
PlayResX: 1080
PlayResY: 1920
ScaledBorderAndShadow: yes
WrapStyle: 2
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,DejaVu Sans,60,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,1,0,0,0,100,100,0,0,3,0,0,2,80,80,140,1
[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""
    # Per-event override tags, emitted verbatim in front of the text.
    override = r"{\bord0\shad0\1c&HFFFFFF&\3c&H000000&\4c&H000000&}"

    lines = [header]
    for seg in segments:
        start = ass_time(seg["start"])
        end = ass_time(seg["end"])
        text = seg["text"].strip()
        if not text:
            continue

        # Wrap first, escape each line, then join with \N so the escape
        # step cannot erase the line breaks produced by the wrap.
        wrapped = r"\N".join(
            ass_escape(part) for part in textwrap.wrap(text, width=22)
        )

        lines.append(
            f"Dialogue: 0,{start},{end},Default,,0,0,0,,"
            f"{override}{wrapped}\n"
        )

    with open(ass_path, "w", encoding="utf-8") as f:
        f.writelines(lines)
 @app.route("/")
 def home():
     """Serve the upload page rendered from the inline HTML template."""
     page = render_template_string(HTML)
     return page
 @app.route("/generate", methods=["POST"])
 def generate():
     if "image" not in request.files or "audio" not in request.files:
+        return jsonify({"error":"Missing files"})
     image = request.files["image"]
     audio = request.files["audio"]
     if not image.filename or not audio.filename:
+        return jsonify({"error":"Please upload both image and audio"})
     uid = str(uuid.uuid4())
         output_filename
     )
+    ass_path = os.path.join(
+        SUBTITLE_FOLDER,
+        f"{uid}.ass"
+    )
     image.save(image_path)
     audio.save(audio_path)
     try:
+        # Faster transcription
         segments_iter, info = model.transcribe(
             audio_path,
+            beam_size=1,
             vad_filter=True
         )
         full_text_parts = []
         for segment in segments_iter:
             text = segment.text.strip()
             if not text:
             full_text_parts.append(text)
+        make_ass_subtitles(transcript, ass_path)
+        safe_ass_path = escape_ffmpeg_path(
+            os.path.abspath(ass_path)
+        )
+        # Low CPU video processing
+        vf = (
+            "scale=1080:1920:force_original_aspect_ratio=increase,"
+            "crop=1080:1920,"
+            f"subtitles='{safe_ass_path}'"
+        )
+        cmd = [
+            "ffmpeg",
+            "-y",
+            "-loop", "1",
+            "-i", image_path,
+            "-i", audio_path,
+            "-vf", vf,
+            "-c:v", "libx264",
+            "-preset", "ultrafast",
+            "-pix_fmt", "yuv420p",
+            "-c:a", "aac",
+            "-b:a", "128k",
+            "-shortest",
+            output_path
+        ]
+        subprocess.run(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            check=True
+        )
         return jsonify({
             "video_url": f"/static/videos/{output_filename}",
     except subprocess.CalledProcessError as e:
         return jsonify({
+            "error":"FFmpeg failed",
+            "details": e.stderr.decode(
+                "utf-8",
+                errors="ignore"
+            )
         })
     except Exception as e:
         return jsonify({
+            "error":"Processing failed",
             "details": str(e)
         })
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=7860)