Spaces:

leicam
/

EditorCortes

Running

App Files Community

leicam committed on Oct 9, 2025

Commit

5ad5f13

verified ·

1 Parent(s): a304d68

Update app.py

Browse files

Files changed (1) hide show

app.py +106 -62

app.py CHANGED Viewed

@@ -10,10 +10,11 @@ import whisper
 import subprocess
 from pathlib import Path
 from dataclasses import dataclass
-from typing import List, Tuple, Optional
 import tempfile
 import os
 import shutil
 # ======================= DATACLASSES =======================
@@ -38,6 +39,40 @@ class FaceBox:
     center_y: int
     confidence: float = 1.0
 # ======================= FACE TRACKING =======================
 class FaceTracker:
@@ -147,6 +182,7 @@ def extract_audio_wav(input_video: str, sr: int = 16000) -> str:
     """Extrai o áudio para WAV mono 16kHz para robustez da transcrição."""
     fd, tmp_path = tempfile.mkstemp(suffix=".wav")
     os.close(fd)
     cmd = [
         "ffmpeg", "-y", "-i", input_video,
         "-vn", "-ac", "1", "-ar", str(sr), "-f", "wav", tmp_path
@@ -155,31 +191,41 @@ def extract_audio_wav(input_video: str, sr: int = 16000) -> str:
     return tmp_path
 def transcribe(video_file: str, model_size: str = "small") -> List[Segment]:
-    print(f"Carregando modelo Whisper: {model_size}")
-    model = whisper.load_model(model_size)
-    print(f"Extraindo áudio (WAV) de: {video_file}")
-    audio_wav = extract_audio_wav(video_file, sr=16000)
-    print("Transcrevendo WAV…")
     result = model.transcribe(
         audio_wav,
         language="pt",
         verbose=False,
         task="transcribe",
-        temperature=0
     )
-    segments = []
-    for seg in result["segments"]:
-        segments.append(Segment(
-            start=seg["start"],
-            end=seg["end"],
-            text=seg["text"].strip()
-        ))
-    print(f"Transcrição completa: {len(segments)} segmentos")
-    # limpa o wav temporário
     try:
         Path(audio_wav).unlink(missing_ok=True)
     except Exception:
@@ -189,21 +235,23 @@ def transcribe(video_file: str, model_size: str = "small") -> List[Segment]:
 # ======================= PROCESSAMENTO DE VÍDEO =======================
 def extract_video_segment(input_video: str, output_video: str, start_time: float, end_time: float) -> bool:
-    duration = end_time - start_time
     cmd = [
         "ffmpeg", "-y", "-ss", str(start_time), "-i", input_video,
         "-t", str(duration),
         "-c:v", "libx264",
-        "-c:a", "aac",  # pode manter aac para compatibilidade ampla
-        "-strict", "experimental",
         output_video
     ]
     try:
         subprocess.run(cmd, check=True, capture_output=True)
         return True
     except subprocess.CalledProcessError as e:
-        print(f"Erro ao extrair: {e}")
         return False
 def apply_smart_crop_to_video(input_path: str, output_path: str, target_width: int,
@@ -211,19 +259,16 @@ def apply_smart_crop_to_video(input_path: str, output_path: str, target_width: i
     """Calcula o melhor crop com rastreamento facial e aplica o crop com FFmpeg preservando o áudio."""
     tracker = FaceTracker()
     cap = cv2.VideoCapture(input_path)
     if not cap.isOpened():
-        print(f"Erro ao abrir: {input_path}")
         return False
     frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
     frame_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
     frame_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-    # Amostragem para suavização
     sample_positions = []
     frame_indices = np.linspace(0, frame_count - 1, min(sample_frames, max(1, frame_count)), dtype=int)
     for idx in frame_indices:
         cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
         ret, frame = cap.read()
@@ -232,7 +277,6 @@ def apply_smart_crop_to_video(input_path: str, output_path: str, target_width: i
             sample_positions.append(crop_coords)
     cap.release()
-    # Posição média (suavizada)
     if sample_positions:
         avg_x = int(np.median([p[0] for p in sample_positions]))
         avg_y = int(np.median([p[1] for p in sample_positions]))
@@ -240,7 +284,6 @@ def apply_smart_crop_to_video(input_path: str, output_path: str, target_width: i
         crop_h = sample_positions[0][3]
         final_crop = (avg_x, avg_y, crop_w, crop_h)
     else:
-        # Fallback central
         target_ar = target_width / target_height
         frame_ar = frame_w / frame_h
         if target_ar < frame_ar:
@@ -253,23 +296,23 @@ def apply_smart_crop_to_video(input_path: str, output_path: str, target_width: i
             final_crop = (0, (frame_h - crop_h) // 2, crop_w, crop_h)
     x, y, w, h = final_crop
-    print(f"Crop final: x={x}, y={y}, w={w}, h={h} -> {target_width}x{target_height}")
-    # Aplica o crop com FFmpeg preservando o áudio
     vf = f"crop={w}:{h}:{x}:{y},scale={target_width}:{target_height}:flags=lanczos"
     cmd = [
         "ffmpeg", "-y", "-i", input_path,
         "-vf", vf,
         "-c:v", "libx264", "-preset", "veryfast", "-crf", "18",
-        "-c:a", "copy",  # mantém o áudio original
         output_path
     ]
     try:
         subprocess.run(cmd, check=True, capture_output=True)
-        print(f"Concluído: {output_path}")
         return True
     except subprocess.CalledProcessError as e:
-        print(f"Erro no ffmpeg (smart crop): {e}")
         return False
 def apply_aspect_ratio(input_video: str, output_video: str, ar_mode: str, face_tracking: bool = False) -> bool:
@@ -282,21 +325,19 @@ def apply_aspect_ratio(input_video: str, output_video: str, ar_mode: str, face_t
         "Quadrado 1:1": (1080, 1080),
         "Retrato 4:5": (1080, 1350),
     }
     if ar_mode not in ar_dims:
         return False
     width, height = ar_dims[ar_mode]
     if face_tracking:
         return apply_smart_crop_to_video(input_video, output_video, width, height)
     else:
-        # Crop centralizado tradicional com áudio preservado
         cmd = [
             "ffmpeg", "-y", "-i", input_video,
             "-vf", f"scale={width}:{height}:force_original_aspect_ratio=increase,crop={width}:{height}",
-            "-c:a", "copy",
             "-c:v", "libx264", "-preset", "veryfast", "-crf", "18",
             output_video
         ]
         try:
@@ -315,16 +356,14 @@ def concatenate_videos(video_files: List[str], output_file: str) -> bool:
             f.write(f"file '{os.path.abspath(vf)}'\n")
     try:
-        # Se der problema de "different stream parameters", troque -c copy por reencode controlado
-        cmd = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", list_file, "-c", "copy", output_file]
         subprocess.run(cmd, check=True, capture_output=True)
         return True
     except subprocess.CalledProcessError:
-        # fallback reencode
         try:
             cmd = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", list_file,
                    "-c:v", "libx264", "-preset", "veryfast", "-crf", "18",
-                   "-c:a", "aac", output_file]
             subprocess.run(cmd, check=True, capture_output=True)
             return True
         except subprocess.CalledProcessError:
@@ -343,12 +382,12 @@ def generate_linear_cuts(video_file: str, segments: List[Segment], output_dir: s
     Path(output_dir).mkdir(parents=True, exist_ok=True)
     total_duration = segments[-1].end - segments[0].start
-    target_duration = min(max_len, max(min_len, total_duration / k))
     outputs = []
     current_start = segments[0].start
-    for i in range(k):
         target_end = current_start + target_duration
         best_end = target_end
@@ -366,9 +405,10 @@ def generate_linear_cuts(video_file: str, segments: List[Segment], output_dir: s
         temp_file = Path(output_dir) / f"temp_linear_{i+1}.mp4"
         final_file = Path(output_dir) / f"cut_linear_{i+1}.mp4"
-        print(f"Corte {i+1}/{k}: {start_with_pad:.1f}s - {end_with_pad:.1f}s")
-        if extract_video_segment(video_file, str(temp_file), start_with_pad, end_with_pad):
             if ar_mode != "Original":
                 if apply_aspect_ratio(str(temp_file), str(final_file), ar_mode, face_tracking):
                     Path(temp_file).unlink(missing_ok=True)
@@ -395,7 +435,7 @@ def generate_creative_cuts(video_file: str, segments: List[Segment], output_dir:
     outputs = []
     import random
-    for i in range(k):
         num_blocks = random.randint(min_blocks, min(max_blocks, len(segments)))
         step = max(1, len(segments) // num_blocks)
         selected_indices = [j * step for j in range(num_blocks)]
@@ -406,8 +446,8 @@ def generate_creative_cuts(video_file: str, segments: List[Segment], output_dir:
             block_file = Path(output_dir) / f"temp_creative_{i+1}_block_{j+1}.mp4"
             start = max(0, seg.start - pad)
             end = seg.end + pad
-            if extract_video_segment(video_file, str(block_file), start, end):
                 block_files.append(str(block_file))
         if not block_files:
@@ -436,9 +476,10 @@ SPACE_OUT = Path("outputs")
 SPACE_OUT.mkdir(exist_ok=True, parents=True)
 def do_transcribe(video_file, model_size):
-    if video_file is None:
-        return [], "Selecione um vídeo."
-    segs = transcribe(video_file, model_size=model_size)
     preview = "\n".join([f"[{s.start:.1f}–{s.end:.1f}] {s.text}" for s in segs[:12]])
     return segs, f"Transcrição ok. Segmentos: {len(segs)}\n\nPrévia:\n{preview}"
@@ -446,19 +487,21 @@ def run_linear(segs, video_file, out_subdir, min_len, max_len, ideal_len, k, gap
     if not segs:
         return [], "Transcreva antes de cortar."
     workdir = SPACE_OUT / (out_subdir or "cortes")
-    outs = generate_linear_cuts(video_file, segs, str(workdir), min_len=min_len, max_len=max_len,
-                                ideal_len=ideal_len, k=int(k), gap_threshold=gap, pad=pad,
-                                ar_mode=ar_mode, face_tracking=face_tracking)
     return [str(Path(p)) for p in outs], f"Gerados: {len(outs)} arquivo(s)."
 def run_creative(segs, video_file, out_subdir, min_len, max_len, ideal_len, minb, maxb, k, gap, pad, ar_mode, face_tracking):
     if not segs:
         return [], "Transcreva antes de cortar."
     workdir = SPACE_OUT / (out_subdir or "cortes")
-    outs = generate_creative_cuts(video_file, segs, str(workdir), min_len=min_len, max_len=max_len,
-                                  ideal_len=ideal_len, min_blocks=int(minb), max_blocks=int(maxb),
-                                  k=int(k), gap_threshold=gap, pad=pad, ar_mode=ar_mode,
-                                  face_tracking=face_tracking)
     return [str(Path(p)) for p in outs], f"Gerados: {len(outs)} arquivo(s)."
 css = """
@@ -538,4 +581,5 @@ with gr.Blocks(title="Editor de Cortes Automático", css=css) as demo:
                      outputs=[out_creative, status_creative])
 if __name__ == "__main__":
-    demo.launch()

 import subprocess
 from pathlib import Path
 from dataclasses import dataclass
+from typing import List, Tuple, Optional, Union
 import tempfile
 import os
 import shutil
+import json
 # ======================= DATACLASSES =======================
     center_y: int
     confidence: float = 1.0
+# ======================= UTILS =======================
+def resolve_video_path(v: Union[str, dict, None]) -> Optional[str]:
+    """Gradio às vezes entrega str (caminho) ou dict {'name':..., 'data':...}. Normaliza para caminho."""
+    if v is None:
+        return None
+    if isinstance(v, str):
+        return v
+    if isinstance(v, dict):
+        # Prioriza caminho local temporário
+        if "name" in v and isinstance(v["name"], str) and len(v["name"]) > 0 and os.path.exists(v["name"]):
+            return v["name"]
+        # Algumas versões usam 'path'
+        if "path" in v and isinstance(v["path"], str) and os.path.exists(v["path"]):
+            return v["path"]
+        # Fallback: alguns frontends mandam apenas nome base; não há como resolver sem arquivo
+        return v.get("name") or v.get("path")
+    return None
+def probe_duration(path: str) -> Optional[float]:
+    """Retorna a duração (segundos) via ffprobe, ou None se falhar."""
+    try:
+        cmd = [
+            "ffprobe", "-v", "error", "-show_entries", "format=duration",
+            "-of", "json", path
+        ]
+        out = subprocess.run(cmd, check=True, capture_output=True)
+        data = json.loads(out.stdout.decode("utf-8", errors="ignore"))
+        dur = float(data.get("format", {}).get("duration", 0.0))
+        return dur if dur > 0 else None
+    except Exception as e:
+        print(f"[ffprobe] falhou: {e}")
+        return None
 # ======================= FACE TRACKING =======================
 class FaceTracker:
     """Extrai o áudio para WAV mono 16kHz para robustez da transcrição."""
     fd, tmp_path = tempfile.mkstemp(suffix=".wav")
     os.close(fd)
+    print(f"[ffmpeg] extraindo WAV -> {tmp_path}")
     cmd = [
         "ffmpeg", "-y", "-i", input_video,
         "-vn", "-ac", "1", "-ar", str(sr), "-f", "wav", tmp_path
     return tmp_path
 def transcribe(video_file: str, model_size: str = "small") -> List[Segment]:
+    true_path = resolve_video_path(video_file)
+    if not true_path or not os.path.exists(true_path):
+        print(f"[transcribe] caminho inválido: {video_file}")
+        return []
+    # Durações para diagnóstico
+    vid_dur = probe_duration(true_path)
+    print(f"[probe] duração do vídeo: {vid_dur:.2f}s" if vid_dur else "[probe] duração do vídeo: desconhecida")
+    print(f"[whisper] carregando modelo: {model_size}")
+    model = whisper.load_model(model_size)  # device auto
+    print(f"[whisper] extraindo áudio WAV…")
+    audio_wav = extract_audio_wav(true_path, sr=16000)
+    wav_dur = probe_duration(audio_wav)
+    print(f"[probe] duração do WAV: {wav_dur:.2f}s" if wav_dur else "[probe] duração do WAV: desconhecida")
+    if vid_dur and wav_dur and wav_dur + 1 < vid_dur:
+        print("[aviso] WAV menor que o vídeo — verifique codecs/ffmpeg. Mesmo assim vou transcrever o que foi extraído.")
+    print("[whisper] transcrevendo…")
+    # Configs mais robustas para CPU/Spaces
     result = model.transcribe(
         audio_wav,
         language="pt",
         verbose=False,
         task="transcribe",
+        temperature=0,
+        condition_on_previous_text=False,
+        fp16=False
     )
+    segments = [Segment(start=s["start"], end=s["end"], text=s["text"].strip())
+                for s in result.get("segments", [])]
+    print(f"[whisper] segmentos: {len(segments)}")
     try:
         Path(audio_wav).unlink(missing_ok=True)
     except Exception:
 # ======================= PROCESSAMENTO DE VÍDEO =======================
 def extract_video_segment(input_video: str, output_video: str, start_time: float, end_time: float) -> bool:
+    duration = max(0.0, end_time - start_time)
+    if duration <= 0:
+        print(f"[extract] duração inválida: {duration}")
+        return False
     cmd = [
         "ffmpeg", "-y", "-ss", str(start_time), "-i", input_video,
         "-t", str(duration),
         "-c:v", "libx264",
+        "-c:a", "aac",
+        "-movflags", "+faststart",
         output_video
     ]
     try:
         subprocess.run(cmd, check=True, capture_output=True)
         return True
     except subprocess.CalledProcessError as e:
+        print(f"[extract] erro: {e}")
         return False
 def apply_smart_crop_to_video(input_path: str, output_path: str, target_width: int,
     """Calcula o melhor crop com rastreamento facial e aplica o crop com FFmpeg preservando o áudio."""
     tracker = FaceTracker()
     cap = cv2.VideoCapture(input_path)
     if not cap.isOpened():
+        print(f"[crop] erro ao abrir: {input_path}")
         return False
     frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
     frame_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
     frame_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
     sample_positions = []
     frame_indices = np.linspace(0, frame_count - 1, min(sample_frames, max(1, frame_count)), dtype=int)
     for idx in frame_indices:
         cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
         ret, frame = cap.read()
             sample_positions.append(crop_coords)
     cap.release()
     if sample_positions:
         avg_x = int(np.median([p[0] for p in sample_positions]))
         avg_y = int(np.median([p[1] for p in sample_positions]))
         crop_h = sample_positions[0][3]
         final_crop = (avg_x, avg_y, crop_w, crop_h)
     else:
         target_ar = target_width / target_height
         frame_ar = frame_w / frame_h
         if target_ar < frame_ar:
             final_crop = (0, (frame_h - crop_h) // 2, crop_w, crop_h)
     x, y, w, h = final_crop
+    print(f"[crop] final: x={x}, y={y}, w={w}, h={h} -> {target_width}x{target_height}")
     vf = f"crop={w}:{h}:{x}:{y},scale={target_width}:{target_height}:flags=lanczos"
     cmd = [
         "ffmpeg", "-y", "-i", input_path,
         "-vf", vf,
         "-c:v", "libx264", "-preset", "veryfast", "-crf", "18",
+        "-c:a", "copy",
+        "-movflags", "+faststart",
         output_path
     ]
     try:
         subprocess.run(cmd, check=True, capture_output=True)
+        print(f"[crop] concluído: {output_path}")
         return True
     except subprocess.CalledProcessError as e:
+        print(f"[crop] erro ffmpeg: {e}")
         return False
 def apply_aspect_ratio(input_video: str, output_video: str, ar_mode: str, face_tracking: bool = False) -> bool:
         "Quadrado 1:1": (1080, 1080),
         "Retrato 4:5": (1080, 1350),
     }
     if ar_mode not in ar_dims:
         return False
     width, height = ar_dims[ar_mode]
     if face_tracking:
         return apply_smart_crop_to_video(input_video, output_video, width, height)
     else:
         cmd = [
             "ffmpeg", "-y", "-i", input_video,
             "-vf", f"scale={width}:{height}:force_original_aspect_ratio=increase,crop={width}:{height}",
             "-c:v", "libx264", "-preset", "veryfast", "-crf", "18",
+            "-c:a", "copy",
+            "-movflags", "+faststart",
             output_video
         ]
         try:
             f.write(f"file '{os.path.abspath(vf)}'\n")
     try:
+        cmd = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", list_file, "-c", "copy", "-movflags", "+faststart", output_file]
         subprocess.run(cmd, check=True, capture_output=True)
         return True
     except subprocess.CalledProcessError:
         try:
             cmd = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", list_file,
                    "-c:v", "libx264", "-preset", "veryfast", "-crf", "18",
+                   "-c:a", "aac", "-movflags", "+faststart", output_file]
             subprocess.run(cmd, check=True, capture_output=True)
             return True
         except subprocess.CalledProcessError:
     Path(output_dir).mkdir(parents=True, exist_ok=True)
     total_duration = segments[-1].end - segments[0].start
+    target_duration = min(max_len, max(min_len, total_duration / max(1, int(k))))
     outputs = []
     current_start = segments[0].start
+    for i in range(int(k)):
         target_end = current_start + target_duration
         best_end = target_end
         temp_file = Path(output_dir) / f"temp_linear_{i+1}.mp4"
         final_file = Path(output_dir) / f"cut_linear_{i+1}.mp4"
+        print(f"[linear] corte {i+1}/{k}: {start_with_pad:.1f}s - {end_with_pad:.1f}s")
+        src_path = resolve_video_path(video_file) or video_file
+        if extract_video_segment(src_path, str(temp_file), start_with_pad, end_with_pad):
             if ar_mode != "Original":
                 if apply_aspect_ratio(str(temp_file), str(final_file), ar_mode, face_tracking):
                     Path(temp_file).unlink(missing_ok=True)
     outputs = []
     import random
+    for i in range(int(k)):
         num_blocks = random.randint(min_blocks, min(max_blocks, len(segments)))
         step = max(1, len(segments) // num_blocks)
         selected_indices = [j * step for j in range(num_blocks)]
             block_file = Path(output_dir) / f"temp_creative_{i+1}_block_{j+1}.mp4"
             start = max(0, seg.start - pad)
             end = seg.end + pad
+            src_path = resolve_video_path(video_file) or video_file
+            if extract_video_segment(src_path, str(block_file), start, end):
                 block_files.append(str(block_file))
         if not block_files:
 SPACE_OUT.mkdir(exist_ok=True, parents=True)
 def do_transcribe(video_file, model_size):
+    true_path = resolve_video_path(video_file)
+    if not true_path or not os.path.exists(true_path):
+        return [], "Selecione um vídeo válido."
+    segs = transcribe(true_path, model_size=model_size)
     preview = "\n".join([f"[{s.start:.1f}–{s.end:.1f}] {s.text}" for s in segs[:12]])
     return segs, f"Transcrição ok. Segmentos: {len(segs)}\n\nPrévia:\n{preview}"
     if not segs:
         return [], "Transcreva antes de cortar."
     workdir = SPACE_OUT / (out_subdir or "cortes")
+    outs = generate_linear_cuts(video_file, segs, str(workdir),
+                                min_len=float(min_len), max_len=float(max_len), ideal_len=float(ideal_len),
+                                k=int(k), gap_threshold=float(gap), pad=float(pad),
+                                ar_mode=str(ar_mode), face_tracking=bool(face_tracking))
     return [str(Path(p)) for p in outs], f"Gerados: {len(outs)} arquivo(s)."
 def run_creative(segs, video_file, out_subdir, min_len, max_len, ideal_len, minb, maxb, k, gap, pad, ar_mode, face_tracking):
     if not segs:
         return [], "Transcreva antes de cortar."
     workdir = SPACE_OUT / (out_subdir or "cortes")
+    outs = generate_creative_cuts(video_file, segs, str(workdir),
+                                  min_len=float(min_len), max_len=float(max_len), ideal_len=float(ideal_len),
+                                  min_blocks=int(minb), max_blocks=int(maxb), k=int(k),
+                                  gap_threshold=float(gap), pad=float(pad),
+                                  ar_mode=str(ar_mode), face_tracking=bool(face_tracking))
     return [str(Path(p)) for p in outs], f"Gerados: {len(outs)} arquivo(s)."
 css = """
                      outputs=[out_creative, status_creative])
 if __name__ == "__main__":
+    # Ativa fila para tarefas longas no Space
+    demo.queue(concurrency_count=1, max_size=20).launch()