leicam committed on
Commit
1beb3d5
·
verified ·
1 Parent(s): 60eef44

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +436 -386
app.py CHANGED
@@ -1,23 +1,20 @@
1
  """
2
  Video Clip Generator - Tudo integrado
3
- Transcrição + Cortes + Face Tracking + Cortes Virais
4
  """
5
 
6
  import gradio as gr
7
  import cv2
8
  import numpy as np
 
9
  import whisper
10
  import subprocess
11
  from pathlib import Path
12
  from dataclasses import dataclass
13
- from typing import List, Tuple, Optional, Union
14
  import tempfile
15
  import os
16
  import shutil
17
- import json
18
- import random
19
- from difflib import SequenceMatcher
20
- import math
21
 
22
  # ======================= DATACLASSES =======================
23
 
@@ -42,49 +39,6 @@ class FaceBox:
42
  center_y: int
43
  confidence: float = 1.0
44
 
45
- # ======================= UTILS =======================
46
-
47
- def resolve_video_path(v: Union[str, dict, None]) -> Optional[str]:
48
- """Gradio pode entregar str (caminho) ou dict. Normaliza para caminho local."""
49
- if v is None:
50
- return None
51
- if isinstance(v, str):
52
- return v
53
- if isinstance(v, dict):
54
- if "name" in v and isinstance(v["name"], str) and os.path.exists(v["name"]):
55
- return v["name"]
56
- if "path" in v and isinstance(v["path"], str) and os.path.exists(v["path"]):
57
- return v["path"]
58
- return v.get("name") or v.get("path")
59
- return None
60
-
61
- def probe_duration(path: str) -> Optional[float]:
62
- """Retorna a duração (s) via ffprobe."""
63
- try:
64
- cmd = ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "json", path]
65
- out = subprocess.run(cmd, check=True, capture_output=True)
66
- data = json.loads(out.stdout.decode("utf-8", errors="ignore"))
67
- dur = float(data.get("format", {}).get("duration", 0.0))
68
- return dur if dur > 0 else None
69
- except Exception as e:
70
- print(f"[ffprobe] falhou: {e}")
71
- return None
72
-
73
- def remux_video(src: str) -> str:
74
- """Gera um MP4 remuxado (ajusta PTS/timebase e faststart)."""
75
- fd, tmp_path = tempfile.mkstemp(suffix=".mp4")
76
- os.close(fd)
77
- cmd = [
78
- "ffmpeg", "-y",
79
- "-fflags", "+genpts",
80
- "-i", src,
81
- "-c", "copy",
82
- "-movflags", "+faststart",
83
- tmp_path
84
- ]
85
- subprocess.run(cmd, check=True, capture_output=True)
86
- return tmp_path
87
-
88
  # ======================= FACE TRACKING =======================
89
 
90
  class FaceTracker:
@@ -107,9 +61,9 @@ class FaceTracker:
107
 
108
  self.enabled = self.face_cascade is not None and not self.face_cascade.empty()
109
  if self.enabled:
110
- print("Detector de rostos carregado")
111
  else:
112
- print("Detector de rostos não disponível - usando crop centralizado")
113
 
114
  def detect_faces(self, frame: np.ndarray) -> List[FaceBox]:
115
  if not self.enabled:
@@ -188,147 +142,69 @@ class FaceTracker:
188
 
189
  return (crop_x, crop_y, crop_w, crop_h)
190
 
191
- # ======================= TRANSCRIÇÃO (ROBUSTA) =======================
192
-
193
- def extract_audio_wav_strong(input_video: str, sr: int = 16000) -> str:
194
- """
195
- Extração de áudio à prova de VFR/PTS ruins.
196
- 1) Remuxa o vídeo (ajusta timebase)
197
- 2) Extrai WAV mono 16k
198
- 3) Se o WAV vier curto, faz fallback re-decodificando o original
199
- """
200
- vid_dur = probe_duration(input_video)
201
- print(f"[probe] video: {vid_dur:.2f}s" if vid_dur else "[probe] video: ?")
202
-
203
- remux = remux_video(input_video)
204
- print(f"[remux] -> {remux}")
205
-
206
- fd, wav_path = tempfile.mkstemp(suffix=".wav")
207
- os.close(fd)
208
-
209
- # Tentativa 1 — do remux, convertendo para PCM
210
- cmd1 = [
211
- "ffmpeg", "-y",
212
- "-i", remux,
213
- "-vn",
214
- "-map", "0:a:0?",
215
- "-ac", "1", "-ar", str(sr),
216
- "-c:a", "pcm_s16le",
217
- wav_path
218
- ]
219
- subprocess.run(cmd1, check=True, capture_output=True)
220
- wav_dur = probe_duration(wav_path)
221
- print(f"[probe] wav #1: {wav_dur:.2f}s" if wav_dur else "[probe] wav #1: ?")
222
-
223
- # Fallback — redecodifica direto do original
224
- if vid_dur and (not wav_dur or wav_dur + 2 < vid_dur):
225
- print("[fallback] re-decodificando o arquivo original…")
226
- fd2, wav2 = tempfile.mkstemp(suffix=".wav")
227
- os.close(fd2)
228
- cmd2 = [
229
- "ffmpeg", "-y",
230
- "-fflags", "+genpts",
231
- "-i", input_video,
232
- "-vn",
233
- "-ac", "1", "-ar", str(sr),
234
- "-c:a", "pcm_s16le",
235
- wav2
236
- ]
237
- subprocess.run(cmd2, check=True, capture_output=True)
238
- wav2_dur = probe_duration(wav2)
239
- print(f"[probe] wav #2: {wav2_dur:.2f}s" if wav2_dur else "[probe] wav #2: ?")
240
- if wav2_dur and (not wav_dur or wav2_dur > wav_dur):
241
- try:
242
- Path(wav_path).unlink(missing_ok=True)
243
- Path(remux).unlink(missing_ok=True)
244
- except Exception:
245
- pass
246
- return wav2
247
-
248
- try:
249
- Path(remux).unlink(missing_ok=True)
250
- except Exception:
251
- pass
252
- return wav_path
253
 
254
  def transcribe(video_file: str, model_size: str = "small") -> List[Segment]:
255
- print(f"[whisper] modelo: {model_size}")
256
  model = whisper.load_model(model_size)
257
-
258
- print("[audio] extraindo WAV robusto…")
259
- audio_wav = extract_audio_wav_strong(video_file, sr=16000)
260
-
261
- vid_dur = probe_duration(video_file)
262
- wav_dur = probe_duration(audio_wav)
263
- if vid_dur: print(f"[dur] vídeo: {vid_dur:.2f}s")
264
- if wav_dur: print(f"[dur] wav: {wav_dur:.2f}s")
265
-
266
- print("[whisper] transcrevendo…")
267
- result = model.transcribe(
268
- audio_wav,
269
- language="pt",
270
- verbose=False,
271
- task="transcribe",
272
- temperature=0,
273
- condition_on_previous_text=False,
274
- fp16=False
275
- )
276
-
277
- segments = [Segment(start=s["start"], end=s["end"], text=s["text"].strip())
278
- for s in result.get("segments", [])]
279
- print(f"[whisper] segmentos: {len(segments)}")
280
-
281
- try:
282
- Path(audio_wav).unlink(missing_ok=True)
283
- except Exception:
284
- pass
285
  return segments
286
 
287
  # ======================= PROCESSAMENTO DE VÍDEO =======================
288
 
289
  def extract_video_segment(input_video: str, output_video: str, start_time: float, end_time: float) -> bool:
290
- duration = max(0.0, end_time - start_time)
291
- if duration <= 0:
292
- print(f"[extract] duração inválida: {duration}")
293
- return False
294
  cmd = [
295
  "ffmpeg", "-y", "-ss", str(start_time), "-i", input_video,
296
- "-t", str(duration),
297
- "-c:v", "libx264",
298
- "-c:a", "aac",
299
- "-movflags", "+faststart",
300
- output_video
301
  ]
 
302
  try:
303
  subprocess.run(cmd, check=True, capture_output=True)
304
  return True
305
  except subprocess.CalledProcessError as e:
306
- print(f"[extract] erro: {e}")
307
  return False
308
 
309
  def apply_smart_crop_to_video(input_path: str, output_path: str, target_width: int,
310
  target_height: int, sample_frames: int = 10) -> bool:
311
- """Calcula o melhor crop com rastreamento facial e aplica com FFmpeg preservando áudio."""
312
  tracker = FaceTracker()
313
  cap = cv2.VideoCapture(input_path)
 
314
  if not cap.isOpened():
315
- print(f"[crop] erro ao abrir: {input_path}")
316
  return False
317
 
 
318
  frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
319
  frame_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
320
  frame_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
321
 
 
322
  sample_positions = []
323
- frame_indices = np.linspace(0, frame_count - 1, min(sample_frames, max(1, frame_count)), dtype=int)
 
324
  for idx in frame_indices:
325
  cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
326
  ret, frame = cap.read()
327
  if ret:
328
  crop_coords = tracker.calculate_smart_crop(frame, target_width, target_height)
329
  sample_positions.append(crop_coords)
330
- cap.release()
331
 
 
332
  if sample_positions:
333
  avg_x = int(np.median([p[0] for p in sample_positions]))
334
  avg_y = int(np.median([p[1] for p in sample_positions]))
@@ -336,6 +212,7 @@ def apply_smart_crop_to_video(input_path: str, output_path: str, target_width: i
336
  crop_h = sample_positions[0][3]
337
  final_crop = (avg_x, avg_y, crop_w, crop_h)
338
  else:
 
339
  target_ar = target_width / target_height
340
  frame_ar = frame_w / frame_h
341
  if target_ar < frame_ar:
@@ -347,25 +224,39 @@ def apply_smart_crop_to_video(input_path: str, output_path: str, target_width: i
347
  crop_h = int(frame_w / target_ar)
348
  final_crop = (0, (frame_h - crop_h) // 2, crop_w, crop_h)
349
 
350
- x, y, w, h = final_crop
351
- print(f"[crop] final: x={x}, y={y}, w={w}, h={h} -> {target_width}x{target_height}")
352
 
353
- vf = f"crop={w}:{h}:{x}:{y},scale={target_width}:{target_height}:flags=lanczos"
354
- cmd = [
355
- "ffmpeg", "-y", "-i", input_path,
356
- "-vf", vf,
357
- "-c:v", "libx264", "-preset", "veryfast", "-crf", "18",
358
- "-c:a", "copy",
359
- "-movflags", "+faststart",
360
- output_path
361
- ]
362
- try:
363
- subprocess.run(cmd, check=True, capture_output=True)
364
- print(f"[crop] concluído: {output_path}")
365
- return True
366
- except subprocess.CalledProcessError as e:
367
- print(f"[crop] erro ffmpeg: {e}")
368
  return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
369
 
370
  def apply_aspect_ratio(input_video: str, output_video: str, ar_mode: str, face_tracking: bool = False) -> bool:
371
  if ar_mode == "Original":
@@ -377,20 +268,20 @@ def apply_aspect_ratio(input_video: str, output_video: str, ar_mode: str, face_t
377
  "Quadrado 1:1": (1080, 1080),
378
  "Retrato 4:5": (1080, 1350),
379
  }
 
380
  if ar_mode not in ar_dims:
381
  return False
382
 
383
  width, height = ar_dims[ar_mode]
 
384
  if face_tracking:
385
  return apply_smart_crop_to_video(input_video, output_video, width, height)
386
  else:
 
387
  cmd = [
388
  "ffmpeg", "-y", "-i", input_video,
389
  "-vf", f"scale={width}:{height}:force_original_aspect_ratio=increase,crop={width}:{height}",
390
- "-c:v", "libx264", "-preset", "veryfast", "-crf", "18",
391
- "-c:a", "copy",
392
- "-movflags", "+faststart",
393
- output_video
394
  ]
395
  try:
396
  subprocess.run(cmd, check=True, capture_output=True)
@@ -408,67 +299,362 @@ def concatenate_videos(video_files: List[str], output_file: str) -> bool:
408
  f.write(f"file '{os.path.abspath(vf)}'\n")
409
 
410
  try:
411
- cmd = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", list_file, "-c", "copy", "-movflags", "+faststart", output_file]
412
  subprocess.run(cmd, check=True, capture_output=True)
413
  return True
414
  except subprocess.CalledProcessError:
415
- try:
416
- cmd = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", list_file,
417
- "-c:v", "libx264", "-preset", "veryfast", "-crf", "18",
418
- "-c:a", "aac", "-movflags", "+faststart", output_file]
419
- subprocess.run(cmd, check=True, capture_output=True)
420
- return True
421
- except subprocess.CalledProcessError:
422
- return False
423
  finally:
424
  Path(list_file).unlink(missing_ok=True)
425
 
426
- # ======================= HEURÍSTICAS VIRAL =======================
427
-
428
- IMPACT_WORDS = [
429
- "problema", "não dá", "imaginar", "decisão", "escolha",
430
- "compromisso", "vocês", "precisa", "eu tenho", "é o começo",
431
- "querem que tudo mude", "caminho", "rede neural", "resultado", "responsável"
432
- ]
433
-
434
- def emotional_score(text: str) -> float:
435
- t = text.lower()
436
- hits = sum(1 for w in IMPACT_WORDS if w in t)
437
- # bonus por perguntas/provocações
438
- bonus = 0.5 if "?" in t else 0.0
439
- # normaliza 0..1
440
- return min(1.0, (hits / max(1, len(IMPACT_WORDS))) * 4 + bonus)
441
-
442
- def semantic_continuity(a: str, b: str) -> float:
443
- a_words = a.lower().split()[-12:]
444
- b_words = b.lower().split()[:12]
445
- seq = SequenceMatcher(None, " ".join(a_words), " ".join(b_words))
446
- return seq.ratio()
447
-
448
- def iou(a: Tuple[float,float], b: Tuple[float,float]) -> float:
449
- s1, e1 = a
450
- s2, e2 = b
451
- inter = max(0.0, min(e1, e2) - max(s1, s2))
452
- union = (e1 - s1) + (e2 - s2) - inter
453
- return 0.0 if union <= 0 else inter / union
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
454
 
455
  # ======================= GERAÇÃO DE CORTES =======================
456
 
457
  def generate_linear_cuts(video_file: str, segments: List[Segment], output_dir: str,
458
  min_len: float = 600, max_len: float = 900, ideal_len: float = 900,
459
  k: int = 2, gap_threshold: float = 0.60, pad: float = 0.08,
460
- ar_mode: str = "Original", face_tracking: bool = False) -> List[str]:
 
461
  if not segments:
462
  return []
463
 
464
  Path(output_dir).mkdir(parents=True, exist_ok=True)
465
  total_duration = segments[-1].end - segments[0].start
466
- target_duration = min(max_len, max(min_len, total_duration / max(1, int(k))))
467
 
468
  outputs = []
469
  current_start = segments[0].start
470
 
471
- for i in range(int(k)):
472
  target_end = current_start + target_duration
473
  best_end = target_end
474
 
@@ -486,16 +672,15 @@ def generate_linear_cuts(video_file: str, segments: List[Segment], output_dir: s
486
  temp_file = Path(output_dir) / f"temp_linear_{i+1}.mp4"
487
  final_file = Path(output_dir) / f"cut_linear_{i+1}.mp4"
488
 
489
- print(f"[linear] corte {i+1}/{k}: {start_with_pad:.1f}s - {end_with_pad:.1f}s")
490
 
491
- src_path = resolve_video_path(video_file) or video_file
492
- if extract_video_segment(src_path, str(temp_file), start_with_pad, end_with_pad):
493
  if ar_mode != "Original":
494
  if apply_aspect_ratio(str(temp_file), str(final_file), ar_mode, face_tracking):
495
- Path(temp_file).unlink(missing_ok=True)
496
  outputs.append(str(final_file))
497
  else:
498
- Path(temp_file).rename(final_file)
499
  outputs.append(str(final_file))
500
 
501
  current_start = best_end + gap_threshold
@@ -508,14 +693,16 @@ def generate_creative_cuts(video_file: str, segments: List[Segment], output_dir:
508
  min_len: float = 600, max_len: float = 900, ideal_len: float = 900,
509
  min_blocks: int = 3, max_blocks: int = 8, k: int = 2,
510
  gap_threshold: float = 0.60, pad: float = 0.08,
511
- ar_mode: str = "Original", face_tracking: bool = False) -> List[str]:
 
512
  if not segments or len(segments) < min_blocks:
513
  return []
514
 
515
  Path(output_dir).mkdir(parents=True, exist_ok=True)
516
  outputs = []
517
 
518
- for i in range(int(k)):
 
519
  num_blocks = random.randint(min_blocks, min(max_blocks, len(segments)))
520
  step = max(1, len(segments) // num_blocks)
521
  selected_indices = [j * step for j in range(num_blocks)]
@@ -526,8 +713,8 @@ def generate_creative_cuts(video_file: str, segments: List[Segment], output_dir:
526
  block_file = Path(output_dir) / f"temp_creative_{i+1}_block_{j+1}.mp4"
527
  start = max(0, seg.start - pad)
528
  end = seg.end + pad
529
- src_path = resolve_video_path(video_file) or video_file
530
- if extract_video_segment(src_path, str(block_file), start, end):
531
  block_files.append(str(block_file))
532
 
533
  if not block_files:
@@ -539,10 +726,10 @@ def generate_creative_cuts(video_file: str, segments: List[Segment], output_dir:
539
 
540
  if ar_mode != "Original":
541
  if apply_aspect_ratio(str(concat_file), str(final_file), ar_mode, face_tracking):
542
- Path(concat_file).unlink(missing_ok=True)
543
  outputs.append(str(final_file))
544
  else:
545
- Path(concat_file).rename(final_file)
546
  outputs.append(str(final_file))
547
 
548
  for bf in block_files:
@@ -550,123 +737,15 @@ def generate_creative_cuts(video_file: str, segments: List[Segment], output_dir:
550
 
551
  return outputs
552
 
553
- # ======== NOVO: GERAÇÃO DE CORTES VIRAIS (semântica + emoção) ========
554
-
555
- def merge_adjacent_semantic(segments: List[Segment],
556
- cont_thresh: float = 0.35,
557
- max_gap: float = 2.0) -> List[Segment]:
558
- """Une segmentos adjacentes que mantêm continuidade semântica."""
559
- if not segments:
560
- return []
561
- merged: List[Segment] = []
562
- i = 0
563
- while i < len(segments):
564
- cur = segments[i]
565
- j = i + 1
566
- while j < len(segments):
567
- nxt = segments[j]
568
- if (nxt.start - cur.end) > max_gap:
569
- break
570
- cont = semantic_continuity(cur.text, nxt.text)
571
- if cont >= cont_thresh:
572
- cur = Segment(cur.start, nxt.end, f"{cur.text} {nxt.text}")
573
- j += 1
574
- else:
575
- break
576
- merged.append(cur)
577
- i = j
578
- return merged
579
-
580
- def build_candidates(segments: List[Segment],
581
- min_len: float = 20.0,
582
- max_len: float = 60.0,
583
- ideal: float = 40.0) -> List[Tuple[Tuple[float,float], float, str]]:
584
- """
585
- Gera janelas [start,end] com score:
586
- 0.6 emoção + 0.3 continuidade média + 0.1 proximidade ao ideal.
587
- """
588
- cands = []
589
- n = len(segments)
590
- for i in range(n):
591
- start = segments[i].start
592
- text_acc = segments[i].text
593
- cont_sum = 0.0
594
- words = segments[i].text.split()
595
- j = i
596
- while j+1 < n:
597
- j += 1
598
- text_acc += " " + segments[j].text
599
- cont_sum += semantic_continuity(segments[j-1].text, segments[j].text)
600
- dur = segments[j].end - start
601
- if dur < min_len:
602
- continue
603
- if dur > max_len:
604
- break
605
- emo = emotional_score(text_acc)
606
- cont_avg = cont_sum / max(1, (j - i))
607
- ideal_penalty = 1.0 - min(1.0, abs(dur - ideal) / ideal) # 1 se ideal, 0 se muito longe
608
- score = 0.6*emo + 0.3*cont_avg + 0.1*ideal_penalty
609
- cands.append(((start, segments[j].end), score, text_acc))
610
- # também considerar single-merge como candidato
611
- for s in segments:
612
- dur = s.end - s.start
613
- if min_len <= dur <= max_len:
614
- score = 0.6*emotional_score(s.text) + 0.3*0.5 + 0.1*(1 - abs(dur-40.0)/40.0)
615
- cands.append(((s.start, s.end), score, s.text))
616
- # ordenar por score desc
617
- cands.sort(key=lambda x: x[1], reverse=True)
618
- return cands
619
-
620
- def pick_top_non_overlapping(cands: List[Tuple[Tuple[float,float], float, str]],
621
- k: int = 3, iou_thresh: float = 0.15) -> List[Tuple[float,float,str]]:
622
- picks: List[Tuple[float,float,str]] = []
623
- for (win, score, text) in cands:
624
- if len(picks) >= k:
625
- break
626
- if all(iou(win, (p[0], p[1])) < iou_thresh for p in picks):
627
- picks.append((win[0], win[1], text))
628
- return picks
629
-
630
- def generate_viral_cuts(video_file: str, segments: List[Segment], output_dir: str,
631
- k: int = 3, pad: float = 0.05,
632
- min_len: float = 20.0, max_len: float = 60.0, ideal: float = 40.0,
633
- ar_mode: str = "Original", face_tracking: bool = False) -> List[str]:
634
- if not segments:
635
- return []
636
- Path(output_dir).mkdir(parents=True, exist_ok=True)
637
-
638
- merged = merge_adjacent_semantic(segments, cont_thresh=0.35, max_gap=2.0)
639
- cands = build_candidates(merged, min_len=min_len, max_len=max_len, ideal=ideal)
640
- chosen = pick_top_non_overlapping(cands, k=k, iou_thresh=0.15)
641
-
642
- outputs = []
643
- for idx, (s, e, _txt) in enumerate(chosen, start=1):
644
- start = max(0, s - pad)
645
- end = e + pad
646
- temp_file = Path(output_dir) / f"temp_viral_{idx}.mp4"
647
- final_file = Path(output_dir) / f"cut_viral_{idx}.mp4"
648
- print(f"[viral] corte {idx}/{len(chosen)}: {start:.1f}s - {end:.1f}s")
649
- src_path = resolve_video_path(video_file) or video_file
650
- if extract_video_segment(src_path, str(temp_file), start, end):
651
- if ar_mode != "Original":
652
- if apply_aspect_ratio(str(temp_file), str(final_file), ar_mode, face_tracking):
653
- Path(temp_file).unlink(missing_ok=True)
654
- outputs.append(str(final_file))
655
- else:
656
- Path(temp_file).rename(final_file)
657
- outputs.append(str(final_file))
658
- return outputs
659
-
660
  # ======================= INTERFACE GRADIO =======================
661
 
662
  SPACE_OUT = Path("outputs")
663
  SPACE_OUT.mkdir(exist_ok=True, parents=True)
664
 
665
  def do_transcribe(video_file, model_size):
666
- true_path = resolve_video_path(video_file)
667
- if not true_path or not os.path.exists(true_path):
668
- return [], "Selecione um vídeo válido."
669
- segs = transcribe(true_path, model_size=model_size)
670
  preview = "\n".join([f"[{s.start:.1f}–{s.end:.1f}] {s.text}" for s in segs[:12]])
671
  return segs, f"Transcrição ok. Segmentos: {len(segs)}\n\nPrévia:\n{preview}"
672
 
@@ -674,31 +753,19 @@ def run_linear(segs, video_file, out_subdir, min_len, max_len, ideal_len, k, gap
674
  if not segs:
675
  return [], "Transcreva antes de cortar."
676
  workdir = SPACE_OUT / (out_subdir or "cortes")
677
- outs = generate_linear_cuts(video_file, segs, str(workdir),
678
- min_len=float(min_len), max_len=float(max_len), ideal_len=float(ideal_len),
679
- k=int(k), gap_threshold=float(gap), pad=float(pad),
680
- ar_mode=str(ar_mode), face_tracking=bool(face_tracking))
681
  return [str(Path(p)) for p in outs], f"Gerados: {len(outs)} arquivo(s)."
682
 
683
  def run_creative(segs, video_file, out_subdir, min_len, max_len, ideal_len, minb, maxb, k, gap, pad, ar_mode, face_tracking):
684
  if not segs:
685
  return [], "Transcreva antes de cortar."
686
  workdir = SPACE_OUT / (out_subdir or "cortes")
687
- outs = generate_creative_cuts(video_file, segs, str(workdir),
688
- min_len=float(min_len), max_len=float(max_len), ideal_len=float(ideal_len),
689
- min_blocks=int(minb), max_blocks=int(maxb), k=int(k),
690
- gap_threshold=float(gap), pad=float(pad),
691
- ar_mode=str(ar_mode), face_tracking=bool(face_tracking))
692
- return [str(Path(p)) for p in outs], f"Gerados: {len(outs)} arquivo(s)."
693
-
694
- def run_viral(segs, video_file, out_subdir, k, pad, min_len, max_len, ideal_len, ar_mode, face_tracking):
695
- if not segs:
696
- return [], "Transcreva antes de cortar."
697
- workdir = SPACE_OUT / (out_subdir or "cortes")
698
- outs = generate_viral_cuts(video_file, segs, str(workdir),
699
- k=int(k), pad=float(pad),
700
- min_len=float(min_len), max_len=float(max_len), ideal=float(ideal_len),
701
- ar_mode=str(ar_mode), face_tracking=bool(face_tracking))
702
  return [str(Path(p)) for p in outs], f"Gerados: {len(outs)} arquivo(s)."
703
 
704
  css = """
@@ -722,7 +789,7 @@ with gr.Blocks(title="Editor de Cortes Automático", css=css) as demo:
722
  gr.HTML("""
723
  <link href="https://fonts.googleapis.com/css2?family=Manrope:wght@400;600;800&display=swap" rel="stylesheet">
724
  <div style="text-align: center; padding: 24px 0;">
725
- <h1>Editor de Cortes Automático</h1>
726
  <p style="color: #6b7280;">Gere cortes com rastreamento facial inteligente</p>
727
  </div>
728
  """)
@@ -733,11 +800,11 @@ with gr.Blocks(title="Editor de Cortes Automático", css=css) as demo:
733
  with gr.Row():
734
  model_size = gr.Dropdown(["tiny","base","small","medium"], value="small", label="Modelo Whisper")
735
  out_subdir = gr.Textbox(label="Pasta de saída", value="cortes")
736
- transcribe_btn = gr.Button("1) Transcrever", variant="primary")
737
  transcript_preview = gr.Textbox(label="Status", lines=10)
738
 
739
  with gr.Column():
740
- with gr.Tab("Cortes Simples"):
741
  with gr.Row():
742
  min_len = gr.Number(value=600, label="Min (s)")
743
  max_len = gr.Number(value=900, label="Max (s)")
@@ -749,12 +816,12 @@ with gr.Blocks(title="Editor de Cortes Automático", css=css) as demo:
749
  pad = gr.Number(value=0.08, label="Pad")
750
  ar_mode = gr.Dropdown(["Original","Vertical 9:16","Quadrado 1:1","Retrato 4:5"],
751
  value="Original", label="Formato")
752
- face_tracking = gr.Checkbox(label="Rastreamento facial", value=True)
753
- go_linear = gr.Button("2) Gerar Cortes", variant="primary")
754
  out_linear = gr.Files(label="Arquivos gerados")
755
  status_linear = gr.Textbox(label="Status", lines=2)
756
 
757
- with gr.Tab("Cortes Criativos"):
758
  with gr.Row():
759
  minb = gr.Number(value=3, label="Blocos min")
760
  maxb = gr.Number(value=8, label="Blocos max")
@@ -764,35 +831,18 @@ with gr.Blocks(title="Editor de Cortes Automático", css=css) as demo:
764
  pad2 = gr.Number(value=0.08, label="Pad")
765
  ar_mode2 = gr.Dropdown(["Original","Vertical 9:16","Quadrado 1:1","Retrato 4:5"],
766
  value="Original", label="Formato")
767
- face_tracking2 = gr.Checkbox(label="Rastreamento facial", value=True)
768
- go_creative = gr.Button("3) Gerar Criativos", variant="primary")
769
  out_creative = gr.Files(label="Arquivos gerados")
770
  status_creative = gr.Textbox(label="Status", lines=2)
771
-
772
- with gr.Tab("Cortes Virais (Opus-style)"):
773
- with gr.Row():
774
- kv = gr.Number(value=3, label="Qtde de cortes")
775
- padv = gr.Number(value=0.05, label="Pad (s)")
776
- with gr.Row():
777
- minv = gr.Number(value=20, label="Min (s)")
778
- maxv = gr.Number(value=60, label="Max (s)")
779
- idealv = gr.Number(value=40, label="Ideal (s)")
780
- ar_modev = gr.Dropdown(["Original","Vertical 9:16","Quadrado 1:1","Retrato 4:5"],
781
- value="Original", label="Formato")
782
- face_trackingv = gr.Checkbox(label="Rastreamento facial", value=True)
783
- go_viral = gr.Button("4) Gerar Virais", variant="primary")
784
- out_viral = gr.Files(label="Arquivos gerados")
785
- status_viral = gr.Textbox(label="Status", lines=2)
786
 
787
  segs_state = gr.State([])
788
 
789
  transcribe_btn.click(do_transcribe, inputs=[video, model_size], outputs=[segs_state, transcript_preview])
790
- go_linear.click(run_linear, inputs=[segs_state, video, out_subdir, min_len, max_len, ideal_len, k, gap, pad, ar_mode, face_tracking],
791
  outputs=[out_linear, status_linear])
792
- go_creative.click(run_creative, inputs=[segs_state, video, out_subdir, min_len, max_len, ideal_len, minb, maxb, k2, gap2, pad2, ar_mode2, face_tracking2],
793
  outputs=[out_creative, status_creative])
794
- go_viral.click(run_viral, inputs=[segs_state, video, out_subdir, kv, padv, minv, maxv, idealv, ar_modev, face_trackingv],
795
- outputs=[out_viral, status_viral])
796
 
797
  if __name__ == "__main__":
798
- demo.queue(max_size=20).launch()
 
1
  """
2
  Video Clip Generator - Tudo integrado
3
+ Transcrição + Cortes + Face Tracking
4
  """
5
 
6
  import gradio as gr
7
  import cv2
8
  import numpy as np
9
+ from moviepy.editor import VideoFileClip, concatenate_videoclips
10
  import whisper
11
  import subprocess
12
  from pathlib import Path
13
  from dataclasses import dataclass
14
+ from typing import List, Tuple, Optional
15
  import tempfile
16
  import os
17
  import shutil
 
 
 
 
18
 
19
  # ======================= DATACLASSES =======================
20
 
 
39
  center_y: int
40
  confidence: float = 1.0
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  # ======================= FACE TRACKING =======================
43
 
44
  class FaceTracker:
 
61
 
62
  self.enabled = self.face_cascade is not None and not self.face_cascade.empty()
63
  if self.enabled:
64
+ print("Detector de rostos carregado")
65
  else:
66
+ print("⚠️ Detector de rostos não disponível - usando crop centralizado")
67
 
68
  def detect_faces(self, frame: np.ndarray) -> List[FaceBox]:
69
  if not self.enabled:
 
142
 
143
  return (crop_x, crop_y, crop_w, crop_h)
144
 
145
+ # ======================= TRANSCRIÇÃO =======================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
def transcribe(video_file: str, model_size: str = "small") -> List[Segment]:
    """Transcribe a media file with Whisper and return its timed segments.

    Args:
        video_file: Path handed directly to Whisper's ``transcribe``.
        model_size: Name of the Whisper model to load (default ``"small"``).

    Returns:
        One ``Segment`` per Whisper segment, carrying its ``start``, ``end``
        and the text with surrounding whitespace removed.
    """
    print(f"🎙️ Carregando modelo Whisper: {model_size}")
    whisper_model = whisper.load_model(model_size)

    print(f"🎬 Transcrevendo: {video_file}")
    # The language is pinned to Portuguese; no auto-detection is requested.
    raw_segments = whisper_model.transcribe(
        video_file, language="pt", verbose=False
    )["segments"]

    segments = [
        Segment(start=item["start"], end=item["end"], text=item["text"].strip())
        for item in raw_segments
    ]

    print(f"✅ Transcrição completa: {len(segments)} segmentos")
    return segments
164
 
165
  # ======================= PROCESSAMENTO DE VÍDEO =======================
166
 
167
def extract_video_segment(input_video: str, output_video: str, start_time: float, end_time: float) -> bool:
    """Cut the window ``[start_time, end_time]`` out of a video using ffmpeg.

    The clip is re-encoded with libx264 video and AAC audio and written to
    ``output_video`` (overwritten if it already exists, because of ``-y``).

    Args:
        input_video: Path of the source video.
        output_video: Path of the clip to create.
        start_time: Start of the window, in seconds.
        end_time: End of the window, in seconds.

    Returns:
        True when ffmpeg exits successfully. False when the window is empty,
        inverted or not a number, when ffmpeg reports an error, or when the
        ffmpeg executable cannot be started.
    """
    duration = end_time - start_time
    # Reject empty/inverted windows (and NaN, for which `> 0` is False)
    # before spawning ffmpeg: a non-positive `-t` is never a valid clip.
    if not duration > 0:
        print(f" Erro ao extrair: duração inválida ({duration})")
        return False

    cmd = [
        "ffmpeg", "-y", "-ss", str(start_time), "-i", input_video,
        "-t", str(duration), "-c:v", "libx264", "-c:a", "aac",
        "-strict", "experimental", output_video
    ]

    try:
        subprocess.run(cmd, check=True, capture_output=True)
        return True
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing/non-executable ffmpeg binary, which would
        # otherwise escape as an exception despite the bool contract.
        print(f" Erro ao extrair: {e}")
        return False
181
 
182
  def apply_smart_crop_to_video(input_path: str, output_path: str, target_width: int,
183
  target_height: int, sample_frames: int = 10) -> bool:
 
184
  tracker = FaceTracker()
185
  cap = cv2.VideoCapture(input_path)
186
+
187
  if not cap.isOpened():
188
+ print(f" Erro ao abrir: {input_path}")
189
  return False
190
 
191
+ fps = int(cap.get(cv2.CAP_PROP_FPS))
192
  frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
193
  frame_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
194
  frame_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
195
 
196
+ # Amostragem para suavização
197
  sample_positions = []
198
+ frame_indices = np.linspace(0, frame_count - 1, min(sample_frames, frame_count), dtype=int)
199
+
200
  for idx in frame_indices:
201
  cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
202
  ret, frame = cap.read()
203
  if ret:
204
  crop_coords = tracker.calculate_smart_crop(frame, target_width, target_height)
205
  sample_positions.append(crop_coords)
 
206
 
207
+ # Posição média (suavizada)
208
  if sample_positions:
209
  avg_x = int(np.median([p[0] for p in sample_positions]))
210
  avg_y = int(np.median([p[1] for p in sample_positions]))
 
212
  crop_h = sample_positions[0][3]
213
  final_crop = (avg_x, avg_y, crop_w, crop_h)
214
  else:
215
+ # Fallback
216
  target_ar = target_width / target_height
217
  frame_ar = frame_w / frame_h
218
  if target_ar < frame_ar:
 
224
  crop_h = int(frame_w / target_ar)
225
  final_crop = (0, (frame_h - crop_h) // 2, crop_w, crop_h)
226
 
227
+ cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
 
228
 
229
+ # Writer
230
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
231
+ out = cv2.VideoWriter(output_path, fourcc, fps, (target_width, target_height))
232
+
233
+ if not out.isOpened():
234
+ print(f"❌ Erro ao criar saída: {output_path}")
235
+ cap.release()
 
 
 
 
 
 
 
 
236
  return False
237
+
238
+ print(f"🎬 Processando com crop: {final_crop}")
239
+ frame_num = 0
240
+
241
+ while True:
242
+ ret, frame = cap.read()
243
+ if not ret:
244
+ break
245
+
246
+ x, y, w, h = final_crop
247
+ cropped = frame[y:y+h, x:x+w]
248
+ resized = cv2.resize(cropped, (target_width, target_height), interpolation=cv2.INTER_LANCZOS4)
249
+ out.write(resized)
250
+ frame_num += 1
251
+
252
+ if frame_num % 30 == 0:
253
+ progress = (frame_num / frame_count) * 100
254
+ print(f" {progress:.1f}% ({frame_num}/{frame_count})")
255
+
256
+ cap.release()
257
+ out.release()
258
+ print(f"✅ Concluído: {output_path}")
259
+ return True
260
 
261
  def apply_aspect_ratio(input_video: str, output_video: str, ar_mode: str, face_tracking: bool = False) -> bool:
262
  if ar_mode == "Original":
 
268
  "Quadrado 1:1": (1080, 1080),
269
  "Retrato 4:5": (1080, 1350),
270
  }
271
+
272
  if ar_mode not in ar_dims:
273
  return False
274
 
275
  width, height = ar_dims[ar_mode]
276
+
277
  if face_tracking:
278
  return apply_smart_crop_to_video(input_video, output_video, width, height)
279
  else:
280
+ # Crop centralizado tradicional
281
  cmd = [
282
  "ffmpeg", "-y", "-i", input_video,
283
  "-vf", f"scale={width}:{height}:force_original_aspect_ratio=increase,crop={width}:{height}",
284
+ "-c:a", "copy", output_video
 
 
 
285
  ]
286
  try:
287
  subprocess.run(cmd, check=True, capture_output=True)
 
299
  f.write(f"file '{os.path.abspath(vf)}'\n")
300
 
301
  try:
302
+ cmd = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", list_file, "-c", "copy", output_file]
303
  subprocess.run(cmd, check=True, capture_output=True)
304
  return True
305
  except subprocess.CalledProcessError:
306
+ return False
 
 
 
 
 
 
 
307
  finally:
308
  Path(list_file).unlink(missing_ok=True)
309
 
310
+ # ======================= LEGENDAS CRIATIVAS =======================
311
+
312
def highlight_keywords(text: str) -> List[Tuple[str, bool]]:
    """
    Flag the words of ``text`` that should be visually emphasised.

    The text is split on whitespace and each word is compared, after
    lower-casing and stripping surrounding punctuation, against a fixed
    Portuguese keyword vocabulary.

    Matching is done on whole words (a trailing plural "s" is tolerated), so
    that short keywords such as "mas" or "mil" no longer light up unrelated
    words that merely contain them ("mesmas", "família"). Multi-word keywords
    such as "tem que" are matched against consecutive words, and every word of
    the phrase is flagged.

    Args:
        text: Sentence (or segment transcript) to analyse.

    Returns:
        A list of ``(word, is_highlighted)`` pairs, one per whitespace-separated
        word, in the original order and with the original spelling.
    """
    keywords = [
        # Action / imperative
        "tem que", "precisa", "deve", "faça", "veja", "olha", "escuta",
        # Negation / contrast
        "não", "nunca", "jamais", "mas", "porém", "entretanto",
        # Impact
        "problema", "solução", "segredo", "verdade", "realidade",
        # Numbers
        "milhão", "mil", "bilhão", "100%", "zero",
        # Emotion
        "incrível", "impossível", "fácil", "difícil", "importante",
        # Mental action
        "imagina", "pensa", "considera", "decide", "escolhe"
    ]

    # Single-word keywords are looked up in a set; phrases are kept as word
    # tuples so they can be compared against a sliding window of tokens.
    single_words = {k for k in keywords if " " not in k}
    phrases = [tuple(k.split()) for k in keywords if " " in k]

    words = text.split()
    normalized = [w.lower().strip(".,!?;:\"'()") for w in words]

    flags = [
        token in single_words
        or (token.endswith("s") and token[:-1] in single_words)
        for token in normalized
    ]

    # A phrase keyword can never equal a single token, so scan for it across
    # consecutive tokens and flag the whole run.
    for phrase in phrases:
        span = len(phrase)
        for first in range(len(normalized) - span + 1):
            if tuple(normalized[first:first + span]) == phrase:
                for offset in range(span):
                    flags[first + offset] = True

    return list(zip(words, flags))
341
+
342
def create_subtitle_clip(text: str, start: float, end: float,
                         video_width: int, video_height: int,
                         style: str = "hormozi") -> str:
    """
    Write a one-event ASS (Advanced SubStation Alpha) subtitle file.

    The text is passed through ``highlight_keywords``, wrapped at roughly 40
    characters per line, truncated to at most two lines, and written as a
    single ``Dialogue`` event. Highlighted words are upper-cased and wrapped in
    an inline override tag.

    Args:
        text: Caption text.
        start: Event start time in seconds.
        end: Event end time in seconds.
        video_width: Value written to ``PlayResX``.
        video_height: Value written to ``PlayResY``.
        style: Name of the style sheet to use. Only "hormozi" is defined; any
            other value falls back to it instead of failing on an unbound
            variable.

    Returns:
        Path of the temporary ``.ass`` file. The caller is responsible for
        deleting it.
    """
    styles = {
        "hormozi": "\n".join([
            "[V4+ Styles]",
            "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding",
            "Style: Default,Montserrat,72,&H00FFFF,&H00FFFF,&H00000000,&H80000000,-1,0,0,0,100,100,0,0,1,3,2,2,10,10,80,1",
            "Style: Highlight,Montserrat,78,&H0000FFFF,&H0000FFFF,&H00000000,&H80000000,-1,0,0,0,110,110,0,0,1,4,3,2,10,10,80,1",
        ]),
    }
    style_def = styles.get(style, styles["hormozi"])

    def format_time(seconds):
        # Work in whole centiseconds so that a value such as 59.999 rolls over
        # to the next minute instead of being rendered as "…:60.00".
        total_cs = max(0, int(round(seconds * 100)))
        h, rem = divmod(total_cs, 360000)
        m, rem = divmod(rem, 6000)
        s, cs = divmod(rem, 100)
        return f"{h}:{m:02d}:{s:02d}.{cs:02d}"

    # Greedy word wrap: start a new line once adding the next word (plus one
    # separating space) would exceed 40 characters.
    lines = []
    current_line = []
    current_length = 0
    for word, is_highlight in highlight_keywords(text):
        word_len = len(word) + 1
        if current_length + word_len > 40 and current_line:
            lines.append(current_line)
            current_line = [(word, is_highlight)]
            current_length = word_len
        else:
            current_line.append((word, is_highlight))
            current_length += word_len
    if current_line:
        lines.append(current_line)

    # Keep at most two lines on screen; anything beyond that is dropped.
    lines = lines[:2]

    formatted_lines = []
    for line in lines:
        rendered = [
            f"{{\\1c&H00FFFF&\\fs78\\b1}}{word.upper()}{{\\r}}" if is_highlight else word
            for word, is_highlight in line
        ]
        formatted_lines.append(" ".join(rendered))

    # "\N" is the hard line break marker in ASS event text.
    final_text = "\\N".join(formatted_lines)

    script_info = "\n".join([
        "[Script Info]",
        "Title: Viral Subtitles",
        "ScriptType: v4.00+",
        "WrapStyle: 0",
        "PlayResX: " + str(video_width),
        "PlayResY: " + str(video_height),
        "ScaledBorderAndShadow: yes",
        "",
        "",
    ])
    events_header = "\n".join([
        "[Events]",
        "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text",
        "",
    ])

    fd, ass_path = tempfile.mkstemp(suffix=".ass")
    with os.fdopen(fd, 'w', encoding='utf-8') as f:
        f.write(script_info)
        f.write(style_def + "\n\n")
        f.write(events_header)
        f.write(f"Dialogue: 0,{format_time(start)},{format_time(end)},Default,,0,0,0,,{final_text}\n")

    return ass_path
427
+
428
def add_subtitles_to_video(input_video: str, output_video: str,
                           segments: List[Segment], style: str = "hormozi") -> bool:
    """
    Burn styled subtitles into a video using FFmpeg and a temporary ASS file.

    The frame size is read from ``input_video`` with OpenCV and used as the ASS
    play resolution. Each segment becomes one ``Dialogue`` event: words flagged
    by ``highlight_keywords`` are upper-cased and wrapped in an inline override
    tag, the text is wrapped at roughly 40 visible characters per line and
    truncated to two lines. FFmpeg then re-encodes the video with the ``ass``
    filter (libx264, CRF 18) and copies the audio stream.

    Args:
        input_video: Path of the source video.
        output_video: Path of the file to create (overwritten if present).
        segments: Transcript segments; ``start``, ``end`` and ``text`` are read
            from each one.
        style: Style label. It is only echoed in the log line; a single style
            sheet is written regardless of its value.

    Returns:
        True when FFmpeg succeeds, False when it exits with an error or the
        executable cannot be started. The temporary ``.ass`` file is removed in
        both cases.
    """
    # Read the frame size; release the capture even if a property read fails.
    cap = cv2.VideoCapture(input_video)
    try:
        video_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        video_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    finally:
        cap.release()

    def fmt_time(s):
        # Work in whole centiseconds so that a value such as 59.999 rolls over
        # to the next minute instead of being rendered as "…:60.00".
        total_cs = max(0, int(round(s * 100)))
        h, rem = divmod(total_cs, 360000)
        m, rem = divmod(rem, 6000)
        sec, cs = divmod(rem, 100)
        return f"{h}:{m:02d}:{sec:02d}.{cs:02d}"

    highlight_open = "{\\1c&H0000FFFF&\\fs76\\b1}"
    highlight_close = "{\\r}"

    header = "\n".join([
        "[Script Info]",
        "Title: Viral Subtitles",
        "ScriptType: v4.00+",
        "WrapStyle: 0",
        f"PlayResX: {video_width}",
        f"PlayResY: {video_height}",
        "ScaledBorderAndShadow: yes",
        "",
        "[V4+ Styles]",
        "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding",
        "Style: Default,Montserrat,68,&H00FFFF00,&H00FFFF00,&H00000000,&H80000000,-1,0,0,0,100,100,0,0,1,3,2,2,10,10,60,1",
        "",
        "[Events]",
        "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text",
        "",
    ])

    fd, ass_path = tempfile.mkstemp(suffix=".ass")
    try:
        with os.fdopen(fd, 'w', encoding='utf-8') as f:
            f.write(header)

            for seg in segments:
                lines = []
                current = []
                length = 0

                for word, is_highlight in highlight_keywords(seg.text):
                    if is_highlight:
                        shown = word.upper()
                        rendered = f"{highlight_open}{shown}{highlight_close}"
                    else:
                        shown = word
                        rendered = word

                    # Only the visible characters count towards the line
                    # width; the override tags take no room on screen.
                    w_len = len(shown) + 1
                    if length + w_len > 40 and current:
                        lines.append(" ".join(current))
                        current = [rendered]
                        length = w_len
                    else:
                        current.append(rendered)
                        length += w_len

                if current:
                    lines.append(" ".join(current))

                # "\N" is the ASS hard line break; keep at most two lines.
                final_text = "\\N".join(lines[:2])

                f.write(
                    f"Dialogue: 0,{fmt_time(seg.start)},{fmt_time(seg.end)},"
                    f"Default,,0,0,0,,{final_text}\n"
                )

        print(f"[legendas] Aplicando estilo {style}...")

        # Inside a filter description ':' separates options and '\' escapes,
        # so normalise the separators and escape the drive-letter colon.
        ass_path_escaped = ass_path.replace('\\', '/').replace(':', '\\:')

        cmd = [
            "ffmpeg", "-y",
            "-i", input_video,
            "-vf", f"ass={ass_path_escaped}",
            "-c:v", "libx264",
            "-preset", "medium",
            "-crf", "18",
            "-c:a", "copy",
            "-movflags", "+faststart",
            output_video
        ]

        subprocess.run(cmd, check=True, capture_output=True)
        print(f"[legendas] ✅ Concluído: {output_video}")
        return True
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        # FileNotFoundError covers a missing ffmpeg executable, so callers
        # still get the documented boolean instead of an exception.
        print(f"[legendas] ❌ Erro: {e}")
        return False
    finally:
        # Best-effort cleanup: a failure to delete the temp file must not mask
        # the result of the encode.
        try:
            Path(ass_path).unlink(missing_ok=True)
        except OSError:
            pass
538
+
539
def score_segment_virality(seg: Segment, idx: int, total: int) -> float:
    """
    Heuristically rate how "viral" a transcript segment is.

    Points are accumulated from independent signals found in the lower-cased
    segment text (hooks, impact phrases, negations, imperatives, digits), from
    the segment duration, from its relative position ``idx / total`` and from
    whether the text ends on a closing token.

    Args:
        seg: Segment providing ``text``, ``start`` and ``end``.
        idx: Index of the segment within the transcript.
        total: Number of segments in the transcript.

    Returns:
        The accumulated score as a float (higher is better).
    """
    lowered = seg.text.lower()
    points = 0.0

    def contains_any(needles):
        return any(needle in lowered for needle in needles)

    # Hooks: questions and direct address.
    if contains_any(("?", "por que", "qual", "como", "você")):
        points += 15

    # Impact phrases: every phrase present adds its own bonus.
    impact_phrases = (
        "não dá", "problema", "esse é o", "imaginou", "é só",
        "mas", "porém", "entretanto", "então", "olha",
        "escuta", "presta atenção", "isso", "agora",
    )
    for phrase in impact_phrases:
        points += 8 if phrase in lowered else 0

    # Negation and contrast create tension.
    if contains_any(("não", "nunca", "jamais", "sem")):
        points += 5

    # Imperatives drive engagement.
    if contains_any(("tem que", "precisa", "deve", "faça", "veja")):
        points += 7

    # Any digit counts as "data".
    if any(ch.isdigit() for ch in lowered):
        points += 6

    # Duration sweet spot: 15-45 s scores best, 10-60 s is acceptable.
    length = seg.end - seg.start
    if 15 <= length <= 45:
        points += 20
    elif 10 <= length <= 60:
        points += 10

    # Prefer the middle of the video over its first and last fifth.
    relative_position = idx / max(1, total)
    if 0.2 <= relative_position <= 0.8:
        points += 10

    # Reward text that ends on a closing token rather than mid-sentence.
    if lowered.strip().endswith((".", "!", "?", "né", "tá")):
        points += 8

    return points
590
+
591
def find_viral_moments(segments: List[Segment], k: int = 5) -> List[Tuple[int, int, float]]:
    """
    Pick the best-scoring, non-overlapping runs of consecutive segments.

    Every run of 1 to 5 consecutive segments whose total span lies between 10
    and 60 seconds is scored as the sum of ``score_segment_virality`` over its
    members, plus a bonus when the combined text contains a question mark
    together with a connective word. Runs are then taken greedily from the
    highest score down, skipping any run that shares a segment index with one
    already taken.

    Args:
        segments: Transcript segments in chronological order.
        k: Selection stops once this many runs have been collected.

    Returns:
        A list of ``(start_idx, end_idx, score)`` tuples with inclusive
        indices, ordered from highest to lowest score.
    """
    count = len(segments)
    candidates = []

    for size in (1, 2, 3, 4, 5):
        for first in range(count - size + 1):
            last = first + size - 1
            chunk = segments[first:last + 1]
            span = chunk[-1].end - chunk[0].start

            # Discard runs that are too short or too long to be a clip.
            if span < 10 or span > 60:
                continue

            total_score = sum(
                score_segment_virality(member, first + offset, count)
                for offset, member in enumerate(chunk)
            )

            # Question plus connective is treated as a complete mini-narrative.
            joined = " ".join(member.text for member in chunk)
            joined_lower = joined.lower()
            has_connective = any(
                marker in joined_lower
                for marker in ("porque", "então", "mas", "porém")
            )
            if "?" in joined and has_connective:
                total_score += 15

            candidates.append((first, last, total_score, span))

    # Stable sort: equally scored runs keep their discovery order.
    ranked = sorted(candidates, key=lambda item: item[2], reverse=True)

    chosen = []
    taken = set()
    for first, last, total_score, _span in ranked:
        covered = range(first, last + 1)
        if taken.isdisjoint(covered):
            chosen.append((first, last, total_score))
            taken.update(covered)

        if len(chosen) >= k:
            break

    return chosen
639
 
640
  # ======================= GERAÇÃO DE CORTES =======================
641
 
642
  def generate_linear_cuts(video_file: str, segments: List[Segment], output_dir: str,
643
  min_len: float = 600, max_len: float = 900, ideal_len: float = 900,
644
  k: int = 2, gap_threshold: float = 0.60, pad: float = 0.08,
645
+ ar_mode: str = "Original", face_tracking: bool = False,
646
+ add_subtitles: bool = False) -> List[str]:
647
  if not segments:
648
  return []
649
 
650
  Path(output_dir).mkdir(parents=True, exist_ok=True)
651
  total_duration = segments[-1].end - segments[0].start
652
+ target_duration = min(max_len, max(min_len, total_duration / k))
653
 
654
  outputs = []
655
  current_start = segments[0].start
656
 
657
+ for i in range(k):
658
  target_end = current_start + target_duration
659
  best_end = target_end
660
 
 
672
  temp_file = Path(output_dir) / f"temp_linear_{i+1}.mp4"
673
  final_file = Path(output_dir) / f"cut_linear_{i+1}.mp4"
674
 
675
+ print(f"✂️ Corte {i+1}/{k}: {start_with_pad:.1f}s - {end_with_pad:.1f}s")
676
 
677
+ if extract_video_segment(video_file, str(temp_file), start_with_pad, end_with_pad):
 
678
  if ar_mode != "Original":
679
  if apply_aspect_ratio(str(temp_file), str(final_file), ar_mode, face_tracking):
680
+ temp_file.unlink()
681
  outputs.append(str(final_file))
682
  else:
683
+ temp_file.rename(final_file)
684
  outputs.append(str(final_file))
685
 
686
  current_start = best_end + gap_threshold
 
693
  min_len: float = 600, max_len: float = 900, ideal_len: float = 900,
694
  min_blocks: int = 3, max_blocks: int = 8, k: int = 2,
695
  gap_threshold: float = 0.60, pad: float = 0.08,
696
+ ar_mode: str = "Original", face_tracking: bool = False,
697
+ add_subtitles: bool = False) -> List[str]:
698
  if not segments or len(segments) < min_blocks:
699
  return []
700
 
701
  Path(output_dir).mkdir(parents=True, exist_ok=True)
702
  outputs = []
703
 
704
+ import random
705
+ for i in range(k):
706
  num_blocks = random.randint(min_blocks, min(max_blocks, len(segments)))
707
  step = max(1, len(segments) // num_blocks)
708
  selected_indices = [j * step for j in range(num_blocks)]
 
713
  block_file = Path(output_dir) / f"temp_creative_{i+1}_block_{j+1}.mp4"
714
  start = max(0, seg.start - pad)
715
  end = seg.end + pad
716
+
717
+ if extract_video_segment(video_file, str(block_file), start, end):
718
  block_files.append(str(block_file))
719
 
720
  if not block_files:
 
726
 
727
  if ar_mode != "Original":
728
  if apply_aspect_ratio(str(concat_file), str(final_file), ar_mode, face_tracking):
729
+ concat_file.unlink()
730
  outputs.append(str(final_file))
731
  else:
732
+ concat_file.rename(final_file)
733
  outputs.append(str(final_file))
734
 
735
  for bf in block_files:
 
737
 
738
  return outputs
739
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
740
  # ======================= INTERFACE GRADIO =======================
741
 
742
  SPACE_OUT = Path("outputs")
743
  SPACE_OUT.mkdir(exist_ok=True, parents=True)
744
 
745
def do_transcribe(video_file, model_size):
    """
    UI callback: transcribe the selected video and build a status message.

    Args:
        video_file: Video value supplied by the UI, or None when nothing is
            selected.
        model_size: Model size forwarded to ``transcribe``.

    Returns:
        A ``(segments, status_text)`` pair. The status text contains the number
        of segments and a preview of the first 12; with no video selected the
        pair is an empty list and a prompt to choose one.
    """
    if video_file is None:
        return [], "Selecione um vídeo."

    segs = transcribe(video_file, model_size=model_size)

    # Preview only the first dozen segments to keep the status box short.
    preview_lines = []
    for s in segs[:12]:
        preview_lines.append(f"[{s.start:.1f}–{s.end:.1f}] {s.text}")
    preview = "\n".join(preview_lines)

    status = f"Transcrição ok. Segmentos: {len(segs)}\n\nPrévia:\n{preview}"
    return segs, status
751
 
 
753
  if not segs:
754
  return [], "Transcreva antes de cortar."
755
  workdir = SPACE_OUT / (out_subdir or "cortes")
756
+ outs = generate_linear_cuts(video_file, segs, str(workdir), min_len=min_len, max_len=max_len,
757
+ ideal_len=ideal_len, k=k, gap_threshold=gap, pad=pad,
758
+ ar_mode=ar_mode, face_tracking=face_tracking)
 
759
  return [str(Path(p)) for p in outs], f"Gerados: {len(outs)} arquivo(s)."
760
 
761
def run_creative(segs, video_file, out_subdir, min_len, max_len, ideal_len, minb, maxb, k, gap, pad,
                 ar_mode, face_tracking, add_subs=False):
    """
    UI callback: generate the "creative" cuts from an existing transcription.

    Args:
        segs: Transcript segments held in the UI state; empty means the video
            has not been transcribed yet.
        video_file: Source video.
        out_subdir: Sub-folder of ``SPACE_OUT`` to write into ("cortes" when
            blank).
        min_len, max_len, ideal_len: Clip length settings, forwarded as-is.
        minb, maxb: Minimum / maximum number of blocks per clip.
        k: Number of clips to generate.
        gap, pad: Forwarded as ``gap_threshold`` and ``pad``.
        ar_mode: Aspect-ratio mode label.
        face_tracking: Whether to use face tracking when cropping.
        add_subs: Optional subtitle toggle, forwarded as ``add_subtitles``.
            The button wiring supplies this fourteenth input; without a slot
            for it the callback failed with a TypeError on every click.

    Returns:
        A ``(file_paths, status_text)`` pair for the file list and status box.
    """
    if not segs:
        return [], "Transcreva antes de cortar."

    workdir = SPACE_OUT / (out_subdir or "cortes")
    outs = generate_creative_cuts(
        video_file, segs, str(workdir),
        min_len=min_len, max_len=max_len, ideal_len=ideal_len,
        # Counts feed range() / random.randint() downstream, which reject
        # floats; numeric UI fields may deliver floats, so coerce here.
        min_blocks=int(minb), max_blocks=int(maxb), k=int(k),
        gap_threshold=gap, pad=pad, ar_mode=ar_mode,
        face_tracking=face_tracking,
        add_subtitles=bool(add_subs),
    )
    return [str(Path(p)) for p in outs], f"Gerados: {len(outs)} arquivo(s)."
770
 
771
  css = """
 
789
  gr.HTML("""
790
  <link href="https://fonts.googleapis.com/css2?family=Manrope:wght@400;600;800&display=swap" rel="stylesheet">
791
  <div style="text-align: center; padding: 24px 0;">
792
+ <h1>🎬 Editor de Cortes Automático</h1>
793
  <p style="color: #6b7280;">Gere cortes com rastreamento facial inteligente</p>
794
  </div>
795
  """)
 
800
  with gr.Row():
801
  model_size = gr.Dropdown(["tiny","base","small","medium"], value="small", label="Modelo Whisper")
802
  out_subdir = gr.Textbox(label="Pasta de saída", value="cortes")
803
+ transcribe_btn = gr.Button("🎙️ 1) Transcrever", variant="primary")
804
  transcript_preview = gr.Textbox(label="Status", lines=10)
805
 
806
  with gr.Column():
807
+ with gr.Tab("✂️ Cortes Simples"):
808
  with gr.Row():
809
  min_len = gr.Number(value=600, label="Min (s)")
810
  max_len = gr.Number(value=900, label="Max (s)")
 
816
  pad = gr.Number(value=0.08, label="Pad")
817
  ar_mode = gr.Dropdown(["Original","Vertical 9:16","Quadrado 1:1","Retrato 4:5"],
818
  value="Original", label="Formato")
819
+ face_tracking = gr.Checkbox(label="👤 Rastreamento facial", value=True)
820
+ go_linear = gr.Button("🚀 2) Gerar Cortes", variant="primary")
821
  out_linear = gr.Files(label="Arquivos gerados")
822
  status_linear = gr.Textbox(label="Status", lines=2)
823
 
824
+ with gr.Tab("🎨 Cortes Criativos"):
825
  with gr.Row():
826
  minb = gr.Number(value=3, label="Blocos min")
827
  maxb = gr.Number(value=8, label="Blocos max")
 
831
  pad2 = gr.Number(value=0.08, label="Pad")
832
  ar_mode2 = gr.Dropdown(["Original","Vertical 9:16","Quadrado 1:1","Retrato 4:5"],
833
  value="Original", label="Formato")
834
+ face_tracking2 = gr.Checkbox(label="👤 Rastreamento facial", value=True)
835
+ go_creative = gr.Button("🎬 3) Gerar Criativos", variant="primary")
836
  out_creative = gr.Files(label="Arquivos gerados")
837
  status_creative = gr.Textbox(label="Status", lines=2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
838
 
839
  segs_state = gr.State([])
840
 
841
  transcribe_btn.click(do_transcribe, inputs=[video, model_size], outputs=[segs_state, transcript_preview])
842
+ go_linear.click(run_linear, inputs=[segs_state, video, out_subdir, min_len, max_len, ideal_len, k, gap, pad, ar_mode, face_tracking, add_subs],
843
  outputs=[out_linear, status_linear])
844
+ go_creative.click(run_creative, inputs=[segs_state, video, out_subdir, min_len, max_len, ideal_len, minb, maxb, k2, gap2, pad2, ar_mode2, face_tracking2, add_subs2],
845
  outputs=[out_creative, status_creative])
 
 
846
 
847
  if __name__ == "__main__":
848
+ demo.launch()