Spaces:
Paused
Paused
Upload app.py
Browse files
app.py
CHANGED
|
@@ -99,12 +99,11 @@ def background_process(job_id: str, req: ProcessRequest):
|
|
| 99 |
file=audio_file,
|
| 100 |
model="whisper-1",
|
| 101 |
response_format="verbose_json",
|
| 102 |
-
timestamp_granularities=["segment"]
|
| 103 |
)
|
| 104 |
segments = transcript.segments
|
| 105 |
print(f"[{job_id}] Whisper analysis complete. Found {len(segments)} segments.")
|
| 106 |
except Exception as e:
|
| 107 |
-
# Catch common JSON decoding errors from OpenAI/Network here
|
| 108 |
print(f"[{job_id}] OpenAI/JSON Error: {traceback.format_exc()}")
|
| 109 |
raise Exception(f"OpenAI Analysis Error: {str(e)}")
|
| 110 |
|
|
@@ -112,27 +111,35 @@ def background_process(job_id: str, req: ProcessRequest):
|
|
| 112 |
raise Exception("No speech detected in video")
|
| 113 |
|
| 114 |
# 5. Slice Video and Upload
|
| 115 |
-
print(f"[{job_id}] Step 5: Starting slice loop...")
|
| 116 |
processed_slices = []
|
| 117 |
total_segments = len(segments)
|
| 118 |
|
| 119 |
-
# Reduced buffers to avoid repetition while maintaining clean cuts
|
| 120 |
-
BUFFER_START = 0.05
|
| 121 |
-
BUFFER_END = 0.2
|
| 122 |
-
|
| 123 |
for i, segment in enumerate(segments):
|
| 124 |
orig_start = segment.start
|
| 125 |
orig_end = segment.end
|
| 126 |
|
| 127 |
-
#
|
| 128 |
-
|
| 129 |
-
# Lookbehind to avoid overlapping with previous segment
|
| 130 |
-
prev_end = segments[i-1].end if i > 0 else 0
|
| 131 |
|
| 132 |
-
#
|
| 133 |
-
|
| 134 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
|
|
|
|
|
|
| 136 |
text = segment.text.strip()
|
| 137 |
duration = end - start
|
| 138 |
|
|
@@ -145,11 +152,14 @@ def background_process(job_id: str, req: ProcessRequest):
|
|
| 145 |
output_path = temp_dir / output_filename
|
| 146 |
|
| 147 |
try:
|
| 148 |
-
# Precise Slicing
|
|
|
|
|
|
|
| 149 |
subprocess.run([
|
| 150 |
"ffmpeg", "-ss", str(start), "-i", str(video_path), "-t", str(duration), "-y",
|
| 151 |
"-c:v", "libx264", "-preset", "ultrafast", "-crf", "28",
|
| 152 |
-
"-c:a", "aac", "-b:a", "128k", "-
|
|
|
|
| 153 |
str(output_path)
|
| 154 |
], check=True, capture_output=True)
|
| 155 |
except subprocess.CalledProcessError as e:
|
|
|
|
| 99 |
file=audio_file,
|
| 100 |
model="whisper-1",
|
| 101 |
response_format="verbose_json",
|
| 102 |
+
timestamp_granularities=["segment", "word"]
|
| 103 |
)
|
| 104 |
segments = transcript.segments
|
| 105 |
print(f"[{job_id}] Whisper analysis complete. Found {len(segments)} segments.")
|
| 106 |
except Exception as e:
|
|
|
|
| 107 |
print(f"[{job_id}] OpenAI/JSON Error: {traceback.format_exc()}")
|
| 108 |
raise Exception(f"OpenAI Analysis Error: {str(e)}")
|
| 109 |
|
|
|
|
| 111 |
raise Exception("No speech detected in video")
|
| 112 |
|
| 113 |
# 5. Slice Video and Upload
|
| 114 |
+
print(f"[{job_id}] Step 5: Starting intelligent slice loop...")
|
| 115 |
processed_slices = []
|
| 116 |
total_segments = len(segments)
|
| 117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
for i, segment in enumerate(segments):
|
| 119 |
orig_start = segment.start
|
| 120 |
orig_end = segment.end
|
| 121 |
|
| 122 |
+
# Intelligent Midpoint Slicing:
|
| 123 |
+
# We split the silence between segments 50/50, but with safety caps.
|
|
|
|
|
|
|
| 124 |
|
| 125 |
+
# 5.1 Calculate End Padding (Next Segment Gap)
|
| 126 |
+
if i + 1 < total_segments:
|
| 127 |
+
gap_next = segments[i+1].start - orig_end
|
| 128 |
+
# Split gap, ensure at least 0.05s overlap if tight, cap at 0.3s
|
| 129 |
+
end_padding = max(0.05, min(0.3, gap_next / 2))
|
| 130 |
+
else:
|
| 131 |
+
end_padding = 0.5 # Tail for the last segment
|
| 132 |
+
|
| 133 |
+
# 5.2 Calculate Start Padding (Previous Segment Gap)
|
| 134 |
+
if i > 0:
|
| 135 |
+
gap_prev = orig_start - segments[i-1].end
|
| 136 |
+
# Split gap, ensure at least 0.05s overlap if tight, cap at 0.1s
|
| 137 |
+
start_padding = max(0.05, min(0.1, gap_prev / 2))
|
| 138 |
+
else:
|
| 139 |
+
start_padding = 0.1 # Lead-in for the first segment
|
| 140 |
|
| 141 |
+
start = max(0, orig_start - start_padding)
|
| 142 |
+
end = orig_end + end_padding
|
| 143 |
text = segment.text.strip()
|
| 144 |
duration = end - start
|
| 145 |
|
|
|
|
| 152 |
output_path = temp_dir / output_filename
|
| 153 |
|
| 154 |
try:
|
| 155 |
+
# Precise Slicing with Audio Sync Optimization
|
| 156 |
+
# -ss before -i is fast; -t after -i is precise duration.
|
| 157 |
+
# -af aresample=async=1 ensures audio starts/ends correctly relative to the seek.
|
| 158 |
subprocess.run([
|
| 159 |
"ffmpeg", "-ss", str(start), "-i", str(video_path), "-t", str(duration), "-y",
|
| 160 |
"-c:v", "libx264", "-preset", "ultrafast", "-crf", "28",
|
| 161 |
+
"-c:a", "aac", "-b:a", "128k", "-af", "aresample=async=1",
|
| 162 |
+
"-map_metadata", "-1", "-avoid_negative_ts", "make_zero",
|
| 163 |
str(output_path)
|
| 164 |
], check=True, capture_output=True)
|
| 165 |
except subprocess.CalledProcessError as e:
|