Spaces:

rc19477
/

avse_dev_only

Runtime error

App Files Files Community

roychao19477 committed on Aug 4, 2025

Commit

2cb0aee

1 Parent(s): 0b3d66c

Test on lengths

Browse files

Files changed (1) hide show

app.py +68 -22

app.py CHANGED Viewed

@@ -7,6 +7,8 @@ import shutil
 import glob
 import gradio as gr
 # install packages for mamba
 def install_mamba():
     subprocess.run(shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.2/mamba_ssm-2.2.2+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"))
@@ -63,8 +65,6 @@ from moviepy import ImageSequenceClip
 from scipy.io import wavfile
 from avse_code import run_avse
-# Load face detector
-model = YOLO("yolov8n-face.pt").cuda()  # assumes CUDA available
 from decord import VideoReader, cpu
@@ -75,15 +75,18 @@ import spaces
 # Load model once globally
 #ckpt_path = "ckpts/ep215_0906.oat.ckpt"
 #model = AVSEModule.load_from_checkpoint(ckpt_path)
-avse_model = AVSEModule()
 #avse_state_dict = torch.load("ckpts/ep215_0906.oat.ckpt")
-avse_state_dict = torch.load("ckpts/ep220_0908.oat.ckpt")
-avse_model.load_state_dict(avse_state_dict, strict=True)
-avse_model.to("cuda")
-avse_model.eval()
 @spaces.GPU
 def run_avse_inference(video_path, audio_path):
     estimated = run_avse(video_path, audio_path)
     # Load audio
     #noisy, _ = sf.read(audio_path, dtype='float32')  # (N, )
@@ -101,15 +104,39 @@ def run_avse_inference(video_path, audio_path):
     ]).astype(np.float32)
     bg_frames /= 255.0
     # Combine into input dict (match what model.enhance expects)
-    data = {
-        "noisy_audio": noisy,
-        "video_frames": bg_frames[np.newaxis, ...]
-    }
     with torch.no_grad():
-        estimated = avse_model.enhance(data).reshape(-1)
     # Save result
     tmp_wav = audio_path.replace(".wav", "_enhanced.wav")
@@ -135,9 +162,32 @@ def extract_resampled_audio(video_path, target_sr=16000):
     torchaudio.save(resampled_audio_path, waveform, sample_rate=target_sr)
     return resampled_audio_path
 @spaces.GPU
 def extract_faces(video_file):
     cap = cv2.VideoCapture(video_file)
     fps = cap.get(cv2.CAP_PROP_FPS)
     frames = []
@@ -148,7 +198,8 @@ def extract_faces(video_file):
             break
         # Inference
-        results = model(frame, verbose=False)[0]
         for box in results.boxes:
             # version 1
             # x1, y1, x2, y2 = map(int, box.xyxy[0])
@@ -218,14 +269,7 @@ def extract_faces(video_file):
     enhanced_audio_path = run_avse_inference(output_path, audio_path)
-    from moviepy import VideoFileClip
-    flipped_output_path = os.path.join(tmpdir, "face_only_video_flipped.mp4")
-    flipped_clip = VideoFileClip(output_path, fps=25)
-    flipped_clip = flipped_clip.fx(vfx.mirror_y)
-    flipped_clip.write_videofile(flipped_output_path, codec="libx264", audio=False, fps=25)
-    return flipped_output_path, enhanced_audio_path
-    #return output_path, enhanced_audio_path
     #return output_path, audio_path
 iface = gr.Interface(
@@ -237,7 +281,9 @@ iface = gr.Interface(
         gr.Audio(label="Enhanced Audio", type="filepath")
     ],
     title="Face Detector",
-    description="Upload or record a video. We'll crop face regions and return a face-only video and its 16kHz audio."
 )
 iface.launch()

 import glob
 import gradio as gr
+os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
 # install packages for mamba
 def install_mamba():
     subprocess.run(shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.2/mamba_ssm-2.2.2+cu122torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"))
 from scipy.io import wavfile
 from avse_code import run_avse
 from decord import VideoReader, cpu
 # Load model once globally
 #ckpt_path = "ckpts/ep215_0906.oat.ckpt"
 #model = AVSEModule.load_from_checkpoint(ckpt_path)
 #avse_state_dict = torch.load("ckpts/ep215_0906.oat.ckpt")
+CHUNK_SIZE_AUDIO = 48000  # 3 sec at 16kHz
+CHUNK_SIZE_VIDEO = 75     # 25fps × 3 sec
 @spaces.GPU
 def run_avse_inference(video_path, audio_path):
+    avse_model = AVSEModule()
+    avse_state_dict = torch.load("ckpts/ep220_0908.oat.ckpt")
+    avse_model.load_state_dict(avse_state_dict, strict=True)
+    avse_model.to("cuda")
+    avse_model.eval()
     estimated = run_avse(video_path, audio_path)
     # Load audio
     #noisy, _ = sf.read(audio_path, dtype='float32')  # (N, )
     ]).astype(np.float32)
     bg_frames /= 255.0
+    audio_chunks = [
+        noisy[i:i + CHUNK_SIZE_AUDIO]
+        for i in range(0, len(noisy), CHUNK_SIZE_AUDIO)
+    ]
+    video_chunks = [
+        bg_frames[i:i + CHUNK_SIZE_VIDEO]
+        for i in range(0, len(bg_frames), CHUNK_SIZE_VIDEO)
+    ]
+    min_len = min(len(audio_chunks), len(video_chunks))  # sync length
     # Combine into input dict (match what model.enhance expects)
+    #data = {
+    #    "noisy_audio": noisy,
+    #    "video_frames": bg_frames[np.newaxis, ...]
+    #}
+    #with torch.no_grad():
+    #    estimated = avse_model.enhance(data).reshape(-1)
+    estimated_chunks = []
     with torch.no_grad():
+        for i in range(min_len):
+            chunk_data = {
+                "noisy_audio": audio_chunks[i],
+                "video_frames": video_chunks[i][np.newaxis, ...]
+            }
+            est = avse_model.enhance(chunk_data).reshape(-1)
+            estimated_chunks.append(est)
+    estimated = np.concatenate(estimated_chunks, axis=0)
     # Save result
     tmp_wav = audio_path.replace(".wav", "_enhanced.wav")
     torchaudio.save(resampled_audio_path, waveform, sample_rate=target_sr)
     return resampled_audio_path
+@spaces.GPU
+def yolo_detection(frame, verbose=False):
+    # Load face detector
+    model = YOLO("yolov8n-face.pt").cuda()  # assumes CUDA available
+    return model(frame, verbose=verbose)[0]
 @spaces.GPU
 def extract_faces(video_file):
+    if isinstance(video_input, dict):
+        video_path = video_input.get("path") or video_input.get("url")
+        if video_path.startswith("http"):
+            # download video
+            tmpdir = tempfile.mkdtemp()
+            ext = os.path.splitext(urlparse(video_path).path)[1]
+            local_path = os.path.join(tmpdir, "input_video" + ext)
+            with open(local_path, "wb") as f:
+                f.write(requests.get(video_path).content)
+            video_file = local_path
+        else:
+            video_file = video_path
+    else:
+        video_file = video_input  # string path from UI
     cap = cv2.VideoCapture(video_file)
     fps = cap.get(cv2.CAP_PROP_FPS)
     frames = []
             break
         # Inference
+        #results = model(frame, verbose=False)[0]
+        results = yolo_detection(frame, verbose=False)
         for box in results.boxes:
             # version 1
             # x1, y1, x2, y2 = map(int, box.xyxy[0])
     enhanced_audio_path = run_avse_inference(output_path, audio_path)
+    return output_path, enhanced_audio_path
     #return output_path, audio_path
 iface = gr.Interface(
         gr.Audio(label="Enhanced Audio", type="filepath")
     ],
     title="Face Detector",
+    description="Upload or record a video. We'll crop face regions and return a face-only video and its 16kHz audio.",
+    api_name="/predict"
 )
 iface.launch()