Update app.py
app.py CHANGED
@@ -220,53 +220,84 @@ def transcribe_video_with_speakers(video_path):
         }
         for segment in result["segments"]
     ]
 
     # Collect audio for each speaker
     speaker_audio = {}
-    for segment in result["segments"]:
+    logger.info("🔎 Start collecting valid audio segments per speaker...")
+
+    for idx, segment in enumerate(result["segments"]):
         speaker = segment["speaker"]
-        end = segment["end"]
         start = segment["start"]
+        end = segment["end"]
+
         if end > start and (end - start) > 0.05:  # Require >50ms duration
             if speaker not in speaker_audio:
                 speaker_audio[speaker] = [(start, end)]
             else:
                 speaker_audio[speaker].append((start, end))
+
+            logger.debug(f"Segment {idx}: Added to speaker {speaker} [{start:.2f}s → {end:.2f}s]")
+        else:
+            logger.warning(f"⚠️ Segment {idx} discarded: invalid duration ({start:.2f}s → {end:.2f}s)")
+
     # Collapse and truncate speaker audio
     speaker_sample_paths = {}
     audio_clip = AudioFileClip(speech_audio_path)
+
+    logger.info(f"🔎 Found {len(speaker_audio)} speakers with valid segments. Start creating speaker samples...")
+
     for speaker, segments in speaker_audio.items():
+        logger.info(f"🔹 Speaker {speaker}: {len(segments)} valid segments")
+
         speaker_clips = [audio_clip.subclip(start, end) for start, end in segments]
-        # Add a check to ensure speaker_clips is not empty
         if not speaker_clips:
-            logger.warning(f"No valid audio clips for speaker {speaker}")
+            logger.warning(f"⚠️ No valid audio clips for speaker {speaker}. Skipping sample creation.")
             continue
 
-        combined_clip = concatenate_audioclips(speaker_clips)
+        if len(speaker_clips) == 1:
+            logger.debug(f"Speaker {speaker}: Only one clip, skipping concatenation.")
+            combined_clip = speaker_clips[0]
+        else:
+            logger.debug(f"Speaker {speaker}: Concatenating {len(speaker_clips)} clips.")
+            combined_clip = concatenate_audioclips(speaker_clips)
+
         truncated_clip = combined_clip.subclip(0, min(30, combined_clip.duration))
+        logger.debug(f"Speaker {speaker}: Truncated to {truncated_clip.duration:.2f} seconds.")
 
         # Step 1: Get audio array from the clip
         fps = 16000  # target sampling rate
         audio_array = truncated_clip.to_soundarray(fps=fps)
 
-        # If stereo → convert to mono
         if audio_array.ndim == 2:
+            logger.debug(f"Speaker {speaker}: Stereo detected, converting to mono.")
             audio_array = np.mean(audio_array, axis=1)
 
         # Step 2: Apply denoising
         denoised_audio_array = denoise_audio_array(audio_array, sr=fps)
 
+        if isinstance(denoised_audio_array, (list, tuple)):
+            logger.debug(f"Speaker {speaker}: Denoising returned a sequence, concatenating.")
+            # Concatenate the arrays along the first axis (samples)
+            try:
+                denoised_audio_array = np.concatenate(denoised_audio_array, axis=0)
+            except ValueError as e:
+                logger.error(f"Failed to concatenate denoised audio segments for {speaker}: {e}")
+                # Decide how to handle this - maybe skip saving the sample?
+                continue  # Skip saving this sample if concatenation fails
+
         # Step 3: Save denoised audio directly
         sample_path = f"speaker_{speaker}_sample.wav"
         sf.write(sample_path, denoised_audio_array, fps)
 
         speaker_sample_paths[speaker] = sample_path
-        logger.info(f"Created sample for {speaker}: {sample_path}")
+        logger.info(f"✅ Created and saved sample for {speaker}: {sample_path}")
 
-    #
+    # Cleanup
+    logger.info("🧹 Closing audio clip and removing temporary files...")
     video.close()
     audio_clip.close()
     os.remove(speech_audio_path)
+    logger.info("✅ Finished processing all speaker samples.")
 
     return transcript_with_speakers, detected_language
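
Note: denoise_audio_array is defined elsewhere in app.py and is not part of this hunk. To illustrate why the new isinstance(denoised_audio_array, (list, tuple)) guard is useful, here is a minimal sketch of what such a helper could look like, assuming the noisereduce package. The name, signature, and optional chunking behavior are assumptions for illustration, not the app's actual implementation.

import numpy as np
import noisereduce as nr  # assumed dependency; any denoiser with the same array-in/array-out shape works

def denoise_audio_array(audio_array, sr=16000, chunk_seconds=None):
    """Hypothetical denoiser sketch: returns a single mono array, or a
    list of arrays when processing in fixed-size chunks (the case the
    caller's isinstance guard handles by concatenating)."""
    if chunk_seconds is None:
        # One pass over the whole sample
        return nr.reduce_noise(y=audio_array, sr=sr)
    step = int(chunk_seconds * sr)
    # Chunked pass: denoise each window separately and return the pieces
    return [nr.reduce_noise(y=audio_array[i:i + step], sr=sr)
            for i in range(0, len(audio_array), step)]

Under that assumption, np.concatenate(denoised_audio_array, axis=0) in the hunk stitches the chunks back into a single array before sf.write saves the speaker sample.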