Spaces:

MicroHealth
/

AV-to-transcripts

Paused

App Files Community

bluenevus committed on Apr 24, 2025

Commit

a80c887

verified ·

1 Parent(s): ffd5e97

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -29

app.py CHANGED Viewed

@@ -70,38 +70,30 @@ def download_audio_from_url(url):
 def transcribe_audio(audio_file):
     try:
         logger.info("Loading audio file...")
-        audio = AudioSegment.from_file(audio_file)
-        audio = audio.set_channels(1).set_frame_rate(16000)
-        audio_array = torch.tensor(audio.get_array_of_samples()).float()
-        logger.info(f"Audio duration: {len(audio) / 1000:.2f} seconds")
         logger.info("Starting transcription...")
-        input_features = whisper_processor(audio_array, sampling_rate=16000, return_tensors="pt").input_features.to(device)
-        # Create attention mask
-        attention_mask = torch.ones_like(input_features)
-        max_retries = 3
-        for attempt in range(max_retries):
-            # Generate with specific parameters
-            predicted_ids = whisper_model.generate(
-                input_features,
-                attention_mask=attention_mask,
-                language='en',
-                task='translate',
-                temperature=0.7,  # Adjust temperature for potentially better results
-                num_beams=5,  # Increase beam search for potentially better results
-                max_length=448,  # Increase max length to allow for longer transcriptions
-            )
             transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)
-            logger.info(f"Transcription attempt {attempt + 1} complete. Length: {len(transcription[0])} characters")
-            if len(transcription[0]) >= 10:
-                return transcription[0]
-            else:
-                logger.warning(f"Transcription too short on attempt {attempt + 1}: {transcription[0]}")
-        raise ValueError(f"Failed to generate a valid transcription after {max_retries} attempts")
     except Exception as e:
         logger.error(f"Error in transcribe_audio: {str(e)}")
         raise

 def transcribe_audio(audio_file):
     try:
         logger.info("Loading audio file...")
+        audio_input, sr = librosa.load(audio_file, sr=16000)
+        audio_input = audio_input.astype(np.float32)
+        logger.info(f"Audio duration: {len(audio_input) / sr:.2f} seconds")
+        chunk_length = 30 * sr
+        overlap = 5 * sr
+        transcriptions = []
         logger.info("Starting transcription...")
+        for i in range(0, len(audio_input), chunk_length - overlap):
+            chunk = audio_input[i:i+chunk_length]
+            input_features = whisper_processor(chunk, sampling_rate=16000, return_tensors="pt").input_features.to(device)
+            predicted_ids = whisper_model.generate(input_features)
             transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)
+            transcriptions.extend(transcription)
+            logger.info(f"Processed {i / sr:.2f} to {(i + chunk_length) / sr:.2f} seconds")
+        full_transcription = " ".join(transcriptions)
+        logger.info(f"Transcription complete. Full transcription length: {len(full_transcription)} characters")
+        logger.info("Applying speaker separation using Qwen...")
+        separated_transcript = separate_speakers(full_transcription)
+        return separated_transcript
     except Exception as e:
         logger.error(f"Error in transcribe_audio: {str(e)}")
         raise