Spaces:

Grinding
/

AudioSummarizer

Sleeping

App Files Community

Grinding committed on Aug 26, 2025

Commit

a617e93

verified ·

1 Parent(s): f16db0b

Update app/processing.py

Browse files

Files changed (1) hide show

app/processing.py +35 -39

app/processing.py CHANGED Viewed

@@ -65,6 +65,8 @@ async def transcribe_chunk(chunk_index: int, audio_chunk: AudioSegment):
         with io.BytesIO() as chunk_bytes:
             audio_chunk.export(chunk_bytes, format="wav")
             chunk_bytes.seek(0)
             transcription = await asyncio.to_thread(
                 groq_client.audio.transcriptions.create,
@@ -73,7 +75,6 @@ async def transcribe_chunk(chunk_index: int, audio_chunk: AudioSegment):
                 response_format="text"
             )
         logger.info(f"Finished transcription for chunk {chunk_index + 1}.")
-        # Return index along with text for sorting after parallel processing
         return (chunk_index, transcription)
     except Exception as e:
         logger.error(f"Error transcribing chunk {chunk_index + 1}: {e}")
@@ -87,58 +88,57 @@ async def run_pipeline(task_id: str, file_path: Path, tasks_db: dict):
     try:
         logger.info(f"Starting pipeline for task {task_id} with file {file_path}")
-        # Make chunk duration configurable via environment variable, default to 120 seconds
-        CHUNK_DURATION_S = int(os.getenv("CHUNK_DURATION_S", 120))
-        sr = librosa.get_samplerate(str(file_path))
-        target_sr = 16000  # Resample to 16kHz for Whisper compatibility and smaller file size
-        stream = librosa.stream(
-            str(file_path),
-            block_length=int(sr * CHUNK_DURATION_S),
-            frame_length=4096,
-            hop_length=1024
-        )
         transcription_tasks = []
-        for i, y_chunk in enumerate(stream):
-            logger.info(f"Queuing audio segment {i+1} for transcription...")
-            # Ensure y_chunk is 2D
-            if y_chunk.ndim == 1:
-                y_chunk = y_chunk.reshape(-1, 1)
-            # Mix to mono if multi-channel
-            if y_chunk.shape[1] > 1:
-                y_chunk = np.mean(y_chunk, axis=1, keepdims=True)
             # Resample to target_sr
-            if sr != target_sr:
-                y_chunk = librosa.resample(y_chunk, orig_sr=sr, target_sr=target_sr, axis=0)
-            current_sr = target_sr
             pcm_chunk = (y_chunk * 32767).astype(np.int16)
-            channels = y_chunk.shape[1]  # Should be 1
             audio_segment = AudioSegment(
                 pcm_chunk.tobytes(),
-                frame_rate=current_sr,
-                sample_width=pcm_chunk.dtype.itemsize,
-                channels=channels
             )
             transcription_tasks.append(transcribe_chunk(i, audio_segment))
-            # Clean up memory explicitly
-            del pcm_chunk, y_chunk, audio_segment
             gc.collect()
         # Run all transcription tasks in parallel
         logger.info(f"Running {len(transcription_tasks)} transcription tasks in parallel...")
         transcription_results = await asyncio.gather(*transcription_tasks)
-        # Sort results by their original index and join with newlines
         transcription_results.sort(key=lambda x: x[0])
         full_transcript = "\n".join([text for index, text in transcription_results])
@@ -150,22 +150,18 @@ async def run_pipeline(task_id: str, file_path: Path, tasks_db: dict):
         summary_task = asyncio.to_thread(
             groq_client.chat.completions.create,
-            model="qwen/qwen3-32b",
             messages=[{"role": "system", "content": SUMMARIZATION_SYSTEM_PROMPT}, {"role": "user", "content": full_transcript}],
             temperature=0.2,
-            reasoning_effort="default",
-            reasoning_format="hidden",
             max_tokens=1024
         )
         action_item_task = asyncio.to_thread(
             groq_client.chat.completions.create,
-            model="qwen/qwen3-32b",
             messages=[{"role": "system", "content": ACTION_ITEMS_SYSTEM_PROMPT}, {"role": "user", "content": full_transcript}],
             temperature=0.1,
-            reasoning_effort="default",
             max_tokens=1024,
-            reasoning_format="hidden",
             response_format={"type": "json_object"}
         )

         with io.BytesIO() as chunk_bytes:
             audio_chunk.export(chunk_bytes, format="wav")
             chunk_bytes.seek(0)
+            chunk_size = chunk_bytes.getbuffer().nbytes
+            logger.info(f"Chunk {chunk_index + 1} size: {chunk_size / (1024 * 1024):.2f} MB")
             transcription = await asyncio.to_thread(
                 groq_client.audio.transcriptions.create,
                 response_format="text"
             )
         logger.info(f"Finished transcription for chunk {chunk_index + 1}.")
         return (chunk_index, transcription)
     except Exception as e:
         logger.error(f"Error transcribing chunk {chunk_index + 1}: {e}")
     try:
         logger.info(f"Starting pipeline for task {task_id} with file {file_path}")
+        # Get total duration
+        duration = librosa.get_duration(filename=str(file_path))
+        orig_sr = librosa.get_samplerate(str(file_path))
+        logger.info(f"Audio duration: {duration:.2f} seconds, sample rate: {orig_sr}")
+        target_sr = 16000
+        max_chunk_mb = 19.5
+        max_chunk_bytes = max_chunk_mb * 1024 * 1024
+        bytes_per_second = target_sr * 2 * 1  # 16-bit mono
+        max_chunk_duration = (max_chunk_bytes - 1000) / bytes_per_second  # conservative
+        # Configurable base chunk duration, but cap at max
+        base_chunk_duration = int(os.getenv("CHUNK_DURATION_S", 300))  # default 5 minutes
+        chunk_duration = min(base_chunk_duration, max_chunk_duration)
+        logger.info(f"Using chunk duration: {chunk_duration:.2f} seconds")
+        num_chunks = int(np.ceil(duration / chunk_duration))
+        logger.info(f"Number of chunks: {num_chunks}")
         transcription_tasks = []
+        for i in range(num_chunks):
+            offset = i * chunk_duration
+            this_dur = min(chunk_duration, duration - offset)
+            logger.info(f"Loading audio chunk {i+1} (offset: {offset:.2f}s, duration: {this_dur:.2f}s)")
+            y_chunk, _ = librosa.load(str(file_path), sr=None, mono=True, offset=offset, duration=this_dur)
             # Resample to target_sr
+            if _ != target_sr:
+                y_chunk = librosa.resample(y_chunk, orig_sr=_, target_sr=target_sr)
             pcm_chunk = (y_chunk * 32767).astype(np.int16)
             audio_segment = AudioSegment(
                 pcm_chunk.tobytes(),
+                frame_rate=target_sr,
+                sample_width=2,
+                channels=1
             )
             transcription_tasks.append(transcribe_chunk(i, audio_segment))
+            # Clean up memory
+            del y_chunk, pcm_chunk, audio_segment
             gc.collect()
         # Run all transcription tasks in parallel
         logger.info(f"Running {len(transcription_tasks)} transcription tasks in parallel...")
         transcription_results = await asyncio.gather(*transcription_tasks)
+        # Sort results by index and join
         transcription_results.sort(key=lambda x: x[0])
         full_transcript = "\n".join([text for index, text in transcription_results])
         summary_task = asyncio.to_thread(
             groq_client.chat.completions.create,
+            model="llama3-70b-8192",
             messages=[{"role": "system", "content": SUMMARIZATION_SYSTEM_PROMPT}, {"role": "user", "content": full_transcript}],
             temperature=0.2,
             max_tokens=1024
         )
         action_item_task = asyncio.to_thread(
             groq_client.chat.completions.create,
+            model="llama3-70b-8192",
             messages=[{"role": "system", "content": ACTION_ITEMS_SYSTEM_PROMPT}, {"role": "user", "content": full_transcript}],
             temperature=0.1,
             max_tokens=1024,
             response_format={"type": "json_object"}
         )