Spaces:

Grinding
/

AudioSummarizer

Sleeping

App Files Files Community

Grinding committed on Aug 26, 2025

Commit

ae78221

verified ·

1 Parent(s): 50dc8da

Update app/processing.py

Browse files

Files changed (1) hide show

app/processing.py +24 -5

app/processing.py CHANGED Viewed

@@ -34,7 +34,7 @@ Instructions:
 2.  **Extract Key Decisions**: Pinpoint any decisions that were made, including the rationale behind them if available.
 3.  **Highlight Main Outcomes**: Detail the primary results or conclusions reached during the discussion.
 4.  **Structure the Output**: Present the summary in a clean, professional format. Use bullet points for clarity.
-5.  **Maintain Neutrality**: The summary should be objective and free of personal interpretation or bias and JSON.
 """
 ACTION_ITEMS_SYSTEM_PROMPT = """
 You are a highly specialized AI assistant tasked with identifying and extracting actionable tasks, commitments, and deadlines from a meeting or lecture transcript. Your output must be clear, concise, and formatted as a JSON object.
@@ -87,10 +87,12 @@ async def run_pipeline(task_id: str, file_path: Path, tasks_db: dict):
     try:
         logger.info(f"Starting pipeline for task {task_id} with file {file_path}")
-        # Make chunk duration configurable via environment variable, default to 3 minutes
         CHUNK_DURATION_S = int(os.getenv("CHUNK_DURATION_S", 120))
         sr = librosa.get_samplerate(str(file_path))
         stream = librosa.stream(
             str(file_path),
             block_length=int(sr * CHUNK_DURATION_S),
@@ -101,18 +103,35 @@ async def run_pipeline(task_id: str, file_path: Path, tasks_db: dict):
         transcription_tasks = []
         for i, y_chunk in enumerate(stream):
             logger.info(f"Queuing audio segment {i+1} for transcription...")
             pcm_chunk = (y_chunk * 32767).astype(np.int16)
             audio_segment = AudioSegment(
                 pcm_chunk.tobytes(),
-                frame_rate=sr,
                 sample_width=pcm_chunk.dtype.itemsize,
-                channels=1
             )
             transcription_tasks.append(transcribe_chunk(i, audio_segment))
             # Clean up memory explicitly
-            del pcm_chunk, y_chunk
             gc.collect()
         # Run all transcription tasks in parallel

 2.  **Extract Key Decisions**: Pinpoint any decisions that were made, including the rationale behind them if available.
 3.  **Highlight Main Outcomes**: Detail the primary results or conclusions reached during the discussion.
 4.  **Structure the Output**: Present the summary in a clean, professional format. Use bullet points for clarity.
+5.  **Maintain Neutrality**: The summary should be objective and free of personal interpretation or bias.
 """
 ACTION_ITEMS_SYSTEM_PROMPT = """
 You are a highly specialized AI assistant tasked with identifying and extracting actionable tasks, commitments, and deadlines from a meeting or lecture transcript. Your output must be clear, concise, and formatted as a JSON object.
     try:
         logger.info(f"Starting pipeline for task {task_id} with file {file_path}")
+        # Make chunk duration configurable via environment variable, default to 120 seconds
         CHUNK_DURATION_S = int(os.getenv("CHUNK_DURATION_S", 120))
         sr = librosa.get_samplerate(str(file_path))
+        target_sr = 16000  # Resample to 16kHz for Whisper compatibility and smaller file size
         stream = librosa.stream(
             str(file_path),
             block_length=int(sr * CHUNK_DURATION_S),
         transcription_tasks = []
         for i, y_chunk in enumerate(stream):
             logger.info(f"Queuing audio segment {i+1} for transcription...")
+            # Ensure y_chunk is 2D
+            if y_chunk.ndim == 1:
+                y_chunk = y_chunk.reshape(-1, 1)
+            # Mix to mono if multi-channel
+            if y_chunk.shape[1] > 1:
+                y_chunk = np.mean(y_chunk, axis=1, keepdims=True)
+            # Resample to target_sr
+            if sr != target_sr:
+                y_chunk = librosa.resample(y_chunk, orig_sr=sr, target_sr=target_sr, axis=0)
+            current_sr = target_sr
             pcm_chunk = (y_chunk * 32767).astype(np.int16)
+            channels = y_chunk.shape[1]  # Should be 1
             audio_segment = AudioSegment(
                 pcm_chunk.tobytes(),
+                frame_rate=current_sr,
                 sample_width=pcm_chunk.dtype.itemsize,
+                channels=channels
             )
             transcription_tasks.append(transcribe_chunk(i, audio_segment))
             # Clean up memory explicitly
+            del pcm_chunk, y_chunk, audio_segment
             gc.collect()
         # Run all transcription tasks in parallel