|
|
""" |
|
|
Audio processing utilities for voice screening. |
|
|
Handles audio combining, resampling, and WAV export. |
|
|
""" |
|
|
import io
import logging
import struct
import sys
import wave
from array import array
from typing import Dict, List
|
|
|
|
|
logger = logging.getLogger(__name__)

# Audio stream format: 16-bit signed little-endian mono PCM at 24 kHz.
SAMPLE_RATE = 24000
BYTES_PER_SAMPLE = 2
_SAMPLE_MIN = -32768
_SAMPLE_MAX = 32767


def _pcm_to_wav(pcm_data: bytes) -> bytes:
    """Wrap raw 16-bit mono 24 kHz PCM bytes in a WAV container and return the file bytes."""
    wav_buffer = io.BytesIO()
    with wave.open(wav_buffer, 'wb') as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(BYTES_PER_SAMPLE)
        wav_file.setframerate(SAMPLE_RATE)
        wav_file.writeframes(pcm_data)
    return wav_buffer.getvalue()


def _place_chunks(all_chunks: List[Dict], session_start_time: float) -> List[Dict]:
    """Assign each chunk an absolute start sample in the output timeline.

    Chunks must arrive pre-sorted chronologically. Each stream ("user" /
    "agent") is a continuous audio feed, so a chunk may never start before
    the previous chunk of the same stream has ended: when its timestamp
    would overlap, it is pushed forward to the stream's current end.

    Args:
        all_chunks: Chronologically sorted dicts with 'timestamp', 'type',
            'data' and 'samples' keys.
        session_start_time: Timestamp that maps to sample 0.

    Returns:
        List of placement dicts with 'start_sample', 'data', 'samples', 'type'.
    """
    # Next free sample per stream; None until a stream's first chunk
    # fixes that stream's starting offset.
    stream_end = {"user": None, "agent": None}
    placements = []
    for chunk in all_chunks:
        stream = chunk["type"]
        offset_seconds = chunk["timestamp"] - session_start_time
        # Clamp pre-session timestamps to the start of the recording.
        start_sample = max(0, int(offset_seconds * SAMPLE_RATE))

        current_end = stream_end[stream]
        if current_end is not None and start_sample < current_end:
            # Continuous stream: never overlap the same stream with itself.
            start_sample = current_end

        placements.append({
            "start_sample": start_sample,
            "data": chunk["data"],
            "samples": chunk["samples"],
            "type": stream,
        })
        stream_end[stream] = start_sample + chunk["samples"]
    return placements


def _mix_placements(placements: List[Dict], total_samples: int) -> bytes:
    """Sum every placed chunk into one 16-bit PCM buffer, clipping to int16 range.

    Chunks are mixed in list order so that clipping interactions are
    deterministic. Returns little-endian PCM bytes of length
    total_samples * BYTES_PER_SAMPLE.
    """
    # Zero-initialized int16 accumulator; bulk conversion via array('h') is
    # far cheaper than per-sample struct.unpack/pack_into calls.
    mixed = array('h', bytes(total_samples * BYTES_PER_SAMPLE))
    for placement in placements:
        sample_count = placement["samples"]
        chunk = array('h')
        # Slice drops a trailing odd byte, matching the sample count.
        chunk.frombytes(placement["data"][:sample_count * BYTES_PER_SAMPLE])
        if sys.byteorder == 'big':
            chunk.byteswap()  # chunk data is little-endian on the wire
        for offset, value in enumerate(chunk, placement["start_sample"]):
            total = mixed[offset] + value
            if total > _SAMPLE_MAX:
                total = _SAMPLE_MAX
            elif total < _SAMPLE_MIN:
                total = _SAMPLE_MIN
            mixed[offset] = total
    if sys.byteorder == 'big':
        mixed.byteswap()  # WAV payload must be little-endian
    return mixed.tobytes()


def combine_and_export_audio(
    user_chunks: List[Dict],
    agent_chunks: List[Dict],
    session_start_time: float,
    session_id: str
) -> bytes:
    """
    Combine user and agent audio chunks and export as WAV file.

    Audio chunks are continuous streams - we concatenate them in order and mix
    based on when each stream actually started relative to session start.

    Args:
        user_chunks: List of dicts with 'timestamp' and 'data' (bytes)
        agent_chunks: List of dicts with 'timestamp' and 'data' (bytes)
        session_start_time: Session start timestamp for relative positioning
        session_id: The session ID for logging.

    Returns:
        bytes: WAV file data (16-bit mono PCM at 24 kHz; empty audio when
        there are no chunks or no samples).

    Raises:
        ValueError: If session_start_time is missing/falsy.
    """
    if not session_start_time:
        raise ValueError("Session start time not found")

    if not user_chunks and not agent_chunks:
        logger.warning(f"No audio chunks found for session {session_id}")
        return _pcm_to_wav(b'')

    logger.info(f"Using standard sample rate: {SAMPLE_RATE}Hz")

    # Tag every chunk with its stream and sample count (user first so ties
    # keep user-before-agent order under the stable sort), then interleave
    # both streams chronologically.
    all_chunks = [
        {
            "timestamp": chunk["timestamp"],
            "type": stream,
            "data": chunk["data"],
            "samples": len(chunk["data"]) // BYTES_PER_SAMPLE,
        }
        for stream, chunks in (("user", user_chunks), ("agent", agent_chunks))
        for chunk in chunks
    ]
    all_chunks.sort(key=lambda c: c["timestamp"])

    chunk_placements = _place_chunks(all_chunks, session_start_time)

    # Output length is the furthest sample any placement reaches.
    total_samples = max(
        (p["start_sample"] + p["samples"] for p in chunk_placements),
        default=0,
    )
    if total_samples == 0:
        logger.warning(f"No audio samples to export for session {session_id}")
        return _pcm_to_wav(b'')

    pcm_data = _mix_placements(chunk_placements, total_samples)

    duration_seconds = total_samples / SAMPLE_RATE
    user_chunk_count = sum(1 for p in chunk_placements if p["type"] == "user")
    agent_chunk_count = sum(1 for p in chunk_placements if p["type"] == "agent")
    logger.info(f"Combined audio: {user_chunk_count} user chunks, {agent_chunk_count} agent chunks, "
                f"total {total_samples} samples ({duration_seconds:.2f}s), sorted chronologically")

    return _pcm_to_wav(pcm_data)
|
|
|