audio-spatializer / spatializer /utils /foa.py

Upload spatializer/utils/foa.py with huggingface_hub

96336ad verified 3 months ago

4.73 kB

	"""First-Order Ambisonics (FOA) utilities."""

	import numpy as np
	import torch
	from typing import Tuple


	def deg2rad(degrees: float) -> float:
	    """Return the angle ``degrees`` expressed in radians."""
	    half_turn_deg = 180.0
	    return degrees * np.pi / half_turn_deg


	def encode_foa_analytic(
	    mono: np.ndarray,
	    azimuth_deg: float,
	    elevation_deg: float,
	    normalization: str = "SN3D"
	) -> np.ndarray:
	    """
	    Pan a mono signal into four first-order ambisonic channels.

	    Args:
	        mono: Mono audio signal, shape (n_samples,)
	        azimuth_deg: Source azimuth in degrees (-180 to 180, 0=front)
	        elevation_deg: Source elevation in degrees (-90 to 90, 0=level)
	        normalization: "N3D" undoes the 1/sqrt(2) attenuation of W; any
	            other value (including the default "SN3D") keeps it

	    Returns:
	        FOA signal, shape (4, n_samples) with channels [W, X, Y, Z]
	    """
	    az_rad = deg2rad(azimuth_deg)
	    el_rad = deg2rad(elevation_deg)
	    cos_el = np.cos(el_rad)

	    # NOTE(review): W is attenuated by 1/sqrt(2) and the "N3D" branch below
	    # only removes that attenuation; X/Y/Z are never rescaled. This looks
	    # like the FuMa W weighting rather than textbook SN3D/N3D -- confirm the
	    # convention callers expect before relying on the labels.
	    omni = mono / np.sqrt(2)                       # W: omnidirectional
	    front_back = mono * np.cos(az_rad) * cos_el    # X: peaks at azimuth 0 (front)
	    lateral = mono * np.sin(az_rad) * cos_el       # Y: peaks at azimuth +/-90
	    vertical = mono * np.sin(el_rad)               # Z: peaks at elevation +/-90

	    foa = np.stack([omni, front_back, lateral, vertical], axis=0)

	    if normalization == "N3D":
	        # Scale W back up by sqrt(2), in place on the stacked array.
	        foa[0] *= np.sqrt(2)

	    return foa


	def encode_foa_analytic_torch(
	    mono: torch.Tensor,
	    azimuth_deg: float,
	    elevation_deg: float,
	    normalization: str = "SN3D"
	) -> torch.Tensor:
	    """
	    PyTorch version of FOA encoding.

	    The panning gains are computed once as Python floats and then applied
	    to ``mono``, so the source direction does not depend on the dtype of
	    ``mono`` (building the angles as tensors of ``mono.dtype`` would
	    truncate them to whole radians for integer input and lose precision
	    for half-precision input).

	    Args:
	        mono: Mono audio signal, shape (batch, n_samples) or (n_samples,)
	        azimuth_deg: Azimuth angle in degrees (0=front)
	        elevation_deg: Elevation angle in degrees (0=level)
	        normalization: "N3D" undoes the 1/sqrt(2) attenuation of W; any
	            other value (including the default "SN3D") keeps it

	    Returns:
	        FOA signal with channels [W, X, Y, Z], shape (batch, 4, n_samples),
	        or (4, n_samples) when ``mono`` was 1-D
	    """
	    theta = float(deg2rad(azimuth_deg))
	    phi = float(deg2rad(elevation_deg))

	    # Scalar gains; plain floats leave the dtype/device of `mono` in charge.
	    cos_theta = float(np.cos(theta))
	    sin_theta = float(np.sin(theta))
	    cos_phi = float(np.cos(phi))
	    sin_phi = float(np.sin(phi))
	    sqrt2 = float(np.sqrt(2))

	    # Add batch dim if needed
	    squeeze_output = mono.ndim == 1
	    if squeeze_output:
	        mono = mono.unsqueeze(0)

	    # NOTE(review): W is attenuated by 1/sqrt(2) and the "N3D" branch below
	    # only removes that attenuation; X/Y/Z are never rescaled. This looks
	    # like the FuMa W weighting rather than textbook SN3D/N3D -- confirm.
	    W = mono / sqrt2                      # omnidirectional
	    X = mono * cos_theta * cos_phi        # peaks at azimuth 0 (front)
	    Y = mono * sin_theta * cos_phi        # peaks at azimuth +/-90
	    Z = mono * sin_phi                    # peaks at elevation +/-90

	    foa = torch.stack([W, X, Y, Z], dim=1)  # (batch, 4, n_samples)

	    if normalization == "N3D":
	        foa[:, 0] *= sqrt2

	    if squeeze_output:
	        foa = foa.squeeze(0)

	    return foa


	def compute_intensity_vector(foa: np.ndarray) -> Tuple[float, float]:
	    """
	    Compute azimuth and elevation from FOA intensity vector.

	    The products W*X, W*Y and W*Z are averaged over all samples and the
	    resulting 3-vector (Ix, Iy, Iz) is converted to spherical angles.

	    Args:
	        foa: FOA signal, shape (4, n_samples) with channels [W, X, Y, Z]

	    Returns:
	        (azimuth_deg, elevation_deg)
	    """
	    W, X, Y, Z = foa

	    # Compute time-averaged intensity vector
	    Ix = np.mean(W * X)
	    Iy = np.mean(W * Y)
	    Iz = np.mean(W * Z)

	    # Convert to angles: azimuth from the horizontal components, elevation
	    # from the vertical component against the horizontal magnitude.
	    azimuth_rad = np.arctan2(Iy, Ix)
	    elevation_rad = np.arctan2(Iz, np.sqrt(Ix**2 + Iy**2))

	    azimuth_deg = azimuth_rad * 180.0 / np.pi
	    elevation_deg = elevation_rad * 180.0 / np.pi

	    return azimuth_deg, elevation_deg


	def compute_intensity_vector_torch(foa: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
	    """
	    PyTorch version of intensity vector computation.

	    The products W*X, W*Y and W*Z are averaged over the last (sample)
	    dimension and the resulting vector is converted to spherical angles.

	    Args:
	        foa: FOA signal, shape (batch, 4, n_samples) or (4, n_samples),
	            channels ordered [W, X, Y, Z]

	    Returns:
	        (azimuth_deg, elevation_deg) tensors of shape (batch,), or 0-d
	        tensors when ``foa`` was 2-D
	    """
	    # Add batch dim if needed
	    squeeze_output = foa.ndim == 2
	    if squeeze_output:
	        foa = foa.unsqueeze(0)

	    W, X, Y, Z = foa[:, 0], foa[:, 1], foa[:, 2], foa[:, 3]

	    # Compute time-averaged intensity vector
	    Ix = torch.mean(W * X, dim=-1)
	    Iy = torch.mean(W * Y, dim=-1)
	    Iz = torch.mean(W * Z, dim=-1)

	    # Convert to angles: azimuth from the horizontal components, elevation
	    # from the vertical component against the horizontal magnitude.
	    azimuth_rad = torch.atan2(Iy, Ix)
	    elevation_rad = torch.atan2(Iz, torch.sqrt(Ix**2 + Iy**2))

	    azimuth_deg = azimuth_rad * 180.0 / np.pi
	    elevation_deg = elevation_rad * 180.0 / np.pi

	    if squeeze_output:
	        azimuth_deg = azimuth_deg.squeeze(0)
	        elevation_deg = elevation_deg.squeeze(0)

	    return azimuth_deg, elevation_deg


	def foa_to_stereo_simple(foa: np.ndarray) -> np.ndarray:
	    """
	    Simple stereo downmix from FOA using W and the lateral channel Y.

	    The encoders in this file build X from cos(azimuth) and Y from
	    sin(azimuth) with azimuth 0 = front, so X is the front-back component
	    and Y is the one that separates the two sides. Mixing W with X would
	    pan a frontal source hard to one side and give a lateral source no
	    separation at all, hence Y is used here.

	    NOTE(review): assumes positive Y (positive azimuth) points to the
	    listener's left, the usual ambisonic convention -- swap the signs if
	    this project measures azimuth clockwise.

	    Args:
	        foa: FOA signal, shape (4, n_samples) with channels [W, X, Y, Z]

	    Returns:
	        Stereo signal, shape (2, n_samples), rows ordered [left, right]
	    """
	    # Unpacking all four rows keeps the "exactly 4 channels" requirement.
	    W, _, Y, _ = foa

	    # Simple stereo decode: L = W + Y, R = W - Y
	    left = (W + Y) / np.sqrt(2)
	    right = (W - Y) / np.sqrt(2)

	    return np.stack([left, right], axis=0)