# soundsol's picture
# Upload spatializer/utils/text_parser.py with huggingface_hub
# 343bd34 verified
"""Text parsing utilities for spatial directions."""
import re
from typing import Any, Dict, Optional, Tuple

import numpy as np
# Spatial ontology (from config)

# Direction keyword -> azimuth in degrees. Left-hand directions are positive,
# right-hand directions are negative. Each diagonal is listed in a hyphenated
# and a run-together spelling.
DIRECTION_BINS = {
    "front": 0,
    "front-left": 45,
    "frontleft": 45,
    "left": 90,
    "back-left": 135,
    "backleft": 135,
    "back": 180,
    "back-right": -135,
    "backright": -135,
    "right": -90,
    "front-right": -45,
    "frontright": -45,
}

# Elevation keyword -> elevation in degrees (three levels, several synonyms each).
ELEVATION_BINS = {
    # below the listener
    "down": -30,
    "below": -30,
    "lower": -30,
    # level with the listener
    "level": 0,
    "middle": 0,
    "center": 0,
    # above the listener
    "up": 30,
    "above": 30,
    "upper": 30,
}

# Distance keyword -> distance in metres (three ranges, two synonyms each).
DISTANCE_BINS = {
    "near": 1.0,
    "close": 1.0,
    "mid": 2.5,
    "medium": 2.5,
    "far": 5.0,
    "distant": 5.0,
}

# Room-size and reverb labels map onto themselves; the tables exist so the
# accepted vocabulary lives in one place. Note that "medium" is a valid word
# in both of them and in DISTANCE_BINS.
ROOM_SIZE_BINS = {label: label for label in ("small", "medium", "large")}
REVERB_BINS = {label: label for label in ("dry", "medium", "wet")}
def _find_keyword(text, table, ambiguous=(), suffix=""):
    """Return ``(value, match)`` for the earliest whole-word keyword of *table*.

    Args:
        text: Lower-cased text to search.
        table: Mapping of keyword -> value.
        ambiguous: Keywords that are only considered when no other keyword of
            *table* occurs in *text*.
        suffix: Regex fragment that must follow the keyword for it to count.

    Returns:
        ``(table value, re.Match)`` for the hit, or ``(None, None)`` if no
        keyword is present. Callers must test ``is None``: 0 is a valid value.
    """
    # Spelling variants collapse onto one canonical form, so a hit written as
    # "front left" or "front_left" resolves to the "front-left" entry.
    canonical = {re.sub(r"[-_\s]", "", key): value for key, value in table.items()}
    preferred = [key for key in table if key not in ambiguous]
    deferred = [key for key in table if key in ambiguous]
    for keywords in (preferred, deferred):
        if not keywords:
            continue
        # Longest keyword first: regex alternation takes the first alternative
        # that fits, and "front-left" must win over its own prefix "front".
        alternatives = "|".join(
            r"[-_\s]?".join(re.escape(part) for part in keyword.split("-"))
            for keyword in sorted(keywords, key=len, reverse=True)
        )
        # The lookarounds enforce whole words, so "right" is not found inside
        # "bright" and "mid" is not found inside "medium".
        pattern = rf"(?<![a-z0-9])({alternatives})(?![a-z0-9]){suffix}"
        match = re.search(pattern, text)
        if match is not None:
            return canonical[re.sub(r"[-_\s]", "", match.group(1))], match
    return None, None


def parse_spatial_text(text: str) -> Dict[str, Any]:
    """
    Parse spatial text description into parameters.

    Keywords are matched case-insensitively as whole words. When several
    keywords of one category occur, the earliest one in the text wins, and a
    compound such as "front-left" (also "front left" / "frontleft") is never
    mistaken for its parts. A size word directly followed by "room" is read as
    the room size only; a word that is valid in several categories (such as
    "medium") is used for a category only if no more specific keyword of that
    category is present.

    Args:
        text: Text like "front-left, up, near, small room, dry"
    Returns:
        Dictionary with keys:
        - azimuth_deg: float (default 0.0)
        - elevation_deg: float (default 0.0)
        - distance_m: float (default 2.5)
        - room_size: str (default "medium")
        - reverb_level: str (default "medium")
        Any category without a recognised keyword keeps its default.
    """
    text_lower = text.lower().strip()
    # Defaults
    params = {
        "azimuth_deg": 0.0,
        "elevation_deg": 0.0,
        "distance_m": 2.5,
        "room_size": "medium",
        "reverb_level": "medium",
    }

    # Parse direction (azimuth)
    azimuth, _ = _find_keyword(text_lower, DIRECTION_BINS)
    if azimuth is not None:
        params["azimuth_deg"] = float(azimuth)

    # Parse elevation
    elevation, _ = _find_keyword(text_lower, ELEVATION_BINS)
    if elevation is not None:
        params["elevation_deg"] = float(elevation)

    # Parse room size: an explicit "<size> room" phrase is the strongest signal.
    room_size, room_match = _find_keyword(
        text_lower, ROOM_SIZE_BINS, suffix=r"[-_\s]*rooms?(?![a-z0-9])"
    )
    remaining = text_lower
    if room_match is not None:
        # Blank the phrase out so "medium room" cannot also be read as a
        # distance or a reverb level further down.
        remaining = text_lower[:room_match.start()] + " " + text_lower[room_match.end():]
    else:
        room_size, _ = _find_keyword(
            text_lower,
            ROOM_SIZE_BINS,
            ambiguous=set(DISTANCE_BINS) | set(REVERB_BINS),
        )
    if room_size is not None:
        params["room_size"] = room_size

    # Parse distance
    distance, _ = _find_keyword(
        remaining, DISTANCE_BINS, ambiguous=set(ROOM_SIZE_BINS) | set(REVERB_BINS)
    )
    if distance is not None:
        params["distance_m"] = float(distance)

    # Parse reverb level
    reverb, _ = _find_keyword(
        remaining, REVERB_BINS, ambiguous=set(ROOM_SIZE_BINS) | set(DISTANCE_BINS)
    )
    if reverb is not None:
        params["reverb_level"] = reverb

    return params
def generate_random_spatial_text(
    rng: Optional[np.random.Generator] = None,
) -> Tuple[str, Dict[str, Any]]:
    """
    Generate random spatial text and corresponding parameters.

    Args:
        rng: Optional NumPy random generator to draw from, for reproducible
            sampling. When omitted, NumPy's global random state is used.
    Returns:
        (text, params_dict) where text has the form
        "<direction>, <elevation>, <distance>, <room_size> room, <reverb>"
        and params_dict holds azimuth_deg, elevation_deg, distance_m,
        room_size and reverb_level for those choices.
    """
    # Falling back to the module-level function keeps the draw sequence of
    # callers that seed via np.random.seed().
    choose = np.random.choice if rng is None else rng.choice

    # Random sampling. The draw order is part of the reproducible sequence.
    # str() turns NumPy string scalars into plain Python strings.
    # NOTE(review): every key of DIRECTION_BINS is a candidate, including both
    # spellings of each diagonal, so diagonals are drawn twice as often as
    # front/left/back/right. Confirm this is intended.
    direction = str(choose(list(DIRECTION_BINS)))
    elevation = str(choose(["down", "level", "up"]))
    distance = str(choose(["near", "mid", "far"]))
    room_size = str(choose(["small", "medium", "large"]))
    reverb = str(choose(["dry", "medium", "wet"]))

    # Build text
    text = f"{direction}, {elevation}, {distance}, {room_size} room, {reverb}"

    # Get params
    params = {
        "azimuth_deg": float(DIRECTION_BINS[direction]),
        "elevation_deg": float(ELEVATION_BINS[elevation]),
        "distance_m": DISTANCE_BINS[distance],
        "room_size": room_size,
        "reverb_level": reverb,
    }
    return text, params
def params_to_bins(params: Dict[str, Any]) -> Dict[str, int]:
    """
    Convert continuous parameters to bin indices.

    Each numeric parameter is assigned to the nearest bin centre; on a tie the
    lower index wins. Azimuth is compared on the circle, so -180 and 180 (or
    350 and -10) land in the same bin.

    Args:
        params: Dict with azimuth_deg, elevation_deg and distance_m (required),
            plus optional room_size and reverb_level (both default to "medium").
    Returns:
        Dict with plain-int bin indices:
        - direction_bin: 0-7 for 0, 45, 90, 135, 180, -135, -90, -45 degrees
        - elevation_bin: 0-2 for -30, 0, 30 degrees
        - distance_bin: 0-2 for 1.0, 2.5, 5.0
        - room_bin: 0-2 for small, medium, large
        - reverb_bin: 0-2 for dry, medium, wet
    Raises:
        KeyError: If azimuth_deg, elevation_deg or distance_m is missing.
        ValueError: If room_size or reverb_level is not one of the known labels.
    """
    # Direction bin (8 bins)
    azimuth = params["azimuth_deg"]
    direction_angles = [0, 45, 90, 135, 180, -135, -90, -45]
    # Wrap each difference into [-180, 180) before taking its magnitude;
    # a plain subtraction would put -170 a full 350 degrees away from 180.
    direction_bin = int(
        np.argmin(
            [abs((azimuth - a + 180.0) % 360.0 - 180.0) for a in direction_angles]
        )
    )

    # Elevation bin (3 bins)
    elevation = params["elevation_deg"]
    elevation_angles = [-30, 0, 30]
    elevation_bin = int(np.argmin([abs(elevation - a) for a in elevation_angles]))

    # Distance bin (3 bins)
    distance = params["distance_m"]
    distance_values = [1.0, 2.5, 5.0]
    distance_bin = int(np.argmin([abs(distance - d) for d in distance_values]))

    # Room size bin (3 bins)
    room_sizes = ["small", "medium", "large"]
    room_bin = room_sizes.index(params.get("room_size", "medium"))

    # Reverb bin (3 bins)
    reverb_levels = ["dry", "medium", "wet"]
    reverb_bin = reverb_levels.index(params.get("reverb_level", "medium"))

    return {
        "direction_bin": direction_bin,
        "elevation_bin": elevation_bin,
        "distance_bin": distance_bin,
        "room_bin": room_bin,
        "reverb_bin": reverb_bin,
    }
def bins_to_one_hot(bins: Dict[str, int]) -> np.ndarray:
    """
    Encode bin indices as one concatenated one-hot vector.

    Args:
        bins: Dict with the keys direction_bin, elevation_bin, distance_bin,
            room_bin and reverb_bin, each holding an index into its segment.
    Returns:
        Float vector of shape (20,), laid out as
        direction (8) | elevation (3) | distance (3) | room (3) | reverb (3),
        with a single 1.0 per segment.
    """
    # (key in `bins`, segment width), listed in output order.
    layout = (
        ("direction_bin", 8),
        ("elevation_bin", 3),
        ("distance_bin", 3),
        ("room_bin", 3),
        ("reverb_bin", 3),
    )
    segments = []
    for key, width in layout:
        # Each segment gets its own array, so an out-of-range index raises
        # instead of spilling into a neighbouring segment.
        segment = np.zeros(width)
        segment[bins[key]] = 1.0
        segments.append(segment)
    return np.concatenate(segments)