Spaces:

BBBAKERY
/

ustwo-api

Sleeping

App Files Files Community

ustwo-api / src /common /phone_simulator.py

asdfasdfqrqwer

Deploy from GitHub 2026-04-23T03:56:31Z

c857b85 2 months ago

Raw

History Blame Contribute Delete

4.57 kB

	"""
	전화 통화 품질 시뮬레이션 (PSTN)

	깨끗한 녹음 오디오를 전화 통화 품질로 변환하여
	AI Hub 등의 스튜디오 녹음 데이터를 학습용 통화 데이터로 전처리한다.

	3단계 처리:
	1. 밴드패스 필터링 (300–3400 Hz) — ITU-T G.712
	2. 8 kHz 다운샘플링 (anti-aliasing 포함)
	3. G.711 비선형 양자화 (A-law / μ-law companding)
	"""

	import audioop
	import random
	from enum import Enum

	import numpy as np
	import scipy.signal as signal


	class CompandingType(str, Enum):
	    """G.711 companding law used for the quantization step."""

	    # PSTN standard in Korea / Europe / Asia
	    ALAW = "alaw"
	    # PSTN standard in North America
	    ULAW = "ulaw"
	    # Pick one of the two at random (adds variety to training data)
	    RANDOM = "random"


	class PhoneSimulator:
	    """Convert clean recordings into telephone-call (PSTN) quality audio.

	    Processing runs in three stages:

	    1. Band-pass filtering to the 300-3400 Hz voice band (ITU-T G.712).
	    2. Resampling to 8 kHz.
	    3. G.711 companding (A-law / mu-law encode -> decode round trip).
	    """

	    # PSTN standard parameters
	    PSTN_LOW_FREQ = 300.0  # Hz - ITU-T G.712 lower band edge
	    PSTN_HIGH_FREQ = 3400.0  # Hz - ITU-T G.712 upper band edge
	    PSTN_SAMPLE_RATE = 8000  # Hz - G.711 standard sample rate
	    FILTER_ORDER = 5  # Butterworth filter order

	    def __init__(self, companding: CompandingType = CompandingType.RANDOM):
	        """
	        Args:
	            companding: Quantization law. With RANDOM, A-law or mu-law is
	                chosen at random on every companding pass.
	        """
	        self.companding = companding

	    def process(self, audio: np.ndarray, sr: int) -> tuple[np.ndarray, int]:
	        """Apply the full telephone-quality simulation.

	        Args:
	            audio: Mono (1-D) numpy array, expected range -1.0 to 1.0.
	            sr: Sample rate of ``audio`` in Hz.

	        Returns:
	            Tuple of (processed float32 audio, new sample rate = 8000).

	        Raises:
	            ValueError: If ``audio`` is not one-dimensional, or ``sr`` is
	                not positive or is too low for the band-pass filter.
	        """
	        if audio.ndim != 1:
	            raise ValueError(f"Mono audio expected, got shape {audio.shape}")
	        if sr <= 0:
	            raise ValueError(f"Sample rate must be positive, got {sr}")

	        # Nothing to filter or resample; keep the documented return contract.
	        if audio.size == 0:
	            return np.zeros(0, dtype=np.float32), self.PSTN_SAMPLE_RATE

	        # Step 1: band-pass filtering (300-3400 Hz)
	        audio = self._bandpass_filter(audio, sr)

	        # Step 2: resample to 8 kHz
	        audio = self._downsample(audio, sr)

	        # Step 3: G.711 companding (encode -> decode round trip)
	        audio = self._compand(audio)

	        return audio, self.PSTN_SAMPLE_RATE

	    def _bandpass_filter(self, audio: np.ndarray, sr: int) -> np.ndarray:
	        """Band-limit ``audio`` to the PSTN voice band.

	        A Butterworth band-pass of order ``FILTER_ORDER`` is applied forward
	        and backward (zero phase). The filter is designed as second-order
	        sections because the transfer-function (b, a) form of a band-pass
	        of this order is numerically fragile.

	        Args:
	            audio: 1-D input signal.
	            sr: Sample rate of ``audio`` in Hz.

	        Returns:
	            Filtered signal as float32.

	        Raises:
	            ValueError: If ``sr`` is too low to place the band edges below
	                the Nyquist frequency.
	        """
	        if audio.size == 0:
	            return audio.astype(np.float32)

	        nyq = sr / 2.0
	        low = self.PSTN_LOW_FREQ / nyq
	        high = self.PSTN_HIGH_FREQ / nyq

	        # The upper edge cannot sit at or above Nyquist; clamp just below it.
	        if high >= 1.0:
	            high = 0.99
	        if not 0.0 < low < high:
	            raise ValueError(
	                f"Sample rate {sr} Hz is too low for a "
	                f"{self.PSTN_LOW_FREQ:g}-{self.PSTN_HIGH_FREQ:g} Hz band-pass"
	            )

	        sos = signal.butter(
	            self.FILTER_ORDER, [low, high], btype="band", output="sos"
	        )

	        # Forward-backward filtering pads both ends of the signal, and the
	        # pad length must stay below the signal length. Shrink it for very
	        # short clips instead of letting scipy raise.
	        default_padlen = 3 * (2 * sos.shape[0] + 1)
	        padlen = min(default_padlen, max(audio.size - 2, 0))
	        filtered = signal.sosfiltfilt(sos, audio, padlen=padlen)
	        return filtered.astype(np.float32)

	    def _downsample(self, audio: np.ndarray, sr: int) -> np.ndarray:
	        """Resample ``audio`` from ``sr`` to ``PSTN_SAMPLE_RATE``.

	        Uses ``scipy.signal.resample_poly``, which applies its own
	        anti-aliasing filter as part of the rational resampling.

	        Args:
	            audio: 1-D input signal.
	            sr: Current sample rate in Hz (integer).

	        Returns:
	            ``audio`` unchanged when it is already at the target rate,
	            otherwise the resampled signal as float32.
	        """
	        if sr == self.PSTN_SAMPLE_RATE:
	            return audio

	        # Reduce the rate ratio by the GCD to get the smallest up/down factors.
	        gcd = np.gcd(sr, self.PSTN_SAMPLE_RATE)
	        up = self.PSTN_SAMPLE_RATE // gcd
	        down = sr // gcd
	        return signal.resample_poly(audio, up, down).astype(np.float32)

	    def _compand(self, audio: np.ndarray) -> np.ndarray:
	        """Run a G.711 A-law / mu-law encode -> decode round trip.

	        Compressing 16-bit samples to 8 bits and expanding them again
	        introduces the non-linear quantization noise that gives phone audio
	        its characteristic rough timbre.

	        Args:
	            audio: 1-D float signal; values are scaled by 32767 and clipped
	                to the int16 range before encoding.

	        Returns:
	            Decoded signal as float32, scaled back by 1 / 32767.
	        """
	        # NOTE: ``audioop`` was deprecated in Python 3.11 and removed from the
	        # standard library in 3.13 (PEP 594); this method needs it available.

	        # Resolve which companding law to use for this call.
	        method = self.companding
	        if method == CompandingType.RANDOM:
	            method = random.choice([CompandingType.ALAW, CompandingType.ULAW])

	        # float -> 16-bit PCM bytes
	        pcm16 = np.clip(audio * 32767, -32768, 32767).astype(np.int16)
	        raw_bytes = pcm16.tobytes()

	        # encode (16 bit -> 8 bit) then decode (8 bit -> 16 bit)
	        if method == CompandingType.ALAW:
	            compressed = audioop.lin2alaw(raw_bytes, 2)
	            decompressed = audioop.alaw2lin(compressed, 2)
	        else:
	            compressed = audioop.lin2ulaw(raw_bytes, 2)
	            decompressed = audioop.ulaw2lin(compressed, 2)

	        # 16-bit PCM -> float32
	        decoded = np.frombuffer(decompressed, dtype=np.int16)
	        return decoded.astype(np.float32) / 32767.0