# Text2Sing-DiffSinger / text_processor.py
# Author: Vaishnavi0404
# Commit 726598b (verified) — "Create text_processor.py"
import re
import nltk
from nltk.tokenize import word_tokenize
import phonemizer
from phonemizer.backend import EspeakBackend
import numpy as np
class TextProcessor:
    """Convert raw text into a phoneme sequence with per-phoneme duration
    estimates and stress markers for singing synthesis.

    NOTE(review): requires the espeak backend for ``phonemizer`` and the
    NLTK ``punkt`` tokenizer plus POS-tagger data to be installed — confirm
    these are provisioned in the deployment environment.
    """

    # Compiled once at class-definition time; every method reuses these
    # instead of recompiling / re-looking-up patterns per call.
    _WHITESPACE_RE = re.compile(r'\s+')
    _SPECIAL_CHAR_RE = re.compile(r'[^a-z0-9\s.,!?\'"-]')
    _VOWEL_RE = re.compile(r'[aeiou]')
    _SONORANT_RE = re.compile(r'[lrmnw]')

    # Baseline phoneme duration in seconds; vowels and sonorants are
    # scaled up from this. Kept as a class constant so subclasses or
    # callers can tune overall tempo without touching the methods.
    BASE_DURATION = 0.1

    def __init__(self):
        # Initialize phonemizer with the US-English espeak backend.
        self.backend = EspeakBackend('en-us')

    def process(self, text):
        """Process text into phonemes with duration and stress markers.

        Args:
            text (str): Input text to be processed.

        Returns:
            tuple: (phonemes, durations, stress_markers) where
                phonemes is a space-separated phoneme string,
                durations is a list of seconds (may be longer than the
                phoneme count — see ``_estimate_durations``), and
                stress_markers is a 0/1 numpy array per phoneme.
        """
        text = self._clean_text(text)
        # Robustness: nothing to phonemize — return empty, consistently
        # typed results instead of handing "" to the espeak backend.
        if not text:
            return '', [], np.zeros(0)
        tokens = word_tokenize(text)
        phonemes = self._text_to_phonemes(text)
        durations = self._estimate_durations(tokens, phonemes)
        stress_markers = self._mark_stress(tokens, phonemes)
        return phonemes, durations, stress_markers

    def _clean_text(self, text):
        """Lowercase, collapse whitespace, and strip special characters
        while keeping punctuation that matters for phrasing."""
        text = text.lower()
        text = self._WHITESPACE_RE.sub(' ', text).strip()
        # Keep .,!?'"- : they drive pause insertion in _estimate_durations.
        text = self._SPECIAL_CHAR_RE.sub('', text)
        return text

    def _text_to_phonemes(self, text):
        """Convert text to a single space-separated phoneme string."""
        phonemes = self.backend.phonemize([text], strip=True)[0]
        # Normalize whatever spacing espeak emits to single spaces.
        phonemes = self._WHITESPACE_RE.sub(' ', phonemes).strip()
        return phonemes

    def _estimate_durations(self, tokens, phonemes):
        """Estimate per-phoneme durations (seconds) for singing.

        Heuristic: vowel-bearing phonemes get 2x the base duration,
        sonorant consonants (l, r, m, n, w) get 1.5x, everything else 1x.
        Punctuation tokens then append pause durations at the END of the
        list, so len(result) can exceed the phoneme count — callers that
        need a strict 1:1 phoneme alignment should slice accordingly.
        """
        base_duration = self.BASE_DURATION
        durations = []
        for phon in phonemes.split():
            if self._VOWEL_RE.search(phon):
                durations.append(base_duration * 2)
            elif self._SONORANT_RE.search(phon):
                durations.append(base_duration * 1.5)
            else:
                durations.append(base_duration)
        # Sentence-final punctuation gets a long pause, clause-internal a
        # shorter one. (Unused enumerate index from the original removed.)
        for token in tokens:
            if token in {'.', ',', '!', '?', ';', ':'}:
                durations.append(
                    base_duration * 3 if token in {'.', '!', '?'}
                    else base_duration * 1.5
                )
        return durations

    def _mark_stress(self, tokens, phonemes):
        """Mark which phonemes should be stressed in singing.

        Heuristic: for each content word (noun/verb/adjective/adverb longer
        than 2 chars), mark the first vowel-bearing phoneme inside an
        APPROXIMATE window of len(word) phonemes. This is a simplification;
        a real system needs forced alignment between words and phonemes.

        Returns:
            numpy.ndarray: 0/1 array, one entry per phoneme.
        """
        phoneme_list = phonemes.split()
        stress_markers = np.zeros(len(phoneme_list))

        # POS-tag tokens to find content words (N*, V*, J*, R* tags).
        tagged = nltk.pos_tag(tokens)
        content_word_indices = {
            i for i, (word, tag) in enumerate(tagged)
            if tag.startswith(('N', 'V', 'J', 'R')) and len(word) > 2
        }

        # Walk an approximate phoneme cursor: assume each word spans
        # len(word) phonemes. (Unused word_idx local from the original
        # removed.)
        phoneme_idx = 0
        for i, word in enumerate(tokens):
            if i in content_word_indices:
                word_phonemes = len(word)  # approximation, see docstring
                for j in range(word_phonemes):
                    if phoneme_idx + j < len(phoneme_list):
                        phon = phoneme_list[phoneme_idx + j]
                        if self._VOWEL_RE.search(phon):
                            stress_markers[phoneme_idx + j] = 1
                            break
            phoneme_idx += len(word)  # approximate phoneme position
        return stress_markers