File size: 4,537 Bytes
726598b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import re
import nltk
from nltk.tokenize import word_tokenize
import phonemizer
from phonemizer.backend import EspeakBackend
import numpy as np

class TextProcessor:
    """Convert raw text into a phoneme sequence with heuristic duration and
    stress annotations intended for singing synthesis.

    NOTE(review): espeak emits IPA symbols, so the ASCII vowel/sonorant
    character classes below are approximations — confirm against the actual
    phoneme inventory consumed downstream.
    """

    # Precompiled pattern heuristics (hoisted so they are compiled once,
    # not on every call / every phoneme).
    _VOWEL_RE = re.compile(r'[aeiou]')      # "vowel-like" phonemes
    _SONORANT_RE = re.compile(r'[lrmnw]')   # sonorant consonants
    _WS_RE = re.compile(r'\s+')             # whitespace runs
    _STRIP_RE = re.compile(r'[^a-z0-9\s.,!?\'"-]')  # chars to drop

    def __init__(self):
        # US-English espeak backend; requires the espeak-ng binary to be
        # installed on the host.
        self.backend = EspeakBackend('en-us')

    def process(self, text):
        """
        Process text into phonemes with duration and stress markers for singing.

        Args:
            text (str): Input text to be processed.

        Returns:
            tuple: (phonemes, durations, stress_markers) where ``phonemes``
            is a space-separated phoneme string, ``durations`` is a list of
            per-phoneme lengths in seconds, and ``stress_markers`` is a numpy
            array of 0/1 flags, one per phoneme.

            NOTE(review): ``durations`` may be LONGER than the phoneme count
            because punctuation pauses are appended at the end of the list —
            see ``_estimate_durations``.
        """
        text = self._clean_text(text)
        tokens = word_tokenize(text)  # requires nltk 'punkt' data
        phonemes = self._text_to_phonemes(text)
        durations = self._estimate_durations(tokens, phonemes)
        stress_markers = self._mark_stress(tokens, phonemes)
        return phonemes, durations, stress_markers

    def _clean_text(self, text):
        """Lowercase, collapse whitespace, and strip every character except
        alphanumerics and phrase-level punctuation (kept for pauses)."""
        text = text.lower()
        text = self._WS_RE.sub(' ', text).strip()
        # Keep punctuation important for phrasing; drop everything else.
        text = self._STRIP_RE.sub('', text)
        return text

    def _text_to_phonemes(self, text):
        """Convert text to a space-separated phoneme string via espeak."""
        phonemes = self.backend.phonemize([text], strip=True)[0]
        # Normalize whitespace inside the phoneme string.
        phonemes = self._WS_RE.sub(' ', phonemes).strip()
        return phonemes

    def _estimate_durations(self, tokens, phonemes):
        """Heuristically assign a duration in seconds to each phoneme.

        Vowels get 2x the base duration, sonorant consonants 1.5x, everything
        else 1x. Punctuation tokens contribute pause durations.

        NOTE(review): pause durations are APPENDED at the end of the list
        rather than inserted at the punctuation's position, so the result can
        be longer than the phoneme count. Kept for compatibility with
        existing callers — TODO: align pauses with their phoneme positions.
        """
        phoneme_list = phonemes.split()
        base_duration = 0.1  # default per-phoneme length in seconds

        durations = []
        for p in phoneme_list:
            if self._VOWEL_RE.search(p):
                durations.append(base_duration * 2)        # vowels held longest
            elif self._SONORANT_RE.search(p):
                durations.append(base_duration * 1.5)      # sonorants medium
            else:
                durations.append(base_duration)            # everything else

        # Pauses: full stop / bang / question get a long pause, other
        # phrase punctuation a shorter one. (Index was unused — iterate
        # tokens directly.)
        for token in tokens:
            if token in ('.', ',', '!', '?', ';', ':'):
                durations.append(
                    base_duration * 3 if token in ('.', '!', '?')
                    else base_duration * 1.5
                )

        return durations

    def _mark_stress(self, tokens, phonemes):
        """Return a 0/1 numpy array marking which phonemes to stress.

        Heuristic: for each content word (noun/verb/adjective/adverb longer
        than two characters), mark the first vowel-like phoneme within the
        word's approximate phoneme span. The word-to-phoneme alignment is
        approximated by character count, so markers can drift on longer
        inputs — TODO: replace with a real forced alignment.
        """
        phoneme_list = phonemes.split()  # split once, reuse below
        stress_markers = np.zeros(len(phoneme_list))

        # POS-tag tokens to identify content words (requires nltk tagger
        # data, e.g. 'averaged_perceptron_tagger').
        tagged = nltk.pos_tag(tokens)
        content_word_indices = {
            i for i, (word, tag) in enumerate(tagged)
            if tag.startswith(('N', 'V', 'J', 'R')) and len(word) > 2
        }

        phoneme_idx = 0
        for i, word in enumerate(tokens):
            if i in content_word_indices:
                # Scan this word's approximate phoneme span for the first
                # vowel and mark it stressed.
                for j in range(len(word)):  # len(word) ~ phoneme count
                    if phoneme_idx + j < len(phoneme_list):
                        if self._VOWEL_RE.search(phoneme_list[phoneme_idx + j]):
                            stress_markers[phoneme_idx + j] = 1
                            break
            phoneme_idx += len(word)  # advance by approximate phoneme count

        return stress_markers