Spaces:

Vaishnavi0404
/

Text2Sing-DiffSinger

Running

App Files Files Community

Vaishnavi0404 committed on Apr 11, 2025

Commit

726598b

verified ·

1 Parent(s): 86e14e9

Create text_processor.py

Browse files

Files changed (1) hide show

text_processor.py +127 -0

text_processor.py ADDED Viewed

	@@ -0,0 +1,127 @@

+import re
+import nltk
+from nltk.tokenize import word_tokenize
+import phonemizer
+from phonemizer.backend import EspeakBackend
+import numpy as np
class TextProcessor:
    """Turn raw lyric text into phonemes, durations and stress markers.

    The pipeline is: normalise the text, tokenise it with NLTK, phonemize it
    with an espeak backend, then derive heuristic per-unit durations and a
    stress mask from the result.

    NOTE(review): tokenisation and POS tagging go through NLTK, which
    presumably needs its tokenizer/tagger data installed beforehand - confirm
    for the deployment environment.
    """

    # Duration (in seconds) assigned to a unit with no vowel or sonorant.
    _BASE_DURATION = 0.1
    # Tokens that trigger a long pause / a short pause respectively.
    _SENTENCE_END = frozenset('.!?')
    _PHRASE_BREAK = frozenset(',;:')
    # POS tag prefixes treated as content words (noun, verb, adjective, adverb).
    _CONTENT_TAG_PREFIXES = ('N', 'V', 'J', 'R')

    # Compiled once so the per-unit loops do not go through re's cache.
    _WHITESPACE_RE = re.compile(r'\s+')
    _DISALLOWED_RE = re.compile(r'[^a-z0-9\s.,!?;:\'"-]')
    _VOWEL_RE = re.compile(r'[aeiou]')
    _SONORANT_RE = re.compile(r'[lrmnw]')

    def __init__(self, language='en-us'):
        """Create the espeak phonemizer backend.

        Args:
            language (str): Language code handed to ``EspeakBackend``.
                Defaults to ``'en-us'``, the value that used to be hard-coded.
        """
        self.backend = EspeakBackend(language)

    def process(self, text):
        """Process text into phonemes with duration and stress markers.

        Args:
            text (str): Input text to be processed.

        Returns:
            tuple: ``(phonemes, durations, stress_markers)`` where
            ``phonemes`` is a whitespace-separated string, ``durations`` is a
            list of floats (seconds) and ``stress_markers`` is a NumPy array
            of 0/1 flags with one entry per whitespace-separated unit of
            ``phonemes``.
        """
        text = self._clean_text(text)
        if not text:
            # Nothing left to sing: skip the backend entirely and return
            # empty results with the same types as the normal path.
            return "", [], np.zeros(0)

        tokens = word_tokenize(text)
        phonemes = self._text_to_phonemes(text)
        durations = self._estimate_durations(tokens, phonemes)
        stress_markers = self._mark_stress(tokens, phonemes)
        return phonemes, durations, stress_markers

    def _clean_text(self, text):
        """Lower-case the text, drop unsupported characters, tidy whitespace.

        Letters, digits, whitespace and the punctuation ``. , ! ? ; : ' " -``
        are kept. ``;`` and ``:`` are kept because the duration estimator
        inserts a pause for them. Whitespace is collapsed *after* characters
        are removed so that deleting a symbol surrounded by spaces cannot
        leave a double space behind.
        """
        text = text.lower()
        text = self._DISALLOWED_RE.sub('', text)
        return self._WHITESPACE_RE.sub(' ', text).strip()

    def _text_to_phonemes(self, text):
        """Convert text to a whitespace-normalised phoneme string.

        NOTE(review): the backend is called without an explicit separator, so
        the granularity of the space-separated units (per word vs. per phone)
        depends on phonemizer's defaults - confirm against downstream users.
        """
        phonemes = self.backend.phonemize([text], strip=True)[0]
        return self._WHITESPACE_RE.sub(' ', phonemes).strip()

    def _estimate_durations(self, tokens, phonemes):
        """Estimate a duration in seconds for each phoneme unit, plus pauses.

        Each whitespace-separated unit of ``phonemes`` gets twice the base
        duration if it contains an ASCII vowel letter, 1.5x if it contains
        one of ``l r m n w``, and the base duration otherwise. One pause per
        punctuation token is then appended.

        NOTE(review): pauses are appended after all phoneme durations rather
        than at the position of the punctuation, so the returned list can be
        longer than the number of phoneme units. Kept as-is because callers
        may rely on this layout.

        Args:
            tokens (list[str]): Word/punctuation tokens of the cleaned text.
            phonemes (str): Whitespace-separated phoneme string.

        Returns:
            list[float]: Unit durations followed by pause durations.
        """
        base = self._BASE_DURATION
        durations = []
        for unit in phonemes.split():
            if self._VOWEL_RE.search(unit):
                durations.append(base * 2)
            elif self._SONORANT_RE.search(unit):
                durations.append(base * 1.5)
            else:
                durations.append(base)

        for token in tokens:
            if token in self._SENTENCE_END:
                durations.append(base * 3)
            elif token in self._PHRASE_BREAK:
                durations.append(base * 1.5)
        return durations

    def _mark_stress(self, tokens, phonemes):
        """Flag the first vowel-bearing unit of each content word as stressed.

        Content words are tokens longer than two characters whose POS tag
        starts with N, V, J or R.

        NOTE(review): word-to-phoneme alignment is approximated by treating a
        token's character count as its number of phoneme units; this is a
        rough heuristic and positions can drift for longer inputs.

        Args:
            tokens (list[str]): Word/punctuation tokens of the cleaned text.
            phonemes (str): Whitespace-separated phoneme string.

        Returns:
            numpy.ndarray: Float array of 0/1 flags, one per phoneme unit.
        """
        phoneme_list = phonemes.split()
        stress_markers = np.zeros(len(phoneme_list))

        content_word_indices = {
            index
            for index, (word, tag) in enumerate(nltk.pos_tag(tokens))
            if tag.startswith(self._CONTENT_TAG_PREFIXES) and len(word) > 2
        }

        phoneme_idx = 0
        for index, word in enumerate(tokens):
            if index in content_word_indices:
                # Slicing clamps to the list bounds, so windows that run past
                # the end (or start beyond it) are simply shorter or empty.
                window = phoneme_list[phoneme_idx:phoneme_idx + len(word)]
                for offset, unit in enumerate(window):
                    if self._VOWEL_RE.search(unit):
                        stress_markers[phoneme_idx + offset] = 1
                        break
            phoneme_idx += len(word)  # Approximate phoneme position
        return stress_markers