File size: 4,537 Bytes
726598b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import re
import nltk
from nltk.tokenize import word_tokenize
import phonemizer
from phonemizer.backend import EspeakBackend
import numpy as np

class TextProcessor:
    """Convert raw text into a phoneme sequence with heuristic duration and
    stress annotations intended for singing synthesis.

    NOTE(review): espeak emits IPA symbols, so the ASCII vowel/sonorant
    character classes below are approximations — confirm against the actual
    phoneme inventory consumed downstream.
    """

    # Precompiled pattern heuristics (hoisted so they are compiled once,
    # not on every call / every phoneme).
    _VOWEL_RE = re.compile(r'[aeiou]')      # "vowel-like" phonemes
    _SONORANT_RE = re.compile(r'[lrmnw]')   # sonorant consonants
    _WS_RE = re.compile(r'\s+')             # whitespace runs
    _STRIP_RE = re.compile(r'[^a-z0-9\s.,!?\'"-]')  # chars to drop

    def __init__(self):
        # US-English espeak backend; requires the espeak-ng binary to be
        # installed on the host.
        self.backend = EspeakBackend('en-us')

    def process(self, text):
        """
        Process text into phonemes with duration and stress markers for singing.

        Args:
            text (str): Input text to be processed.

        Returns:
            tuple: (phonemes, durations, stress_markers) where ``phonemes``
            is a space-separated phoneme string, ``durations`` is a list of
            per-phoneme lengths in seconds, and ``stress_markers`` is a numpy
            array of 0/1 flags, one per phoneme.

            NOTE(review): ``durations`` may be LONGER than the phoneme count
            because punctuation pauses are appended at the end of the list —
            see ``_estimate_durations``.
        """
        text = self._clean_text(text)
        tokens = word_tokenize(text)  # requires nltk 'punkt' data
        phonemes = self._text_to_phonemes(text)
        durations = self._estimate_durations(tokens, phonemes)
        stress_markers = self._mark_stress(tokens, phonemes)
        return phonemes, durations, stress_markers

    def _clean_text(self, text):
        """Lowercase, collapse whitespace, and strip every character except
        alphanumerics and phrase-level punctuation (kept for pauses)."""
        text = text.lower()
        text = self._WS_RE.sub(' ', text).strip()
        # Keep punctuation important for phrasing; drop everything else.
        text = self._STRIP_RE.sub('', text)
        return text

    def _text_to_phonemes(self, text):
        """Convert text to a space-separated phoneme string via espeak."""
        phonemes = self.backend.phonemize([text], strip=True)[0]
        # Normalize whitespace inside the phoneme string.
        phonemes = self._WS_RE.sub(' ', phonemes).strip()
        return phonemes

    def _estimate_durations(self, tokens, phonemes):
        """Heuristically assign a duration in seconds to each phoneme.

        Vowels get 2x the base duration, sonorant consonants 1.5x, everything
        else 1x. Punctuation tokens contribute pause durations.

        NOTE(review): pause durations are APPENDED at the end of the list
        rather than inserted at the punctuation's position, so the result can
        be longer than the phoneme count. Kept for compatibility with
        existing callers — TODO: align pauses with their phoneme positions.
        """
        phoneme_list = phonemes.split()
        base_duration = 0.1  # default per-phoneme length in seconds

        durations = []
        for p in phoneme_list:
            if self._VOWEL_RE.search(p):
                durations.append(base_duration * 2)        # vowels held longest
            elif self._SONORANT_RE.search(p):
                durations.append(base_duration * 1.5)      # sonorants medium
            else:
                durations.append(base_duration)            # everything else

        # Pauses: full stop / bang / question get a long pause, other
        # phrase punctuation a shorter one. (Index was unused — iterate
        # tokens directly.)
        for token in tokens:
            if token in ('.', ',', '!', '?', ';', ':'):
                durations.append(
                    base_duration * 3 if token in ('.', '!', '?')
                    else base_duration * 1.5
                )

        return durations

    def _mark_stress(self, tokens, phonemes):
        """Return a 0/1 numpy array marking which phonemes to stress.

        Heuristic: for each content word (noun/verb/adjective/adverb longer
        than two characters), mark the first vowel-like phoneme within the
        word's approximate phoneme span. The word-to-phoneme alignment is
        approximated by character count, so markers can drift on longer
        inputs — TODO: replace with a real forced alignment.
        """
        phoneme_list = phonemes.split()  # split once, reuse below
        stress_markers = np.zeros(len(phoneme_list))

        # POS-tag tokens to identify content words (requires nltk tagger
        # data, e.g. 'averaged_perceptron_tagger').
        tagged = nltk.pos_tag(tokens)
        content_word_indices = {
            i for i, (word, tag) in enumerate(tagged)
            if tag.startswith(('N', 'V', 'J', 'R')) and len(word) > 2
        }

        phoneme_idx = 0
        for i, word in enumerate(tokens):
            if i in content_word_indices:
                # Scan this word's approximate phoneme span for the first
                # vowel and mark it stressed.
                for j in range(len(word)):  # len(word) ~ phoneme count
                    if phoneme_idx + j < len(phoneme_list):
                        if self._VOWEL_RE.search(phoneme_list[phoneme_idx + j]):
                            stress_markers[phoneme_idx + j] = 1
                            break
            phoneme_idx += len(word)  # advance by approximate phoneme count

        return stress_markers