Spaces:

cdactvm
/

demoASR

Sleeping

App Files Files Community

cdactvm commited on Jun 22, 2024

Commit

9fc1cf9

verified ·

1 Parent(s): b4d2c26

Update app.py

Browse files

Files changed (1) hide show

app.py +234 -1

app.py CHANGED Viewed

@@ -15,6 +15,7 @@ def transcribe_odiya_eng(speech):
     trn = Transliterator(source='ori', target='eng', build_lookup=True)
     text = p1(speech)["text"]
     text=trn.transform(text)
     return text
 def sel_lng(lng,mic=None, file=None):
@@ -28,9 +29,241 @@ def sel_lng(lng,mic=None, file=None):
         return transcribe_odiya(audio)
     elif (lng=="Odiya-trans"):
         return transcribe_odiya_eng(audio)
 demo=gr.Interface(
     fn=sel_lng,

     trn = Transliterator(source='ori', target='eng', build_lookup=True)
     text = p1(speech)["text"]
     text=trn.transform(text)
+    text = master_function(text)
     return text
 def sel_lng(lng,mic=None, file=None):
         return transcribe_odiya(audio)
     elif (lng=="Odiya-trans"):
         return transcribe_odiya_eng(audio)
+#####################################################
+def soundex(word):
+    """Return a Soundex-style phonetic code for ``word``.
+
+    The word is upper-cased and stripped of every non-alphabetic character.
+    The first remaining letter is kept as-is; each following letter except
+    ``H`` and ``W`` is translated to a digit through ``soundex_mapping``
+    (letters missing from the table, such as vowels, become ``'0'``).
+
+    Returns:
+        The first four characters of the resulting code, or ``None`` when
+        ``word`` contains no alphabetic character.  When no letter after the
+        first one is processed (single-letter word, or only ``H``/``W``
+        follow) the result is just that first letter, without zero padding.
+
+    Note:
+        The clean-up (collapse repeats, drop zeros, pad with ``'000'``) runs
+        inside the loop, once per processed letter.  Because the padding
+        separates a newly appended digit from the previous one, two equal
+        digits are only merged when a later letter triggers another pass:
+        ``"sics"`` yields ``'S220'`` while ``"six"`` yields ``'S200'``.
+    """
+    word = word.upper()
+    # Keep letters only; digits, punctuation and whitespace are discarded.
+    word = ''.join(filter(str.isalpha, word))
+    if not word:
+        return None
+    soundex_mapping = {
+     'B': '1', 'F': '1', 'P': '1', 'V': '1',
+     'C': '2', 'G': '2', 'J': '2', 'K': '2', 'Q': '2',
+     'S': '2', 'X': '2', 'Z': '2',
+     'D': '3', 'T': '3',
+     'L': '4',
+     'M': '5', 'N': '5',
+     'R': '6'
+     }
+    # The code always starts with the (upper-cased) first letter itself.
+    soundex_code = word[0]
+    for char in word[1:]:
+        if char not in ('H', 'W'):
+            # Unmapped letters are encoded as '0' so they act as separators.
+            soundex_code += soundex_mapping.get(char, '0')
+            # Drop any character equal to the one directly before it; index i
+            # of the [1:] slice is the preceding position in the full string.
+            soundex_code = soundex_code[0] + ''.join(c for i, c in enumerate(soundex_code[1:]) if c != soundex_code[i])
+            # Remove the separators and restore the zero padding.
+            soundex_code = soundex_code.replace('0', '') + '000'
+    return soundex_code[:4]
+# convert special tecken to numbers
+def is_number(x):
+    if type(x) == str:
+        x = x.replace(',', '')
+    try:
+        float(x)
+    except:
+        return False
+    return True
+def text2int (textnum, numwords={}):
+    units = ['Z600', 'O500','T000','T600','F600','F100','S220','S150','E300','N500',
+             'T500', 'E415', 'T410', 'T635', 'F635', 'F135', 'S235', 'S153', 'E235','N535']
+    # teens = ['T500', 'E415', 'T410', 'T635', 'F635', 'F135', 'S235', 'S153', 'E235','N535']
+    tens = ['', '', 'T537', 'T637', 'F637', 'F137', 'S230', 'S153', 'E230', 'N530']
+    scales = ['H536', 'T253', 'M450', 'C600']
+    # scale_values = [100, 1_000, 10_0000, 1000_000_000]
+    indian_scales = ['L200', 'C600', 'A610', 'K610']
+    conjunction = ['and']
+    ordinal_words = {'oh': 'Z600', 'first': 'O500', 'second': 'T000', 'third': 'T600', 'fourth': 'F600', 'fifth': 'F100',
+                     'sixth': 'S200','seventh': 'S150','eighth': 'E230', 'ninth': 'N500', 'twelfth': 'T410'}
+    ordinal_endings = [('ieth', 'y'), ('th', '')]
+    if not numwords:
+        numwords['and'] = (1, 0)
+        for idx, word in enumerate(units): numwords[word] = (1, idx)
+        for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)
+        for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)
+    textnum = textnum.replace('-', ' ')
+    current = result = 0
+    curstring = ''
+    onnumber = False
+    lastunit = False
+    lastscale = False
+    def is_numword(x):
+        if is_number(x):
+            return True
+        if word in numwords:
+            return True
+        return False
+    def from_numword(x):
+        if is_number(x):
+            scale = 0
+            increment = int(x.replace(',', ''))
+            return scale, increment
+        return numwords[x]
+    for word in textnum.split():
+        if word in ordinal_words:
+            scale, increment = (1, ordinal_words[word])
+            current = current * scale + increment
+            if scale > 100:
+                result += current
+                current = 0
+            onnumber = True
+            lastunit = False
+            lastscale = False
+        else:
+            for ending, replacement in ordinal_endings:
+                if word.endswith(ending):
+                    word = "%s%s" % (word[:-len(ending)], replacement)
+            if (not is_numword(word)) or (word == 'and' and not lastscale):
+                if onnumber:
+                    # Flush the current number we are building
+                    curstring += repr(result + current) + " "
+                curstring += word + " "
+                result = current = 0
+                onnumber = False
+                lastunit = False
+                lastscale = False
+            else:
+                scale, increment = from_numword(word)
+                onnumber = True
+                if lastunit and (word not in scales):
+                    # Assume this is part of a string of individual numbers to
+                    # be flushed, such as a zipcode "one two three four five"
+                    curstring += repr(result + current)
+                    result = current = 0
+                if scale > 1:
+                    current = max(1, current)
+                current = current * scale + increment
+                if scale > 100:
+                    result += current
+                    current = 0
+                lastscale = False
+                lastunit = False
+                if word in scales:
+                    lastscale = True
+                elif word in units:
+                    lastunit = True
+    if onnumber:
+        curstring += repr(result + current)
+    return curstring
+# replace those words which are not correctly spelled to correct words.
+def replace_words(sentence):
+    # Define the replacements
+    replacements = [
+        (r'\bjiro\b', 'zero'),
+        (r'\bjero\b', 'zero'),
+        (r'\bnn\b', 'one'),
+        (r'\bn\b', 'one'),
+        (r'\bna\b', 'one'),
+        (r'\btu\b', 'two'),
+        (r'\btoo\b', 'two'),
+        (r'\bthiri\b', 'three'),
+        (r'\bfor\b', 'four'),
+        (r'\bfore\b', 'four'),
+        (r'\bfib\b', 'five'),
+        (r'\bdublseven\b', 'double seven'),
+        (r'\bdubalathri\b', 'double three'),
+        (r'\bnineeit\b', 'nine eight'),
+        (r'\bfipeit\b', 'five eight'),
+        (r'\bdubal\b', 'double'),
+        (r'\bsevenatu\b', 'seven two'),
+    ]
+    # Apply the replacements
+    for pattern, replacement in replacements:
+        sentence = re.sub(pattern, replacement, sentence)
+    return sentence
+# split text and numbers and get it into different sentences.
+def split_sentence(sentence):
+    # List of word-based numbers
+    word_numbers = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
+                    "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
+                    "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty", "fifty",
+                    "sixty", "seventy", "eighty", "ninety", "hundred", "thousand", "million",
+                    'zero', 'one', 'on','na','n','tu','two','to','too', 'three','tree','four',
+                    'for','five','fib', 'six', 'seven', 'eight', 'eit', 'nine', 'eit', 'seven',
+                    'sics', 'thri', 'for', 'eittu', 'eittu', 'nine','dubal','sikas','tri', 'double']
+    # Split the sentence into tokens
+    tokens = sentence.split()
+    # Initialize variables to store the parts of the sentence
+    pre_numbers = []
+    numbers = []
+    post_numbers = []
+    found_numbers = False
+    # Iterate through the tokens to classify them
+    for token in tokens:
+        if token.lower() in word_numbers:
+            found_numbers = True
+            numbers.append(token)
+        else:
+            if found_numbers:
+                post_numbers.append(token)
+            else:
+                pre_numbers.append(token)
+    # Join the parts back into sentences
+    sentence1 = ' '.join(pre_numbers)
+    number = ' '.join(numbers)
+    sentence3 = ' '.join(post_numbers)
+    return sentence1, number, sentence3
+# Process double followed by a numbers.
+def process_doubles(sentence):
+    tokens = sentence.split()
+    result = []
+    i = 0
+    while i < len(tokens):
+        if tokens[i] == "double" or tokens[i] == "dubal":
+            if i + 1 < len(tokens):
+                # Repeat the next word twice
+                result.append(tokens[i + 1])
+                result.append(tokens[i + 1])
+                i += 2  # Skip the next word as it's already added twice
+            else:
+                # If "double" is the last word, just add it (although this case is unusual)
+                result.append(tokens[i])
+                i += 1
+        else:
+            result.append(tokens[i])
+            i += 1
+    return ' '.join(result)
+# Concatenate text and numbers and form a single sentence.
+def concatenate_sentences(sentence1, numbers, sentence3):
+    full_sentence = f"{sentence1} {numbers} {sentence3}"
+    return full_sentence
+# define a master function to run all the above functions.
+def master_function(initial_input):
+    output_string1 = replace_words(initial_input)
+    sentence1, number, sentence3 = split_sentence(output_string1)
+    processed_sentence = process_doubles(number)
+    text = processed_sentence
+    words = text.strip().split()
+    soundex_codes = [soundex(word) for word in words]
+    combined_text = " ".join(soundex_codes)
+    numbers=text2int(combined_text)
+    full_sentence = concatenate_sentences(sentence1, numbers, sentence3)
+    return full_sentence
+######################################################
 demo=gr.Interface(
     fn=sel_lng,