speech_recognize1

Sleeping

App Files Files Community

mr2along committed on Oct 11, 2024

Commit

e3a58c6

verified ·

1 Parent(s): 6369c87

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -50

app.py CHANGED Viewed

@@ -2,41 +2,38 @@ import os
 import speech_recognition as sr
 import difflib
 import gradio as gr
-from gtts import gTTS
-import io
-from pydub import AudioSegment
-# Create audio directory if it doesn't exist
 if not os.path.exists('audio'):
     os.makedirs('audio')
-# Step 1: Transcribe the audio file
 def transcribe_audio(audio):
     if audio is None:
-        return "No audio file provided."  # Handle the case when no audio is uploaded
     recognizer = sr.Recognizer()
     audio_format = audio.split('.')[-1].lower()
-    # Convert to WAV if the audio is not in a supported format
     if audio_format != 'wav':
         try:
-            # Load the audio file with pydub
             audio_segment = AudioSegment.from_file(audio)
             wav_path = audio.replace(audio_format, 'wav')
-            audio_segment.export(wav_path, format='wav')  # Convert to WAV
-            audio = wav_path  # Update audio path to the converted file
         except Exception as e:
             return f"Error converting audio: {e}"
-    # Convert audio into recognizable format for the Recognizer
     audio_file = sr.AudioFile(audio)
     with audio_file as source:
         audio_data = recognizer.record(source)
     try:
-        # Recognize the audio using Google Web Speech API
         transcription = recognizer.recognize_google(audio_data)
         return transcription
     except sr.UnknownValueError:
@@ -44,78 +41,75 @@ def transcribe_audio(audio):
     except sr.RequestError as e:
         return f"Error with Google Speech Recognition service: {e}"
-# Step 2: Create pronunciation audio for incorrect words
 def create_pronunciation_audio(word):
-    tts = gTTS(word)
-    audio_file_path = f"audio/{word}.mp3"  # Save the audio to a file
-    tts.save(audio_file_path)
-    return audio_file_path  # Return the file path of the saved audio
-# Step 3: Compare the transcribed text with the input paragraph
 def compare_texts(reference_text, transcribed_text):
     reference_words = reference_text.split()
     transcribed_words = transcribed_text.split()
-    incorrect_words_audios = []  # Store audio paths for incorrect words
     sm = difflib.SequenceMatcher(None, reference_text, transcribed_text)
     similarity_score = round(sm.ratio() * 100, 2)
-    # Construct HTML output
     html_output = f"<strong>Fidelity Class:</strong> {'CORRECT' if similarity_score > 50 else 'INCORRECT'}<br>"
     html_output += f"<strong>Quality Score:</strong> {similarity_score}<br>"
     html_output += f"<strong>Transcribed Text:</strong> {transcribed_text}<br>"
     html_output += "<strong>Word Score List:</strong><br>"
-    # Generate colored word score list
     for i, word in enumerate(reference_words):
         try:
             if word.lower() == transcribed_words[i].lower():
-                html_output += f'<span style="color: green;">{word}</span> '  # Correct words in green
             elif difflib.get_close_matches(word, transcribed_words):
-                html_output += f'<span style="color: yellow;">{word}</span> '  # Close matches in yellow
             else:
-                # Incorrect words in red
-                html_output += f'<span style="color: red;">{word}</span> '
-                # Create pronunciation audio for the incorrect word
                 audio_file_path = create_pronunciation_audio(word)
                 incorrect_words_audios.append((word, audio_file_path))
         except IndexError:
-            html_output += f'<span style="color: red;">{word}</span> '  # Words in reference that were not transcribed
-    # Provide audio for incorrect words
     if incorrect_words_audios:
         html_output += "<br><strong>Pronunciation for Incorrect Words:</strong><br>"
         for word, audio in incorrect_words_audios:
-            suggestion = difflib.get_close_matches(word, reference_words, n=1)
-            suggestion_text = f" (Did you mean: <em>{suggestion[0]}</em>?)" if suggestion else ""
             html_output += f'{word}: '
-            html_output += f'<audio controls><source src="{audio}" type="audio/mpeg">Your browser does not support the audio tag.</audio>{suggestion_text}<br>'
     return html_output
-# Step 4: Text-to-Speech Function
 def text_to_speech(paragraph):
-    if not paragraph:
-        return None  # Handle the case when no text is provided
-    tts = gTTS(paragraph)
-    audio_file_path = "audio/paragraph.mp3"  # Save the audio to a file
-    tts.save(audio_file_path)
-    return audio_file_path  # Return the file path instead of None
-# Gradio Interface Function
 def gradio_function(paragraph, audio):
-    # Transcribe the audio
     transcribed_text = transcribe_audio(audio)
-    # Compare the original paragraph with the transcribed text
     comparison_result = compare_texts(paragraph, transcribed_text)
-    # Return comparison result
     return comparison_result
-# Gradio Interface using the updated API
 interface = gr.Interface(
     fn=gradio_function,
     inputs=[
@@ -127,7 +121,6 @@ interface = gr.Interface(
     description="Input a paragraph, record your audio, and compare the transcription to the original text."
 )
-# Gradio Interface for Text-to-Speech
 tts_interface = gr.Interface(
     fn=text_to_speech,
     inputs=gr.Textbox(lines=5, label="Input Paragraph to Read Aloud"),
@@ -136,8 +129,8 @@ tts_interface = gr.Interface(
     description="This tool will read your input paragraph aloud."
 )
-# Combine both interfaces into one
 demo = gr.TabbedInterface([interface, tts_interface], ["Speech Recognition", "Text-to-Speech"])
-# Launch Gradio app
 demo.launch()

 import speech_recognition as sr
 import difflib
 import gradio as gr
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import soundfile as sf
+# Tạo thư mục audio nếu chưa tồn tại
 if not os.path.exists('audio'):
     os.makedirs('audio')
+# Bước 1: Chuyển đổi âm thanh thành văn bản
 def transcribe_audio(audio):
     if audio is None:
+        return "No audio file provided."  # Xử lý trường hợp không có tệp âm thanh
     recognizer = sr.Recognizer()
     audio_format = audio.split('.')[-1].lower()
+    # Chuyển đổi sang WAV nếu âm thanh không ở định dạng hỗ trợ
     if audio_format != 'wav':
         try:
             audio_segment = AudioSegment.from_file(audio)
             wav_path = audio.replace(audio_format, 'wav')
+            audio_segment.export(wav_path, format='wav')  # Chuyển đổi sang WAV
+            audio = wav_path  # Cập nhật đường dẫn âm thanh
         except Exception as e:
             return f"Error converting audio: {e}"
     audio_file = sr.AudioFile(audio)
     with audio_file as source:
         audio_data = recognizer.record(source)
     try:
         transcription = recognizer.recognize_google(audio_data)
         return transcription
     except sr.UnknownValueError:
     except sr.RequestError as e:
         return f"Error with Google Speech Recognition service: {e}"
+# Bước 2: Tạo âm thanh phát âm cho các từ sai
 def create_pronunciation_audio(word):
+    model_name = "tts_models/en/ljspeech/tacotron2"  # Mô hình TTS
+    model = AutoModelForCausalLM.from_pretrained(model_name)
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    inputs = tokenizer(word, return_tensors="pt")
+    # Tạo âm thanh từ văn bản
+    with torch.no_grad():
+        outputs = model.generate(**inputs)
+    # Lưu âm thanh vào tệp
+    audio_file_path = f"audio/{word}.wav"
+    sf.write(audio_file_path, outputs.numpy(), 22050)  # Giả định tần số mẫu 22050Hz
+    return audio_file_path
+# Bước 3: So sánh văn bản đã chuyển đổi với đoạn văn bản gốc
 def compare_texts(reference_text, transcribed_text):
+    word_scores = []
     reference_words = reference_text.split()
     transcribed_words = transcribed_text.split()
+    incorrect_words_audios = []  # Lưu trữ đường dẫn âm thanh cho các từ sai
     sm = difflib.SequenceMatcher(None, reference_text, transcribed_text)
     similarity_score = round(sm.ratio() * 100, 2)
+    # Tạo đầu ra HTML
     html_output = f"<strong>Fidelity Class:</strong> {'CORRECT' if similarity_score > 50 else 'INCORRECT'}<br>"
     html_output += f"<strong>Quality Score:</strong> {similarity_score}<br>"
     html_output += f"<strong>Transcribed Text:</strong> {transcribed_text}<br>"
     html_output += "<strong>Word Score List:</strong><br>"
+    # Tạo danh sách điểm số từ màu sắc
     for i, word in enumerate(reference_words):
         try:
             if word.lower() == transcribed_words[i].lower():
+                html_output += f'<span style="color: green;">{word}</span> '  # Từ đúng màu xanh
             elif difflib.get_close_matches(word, transcribed_words):
+                html_output += f'<span style="color: yellow;">{word}</span> '  # Từ gần đúng màu vàng
             else:
+                html_output += f'<span style="color: red;">{word}</span> '  # Từ sai màu đỏ
+                # Tạo âm thanh phát âm cho từ sai
                 audio_file_path = create_pronunciation_audio(word)
                 incorrect_words_audios.append((word, audio_file_path))
         except IndexError:
+            html_output += f'<span style="color: red;">{word}</span> '  # Từ tham chiếu không được chuyển đổi
+    # Cung cấp âm thanh cho các từ sai
     if incorrect_words_audios:
         html_output += "<br><strong>Pronunciation for Incorrect Words:</strong><br>"
         for word, audio in incorrect_words_audios:
             html_output += f'{word}: '
+            html_output += f'<audio controls><source src="{audio}" type="audio/wav">Your browser does not support the audio tag.</audio><br>'
     return html_output
+# Bước 4: Chức năng Text-to-Speech
 def text_to_speech(paragraph):
+    audio_file_path = create_pronunciation_audio(paragraph)  # Sử dụng hàm đã sửa
+    return audio_file_path
+# Giao diện Gradio
 def gradio_function(paragraph, audio):
     transcribed_text = transcribe_audio(audio)
     comparison_result = compare_texts(paragraph, transcribed_text)
     return comparison_result
 interface = gr.Interface(
     fn=gradio_function,
     inputs=[
     description="Input a paragraph, record your audio, and compare the transcription to the original text."
 )
 tts_interface = gr.Interface(
     fn=text_to_speech,
     inputs=gr.Textbox(lines=5, label="Input Paragraph to Read Aloud"),
     description="This tool will read your input paragraph aloud."
 )
+# Kết hợp cả hai giao diện
 demo = gr.TabbedInterface([interface, tts_interface], ["Speech Recognition", "Text-to-Speech"])
+# Khởi động ứng dụng Gradio
 demo.launch()