Spaces:

pawipa
/

speech-to-text

Runtime error

App Files Files Community

pawipa committed on Jan 22

Commit

a6bff07

1 Parent(s): e76c7d1

added AI functionality

Browse files

Files changed (1) hide show

app.py +112 -23

app.py CHANGED Viewed

@@ -2,27 +2,105 @@ import gradio as gr
 import time
 import os
 import zipfile
 from typing import List, Tuple, Generator
 # Initial status message
 STANDARD_OUTPUT_TEXT = "**Status:**<br>"
 def process_files_with_live_updates(
     files: List[gr.File],
-    dropdown_option: str,
-    dropdown_option_2: str
 ) -> Generator[Tuple[str, List[str]], None, None]:
     """
-    Processes a list of uploaded files and provides live updates with progress.
     Args:
         files (List[gr.File]): List of files uploaded by the user.
-        dropdown_option (str): Selected option from the first dropdown.
-        dropdown_option_2 (str): Selected option from the second dropdown.
     Yields:
         Tuple[str, List[str]]: Updated status message and list of processed file paths.
     """
     file_details = []
     total_files = len(files)
     output_files = []
@@ -32,21 +110,28 @@ def process_files_with_live_updates(
     os.makedirs(output_dir, exist_ok=True)
     for idx, file in enumerate(files):
-        # Simulate file processing
-        time.sleep(1)
         # Add to file details
         detail = (
-            f"**File Name**: {file.name} - {dropdown_option} - {dropdown_option_2}<br>"
         )
         file_details.append(detail)
-        # Generate a .txt file
-        txt_filename = os.path.join(output_dir, f"output_file_{idx + 1}.txt")
-        with open(txt_filename, "w") as txt_file:
-            txt_file.write(f"Original File Name: {file.name}")
-        output_files.append(txt_filename)
         # Update progress bar and yield the updated Markdown
         yield (
             f"**Status: {int(((idx + 1) / total_files) * 100)}%**<br>" + "".join(file_details),
@@ -81,15 +166,18 @@ with gr.Blocks() as demo:
     # Input section
     with gr.Row():
         with gr.Column():
-            file_input = gr.Files(file_types=[".wav", ".mp3"], label="Upload your audio files")
         with gr.Column():
-            dropdown = gr.Dropdown(
-                choices=["Language: English", "Language: German", "Language: French"],
-                label="Select Language",
-                value="Language: English",
             )
             dropdown_2 = gr.Dropdown(
-                choices=["Format: Plain Text", "Format: JSON", "Format: SRT"],
                 label="Select Output Format",
                 value="Format: Plain Text",
             )
@@ -106,14 +194,14 @@ with gr.Blocks() as demo:
     # Button actions
     submit_button.click(
         process_files_with_live_updates,
-        inputs=[file_input, dropdown, dropdown_2],
         outputs=[output_md, output_files],
     )
     clear_button.click(
-        lambda: (None, "Language: English", "Format: Plain Text", STANDARD_OUTPUT_TEXT, None),
         inputs=[],  # No inputs
-        outputs=[file_input, dropdown, dropdown_2, output_md, output_files],
     )
     gr.Textbox(os.getcwd(), label="Current Working Directory")
@@ -143,3 +231,4 @@ demo.css = """
 # Launch app
 demo.launch()

 import time
 import os
 import zipfile
+import torch
+import librosa
+import soundfile as sf
+from transformers import pipeline
 from typing import List, Tuple, Generator
+import datetime
+from pydub import AudioSegment
+# Initial model name
+MODEL_NAME = "primeline/whisper-tiny-german-1224"
+speech_to_text = pipeline("automatic-speech-recognition", model=MODEL_NAME)
 # Initial status message
 STANDARD_OUTPUT_TEXT = "**Status:**<br>"
def get_file_creation_date(file_path: str) -> str:
    """
    Returns the creation date of a file.

    Args:
        file_path (str): The path to the file. An object that exposes its
            path through a ``name`` attribute (e.g. an uploaded-file
            wrapper) is accepted as well and is unwrapped automatically.
    Returns:
        str: The creation date formatted as "YYYY-MM-DD HH:MM:SS", or
            "File not found." when the path does not exist.
    """
    # The transcription loop hands over the upload object itself rather than
    # its path; os.stat() raises TypeError on such objects, so unwrap first.
    if not isinstance(file_path, (str, bytes, os.PathLike)):
        file_path = getattr(file_path, "name", file_path)

    # Keep the try body minimal: only the stat call can raise here.
    try:
        file_stats = os.stat(file_path)
    except FileNotFoundError:
        return "File not found."

    # st_ctime is the metadata-change time on Unix, not the creation time.
    # Prefer the real birth time on platforms that expose it and fall back
    # to st_ctime elsewhere.
    timestamp = getattr(file_stats, "st_birthtime", file_stats.st_ctime)
    creation_time = datetime.datetime.fromtimestamp(timestamp)
    return creation_time.strftime("%Y-%m-%d %H:%M:%S")
def load_model(model_name: str):
    """
    Builds a speech-recognition pipeline for the requested model.

    Args:
        model_name (str): Identifier of the Hugging Face model to load.
    Returns:
        pipeline: An "automatic-speech-recognition" pipeline backed by
            ``model_name``.
    """
    task = "automatic-speech-recognition"
    recognizer = pipeline(task, model=model_name)
    return recognizer
def convert_to_wav(file_path: str) -> str:
    """
    Converts audio files to WAV format if necessary.

    Only files whose name ends in ".m4a" (compared case-insensitively) are
    converted; every other path is returned untouched.

    Args:
        file_path (str): Path to the uploaded audio file.
    Returns:
        str: Path to the converted WAV file, or ``file_path`` itself when no
            conversion was needed.
    """
    suffix = ".m4a"
    # Guard clause: anything that is not an M4A file passes straight through.
    if not file_path.lower().endswith(suffix):
        return file_path

    audio = AudioSegment.from_file(file_path, format="m4a")
    # Swap only the trailing extension. str.replace() would also rewrite
    # ".m4a" occurring earlier in the path (e.g. inside a directory name).
    wav_path = file_path[: -len(suffix)] + ".wav"
    audio.export(wav_path, format="wav")
    return wav_path
def preprocess_audio(file_path: str) -> str:
    """
    Preprocesses the audio file to ensure compatibility with the AI model.

    The file is first passed through ``convert_to_wav``, then loaded at a
    16 kHz sample rate and written next to the source as
    "<name>_processed.wav".

    Args:
        file_path (str): Path to the uploaded audio file.
    Returns:
        str: Path to the preprocessed audio file.
    """
    file_path = convert_to_wav(file_path)  # Convert to WAV if necessary
    samples, sample_rate = librosa.load(file_path, sr=16000)  # Resample to 16kHz

    # Build the output name from the path stem. Chained str.replace() calls
    # turned "a.mp3" into "a_processed_processed.wav", rewrote ".wav"/".mp3"
    # anywhere in the path, and left other extensions unchanged so the write
    # below overwrote the input file.
    stem, _extension = os.path.splitext(file_path)
    processed_path = f"{stem}_processed.wav"

    sf.write(processed_path, samples, sample_rate)  # Save the resampled audio
    return processed_path
 def process_files_with_live_updates(
     files: List[gr.File],
+    model_option: str,
+    output_format: str
 ) -> Generator[Tuple[str, List[str]], None, None]:
     """
+    Processes a list of uploaded files, transcribes audio, and provides live updates.
     Args:
         files (List[gr.File]): List of files uploaded by the user.
+        model_option (str): Selected model option.
+        output_format (str): Selected output format option.
     Yields:
         Tuple[str, List[str]]: Updated status message and list of processed file paths.
     """
+    global speech_to_text
+    speech_to_text = load_model(model_option)
     file_details = []
     total_files = len(files)
     output_files = []
     os.makedirs(output_dir, exist_ok=True)
     for idx, file in enumerate(files):
+        # Preprocess audio file
+        preprocessed_path = preprocess_audio(file.name)
+        # Transcribe audio using the AI model with timestamp support
+        transcription_result = speech_to_text(preprocessed_path, return_timestamps=True)
+        transcription = transcription_result["text"]
+        # Save transcription to file
+        txt_filename = os.path.join(output_dir, f"transcription_{file.name.split('/')[-1].split('.')[0]}.txt")
+        with open(txt_filename, "w", encoding="utf-8") as txt_file:
+            txt_file.write(transcription)
+        output_files.append(txt_filename)
         # Add to file details
         detail = (
+            f"**File Name**: {file.name.split('/')[-1]}<br>"
+            f"**File Date**: {get_file_creation_date(file)}<br>"
+            f"**Options**: {model_option} - {output_format}<br>"
+            f"**Transcription**: {transcription}<br><br>"
         )
         file_details.append(detail)
         # Update progress bar and yield the updated Markdown
         yield (
             f"**Status: {int(((idx + 1) / total_files) * 100)}%**<br>" + "".join(file_details),
     # Input section
     with gr.Row():
         with gr.Column():
+            file_input = gr.Files(file_types=[".wav", ".mp3", ".m4a"], label="Upload your audio files")
         with gr.Column():
+            model_dropdown = gr.Dropdown(
+                choices=[
+                    "primeline/whisper-tiny-german-1224",
+                    "primeline/whisper-tiny-german",
+                    "primeline/whisper-large-v3-german"],
+                label="Select Model",
+                value="primeline/whisper-tiny-german-1224",
             )
             dropdown_2 = gr.Dropdown(
+                choices=["Format: Plain Text"],
                 label="Select Output Format",
                 value="Format: Plain Text",
             )
     # Button actions
     submit_button.click(
         process_files_with_live_updates,
+        inputs=[file_input, model_dropdown, dropdown_2],
         outputs=[output_md, output_files],
     )
     clear_button.click(
+        lambda: (None, "primeline/whisper-tiny-german-1224", "Format: Plain Text", STANDARD_OUTPUT_TEXT, None),
         inputs=[],  # No inputs
+        outputs=[file_input, model_dropdown, dropdown_2, output_md, output_files],
     )
     gr.Textbox(os.getcwd(), label="Current Working Directory")
 # Launch app
 demo.launch()