Spaces:

MuhammadFarhanAslam
/

AI-Powered_Speech-to-Text_Transcriber

Sleeping

App Files Files Community

MuhammadFarhanAslam committed on Jun 7

Commit

5632b5a

verified ·

1 Parent(s): 1310a47

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

README.md +3 -9
app.py +281 -0

README.md CHANGED Viewed

@@ -1,12 +1,6 @@
 ---
-title: AI-Powered Speech-to-Text Transcriber
-emoji: 🏆
-colorFrom: purple
-colorTo: gray
-sdk: gradio
-sdk_version: 5.33.0
 app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: AI-Powered_Speech-to-Text_Transcriber
 app_file: app.py
+sdk: gradio
+sdk_version: 5.31.0
 ---

app.py ADDED Viewed

	@@ -0,0 +1,281 @@

+# app.py
+!pip install gradio
+!pip install transformers
+!pip install soundfile
+import gradio as gr
+import soundfile as sf
+import os
+from transformers import pipeline
+asr = pipeline(task="automatic-speech-recognition",
+               model="distil-whisper/distil-small.en")
+def transcribe_speech(audio_filepath):
+    if audio_filepath is None:
+      gr.Warning('No audio found. Please try again!')
+    # This line defines a Python function named 'transcribe_speech'
+    # It takes one argument: 'audio_filepath', which is expected to be a string
+    # representing the path to an audio file on your system (e.g., 'my_audio.wav').
+    # 1. Load audio from file
+    # This line uses 'sf.read()' (likely from the 'soundfile' library, or similar)
+    # to read the contents of the audio file specified by 'audio_filepath'.
+    # It returns two main pieces of information:
+    # - 'audio': A NumPy array containing the numerical samples of the audio waveform.
+    #            This is the raw digital representation of the sound.
+    # - 'sr': The sampling rate (in Hertz) of the audio. This tells you how many
+    #         samples per second are in the 'audio' array (e.g., 16000 Hz, 44100 Hz).
+    audio, sr = sf.read(audio_filepath)
+    # 2. Pass audio data to the ASR model/pipeline for transcription
+    # This is the core step where the speech recognition happens.
+    # - 'asr': This variable (which must be defined and initialized elsewhere in your code)
+    #          represents your pre-trained ASR model or, more likely, a Hugging Face
+    #          ASR pipeline (like the one you'd get from `pipeline("automatic-speech-recognition", model="...")`).
+    # - `{"array": audio, "sampling_rate": sr}`: This is the crucial input format
+    #          expected by many Hugging Face ASR models and pipelines. It's a dictionary
+    #          where:
+    #          - 'array': Contains the raw numerical audio waveform.
+    #          - 'sampling_rate': Provides the corresponding sampling rate.
+    #          The ASR model needs both to correctly interpret the audio.
+    # - 'result': The output from the 'asr' model/pipeline. For ASR tasks, this is
+    #             typically a dictionary containing the transcribed text and potentially
+    #             other metadata (like word timestamps or confidence scores).
+    result = asr(
+        {"array": audio, "sampling_rate": sr}
+    )
+    # 3. Extract and return the transcribed text
+    # The ASR pipeline or model usually returns its primary output (the transcription)
+    # under a specific key, commonly 'text'.
+    # This line extracts that text string from the 'result' dictionary.
+    return result['text']
+mic_transcribe = gr.Interface(
+    fn=transcribe_speech,
+    inputs=gr.Audio(
+        sources="microphone",
+        type="filepath",
+        label="🎤 Speak into your microphone" # Appealing label
+    ),
+    outputs=gr.Textbox(
+        label="📝 Transcription Result", # Appealing label
+        lines=4, # Slightly more lines for longer transcriptions
+        placeholder="Your transcribed text will appear here..."
+    ),
+    allow_flagging="never", # Disable flagging
+    description="Record your voice directly using your device's microphone. Get an instant transcription."
+)
+file_transcribe = gr.Interface(
+    fn=transcribe_speech,
+    inputs=gr.Audio(
+        sources="upload", # Allow input from file upload
+        type="filepath",  # Function receives audio as a temporary file path
+        label="📁 Upload an Audio File" # Appealing label
+    ),
+    outputs=gr.Textbox(
+        label="📝 Transcription Result", # Appealing label
+        lines=4, # Slightly more lines
+        placeholder="Upload an audio file (e.g., .wav, .mp3) to get its transcription."
+    ),
+    allow_flagging="never", # Disable flagging
+    description="Upload an audio file for transcription."
+)
+custom_css = """
+/* Import Google Font - Arial (or a very similar sans-serif if Arial isn't universally available on all systems) */
+/* Note: Arial is typically a system font, so direct import isn't strictly necessary for it to work,
+   but it's good practice for other fonts. */
+@import url('https://fonts.googleapis.com/css2?family=Arial:wght@400;700&display=swap');
+/* Apply Arial to ALL text elements by default within the Gradio container */
+.gradio-container, body, button, input, select, textarea, div, p, span, h1, h2, h3, h4, h5, h6 {
+    font-family: 'Arial', sans-serif !important;
+}
+/* Overall container styling */
+.gradio-container {
+    max-width: 900px; /* Limit overall width for better readability */
+    margin: 30px auto; /* Center the app on the page */
+    padding: 30px;
+    border-radius: 15px; /* Rounded corners for a softer look */
+    box-shadow: 0 8px 25px rgba(0, 0, 0, 0.1); /* Subtle shadow for depth */
+    background-color: #ffffff; /* White background for the main content area */
+}
+/* Titles and Headers */
+h1 {
+    color: #34495e; /* Darker blue-grey for main title */
+    text-align: center;
+    font-size: 2.5em; /* Larger main title */
+    margin-bottom: 10px;
+    font-weight: 700; /* Bold */
+}
+h3 {
+    color: #5d6d7e; /* Slightly lighter blue-grey for subtitle */
+    text-align: center;
+    font-size: 1.2em;
+    margin-top: 0;
+    margin-bottom: 25px;
+}
+p {
+    text-align: center;
+    color: #7f8c8d; /* Muted grey for descriptions */
+    font-size: 0.95em;
+    margin-bottom: 20px;
+}
+/* Tabbed Interface Styling */
+.tabs {
+    border-radius: 10px;
+    overflow: hidden; /* Ensures rounded corners on tabs */
+    margin-bottom: 20px;
+}
+.tab-nav button {
+    background-color: #ecf0f1; /* Light grey for inactive tabs */
+    color: #34495e; /* Dark text for inactive tabs */
+    font-weight: bold;
+    padding: 12px 20px;
+    border-radius: 8px 8px 0 0;
+    margin-right: 5px; /* Small space between tabs */
+    transition: all 0.3s ease;
+}
+.tab-nav button.selected {
+    background-color: #4a90e2; /* Vibrant blue for active tab */
+    color: white; /* White text for active tab */
+    box-shadow: 0 4px 10px rgba(74, 144, 226, 0.3); /* Subtle shadow for active tab */
+}
+/* Input and Output Component Styling (General) */
+.gr-box {
+    border-radius: 10px; /* Rounded corners for input/output boxes */
+    border: 1px solid #dfe6e9; /* Light border */
+    box-shadow: 0 2px 8px rgba(0, 0, 0, 0.05); /* Very subtle shadow */
+    padding: 20px;
+    background-color: #fcfcfc; /* Slightly off-white background */
+}
+/* Labels within components (e.g., "Upload Audio File", "Transcription Result") */
+.label {
+    font-weight: bold;
+    color: #2c3e50; /* Dark text for labels */
+    font-size: 1.1em;
+    margin-bottom: 8px;
+}
+/* Buttons (Clear, Submit) */
+.gr-button {
+    background-color: #4a90e2 !important; /* Primary blue for actions */
+    color: white !important;
+    border: none !important;
+    border-radius: 8px !important; /* Rounded buttons */
+    padding: 12px 25px !important;
+    font-weight: bold !important;
+    transition: background-color 0.3s ease, box-shadow 0.3s ease !important;
+    margin: 5px; /* Spacing between buttons */
+}
+.gr-button:hover {
+    background-color: #3a7bd2 !important; /* Darker blue on hover */
+    box-shadow: 0 4px 15px rgba(74, 144, 226, 0.4) !important;
+}
+/* Clear button specific */
+.gr-button.secondary {
+    background-color: #e0e6eb !important; /* Lighter grey for clear */
+    color: #34495e !important;
+}
+.gr-button.secondary:hover {
+    background-color: #d1d8df !important;
+    box-shadow: none !important;
+}
+/* Textbox specific */
+textarea {
+    border-radius: 8px !important;
+    border: 1px solid #bdc3c7 !important;
+    padding: 10px !important;
+    resize: vertical; /* Allow vertical resizing */
+}
+/* Audio component player */
+.gr-audio-player {
+    border-radius: 8px;
+    background-color: #f0f0f0;
+    padding: 10px;
+}
+/* Footer styling */
+hr {
+    border: none;
+    border-top: 1px solid #e0e0e0;
+    margin-top: 30px;
+    margin-bottom: 15px;
+}
+.footer-text {
+    font-size: 0.85em;
+    color: #a0a0a0;
+    text-align: center;
+}
+"""
+# --- 6. Main Gradio App using Blocks for layout and styling ---
+# Initialize a Gradio Blocks interface with a theme and custom CSS.
+demo = gr.Blocks(
+    theme=gr.themes.Soft(), # A good base theme for soft colors
+    css=custom_css          # Apply our custom CSS
+)
+# Define the layout within the 'demo' Blocks context
+with demo:
+    # Main Title and Description using Markdown for rich formatting and appealing colors
+    # Removed inline style for font-family as it's handled by global CSS now.
+    gr.Markdown(
+        """
+        <center>
+            <h1 style="color: #4A90E2;">
+                🎙️ AI-Powered Speech-to-Text Transcriber 📝
+            </h1>
+            <h3 style="color: #6C7A89;">
+                Developed by Muhammad Farhan Aslam.
+            </h3>
+            <h3 style="color: #6C7A89;">
+                Convert spoken words into accurate text with ease and precision.
+            </h3>
+            <p style="color: #8C9CA7; font-size: 1.05em;">
+                Effortlessly transcribe audio from your microphone or by uploading a file.
+                This application leverages advanced AI to provide clear and reliable transcriptions.
+            </p>
+        </center>
+        """
+    )
+    # Create a tabbed interface for microphone and file upload transcription
+    gr.TabbedInterface(
+        [file_transcribe, mic_transcribe],
+        ["📁 Transcribe Audio File", "🎤 Transcribe from Microphone"],
+    )
+    # Add a subtle footer for information or credits
+    gr.Markdown(
+        """
+        <hr>
+        <p class="footer-text">
+            Built with ❤️ and Gradio on Hugging Face Transformers.
+        </p>
+        """
+    )
+# start_port = int(os.environ.get('PORT1', 7861))
+# demo.launch(share=True, server_port=start_port)