Spaces:

mahemall
/

kannadasptotext

Sleeping

App Files Community

mahemall committed on Jul 18, 2024

Commit

4166b5e

verified ·

1 Parent(s): 4ed7f22

Create app.py

Browse files

Files changed (1) hide show

app.py +88 -0

app.py ADDED Viewed

	@@ -0,0 +1,88 @@

+import os
+
+import gradio as gr
+import numpy as np
+import soundfile as sf  # Import soundfile for audio file handling
+import torch
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+from transformers import WhisperForConditionalGeneration, WhisperProcessor
+# Choose a suitable Kannada speech-to-text model from Hugging Face
+model_name = "vasista22/whisper-kannada-tiny"  # Replace with your preferred model
+processor = Wav2Vec2Processor.from_pretrained(model_name)
+model = Wav2Vec2ForCTC.from_pretrained(model_name)
+def transcribe_kannada(audio_data):
+    """
+    Transcribes recorded Kannada audio using the specified Hugging Face model.
+    Args:
+        audio_data: A NumPy array representing the recorded audio data.
+    Returns:
+        The transcribed text in Kannada.
+    """
+    sampling_rate = 16000  # Assuming common speech sampling rate (adjust if needed)
+    audio_input = processor(audio_data, sampling_rate=sampling_rate, return_tensors="pt")
+    with torch.no_grad():
+        logits = model(**audio_input).logits
+        predicted_ids = torch.argmax(logits, dim=-1)
+        transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
+    return transcription
+def record_and_transcribe(audio):
+    """
+    Records audio from the microphone, processes each channel independently (if applicable),
+    converts them to speech-to-text, and plays reversed audio.
+    Args:
+        audio: A tuple containing recorded audio information (multiple audio channels).
+    Returns:
+        A list of transcriptions (one for each channel), or a tuple with transcriptions and reversed audio.
+    """
+    transcriptions = []
+    for channel in audio:
+        # Process each audio channel (replace with your actual conversion logic)
+        audio_data = channel  # Assuming no processing needed for individual channels
+        transcription = transcribe_kannada(audio_data)
+        transcriptions.append(transcription)
+    # ... (handle reversed audio if needed)
+    return transcriptions  # Or a tuple with transcriptions and reversed audio
+# input_audio = gr.Audio(
+#     sources=["microphone"],
+#     type="numpy",  # Specify audio format as NumPy array
+#     normalization=" [-1, 1]",  # Normalize audio data to -1 to 1 range for model compatibility
+#     label="Record Kannada Audio",
+# )
+input_audio = gr.Audio(
+    sources=["microphone"],
+    type="numpy",  # Specify audio format as NumPy array
+    label="Record Kannada Audio",
+)
+text_output = gr.Textbox(label="Transcription (ಕನ್ನಡ)")
+audio_output = gr.Audio(label="Reversed Audio (Optional)", type="numpy")
+demo = gr.Interface(
+    fn=record_and_transcribe,
+    inputs=input_audio,
+    outputs=[text_output, audio_output],
+    description="Kannada Speech-to-Text and Reverse Audio",
+)
+if __name__ == "__main__":
+    demo.launch(share=True)