Spaces:

adiitya29
/

Multilingual-ASR

Running

App Files Files Community

adiitya29 committed 7 days ago

Commit

cf2c908

1 Parent(s): 8f2047c

Created the frontend UI with Gradio, created the FastAPI app, and added a notebooks folder for fine-tuning and evaluating models

Browse files

Files changed (4) hide show

gradio_ui.py +70 -0
main.py +53 -0
notebooks/01_evaluation.ipynb +74 -0
notebooks/02_finetuning.ipynb +109 -0

gradio_ui.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import gradio as gr
+from app.asr_model import load_model, transcribe_audio
+from app.language_detection import detect_language_from_text
+from app.history import save_to_history, export_history
+def process_audio(audio_path):
+    if audio_path is None:
+        return "No audio uploaded.", "Unknown"
+    print(f"\n--- New Request ---")
+    print(f"Processing audio: {audio_path}")
+    # Transcribe Speech
+    print("Transcribing... (If this is the first time, it is downloading a 400MB model)")
+    transcript = transcribe_audio(audio_path)
+    print(f"Transcription complete: {transcript[:50]}...")
+    # Detect Language from transcript
+    print("Detecting language...")
+    lang = detect_language_from_text(transcript)
+    # Save History
+    print("Saving to history...")
+    save_to_history(audio_path, transcript, lang)
+    print("Done!\n")
+    return transcript, lang
+def export_history_wrapper():
+    path = export_history("csv")
+    return path if path else None
+def create_ui():
+    with gr.Blocks(title="Multilingual ASR") as demo:
+        gr.Markdown("# Multilingual Automatic Speech Recognition")
+        with gr.Tabs():
+            with gr.TabItem("Transcribe"):
+                gr.Markdown("Upload an audio file to get a text transcription using Wav2Vec.")
+                with gr.Row():
+                    with gr.Column():
+                        audio_input = gr.Audio(type="filepath", label="Upload Audio")
+                        transcribe_btn = gr.Button("Transcribe", variant="primary")
+                    with gr.Column():
+                        lang_output = gr.Textbox(label="Detected Language")
+                        transcript_output = gr.Textbox(label="Transcription", lines=10)
+                transcribe_btn.click(
+                    fn=process_audio,
+                    inputs=audio_input,
+                    outputs=[transcript_output, lang_output]
+                )
+            with gr.TabItem("History"):
+                gr.Markdown("Download your past transcriptions.")
+                download_btn = gr.Button("Prepare History for Download")
+                file_output = gr.File(label="Download CSV")
+                download_btn.click(
+                    fn=export_history_wrapper,
+                    outputs=file_output
+                )
+    return demo
+# Script entry point: build the UI and start the Gradio server when this
+# file is executed directly (nothing is launched on import).
+if __name__ == "__main__":
+    demo = create_ui()
+    demo.launch()

main.py ADDED Viewed

	@@ -0,0 +1,53 @@

+from fastapi import FastAPI, UploadFile, File, HTTPException
+import gradio as gr
+from gradio_ui import create_ui
+from app.asr_model import transcribe_audio
+from app.language_detection import detect_language_from_text
+from app.history import save_to_history
+import os
+import tempfile
+import shutil
+# Initialize FastAPI app
+api = FastAPI(title="Multilingual ASR API", description="REST API for audio transcription", version="1.0.0")
+@api.post("/api/transcribe")
+async def api_transcribe(audio_file: UploadFile = File(...)):
+    """
+    REST endpoint to upload an audio file and get its transcription.
+    """
+    if not audio_file.filename:
+        raise HTTPException(status_code=400, detail="No file provided")
+    try:
+        # Save uploaded file to a temporary file
+        fd, temp_path = tempfile.mkstemp(suffix=os.path.splitext(audio_file.filename)[1])
+        with os.fdopen(fd, "wb") as f:
+            shutil.copyfileobj(audio_file.file, f)
+        # Run inference
+        transcript = transcribe_audio(temp_path)
+        lang = detect_language_from_text(transcript)
+        # Save to history
+        save_to_history(audio_file.filename, transcript, lang)
+        # Cleanup temp file
+        os.remove(temp_path)
+        return {
+            "filename": audio_file.filename,
+            "language": lang,
+            "transcript": transcript
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+# Mount Gradio app on root
+demo = create_ui()
+app = gr.mount_gradio_app(api, demo, path="/")
+if __name__ == "__main__":
+    import uvicorn
+    # Run the unified app with uvicorn
+    uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)

notebooks/01_evaluation.ipynb ADDED Viewed

	@@ -0,0 +1,74 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Model Evaluation\n",
+    "\n",
+    "This notebook demonstrates how to evaluate your Wav2Vec2 model on a test dataset using the Word Error Rate (WER) metric."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "source": [
+    "!pip install evaluate jiwer datasets"
+   ],
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "source": [
+    "import evaluate\n",
+    "from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor\n",
+    "import torch\n",
+    "\n",
+    "# Load metric\n",
+    "wer_metric = evaluate.load(\"wer\")\n",
+    "\n",
+    "# Load model and processor\n",
+    "model_id = \"facebook/wav2vec2-base-960h\"\n",
+    "processor = Wav2Vec2Processor.from_pretrained(model_id)\n",
+    "model = Wav2Vec2ForCTC.from_pretrained(model_id)"
+   ],
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Compute WER on sample predictions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "source": [
+    "predictions = [\"this is a test\", \"hello world\"]\n",
+    "references = [\"this is a test\", \"hello word\"]\n",
+    "\n",
+    "wer = wer_metric.compute(predictions=predictions, references=references)\n",
+    "print(f\"Word Error Rate (WER): {wer}\")"
+   ],
+   "outputs": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

notebooks/02_finetuning.ipynb ADDED Viewed

	@@ -0,0 +1,109 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Model Fine-tuning\n",
+    "\n",
+    "This notebook provides a skeleton for fine-tuning the Wav2Vec2 model on your custom dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "source": [
+    "!pip install datasets transformers accelerate librosa soundfile"
+   ],
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "source": [
+    "from datasets import load_dataset, Audio\n",
+    "\n",
+    "# Load your dataset here (the example uses Common Voice; replace it with your own dataset)\n",
+    "# dataset = load_dataset(\"mozilla-foundation/common_voice_11_0\", \"en\", split=\"train\")\n",
+    "\n",
+    "# Ensure audio is resampled to 16kHz\n",
+    "# dataset = dataset.cast_column(\"audio\", Audio(sampling_rate=16000))"
+   ],
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "source": [
+    "from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, TrainingArguments, Trainer\n",
+    "\n",
+    "model_id = \"facebook/wav2vec2-base\"\n",
+    "processor = Wav2Vec2Processor.from_pretrained(model_id)\n",
+    "model = Wav2Vec2ForCTC.from_pretrained(\n",
+    "    model_id, \n",
+    "    ctc_loss_reduction=\"mean\", \n",
+    "    pad_token_id=processor.tokenizer.pad_token_id\n",
+    ")"
+   ],
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Training Setup\n",
+    "Set up the DataCollator and TrainingArguments here."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "source": [
+    "# training_args = TrainingArguments(\n",
+    "#     output_dir=\"./wav2vec2-finetuned\",\n",
+    "#     group_by_length=True,\n",
+    "#     per_device_train_batch_size=16,\n",
+    "#     evaluation_strategy=\"steps\",\n",
+    "#     num_train_epochs=10,\n",
+    "#     fp16=True, # Set to False if on MPS without FP16 support\n",
+    "#     save_steps=500,\n",
+    "#     eval_steps=500,\n",
+    "#     logging_steps=500,\n",
+    "#     learning_rate=1e-4,\n",
+    "#     warmup_steps=1000,\n",
+    "#     save_total_limit=2,\n",
+    "# )\n",
+    "\n",
+    "# trainer = Trainer(\n",
+    "#     model=model,\n",
+    "#     data_collator=data_collator,\n",
+    "#     args=training_args,\n",
+    "#     compute_metrics=compute_metrics,\n",
+    "#     train_dataset=dataset,\n",
+    "#     eval_dataset=dataset,\n",
+    "#     tokenizer=processor.feature_extractor,\n",
+    "# )\n",
+    "\n",
+    "# trainer.train()"
+   ],
+   "outputs": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}