Spaces:

adiitya29
/

Multilingual-ASR

Running

App Files Files Community

adiitya29 committed on 8 days ago

Commit

c17b8b3

0 Parent(s):

barebone setup of project structure

Browse files

Files changed (9) hide show

.gitignore +180 -0
README.md +41 -0
app.py +51 -0
app/__init__.py +1 -0
app/asr_model.py +15 -0
app/audio_processing.py +7 -0
app/history.py +13 -0
app/language_detection.py +8 -0
requirements.txt +20 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,180 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains IDEs (PyCharm) store per-project, per-user settings in .idea/;
+#  ignore the whole folder so local IDE configuration is not committed.
+.idea/
+# VS Code
+.vscode/
+# Mac OS
+.DS_Store
+# Models and large datasets
+*.pt
+*.pth
+*.bin
+*.h5
+*.hdf5
+data/*
+!data/.gitkeep
+models/*
+!models/.gitkeep
+# Histories and output transcripts
+*.json
+*.csv
+!app/templates/*.json

README.md ADDED Viewed

	@@ -0,0 +1,41 @@

+# Multilingual Automatic Speech Recognition (ASR)
+This project provides a web application to upload audio files, detect spoken language, convert speech to text, and download transcripts. It leverages pre-trained Wav2Vec models from Hugging Face and uses Gradio for the frontend interface.
+## Features
+- Upload audio files
+- (Optional) Detect spoken language
+- Speech-to-text conversion via Hugging Face Wav2Vec
+- Save and manage transcription history
+- Download transcripts
+## Setup
+1. **Clone the repository** (or download the source code).
+2. **Create a virtual environment**:
+   ```bash
+   python -m venv venv
+   source venv/bin/activate  # On Windows: venv\Scripts\activate
+   ```
+3. **Install dependencies**:
+   ```bash
+   pip install -r requirements.txt
+   ```
+## Usage
+To start the Gradio web interface, run:
+```bash
+python app.py
+```
+Open the local URL provided in the terminal in your browser.
+## Project Structure
+- `app.py`: Main entry point for the Gradio interface.
+- `app/`: Module containing logic for audio processing, ASR inference, language detection, and history management.
+- `data/`: Folder to hold sample audio files and exported histories.
+- `notebooks/`: Jupyter notebooks for experiments and fine-tuning (planned; not included in the initial commit).
+- `tests/`: Unit testing suite (planned; not included in the initial commit).

app.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import gradio as gr
+from app.asr_model import load_model, transcribe_audio
+from app.language_detection import detect_language
+from app.history import save_to_history, export_history
+def process_audio(audio_path):
+    if audio_path is None:
+        return "No audio uploaded.", "Unknown"
+    # Optional: Detect Language
+    lang = detect_language(audio_path)
+    # Transcribe Speech
+    transcript = transcribe_audio(audio_path)
+    # Save History
+    save_to_history(audio_path, transcript, lang)
+    return transcript, lang
+def create_ui():
+    with gr.Blocks(title="Multilingual ASR") as demo:
+        gr.Markdown("# Multilingual Automatic Speech Recognition")
+        gr.Markdown("Upload an audio file to get a text transcription using Wav2Vec.")
+        with gr.Row():
+            with gr.Column():
+                audio_input = gr.Audio(type="filepath", label="Upload Audio")
+                transcribe_btn = gr.Button("Transcribe")
+            with gr.Column():
+                lang_output = gr.Textbox(label="Detected Language")
+                transcript_output = gr.Textbox(label="Transcription", lines=10)
+                download_btn = gr.File(label="Download Transcript (Coming Soon)")
+        transcribe_btn.click(
+            fn=process_audio,
+            inputs=audio_input,
+            outputs=[transcript_output, lang_output]
+        )
+    return demo
+if __name__ == "__main__":
+    # Script entry point: load the model once at startup, before the UI is created.
+    print("Loading model...")
+    load_model()
+    print("Model loaded. Starting UI...")
+    # Build the Gradio interface and start serving it.
+    demo = create_ui()
+    demo.launch()

app/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


+# Init file to make app a package

app/asr_model.py ADDED Viewed

	@@ -0,0 +1,15 @@

+# This module handles the loading and inferencing of the Wav2Vec model
+def load_model():
+    """
+    Placeholder for loading the Hugging Face Wav2Vec model and processor.
+    Not implemented yet: the body is a no-op, so calling it returns ``None``.
+    Planned: utilize MPS (Metal Performance Shaders) on Apple Silicon.
+    """
+    # TODO: load the model and processor here.
+    pass
+def transcribe_audio(audio_filepath: str) -> str:
+    """
+    Return the text transcription for the audio file at ``audio_filepath``.
+    Placeholder implementation: the file is not read and no model is run;
+    a fixed message is returned until the Wav2Vec model is integrated.
+    """
+    # TODO: process the audio and run it through the Wav2Vec model.
+    return "This is a placeholder transcription. Model integration is pending."

app/audio_processing.py ADDED Viewed

	@@ -0,0 +1,7 @@

+# This module handles audio preprocessing using libraries like librosa
+def load_and_resample(audio_filepath: str, target_sr: int = 16000):
+    """
+    Intended to load an audio file and resample it to ``target_sr`` Hz
+    (default 16000, i.e. 16 kHz for Wav2Vec).
+    Not implemented yet: the body is a no-op, so calling it returns ``None``.
+    """
+    # TODO: implement loading and resampling.
+    pass

app/history.py ADDED Viewed

	@@ -0,0 +1,13 @@

+# This module manages saving transcriptions to history and exporting them
+def save_to_history(audio_filepath: str, transcript: str, language: str):
+    """
+    Intended to save one transcription record (audio path, transcript text,
+    language) to a local JSON or CSV file in the data/ directory.
+    Not implemented yet: the body is a no-op and nothing is written.
+    """
+    # TODO: implement persistence of the history record.
+    pass
+def export_history(format: str = "csv"):
+    """
+    Intended to export the saved history into a downloadable format
+    (``format`` defaults to "csv").
+    Not implemented yet: the body is a no-op, so calling it returns ``None``.
+    """
+    # NOTE: the parameter name `format` shadows the builtin of the same name;
+    # it is kept unchanged so existing keyword callers keep working.
+    pass

app/language_detection.py ADDED Viewed

	@@ -0,0 +1,8 @@

+# This module handles language detection logic
+def detect_language(audio_filepath: str) -> str:
+    """
+    Optional feature: return the spoken language of the audio file at
+    ``audio_filepath``.
+    Placeholder implementation: the file is not inspected and a fixed label
+    is returned. A real version could use a separate small classification
+    model or an API.
+    """
+    # TODO: replace the fixed label with actual language detection.
+    return "English (Placeholder)"

requirements.txt ADDED Viewed

	@@ -0,0 +1,20 @@

+# Core Deep Learning
+torch>=2.0.0
+torchaudio>=2.0.0
+transformers>=4.30.0
+# Audio Processing
+librosa>=0.10.0
+soundfile>=0.12.1
+# Web Interface
+gradio>=4.0.0
+# Language Detection (Optional)
+langdetect>=1.0.9
+fasttext>=0.9.2  # Alternative for lang detect
+# Utilities
+numpy>=1.24.0
+pandas>=2.0.0
+python-dotenv>=1.0.0