Spaces:
Sleeping
Sleeping
New files uploaded
Browse files- app.py +202 -0
- hf_README.md +52 -0
- hf_requirements.txt +36 -0
- hf_transcriber.py +154 -0
- recorder.py +119 -0
- transcriber.py +152 -0
app.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Standard library
import base64
import os
import tempfile
import time
from datetime import datetime

# Third-party
import numpy as np
import pretty_midi
import sounddevice as sd
import streamlit as st

# Local modules
from hf_transcriber import HFTranscriber, transcribe_with_hf
from recorder import AudioRecorder, list_audio_devices
from transcriber import AudioTranscriber
|
| 12 |
+
|
| 13 |
+
def get_binary_file_downloader_html(bin_file, file_label='File'):
    """Build an HTML anchor that serves *bin_file* as a base64 data-URI download."""
    with open(bin_file, 'rb') as handle:
        payload = handle.read()
    encoded = base64.b64encode(payload).decode()
    download_name = os.path.basename(bin_file)
    return (
        f'<a href="data:application/octet-stream;base64,{encoded}" '
        f'download="{download_name}">Download {file_label}</a>'
    )
|
| 20 |
+
|
| 21 |
+
def main():
    """Streamlit entry point: record or upload audio, transcribe it to MIDI/MusicXML."""
    st.set_page_config(page_title="Audio to Sheet Music Transcriber", layout="wide")

    st.title("🎵 Audio to Sheet Music Transcriber")
    st.markdown("### Convert monophonic audio to sheet music")

    # Session state survives Streamlit reruns, so the recorder and the
    # "currently recording" flag must live there, not in locals.
    if 'recorder' not in st.session_state:
        st.session_state.recorder = AudioRecorder()
    if 'recording' not in st.session_state:
        st.session_state.recording = False

    st.sidebar.header("Transcription Settings")
    use_hf = st.sidebar.checkbox("Use Hugging Face Model", value=True)

    if use_hf:
        model_name = st.sidebar.selectbox(
            "Select Model",
            ["microsoft/speecht5_asr", "facebook/wav2vec2-base-960h"],
            index=0
        )

    # Recording or Upload Section
    st.sidebar.header("Audio Input")
    input_method = st.sidebar.radio("Choose input method:", ["Upload Audio File", "Record Live Audio"])

    if input_method == "Record Live Audio":
        st.header("🎤 Live Audio Recording")

        # Show available audio devices
        devices = list_audio_devices()
        if not devices:
            st.error("No audio input devices found!")
        else:
            device_names = [f"{i}: {d['name']} (Channels: {d['input_channels']}, Rate: {int(d['default_samplerate'])}Hz)"
                            for i, d in enumerate(devices)]
            selected_device = st.selectbox("Select audio input device:", device_names,
                                           index=next((i for i, d in enumerate(devices) if d['id'] == sd.default.device[0]), 0))

            # BUG FIX: `devices` is a *filtered* list of input-capable devices, so it
            # must be indexed by list position; the 'id' field holds the global
            # sounddevice index and cannot be used to subscript `devices`.
            selected_info = devices[device_names.index(selected_device)]
            device_id = selected_info['id']

            col1, col2 = st.columns(2)

            with col1:
                if st.button("🎤 Start Recording", disabled=st.session_state.recording):
                    sd.default.device = device_id
                    st.session_state.recorder = AudioRecorder(
                        sample_rate=int(selected_info['default_samplerate']),
                        channels=min(1, selected_info['input_channels'])
                    )
                    st.session_state.recorder.start_recording()
                    st.session_state.recording = True
                    st.session_state.recording_start = time.time()
                    st.rerun()

            with col2:
                if st.button("⏹ Stop Recording", disabled=not st.session_state.recording):
                    audio_data = st.session_state.recorder.stop_recording()
                    if audio_data is not None:
                        # Persist the take with a timestamped name.
                        filename = f"recording_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav"
                        filepath = st.session_state.recorder.save_recording(filename)
                        st.session_state.recorded_file = filepath
                    # BUG FIX: clear the flag even when nothing was captured,
                    # otherwise the UI stays stuck in "recording" mode.
                    st.session_state.recording = False
                    st.rerun()

            if st.session_state.recording:
                recording_duration = time.time() - st.session_state.recording_start
                st.warning(f"🔴 Recording... {int(recording_duration)} seconds")
                # BUG FIX: only render the live preview once data exists;
                # st.audio(None) raises.
                if st.session_state.recorder.recording_data:
                    st.audio(np.concatenate(st.session_state.recorder.recording_data),
                             sample_rate=st.session_state.recorder.sample_rate)

        if hasattr(st.session_state, 'recorded_file') and os.path.exists(st.session_state.recorded_file):
            st.audio(st.session_state.recorded_file)
            uploaded_file = st.session_state.recorded_file
        else:
            uploaded_file = None
    else:
        # File upload option
        st.header("📁 Upload Audio File")
        uploaded_file = st.file_uploader("Choose an audio file", type=['wav', 'mp3', 'ogg', 'flac'])

    if uploaded_file is not None:
        # `uploaded_file` is either a Streamlit UploadedFile or a recorded file path.
        if hasattr(uploaded_file, 'read'):  # It's a file upload object
            # Save uploaded file to a temporary file
            with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp_file:
                tmp_file.write(uploaded_file.getvalue())
                tmp_file_path = tmp_file.name
            st.audio(uploaded_file, format=uploaded_file.type)
        else:  # It's a recorded file path
            tmp_file_path = uploaded_file
            st.audio(tmp_file_path, format='audio/wav')

        if st.button("Transcribe Audio"):
            with st.spinner('Transcribing audio... This may take a moment...'):
                try:
                    # Create output file paths
                    base_name = os.path.splitext(os.path.basename(tmp_file_path))[0]
                    output_dir = "outputs"
                    os.makedirs(output_dir, exist_ok=True)

                    midi_path = os.path.join(output_dir, f"{base_name}.mid")
                    musicxml_path = os.path.join(output_dir, f"{base_name}.musicxml")

                    if use_hf:
                        st.info(f"Using Hugging Face model: {model_name}")
                        hf_result = transcribe_with_hf(tmp_file_path, model_name)

                        # Placeholder mapping: every detected note becomes a
                        # fixed 0.5 s piano note.
                        midi = pretty_midi.PrettyMIDI()
                        piano_program = pretty_midi.instrument_name_to_program('Acoustic Grand Piano')
                        piano = pretty_midi.Instrument(program=piano_program)

                        for i, midi_pitch in enumerate(hf_result['notes']):
                            piano.notes.append(pretty_midi.Note(
                                velocity=100,
                                pitch=min(max(int(midi_pitch), 0), 127),  # clamp to valid MIDI range
                                start=i * 0.5,  # 0.5 seconds per note
                                end=(i + 1) * 0.5
                            ))

                        midi.instruments.append(piano)
                        midi.write(midi_path)

                        # BUG FIX: music21's midiFileToStream() expects a parsed
                        # MidiFile, not a path string; reuse the helper that
                        # opens/reads the file correctly.
                        AudioTranscriber.midi_to_musicxml(midi_path, musicxml_path)

                        onsets = np.arange(0, len(hf_result['notes']) * 0.5, 0.5)
                        pitches = hf_result['notes']
                    else:
                        # Use the original signal-processing pipeline.
                        transcriber = AudioTranscriber()
                        midi, onsets, pitches = transcriber.transcribe(
                            tmp_file_path,
                            midi_path,
                            musicxml_path
                        )

                    st.success("Transcription complete!")

                    # Display results
                    col1, col2 = st.columns(2)

                    with col1:
                        st.subheader("MIDI File")
                        st.markdown(get_binary_file_downloader_html(midi_path, "MIDI"), unsafe_allow_html=True)

                    with col2:
                        st.subheader("Sheet Music (MusicXML)")
                        st.markdown(get_binary_file_downloader_html(musicxml_path, "MusicXML"), unsafe_allow_html=True)

                    # Display some statistics
                    st.subheader("Transcription Details")
                    st.write(f"- Detected {len(onsets)} note onsets")
                    # Need at least two onsets to form a duration.
                    if len(onsets) > 1:
                        st.write(f"- Average note duration: {np.mean(np.diff(onsets)):.2f} seconds")

                    # Clean up temporary file
                    os.unlink(tmp_file_path)

                except Exception as e:
                    st.error(f"An error occurred during transcription: {str(e)}")
                    if os.path.exists(tmp_file_path):
                        os.unlink(tmp_file_path)
|
| 199 |
+
|
| 200 |
+
if __name__ == "__main__":
    # numpy is already imported at module scope; the redundant local
    # `import numpy as np` that used to live here has been removed.
    main()
|
hf_README.md
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Audio to Sheet Music Transcriber
|
| 2 |
+
|
| 3 |
+
A web application that converts monophonic audio recordings into sheet music using machine learning. This app can transcribe audio files (WAV, MP3, OGG, or FLAC) or record live audio and convert it to MIDI and MusicXML formats.
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
- Upload audio files (WAV, MP3, OGG, FLAC) for transcription
|
| 8 |
+
- Record audio directly in the browser
|
| 9 |
+
- Choose between different transcription models
|
| 10 |
+
- Download MIDI and MusicXML files
|
| 11 |
+
- View basic audio visualizations
|
| 12 |
+
|
| 13 |
+
## How to Use
|
| 14 |
+
|
| 15 |
+
1. **Input Audio**:
|
| 16 |
+
- Upload an audio file using the file uploader
|
| 17 |
+
- OR record audio directly in the browser
|
| 18 |
+
|
| 19 |
+
2. **Transcription Settings**:
|
| 20 |
+
- Select your preferred transcription model
|
| 21 |
+
- Adjust audio parameters if needed
|
| 22 |
+
|
| 23 |
+
3. **Process**:
|
| 24 |
+
- Click "Transcribe" to start the transcription
|
| 25 |
+
- Wait for the processing to complete
|
| 26 |
+
|
| 27 |
+
4. **Download**:
|
| 28 |
+
- Download the generated MIDI file
|
| 29 |
+
- Download the MusicXML file for sheet music
|
| 30 |
+
|
| 31 |
+
## Models
|
| 32 |
+
|
| 33 |
+
- **Facebook wav2vec2**: Fast and accurate speech recognition
|
| 34 |
+
- **Microsoft SpeechT5**: High-quality speech recognition with better intonation
|
| 35 |
+
|
| 36 |
+
## Technical Details
|
| 37 |
+
|
| 38 |
+
This app uses:
|
| 39 |
+
- PyTorch and Transformers for audio processing
|
| 40 |
+
- Librosa for audio feature extraction
|
| 41 |
+
- PrettyMIDI and Music21 for MIDI and MusicXML generation
|
| 42 |
+
- Streamlit for the web interface
|
| 43 |
+
|
| 44 |
+
## Limitations
|
| 45 |
+
|
| 46 |
+
- Works best with clean, monophonic recordings
|
| 47 |
+
- May have difficulty with fast passages or complex articulations
|
| 48 |
+
- Performance depends on the quality of the input audio
|
| 49 |
+
|
| 50 |
+
## License
|
| 51 |
+
|
| 52 |
+
MIT License - See the [LICENSE](LICENSE) file for details.
|
hf_requirements.txt
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core dependencies
|
| 2 |
+
numpy>=1.21.0
|
| 3 |
+
librosa>=0.9.2
|
| 4 |
+
pretty_midi>=0.2.10
|
| 5 |
+
music21>=7.3.3
|
| 6 |
+
streamlit>=1.12.0
|
| 7 |
+
scipy>=1.7.0
|
| 8 |
+
matplotlib>=3.4.3
|
| 9 |
+
soundfile>=0.12.1
|
| 10 |
+
pydub>=0.25.1
|
| 11 |
+
scikit-learn>=1.0.2
|
| 12 |
+
|
| 13 |
+
# Hugging Face and deep learning
|
| 14 |
+
transformers>=4.30.0
|
| 15 |
+
datasets>=2.12.0
|
| 16 |
+
torch>=2.0.0,<3.0.0
|
| 17 |
+
torchaudio>=2.0.0
|
| 18 |
+
accelerate>=0.20.0
|
| 19 |
+
sentencepiece # Required for some models
|
| 20 |
+
|
| 21 |
+
# Audio processing
|
| 22 |
+
resampy>=0.4.2
|
| 23 |
+
|
| 24 |
+
# For MIDI processing
|
| 25 |
+
mido>=1.2.10
|
| 26 |
+
python-rtmidi>=1.4.9
|
| 27 |
+
|
| 28 |
+
# For web interface
|
| 29 |
+
gradio>=3.0.0
|
| 30 |
+
|
| 31 |
+
# For file handling
|
| 32 |
+
tqdm>=4.65.0
|
| 33 |
+
|
| 34 |
+
# Local audio-capture dependencies — not needed on Hugging Face Spaces (no microphone access)
|
| 35 |
+
# pyaudio # Not needed on Hugging Face Spaces
|
| 36 |
+
# sounddevice # Not needed on Hugging Face Spaces
|
hf_transcriber.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from transformers import (
|
| 3 |
+
AutoModelForCTC,
|
| 4 |
+
AutoProcessor,
|
| 5 |
+
pipeline,
|
| 6 |
+
SpeechT5Processor,
|
| 7 |
+
SpeechT5ForSpeechToText,
|
| 8 |
+
SpeechT5HifiGan
|
| 9 |
+
)
|
| 10 |
+
import numpy as np
|
| 11 |
+
import librosa
|
| 12 |
+
import torchaudio
|
| 13 |
+
from typing import Tuple, Optional, List, Union, Dict, Any
|
| 14 |
+
import warnings
|
| 15 |
+
warnings.filterwarnings("ignore", message=".*gradient_checkpointing*.")
|
| 16 |
+
|
| 17 |
+
class HFTranscriber:
    """Speech-to-text transcriber backed by a pre-trained Hugging Face model.

    Supports CTC-style models (e.g. wav2vec2) and Microsoft's SpeechT5.
    The recognized text is then mapped to MIDI notes by a placeholder helper.
    """

    def __init__(self, model_name: str = "facebook/wav2vec2-base-960h"):
        """
        Initialize the Hugging Face transcriber with a pre-trained model.

        Args:
            model_name (str): Name of the Hugging Face model to use for transcription.
                Supported models:
                - "facebook/wav2vec2-base-960h" (default)
                - "microsoft/speecht5_asr"
        """
        # Prefer GPU when available; the model (and vocoder) are moved there.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_name = model_name
        self.processor = None
        self.model = None
        self.vocoder = None
        # Model family is inferred from the name: "speecht5" selects the
        # seq2seq SpeechT5 path, anything else the CTC (wav2vec2) path.
        self.is_speecht5 = "speecht5" in model_name.lower()
        self._load_model()

    def _load_model(self):
        """Load the model and processor based on the model type."""
        try:
            if self.is_speecht5:
                # Load SpeechT5 model and processor
                self.processor = SpeechT5Processor.from_pretrained(self.model_name)
                self.model = SpeechT5ForSpeechToText.from_pretrained(self.model_name)
                # NOTE(review): SpeechT5HifiGan is a TTS vocoder and is never used
                # by transcribe_audio(); loading it here looks unnecessary —
                # confirm before removing.
                self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
                self.model.to(self.device)
                self.vocoder.to(self.device)
                self.model.eval()
                self.vocoder.eval()
            else:
                # Load wav2vec2 model and processor
                self.processor = AutoProcessor.from_pretrained(self.model_name)
                self.model = AutoModelForCTC.from_pretrained(self.model_name)
                self.model.to(self.device)
                self.model.eval()
        except Exception as e:
            raise Exception(f"Failed to load model {self.model_name}: {str(e)}")

    def transcribe_audio(self, audio_path: str) -> Tuple[List[int], int]:
        """
        Transcribe audio file to notes using the loaded Hugging Face model.

        Args:
            audio_path (str): Path to the audio file

        Returns:
            tuple: (notes, sample_rate) where notes is a list of MIDI note numbers

        Raises:
            Exception: Wraps any failure during loading, inference, or decoding.
        """
        try:
            # Load and preprocess audio (resampled to 16 kHz mono by _load_audio)
            waveform, sample_rate = self._load_audio(audio_path)

            if self.is_speecht5:
                # Process the audio input for SpeechT5
                inputs = self.processor(
                    audio=waveform,
                    sampling_rate=sample_rate,
                    return_tensors="pt"
                ).to(self.device)

                # Generate transcription (seq2seq decoding)
                with torch.no_grad():
                    generated_ids = self.model.generate(
                        input_values=inputs.input_values,
                        attention_mask=inputs.attention_mask,
                        max_length=1000
                    )

                # Decode the generated ids to text
                transcription = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
            else:
                # Process the audio input for wav2vec2
                inputs = self.processor(
                    waveform,
                    sampling_rate=sample_rate,
                    return_tensors="pt",
                    padding=True
                ).input_values.to(self.device)

                # Get model predictions (frame-level CTC logits)
                with torch.no_grad():
                    logits = self.model(inputs).logits

                # Get predicted token ids (greedy argmax over the vocabulary)
                predicted_ids = torch.argmax(logits, dim=-1)

                # Decode the predicted ids to text
                transcription = self.processor.batch_decode(predicted_ids)[0]

            # Convert text to MIDI notes (placeholder mapping — see helper)
            notes = self._text_to_midi_notes(transcription)

            return notes, sample_rate

        except Exception as e:
            raise Exception(f"Transcription failed: {str(e)}")

    def _text_to_midi_notes(self, text: str) -> List[int]:
        """Convert transcribed text to MIDI notes (simplified example).

        NOTE: the transcription text is currently ignored; a fixed C major
        scale is returned as placeholder output.
        """
        # This is a placeholder - in a real implementation, you'd need to
        # analyze the text to determine the appropriate notes
        # Here we just return a simple C major scale as an example
        return [60, 62, 64, 65, 67, 69, 71, 72]  # C4 to C5

    def _load_audio(self, audio_path: str, target_sr: int = 16000) -> Tuple[np.ndarray, int]:
        """Load an audio file as mono float samples resampled to *target_sr*."""
        # Load audio with librosa (handles resampling and mono conversion)
        waveform, sample_rate = librosa.load(
            audio_path,
            sr=target_sr,
            mono=True,
            duration=None,
            res_type='kaiser_fast'
        )
        return waveform, sample_rate
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def transcribe_with_hf(audio_path: str, model_name: str = "facebook/wav2vec2-base-960h") -> dict:
    """
    Convenience function to transcribe audio using a Hugging Face model.

    Args:
        audio_path (str): Path to the audio file
        model_name (str): Name of the Hugging Face model to use.
            BUG FIX: the previous default, "openai/whisper-tiny", is a
            seq2seq model that HFTranscriber cannot load (it supports only
            CTC models and SpeechT5); the default now matches
            HFTranscriber's own default.

    Returns:
        dict: {'notes': list of MIDI note numbers,
               'sample_rate': int,
               'model': the model name used}
    """
    transcriber = HFTranscriber(model_name=model_name)
    notes, sample_rate = transcriber.transcribe_audio(audio_path)

    return {
        'notes': notes,
        'sample_rate': sample_rate,
        'model': model_name
    }
|
recorder.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sounddevice as sd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import wave
|
| 4 |
+
import os
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
from typing import Optional, Tuple
|
| 7 |
+
import threading
|
| 8 |
+
import queue
|
| 9 |
+
|
| 10 |
+
class AudioRecorder:
    """Capture audio from an input device on a background thread.

    Frames arrive via a sounddevice callback, are placed on a queue, and are
    drained into ``recording_data`` (a list of float32 numpy chunks) by a
    worker thread until ``stop_recording()`` is called.
    """

    def __init__(self, sample_rate: int = 44100, channels: int = 1):
        """
        Initialize the audio recorder.

        Args:
            sample_rate (int): Sample rate in Hz (default: 44100)
            channels (int): Number of audio channels (1 for mono, 2 for stereo)
        """
        self.sample_rate = sample_rate
        self.channels = channels
        self.recording = False           # True while the worker should keep draining
        self.audio_queue = queue.Queue()
        self.recording_thread = None
        self.recording_data = []         # float32 ndarray chunks, in arrival order

    def _callback(self, indata, frames, time, status):
        """sounddevice InputStream callback: copy each incoming buffer onto the queue."""
        if status:
            print(f"Audio input status: {status}")
        # The buffer is reused by the audio backend, so a copy is mandatory.
        self.audio_queue.put(indata.copy())

    def start_recording(self):
        """Start recording audio from the default input device.

        Returns:
            bool: False if a recording is already in progress, True otherwise.
        """
        if self.recording:
            return False

        self.recording = True
        self.recording_data = []

        def record():
            # Worker: keep the stream open and drain queued chunks.
            with sd.InputStream(
                samplerate=self.sample_rate,
                channels=self.channels,
                callback=self._callback,
                dtype='float32'
            ):
                while self.recording:
                    # ROBUSTNESS FIX: a blocking get() could hang forever (and
                    # deadlock stop_recording's join) if the device stops
                    # delivering buffers; poll with a timeout so the loop
                    # always re-checks the recording flag.
                    try:
                        chunk = self.audio_queue.get(timeout=0.25)
                    except queue.Empty:
                        continue
                    self.recording_data.append(chunk)

        self.recording_thread = threading.Thread(target=record)
        self.recording_thread.start()
        return True

    def stop_recording(self) -> Optional[np.ndarray]:
        """
        Stop recording and return the recorded audio data.

        Returns:
            Optional[np.ndarray]: Recorded audio data as a numpy array, or None
            if not currently recording or no data was captured.
        """
        if not self.recording:
            return None

        self.recording = False
        if self.recording_thread:
            self.recording_thread.join()

        if not self.recording_data:
            return None

        return np.concatenate(self.recording_data, axis=0)

    def save_recording(self, filename: str = None, format: str = 'wav') -> str:
        """
        Save the recorded audio to a file under the "recordings" directory.

        Args:
            filename (str, optional): Output filename. If None, generates a timestamped filename.
            format (str): Output format ('wav' or 'npy')

        Returns:
            str: Path to the saved file

        Raises:
            ValueError: If nothing has been recorded or the format is unsupported.
        """
        audio_data = np.concatenate(self.recording_data, axis=0) if self.recording_data else None
        if audio_data is None:
            raise ValueError("No recording data available to save")

        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"recording_{timestamp}.{format}"

        os.makedirs("recordings", exist_ok=True)
        filepath = os.path.join("recordings", filename)

        if format.lower() == 'wav':
            # BUG FIX: clip to [-1, 1] before scaling — float samples outside
            # that range would otherwise wrap around when cast to int16.
            audio_data = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)
            with wave.open(filepath, 'wb') as wf:
                wf.setnchannels(self.channels)
                wf.setsampwidth(2)  # 16-bit PCM
                wf.setframerate(self.sample_rate)
                wf.writeframes(audio_data.tobytes())
        elif format.lower() == 'npy':
            np.save(filepath, audio_data)
        else:
            raise ValueError(f"Unsupported format: {format}")

        return filepath
|
| 110 |
+
|
| 111 |
+
def list_audio_devices() -> list:
    """Return a summary dict for every device that can capture audio input.

    Each entry carries the global sounddevice index ('id'), the device name,
    its input channel count, and its default sample rate.
    """
    found = []
    for idx, dev in enumerate(sd.query_devices()):
        if dev['max_input_channels'] <= 0:
            continue
        found.append({
            'id': idx,
            'name': dev['name'],
            'input_channels': dev['max_input_channels'],
            'default_samplerate': dev['default_samplerate'],
        })
    return found
|
transcriber.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import librosa
|
| 2 |
+
import numpy as np
|
| 3 |
+
from scipy import signal
|
| 4 |
+
|
| 5 |
+
try:
|
| 6 |
+
import pretty_midi
|
| 7 |
+
PRETTY_MIDI_AVAILABLE = True
|
| 8 |
+
except ImportError:
|
| 9 |
+
PRETTY_MIDI_AVAILABLE = False
|
| 10 |
+
import warnings
|
| 11 |
+
warnings.warn("pretty_midi not available. MIDI file generation will be limited.", ImportWarning)
|
| 12 |
+
|
| 13 |
+
class AudioTranscriber:
    """Monophonic audio-to-MIDI transcriber (onset detection + pYIN pitch tracking)."""

    def __init__(self, sample_rate=22050, frame_size=2048, hop_length=512):
        """
        Initialize the AudioTranscriber with audio processing parameters.

        Args:
            sample_rate (int): Sample rate to load audio (Hz)
            frame_size (int): FFT window size
            hop_length (int): Number of samples between frames
        """
        self.sample_rate = sample_rate
        self.frame_size = frame_size
        self.hop_length = hop_length

    def load_audio(self, file_path):
        """Load an audio file as a mono time series resampled to ``self.sample_rate``."""
        y, sr = librosa.load(file_path, sr=self.sample_rate, mono=True)
        return y, sr

    def detect_onsets(self, y):
        """Detect note onset times (in seconds) in the audio signal."""
        # Compute onset envelope, then pick (backtracked) onset frames.
        onset_env = librosa.onset.onset_strength(y=y, sr=self.sample_rate)
        onset_frames = librosa.onset.onset_detect(
            onset_envelope=onset_env,
            sr=self.sample_rate,
            hop_length=self.hop_length,
            backtrack=True
        )
        return librosa.frames_to_time(onset_frames, sr=self.sample_rate, hop_length=self.hop_length)

    def estimate_pitch(self, y, onsets):
        """Estimate one fundamental frequency (Hz) per onset segment.

        BUG FIX: the original loop stopped at ``len(onsets) - 1`` and never
        pitched the final note; the last segment now runs to the end of the
        signal, so the returned array has exactly one entry per onset.
        Unvoiced/too-short segments yield 0.
        """
        # Segment boundaries: each onset up to the next; the last runs to EOF.
        boundaries = list(onsets) + [len(y) / self.sample_rate]
        pitches = []
        for i in range(len(onsets)):
            start_sample = int(boundaries[i] * self.sample_rate)
            end_sample = int(boundaries[i + 1] * self.sample_rate)
            segment = y[start_sample:end_sample]

            # Too short for even one analysis frame -> treat as unvoiced.
            if len(segment) < self.frame_size:
                pitches.append(0)
                continue

            # pYIN fundamental-frequency track for this segment.
            f0, _, _ = librosa.pyin(
                segment,
                fmin=librosa.note_to_hz('C2'),
                fmax=librosa.note_to_hz('C7'),
                sr=self.sample_rate
            )
            # Median of the voiced frames is robust to octave glitches;
            # the f0 > 0 mask also drops pYIN's NaN (unvoiced) frames.
            f0 = f0[f0 > 0]
            pitch = np.median(f0) if len(f0) > 0 else 0
            pitches.append(pitch)

        return np.array(pitches)

    def create_midi(self, onsets, pitches, output_path):
        """
        Create a MIDI file from detected notes.

        Args:
            onsets (list): List of onset times in seconds
            pitches (list): List of pitch values in Hz
            output_path (str): Path to save the MIDI file

        Returns:
            str: Path to the generated MIDI file

        Raises:
            ImportError: If pretty_midi is not installed.
            Exception: If building or writing the MIDI file fails.
        """
        if not PRETTY_MIDI_AVAILABLE:
            raise ImportError("pretty_midi is required for MIDI file generation. "
                              "Please install it using: pip install pretty_midi")

        try:
            # Create a PrettyMIDI object with a single piano instrument.
            midi = pretty_midi.PrettyMIDI()
            piano_program = pretty_midi.instrument_name_to_program('Acoustic Grand Piano')
            piano = pretty_midi.Instrument(program=piano_program)

            for i in range(len(onsets)):
                # Skip onsets without a reliable pitch estimate.
                if i >= len(pitches) or pitches[i] <= 0:
                    continue

                # BUG FIX: ``pitches`` holds frequencies in Hz (e.g. 440.0), but
                # pretty_midi.Note.pitch is a MIDI note number (0-127, A4 == 69).
                # The original cast the raw Hz value to the pitch field, producing
                # invalid notes; convert Hz -> note number and clamp to range.
                midi_pitch = int(round(pretty_midi.hz_to_note_number(pitches[i])))
                midi_pitch = min(max(midi_pitch, 0), 127)

                piano.notes.append(pretty_midi.Note(
                    velocity=100,
                    pitch=midi_pitch,
                    start=onsets[i],
                    end=onsets[i] + 0.5  # default duration of 0.5 seconds
                ))

            midi.instruments.append(piano)
            midi.write(output_path)
            return output_path

        except Exception as e:
            raise Exception(f"Failed to create MIDI file: {str(e)}")

    def transcribe(self, input_path, output_midi, output_musicxml=None):
        """Run the full pipeline: load -> onsets -> pitches -> MIDI (-> MusicXML).

        Returns:
            tuple: (midi_path, onsets, pitches)
        """
        # Load audio
        y, _ = self.load_audio(input_path)

        # Detect onsets
        onsets = self.detect_onsets(y)

        # Estimate pitch for each note
        pitches = self.estimate_pitch(y, onsets)

        # Create MIDI file
        midi = self.create_midi(onsets, pitches, output_midi)

        # Convert to MusicXML if requested
        if output_musicxml:
            self.midi_to_musicxml(output_midi, output_musicxml)

        return midi, onsets, pitches

    @staticmethod
    def midi_to_musicxml(midi_path, output_path):
        """Convert a MIDI file on disk to MusicXML format via music21."""
        import music21
        # midiFileToStream requires a parsed MidiFile, not a path.
        mf = music21.midi.MidiFile()
        mf.open(midi_path)
        mf.read()
        mf.close()

        score = music21.midi.translate.midiFileToStream(mf)
        score.write('musicxml', output_path)
|