PatienceIzere committed on
Commit
00fa6cd
·
verified ·
1 Parent(s): fb4cc01

New files uploaded

Browse files
Files changed (6) hide show
  1. app.py +202 -0
  2. hf_README.md +52 -0
  3. hf_requirements.txt +36 -0
  4. hf_transcriber.py +154 -0
  5. recorder.py +119 -0
  6. transcriber.py +152 -0
app.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ from transcriber import AudioTranscriber
4
+ from hf_transcriber import HFTranscriber, transcribe_with_hf
5
+ from recorder import AudioRecorder, list_audio_devices
6
+ import tempfile
7
+ import base64
8
+ import numpy as np
9
+ import time
10
+ from datetime import datetime
11
+ import sounddevice as sd # Add this import
12
+
13
def get_binary_file_downloader_html(bin_file, file_label='File'):
    """Return an HTML anchor that downloads *bin_file* as a base64 data URI.

    Args:
        bin_file: Path of the file to embed.
        file_label: Human-readable label shown in the link text.

    Returns:
        str: An ``<a>`` element whose href carries the file contents inline.
    """
    with open(bin_file, 'rb') as fh:
        payload = fh.read()
    encoded = base64.b64encode(payload).decode()
    download_name = os.path.basename(bin_file)
    return (
        f'<a href="data:application/octet-stream;base64,{encoded}" '
        f'download="{download_name}">Download {file_label}</a>'
    )
20
+
21
def main():
    """Streamlit entry point: record or upload audio, then transcribe it to MIDI/MusicXML."""
    st.set_page_config(page_title="Audio to Sheet Music Transcriber", layout="wide")

    st.title("🎵 Audio to Sheet Music Transcriber")
    st.markdown("### Convert monophonic audio to sheet music")

    # Session state survives Streamlit reruns; keep the recorder and the
    # recording flag there so button clicks don't reset them.
    if 'recorder' not in st.session_state:
        st.session_state.recorder = AudioRecorder()
    if 'recording' not in st.session_state:
        st.session_state.recording = False

    st.sidebar.header("Transcription Settings")
    use_hf = st.sidebar.checkbox("Use Hugging Face Model", value=True)

    if use_hf:
        model_name = st.sidebar.selectbox(
            "Select Model",
            ["microsoft/speecht5_asr", "facebook/wav2vec2-base-960h"],
            index=0
        )

    # Recording or Upload Section
    st.sidebar.header("Audio Input")
    input_method = st.sidebar.radio("Choose input method:", ["Upload Audio File", "Record Live Audio"])

    # BUG FIX: initialize up front — the original left `uploaded_file` unbound
    # (NameError below) when recording was selected but no input device existed.
    uploaded_file = None

    if input_method == "Record Live Audio":
        st.header("🎤 Live Audio Recording")

        # Show available audio devices
        devices = list_audio_devices()
        if not devices:
            st.error("No audio input devices found!")
        else:
            device_names = [f"{i}: {d['name']} (Channels: {d['input_channels']}, Rate: {int(d['default_samplerate'])}Hz)"
                            for i, d in enumerate(devices)]
            selected_device = st.selectbox("Select audio input device:", device_names,
                                           index=next((i for i, d in enumerate(devices) if d['id'] == sd.default.device[0]), 0))

            # BUG FIX: `devices` is a *filtered* list (input-capable devices
            # only), so it must be indexed by list position. The original
            # indexed it with the sounddevice id (`devices[device_id]`), which
            # points at the wrong entry — or past the end — whenever
            # output-only devices exist on the machine.
            selected = devices[device_names.index(selected_device)]
            device_id = selected['id']

            col1, col2 = st.columns(2)

            with col1:
                if st.button("🎤 Start Recording", disabled=st.session_state.recording):
                    sd.default.device = device_id
                    st.session_state.recorder = AudioRecorder(
                        sample_rate=int(selected['default_samplerate']),
                        channels=min(1, selected['input_channels'])
                    )
                    st.session_state.recorder.start_recording()
                    st.session_state.recording = True
                    st.session_state.recording_start = time.time()
                    st.rerun()

            with col2:
                if st.button("⏹ Stop Recording", disabled=not st.session_state.recording):
                    audio_data = st.session_state.recorder.stop_recording()
                    if audio_data is not None:
                        # Save the recording with a timestamped name
                        filename = f"recording_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav"
                        filepath = st.session_state.recorder.save_recording(filename)
                        st.session_state.recorded_file = filepath
                    # Clear the flag even when no audio was captured so the UI
                    # doesn't get stuck in "recording" state.
                    st.session_state.recording = False
                    st.rerun()

            if st.session_state.recording:
                recording_duration = time.time() - st.session_state.recording_start
                st.warning(f"🔴 Recording... {int(recording_duration)} seconds")
                # Only render the live preview once at least one chunk has
                # arrived; st.audio(None) would raise.
                if st.session_state.recorder.recording_data:
                    st.audio(np.concatenate(st.session_state.recorder.recording_data),
                             sample_rate=st.session_state.recorder.sample_rate)

            if hasattr(st.session_state, 'recorded_file') and os.path.exists(st.session_state.recorded_file):
                st.audio(st.session_state.recorded_file)
                uploaded_file = st.session_state.recorded_file
    else:
        # File upload option
        st.header("📁 Upload Audio File")
        uploaded_file = st.file_uploader("Choose an audio file", type=['wav', 'mp3', 'ogg', 'flac'])

    if uploaded_file is not None:
        # Handle both file uploads and recorded files
        if hasattr(uploaded_file, 'read'):  # Streamlit UploadedFile object
            # Persist the upload so librosa/soundfile can open it by path
            with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp_file:
                tmp_file.write(uploaded_file.getvalue())
                tmp_file_path = tmp_file.name
            st.audio(uploaded_file, format=uploaded_file.type)
        else:  # path of a file recorded above
            tmp_file_path = uploaded_file
            st.audio(tmp_file_path, format='audio/wav')

        if st.button("Transcribe Audio"):
            with st.spinner('Transcribing audio... This may take a moment...'):
                try:
                    # Create output file paths
                    base_name = os.path.splitext(os.path.basename(tmp_file_path))[0]
                    output_dir = "outputs"
                    os.makedirs(output_dir, exist_ok=True)

                    midi_path = os.path.join(output_dir, f"{base_name}.mid")
                    musicxml_path = os.path.join(output_dir, f"{base_name}.musicxml")

                    if use_hf:
                        # BUG FIX: pretty_midi was used below without ever
                        # being imported anywhere in this module.
                        import pretty_midi

                        st.info(f"Using Hugging Face model: {model_name}")
                        hf_result = transcribe_with_hf(tmp_file_path, model_name)

                        # Placeholder mapping: one fixed-length note per value
                        # returned by the model.
                        midi = pretty_midi.PrettyMIDI()
                        piano_program = pretty_midi.instrument_name_to_program('Acoustic Grand Piano')
                        piano = pretty_midi.Instrument(program=piano_program)

                        for i, value in enumerate(hf_result['notes']):
                            piano.notes.append(pretty_midi.Note(
                                velocity=100,
                                pitch=min(max(int(value), 0), 127),  # clamp to valid MIDI range
                                start=i * 0.5,  # 0.5 seconds per note
                                end=(i + 1) * 0.5
                            ))

                        midi.instruments.append(piano)
                        midi.write(midi_path)

                        # BUG FIX: the original called
                        # music21.midi.translate.midiFileToStream(midi_path)
                        # with music21 never imported, and handed it a *path*
                        # where a parsed MidiFile object is required. Delegate
                        # to the helper that performs the conversion correctly.
                        AudioTranscriber.midi_to_musicxml(midi_path, musicxml_path)

                        onsets = np.arange(0, len(hf_result['notes']) * 0.5, 0.5)
                        pitches = hf_result['notes']
                    else:
                        # Use the signal-processing transcription pipeline
                        transcriber = AudioTranscriber()
                        midi, onsets, pitches = transcriber.transcribe(
                            tmp_file_path,
                            midi_path,
                            musicxml_path
                        )

                    st.success("Transcription complete!")

                    # Display results
                    col1, col2 = st.columns(2)

                    with col1:
                        st.subheader("MIDI File")
                        st.markdown(get_binary_file_downloader_html(midi_path, "MIDI"), unsafe_allow_html=True)

                    with col2:
                        st.subheader("Sheet Music (MusicXML)")
                        st.markdown(get_binary_file_downloader_html(musicxml_path, "MusicXML"), unsafe_allow_html=True)

                    # Display some statistics
                    st.subheader("Transcription Details")
                    st.write(f"- Detected {len(onsets)} note onsets")
                    # BUG FIX: np.diff of fewer than two onsets is empty and
                    # np.mean would emit NaN plus a runtime warning.
                    if len(onsets) > 1:
                        st.write(f"- Average note duration: {np.mean(np.diff(onsets)):.2f} seconds")

                    # Clean up temporary file
                    os.unlink(tmp_file_path)

                except Exception as e:
                    st.error(f"An error occurred during transcription: {str(e)}")
                    if os.path.exists(tmp_file_path):
                        os.unlink(tmp_file_path)


if __name__ == "__main__":
    # numpy is already imported at module level; the duplicate import here was removed.
    main()
hf_README.md ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Audio to Sheet Music Transcriber
2
+
3
+ A web application that converts monophonic audio recordings into sheet music using machine learning. This app can transcribe audio files (WAV, MP3, OGG, FLAC) or record live audio and convert it to MIDI and MusicXML formats.
4
+
5
+ ## Features
6
+
7
+ - Upload audio files (WAV, MP3, OGG, FLAC) for transcription
8
+ - Record audio directly in the browser
9
+ - Choose between different transcription models
10
+ - Download MIDI and MusicXML files
11
+ - View basic audio visualizations
12
+
13
+ ## How to Use
14
+
15
+ 1. **Input Audio**:
16
+ - Upload an audio file using the file uploader
17
+ - OR record audio directly in the browser
18
+
19
+ 2. **Transcription Settings**:
20
+ - Select your preferred transcription model
21
+ - Adjust audio parameters if needed
22
+
23
+ 3. **Process**:
24
+ - Click "Transcribe" to start the transcription
25
+ - Wait for the processing to complete
26
+
27
+ 4. **Download**:
28
+ - Download the generated MIDI file
29
+ - Download the MusicXML file for sheet music
30
+
31
+ ## Models
32
+
33
+ - **Facebook wav2vec2**: Fast and accurate speech recognition
34
+ - **Microsoft SpeechT5**: High-quality speech recognition with better intonation
35
+
36
+ ## Technical Details
37
+
38
+ This app uses:
39
+ - PyTorch and Transformers for audio processing
40
+ - Librosa for audio feature extraction
41
+ - PrettyMIDI and Music21 for MIDI and MusicXML generation
42
+ - Streamlit for the web interface
43
+
44
+ ## Limitations
45
+
46
+ - Works best with clean, monophonic recordings
47
+ - May have difficulty with fast passages or complex articulations
48
+ - Performance depends on the quality of the input audio
49
+
50
+ ## License
51
+
52
+ MIT License - See the [LICENSE](LICENSE) file for details.
hf_requirements.txt ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies
2
+ numpy>=1.21.0
3
+ librosa>=0.9.2
4
+ pretty_midi>=0.2.10
5
+ music21>=7.3.3
6
+ streamlit>=1.12.0
7
+ scipy>=1.7.0
8
+ matplotlib>=3.4.3
9
+ soundfile>=0.12.1
10
+ pydub>=0.25.1
11
+ scikit-learn>=1.0.2
12
+
13
+ # Hugging Face and deep learning
14
+ transformers>=4.30.0
15
+ datasets>=2.12.0
16
+ torch>=2.0.0,<3.0.0
17
+ torchaudio>=2.0.0
18
+ accelerate>=0.20.0
19
+ sentencepiece # Required for some models
20
+
21
+ # Audio processing
22
+ resampy>=0.4.2
23
+
24
+ # For MIDI processing
25
+ mido>=1.2.10
26
+ python-rtmidi>=1.4.9
27
+
28
+ # For web interface
29
+ gradio>=3.0.0
30
+
31
+ # For file handling
32
+ tqdm>=4.65.0
33
+
34
+ # Remove Windows-specific dependencies
35
+ # pyaudio # Not needed on Hugging Face Spaces
36
+ # sounddevice # Not needed on Hugging Face Spaces
hf_transcriber.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import (
3
+ AutoModelForCTC,
4
+ AutoProcessor,
5
+ pipeline,
6
+ SpeechT5Processor,
7
+ SpeechT5ForSpeechToText,
8
+ SpeechT5HifiGan
9
+ )
10
+ import numpy as np
11
+ import librosa
12
+ import torchaudio
13
+ from typing import Tuple, Optional, List, Union, Dict, Any
14
+ import warnings
15
+ warnings.filterwarnings("ignore", message=".*gradient_checkpointing*.")
16
+
17
class HFTranscriber:
    """Speech-to-notes transcriber backed by a pretrained Hugging Face model.

    Handles two checkpoint families: CTC models loadable through
    ``AutoModelForCTC`` (e.g. wav2vec2) and Microsoft's SpeechT5 ASR model.
    """

    def __init__(self, model_name: str = "facebook/wav2vec2-base-960h"):
        """
        Initialize the transcriber and eagerly load the chosen model.

        Args:
            model_name (str): Hugging Face checkpoint to load. Supported:
                - "facebook/wav2vec2-base-960h" (default)
                - "microsoft/speecht5_asr"
        """
        if torch.cuda.is_available():
            self.device = "cuda"
        else:
            self.device = "cpu"
        self.model_name = model_name
        self.processor = None
        self.model = None
        self.vocoder = None
        # SpeechT5 checkpoints need a different processor/model pair.
        self.is_speecht5 = "speecht5" in model_name.lower()
        self._load_model()

    def _load_model(self):
        """Load the processor and model (plus vocoder for SpeechT5) onto the device."""
        try:
            if not self.is_speecht5:
                # wav2vec2-style CTC checkpoint
                self.processor = AutoProcessor.from_pretrained(self.model_name)
                self.model = AutoModelForCTC.from_pretrained(self.model_name)
                self.model.to(self.device)
                self.model.eval()
            else:
                # SpeechT5 ASR checkpoint + its HiFiGAN vocoder
                self.processor = SpeechT5Processor.from_pretrained(self.model_name)
                self.model = SpeechT5ForSpeechToText.from_pretrained(self.model_name)
                self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
                self.model.to(self.device)
                self.vocoder.to(self.device)
                self.model.eval()
                self.vocoder.eval()
        except Exception as e:
            raise Exception(f"Failed to load model {self.model_name}: {str(e)}")

    def transcribe_audio(self, audio_path: str) -> Tuple[List[int], int]:
        """
        Transcribe an audio file into a list of MIDI note numbers.

        Args:
            audio_path (str): Path to the audio file.

        Returns:
            tuple: (notes, sample_rate) where notes is a list of MIDI note numbers.
        """
        try:
            waveform, sample_rate = self._load_audio(audio_path)

            if self.is_speecht5:
                # SpeechT5 path: encoder-decoder generation
                inputs = self.processor(
                    audio=waveform,
                    sampling_rate=sample_rate,
                    return_tensors="pt"
                ).to(self.device)
                with torch.no_grad():
                    generated_ids = self.model.generate(
                        input_values=inputs.input_values,
                        attention_mask=inputs.attention_mask,
                        max_length=1000
                    )
                text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
            else:
                # wav2vec2 path: greedy CTC decoding
                input_values = self.processor(
                    waveform,
                    sampling_rate=sample_rate,
                    return_tensors="pt",
                    padding=True
                ).input_values.to(self.device)
                with torch.no_grad():
                    logits = self.model(input_values).logits
                predicted_ids = torch.argmax(logits, dim=-1)
                text = self.processor.batch_decode(predicted_ids)[0]

            # Map the decoded text to MIDI notes (placeholder mapping)
            return self._text_to_midi_notes(text), sample_rate

        except Exception as e:
            raise Exception(f"Transcription failed: {str(e)}")

    def _text_to_midi_notes(self, text: str) -> List[int]:
        """Map transcribed text to MIDI notes (placeholder implementation)."""
        # NOTE: real note extraction from the transcription is not implemented;
        # this always emits a C major scale regardless of input.
        return [60, 62, 64, 65, 67, 69, 71, 72]  # C4 to C5

    def _load_audio(self, audio_path: str, target_sr: int = 16000) -> Tuple[np.ndarray, int]:
        """Load *audio_path* as mono float audio resampled to *target_sr* Hz."""
        # librosa handles format decoding, resampling and mono downmix in one call
        return librosa.load(
            audio_path,
            sr=target_sr,
            mono=True,
            duration=None,
            res_type='kaiser_fast'
        )
134
+
135
+
136
def transcribe_with_hf(audio_path: str, model_name: str = "facebook/wav2vec2-base-960h") -> dict:
    """
    Convenience function to transcribe audio using a Hugging Face model.

    BUG FIX: the default used to be "openai/whisper-tiny", which HFTranscriber
    cannot load — it is neither a CTC checkpoint (AutoModelForCTC) nor a
    SpeechT5 checkpoint, the only two families HFTranscriber supports. The
    default now matches HFTranscriber's own default.

    Args:
        audio_path (str): Path to the audio file
        model_name (str): Name of the Hugging Face model to use

    Returns:
        dict: {'notes': list of MIDI note numbers,
               'sample_rate': int,
               'model': the model name used}
    """
    transcriber = HFTranscriber(model_name=model_name)
    notes, sample_rate = transcriber.transcribe_audio(audio_path)

    return {
        'notes': notes,
        'sample_rate': sample_rate,
        'model': model_name
    }
recorder.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sounddevice as sd
2
+ import numpy as np
3
+ import wave
4
+ import os
5
+ from datetime import datetime
6
+ from typing import Optional, Tuple
7
+ import threading
8
+ import queue
9
+
10
class AudioRecorder:
    """Records audio from the default input device on a background thread."""

    def __init__(self, sample_rate: int = 44100, channels: int = 1):
        """
        Initialize the audio recorder.

        Args:
            sample_rate (int): Sample rate in Hz (default: 44100)
            channels (int): Number of audio channels (1 for mono, 2 for stereo)
        """
        self.sample_rate = sample_rate
        self.channels = channels
        self.recording = False           # True while the capture loop should run
        self.audio_queue = queue.Queue() # chunks handed over from the sd callback
        self.recording_thread = None
        self.recording_data = []         # list of float32 ndarray chunks

    def _callback(self, indata, frames, time, status):
        """sounddevice stream callback: enqueue a copy of each captured chunk."""
        if status:
            print(f"Audio input status: {status}")
        # Copy: sounddevice reuses the buffer after the callback returns.
        self.audio_queue.put(indata.copy())

    def start_recording(self):
        """Start recording audio from the default input device.

        Returns:
            bool: False if a recording is already in progress, True otherwise.
        """
        if self.recording:
            return False

        self.recording = True
        self.recording_data = []

        def record():
            with sd.InputStream(
                samplerate=self.sample_rate,
                channels=self.channels,
                callback=self._callback,
                dtype='float32'
            ):
                while self.recording:
                    # BUG FIX: a blocking get() could hang this thread — and
                    # therefore stop_recording()'s join() — forever if the
                    # stream stops delivering chunks. Poll with a timeout so
                    # the `self.recording` flag is re-checked regularly.
                    try:
                        data = self.audio_queue.get(timeout=0.25)
                    except queue.Empty:
                        continue
                    self.recording_data.append(data)

        self.recording_thread = threading.Thread(target=record)
        self.recording_thread.start()
        return True

    def stop_recording(self) -> Optional[np.ndarray]:
        """
        Stop recording and return the recorded audio data.

        Returns:
            Optional[np.ndarray]: Recorded audio data, or None if nothing was recorded.
        """
        if not self.recording:
            return None

        self.recording = False
        if self.recording_thread:
            self.recording_thread.join()

        # BUG FIX: drain chunks that were still queued when the capture loop
        # exited — the original silently discarded the tail of the recording.
        while True:
            try:
                self.recording_data.append(self.audio_queue.get_nowait())
            except queue.Empty:
                break

        if not self.recording_data:
            return None

        return np.concatenate(self.recording_data, axis=0)

    def save_recording(self, filename: str = None, format: str = 'wav') -> str:
        """
        Save the recorded audio to a file under the "recordings" directory.

        Args:
            filename (str, optional): Output filename. If None, generates a timestamped filename.
            format (str): Output format ('wav' or 'npy')

        Returns:
            str: Path to the saved file

        Raises:
            ValueError: If there is no recorded data, or the format is unsupported.
        """
        if not self.recording_data:
            raise ValueError("No recording data available to save")
        audio_data = np.concatenate(self.recording_data, axis=0)

        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"recording_{timestamp}.{format}"

        os.makedirs("recordings", exist_ok=True)
        filepath = os.path.join("recordings", filename)

        if format.lower() == 'wav':
            # BUG FIX: clip to [-1, 1] before scaling — overdriven input
            # (samples beyond full scale) would otherwise wrap around when
            # cast to int16, producing loud clicks instead of clean clipping.
            clipped = np.clip(audio_data, -1.0, 1.0)
            pcm = (clipped * 32767).astype(np.int16)
            with wave.open(filepath, 'wb') as wf:
                wf.setnchannels(self.channels)
                wf.setsampwidth(2)  # 16-bit samples
                wf.setframerate(self.sample_rate)
                wf.writeframes(pcm.tobytes())
        elif format.lower() == 'npy':
            np.save(filepath, audio_data)
        else:
            raise ValueError(f"Unsupported format: {format}")

        return filepath
110
+
111
def list_audio_devices() -> list:
    """Return metadata dicts for every device capable of audio capture."""
    found = []
    for idx, dev in enumerate(sd.query_devices()):
        # Skip output-only devices
        if dev['max_input_channels'] <= 0:
            continue
        found.append({
            'id': idx,
            'name': dev['name'],
            'input_channels': dev['max_input_channels'],
            'default_samplerate': dev['default_samplerate'],
        })
    return found
transcriber.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import numpy as np
3
+ from scipy import signal
4
+
5
+ try:
6
+ import pretty_midi
7
+ PRETTY_MIDI_AVAILABLE = True
8
+ except ImportError:
9
+ PRETTY_MIDI_AVAILABLE = False
10
+ import warnings
11
+ warnings.warn("pretty_midi not available. MIDI file generation will be limited.", ImportWarning)
12
+
13
class AudioTranscriber:
    """Monophonic audio-to-MIDI transcriber: onset detection + pYIN pitch tracking."""

    def __init__(self, sample_rate=22050, frame_size=2048, hop_length=512):
        """
        Initialize the AudioTranscriber with audio processing parameters.

        Args:
            sample_rate (int): Sample rate to load audio (Hz)
            frame_size (int): FFT window size
            hop_length (int): Number of samples between frames
        """
        self.sample_rate = sample_rate
        self.frame_size = frame_size
        self.hop_length = hop_length

    def load_audio(self, file_path):
        """Load audio file as mono at self.sample_rate; return (samples, sample_rate)."""
        y, sr = librosa.load(file_path, sr=self.sample_rate, mono=True)
        return y, sr

    def detect_onsets(self, y):
        """Detect note onset times (in seconds) in the audio signal."""
        # BUG FIX: pass hop_length to onset_strength too. The original relied
        # on librosa's default hop happening to equal self.hop_length; with any
        # non-default hop_length, frames_to_time would convert frame indices
        # computed on a different time grid.
        onset_env = librosa.onset.onset_strength(
            y=y, sr=self.sample_rate, hop_length=self.hop_length
        )
        onset_frames = librosa.onset.onset_detect(
            onset_envelope=onset_env,
            sr=self.sample_rate,
            hop_length=self.hop_length,
            backtrack=True
        )
        return librosa.frames_to_time(onset_frames, sr=self.sample_rate, hop_length=self.hop_length)

    def estimate_pitch(self, y, onsets):
        """Estimate one pitch (Hz) per onset segment.

        Args:
            y (np.ndarray): Audio samples.
            onsets (array-like): Onset times in seconds.

        Returns:
            np.ndarray: One Hz value per onset (0 where no pitch was found).
        """
        pitches = []
        # BUG FIX: the original looped to len(onsets) - 1 and silently dropped
        # the final note; append the end of the signal as the last boundary so
        # every onset gets a pitch estimate.
        boundaries = list(onsets) + [len(y) / self.sample_rate]
        for start_t, end_t in zip(boundaries[:-1], boundaries[1:]):
            start_sample = int(start_t * self.sample_rate)
            end_sample = int(end_t * self.sample_rate)
            segment = y[start_sample:end_sample]

            # Segment too short for a reliable estimate → no pitch
            if len(segment) < self.frame_size:
                pitches.append(0)
                continue

            # pYIN fundamental-frequency tracking over the segment
            f0, _, _ = librosa.pyin(
                segment,
                fmin=librosa.note_to_hz('C2'),
                fmax=librosa.note_to_hz('C7'),
                sr=self.sample_rate
            )
            # Median of the voiced frames (NaN/0 frames are excluded)
            f0 = f0[f0 > 0]
            pitches.append(np.median(f0) if len(f0) > 0 else 0)

        return np.array(pitches)

    def create_midi(self, onsets, pitches, output_path):
        """
        Create a MIDI file from detected notes.

        Args:
            onsets (list): List of onset times in seconds
            pitches (list): List of pitch values in Hz
            output_path (str): Path to save the MIDI file

        Returns:
            str: Path to the generated MIDI file

        Raises:
            ImportError: If pretty_midi is not installed.
            Exception: If writing the MIDI file fails.
        """
        if not PRETTY_MIDI_AVAILABLE:
            raise ImportError("pretty_midi is required for MIDI file generation. "
                              "Please install it using: pip install pretty_midi")

        try:
            midi = pretty_midi.PrettyMIDI()

            piano_program = pretty_midi.instrument_name_to_program('Acoustic Grand Piano')
            piano = pretty_midi.Instrument(program=piano_program)

            for i in range(len(onsets)):
                # Skip onsets with no usable pitch estimate
                if i >= len(pitches) or pitches[i] <= 0:
                    continue

                # BUG FIX: `pitches` holds frequencies in Hz, but
                # pretty_midi.Note.pitch is a MIDI note number (0-127). The
                # original wrote the raw Hz value straight in (e.g. A4 became
                # "note 440", which is invalid). Convert with the standard
                # 12-TET mapping and clamp to the valid MIDI range.
                midi_pitch = int(round(12.0 * np.log2(pitches[i] / 440.0) + 69.0))
                midi_pitch = min(max(midi_pitch, 0), 127)

                piano.notes.append(pretty_midi.Note(
                    velocity=100,
                    pitch=midi_pitch,
                    start=onsets[i],
                    end=onsets[i] + 0.5  # Default duration of 0.5 seconds
                ))

            midi.instruments.append(piano)

            midi.write(output_path)

            return output_path

        except Exception as e:
            raise Exception(f"Failed to create MIDI file: {str(e)}")

    def transcribe(self, input_path, output_midi, output_musicxml=None):
        """Run the full pipeline: load → onsets → pitches → MIDI (→ MusicXML).

        Returns:
            tuple: (midi_path, onsets, pitches)
        """
        y, _ = self.load_audio(input_path)

        onsets = self.detect_onsets(y)

        pitches = self.estimate_pitch(y, onsets)

        midi = self.create_midi(onsets, pitches, output_midi)

        if output_musicxml:
            self.midi_to_musicxml(output_midi, output_musicxml)

        return midi, onsets, pitches

    @staticmethod
    def midi_to_musicxml(midi_path, output_path):
        """Convert a MIDI file to MusicXML format via music21."""
        # Imported lazily so the rest of the class works without music21
        import music21
        mf = music21.midi.MidiFile()
        mf.open(midi_path)
        mf.read()
        mf.close()

        score = music21.midi.translate.midiFileToStream(mf)
        score.write('musicxml', output_path)