Vaishnavi0404 commited on
Commit
0c413ab
·
verified ·
1 Parent(s): 5667d64

Update music_generator.py

Browse files
Files changed (1) hide show
  1. music_generator.py +135 -280
music_generator.py CHANGED
@@ -1,308 +1,163 @@
 
1
  import numpy as np
 
2
  import librosa
3
  import soundfile as sf
4
- import music21
5
- from music21 import chord, note, stream, instrument
6
- import random
7
- import os
8
- from music21 import tempo
9
 
10
- def generate_accompaniment(emotion, sentiment_score, tempo=100, output_path="accompaniment.wav"):
11
- """
12
- Generate musical accompaniment based on emotion and sentiment
13
-
14
- Args:
15
- emotion (str): Dominant emotion (Happy, Sad, Angry, Fear, Surprise)
16
- sentiment_score (float): Sentiment score from -1 (negative) to 1 (positive)
17
- tempo (int): Tempo in BPM
18
- output_path (str): Path to save the audio file
19
-
20
- Returns:
21
- str: Path to the generated audio file
22
- """
23
- # Choose scales and chords based on emotion
24
- if emotion == "Happy" or sentiment_score > 0.3:
25
- # Major scales for happy emotions
26
- scales = [
27
- ['C4', 'D4', 'E4', 'F4', 'G4', 'A4', 'B4', 'C5'], # C major
28
- ['G3', 'A3', 'B3', 'C4', 'D4', 'E4', 'F#4', 'G4'] # G major
29
- ]
30
- chord_progressions = [
31
- [['C4', 'E4', 'G4'], ['G3', 'B3', 'D4'], ['A3', 'C4', 'E4'], ['F3', 'A3', 'C4']], # I-V-vi-IV
32
- [['C4', 'E4', 'G4'], ['F3', 'A3', 'C4'], ['G3', 'B3', 'D4'], ['C4', 'E4', 'G4']] # I-IV-V-I
33
- ]
34
-
35
- elif emotion == "Sad" or sentiment_score < -0.3:
36
- # Minor scales for sad emotions
37
- scales = [
38
- ['A3', 'B3', 'C4', 'D4', 'E4', 'F4', 'G4', 'A4'], # A minor
39
- ['D3', 'E3', 'F3', 'G3', 'A3', 'Bb3', 'C4', 'D4'] # D minor
40
- ]
41
- chord_progressions = [
42
- [['A3', 'C4', 'E4'], ['F3', 'A3', 'C4'], ['G3', 'B3', 'D4'], ['E3', 'G3', 'B3']], # i-VI-VII-v
43
- [['A3', 'C4', 'E4'], ['D3', 'F3', 'A3'], ['F3', 'A3', 'C4'], ['E3', 'G3', 'B3']] # i-iv-VI-V
44
- ]
45
-
46
- elif emotion == "Angry":
47
- # Diminished and altered scales for angry emotions
48
- scales = [
49
- ['E3', 'F3', 'G#3', 'A3', 'B3', 'C4', 'D#4', 'E4'], # E phrygian dominant
50
- ['B3', 'C4', 'D4', 'E4', 'F4', 'G4', 'A4', 'B4'] # B locrian
51
- ]
52
- chord_progressions = [
53
- [['E3', 'G#3', 'B3'], ['A3', 'C4', 'E4'], ['F3', 'A3', 'C4'], ['B3', 'D4', 'F4']],
54
- [['E3', 'G#3', 'B3'], ['D3', 'F3', 'A3'], ['C3', 'E3', 'G3'], ['B2', 'D3', 'F3']]
55
- ]
56
 
57
- else: # Fear, Surprise, or neutral
58
- # Modal scales for other emotions
59
- scales = [
60
- ['D3', 'E3', 'F3', 'G3', 'A3', 'B3', 'C4', 'D4'], # D dorian
61
- ['E3', 'F#3', 'G3', 'A3', 'B3', 'C#4', 'D4', 'E4'] # E dorian
62
- ]
63
- chord_progressions = [
64
- [['D3', 'F3', 'A3'], ['C3', 'E3', 'G3'], ['Bb2', 'D3', 'F3'], ['A2', 'C3', 'E3']],
65
- [['E3', 'G3', 'B3'], ['A3', 'C4', 'E4'], ['D3', 'F#3', 'A3'], ['G3', 'B3', 'D4']]
66
- ]
67
-
68
- # Choose a scale and chord progression randomly
69
- scale = random.choice(scales)
70
- progression = random.choice(chord_progressions)
71
-
72
- # Create a music21 stream
73
- s = stream.Stream()
74
-
75
- # Set tempo
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  t = music21.tempo.MetronomeMark(number=tempo_value)
77
- s.append(t)
78
 
79
- # Set instrument based on emotion
80
- if emotion == "Happy":
81
- inst = instrument.Piano()
82
- elif emotion == "Sad":
83
- inst = instrument.StringInstrument()
84
- elif emotion == "Angry":
85
- inst = instrument.ElectricGuitar()
86
- else:
87
- inst = instrument.Harp()
88
-
89
- s.append(inst)
90
 
91
- # Generate a simple chord progression
92
- for i in range(4): # 4 measures
93
- for chord_notes in progression:
94
- # Create chord
95
- c = chord.Chord(chord_notes)
96
- c.quarterLength = 1.0 # Quarter note duration
97
- s.append(c)
98
 
99
- # Add a simple melody using the scale
100
- melody_part = stream.Part()
101
- melody_part.append(instrument.Flute())
102
 
103
- # Generate melody based on emotion
104
- for i in range(16): # 16 beats
105
- if random.random() < 0.2: # 20% chance of rest
106
- n = note.Rest()
107
- else:
108
- # Choose note from scale
109
- if emotion == "Happy":
110
- # More skips and jumps for happy
111
- pitch = scale[random.randint(0, len(scale)-1)]
112
- elif emotion == "Sad":
113
- # More stepwise motion for sad
114
- idx = min(max(0, int(np.random.normal(3, 1))), len(scale)-1)
115
- pitch = scale[idx]
116
- else:
117
- pitch = random.choice(scale)
118
-
119
- n = note.Note(pitch)
120
-
121
- # Add articulation based on emotion
122
- if emotion == "Angry":
123
- n.volume.velocity = 100 # Louder
124
- elif emotion == "Sad":
125
- n.volume.velocity = 60 # Softer
126
-
127
- # Set duration
128
- if random.random() < 0.3: # 30% chance of half note
129
- n.quarterLength = 2.0
130
- else:
131
- n.quarterLength = 1.0
132
-
133
  melody_part.append(n)
134
 
135
- s.append(melody_part)
 
136
 
137
- # Export to MIDI
138
- midi_path = "temp_midi.mid"
139
- s.write('midi', fp=midi_path)
 
 
 
 
 
 
140
 
141
- # Convert MIDI to audio using fluidsynth (if available)
142
- try:
143
- from midi2audio import FluidSynth
144
- fs = FluidSynth()
145
- fs.midi_to_audio(midi_path, output_path)
146
-
147
- print(f"Musical accompaniment saved to {output_path}")
148
-
149
- # Clean up midi file
150
- if os.path.exists(midi_path):
151
- os.remove(midi_path)
152
-
153
- return output_path
154
-
155
- except ImportError:
156
- print("FluidSynth not available. Creating synthetic audio instead.")
157
- # Create synthetic audio as fallback
158
- return _generate_synthetic_audio(emotion, sentiment_score, tempo, output_path)
159
-
160
- def _generate_synthetic_audio(emotion, sentiment_score, tempo, output_path):
161
- """Generate synthetic audio using numpy when FluidSynth is not available"""
162
- # Convert tempo to seconds per beat
163
- spb = 60.0 / tempo
164
 
165
- # Sample rate
166
- sr = 22050
167
-
168
- # Duration in seconds (16 beats)
169
- duration = spb * 16
170
-
171
- # Total samples
172
- total_samples = int(sr * duration)
173
-
174
- # Frequencies based on emotion
175
- if emotion == "Happy" or sentiment_score > 0.3:
176
- # Major chord frequencies (C major: C, E, G)
177
- freqs = [261.63, 329.63, 392.00]
178
- elif emotion == "Sad" or sentiment_score < -0.3:
179
- # Minor chord frequencies (A minor: A, C, E)
180
- freqs = [220.00, 261.63, 329.63]
181
- elif emotion == "Angry":
182
- # Diminished chord (B diminished: B, D, F)
183
- freqs = [246.94, 293.66, 349.23]
184
  else:
185
- # Suspended chord (D suspended: D, G, A)
186
- freqs = [293.66, 392.00, 440.00]
187
-
188
- # Generate a simple chord progression
189
- audio = np.zeros(total_samples)
190
-
191
- # Create 4 chords, each for 4 beats
192
- for i in range(4):
193
- chord_start = int(i * 4 * spb * sr)
194
- chord_end = int((i + 1) * 4 * spb * sr)
195
 
196
- # Shift base frequencies based on chord position
197
- if i == 0:
198
- freq_shift = 1.0 # Root
199
- elif i == 1:
200
- freq_shift = 5.0/4.0 # Fourth up
201
- elif i == 2:
202
- freq_shift = 6.0/5.0 # Minor third up from previous
203
  else:
204
- freq_shift = 4.0/3.0 # Perfect fourth up from root
205
-
206
- # Create chord tones
207
- chord_audio = np.zeros(chord_end - chord_start)
208
-
209
- for freq in freqs:
210
- # Create a time array for this segment
211
- t = np.linspace(0, (chord_end - chord_start) / sr, chord_end - chord_start, False)
212
-
213
- # Adjust frequency based on chord position
214
- adjusted_freq = freq * freq_shift
215
-
216
- # Generate sine wave
217
- note = 0.2 * np.sin(2 * np.pi * adjusted_freq * t)
218
-
219
- # Apply envelope
220
- envelope = np.ones_like(t)
221
- attack = int(0.02 * len(t)) # 2% attack
222
- decay = int(0.1 * len(t)) # 10% decay
223
- release = int(0.2 * len(t)) # 20% release
224
-
225
- envelope[:attack] = np.linspace(0, 1, attack)
226
- envelope[-release:] = np.linspace(1, 0, release)
227
-
228
- # Apply envelope
229
- note = note * envelope
230
-
231
- # Add to chord
232
- chord_audio += note
233
-
234
- # Normalize chord
235
- chord_audio = chord_audio / np.max(np.abs(chord_audio))
236
-
237
- # Add to full audio
238
- audio[chord_start:chord_end] += chord_audio
239
 
240
- # Add a simple melody
241
- melody_audio = np.zeros_like(audio)
 
 
242
 
243
- # Generate a few melody notes based on emotion
244
- note_duration = int(0.5 * spb * sr) # Eighth notes
245
 
246
- if emotion == "Happy":
247
- notes_per_measure = 4
248
- elif emotion == "Sad":
249
- notes_per_measure = 2
250
- else:
251
- notes_per_measure = 3
252
 
253
- for measure in range(4):
254
- for note_idx in range(notes_per_measure):
255
- # Calculate start time for this note
256
- start = measure * 4 * spb * sr + note_idx * (4 * spb * sr / notes_per_measure)
257
- start = int(start)
258
-
259
- # Note duration (with a small gap between notes)
260
- end = start + int(0.9 * (4 * spb * sr / notes_per_measure))
261
-
262
- if end > len(melody_audio):
263
- end = len(melody_audio)
264
-
265
- # Choose a frequency based on emotion
266
- if emotion == "Happy":
267
- freq = random.choice([392.00, 440.00, 493.88, 523.25]) # G4, A4, B4, C5
268
- elif emotion == "Sad":
269
- freq = random.choice([329.63, 349.23, 392.00, 440.00]) # E4, F4, G4, A4
270
- else:
271
- freq = random.choice([293.66, 329.63, 349.23, 392.00]) # D4, E4, F4, G4
272
-
273
- # Create time array for this note
274
- t = np.linspace(0, (end - start) / sr, end - start, False)
275
-
276
- # Generate sine wave with some harmonics for richness
277
- note_audio = 0.3 * np.sin(2 * np.pi * freq * t)
278
- note_audio += 0.15 * np.sin(2 * np.pi * freq * 2 * t) # First harmonic
279
- note_audio += 0.05 * np.sin(2 * np.pi * freq * 3 * t) # Second harmonic
280
-
281
- # Apply envelope
282
- envelope = np.ones_like(t)
283
- attack = int(0.1 * len(t))
284
- release = int(0.3 * len(t))
285
-
286
- envelope[:attack] = np.linspace(0, 1, attack)
287
- envelope[-release:] = np.linspace(1, 0, release)
288
-
289
- note_audio = note_audio * envelope
290
-
291
- # Add to melody
292
- melody_audio[start:end] += note_audio
293
 
294
- # Normalize melody
295
- melody_audio = 0.6 * melody_audio / np.max(np.abs(melody_audio))
 
 
 
 
 
 
 
296
 
297
- # Mix chord progression and melody
298
- final_audio = audio + melody_audio
299
 
300
- # Final normalization
301
- final_audio = 0.9 * final_audio / np.max(np.abs(final_audio))
302
 
303
- # Save audio file
304
- sf.write(output_path, final_audio, sr)
305
 
306
- print(f"Synthetic musical accompaniment saved to {output_path}")
 
307
 
308
- return output_path
 
1
+ import os
2
  import numpy as np
3
+ import torch
4
  import librosa
5
  import soundfile as sf
6
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
7
+ import music21 # Added this import to fix the NameError
 
 
 
8
 
9
+ # Assuming you're using other modules for text-to-singing functionality
10
+ # Add any other imports you need here
11
+
12
class EmotionDetector:
    """Detect a coarse emotion label and an intensity score from text.

    Uses the ``nlptown/bert-base-multilingual-uncased-sentiment`` model,
    a 5-class star-rating classifier (1 = very negative ... 5 = very positive),
    and maps the expected rating onto {"Happy", "Sad", "Neutral"}.
    """

    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
        self.model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
        # Inference only: switch off dropout etc. for deterministic scores.
        self.model.eval()

    def detect_emotion(self, text):
        """Return ``(emotion, intensity)`` for *text*.

        Args:
            text (str): Input text; truncated to 512 tokens.

        Returns:
            tuple[str, float]: emotion in {"Happy", "Sad", "Neutral"} and an
            intensity in [0, 1] (0.0 for "Neutral").
        """
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        # We never backpropagate here, so skip building the autograd graph.
        with torch.no_grad():
            outputs = self.model(**inputs)
        sentiment_score = outputs.logits[0].softmax(dim=0)
        # Expected star rating (1..5) normalised into (0, 1].
        sentiment_value = (sentiment_score * torch.tensor([1, 2, 3, 4, 5])).sum().item() / 5.0

        # Map the normalised rating onto a coarse emotion plus intensity.
        if sentiment_value > 0.7:
            return "Happy", sentiment_value * 2 - 1
        elif sentiment_value < 0.3:
            return "Sad", 1 - sentiment_value * 2
        else:
            return "Neutral", 0.0
31
+
32
+ # Function to synthesize speech from text
33
def synthesize_speech(text, output_path="temp_speech.wav"):
    """Placeholder text-to-speech hook.

    Args:
        text (str): Text to speak (currently unused by the stub).
        output_path (str): Where the speech audio would be written.

    Returns:
        str: *output_path*, so callers can chain on the file location.
    """
    # TODO: plug in the real TTS engine here; it must write to output_path.
    print(f"Speech synthesized and saved to {output_path}")
    return output_path
40
+
41
+ # Function to convert speech to singing
42
def convert_to_singing(speech_path, emotion="Neutral", emotion_intensity=0.5, output_path="temp_singing.wav"):
    """Placeholder speech-to-singing conversion hook.

    Args:
        speech_path (str): Path to the input speech audio (unused by the stub).
        emotion (str): Emotion label to colour the singing with.
        emotion_intensity (float): Strength of the emotion, 0..1.
        output_path (str): Where the singing audio would be written.

    Returns:
        str: *output_path*, so callers can chain on the file location.
    """
    # TODO: run the DiffSinger (or equivalent) model here; write to output_path.
    print(f"Singing audio saved to {output_path}")
    return output_path
49
+
50
+ # Function to generate musical accompaniment
51
def generate_accompaniment(
    lyrics,
    melody_path,
    output_path="output_accompaniment.mid",
    tempo_value=120,
    key="C",
    time_signature="4/4",
    style="pop"
):
    """Write a (placeholder) accompaniment MIDI file and return its path.

    Args:
        lyrics (str): Song lyrics (not yet used by the stub).
        melody_path (str): Source melody audio path (not yet used by the stub).
        output_path (str): Destination MIDI file.
        tempo_value (int): Tempo in BPM for the MetronomeMark.
        key (str): Key signature name understood by music21 (e.g. "C").
        time_signature (str): Time signature string (e.g. "4/4").
        style (str): Accompaniment style selector ("pop", "classical", ...).

    Returns:
        str: *output_path* of the written MIDI file.
    """
    score = music21.stream.Score()
    part = music21.stream.Part()

    # Tempo, key and time signature go first so they govern the whole part.
    part.append(music21.tempo.MetronomeMark(number=tempo_value))
    part.append(music21.key.Key(key))
    part.append(music21.meter.TimeSignature(time_signature))

    # Placeholder melody: one octave of the C major scale in quarter notes.
    # TODO: derive real notes from melody_path instead.
    for pitch_name in ('C4', 'D4', 'E4', 'F4', 'G4', 'A4', 'B4', 'C5'):
        scale_note = music21.note.Note(pitch_name)
        scale_note.quarterLength = 1.0
        part.append(scale_note)

    score.append(part)

    # Style-specific accompaniment is not implemented yet.
    if style == "pop":
        pass  # TODO: pop-style accompaniment
    elif style == "classical":
        pass  # TODO: classical-style accompaniment
    # Add more styles as needed.

    score.write('midi', fp=output_path)

    return output_path
105
+
106
+ # Function to combine singing and accompaniment
107
def combine_audio(singing_path, accompaniment_path, output_path="final_output.wav"):
    """Mix the singing track with the accompaniment and write the result.

    Args:
        singing_path (str): Path to the vocal audio file.
        accompaniment_path (str): Path to the accompaniment (audio, or a
            ``.mid`` file for which rendering is not yet implemented).
        output_path (str): Destination WAV file.

    Returns:
        str: *output_path* of the written mix.
    """
    singing, sr = librosa.load(singing_path, sr=None)

    if accompaniment_path.endswith('.mid'):
        # MIDI rendering is not implemented yet; stand in with silence.
        accompaniment = np.zeros_like(singing)
    else:
        # Resample the accompaniment to the singing track's rate.
        accompaniment, _ = librosa.load(accompaniment_path, sr=sr)

    # Align lengths: truncate a longer accompaniment, zero-pad a shorter one.
    gap = len(singing) - len(accompaniment)
    if gap < 0:
        accompaniment = accompaniment[:len(singing)]
    else:
        accompaniment = np.pad(accompaniment, (0, gap))

    # Weighted mix with the voice dominant; tweak these to taste.
    singing_volume = 0.7
    accompaniment_volume = 0.3
    mixed = singing_volume * singing + accompaniment_volume * accompaniment

    # Normalize only when the mix would clip.
    peak = np.max(np.abs(mixed))
    if peak > 1.0:
        mixed = mixed / peak

    sf.write(output_path, mixed, sr)

    return output_path
142
+
143
+ # Main function to process text to singing
144
def text_to_singing(text, output_path="final_output.wav"):
    """End-to-end pipeline: text -> emotion -> speech -> singing -> mixed track.

    Args:
        text (str): Lyrics / input text to sing.
        output_path (str): Destination for the final mixed audio.

    Returns:
        tuple[str, str, float]: (final audio path, detected emotion label,
        emotion intensity in [0, 1]).
    """
    # 1. Emotion analysis drives the singing style.
    detector = EmotionDetector()
    emotion, emotion_intensity = detector.detect_emotion(text)
    print(f"Detected emotion: {emotion}")
    print(f"Sentiment score: {emotion_intensity}")

    # 2. Text -> speech.
    speech_path = synthesize_speech(text)

    # 3. Speech -> singing, coloured by the detected emotion.
    singing_path = convert_to_singing(speech_path, emotion, emotion_intensity)

    # 4. Generate accompaniment from the lyrics and the singing track.
    accompaniment_path = generate_accompaniment(text, singing_path)

    # 5. Mix vocals and accompaniment into the final output.
    final_output = combine_audio(singing_path, accompaniment_path, output_path)

    return final_output, emotion, emotion_intensity