# Text2Sing-DiffSinger / music_generator.py
# NOTE: the lines below are Hugging Face Space page metadata that leaked into
# the source file; kept as a comment so the module remains importable.
# Original metadata: "Vaishnavi0404's picture / Update music_generator.py / 0c413ab verified"
import os
import numpy as np
import torch
import librosa
import soundfile as sf
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import music21 # Added this import to fix the NameError
# Assuming you're using other modules for text-to-singing functionality
# Add any other imports you need here
class EmotionDetector:
    """Maps free text to a coarse emotion label using a sentiment model.

    Uses the ``nlptown`` multilingual 1-5 star sentiment classifier and
    buckets its expected-star score into Happy / Sad / Neutral.
    """

    _MODEL_NAME = "nlptown/bert-base-multilingual-uncased-sentiment"

    def __init__(self):
        # Downloads the model/tokenizer on first use (network required).
        self.tokenizer = AutoTokenizer.from_pretrained(self._MODEL_NAME)
        self.model = AutoModelForSequenceClassification.from_pretrained(self._MODEL_NAME)

    def detect_emotion(self, text):
        """Return ``(emotion_label, intensity)`` for *text*.

        The classifier predicts a 1-5 star rating; the probability-weighted
        expected rating is normalised by 5 into [0.2, 1.0] and thresholded:
        > 0.7 -> "Happy", < 0.3 -> "Sad", otherwise "Neutral" (intensity 0.0).
        """
        encoded = self.tokenizer(
            text, return_tensors="pt", truncation=True, padding=True, max_length=512
        )
        logits = self.model(**encoded).logits[0]
        star_probs = logits.softmax(dim=0)
        # Expected star rating, scaled to a [0.2, 1.0] sentiment score.
        score = (star_probs * torch.tensor([1, 2, 3, 4, 5])).sum().item() / 5.0

        if score > 0.7:
            return "Happy", score * 2 - 1
        if score < 0.3:
            return "Sad", 1 - score * 2
        return "Neutral", 0.0
# Function to synthesize speech from text
def synthesize_speech(text, output_path="temp_speech.wav"):
    """Synthesize speech for *text* and save it to *output_path*.

    Placeholder implementation. The original version printed a success
    message but never created any file, so every downstream step
    (``convert_to_singing`` and ultimately ``librosa.load`` in
    ``combine_audio``) failed with ``FileNotFoundError``. Until a real TTS
    engine is wired in, write one second of 16-bit mono silence as a valid
    WAV so the rest of the pipeline can run end to end.

    Args:
        text: Text to speak (unused by the placeholder).
        output_path: Destination WAV file path.

    Returns:
        ``output_path`` — the path of the WAV file that was written.
    """
    import wave

    sample_rate = 22050  # common speech/audio rate; matches typical TTS output
    with wave.open(output_path, "wb") as wav_file:
        wav_file.setnchannels(1)          # mono
        wav_file.setsampwidth(2)          # 16-bit PCM
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(b"\x00\x00" * sample_rate)  # 1 second of silence
    print(f"Speech synthesized and saved to {output_path}")
    return output_path
# Function to convert speech to singing
def convert_to_singing(speech_path, emotion="Neutral", emotion_intensity=0.5, output_path="temp_singing.wav"):
    """Convert speech audio into singing audio, saved to *output_path*.

    Placeholder implementation for the DiffSinger conversion. The original
    version printed a success message but never wrote any file, so
    ``combine_audio`` crashed when it tried to load the result. Until the
    DiffSinger model is integrated, pass the speech audio through unchanged
    by copying it to *output_path* so the pipeline remains runnable.

    Args:
        speech_path: Path to the input speech audio file.
        emotion: Emotion label to apply (unused by the placeholder).
        emotion_intensity: Emotion strength in [0, 1] (unused by the placeholder).
        output_path: Destination path for the "singing" audio.

    Returns:
        ``output_path`` — the path of the file that was written.
    """
    import shutil

    # TODO: replace this pass-through with the actual DiffSinger inference,
    # conditioned on `emotion` and `emotion_intensity`.
    shutil.copyfile(speech_path, output_path)
    print(f"Singing audio saved to {output_path}")
    return output_path
# Function to generate musical accompaniment
def generate_accompaniment(
    lyrics,
    melody_path,
    output_path="output_accompaniment.mid",
    tempo_value=120,
    key="C",
    time_signature="4/4",
    style="pop",
):
    """Write a placeholder MIDI accompaniment and return its path.

    Builds a one-part music21 score carrying the requested tempo, key and
    time signature, fills it with a C major scale as stand-in melody
    material, and renders it to a MIDI file. Style-specific accompaniment
    generation is not implemented yet; `lyrics` and `melody_path` are
    currently unused.

    Args:
        lyrics: Song lyrics (reserved for future note generation).
        melody_path: Path to melody audio (reserved for future use).
        output_path: Destination MIDI file path.
        tempo_value: Tempo in BPM for the metronome mark.
        key: Key signature name understood by ``music21.key.Key``.
        time_signature: Meter string, e.g. ``"4/4"``.
        style: Accompaniment style selector ("pop", "classical", ...).

    Returns:
        ``output_path`` — the path of the written MIDI file.
    """
    score = music21.stream.Score()
    melody = music21.stream.Part()

    # Part header: tempo mark, then key signature, then time signature.
    for header_obj in (
        music21.tempo.MetronomeMark(number=tempo_value),
        music21.key.Key(key),
        music21.meter.TimeSignature(time_signature),
    ):
        melody.append(header_obj)

    # Placeholder content: an ascending C major scale in quarter notes.
    # TODO: derive actual notes from `melody_path` / `lyrics`.
    for pitch_name in ('C4', 'D4', 'E4', 'F4', 'G4', 'A4', 'B4', 'C5'):
        scale_note = music21.note.Note(pitch_name)
        scale_note.quarterLength = 1.0
        melody.append(scale_note)

    score.append(melody)

    # Style-specific accompaniment is not implemented yet.
    if style == "pop":
        pass  # TODO: pop-style accompaniment
    elif style == "classical":
        pass  # TODO: classical-style accompaniment
    # Add more styles as needed.

    score.write('midi', fp=output_path)
    return output_path
# Function to combine singing and accompaniment
def combine_audio(singing_path, accompaniment_path, output_path="final_output.wav"):
    """Mix the singing track with its accompaniment into one WAV file.

    The accompaniment is length-matched to the vocals (truncated or
    zero-padded), mixed at fixed 70/30 levels, peak-normalised if the mix
    would clip, and written to *output_path* at the vocals' sample rate.
    A ``.mid`` accompaniment is currently replaced by silence because MIDI
    rendering is not implemented yet.

    Args:
        singing_path: Path to the singing audio file.
        accompaniment_path: Path to the accompaniment (audio or ``.mid``).
        output_path: Destination WAV file path.

    Returns:
        ``output_path`` — the path of the mixed file that was written.
    """
    vocals, sample_rate = librosa.load(singing_path, sr=None)

    if accompaniment_path.endswith('.mid'):
        # MIDI-to-audio rendering is not implemented; use silence for now.
        backing = np.zeros_like(vocals)
    else:
        # Resample the accompaniment to the vocals' rate on load.
        backing, _ = librosa.load(accompaniment_path, sr=sample_rate)

    # Force both tracks to the vocals' length: truncate or zero-pad.
    if len(backing) > len(vocals):
        backing = backing[:len(vocals)]
    else:
        backing = np.pad(backing, (0, max(0, len(vocals) - len(backing))))

    # Fixed mix levels: vocals dominate the backing track.
    mixed = 0.7 * vocals + 0.3 * backing

    # Peak-normalise only when the mix would clip.
    peak = np.max(np.abs(mixed))
    if peak > 1.0:
        mixed = mixed / peak

    sf.write(output_path, mixed, sample_rate)
    return output_path
# Main function to process text to singing
def text_to_singing(text, output_path="final_output.wav"):
    """Run the full text-to-singing pipeline on *text*.

    Pipeline stages: emotion detection -> speech synthesis -> singing
    conversion -> accompaniment generation -> audio mixing.

    Args:
        text: Input lyrics/text to sing.
        output_path: Destination path for the final mixed audio.

    Returns:
        Tuple ``(final_audio_path, emotion_label, emotion_intensity)``.
    """
    detector = EmotionDetector()
    emotion, intensity = detector.detect_emotion(text)
    print(f"Detected emotion: {emotion}")
    print(f"Sentiment score: {intensity}")

    speech_file = synthesize_speech(text)
    singing_file = convert_to_singing(speech_file, emotion, intensity)
    backing_file = generate_accompaniment(text, singing_file)
    final_path = combine_audio(singing_file, backing_file, output_path)

    return final_path, emotion, intensity