File size: 5,838 Bytes
0c413ab
8d77db3
0c413ab
8d77db3
 
0c413ab
 
8d77db3
0c413ab
 
 
 
 
 
 
 
 
 
 
 
 
 
8d77db3
0c413ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1f261c3
0c413ab
8d77db3
0c413ab
 
 
8d77db3
0c413ab
 
 
8d77db3
0c413ab
 
8d77db3
0c413ab
 
 
 
 
8d77db3
 
0c413ab
 
8d77db3
0c413ab
 
 
 
 
 
 
 
 
8d77db3
0c413ab
 
8d77db3
0c413ab
 
 
 
 
 
 
 
 
 
 
 
8d77db3
0c413ab
 
8d77db3
0c413ab
 
 
8d77db3
0c413ab
 
8d77db3
0c413ab
 
 
 
8d77db3
0c413ab
8d77db3
0c413ab
 
 
8d77db3
0c413ab
 
8d77db3
0c413ab
 
 
 
 
 
 
 
 
8d77db3
0c413ab
 
8d77db3
0c413ab
 
8d77db3
0c413ab
 
8d77db3
0c413ab
 
8d77db3
0c413ab
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import os
import numpy as np
import torch
import librosa
import soundfile as sf
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import music21  # Added this import to fix the NameError

# Assuming you're using other modules for text-to-singing functionality
# Add any other imports you need here

class EmotionDetector:
    """Detect a coarse emotion (Happy / Sad / Neutral) in text.

    Uses the pretrained ``nlptown/bert-base-multilingual-uncased-sentiment``
    model, whose head scores a 1-5 star sentiment rating.
    """

    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
        self.model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
        # Inference-only usage: disable dropout etc. so scores are deterministic.
        self.model.eval()

    def detect_emotion(self, text):
        """Return ``(emotion_label, intensity)`` for *text*.

        The model's 5-class logits are softmaxed and reduced to an expected
        star rating (1-5), normalized by 5 to a scalar in [0.2, 1.0], then
        bucketed into Happy / Sad / Neutral.
        """
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        # No gradients are needed for inference; avoid building the autograd graph.
        with torch.no_grad():
            outputs = self.model(**inputs)
        sentiment_score = outputs.logits[0].softmax(dim=0)
        # Softmax-weighted average of star ratings, normalized to [0.2, 1.0].
        sentiment_value = (sentiment_score * torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0])).sum().item() / 5.0

        # Map the normalized sentiment to an emotion label plus an intensity
        # value (0 for Neutral; scaled distance from the threshold otherwise).
        if sentiment_value > 0.7:
            return "Happy", sentiment_value * 2 - 1
        elif sentiment_value < 0.3:
            return "Sad", 1 - sentiment_value * 2
        else:
            return "Neutral", 0.0

# Function to synthesize speech from text
def synthesize_speech(text, output_path="temp_speech.wav"):
    """Placeholder TTS stage: report and return the target speech file path.

    A real implementation should render *text* as audio at *output_path*.
    """
    # Placeholder for your text-to-speech implementation
    # This should create a speech file at output_path
    # Your actual implementation here
    print(f"Speech synthesized and saved to {output_path}")
    return output_path

# Function to convert speech to singing
def convert_to_singing(speech_path, emotion="Neutral", emotion_intensity=0.5, output_path="temp_singing.wav"):
    """Placeholder speech-to-singing stage; returns the target singing path.

    A real implementation should run a DiffSinger-style model over the audio
    at *speech_path*, conditioned on *emotion* and *emotion_intensity*.
    """
    # Placeholder for speech-to-singing conversion
    # This should implement your DiffSinger model to convert speech to singing
    # Your actual implementation here
    print(f"Singing audio saved to {output_path}")
    return output_path

# Function to generate musical accompaniment
def generate_accompaniment(
    lyrics, 
    melody_path, 
    output_path="output_accompaniment.mid", 
    tempo_value=120,
    key="C",
    time_signature="4/4",
    style="pop"
):
    """Build a MIDI accompaniment skeleton and write it to *output_path*.

    Emits a single part carrying tempo, key, and time-signature marks,
    followed by a placeholder ascending C-major scale of quarter notes.
    Style-specific accompaniment and melody extraction from *melody_path*
    are left as stubs. Returns *output_path*.
    """
    score = music21.stream.Score()
    melody_part = music21.stream.Part()

    # Tempo, key, and meter marks must precede the notes in the part.
    melody_part.append(music21.tempo.MetronomeMark(number=tempo_value))
    melody_part.append(music21.key.Key(key))
    melody_part.append(music21.meter.TimeSignature(time_signature))

    # Placeholder melody — a real implementation would derive the notes
    # from melody_path instead of this fixed scale.
    for pitch_name in ('C4', 'D4', 'E4', 'F4', 'G4', 'A4', 'B4', 'C5'):
        scale_note = music21.note.Note(pitch_name)
        scale_note.quarterLength = 1.0
        melody_part.append(scale_note)

    score.append(melody_part)

    # Style-specific accompaniment generation (not yet implemented).
    if style == "pop":
        # Add pop-style accompaniment
        pass
    elif style == "classical":
        # Add classical-style accompaniment
        pass
    # Add more styles as needed

    # Serialize the assembled score as a MIDI file.
    score.write('midi', fp=output_path)

    return output_path

# Function to combine singing and accompaniment
def combine_audio(singing_path, accompaniment_path, output_path="final_output.wav"):
    """Mix the singing track with its accompaniment into a single WAV file.

    MIDI accompaniments are currently stubbed out as silence; audio
    accompaniments are loaded at the singing sample rate and length-matched
    by truncation or zero-padding. Returns *output_path*.
    """
    singing, sr = librosa.load(singing_path, sr=None)

    if accompaniment_path.endswith('.mid'):
        # MIDI-to-audio rendering is not implemented yet; substitute silence
        # of the same shape so the mix below still works.
        accompaniment = np.zeros_like(singing)
    else:
        # Load at the singing sample rate so both signals line up.
        accompaniment, sr_acc = librosa.load(accompaniment_path, sr=sr)

        # Force both signals to the same number of samples.
        target_len = len(singing)
        if len(accompaniment) > target_len:
            accompaniment = accompaniment[:target_len]
        else:
            accompaniment = np.pad(accompaniment, (0, max(0, target_len - len(accompaniment))))

    # Weighted mix: vocals in the foreground, accompaniment behind them.
    singing_volume = 0.7
    accompaniment_volume = 0.3
    mixed = singing_volume * singing + accompaniment_volume * accompaniment

    # Peak-normalize only when the mix would clip.
    if np.max(np.abs(mixed)) > 1.0:
        mixed = mixed / np.max(np.abs(mixed))

    sf.write(output_path, mixed, sr)

    return output_path

# Main function to process text to singing
def text_to_singing(text, output_path="final_output.wav"):
    """End-to-end pipeline: text -> emotion -> speech -> singing -> mix.

    Returns ``(final_audio_path, emotion_label, emotion_intensity)``.
    """
    # Stage 1: infer the emotional tone of the lyrics.
    detector = EmotionDetector()
    emotion, emotion_intensity = detector.detect_emotion(text)
    print(f"Detected emotion: {emotion}")
    print(f"Sentiment score: {emotion_intensity}")

    # Stage 2: render the text as plain speech.
    speech_path = synthesize_speech(text)

    # Stage 3: transform the speech into emotive singing.
    singing_path = convert_to_singing(speech_path, emotion, emotion_intensity)

    # Stage 4: create a backing track for the vocal line.
    accompaniment_path = generate_accompaniment(text, singing_path)

    # Stage 5: mix vocals and accompaniment into the final output file.
    final_output = combine_audio(singing_path, accompaniment_path, output_path)

    return final_output, emotion, emotion_intensity