| import os |
| import numpy as np |
| import torch |
| import librosa |
| import soundfile as sf |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification |
| import music21 |
|
|
| |
| |
|
|
class EmotionDetector:
    """Coarse text-emotion classifier (Happy / Sad / Neutral).

    Wraps the ``nlptown/bert-base-multilingual-uncased-sentiment`` model,
    which predicts a 1-5 star sentiment distribution, and maps the expected
    star rating onto an emotion label plus an intensity in [0, 1].
    """

    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained(
            "nlptown/bert-base-multilingual-uncased-sentiment")
        self.model = AutoModelForSequenceClassification.from_pretrained(
            "nlptown/bert-base-multilingual-uncased-sentiment")
        # Inference only: disable dropout etc.
        self.model.eval()

    def detect_emotion(self, text):
        """Return ``(emotion_label, intensity)`` for *text*.

        emotion_label is one of "Happy", "Sad", "Neutral"; intensity is a
        float in [0, 1] (0.0 for "Neutral").
        """
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True,
                                padding=True, max_length=512)
        # No gradients needed for inference.
        with torch.no_grad():
            outputs = self.model(**inputs)
        star_probs = outputs.logits[0].softmax(dim=0)
        # Expected star rating, in [1, 5].
        expected_stars = (star_probs * torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0])).sum().item()
        # BUG FIX: the original divided by 5, yielding a [0.2, 1.0] range,
        # which made the "Sad" branch (< 0.3) nearly unreachable. Normalize
        # the 1-5 expectation onto [0, 1] instead.
        sentiment_value = (expected_stars - 1.0) / 4.0

        if sentiment_value > 0.7:
            return "Happy", sentiment_value * 2 - 1
        elif sentiment_value < 0.3:
            return "Sad", 1 - sentiment_value * 2
        else:
            return "Neutral", 0.0
|
|
| |
def synthesize_speech(text, output_path="temp_speech.wav"):
    """Placeholder text-to-speech step.

    BUG FIX: the original printed a success message and returned a path but
    never created the file, so downstream loaders crashed on a missing file.
    Until a real TTS backend is wired in, write a silent mono 16-bit WAV
    whose duration scales with the word count of *text*.

    Returns the path the audio was written to.
    """
    import wave  # stdlib; local import keeps the placeholder self-contained

    sample_rate = 22050
    # Rough pacing placeholder: ~0.1 s per word, at least 1 s total.
    duration_s = max(1.0, 0.1 * len(text.split()))
    silence = np.zeros(int(sample_rate * duration_s), dtype=np.int16)
    with wave.open(output_path, "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)  # 16-bit samples
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(silence.tobytes())

    print(f"Speech synthesized and saved to {output_path}")
    return output_path
|
|
| |
def convert_to_singing(speech_path, emotion="Neutral", emotion_intensity=0.5, output_path="temp_singing.wav"):
    """Placeholder speech-to-singing conversion.

    BUG FIX: the original printed a success message and returned a path but
    never wrote any file, so ``combine_audio`` crashed loading it. Until a
    real conversion model is integrated, copy the speech audio through
    unchanged. *emotion* and *emotion_intensity* are accepted for interface
    stability but are not yet used (TODO: apply pitch/tempo shaping).

    Returns the path the audio was written to.
    """
    import shutil  # stdlib; local import keeps the placeholder self-contained

    shutil.copyfile(speech_path, output_path)
    print(f"Singing audio saved to {output_path}")
    return output_path
|
|
| |
def generate_accompaniment(
    lyrics,
    melody_path,
    output_path="output_accompaniment.mid",
    tempo_value=120,
    key="C",
    time_signature="4/4",
    style="pop",
    notes=None,
):
    """Write a simple single-part MIDI melody and return its path.

    Parameters
    ----------
    lyrics : str
        Currently unused placeholder — TODO: derive phrasing from lyrics.
    melody_path : str
        Currently unused placeholder — TODO: extract melody from this audio.
    output_path : str
        Destination for the rendered MIDI file.
    tempo_value : int
        Metronome mark in BPM.
    key, time_signature : str
        Key signature (e.g. "C") and meter (e.g. "4/4") for the part.
    style : str
        Accompaniment style; style-specific arrangement is not yet
        implemented (both branches are placeholders).
    notes : list[str] | None
        Pitch names to render as quarter notes. Defaults to an ascending
        C-major scale (generalized from the previously hard-coded scale).
    """
    if notes is None:
        notes = ['C4', 'D4', 'E4', 'F4', 'G4', 'A4', 'B4', 'C5']

    score = music21.stream.Score()
    melody_part = music21.stream.Part()

    # Global musical context for the part.
    melody_part.append(music21.tempo.MetronomeMark(number=tempo_value))
    melody_part.append(music21.key.Key(key))
    melody_part.append(music21.meter.TimeSignature(time_signature))

    # Render each pitch as a quarter note.
    for pitch_name in notes:
        note_obj = music21.note.Note(pitch_name)
        note_obj.quarterLength = 1.0
        melody_part.append(note_obj)

    score.append(melody_part)

    # Style-specific harmonization is not implemented yet; kept as explicit
    # placeholders so the intended extension points remain visible.
    if style == "pop":
        pass
    elif style == "classical":
        pass

    score.write('midi', fp=output_path)
    return output_path
|
|
| |
def combine_audio(singing_path, accompaniment_path, output_path="final_output.wav",
                  singing_volume=0.7, accompaniment_volume=0.3):
    """Mix singing and accompaniment audio into one WAV file.

    Parameters
    ----------
    singing_path : str
        Audio file with the vocal track; its sample rate drives the mix.
    accompaniment_path : str
        Audio file to mix underneath. A ``.mid`` path cannot be decoded as
        audio, so it is replaced with silence (TODO: render MIDI to audio).
    output_path : str
        Destination WAV path.
    singing_volume, accompaniment_volume : float
        Linear gains for each track (generalized from the previously
        hard-coded 0.7 / 0.3 mix; defaults preserve the old behavior).

    Returns the output path.
    """
    singing, sr = librosa.load(singing_path, sr=None)

    if accompaniment_path.endswith('.mid'):
        # MIDI is symbolic, not audio — substitute silence so the mix still
        # renders instead of crashing the decoder.
        accompaniment = np.zeros_like(singing)
    else:
        # Resample the accompaniment to the singing sample rate on load.
        accompaniment, _ = librosa.load(accompaniment_path, sr=sr)

    # Length-match the accompaniment: truncate or zero-pad to the vocal.
    if len(accompaniment) > len(singing):
        accompaniment = accompaniment[:len(singing)]
    else:
        accompaniment = np.pad(accompaniment, (0, len(singing) - len(accompaniment)))

    mixed = singing_volume * singing + accompaniment_volume * accompaniment

    # Peak-normalize only when the mix would clip.
    peak = np.max(np.abs(mixed))
    if peak > 1.0:
        mixed = mixed / peak

    sf.write(output_path, mixed, sr)
    return output_path
|
|
| |
def text_to_singing(text, output_path="final_output.wav"):
    """Run the full pipeline: text -> emotion -> speech -> singing -> mix.

    Returns ``(final_audio_path, emotion_label, emotion_intensity)``.
    """
    # Stage 1: classify the text's emotion to steer later stages.
    detector = EmotionDetector()
    emotion, intensity = detector.detect_emotion(text)
    print(f"Detected emotion: {emotion}")
    print(f"Sentiment score: {intensity}")

    # Stage 2: plain speech, then emotion-shaped singing.
    speech_file = synthesize_speech(text)
    singing_file = convert_to_singing(speech_file, emotion, intensity)

    # Stage 3: instrumental backing, then the final mixdown.
    accompaniment_file = generate_accompaniment(text, singing_file)
    mixed_file = combine_audio(singing_file, accompaniment_file, output_path)

    return mixed_file, emotion, intensity