File size: 5,837 Bytes
bdeeafd
 
 
1678182
bdeeafd
 
 
 
 
 
 
 
1678182
bdeeafd
1678182
bdeeafd
1678182
 
 
bdeeafd
 
1678182
bdeeafd
 
 
1678182
 
 
bdeeafd
1678182
bdeeafd
1678182
 
 
 
 
 
 
 
bdeeafd
1678182
bdeeafd
1678182
bdeeafd
1678182
 
 
 
bdeeafd
1678182
bdeeafd
1678182
bdeeafd
1678182
 
 
 
bdeeafd
1678182
 
 
 
 
 
bdeeafd
 
1678182
 
 
 
 
 
bdeeafd
1678182
bdeeafd
1678182
bdeeafd
 
1678182
bdeeafd
1678182
 
 
 
 
 
 
 
bdeeafd
1678182
bdeeafd
1678182
bdeeafd
1678182
 
bdeeafd
1678182
 
 
 
bdeeafd
1678182
bdeeafd
1678182
bdeeafd
1678182
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bdeeafd
1678182
 
 
bdeeafd
 
1678182
 
 
bdeeafd
1678182
bdeeafd
 
1678182
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import torch
from audiocraft.models import MusicGen
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import pyttsx3
import gradio as gr
from tempfile import NamedTemporaryFile
import numpy as np
import scipy.io.wavfile as wavfile
from diffusers import StableDiffusionPipeline
import matplotlib.pyplot as plt
import librosa.display
import librosa
import soundfile as sf
from PIL import Image
import os

# Set device
# Prefer CUDA when available; every model below is placed on this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# MusicGen
# Text-to-music model ("small" checkpoint), used by generate_music().
music_model = MusicGen.get_pretrained("small", device=device)

# GPT-2 for conversation
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

# Stable Diffusion for image generation
# NOTE(review): float16 weights halve memory but may fail on CPU-only hosts — confirm.
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
pipe = pipe.to(device)

# Emotion detection for Text-to-Audio
def get_emotion_tone(text):
    """Classify *text* into a coarse emotional tone.

    Performs case-insensitive keyword matching and returns the first
    matching tone of "happy", "sad" or "angry" (checked in that order),
    falling back to "neutral".
    """
    lowered = text.lower()  # lowercase once instead of once per branch
    # Ordered: earlier entries win, preserving the original if/elif precedence.
    tone_keywords = (
        ("happy", ("happy", "joy", "excited")),
        ("sad", ("sad", "down", "melancholy")),
        ("angry", ("angry", "frustrated")),
    )
    for tone, keywords in tone_keywords:
        if any(word in lowered for word in keywords):
            return tone
    return "neutral"

# Image generation using Stable Diffusion
def generate_image(prompt, style="realistic"):
    """Render *prompt* in the requested *style* via Stable Diffusion.

    Returns the path of a temporary PNG on success, or an error
    message string on failure.
    """
    try:
        result = pipe(f"{style} style {prompt}")
        out_file = NamedTemporaryFile(delete=False, suffix=".png")
        result.images[0].save(out_file.name)
        return out_file.name
    except Exception as e:
        return f"Error generating image: {e}"

# Convert Text to Audio with Emotion
def text_to_audio(text):
    """Speak *text* with pyttsx3, tuning rate and volume to its emotion.

    The tone comes from get_emotion_tone(); returns the path of the
    temporary audio file that was written.
    """
    # Per-emotion speech parameters: (rate in words/min, volume 0.0-1.0).
    # Replaces the original double-nested conditional expressions, which
    # were hard to read and easy to get wrong when editing.
    settings = {
        "neutral": (150, 0.8),
        "happy": (180, 1.0),
        "sad": (100, 0.5),
        "angry": (200, 1.0),
    }
    rate, volume = settings[get_emotion_tone(text)]

    engine = pyttsx3.init()
    engine.setProperty('rate', rate)
    engine.setProperty('volume', volume)

    # NOTE(review): pyttsx3 writes driver-native audio (typically WAV/AIFF)
    # regardless of extension — the ".mp3" suffix may be misleading; confirm.
    temp_file = NamedTemporaryFile(delete=False, suffix=".mp3")
    engine.save_to_file(text, temp_file.name)
    engine.runAndWait()
    return temp_file.name

# Music generation using MusicGen
def generate_music(prompt):
    """Generate a short music clip from *prompt* using MusicGen.

    Returns the path of a temporary WAV file on success, or an error
    message string on failure.
    """
    try:
        generated = music_model.generate([prompt])
        samples = generated.cpu().numpy()
        out_file = NamedTemporaryFile(delete=False, suffix=".wav")
        # First batch item, first channel — assumes (batch, channel, time)
        # layout from MusicGen; confirm against the audiocraft API.
        wavfile.write(out_file.name, music_model.sample_rate, samples[0, 0])
        return out_file.name
    except Exception as e:
        return f"Error generating music: {e}"

# Spectrogram generation from audio
def generate_spectrogram(audio_path):
    """Save a mel-frequency spectrogram PNG for the audio at *audio_path*.

    Returns the PNG path on success, or an error message string on failure.
    """
    try:
        y, sr = librosa.load(audio_path, sr=None)
        # librosa >= 0.10 makes these arguments keyword-only: passing `y`
        # positionally raises a TypeError, so the original call failed on
        # current librosa releases.
        S = librosa.feature.melspectrogram(y=y, sr=sr)
        S_dB = librosa.power_to_db(S, ref=np.max)

        plt.figure(figsize=(10, 4))
        librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel', cmap='coolwarm')
        plt.colorbar(format='%+2.0f dB')
        plt.title('Mel-frequency spectrogram')
        temp_image = NamedTemporaryFile(delete=False, suffix=".png")
        plt.savefig(temp_image.name)
        plt.close()  # release the figure so repeated calls don't leak memory
        return temp_image.name
    except Exception as e:
        return f"Error generating spectrogram: {e}"

# Chat with AI (GPT-2)
def chat_with_ai(user_input):
    """Generate a GPT-2 continuation of *user_input*.

    Returns the decoded response text, or an error message string if
    generation fails.
    """
    try:
        input_ids = tokenizer.encode(user_input, return_tensors="pt").to(device)
        generated = gpt2_model.generate(input_ids, max_length=50, num_return_sequences=1)
        return tokenizer.decode(generated[0], skip_special_tokens=True)
    except Exception as e:
        return f"Error in chat generation: {e}"

# Simulate Video Generation using a Sequence of Images
def generate_video(prompt):
    """Simulate video generation by rendering 5 frames and joining them as a GIF.

    Returns the GIF path on success, or an error message string on failure
    (consistent with the other generators in this module).
    """
    try:
        frames = []
        for i in range(5):  # one Stable Diffusion image per frame
            frame_path = generate_image(f"{prompt} frame {i + 1}")
            # generate_image reports failure by returning an error *string*
            # rather than raising; the original code passed that string to
            # Image.open, producing a confusing FileNotFoundError.
            if not os.path.exists(frame_path):
                return frame_path
            frames.append(Image.open(frame_path))

        temp_video = NamedTemporaryFile(delete=False, suffix=".gif")
        # 500 ms per frame, looping forever (loop=0).
        frames[0].save(temp_video.name, save_all=True, append_images=frames[1:], duration=500, loop=0)
        return temp_video.name
    except Exception as e:
        return f"Error generating video: {e}"

# Main interface logic
def main_interface(input_text, task_type, style):
    """Dispatch a Gradio request to the generator matching *task_type*.

    Always returns a 3-tuple (status_text, audio_path_or_None,
    image_path_or_None) so it matches the three declared Gradio outputs.
    """
    try:
        if task_type == "Conversation":
            response = chat_with_ai(input_text)
            image_path = generate_image(f"conversation about {input_text}", style)
            return response, None, image_path

        if task_type == "Music":
            audio_path = generate_music(input_text)
            spectrogram_path = generate_spectrogram(audio_path)
            return "Music Generated", audio_path, spectrogram_path

        if task_type == "Text to Audio":
            audio_path = text_to_audio(input_text)
            image_path = generate_image(f"text-to-audio conversion for {input_text}", style)
            return "Audio Generated", audio_path, image_path

        if task_type == "Video Generation":
            video_path = generate_video(input_text)
            audio_path = generate_music(input_text)
            return "Video Generated", audio_path, video_path

        # Previously an unrecognized task fell through and returned a bare
        # None, which breaks Gradio's 3-output unpacking.
        return f"Unknown task type: {task_type}", None, None
    except Exception as e:
        return f"Error: {e}", None, None

# Gradio interface setup
interface = gr.Interface(
    fn=main_interface,
    inputs=[
        # Free-form prompt, task selector, and image style preset —
        # passed to main_interface in this order.
        gr.Textbox(label="Enter Text or Prompt"),
        gr.Radio(["Conversation", "Music", "Text to Audio", "Video Generation"], label="Select Task"),
        gr.Dropdown(["realistic", "abstract", "comic"], label="Select Style"),
    ],
    outputs=[
        # Mirrors main_interface's (text, audio_path, image_path) return tuple.
        gr.Textbox(label="Generated Output"),
        gr.Audio(label="Generated Audio", type="filepath"),
        gr.Image(label="Generated Image", type="filepath"),
    ],
    live=False,  # run only on explicit submit, not on every keystroke
)

interface.launch()