File size: 5,837 Bytes
bdeeafd
 
 
1678182
bdeeafd
 
 
 
 
 
 
 
1678182
bdeeafd
1678182
bdeeafd
1678182
 
 
bdeeafd
 
1678182
bdeeafd
 
 
1678182
 
 
bdeeafd
1678182
bdeeafd
1678182
 
 
 
 
 
 
 
bdeeafd
1678182
bdeeafd
1678182
bdeeafd
1678182
 
 
 
bdeeafd
1678182
bdeeafd
1678182
bdeeafd
1678182
 
 
 
bdeeafd
1678182
 
 
 
 
 
bdeeafd
 
1678182
 
 
 
 
 
bdeeafd
1678182
bdeeafd
1678182
bdeeafd
 
1678182
bdeeafd
1678182
 
 
 
 
 
 
 
bdeeafd
1678182
bdeeafd
1678182
bdeeafd
1678182
 
bdeeafd
1678182
 
 
 
bdeeafd
1678182
bdeeafd
1678182
bdeeafd
1678182
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bdeeafd
1678182
 
 
bdeeafd
 
1678182
 
 
bdeeafd
1678182
bdeeafd
 
1678182
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import torch
from audiocraft.models import MusicGen
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import pyttsx3
import gradio as gr
from tempfile import NamedTemporaryFile
import numpy as np
import scipy.io.wavfile as wavfile
from diffusers import StableDiffusionPipeline
import matplotlib.pyplot as plt
import librosa.display
import librosa
import soundfile as sf
from PIL import Image
import os

# Set device
# Prefer CUDA when available; every model below is placed on this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# MusicGen
# Text-to-music model ("small" checkpoint), used by generate_music().
music_model = MusicGen.get_pretrained("small", device=device)

# GPT-2 for conversation
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

# Stable Diffusion for image generation
# NOTE(review): float16 weights halve memory but may fail on CPU-only hosts — confirm.
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
pipe = pipe.to(device)

# Emotion detection for Text-to-Audio
def get_emotion_tone(text):
    """Classify *text* into a coarse emotional tone.

    Performs case-insensitive keyword matching and returns the first
    matching tone of "happy", "sad" or "angry" (checked in that order),
    falling back to "neutral".
    """
    lowered = text.lower()  # lowercase once instead of once per branch
    # Ordered: earlier entries win, preserving the original if/elif precedence.
    tone_keywords = (
        ("happy", ("happy", "joy", "excited")),
        ("sad", ("sad", "down", "melancholy")),
        ("angry", ("angry", "frustrated")),
    )
    for tone, keywords in tone_keywords:
        if any(word in lowered for word in keywords):
            return tone
    return "neutral"

# Image generation using Stable Diffusion
def generate_image(prompt, style="realistic"):
    """Render *prompt* in the requested *style* via Stable Diffusion.

    Returns the path of a temporary PNG on success, or an error
    message string on failure.
    """
    try:
        result = pipe(f"{style} style {prompt}")
        out_file = NamedTemporaryFile(delete=False, suffix=".png")
        result.images[0].save(out_file.name)
        return out_file.name
    except Exception as e:
        return f"Error generating image: {e}"

# Convert Text to Audio with Emotion
def text_to_audio(text):
    """Speak *text* with pyttsx3, tuning rate and volume to its emotion.

    The tone comes from get_emotion_tone(); returns the path of the
    temporary audio file that was written.
    """
    # Per-emotion speech parameters: (rate in words/min, volume 0.0-1.0).
    # Replaces the original double-nested conditional expressions, which
    # were hard to read and easy to get wrong when editing.
    settings = {
        "neutral": (150, 0.8),
        "happy": (180, 1.0),
        "sad": (100, 0.5),
        "angry": (200, 1.0),
    }
    rate, volume = settings[get_emotion_tone(text)]

    engine = pyttsx3.init()
    engine.setProperty('rate', rate)
    engine.setProperty('volume', volume)

    # NOTE(review): pyttsx3 writes driver-native audio (typically WAV/AIFF)
    # regardless of extension — the ".mp3" suffix may be misleading; confirm.
    temp_file = NamedTemporaryFile(delete=False, suffix=".mp3")
    engine.save_to_file(text, temp_file.name)
    engine.runAndWait()
    return temp_file.name

# Music generation using MusicGen
def generate_music(prompt):
    """Generate a short music clip from *prompt* using MusicGen.

    Returns the path of a temporary WAV file on success, or an error
    message string on failure.
    """
    try:
        generated = music_model.generate([prompt])
        samples = generated.cpu().numpy()
        out_file = NamedTemporaryFile(delete=False, suffix=".wav")
        # First batch item, first channel — assumes (batch, channel, time)
        # layout from MusicGen; confirm against the audiocraft API.
        wavfile.write(out_file.name, music_model.sample_rate, samples[0, 0])
        return out_file.name
    except Exception as e:
        return f"Error generating music: {e}"

# Spectrogram generation from audio
def generate_spectrogram(audio_path):
    """Save a mel-frequency spectrogram PNG for the audio at *audio_path*.

    Returns the PNG path on success, or an error message string on failure.
    """
    try:
        y, sr = librosa.load(audio_path, sr=None)
        # librosa >= 0.10 makes these arguments keyword-only: passing `y`
        # positionally raises a TypeError, so the original call failed on
        # current librosa releases.
        S = librosa.feature.melspectrogram(y=y, sr=sr)
        S_dB = librosa.power_to_db(S, ref=np.max)

        plt.figure(figsize=(10, 4))
        librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel', cmap='coolwarm')
        plt.colorbar(format='%+2.0f dB')
        plt.title('Mel-frequency spectrogram')
        temp_image = NamedTemporaryFile(delete=False, suffix=".png")
        plt.savefig(temp_image.name)
        plt.close()  # release the figure so repeated calls don't leak memory
        return temp_image.name
    except Exception as e:
        return f"Error generating spectrogram: {e}"

# Chat with AI (GPT-2)
def chat_with_ai(user_input):
    """Generate a GPT-2 continuation of *user_input*.

    Returns the decoded response text, or an error message string if
    generation fails.
    """
    try:
        input_ids = tokenizer.encode(user_input, return_tensors="pt").to(device)
        generated = gpt2_model.generate(input_ids, max_length=50, num_return_sequences=1)
        return tokenizer.decode(generated[0], skip_special_tokens=True)
    except Exception as e:
        return f"Error in chat generation: {e}"

# Simulate Video Generation using a Sequence of Images
def generate_video(prompt):
    """Simulate video generation by rendering 5 frames and joining them as a GIF.

    Returns the GIF path on success, or an error message string on failure
    (consistent with the other generators in this module).
    """
    try:
        frames = []
        for i in range(5):  # one Stable Diffusion image per frame
            frame_path = generate_image(f"{prompt} frame {i + 1}")
            # generate_image reports failure by returning an error *string*
            # rather than raising; the original code passed that string to
            # Image.open, producing a confusing FileNotFoundError.
            if not os.path.exists(frame_path):
                return frame_path
            frames.append(Image.open(frame_path))

        temp_video = NamedTemporaryFile(delete=False, suffix=".gif")
        # 500 ms per frame, looping forever (loop=0).
        frames[0].save(temp_video.name, save_all=True, append_images=frames[1:], duration=500, loop=0)
        return temp_video.name
    except Exception as e:
        return f"Error generating video: {e}"

# Main interface logic
def main_interface(input_text, task_type, style):
    """Dispatch a Gradio request to the generator matching *task_type*.

    Always returns a 3-tuple (status_text, audio_path_or_None,
    image_path_or_None) so it matches the three declared Gradio outputs.
    """
    try:
        if task_type == "Conversation":
            response = chat_with_ai(input_text)
            image_path = generate_image(f"conversation about {input_text}", style)
            return response, None, image_path

        if task_type == "Music":
            audio_path = generate_music(input_text)
            spectrogram_path = generate_spectrogram(audio_path)
            return "Music Generated", audio_path, spectrogram_path

        if task_type == "Text to Audio":
            audio_path = text_to_audio(input_text)
            image_path = generate_image(f"text-to-audio conversion for {input_text}", style)
            return "Audio Generated", audio_path, image_path

        if task_type == "Video Generation":
            video_path = generate_video(input_text)
            audio_path = generate_music(input_text)
            return "Video Generated", audio_path, video_path

        # Previously an unrecognized task fell through and returned a bare
        # None, which breaks Gradio's 3-output unpacking.
        return f"Unknown task type: {task_type}", None, None
    except Exception as e:
        return f"Error: {e}", None, None

# Gradio interface setup
interface = gr.Interface(
    fn=main_interface,
    inputs=[
        # Free-form prompt, task selector, and image style preset —
        # passed to main_interface in this order.
        gr.Textbox(label="Enter Text or Prompt"),
        gr.Radio(["Conversation", "Music", "Text to Audio", "Video Generation"], label="Select Task"),
        gr.Dropdown(["realistic", "abstract", "comic"], label="Select Style"),
    ],
    outputs=[
        # Mirrors main_interface's (text, audio_path, image_path) return tuple.
        gr.Textbox(label="Generated Output"),
        gr.Audio(label="Generated Audio", type="filepath"),
        gr.Image(label="Generated Image", type="filepath"),
    ],
    live=False,  # run only on explicit submit, not on every keystroke
)

interface.launch()