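# Multimodal playground: a single-file Gradio app that wires together
# MusicGen (music), GPT-2 (chat), Stable Diffusion (images), pyttsx3
# (speech), and librosa (spectrograms). Model ids and generation settings
# below are one reasonable configuration for a small demo, not the only one.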
import torch
from audiocraft.models import MusicGen
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import pyttsx3
import gradio as gr
from tempfile import NamedTemporaryFile
import numpy as np
import scipy.io.wavfile as wavfile
from diffusers import StableDiffusionPipeline
import matplotlib.pyplot as plt
import librosa
import librosa.display
import soundfile as sf
from PIL import Image
import os
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# MusicGen ("facebook/musicgen-small" is the full hub id for the old
# short name "small")
music_model = MusicGen.get_pretrained("facebook/musicgen-small", device=device)
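# Cap the clip length; the default is fairly long for an interactive demo.
# duration=8 (seconds) is an assumption, tune to taste.
music_model.set_generation_params(duration=8)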
# GPT-2 for conversation
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
# Stable Diffusion for image generation (fp16 only works on GPU; fall back
# to fp32 on CPU)
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
)
pipe = pipe.to(device)
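# Optional: trade a little speed for a smaller memory footprint. This is a
# standard diffusers knob, useful on smaller GPUs; safe to remove.
pipe.enable_attention_slicing()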
# Emotion detection for Text-to-Audio
def get_emotion_tone(text):
    text_lower = text.lower()
    if any(word in text_lower for word in ["happy", "joy", "excited"]):
        return "happy"
    elif any(word in text_lower for word in ["sad", "down", "melancholy"]):
        return "sad"
    elif any(word in text_lower for word in ["angry", "frustrated"]):
        return "angry"
    return "neutral"
# Image generation using Stable Diffusion
def generate_image(prompt, style="realistic"):
    styled_prompt = f"{style} style {prompt}"
    try:
        image = pipe(styled_prompt).images[0]
        temp_image = NamedTemporaryFile(delete=False, suffix=".png")
        image.save(temp_image.name)
        return temp_image.name
    except Exception as e:
        return f"Error generating image: {e}"
# Convert Text to Audio with Emotion
def text_to_audio(text):
    emotion = get_emotion_tone(text)
    engine = pyttsx3.init()
    rates = {"neutral": 150, "happy": 180, "sad": 100, "angry": 200}
    volumes = {"neutral": 0.8, "happy": 1.0, "angry": 1.0, "sad": 0.5}
    engine.setProperty('rate', rates[emotion])
    engine.setProperty('volume', volumes[emotion])
    # pyttsx3 backends write plain WAV/AIFF data, so a .mp3 suffix would
    # mislabel the file; save as .wav instead
    temp_file = NamedTemporaryFile(delete=False, suffix=".wav")
    engine.save_to_file(text, temp_file.name)
    engine.runAndWait()
    return temp_file.name
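# text_to_audio("I feel happy today") should come back slightly faster and
# louder than the neutral voice; the actual voice depends on the platform
# TTS backend (SAPI5 on Windows, NSSpeechSynthesizer on macOS, eSpeak on Linux).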
# Music generation using MusicGen
def generate_music(prompt):
    try:
        descriptions = [prompt]
        wav = music_model.generate(descriptions)
        temp_file = NamedTemporaryFile(delete=False, suffix=".wav")
        audio_data = wav.cpu().numpy()
        wavfile.write(temp_file.name, music_model.sample_rate, audio_data[0, 0])
        return temp_file.name
    except Exception as e:
        return f"Error generating music: {e}"
# Spectrogram generation from audio
def generate_spectrogram(audio_path):
    try:
        y, sr = librosa.load(audio_path, sr=None)
        # librosa >= 0.10 requires keyword arguments here
        S = librosa.feature.melspectrogram(y=y, sr=sr)
        S_dB = librosa.power_to_db(S, ref=np.max)
        plt.figure(figsize=(10, 4))
        librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel', cmap='coolwarm')
        plt.colorbar(format='%+2.0f dB')
        plt.title('Mel-frequency spectrogram')
        temp_image = NamedTemporaryFile(delete=False, suffix=".png")
        plt.savefig(temp_image.name)
        plt.close()
        return temp_image.name
    except Exception as e:
        return f"Error generating spectrogram: {e}"
# Chat with AI (GPT-2)
def chat_with_ai(user_input):
    try:
        inputs = tokenizer.encode(user_input, return_tensors="pt").to(device)
        # GPT-2 has no pad token; reuse EOS to silence the generate() warning
        outputs = gpt2_model.generate(
            inputs, max_length=50, num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
        )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response
    except Exception as e:
        return f"Error in chat generation: {e}"
# Simulate Video Generation using a Sequence of Images
def generate_video(prompt):
    frames = []
    for i in range(5):  # Generate 5 frames as a sequence
        frame_prompt = f"{prompt} frame {i+1}"
        frame_path = generate_image(frame_prompt)
        # generate_image returns an error string on failure; skip such frames
        if os.path.exists(frame_path):
            frames.append(Image.open(frame_path))
    if not frames:
        return None
    temp_video = NamedTemporaryFile(delete=False, suffix=".gif")
    frames[0].save(temp_video.name, save_all=True, append_images=frames[1:], duration=500, loop=0)
    return temp_video.name
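# An animated GIF is only a stand-in for real video generation: five
# independently sampled Stable Diffusion frames will not be temporally
# coherent, but the GIF keeps the demo free of video-codec dependencies.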
# Main interface logic
def main_interface(input_text, task_type, style):
    try:
        if task_type == "Conversation":
            response = chat_with_ai(input_text)
            image_path = generate_image(f"conversation about {input_text}", style)
            return response, None, image_path
        elif task_type == "Music":
            audio_path = generate_music(input_text)
            spectrogram_path = generate_spectrogram(audio_path)
            return "Music Generated", audio_path, spectrogram_path
        elif task_type == "Text to Audio":
            audio_path = text_to_audio(input_text)
            image_path = generate_image(f"text-to-audio conversion for {input_text}", style)
            return "Audio Generated", audio_path, image_path
        elif task_type == "Video Generation":
            video_path = generate_video(input_text)
            audio_path = generate_music(input_text)
            return "Video Generated", audio_path, video_path
        # Without this fallback an unmatched task would implicitly return
        # None and crash the Gradio output mapping
        return "Unknown task type", None, None
    except Exception as e:
        return f"Error: {e}", None, None
# Gradio interface setup
interface = gr.Interface(
    fn=main_interface,
    inputs=[
        gr.Textbox(label="Enter Text or Prompt"),
        gr.Radio(["Conversation", "Music", "Text to Audio", "Video Generation"], label="Select Task"),
        gr.Dropdown(["realistic", "abstract", "comic"], label="Select Style"),
    ],
    outputs=[
        gr.Textbox(label="Generated Output"),
        gr.Audio(label="Generated Audio", type="filepath"),
        gr.Image(label="Generated Image", type="filepath"),
    ],
    live=False,
)

interface.launch()
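# On a hosted Space, launch() picks up the host/port automatically; for a
# local run, interface.launch(share=True) is the standard Gradio option to
# expose a temporary public URL (optional here).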