Spaces:

arshad1234321
/

Text_to_Multimedia

Sleeping

App Files Files Community

arshad1234321 committed on Apr 14

Commit

bdeeafd

verified ·

1 Parent(s): 2e95955

Create app.py

Browse files

Files changed (1) hide show

app.py +160 -0

app.py ADDED Viewed

	@@ -0,0 +1,160 @@

+#working code
+!pip install transformers diffusers gradio librosa audiocraft pyttsx3
+!pip install --upgrade torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
+import torch
+from audiocraft.models import MusicGen
+from transformers import GPT2LMHeadModel, GPT2Tokenizer
+import pyttsx3
+import gradio as gr
+from tempfile import NamedTemporaryFile
+import numpy as np
+import scipy.io.wavfile as wavfile
+from diffusers import StableDiffusionPipeline
+import matplotlib.pyplot as plt
+import librosa.display
+import librosa
+import soundfile as sf
+from PIL import Image
+import os
+# Set device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# MusicGen
+music_model = MusicGen.get_pretrained("small", device=device)
+# GPT-2 for conversation
+tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
+# Stable Diffusion for image generation
+pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
+pipe = pipe.to(device)
+# Emotion detection for Text-to-Audio
+def get_emotion_tone(text):
+    if any(word in text.lower() for word in ["happy", "joy", "excited"]):
+        return "happy"
+    elif any(word in text.lower() for word in ["sad", "down", "melancholy"]):
+        return "sad"
+    elif any(word in text.lower() for word in ["angry", "frustrated"]):
+        return "angry"
+    else:
+        return "neutral"
+# Image generation using Stable Diffusion
+def generate_image(prompt, style="realistic"):
+    styled_prompt = f"{style} style {prompt}"
+    try:
+        image = pipe(styled_prompt).images[0]
+        temp_image = NamedTemporaryFile(delete=False, suffix=".png")
+        image.save(temp_image.name)
+        return temp_image.name
+    except Exception as e:
+        return f"Error generating image: {e}"
+# Convert Text to Audio with Emotion
+def text_to_audio(text):
+    emotion = get_emotion_tone(text)
+    engine = pyttsx3.init()
+    engine.setProperty('rate', 150 if emotion == "neutral" else 180 if emotion == "happy" else 100 if emotion == "sad" else 200)
+    engine.setProperty('volume', 0.8 if emotion == "neutral" else 1.0 if emotion == "happy" or emotion == "angry" else 0.5)
+    temp_file = NamedTemporaryFile(delete=False, suffix=".mp3")
+    engine.save_to_file(text, temp_file.name)
+    engine.runAndWait()
+    return temp_file.name
+# Music generation using MusicGen
+def generate_music(prompt):
+    try:
+        descriptions = [prompt]
+        wav = music_model.generate(descriptions)
+        temp_file = NamedTemporaryFile(delete=False, suffix=".wav")
+        audio_data = wav.cpu().numpy()
+        wavfile.write(temp_file.name, music_model.sample_rate, audio_data[0, 0])
+        return temp_file.name
+    except Exception as e:
+        return f"Error generating music: {e}"
+# Spectrogram generation from audio
+def generate_spectrogram(audio_path):
+    try:
+        y, sr = librosa.load(audio_path, sr=None)
+        S = librosa.feature.melspectrogram(y, sr=sr)
+        S_dB = librosa.power_to_db(S, ref=np.max)
+        plt.figure(figsize=(10, 4))
+        librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel', cmap='coolwarm')
+        plt.colorbar(format='%+2.0f dB')
+        plt.title('Mel-frequency spectrogram')
+        temp_image = NamedTemporaryFile(delete=False, suffix=".png")
+        plt.savefig(temp_image.name)
+        plt.close()
+        return temp_image.name
+    except Exception as e:
+        return f"Error generating spectrogram: {e}"
+# Chat with AI (GPT-2)
+def chat_with_ai(user_input):
+    try:
+        inputs = tokenizer.encode(user_input, return_tensors="pt").to(device)
+        outputs = gpt2_model.generate(inputs, max_length=50, num_return_sequences=1)
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        return response
+    except Exception as e:
+        return f"Error in chat generation: {e}"
+# Simulate Video Generation using a Sequence of Images
+def generate_video(prompt):
+    frames = []
+    for i in range(5): # Generate 5 frames as a sequence
+        frame_prompt = f"{prompt} frame {i+1}"
+        frame_path = generate_image(frame_prompt)
+        frames.append(Image.open(frame_path))
+    temp_video = NamedTemporaryFile(delete=False, suffix=".gif")
+    frames[0].save(temp_video.name, save_all=True, append_images=frames[1:], duration=500, loop=0)
+    return temp_video.name
+# Main interface logic
+def main_interface(input_text, task_type, style):
+    try:
+        if task_type == "Conversation":
+            response = chat_with_ai(input_text)
+            image_path = generate_image(f"conversation about {input_text}", style)
+            return response, None, image_path
+        elif task_type == "Music":
+            audio_path = generate_music(input_text)
+            spectrogram_path = generate_spectrogram(audio_path)
+            return "Music Generated", audio_path, spectrogram_path
+        elif task_type == "Text to Audio":
+            audio_path = text_to_audio(input_text)
+            image_path = generate_image(f"text-to-audio conversion for {input_text}", style)
+            return "Audio Generated", audio_path, image_path
+        elif task_type == "Video Generation":
+            video_path = generate_video(input_text)
+            audio_path = generate_music(input_text)
+            return "Video Generated", audio_path, video_path
+    except Exception as e:
+        return f"Error: {e}", None, None
+# Gradio interface setup
+interface = gr.Interface(
+    fn=main_interface,
+    inputs=[
+        gr.Textbox(label="Enter Text or Prompt"),
+        gr.Radio(["Conversation", "Music", "Text to Audio", "Video Generation"], label="Select Task"),
+        gr.Dropdown(["realistic", "abstract", "comic"], label="Select Style"),
+    ],
+    outputs=[
+        gr.Textbox(label="Generated Output"),
+        gr.Audio(label="Generated Audio", type="filepath"),
+        gr.Image(label="Generated Image", type="filepath"),
+    ],
+    live=False,
+)
+interface.launch()