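# Multimodal playground: a single-file Gradio app that wires together
# MusicGen (music), GPT-2 (chat), Stable Diffusion (images), pyttsx3
# (speech), and librosa (spectrograms). Model ids and generation settings
# below are one reasonable configuration for a small demo, not the only one.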
import torch
from audiocraft.models import MusicGen
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import pyttsx3
import gradio as gr
from tempfile import NamedTemporaryFile
import numpy as np
import scipy.io.wavfile as wavfile
from diffusers import StableDiffusionPipeline
import matplotlib.pyplot as plt
import librosa
import librosa.display
import soundfile as sf
from PIL import Image
import os
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# MusicGen ("facebook/musicgen-small" is the full hub id for the old
# short name "small")
music_model = MusicGen.get_pretrained("facebook/musicgen-small", device=device)
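# Cap the clip length; the default is fairly long for an interactive demo.
# duration=8 (seconds) is an assumption, tune to taste.
music_model.set_generation_params(duration=8)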
# GPT-2 for conversation
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
# Stable Diffusion for image generation (fp16 only works on GPU; fall back
# to fp32 on CPU)
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
)
pipe = pipe.to(device)
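# Optional: trade a little speed for a smaller memory footprint. This is a
# standard diffusers knob, useful on smaller GPUs; safe to remove.
pipe.enable_attention_slicing()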
# Emotion detection for Text-to-Audio
def get_emotion_tone(text):
    text_lower = text.lower()
    if any(word in text_lower for word in ["happy", "joy", "excited"]):
        return "happy"
    elif any(word in text_lower for word in ["sad", "down", "melancholy"]):
        return "sad"
    elif any(word in text_lower for word in ["angry", "frustrated"]):
        return "angry"
    return "neutral"
# Image generation using Stable Diffusion
def generate_image(prompt, style="realistic"):
    styled_prompt = f"{style} style {prompt}"
    try:
        image = pipe(styled_prompt).images[0]
        temp_image = NamedTemporaryFile(delete=False, suffix=".png")
        image.save(temp_image.name)
        return temp_image.name
    except Exception as e:
        return f"Error generating image: {e}"
# Convert Text to Audio with Emotion
def text_to_audio(text):
    emotion = get_emotion_tone(text)
    engine = pyttsx3.init()
    rates = {"neutral": 150, "happy": 180, "sad": 100, "angry": 200}
    volumes = {"neutral": 0.8, "happy": 1.0, "angry": 1.0, "sad": 0.5}
    engine.setProperty('rate', rates[emotion])
    engine.setProperty('volume', volumes[emotion])
    # pyttsx3 backends write plain WAV/AIFF data, so a .mp3 suffix would
    # mislabel the file; save as .wav instead
    temp_file = NamedTemporaryFile(delete=False, suffix=".wav")
    engine.save_to_file(text, temp_file.name)
    engine.runAndWait()
    return temp_file.name
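# text_to_audio("I feel happy today") should come back slightly faster and
# louder than the neutral voice; the actual voice depends on the platform
# TTS backend (SAPI5 on Windows, NSSpeechSynthesizer on macOS, eSpeak on Linux).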
# Music generation using MusicGen
def generate_music(prompt):
    try:
        descriptions = [prompt]
        wav = music_model.generate(descriptions)
        temp_file = NamedTemporaryFile(delete=False, suffix=".wav")
        audio_data = wav.cpu().numpy()
        wavfile.write(temp_file.name, music_model.sample_rate, audio_data[0, 0])
        return temp_file.name
    except Exception as e:
        return f"Error generating music: {e}"
# Spectrogram generation from audio
def generate_spectrogram(audio_path):
    try:
        y, sr = librosa.load(audio_path, sr=None)
        # librosa >= 0.10 requires keyword arguments here
        S = librosa.feature.melspectrogram(y=y, sr=sr)
        S_dB = librosa.power_to_db(S, ref=np.max)
        plt.figure(figsize=(10, 4))
        librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel', cmap='coolwarm')
        plt.colorbar(format='%+2.0f dB')
        plt.title('Mel-frequency spectrogram')
        temp_image = NamedTemporaryFile(delete=False, suffix=".png")
        plt.savefig(temp_image.name)
        plt.close()
        return temp_image.name
    except Exception as e:
        return f"Error generating spectrogram: {e}"
# Chat with AI (GPT-2)
def chat_with_ai(user_input):
    try:
        inputs = tokenizer.encode(user_input, return_tensors="pt").to(device)
        # GPT-2 has no pad token; reuse EOS to silence the generate() warning
        outputs = gpt2_model.generate(
            inputs, max_length=50, num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
        )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response
    except Exception as e:
        return f"Error in chat generation: {e}"
# Simulate Video Generation using a Sequence of Images
def generate_video(prompt):
    frames = []
    for i in range(5):  # Generate 5 frames as a sequence
        frame_prompt = f"{prompt} frame {i+1}"
        frame_path = generate_image(frame_prompt)
        # generate_image returns an error string on failure; skip such frames
        if os.path.exists(frame_path):
            frames.append(Image.open(frame_path))
    if not frames:
        return None
    temp_video = NamedTemporaryFile(delete=False, suffix=".gif")
    frames[0].save(temp_video.name, save_all=True, append_images=frames[1:], duration=500, loop=0)
    return temp_video.name
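# An animated GIF is only a stand-in for real video generation: five
# independently sampled Stable Diffusion frames will not be temporally
# coherent, but the GIF keeps the demo free of video-codec dependencies.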
# Main interface logic
def main_interface(input_text, task_type, style):
    try:
        if task_type == "Conversation":
            response = chat_with_ai(input_text)
            image_path = generate_image(f"conversation about {input_text}", style)
            return response, None, image_path
        elif task_type == "Music":
            audio_path = generate_music(input_text)
            spectrogram_path = generate_spectrogram(audio_path)
            return "Music Generated", audio_path, spectrogram_path
        elif task_type == "Text to Audio":
            audio_path = text_to_audio(input_text)
            image_path = generate_image(f"text-to-audio conversion for {input_text}", style)
            return "Audio Generated", audio_path, image_path
        elif task_type == "Video Generation":
            video_path = generate_video(input_text)
            audio_path = generate_music(input_text)
            return "Video Generated", audio_path, video_path
        # Without this fallback an unmatched task would implicitly return
        # None and crash the Gradio output mapping
        return "Unknown task type", None, None
    except Exception as e:
        return f"Error: {e}", None, None
# Gradio interface setup
interface = gr.Interface(
    fn=main_interface,
    inputs=[
        gr.Textbox(label="Enter Text or Prompt"),
        gr.Radio(["Conversation", "Music", "Text to Audio", "Video Generation"], label="Select Task"),
        gr.Dropdown(["realistic", "abstract", "comic"], label="Select Style"),
    ],
    outputs=[
        gr.Textbox(label="Generated Output"),
        gr.Audio(label="Generated Audio", type="filepath"),
        gr.Image(label="Generated Image", type="filepath"),
    ],
    live=False,
)

interface.launch()
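# On a hosted Space, launch() picks up the host/port automatically; for a
# local run, interface.launch(share=True) is the standard Gradio option to
# expose a temporary public URL (optional here).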