Spaces:

arshad1234321
/

Text_to_Multimedia

Running

App Files Files Community

arshad1234321 committed on Apr 15, 2025

Commit

5d1605e

verified ·

1 Parent(s): ae1e3e1

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -36

app.py CHANGED Viewed

@@ -14,25 +14,22 @@ import soundfile as sf
 from PIL import Image
 import os
-# Set device
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# MusicGen
 music_model = MusicGen.get_pretrained("small", device=device)
-# GPT-2 for conversation
 tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
 gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
-# Stable Diffusion for image generation with dtype fix
-dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 pipe = StableDiffusionPipeline.from_pretrained(
     "runwayml/stable-diffusion-v1-5",
-    torch_dtype=torch.float32  # Required for CPU
 ).to("cpu")
-pipe = pipe.to(device)
-# Emotion detection for Text-to-Audio
 def get_emotion_tone(text):
     if any(word in text.lower() for word in ["happy", "joy", "excited"]):
         return "happy"
@@ -43,7 +40,6 @@ def get_emotion_tone(text):
     else:
         return "neutral"
-# Image generation using Stable Diffusion
 def generate_image(prompt, style="realistic"):
     styled_prompt = f"{style} style {prompt}"
     try:
@@ -52,9 +48,9 @@ def generate_image(prompt, style="realistic"):
         image.save(temp_image.name)
         return temp_image.name
     except Exception as e:
-        return f"Error generating image: {e}"
-# Convert Text to Audio with Emotion
 def text_to_audio(text):
     emotion = get_emotion_tone(text)
     engine = pyttsx3.init()
@@ -66,19 +62,17 @@ def text_to_audio(text):
     engine.runAndWait()
     return temp_file.name
-# Music generation using MusicGen
 def generate_music(prompt):
     try:
-        descriptions = [prompt]
-        wav = music_model.generate(descriptions)
         temp_file = NamedTemporaryFile(delete=False, suffix=".wav")
         audio_data = wav.cpu().numpy()
         wavfile.write(temp_file.name, music_model.sample_rate, audio_data[0, 0])
         return temp_file.name
     except Exception as e:
-        return f"Error generating music: {e}"
-# Spectrogram generation from audio
 def generate_spectrogram(audio_path):
     try:
         y, sr = librosa.load(audio_path, sr=None)
@@ -94,9 +88,9 @@ def generate_spectrogram(audio_path):
         plt.close()
         return temp_image.name
     except Exception as e:
-        return f"Error generating spectrogram: {e}"
-# Chat with AI (GPT-2)
 def chat_with_ai(user_input):
     try:
         inputs = tokenizer.encode(user_input, return_tensors="pt").to(device)
@@ -104,52 +98,48 @@ def chat_with_ai(user_input):
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
         return response
     except Exception as e:
-        return f"Error in chat generation: {e}"
-# Simulate Video Generation using a Sequence of Images
 def generate_video(prompt):
     frames = []
-    for i in range(5):  # Generate 5 frames as a sequence
         frame_prompt = f"{prompt} frame {i+1}"
         frame_path = generate_image(frame_prompt)
-        if "Error" in frame_path:
-            return frame_path
-        frames.append(Image.open(frame_path))
     temp_video = NamedTemporaryFile(delete=False, suffix=".gif")
     frames[0].save(temp_video.name, save_all=True, append_images=frames[1:], duration=500, loop=0)
     return temp_video.name
-# Main interface logic
 def main_interface(input_text, task_type, style):
     try:
         if task_type == "Conversation":
             response = chat_with_ai(input_text)
             image_path = generate_image(f"conversation about {input_text}", style)
-            return response, None, image_path
         elif task_type == "Music":
             audio_path = generate_music(input_text)
-            if "Error" in audio_path:
-                return audio_path, None, None
-            spectrogram_path = generate_spectrogram(audio_path)
-            return "Music Generated", audio_path, spectrogram_path
         elif task_type == "Text to Audio":
             audio_path = text_to_audio(input_text)
             image_path = generate_image(f"text-to-audio conversion for {input_text}", style)
-            return "Audio Generated", audio_path, image_path
         elif task_type == "Video Generation":
             video_path = generate_video(input_text)
-            if "Error" in video_path:
-                return video_path, None, None
             audio_path = generate_music(input_text)
-            return "Video Generated", audio_path, video_path
     except Exception as e:
         return f"Error: {e}", None, None
-# Gradio interface setup
 interface = gr.Interface(
     fn=main_interface,
     inputs=[

 from PIL import Image
 import os
+# CPU device
+device = torch.device("cpu")
+# Load MusicGen (CPU)
 music_model = MusicGen.get_pretrained("small", device=device)
+# GPT-2 (CPU)
 tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
 gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
+# Stable Diffusion (CPU-safe config)
 pipe = StableDiffusionPipeline.from_pretrained(
     "runwayml/stable-diffusion-v1-5",
+    torch_dtype=torch.float32  # Must be float32 for CPU
 ).to("cpu")
 def get_emotion_tone(text):
     if any(word in text.lower() for word in ["happy", "joy", "excited"]):
         return "happy"
     else:
         return "neutral"
 def generate_image(prompt, style="realistic"):
     styled_prompt = f"{style} style {prompt}"
     try:
         image.save(temp_image.name)
         return temp_image.name
     except Exception as e:
+        print("Image generation error:", e)
+        return None
 def text_to_audio(text):
     emotion = get_emotion_tone(text)
     engine = pyttsx3.init()
     engine.runAndWait()
     return temp_file.name
 def generate_music(prompt):
     """Generate a music clip from a text prompt and save it as a WAV file.

     Args:
         prompt: Text description passed to ``music_model.generate``.

     Returns:
         Path of the written ``.wav`` file, or ``None`` if any step failed
         (the error is printed).
     """
     try:
         wav = music_model.generate([prompt])
         audio_data = wav.cpu().numpy()
         # Reserve the output path only after generation succeeded, and close
         # the handle right away: only the name is needed, and leaving it open
         # leaks a file descriptor per call.
         with NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
             output_path = temp_file.name
         # NOTE(review): [0, 0] presumably selects first batch item / first
         # channel of a (batch, channels, samples) array -- confirm.
         wavfile.write(output_path, music_model.sample_rate, audio_data[0, 0])
         return output_path
     except Exception as e:
         print("Music generation error:", e)
         return None
 def generate_spectrogram(audio_path):
     try:
         y, sr = librosa.load(audio_path, sr=None)
         plt.close()
         return temp_image.name
     except Exception as e:
+        print("Spectrogram generation error:", e)
+        return None
 def chat_with_ai(user_input):
     try:
         inputs = tokenizer.encode(user_input, return_tensors="pt").to(device)
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
         return response
     except Exception as e:
+        print("Chat error:", e)
+        return "Error in chat generation."
 def generate_video(prompt, num_frames=5):
     """Build an animated GIF from a sequence of generated images.

     One image is requested per frame via ``generate_image`` using the prompt
     ``"<prompt> frame <k>"``; frames whose generation failed (falsy path) are
     skipped.

     Args:
         prompt: Base text prompt for every frame.
         num_frames: Number of frames to request (default 5, the previous
             hard-coded value).

     Returns:
         Path of the written ``.gif`` file, or ``None`` if no frame could be
         generated.
     """
     frames = []
     try:
         for i in range(num_frames):
             frame_prompt = f"{prompt} frame {i+1}"
             frame_path = generate_image(frame_prompt)
             if frame_path:
                 frames.append(Image.open(frame_path))
         if not frames:
             return None
         # Only the name is needed; close the handle so it is not leaked and
         # the path can be reopened for writing on every platform.
         with NamedTemporaryFile(delete=False, suffix=".gif") as temp_video:
             video_path = temp_video.name
         frames[0].save(
             video_path,
             save_all=True,
             append_images=frames[1:],
             duration=500,
             loop=0,
         )
         return video_path
     finally:
         # Release the image files opened above, whether or not saving worked.
         for frame in frames:
             frame.close()
 def main_interface(input_text, task_type, style):
     """Run the task selected by ``task_type`` and return its outputs.

     Args:
         input_text: User text used as prompt / message for the chosen task.
         task_type: One of "Conversation", "Music", "Text to Audio" or
             "Video Generation".
         style: Image style forwarded to ``generate_image``.

     Returns:
         A 3-tuple ``(status_or_response_text, audio_path_or_None,
         image_or_video_path_or_None)``. Every code path returns a 3-tuple,
         including failures and unknown task types.
     """
     def _existing(path):
         # The generators return None on failure, and os.path.exists(None)
         # raises TypeError, so guard before touching the filesystem.
         return path if path and os.path.exists(path) else None

     try:
         if task_type == "Conversation":
             response = chat_with_ai(input_text)
             image_path = generate_image(f"conversation about {input_text}", style)
             # A failed image must not discard the chat response.
             return response, None, _existing(image_path)
         if task_type == "Music":
             audio_path = _existing(generate_music(input_text))
             if audio_path is None:
                 return "Error generating music.", None, None
             spectrogram_path = generate_spectrogram(audio_path)
             return "Music Generated", audio_path, _existing(spectrogram_path)
         if task_type == "Text to Audio":
             audio_path = text_to_audio(input_text)
             image_path = generate_image(f"text-to-audio conversion for {input_text}", style)
             return "Audio Generated", _existing(audio_path), _existing(image_path)
         if task_type == "Video Generation":
             video_path = _existing(generate_video(input_text))
             if video_path is None:
                 return "Error generating video.", None, None
             audio_path = generate_music(input_text)
             return "Video Generated", _existing(audio_path), video_path
         # Previously fell through and returned a bare None.
         return f"Unknown task type: {task_type}", None, None
     except Exception as e:
         return f"Error: {e}", None, None
 interface = gr.Interface(
     fn=main_interface,
     inputs=[