Update app.py
app.py CHANGED
@@ -1,7 +1,7 @@
 import torch
 from audiocraft.models import MusicGen
 from transformers import GPT2LMHeadModel, GPT2Tokenizer
-import pyttsx3
+from TTS.api import TTS
 import gradio as gr
 from tempfile import NamedTemporaryFile
 import numpy as np
@@ -14,22 +14,22 @@ import soundfile as sf
 from PIL import Image
 import os
 
-
-device = torch.device("cpu")
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-# Load
+# Load models
 music_model = MusicGen.get_pretrained("small", device=device)
 
-# GPT-2 (CPU)
 tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
 gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
 
-#
+# Set torch_dtype to float32 for compatibility on CPU
 pipe = StableDiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5",
-
-).to("cpu")
+    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float32
+).to(device)
 
+tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False).to(device)
+
+# Emotion detection
 def get_emotion_tone(text):
     if any(word in text.lower() for word in ["happy", "joy", "excited"]):
         return "happy"
@@ -40,6 +40,7 @@ def get_emotion_tone(text):
     else:
         return "neutral"
 
+# Generate image
 def generate_image(prompt, style="realistic"):
     styled_prompt = f"{style} style {prompt}"
     try:
@@ -48,20 +49,17 @@ def generate_image(prompt, style="realistic"):
         image.save(temp_image.name)
         return temp_image.name
     except Exception as e:
-        print("Image
+        print("Image error:", e)
         return None
 
+# Convert text to audio using TTS
 def text_to_audio(text):
     emotion = get_emotion_tone(text)
-
-
-    engine.setProperty('volume', 0.8 if emotion == "neutral" else 1.0 if emotion in ["happy", "angry"] else 0.5)
-
-    temp_file = NamedTemporaryFile(delete=False, suffix=".mp3")
-    engine.save_to_file(text, temp_file.name)
-    engine.runAndWait()
+    temp_file = NamedTemporaryFile(delete=False, suffix=".wav")
+    tts.tts_to_file(text=text, file_path=temp_file.name)
     return temp_file.name
 
+# Generate music
 def generate_music(prompt):
     try:
         wav = music_model.generate([prompt])
@@ -70,9 +68,10 @@ def generate_music(prompt):
         wavfile.write(temp_file.name, music_model.sample_rate, audio_data[0, 0])
         return temp_file.name
     except Exception as e:
-        print("Music
+        print("Music error:", e)
         return None
 
+# Generate spectrogram
 def generate_spectrogram(audio_path):
     try:
         y, sr = librosa.load(audio_path, sr=None)
@@ -88,19 +87,21 @@ def generate_spectrogram(audio_path):
         plt.close()
         return temp_image.name
     except Exception as e:
-        print("Spectrogram
+        print("Spectrogram error:", e)
         return None
 
+# GPT-2 chatbot
 def chat_with_ai(user_input):
     try:
         inputs = tokenizer.encode(user_input, return_tensors="pt").to(device)
-        outputs = gpt2_model.generate(inputs, max_length=
+        outputs = gpt2_model.generate(inputs, max_length=60, num_return_sequences=1)
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
         return response
     except Exception as e:
         print("Chat error:", e)
-        return "
+        return "Sorry, I couldn't respond."
 
+# Generate gif video
def generate_video(prompt):
     frames = []
     for i in range(5):
@@ -109,37 +110,40 @@ def generate_video(prompt):
         if frame_path:
             frames.append(Image.open(frame_path))
 
-    if
-
-
-
-
-        return temp_video.name
+    if frames:
+        temp_video = NamedTemporaryFile(delete=False, suffix=".gif")
+        frames[0].save(temp_video.name, save_all=True, append_images=frames[1:], duration=500, loop=0)
+        return temp_video.name
+    return None
 
+# Main interface
 def main_interface(input_text, task_type, style):
     try:
         if task_type == "Conversation":
             response = chat_with_ai(input_text)
             image_path = generate_image(f"conversation about {input_text}", style)
-            return response, None, image_path
+            return response, None, image_path
 
         elif task_type == "Music":
             audio_path = generate_music(input_text)
-            spectrogram_path = generate_spectrogram(audio_path)
-            return "Music Generated", audio_path
+            spectrogram_path = generate_spectrogram(audio_path)
+            return "Music Generated", audio_path, spectrogram_path
 
         elif task_type == "Text to Audio":
             audio_path = text_to_audio(input_text)
             image_path = generate_image(f"text-to-audio conversion for {input_text}", style)
-            return "Audio Generated", audio_path
+            return "Audio Generated", audio_path, image_path
 
         elif task_type == "Video Generation":
             video_path = generate_video(input_text)
             audio_path = generate_music(input_text)
-            return "Video Generated", audio_path
+            return "Video Generated", audio_path, video_path
+
     except Exception as e:
+        print("Main interface error:", e)
         return f"Error: {e}", None, None
 
+# Gradio app
 interface = gr.Interface(
     fn=main_interface,
     inputs=[