Spaces:

arshad1234321
/

Text_to_Multimedia

Sleeping

App Files Files Community

arshad1234321 committed on Apr 15, 2025

Commit

2895c13

verified ·

1 Parent(s): 2f63663

Update app.py

Browse files

Files changed (1) hide show

app.py +87 -96

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import torch
 from audiocraft.models import MusicGen
 from transformers import GPT2LMHeadModel, GPT2Tokenizer
-from TTS.api import TTS
 import gradio as gr
 from tempfile import NamedTemporaryFile
 import numpy as np
@@ -14,149 +14,140 @@ import soundfile as sf
 from PIL import Image
 import os
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# Load models
 music_model = MusicGen.get_pretrained("small", device=device)
 tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
 gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
-# Set torch_dtype to float32 for compatibility on CPU
 pipe = StableDiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float32
 ).to(device)
-tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False).to(device)
-# Emotion detection
 def get_emotion_tone(text):
-    if any(word in text.lower() for word in ["happy", "joy", "excited"]):
         return "happy"
-    elif any(word in text.lower() for word in ["sad", "down", "melancholy"]):
         return "sad"
-    elif any(word in text.lower() for word in ["angry", "frustrated"]):
         return "angry"
-    else:
-        return "neutral"
-# Generate image
 def generate_image(prompt, style="realistic"):
-    styled_prompt = f"{style} style {prompt}"
     try:
-        image = pipe(styled_prompt).images[0]
-        temp_image = NamedTemporaryFile(delete=False, suffix=".png")
-        image.save(temp_image.name)
-        return temp_image.name
     except Exception as e:
         print("Image error:", e)
         return None
-# Convert text to audio using TTS
 def text_to_audio(text):
-    emotion = get_emotion_tone(text)
-    temp_file = NamedTemporaryFile(delete=False, suffix=".wav")
-    tts.tts_to_file(text=text, file_path=temp_file.name)
-    return temp_file.name
-# Generate music
 def generate_music(prompt):
     try:
-        wav = music_model.generate([prompt])
-        temp_file = NamedTemporaryFile(delete=False, suffix=".wav")
-        audio_data = wav.cpu().numpy()
-        wavfile.write(temp_file.name, music_model.sample_rate, audio_data[0, 0])
-        return temp_file.name
     except Exception as e:
         print("Music error:", e)
         return None
-# Generate spectrogram
 def generate_spectrogram(audio_path):
     try:
         y, sr = librosa.load(audio_path, sr=None)
         S = librosa.feature.melspectrogram(y, sr=sr)
-        S_dB = librosa.power_to_db(S, ref=np.max)
-        plt.figure(figsize=(10, 4))
-        librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel', cmap='coolwarm')
-        plt.colorbar(format='%+2.0f dB')
-        plt.title('Mel-frequency spectrogram')
-        temp_image = NamedTemporaryFile(delete=False, suffix=".png")
-        plt.savefig(temp_image.name)
         plt.close()
-        return temp_image.name
     except Exception as e:
         print("Spectrogram error:", e)
         return None
-# GPT-2 chatbot
-def chat_with_ai(user_input):
     try:
-        inputs = tokenizer.encode(user_input, return_tensors="pt").to(device)
-        outputs = gpt2_model.generate(inputs, max_length=60, num_return_sequences=1)
-        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        return response
     except Exception as e:
         print("Chat error:", e)
-        return "Sorry, I couldn't respond."
-# Generate gif video
 def generate_video(prompt):
     frames = []
     for i in range(5):
-        frame_prompt = f"{prompt} frame {i+1}"
-        frame_path = generate_image(frame_prompt)
-        if frame_path:
-            frames.append(Image.open(frame_path))
-    if frames:
-        temp_video = NamedTemporaryFile(delete=False, suffix=".gif")
-        frames[0].save(temp_video.name, save_all=True, append_images=frames[1:], duration=500, loop=0)
-        return temp_video.name
-    return None
-# Main interface
-def main_interface(input_text, task_type, style):
-    try:
-        if task_type == "Conversation":
-            response = chat_with_ai(input_text)
-            image_path = generate_image(f"conversation about {input_text}", style)
-            return response, None, image_path
-        elif task_type == "Music":
-            audio_path = generate_music(input_text)
-            spectrogram_path = generate_spectrogram(audio_path)
-            return "Music Generated", audio_path, spectrogram_path
-        elif task_type == "Text to Audio":
-            audio_path = text_to_audio(input_text)
-            image_path = generate_image(f"text-to-audio conversion for {input_text}", style)
-            return "Audio Generated", audio_path, image_path
-        elif task_type == "Video Generation":
-            video_path = generate_video(input_text)
-            audio_path = generate_music(input_text)
-            return "Video Generated", audio_path, video_path
-    except Exception as e:
-        print("Main interface error:", e)
-        return f"Error: {e}", None, None
-# Gradio app
-interface = gr.Interface(
-    fn=main_interface,
     inputs=[
-        gr.Textbox(label="Enter Text or Prompt"),
-        gr.Radio(["Conversation", "Music", "Text to Audio", "Video Generation"], label="Select Task"),
-        gr.Dropdown(["realistic", "abstract", "comic"], label="Select Style"),
     ],
     outputs=[
-        gr.Textbox(label="Generated Output"),
-        gr.Audio(label="Generated Audio", type="filepath"),
-        gr.Image(label="Generated Image", type="filepath"),
     ],
-    live=False,
 )
-interface.launch()

 import torch
 from audiocraft.models import MusicGen
 from transformers import GPT2LMHeadModel, GPT2Tokenizer
+import pyttsx3
 import gradio as gr
 from tempfile import NamedTemporaryFile
 import numpy as np
 from PIL import Image
 import os
+# Ensure CPU-only
+device = torch.device("cpu")
+# Load MusicGen (small) on CPU
 music_model = MusicGen.get_pretrained("small", device=device)
+# Load GPT-2 on CPU
 tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
 gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
+# Load Stable Diffusion CPU-only
 pipe = StableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    torch_dtype=torch.float32
 ).to(device)
+# Initialize pyttsx3 TTS
+tts_engine = pyttsx3.init()
+tts_engine.setProperty("rate", 150)
+tts_engine.setProperty("volume", 0.8)
 def get_emotion_tone(text):
+    txt = text.lower()
+    if any(w in txt for w in ["happy", "joy", "excited"]):
         return "happy"
+    if any(w in txt for w in ["sad", "down", "melancholy"]):
         return "sad"
+    if any(w in txt for w in ["angry", "frustrated"]):
         return "angry"
+    return "neutral"
 def generate_image(prompt, style="realistic"):
+    styled = f"{style} style {prompt}"
     try:
+        img = pipe(styled).images[0]
+        tmp = NamedTemporaryFile(delete=False, suffix=".png")
+        img.save(tmp.name)
+        return tmp.name
     except Exception as e:
         print("Image error:", e)
         return None
 def text_to_audio(text):
+    tone = get_emotion_tone(text)
+    # adjust rate/volume by tone
+    rate = {"neutral":150, "happy":180, "sad":100, "angry":200}[tone]
+    vol  = {"neutral":0.8,   "happy":1.0,   "sad":0.5,   "angry":1.0}[tone]
+    tts_engine.setProperty("rate", rate)
+    tts_engine.setProperty("volume", vol)
+    tmp = NamedTemporaryFile(delete=False, suffix=".mp3")
+    tts_engine.save_to_file(text, tmp.name)
+    tts_engine.runAndWait()
+    return tmp.name
 def generate_music(prompt):
     try:
+        wav = music_model.generate([prompt])  # shape [1, 1, T]
+        data = wav.cpu().numpy()[0,0]
+        tmp = NamedTemporaryFile(delete=False, suffix=".wav")
+        wavfile.write(tmp.name, music_model.sample_rate, data)
+        return tmp.name
     except Exception as e:
         print("Music error:", e)
         return None
 def generate_spectrogram(audio_path):
     try:
         y, sr = librosa.load(audio_path, sr=None)
         S = librosa.feature.melspectrogram(y, sr=sr)
+        S_db = librosa.power_to_db(S, ref=np.max)
+        plt.figure(figsize=(6,3))
+        librosa.display.specshow(S_db, sr=sr, x_axis='time', y_axis='mel')
+        plt.title("Mel Spectrogram")
+        tmp = NamedTemporaryFile(delete=False, suffix=".png")
+        plt.savefig(tmp.name, bbox_inches="tight")
         plt.close()
+        return tmp.name
     except Exception as e:
         print("Spectrogram error:", e)
         return None
+def chat_with_ai(text):
     try:
+        tok = tokenizer.encode(text, return_tensors="pt").to(device)
+        out = gpt2_model.generate(tok, max_length=50)
+        return tokenizer.decode(out[0], skip_special_tokens=True)
     except Exception as e:
         print("Chat error:", e)
+        return "Error generating response."
 def generate_video(prompt):
     frames = []
     for i in range(5):
+        path = generate_image(f"{prompt} frame {i+1}")
+        if path:
+            frames.append(Image.open(path))
+    if not frames:
+        return None
+    tmp = NamedTemporaryFile(delete=False, suffix=".gif")
+    frames[0].save(tmp.name, save_all=True, append_images=frames[1:], duration=400, loop=0)
+    return tmp.name
+def main(input_text, task, style):
+    if task=="Conversation":
+        resp = chat_with_ai(input_text)
+        img  = generate_image(f"conversation about {input_text}", style)
+        return resp, None, img
+    if task=="Music":
+        mus = generate_music(input_text)
+        spec = generate_spectrogram(mus) if mus else None
+        return "Music ready", mus, spec
+    if task=="Text to Audio":
+        aud = text_to_audio(input_text)
+        img = generate_image(f"audio for {input_text}", style)
+        return "Audio ready", aud, img
+    if task=="Video Generation":
+        vid = generate_video(input_text)
+        aud = generate_music(input_text)
+        return "Video ready", aud, vid
+iface = gr.Interface(
+    fn=main,
     inputs=[
+        gr.Textbox(label="Enter Prompt"),
+        gr.Radio(["Conversation","Music","Text to Audio","Video Generation"], label="Task"),
+        gr.Dropdown(["realistic","abstract","comic"], label="Style")
     ],
     outputs=[
+        gr.Textbox(label="Output Text"),
+        gr.Audio(label="Audio File", type="filepath"),
+        gr.Image(label="Image/GIF", type="filepath")
     ],
+    live=False
 )
+if __name__=="__main__":
+    iface.launch()