arshad1234321 commited on
Commit
1678182
·
verified ·
1 Parent(s): 4f4130a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -113
app.py CHANGED
@@ -1,8 +1,7 @@
1
- import sys, os
2
  import torch
3
  from audiocraft.models import MusicGen
4
  from transformers import GPT2LMHeadModel, GPT2Tokenizer
5
- from gtts import gTTS
6
  import gradio as gr
7
  from tempfile import NamedTemporaryFile
8
  import numpy as np
@@ -11,150 +10,148 @@ from diffusers import StableDiffusionPipeline
11
  import matplotlib.pyplot as plt
12
  import librosa.display
13
  import librosa
 
14
  from PIL import Image
 
15
 
16
- # 1) Startup logs
17
- print("=== STARTUP ===")
18
- print("Python:", sys.version.replace('\n',' '))
19
- print("Torch:", torch.__version__)
20
- print("Device: CPU")
21
-
22
- # 2) Force CPU
23
- device = torch.device("cpu")
24
-
25
- # 3) Load MusicGen
26
- print("Loading MusicGen…")
27
  music_model = MusicGen.get_pretrained("small", device=device)
28
- print("MusicGen loaded.")
29
 
30
- # 4) Load GPT-2
31
- print("Loading GPT-2…")
32
  tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
33
  gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
34
- print("GPT-2 loaded.")
35
 
36
- # 5) Load Stable Diffusion (CPU-safe)
37
- print("Loading Stable Diffusion…")
38
- pipe = StableDiffusionPipeline.from_pretrained(
39
- "runwayml/stable-diffusion-v1-5",
40
- torch_dtype=torch.float32
41
- ).to(device)
42
- print("Stable Diffusion loaded.")
43
 
44
- # Emotion helper
45
  def get_emotion_tone(text):
46
- txt = text.lower()
47
- if any(w in txt for w in ["happy","joy","excited"]): return "happy"
48
- if any(w in txt for w in ["sad","down","melancholy"]): return "sad"
49
- if any(w in txt for w in ["angry","frustrated"]): return "angry"
50
- return "neutral"
 
 
 
51
 
52
- # Image generation
53
  def generate_image(prompt, style="realistic"):
54
- styled = f"{style} style {prompt}"
55
  try:
56
- img = pipe(styled).images[0]
57
- tmp = NamedTemporaryFile(delete=False, suffix=".png")
58
- img.save(tmp.name)
59
- return tmp.name
60
  except Exception as e:
61
- print("Image error:", e)
62
- return None
63
 
64
- # Text-to-audio via gTTS
65
  def text_to_audio(text):
66
- try:
67
- tts = gTTS(text=text, lang="en")
68
- tmp = NamedTemporaryFile(delete=False, suffix=".mp3")
69
- tts.save(tmp.name)
70
- return tmp.name
71
- except Exception as e:
72
- print("TTS error:", e)
73
- return None
74
 
75
- # Music generation
 
 
 
 
 
76
  def generate_music(prompt):
77
  try:
78
- wav = music_model.generate([prompt]) # [1,1,T]
79
- data = wav.cpu().numpy()[0,0]
80
- tmp = NamedTemporaryFile(delete=False, suffix=".wav")
81
- wavfile.write(tmp.name, music_model.sample_rate, data)
82
- return tmp.name
 
83
  except Exception as e:
84
- print("Music error:", e)
85
- return None
86
 
87
- # Spectrogram
88
  def generate_spectrogram(audio_path):
89
  try:
90
- y,sr = librosa.load(audio_path, sr=None)
91
  S = librosa.feature.melspectrogram(y, sr=sr)
92
- S_db = librosa.power_to_db(S, ref=np.max)
93
- plt.figure(figsize=(6,3))
94
- librosa.display.specshow(S_db, sr=sr, x_axis='time', y_axis='mel')
95
- tmp = NamedTemporaryFile(delete=False, suffix=".png")
96
- plt.savefig(tmp.name, bbox_inches="tight")
 
 
 
97
  plt.close()
98
- return tmp.name
99
  except Exception as e:
100
- print("Spectrogram error:", e)
101
- return None
102
 
103
- # GPT-2 chat
104
- def chat_with_ai(text):
105
  try:
106
- tok = tokenizer.encode(text, return_tensors="pt").to(device)
107
- out = gpt2_model.generate(tok, max_length=50)
108
- return tokenizer.decode(out[0], skip_special_tokens=True)
 
109
  except Exception as e:
110
- print("Chat error:", e)
111
- return "Error generating response."
112
 
113
- # GIF video
114
  def generate_video(prompt):
115
- frames=[]
116
- for i in range(5):
117
- p = generate_image(f"{prompt} frame {i+1}")
118
- if p: frames.append(Image.open(p))
119
- if not frames: return None
120
- tmp = NamedTemporaryFile(delete=False, suffix=".gif")
121
- frames[0].save(tmp.name, save_all=True, append_images=frames[1:], duration=400, loop=0)
122
- return tmp.name
123
-
124
- # Main interface
125
- def main(input_text, task, style):
126
- if task=="Conversation":
127
- resp = chat_with_ai(input_text)
128
- img = generate_image(f"conversation about {input_text}", style)
129
- return resp, None, img
130
- if task=="Music":
131
- mus = generate_music(input_text)
132
- spec = generate_spectrogram(mus) if mus else None
133
- return "Music ready", mus, spec
134
- if task=="Text to Audio":
135
- aud = text_to_audio(input_text)
136
- img = generate_image(f"audio for {input_text}", style)
137
- return "Audio ready", aud, img
138
- if task=="Video Generation":
139
- vid = generate_video(input_text)
140
- aud = generate_music(input_text)
141
- return "Video ready", aud, vid
142
-
143
- # Launch with debug logging
144
- iface = gr.Interface(
145
- fn=main,
 
 
 
 
 
 
 
146
  inputs=[
147
- gr.Textbox(label="Enter Prompt"),
148
- gr.Radio(["Conversation","Music","Text to Audio","Video Generation"], label="Task"),
149
- gr.Dropdown(["realistic","abstract","comic"], label="Style"),
150
  ],
151
  outputs=[
152
- gr.Textbox(label="Output Text"),
153
- gr.Audio(label="Audio File", type="filepath"),
154
- gr.Image(label="Image/GIF", type="filepath"),
155
  ],
 
156
  )
157
 
158
- if __name__=="__main__":
159
- print("Launching Gradio…")
160
- iface.launch(server_name="0.0.0.0", server_port=7860, debug=True)
 
 
1
  import torch
2
  from audiocraft.models import MusicGen
3
  from transformers import GPT2LMHeadModel, GPT2Tokenizer
4
+ import pyttsx3
5
  import gradio as gr
6
  from tempfile import NamedTemporaryFile
7
  import numpy as np
 
10
  import matplotlib.pyplot as plt
11
  import librosa.display
12
  import librosa
13
+ import soundfile as sf
14
  from PIL import Image
15
+ import os
16
 
17
# Set device: prefer CUDA when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# MusicGen — "small" checkpoint for text-to-music generation.
music_model = MusicGen.get_pretrained("small", device=device)

# GPT-2 for conversation.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

# Stable Diffusion for image generation.
# BUG FIX: half-precision weights are unsupported by most CPU ops, so
# unconditionally loading with torch_dtype=torch.float16 crashes on CPU-only
# hosts. Use float16 only on CUDA, float32 otherwise.
sd_dtype = torch.float16 if device.type == "cuda" else torch.float32
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=sd_dtype
)
pipe = pipe.to(device)
 
 
 
 
29
 
30
# Emotion detection for Text-to-Audio
def get_emotion_tone(text):
    """Classify *text* into a coarse emotion bucket via keyword matching.

    Checks happy, then sad, then angry keywords; the first bucket with a
    matching substring wins. Returns "happy", "sad", "angry" or "neutral".
    """
    lowered = text.lower()
    keyword_buckets = (
        ("happy", ("happy", "joy", "excited")),
        ("sad", ("sad", "down", "melancholy")),
        ("angry", ("angry", "frustrated")),
    )
    for tone, keywords in keyword_buckets:
        if any(word in lowered for word in keywords):
            return tone
    return "neutral"
40
 
41
# Image generation using Stable Diffusion
def generate_image(prompt, style="realistic"):
    """Render *prompt* in the given *style* with Stable Diffusion.

    Returns the path of a temporary PNG file, or None on failure.
    """
    styled_prompt = f"{style} style {prompt}"
    try:
        image = pipe(styled_prompt).images[0]
        temp_image = NamedTemporaryFile(delete=False, suffix=".png")
        image.save(temp_image.name)
        return temp_image.name
    except Exception as e:
        # BUG FIX: previously returned the error message string, which callers
        # (generate_video, gr.Image type="filepath") then treated as an image
        # file path and crashed on. Log and return None instead.
        print(f"Error generating image: {e}")
        return None
 
51
 
52
# Convert Text to Audio with Emotion
def text_to_audio(text):
    """Speak *text* with pyttsx3, tuning rate/volume to the detected emotion.

    Returns the path of a temporary audio file, or None on failure.
    """
    try:
        emotion = get_emotion_tone(text)
        engine = pyttsx3.init()
        # Faster/louder for happy and angry, slower/quieter for sad.
        rates = {"happy": 180, "sad": 100, "angry": 200}
        volumes = {"happy": 1.0, "angry": 1.0, "sad": 0.5}
        engine.setProperty('rate', rates.get(emotion, 150))
        engine.setProperty('volume', volumes.get(emotion, 0.8))

        # BUG FIX: pyttsx3's save_to_file writes driver-native audio (WAV on
        # Windows/espeak, AIFF on macOS), never MP3 — the old ".mp3" suffix
        # misled downstream decoders. Use ".wav" instead.
        temp_file = NamedTemporaryFile(delete=False, suffix=".wav")
        engine.save_to_file(text, temp_file.name)
        engine.runAndWait()
        return temp_file.name
    except Exception as e:
        # BUG FIX: speech synthesis had no error handling and would crash the
        # whole request; degrade gracefully like the other generators.
        print(f"Error generating speech: {e}")
        return None
63
+
64
# Music generation using MusicGen
def generate_music(prompt):
    """Generate a short music clip from *prompt* with MusicGen.

    Returns the path of a temporary WAV file, or None on failure.
    """
    try:
        # MusicGen.generate returns a [batch, channels, samples] tensor;
        # take the single mono waveform.
        wav = music_model.generate([prompt])
        audio_data = wav.cpu().numpy()[0, 0]
        temp_file = NamedTemporaryFile(delete=False, suffix=".wav")
        # BUG FIX: the body used `wavfile.write`, but this revision imports
        # `soundfile as sf` (and `wavfile` is not visibly in scope). soundfile
        # also handles float32 [-1, 1] data natively.
        sf.write(temp_file.name, audio_data, music_model.sample_rate)
        return temp_file.name
    except Exception as e:
        # BUG FIX: previously returned the error string, which callers then
        # passed to librosa/gr.Audio as a file path. Return None instead.
        print(f"Error generating music: {e}")
        return None
 
75
 
76
# Spectrogram generation from audio
def generate_spectrogram(audio_path):
    """Plot a mel spectrogram of the audio file at *audio_path*.

    Returns the path of a temporary PNG file, or None on failure.
    """
    try:
        y, sr = librosa.load(audio_path, sr=None)
        # BUG FIX: librosa >= 0.10 removed positional arguments here; the old
        # call `melspectrogram(y, sr=sr)` raises a TypeError.
        S = librosa.feature.melspectrogram(y=y, sr=sr)
        S_dB = librosa.power_to_db(S, ref=np.max)

        plt.figure(figsize=(10, 4))
        plt.title('Mel-frequency spectrogram')
        librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel', cmap='coolwarm')
        plt.colorbar(format='%+2.0f dB')
        temp_image = NamedTemporaryFile(delete=False, suffix=".png")
        plt.savefig(temp_image.name)
        plt.close()
        return temp_image.name
    except Exception as e:
        # BUG FIX: previously returned the error string, which gr.Image then
        # treated as an image file path. Return None instead.
        print(f"Error generating spectrogram: {e}")
        return None
 
93
 
94
# Chat with AI (GPT-2)
def chat_with_ai(user_input):
    """Generate a GPT-2 continuation of *user_input*.

    Returns the decoded text (which includes the prompt), or an error
    message string on failure — the caller displays it in a Textbox.
    """
    try:
        inputs = tokenizer.encode(user_input, return_tensors="pt").to(device)
        # GPT-2 defines no pad token; pass eos_token_id explicitly so
        # generate() does not warn (or fail when padding is needed).
        outputs = gpt2_model.generate(
            inputs,
            max_length=50,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
        )
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        return f"Error in chat generation: {e}"
 
103
 
104
# Simulate Video Generation using a Sequence of Images
def generate_video(prompt):
    """Build a 5-frame animated GIF by generating one image per frame prompt.

    Returns the path of a temporary GIF file, or None if no frame rendered.
    """
    frames = []
    for i in range(5):  # Generate 5 frames as a sequence
        frame_path = generate_image(f"{prompt} frame {i+1}")
        # BUG FIX: generate_image can fail; only open paths that exist
        # instead of crashing in Image.open on a None/invalid value.
        if frame_path and os.path.exists(frame_path):
            frames.append(Image.open(frame_path))

    # BUG FIX: frames[0] raised IndexError when every frame failed.
    if not frames:
        return None
    temp_video = NamedTemporaryFile(delete=False, suffix=".gif")
    frames[0].save(temp_video.name, save_all=True, append_images=frames[1:], duration=500, loop=0)
    return temp_video.name
115
+
116
# Main interface logic
def main_interface(input_text, task_type, style):
    """Dispatch *input_text* to the selected task.

    Returns a (text, audio_path, image_path) triple matching the three
    Gradio output components.
    """
    try:
        if task_type == "Conversation":
            response = chat_with_ai(input_text)
            image_path = generate_image(f"conversation about {input_text}", style)
            return response, None, image_path

        if task_type == "Music":
            audio_path = generate_music(input_text)
            # Guard: music generation may fail, leaving no file to analyse.
            spectrogram_path = generate_spectrogram(audio_path) if audio_path else None
            return "Music Generated", audio_path, spectrogram_path

        if task_type == "Text to Audio":
            audio_path = text_to_audio(input_text)
            image_path = generate_image(f"text-to-audio conversion for {input_text}", style)
            return "Audio Generated", audio_path, image_path

        if task_type == "Video Generation":
            video_path = generate_video(input_text)
            audio_path = generate_music(input_text)
            return "Video Generated", audio_path, video_path

        # BUG FIX: an unrecognized task previously fell through and returned a
        # bare None, which Gradio cannot unpack into the three outputs.
        return "Unknown task selected.", None, None
    except Exception as e:
        return f"Error: {e}", None, None
140
+
141
# Gradio interface setup
interface = gr.Interface(
    fn=main_interface,
    inputs=[
        gr.Textbox(label="Enter Text or Prompt"),
        gr.Radio(["Conversation", "Music", "Text to Audio", "Video Generation"], label="Select Task"),
        gr.Dropdown(["realistic", "abstract", "comic"], label="Select Style"),
    ],
    outputs=[
        gr.Textbox(label="Generated Output"),
        gr.Audio(label="Generated Audio", type="filepath"),
        gr.Image(label="Generated Image", type="filepath"),
    ],
    live=False,
)

# Launch only when executed as a script, so importing this module (e.g. from
# a hosting wrapper or test harness) does not start a server as a side effect.
if __name__ == "__main__":
    interface.launch()