arshad1234321 committed on
Commit
2895c13
·
verified ·
1 Parent(s): 2f63663

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -96
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import torch
2
  from audiocraft.models import MusicGen
3
  from transformers import GPT2LMHeadModel, GPT2Tokenizer
4
- from TTS.api import TTS
5
  import gradio as gr
6
  from tempfile import NamedTemporaryFile
7
  import numpy as np
@@ -14,149 +14,140 @@ import soundfile as sf
14
  from PIL import Image
15
  import os
16
 
17
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
18
 
19
- # Load models
20
  music_model = MusicGen.get_pretrained("small", device=device)
21
 
 
22
  tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
23
  gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
24
 
25
- # Set torch_dtype to float32 for compatibility on CPU
26
  pipe = StableDiffusionPipeline.from_pretrained(
27
- "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float32
 
28
  ).to(device)
29
 
30
- tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False).to(device)
 
 
 
31
 
32
- # Emotion detection
33
  def get_emotion_tone(text):
34
- if any(word in text.lower() for word in ["happy", "joy", "excited"]):
 
35
  return "happy"
36
- elif any(word in text.lower() for word in ["sad", "down", "melancholy"]):
37
  return "sad"
38
- elif any(word in text.lower() for word in ["angry", "frustrated"]):
39
  return "angry"
40
- else:
41
- return "neutral"
42
 
43
- # Generate image
44
  def generate_image(prompt, style="realistic"):
45
- styled_prompt = f"{style} style {prompt}"
46
  try:
47
- image = pipe(styled_prompt).images[0]
48
- temp_image = NamedTemporaryFile(delete=False, suffix=".png")
49
- image.save(temp_image.name)
50
- return temp_image.name
51
  except Exception as e:
52
  print("Image error:", e)
53
  return None
54
 
55
- # Convert text to audio using TTS
56
  def text_to_audio(text):
57
- emotion = get_emotion_tone(text)
58
- temp_file = NamedTemporaryFile(delete=False, suffix=".wav")
59
- tts.tts_to_file(text=text, file_path=temp_file.name)
60
- return temp_file.name
 
 
 
 
 
 
61
 
62
- # Generate music
63
  def generate_music(prompt):
64
  try:
65
- wav = music_model.generate([prompt])
66
- temp_file = NamedTemporaryFile(delete=False, suffix=".wav")
67
- audio_data = wav.cpu().numpy()
68
- wavfile.write(temp_file.name, music_model.sample_rate, audio_data[0, 0])
69
- return temp_file.name
70
  except Exception as e:
71
  print("Music error:", e)
72
  return None
73
 
74
- # Generate spectrogram
75
  def generate_spectrogram(audio_path):
76
  try:
77
  y, sr = librosa.load(audio_path, sr=None)
78
  S = librosa.feature.melspectrogram(y, sr=sr)
79
- S_dB = librosa.power_to_db(S, ref=np.max)
80
-
81
- plt.figure(figsize=(10, 4))
82
- librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel', cmap='coolwarm')
83
- plt.colorbar(format='%+2.0f dB')
84
- plt.title('Mel-frequency spectrogram')
85
- temp_image = NamedTemporaryFile(delete=False, suffix=".png")
86
- plt.savefig(temp_image.name)
87
  plt.close()
88
- return temp_image.name
89
  except Exception as e:
90
  print("Spectrogram error:", e)
91
  return None
92
 
93
- # GPT-2 chatbot
94
- def chat_with_ai(user_input):
95
  try:
96
- inputs = tokenizer.encode(user_input, return_tensors="pt").to(device)
97
- outputs = gpt2_model.generate(inputs, max_length=60, num_return_sequences=1)
98
- response = tokenizer.decode(outputs[0], skip_special_tokens=True)
99
- return response
100
  except Exception as e:
101
  print("Chat error:", e)
102
- return "Sorry, I couldn't respond."
103
 
104
- # Generate gif video
105
  def generate_video(prompt):
106
  frames = []
107
  for i in range(5):
108
- frame_prompt = f"{prompt} frame {i+1}"
109
- frame_path = generate_image(frame_prompt)
110
- if frame_path:
111
- frames.append(Image.open(frame_path))
112
-
113
- if frames:
114
- temp_video = NamedTemporaryFile(delete=False, suffix=".gif")
115
- frames[0].save(temp_video.name, save_all=True, append_images=frames[1:], duration=500, loop=0)
116
- return temp_video.name
117
- return None
118
-
119
- # Main interface
120
- def main_interface(input_text, task_type, style):
121
- try:
122
- if task_type == "Conversation":
123
- response = chat_with_ai(input_text)
124
- image_path = generate_image(f"conversation about {input_text}", style)
125
- return response, None, image_path
126
-
127
- elif task_type == "Music":
128
- audio_path = generate_music(input_text)
129
- spectrogram_path = generate_spectrogram(audio_path)
130
- return "Music Generated", audio_path, spectrogram_path
131
-
132
- elif task_type == "Text to Audio":
133
- audio_path = text_to_audio(input_text)
134
- image_path = generate_image(f"text-to-audio conversion for {input_text}", style)
135
- return "Audio Generated", audio_path, image_path
136
-
137
- elif task_type == "Video Generation":
138
- video_path = generate_video(input_text)
139
- audio_path = generate_music(input_text)
140
- return "Video Generated", audio_path, video_path
141
-
142
- except Exception as e:
143
- print("Main interface error:", e)
144
- return f"Error: {e}", None, None
145
-
146
- # Gradio app
147
- interface = gr.Interface(
148
- fn=main_interface,
149
  inputs=[
150
- gr.Textbox(label="Enter Text or Prompt"),
151
- gr.Radio(["Conversation", "Music", "Text to Audio", "Video Generation"], label="Select Task"),
152
- gr.Dropdown(["realistic", "abstract", "comic"], label="Select Style"),
153
  ],
154
  outputs=[
155
- gr.Textbox(label="Generated Output"),
156
- gr.Audio(label="Generated Audio", type="filepath"),
157
- gr.Image(label="Generated Image", type="filepath"),
158
  ],
159
- live=False,
160
  )
161
 
162
- interface.launch()
 
 
1
  import torch
2
  from audiocraft.models import MusicGen
3
  from transformers import GPT2LMHeadModel, GPT2Tokenizer
4
+ import pyttsx3
5
  import gradio as gr
6
  from tempfile import NamedTemporaryFile
7
  import numpy as np
 
14
  from PIL import Image
15
  import os
16
 
# Every model below is placed on the CPU; no GPU is used.
device = torch.device("cpu")

# MusicGen "small" checkpoint for the music task.
music_model = MusicGen.get_pretrained("small", device=device)

# GPT-2 tokenizer and language model for the conversation task.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
gpt2_model = gpt2_model.to(device)

# Stable Diffusion v1.5 pipeline, loaded with float32 weights.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float32,
)
pipe = pipe.to(device)

# Shared pyttsx3 engine with its starting speech rate and volume.
tts_engine = pyttsx3.init()
tts_engine.setProperty("rate", 150)
tts_engine.setProperty("volume", 0.8)
37
 
 
def get_emotion_tone(text):
    """Classify *text* as "happy", "sad", "angry" or "neutral" by keyword.

    Keywords are matched as whole words, case-insensitively, so that
    "download" does not count as "down" and "crusade" does not count as
    "sad". Tones are checked in the order happy, sad, angry; the first tone
    with a matching keyword wins.

    Args:
        text: Input string. ``None`` or an empty string yields "neutral".

    Returns:
        One of "happy", "sad", "angry", "neutral".
    """
    import re  # local import keeps this block self-contained

    words = set(re.findall(r"[a-z]+", (text or "").lower()))
    tone_keywords = (
        ("happy", ("happy", "joy", "excited")),
        ("sad", ("sad", "down", "melancholy")),
        ("angry", ("angry", "frustrated")),
    )
    for tone, keywords in tone_keywords:
        if words.intersection(keywords):
            return tone
    return "neutral"
 
47
 
 
def generate_image(prompt, style="realistic"):
    """Render *prompt* with the module-level diffusion pipeline as a PNG.

    The text sent to the pipeline is ``"<style> style <prompt>"``.

    Args:
        prompt: Text description of the image.
        style: Style keyword placed in front of the prompt.

    Returns:
        Path of the saved temporary ``.png`` file, or ``None`` when
        generation or saving fails (the error is printed).
    """
    styled = f"{style} style {prompt}"
    try:
        img = pipe(styled).images[0]
        # Reserve a path and close the handle immediately: keeping it open
        # leaks a descriptor per call and blocks re-opening on Windows.
        with NamedTemporaryFile(delete=False, suffix=".png") as tmp:
            out_path = tmp.name
        img.save(out_path)
        return out_path
    except Exception as e:
        print("Image error:", e)
        return None
58
 
 
def text_to_audio(text):
    """Synthesize *text* to a temporary audio file with the pyttsx3 engine.

    Speech rate and volume are chosen from the tone reported by
    ``get_emotion_tone``.

    Args:
        text: The text to speak.

    Returns:
        Path of the temporary audio file, or ``None`` when synthesis fails
        (the error is printed, matching the other generators in this file).
    """
    rates = {"neutral": 150, "happy": 180, "sad": 100, "angry": 200}
    volumes = {"neutral": 0.8, "happy": 1.0, "sad": 0.5, "angry": 1.0}
    try:
        tone = get_emotion_tone(text)
        tts_engine.setProperty("rate", rates[tone])
        tts_engine.setProperty("volume", volumes[tone])
        # Close the handle before the engine writes: an open handle leaks a
        # descriptor and prevents the engine from opening the path on Windows.
        # NOTE(review): the ".mp3" suffix is kept from the original; the
        # pyttsx3 driver may write a different container (e.g. WAV) — confirm.
        with NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            out_path = tmp.name
        tts_engine.save_to_file(text, out_path)
        tts_engine.runAndWait()
        return out_path
    except Exception as e:
        print("TTS error:", e)
        return None
70
 
 
def generate_music(prompt):
    """Generate a music clip for *prompt* and write it to a WAV file.

    Args:
        prompt: Text description passed to the module-level MusicGen model.

    Returns:
        Path of the temporary ``.wav`` file, or ``None`` when generation or
        writing fails (the error is printed).
    """
    try:
        wav = music_model.generate([prompt])
        # Take the first item of the batch and its first channel.
        data = wav.cpu().numpy()[0, 0]
        # Close the handle before writing by name so no descriptor is leaked.
        with NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            out_path = tmp.name
        wavfile.write(out_path, music_model.sample_rate, data)
        return out_path
    except Exception as e:
        print("Music error:", e)
        return None
81
 
 
def generate_spectrogram(audio_path):
    """Plot the mel spectrogram of an audio file and save it as a PNG.

    Args:
        audio_path: Path of the audio file to analyse; it is loaded at its
            native sample rate (``sr=None``).

    Returns:
        Path of the temporary ``.png`` file, or ``None`` when loading or
        plotting fails (the error is printed).
    """
    try:
        y, sr = librosa.load(audio_path, sr=None)
        # Pass the signal by keyword: librosa >= 0.10 no longer accepts it
        # positionally, which made every call fall into the except branch.
        S = librosa.feature.melspectrogram(y=y, sr=sr)
        S_db = librosa.power_to_db(S, ref=np.max)
        with NamedTemporaryFile(delete=False, suffix=".png") as tmp:
            out_path = tmp.name
        fig = plt.figure(figsize=(6, 3))
        try:
            librosa.display.specshow(S_db, sr=sr, x_axis='time', y_axis='mel')
            plt.title("Mel Spectrogram")
            plt.savefig(out_path, bbox_inches="tight")
        finally:
            # Always release the figure, even when plotting raises.
            plt.close(fig)
        return out_path
    except Exception as e:
        print("Spectrogram error:", e)
        return None
97
 
def chat_with_ai(text):
    """Continue *text* with the module-level GPT-2 model.

    Args:
        text: The user's message, used as the generation prompt.

    Returns:
        The decoded sequence (prompt plus continuation), or
        "Error generating response." when generation fails.
    """
    try:
        tok = tokenizer.encode(text, return_tensors="pt").to(device)
        out = gpt2_model.generate(
            tok,
            # Budget the reply itself: max_length also counted the prompt, so
            # prompts of about 50 tokens left no room for any new text.
            max_new_tokens=50,
            # GPT-2 defines no pad token; reuse EOS so generate() has one.
            pad_token_id=tokenizer.eos_token_id,
        )
        return tokenizer.decode(out[0], skip_special_tokens=True)
    except Exception as e:
        print("Chat error:", e)
        return "Error generating response."
106
 
 
def generate_video(prompt):
    """Build a five-frame animated GIF from images generated for *prompt*.

    Each frame is produced by ``generate_image`` with the prompt
    ``"<prompt> frame <n>"``; frames that fail to generate are skipped.

    Args:
        prompt: Text description shared by all frames.

    Returns:
        Path of the temporary ``.gif`` file, or ``None`` when no frame could
        be generated.
    """
    frames = []
    try:
        for i in range(5):
            path = generate_image(f"{prompt} frame {i+1}")
            if path:
                frames.append(Image.open(path))
        if not frames:
            return None
        # Close the handle before saving by name so no descriptor is leaked.
        with NamedTemporaryFile(delete=False, suffix=".gif") as tmp:
            out_path = tmp.name
        frames[0].save(
            out_path,
            save_all=True,
            append_images=frames[1:],
            duration=400,
            loop=0,
        )
        return out_path
    finally:
        # Release the frame files whether or not the GIF was written.
        for frame in frames:
            frame.close()
118
+
def main(input_text, task, style):
    """Dispatch a UI request to the generator for the selected task.

    Args:
        input_text: The prompt typed by the user.
        task: One of "Conversation", "Music", "Text to Audio",
            "Video Generation".
        style: Image style keyword; falls back to "realistic" when empty.

    Returns:
        A ``(text, audio_path, image_path)`` tuple. A tuple is returned on
        every path, including an empty prompt or an unsupported task, so the
        three output components always receive a value.
    """
    if not input_text or not input_text.strip():
        return "Please enter a prompt.", None, None
    style = style or "realistic"

    if task == "Conversation":
        resp = chat_with_ai(input_text)
        img = generate_image(f"conversation about {input_text}", style)
        return resp, None, img
    if task == "Music":
        mus = generate_music(input_text)
        # Only analyse the clip when music generation actually succeeded.
        spec = generate_spectrogram(mus) if mus else None
        return "Music ready", mus, spec
    if task == "Text to Audio":
        aud = text_to_audio(input_text)
        img = generate_image(f"audio for {input_text}", style)
        return "Audio ready", aud, img
    if task == "Video Generation":
        vid = generate_video(input_text)
        aud = generate_music(input_text)
        return "Video ready", aud, vid

    # No task selected, or a value the UI does not offer.
    return f"Unsupported task: {task}", None, None
136
+
# Gradio UI: one prompt box plus task and style selectors, mapped onto main().
# Task and style start with a selected value so main() never receives None
# for them when the user submits without touching the selectors.
iface = gr.Interface(
    fn=main,
    inputs=[
        gr.Textbox(label="Enter Prompt"),
        gr.Radio(
            ["Conversation", "Music", "Text to Audio", "Video Generation"],
            label="Task",
            value="Conversation",
        ),
        gr.Dropdown(
            ["realistic", "abstract", "comic"],
            label="Style",
            value="realistic",
        ),
    ],
    outputs=[
        gr.Textbox(label="Output Text"),
        gr.Audio(label="Audio File", type="filepath"),
        gr.Image(label="Image/GIF", type="filepath"),
    ],
    live=False,
)

# Start the web server only when the file is run as a script.
if __name__ == "__main__":
    iface.launch()