arshad1234321 committed on
Commit
35cfc08
·
verified ·
1 Parent(s): 2895c13

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -33
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import torch
2
  from audiocraft.models import MusicGen
3
  from transformers import GPT2LMHeadModel, GPT2Tokenizer
@@ -10,41 +11,52 @@ from diffusers import StableDiffusionPipeline
10
  import matplotlib.pyplot as plt
11
  import librosa.display
12
  import librosa
13
- import soundfile as sf
14
  from PIL import Image
15
  import os
16
 
17
- # Ensure CPU-only
 
 
 
 
 
 
18
  device = torch.device("cpu")
19
 
20
- # Load MusicGen (small) on CPU
 
21
  music_model = MusicGen.get_pretrained("small", device=device)
 
22
 
23
- # Load GPT-2 on CPU
 
24
  tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
25
  gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
 
26
 
27
- # Load Stable Diffusion CPU-only
 
28
  pipe = StableDiffusionPipeline.from_pretrained(
29
- "runwayml/stable-diffusion-v1-5",
30
- torch_dtype=torch.float32
31
  ).to(device)
 
32
 
33
- # Initialize pyttsx3 TTS
 
34
  tts_engine = pyttsx3.init()
35
  tts_engine.setProperty("rate", 150)
36
  tts_engine.setProperty("volume", 0.8)
 
37
 
 
38
  def get_emotion_tone(text):
39
  txt = text.lower()
40
- if any(w in txt for w in ["happy", "joy", "excited"]):
41
- return "happy"
42
- if any(w in txt for w in ["sad", "down", "melancholy"]):
43
- return "sad"
44
- if any(w in txt for w in ["angry", "frustrated"]):
45
- return "angry"
46
  return "neutral"
47
 
 
48
  def generate_image(prompt, style="realistic"):
49
  styled = f"{style} style {prompt}"
50
  try:
@@ -56,21 +68,22 @@ def generate_image(prompt, style="realistic"):
56
  print("Image error:", e)
57
  return None
58
 
 
59
  def text_to_audio(text):
60
  tone = get_emotion_tone(text)
61
- # adjust rate/volume by tone
62
- rate = {"neutral":150, "happy":180, "sad":100, "angry":200}[tone]
63
- vol = {"neutral":0.8, "happy":1.0, "sad":0.5, "angry":1.0}[tone]
64
- tts_engine.setProperty("rate", rate)
65
- tts_engine.setProperty("volume", vol)
66
  tmp = NamedTemporaryFile(delete=False, suffix=".mp3")
67
  tts_engine.save_to_file(text, tmp.name)
68
  tts_engine.runAndWait()
69
  return tmp.name
70
 
 
71
  def generate_music(prompt):
72
  try:
73
- wav = music_model.generate([prompt]) # shape [1, 1, T]
74
  data = wav.cpu().numpy()[0,0]
75
  tmp = NamedTemporaryFile(delete=False, suffix=".wav")
76
  wavfile.write(tmp.name, music_model.sample_rate, data)
@@ -79,14 +92,14 @@ def generate_music(prompt):
79
  print("Music error:", e)
80
  return None
81
 
 
82
  def generate_spectrogram(audio_path):
83
  try:
84
- y, sr = librosa.load(audio_path, sr=None)
85
  S = librosa.feature.melspectrogram(y, sr=sr)
86
  S_db = librosa.power_to_db(S, ref=np.max)
87
  plt.figure(figsize=(6,3))
88
  librosa.display.specshow(S_db, sr=sr, x_axis='time', y_axis='mel')
89
- plt.title("Mel Spectrogram")
90
  tmp = NamedTemporaryFile(delete=False, suffix=".png")
91
  plt.savefig(tmp.name, bbox_inches="tight")
92
  plt.close()
@@ -95,6 +108,7 @@ def generate_spectrogram(audio_path):
95
  print("Spectrogram error:", e)
96
  return None
97
 
 
98
  def chat_with_ai(text):
99
  try:
100
  tok = tokenizer.encode(text, return_tensors="pt").to(device)
@@ -104,25 +118,25 @@ def chat_with_ai(text):
104
  print("Chat error:", e)
105
  return "Error generating response."
106
 
 
107
  def generate_video(prompt):
108
- frames = []
109
  for i in range(5):
110
- path = generate_image(f"{prompt} frame {i+1}")
111
- if path:
112
- frames.append(Image.open(path))
113
- if not frames:
114
- return None
115
  tmp = NamedTemporaryFile(delete=False, suffix=".gif")
116
  frames[0].save(tmp.name, save_all=True, append_images=frames[1:], duration=400, loop=0)
117
  return tmp.name
118
 
 
119
  def main(input_text, task, style):
120
  if task=="Conversation":
121
  resp = chat_with_ai(input_text)
122
  img = generate_image(f"conversation about {input_text}", style)
123
  return resp, None, img
124
  if task=="Music":
125
- mus = generate_music(input_text)
126
  spec = generate_spectrogram(mus) if mus else None
127
  return "Music ready", mus, spec
128
  if task=="Text to Audio":
@@ -134,20 +148,21 @@ def main(input_text, task, style):
134
  aud = generate_music(input_text)
135
  return "Video ready", aud, vid
136
 
 
137
  iface = gr.Interface(
138
  fn=main,
139
  inputs=[
140
  gr.Textbox(label="Enter Prompt"),
141
  gr.Radio(["Conversation","Music","Text to Audio","Video Generation"], label="Task"),
142
- gr.Dropdown(["realistic","abstract","comic"], label="Style")
143
  ],
144
  outputs=[
145
  gr.Textbox(label="Output Text"),
146
  gr.Audio(label="Audio File", type="filepath"),
147
- gr.Image(label="Image/GIF", type="filepath")
148
  ],
149
- live=False
150
  )
151
 
152
  if __name__=="__main__":
153
- iface.launch()
 
 
1
+ import sys
2
  import torch
3
  from audiocraft.models import MusicGen
4
  from transformers import GPT2LMHeadModel, GPT2Tokenizer
 
11
  import matplotlib.pyplot as plt
12
  import librosa.display
13
  import librosa
 
14
  from PIL import Image
15
  import os
16
 
# ---------------------------------------------------------------------------
# Module start-up: log the runtime, then load every model once on the CPU.
# The prints bracket each load so a stalled start-up can be located in the log.
# ---------------------------------------------------------------------------

# Single CPU device shared by every model below.
device = torch.device("cpu")

print("=== STARTUP ===")
print("Python:", " ".join(sys.version.split("\n")))
print("Torch:", torch.__version__)
print("Device:", device)

# MusicGen (text -> music)
print("Loading MusicGen…")
music_model = MusicGen.get_pretrained("small", device=device)
print("MusicGen loaded.")

# GPT-2 (text -> text)
print("Loading GPT-2…")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
print("GPT-2 loaded.")

# Stable Diffusion (text -> image), loaded in float32
print("Loading Stable Diffusion…")
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float32,
).to(device)
print("Stable Diffusion loaded.")

# pyttsx3 speech engine with the baseline rate/volume; text_to_audio()
# overrides both per request.
print("Initializing TTS engine…")
tts_engine = pyttsx3.init()
tts_engine.setProperty("rate", 150)
tts_engine.setProperty("volume", 0.8)
print("TTS engine ready.")
50
 
51
+ # Emotion helper
def get_emotion_tone(text):
    """Classify ``text`` as "happy", "sad", "angry" or "neutral" by keyword.

    The text is lower-cased and split into alphabetic words. Categories are
    checked in the order happy, sad, angry, and the first one with a keyword
    among those words wins.

    Matching is on whole words rather than substrings, so "download" is not
    read as "down" and "crusade" is not read as "sad".

    Args:
        text: Free-form user text. ``None`` or an empty string yields
            "neutral".

    Returns:
        One of "happy", "sad", "angry" or "neutral".
    """
    import re  # function-scope import keeps this block self-contained

    words = set(re.findall(r"[a-z]+", (text or "").lower()))

    # Ordered: an earlier category takes precedence when several match.
    keyword_groups = (
        ("happy", {"happy", "joy", "excited"}),
        ("sad", {"sad", "down", "melancholy"}),
        ("angry", {"angry", "frustrated"}),
    )
    for tone, keywords in keyword_groups:
        if words & keywords:
            return tone
    return "neutral"
58
 
59
+ # Image generation
60
  def generate_image(prompt, style="realistic"):
61
  styled = f"{style} style {prompt}"
62
  try:
 
68
  print("Image error:", e)
69
  return None
70
 
71
+ # Text-to-audio
def text_to_audio(text):
    """Speak ``text`` into a temporary audio file and return the file's path.

    The speaking rate and volume of the shared ``tts_engine`` are set from the
    tone reported by ``get_emotion_tone(text)`` before synthesis.

    Args:
        text: The text to synthesize.

    Returns:
        Path of the temporary file the engine was asked to write. The file is
        created with ``delete=False``, so the caller owns its cleanup.
    """
    rate_by_tone = {"neutral": 150, "happy": 180, "sad": 100, "angry": 200}
    volume_by_tone = {"neutral": 0.8, "happy": 1.0, "sad": 0.5, "angry": 1.0}

    tone = get_emotion_tone(text)
    # Fall back to the neutral settings instead of raising KeyError should the
    # tone helper ever return a label these tables do not know.
    tts_engine.setProperty("rate", rate_by_tone.get(tone, rate_by_tone["neutral"]))
    tts_engine.setProperty("volume", volume_by_tone.get(tone, volume_by_tone["neutral"]))

    # Only the reserved path is needed. Close our own handle before the engine
    # writes, so the descriptor is not leaked and the file is not held open
    # while another writer targets it.
    # NOTE(review): the ".mp3" suffix is kept for compatibility; confirm that
    # the pyttsx3 driver in use really emits MP3 data.
    with NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        out_path = tmp.name

    tts_engine.save_to_file(text, out_path)
    tts_engine.runAndWait()
    return out_path
82
 
83
+ # Music generation
84
  def generate_music(prompt):
85
  try:
86
+ wav = music_model.generate([prompt])
87
  data = wav.cpu().numpy()[0,0]
88
  tmp = NamedTemporaryFile(delete=False, suffix=".wav")
89
  wavfile.write(tmp.name, music_model.sample_rate, data)
 
92
  print("Music error:", e)
93
  return None
94
 
95
+ # Spectrogram
96
  def generate_spectrogram(audio_path):
97
  try:
98
+ y,sr = librosa.load(audio_path, sr=None)
99
  S = librosa.feature.melspectrogram(y, sr=sr)
100
  S_db = librosa.power_to_db(S, ref=np.max)
101
  plt.figure(figsize=(6,3))
102
  librosa.display.specshow(S_db, sr=sr, x_axis='time', y_axis='mel')
 
103
  tmp = NamedTemporaryFile(delete=False, suffix=".png")
104
  plt.savefig(tmp.name, bbox_inches="tight")
105
  plt.close()
 
108
  print("Spectrogram error:", e)
109
  return None
110
 
111
+ # GPT-2 chat
112
  def chat_with_ai(text):
113
  try:
114
  tok = tokenizer.encode(text, return_tensors="pt").to(device)
 
118
  print("Chat error:", e)
119
  return "Error generating response."
120
 
121
+ # GIF video
def generate_video(prompt):
    """Render up to five generated images as an animated GIF.

    ``generate_image`` is called with "<prompt> frame 1" through
    "<prompt> frame 5", using its default style. Frames whose generation
    failed (falsy result) are skipped.

    Args:
        prompt: Text prompt shared by all frames.

    Returns:
        Path of a temporary ``.gif`` file (400 ms per frame, looping forever),
        or ``None`` when no frame could be generated. The file is created with
        ``delete=False``, so the caller owns its cleanup.
    """
    frames = []
    for index in range(1, 6):
        frame_path = generate_image(f"{prompt} frame {index}")
        if not frame_path:
            continue
        # Copy the pixels into memory and close the file right away, so the
        # source image handles are not left open.
        with Image.open(frame_path) as img:
            frames.append(img.copy())

    if not frames:
        return None

    # Reserve a path and release our handle before Pillow writes to it.
    with NamedTemporaryFile(delete=False, suffix=".gif") as tmp:
        gif_path = tmp.name

    first, *rest = frames
    first.save(gif_path, save_all=True, append_images=rest, duration=400, loop=0)
    return gif_path
131
 
132
+ # Main interface
133
  def main(input_text, task, style):
134
  if task=="Conversation":
135
  resp = chat_with_ai(input_text)
136
  img = generate_image(f"conversation about {input_text}", style)
137
  return resp, None, img
138
  if task=="Music":
139
+ mus = generate_music(input_text)
140
  spec = generate_spectrogram(mus) if mus else None
141
  return "Music ready", mus, spec
142
  if task=="Text to Audio":
 
148
  aud = generate_music(input_text)
149
  return "Video ready", aud, vid
150
 
151
+ # Launch with debug logging
# Gradio UI: a prompt box, a task selector and an image-style selector feed
# main(), which returns (text, audio file path, image/GIF file path).
iface = gr.Interface(
    fn=main,
    inputs=[
        gr.Textbox(label="Enter Prompt"),
        # Explicit defaults so main() never receives an unset selection.
        # A missing style would be formatted into the image prompt as
        # "None style …". "realistic" mirrors generate_image()'s default.
        gr.Radio(
            ["Conversation", "Music", "Text to Audio", "Video Generation"],
            value="Conversation",
            label="Task",
        ),
        gr.Dropdown(
            ["realistic", "abstract", "comic"],
            value="realistic",
            label="Style",
        ),
    ],
    outputs=[
        gr.Textbox(label="Output Text"),
        gr.Audio(label="Audio File", type="filepath"),
        gr.Image(label="Image/GIF", type="filepath"),
    ],
)

if __name__ == "__main__":
    print("Launching Gradio…")
    # Listen on all interfaces at port 7860, with debug output enabled.
    iface.launch(server_name="0.0.0.0", server_port=7860, debug=True)