Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -45,10 +45,11 @@ tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
|
|
| 45 |
model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
|
| 46 |
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
| 47 |
model.to(device)
|
| 48 |
-
print(device)
|
| 49 |
|
| 50 |
|
| 51 |
def get_output_video(text):
|
|
|
|
| 52 |
inputs = tokenizer(text,
|
| 53 |
max_length=1024,
|
| 54 |
truncation=True,
|
|
@@ -58,6 +59,7 @@ def get_output_video(text):
|
|
| 58 |
skip_special_tokens=True,
|
| 59 |
clean_up_tokenization_spaces=False)
|
| 60 |
plot = list(summary[0].split('.'))
|
|
|
|
| 61 |
|
| 62 |
'''
|
| 63 |
The required models will be downloaded to models_root if they are not already there.
|
|
@@ -68,15 +70,16 @@ def get_output_video(text):
|
|
| 68 |
'''
|
| 69 |
@spaces.GPU(duration=60 * 3)
|
| 70 |
def generate_image(
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
):
|
|
|
|
| 80 |
model = MinDalle(
|
| 81 |
is_mega=is_mega,
|
| 82 |
models_root=models_root,
|
|
@@ -94,21 +97,28 @@ def get_output_video(text):
|
|
| 94 |
top_k=top_k,
|
| 95 |
is_verbose=True
|
| 96 |
)
|
|
|
|
| 97 |
return image
|
| 98 |
|
| 99 |
|
| 100 |
generated_images = []
|
| 101 |
-
for senten in plot[:-1]:
|
| 102 |
-
image
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
# Step 4- Creation of the subtitles
|
| 114 |
sentences = plot[:-1]
|
|
@@ -121,6 +131,7 @@ def get_output_video(text):
|
|
| 121 |
for k in range(len(generated_images)):
|
| 122 |
subtitles = tokenize.sent_tokenize(sentences[k])
|
| 123 |
sub_names.append(subtitles)
|
|
|
|
| 124 |
|
| 125 |
# Step 5- Adding Subtitles to the Images
|
| 126 |
def draw_multiple_line_text(image, text, font, text_color, text_start_height):
|
|
@@ -165,6 +176,7 @@ def get_output_video(text):
|
|
| 165 |
text_to_add = sub_names[k][0]
|
| 166 |
result = add_text_to_img(text_to_add, imagenes)
|
| 167 |
generated_images_sub.append(result)
|
|
|
|
| 168 |
|
| 169 |
# Step 7 - Creation of audio
|
| 170 |
c = 0
|
|
@@ -172,7 +184,7 @@ def get_output_video(text):
|
|
| 172 |
mp3_lengths = []
|
| 173 |
for k in range(len(generated_images)):
|
| 174 |
text_to_add = sub_names[k][0]
|
| 175 |
-
print(text_to_add)
|
| 176 |
f_name = 'audio_' + str(c) + '.mp3'
|
| 177 |
mp3_names.append(f_name)
|
| 178 |
# The text that you want to convert to audio
|
|
@@ -190,7 +202,7 @@ def get_output_video(text):
|
|
| 190 |
audio = AudioSegment.from_file(sound_file, format="mp3")
|
| 191 |
duration = len(audio) / 1000
|
| 192 |
mp3_lengths.append(duration)
|
| 193 |
-
print(duration)
|
| 194 |
c += 1
|
| 195 |
|
| 196 |
# Step 8 - Merge audio files
|
|
@@ -201,16 +213,16 @@ def get_output_video(text):
|
|
| 201 |
|
| 202 |
for n, mp3_file in enumerate(mp3_names):
|
| 203 |
mp3_file = mp3_file.replace(chr(92), '/')
|
| 204 |
-
print(
|
| 205 |
# Load the current mp3 into `audio_segment`
|
| 206 |
audio_segment = AudioSegment.from_mp3(mp3_file)
|
| 207 |
# Just accumulate the new `audio_segment` + `silence`
|
| 208 |
full_audio += audio_segment + silence
|
| 209 |
-
print('Merging
|
| 210 |
# The loop will exit once all files in the list have been used
|
| 211 |
# Then export
|
| 212 |
full_audio.export(export_path, format='mp3')
|
| 213 |
-
print('\
|
| 214 |
|
| 215 |
# Step 9 - Creation of the video with adjusted times of the sound
|
| 216 |
c = 0
|
|
@@ -219,18 +231,20 @@ def get_output_video(text):
|
|
| 219 |
f_name = 'img_' + str(c) + '.jpg'
|
| 220 |
file_names.append(f_name)
|
| 221 |
img.save(f_name)
|
|
|
|
| 222 |
c += 1
|
| 223 |
-
print(file_names)
|
| 224 |
|
| 225 |
clips = []
|
| 226 |
d = 0
|
| 227 |
for m in file_names:
|
| 228 |
duration = mp3_lengths[d]
|
| 229 |
-
print(d
|
| 230 |
clips.append(mpe.ImageClip(m).set_duration(duration + 0.5))
|
| 231 |
d += 1
|
| 232 |
concat_clip = mpe.concatenate_videoclips(clips, method="compose")
|
| 233 |
concat_clip.write_videofile("result_new.mp4", fps=24)
|
|
|
|
| 234 |
|
| 235 |
# Step 10 - Merge Video + Audio
|
| 236 |
movie_name = 'result_new.mp4'
|
|
@@ -244,6 +258,7 @@ def get_output_video(text):
|
|
| 244 |
final_clip.write_videofile(outname, fps=fps)
|
| 245 |
|
| 246 |
combine_audio(movie_name, export_path, movie_final) # create a new file
|
|
|
|
| 247 |
|
| 248 |
# Cleanup intermediate files
|
| 249 |
for f in file_names:
|
|
@@ -252,8 +267,9 @@ def get_output_video(text):
|
|
| 252 |
os.remove(f)
|
| 253 |
os.remove("result_new.mp4")
|
| 254 |
os.remove("result.mp3")
|
|
|
|
| 255 |
|
| 256 |
-
|
| 257 |
return 'result_final.mp4'
|
| 258 |
|
| 259 |
|
|
@@ -277,4 +293,4 @@ with demo:
|
|
| 277 |
gr.Markdown(
|
| 278 |
"This program text-to-video AI software generating videos from any prompt! AI software to build an art gallery. The future version will use Dalle-2 For more info visit [ruslanmv.com](https://ruslanmv.com/) ")
|
| 279 |
button_gen_video.click(fn=get_output_video, inputs=input_start_text, outputs=output_interpolation)
|
| 280 |
-
demo.launch(debug=
|
|
|
|
| 45 |
model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
|
| 46 |
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
| 47 |
model.to(device)
|
| 48 |
+
print(f"Using device: {device}")
|
| 49 |
|
| 50 |
|
| 51 |
def get_output_video(text):
|
| 52 |
+
print("Starting get_output_video function...")
|
| 53 |
inputs = tokenizer(text,
|
| 54 |
max_length=1024,
|
| 55 |
truncation=True,
|
|
|
|
| 59 |
skip_special_tokens=True,
|
| 60 |
clean_up_tokenization_spaces=False)
|
| 61 |
plot = list(summary[0].split('.'))
|
| 62 |
+
print(f"Summarized plot: {plot}")
|
| 63 |
|
| 64 |
'''
|
| 65 |
The required models will be downloaded to models_root if they are not already there.
|
|
|
|
| 70 |
'''
|
| 71 |
@spaces.GPU(duration=60 * 3)
|
| 72 |
def generate_image(
|
| 73 |
+
is_mega: bool,
|
| 74 |
+
text: str,
|
| 75 |
+
seed: int,
|
| 76 |
+
grid_size: int,
|
| 77 |
+
top_k: int,
|
| 78 |
+
image_path: str,
|
| 79 |
+
models_root: str,
|
| 80 |
+
fp16: bool,
|
| 81 |
):
|
| 82 |
+
print(f"Generating image for: {text}")
|
| 83 |
model = MinDalle(
|
| 84 |
is_mega=is_mega,
|
| 85 |
models_root=models_root,
|
|
|
|
| 97 |
top_k=top_k,
|
| 98 |
is_verbose=True
|
| 99 |
)
|
| 100 |
+
print(f"Image generated successfully.")
|
| 101 |
return image
|
| 102 |
|
| 103 |
|
| 104 |
generated_images = []
|
| 105 |
+
for i, senten in enumerate(plot[:-1]):
|
| 106 |
+
print(f"Generating image {i+1} of {len(plot)-1}...")
|
| 107 |
+
try:
|
| 108 |
+
image = generate_image(
|
| 109 |
+
is_mega=True,
|
| 110 |
+
text=senten,
|
| 111 |
+
seed=1,
|
| 112 |
+
grid_size=1, # param {type:"integer"}
|
| 113 |
+
top_k=256, # param {type:"integer"}
|
| 114 |
+
image_path='generated',
|
| 115 |
+
models_root='pretrained',
|
| 116 |
+
fp16=True, )
|
| 117 |
+
generated_images.append(image)
|
| 118 |
+
print(f"Image {i+1} generated and appended.")
|
| 119 |
+
except Exception as e:
|
| 120 |
+
print(f"Error generating image {i+1}: {e}")
|
| 121 |
+
raise
|
| 122 |
|
| 123 |
# Step 4- Creation of the subtitles
|
| 124 |
sentences = plot[:-1]
|
|
|
|
| 131 |
for k in range(len(generated_images)):
|
| 132 |
subtitles = tokenize.sent_tokenize(sentences[k])
|
| 133 |
sub_names.append(subtitles)
|
| 134 |
+
print(f"Subtitles generated for image {k+1}: {subtitles}")
|
| 135 |
|
| 136 |
# Step 5- Adding Subtitles to the Images
|
| 137 |
def draw_multiple_line_text(image, text, font, text_color, text_start_height):
|
|
|
|
| 176 |
text_to_add = sub_names[k][0]
|
| 177 |
result = add_text_to_img(text_to_add, imagenes)
|
| 178 |
generated_images_sub.append(result)
|
| 179 |
+
print(f"Subtitles added to image {k+1}.")
|
| 180 |
|
| 181 |
# Step 7 - Creation of audio
|
| 182 |
c = 0
|
|
|
|
| 184 |
mp3_lengths = []
|
| 185 |
for k in range(len(generated_images)):
|
| 186 |
text_to_add = sub_names[k][0]
|
| 187 |
+
print(f"Generating audio for: {text_to_add}")
|
| 188 |
f_name = 'audio_' + str(c) + '.mp3'
|
| 189 |
mp3_names.append(f_name)
|
| 190 |
# The text that you want to convert to audio
|
|
|
|
| 202 |
audio = AudioSegment.from_file(sound_file, format="mp3")
|
| 203 |
duration = len(audio) / 1000
|
| 204 |
mp3_lengths.append(duration)
|
| 205 |
+
print(f"Audio duration: {duration} seconds")
|
| 206 |
c += 1
|
| 207 |
|
| 208 |
# Step 8 - Merge audio files
|
|
|
|
| 213 |
|
| 214 |
for n, mp3_file in enumerate(mp3_names):
|
| 215 |
mp3_file = mp3_file.replace(chr(92), '/')
|
| 216 |
+
print(f"Merging audio file: {mp3_file}")
|
| 217 |
# Load the current mp3 into `audio_segment`
|
| 218 |
audio_segment = AudioSegment.from_mp3(mp3_file)
|
| 219 |
# Just accumulate the new `audio_segment` + `silence`
|
| 220 |
full_audio += audio_segment + silence
|
| 221 |
+
print(f'Merging audio {n+1} completed.')
|
| 222 |
# The loop will exit once all files in the list have been used
|
| 223 |
# Then export
|
| 224 |
full_audio.export(export_path, format='mp3')
|
| 225 |
+
print('\nAudio merging done!')
|
| 226 |
|
| 227 |
# Step 9 - Creation of the video with adjusted times of the sound
|
| 228 |
c = 0
|
|
|
|
| 231 |
f_name = 'img_' + str(c) + '.jpg'
|
| 232 |
file_names.append(f_name)
|
| 233 |
img.save(f_name)
|
| 234 |
+
print(f"Saving image: {f_name}")
|
| 235 |
c += 1
|
| 236 |
+
print(f"Image file names: {file_names}")
|
| 237 |
|
| 238 |
clips = []
|
| 239 |
d = 0
|
| 240 |
for m in file_names:
|
| 241 |
duration = mp3_lengths[d]
|
| 242 |
+
print(f"Creating video clip {d+1} with duration: {duration} seconds")
|
| 243 |
clips.append(mpe.ImageClip(m).set_duration(duration + 0.5))
|
| 244 |
d += 1
|
| 245 |
concat_clip = mpe.concatenate_videoclips(clips, method="compose")
|
| 246 |
concat_clip.write_videofile("result_new.mp4", fps=24)
|
| 247 |
+
print("Video clips concatenated and saved as result_new.mp4")
|
| 248 |
|
| 249 |
# Step 10 - Merge Video + Audio
|
| 250 |
movie_name = 'result_new.mp4'
|
|
|
|
| 258 |
final_clip.write_videofile(outname, fps=fps)
|
| 259 |
|
| 260 |
combine_audio(movie_name, export_path, movie_final) # create a new file
|
| 261 |
+
print("Video and audio merged successfully!")
|
| 262 |
|
| 263 |
# Cleanup intermediate files
|
| 264 |
for f in file_names:
|
|
|
|
| 267 |
os.remove(f)
|
| 268 |
os.remove("result_new.mp4")
|
| 269 |
os.remove("result.mp3")
|
| 270 |
+
print("Intermediate files cleaned up.")
|
| 271 |
|
| 272 |
+
print("Finished get_output_video function.")
|
| 273 |
return 'result_final.mp4'
|
| 274 |
|
| 275 |
|
|
|
|
| 293 |
gr.Markdown(
|
| 294 |
"This program text-to-video AI software generating videos from any prompt! AI software to build an art gallery. The future version will use Dalle-2 For more info visit [ruslanmv.com](https://ruslanmv.com/) ")
|
| 295 |
button_gen_video.click(fn=get_output_video, inputs=input_start_text, outputs=output_interpolation)
|
| 296 |
+
demo.launch(debug=True)
|