Update app.py
app.py
CHANGED
@@ -70,7 +70,7 @@ DESCRIPTION = (
 TITLE = "Video Story Generator with Audio by using FLUX, distilbart, and GTTS."
 
 # Load Tokenizer and Model for Text Summarization
-def load_text_summarization_model():
+def load_text_summarization_model_V1():
     """Load the tokenizer and model for text summarization."""
     print("Loading text summarization model...")
     tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
@@ -80,6 +80,19 @@ def load_text_summarization_model():
     model.to(device)
     return tokenizer, model, device
 
+def load_text_summarization_model():
+    """Load the tokenizer and model for text summarization on CPU."""
+    print("Loading text summarization model...")
+    tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
+    model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
+    # Remove the line that sets the device here
+    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    # print(f"Using device: {device}")
+    # model.to(device)
+    return tokenizer, model
+
+tokenizer, model = load_text_summarization_model()
+
 tokenizer, model, device = load_text_summarization_model()
 
 # Log GPU Memory (optional, for debugging)
@@ -102,8 +115,8 @@ def check_gpu_availability():
 
 check_gpu_availability()
 
-@spaces.GPU()
-def generate_image_with_flux(
+#@spaces.GPU()
+def generate_image_with_flux_old(
     text: str,
     seed: int = 42,
     width: int = 1024,
@@ -141,6 +154,48 @@ def generate_image_with_flux(
     print("DEBUG: Image generated successfully.")
     return image
 
+
+@spaces.GPU()
+def generate_image_with_flux(
+    text: str,
+    seed: int = 42,
+    width: int = 1024,
+    height: int = 1024,
+    num_inference_steps: int = 4,
+    randomize_seed: bool = True):
+    """
+    Generates an image from text using FLUX.
+    Args:
+        text: The text prompt to generate the image from.
+        seed: The random seed for image generation. -1 for random.
+        width: Width of the generated image.
+        height: Height of the generated image.
+        num_inference_steps: Number of inference steps.
+        randomize_seed: Whether to randomize the seed.
+    Returns:
+        A PIL Image object.
+    """
+    print(f"DEBUG: Generating image with FLUX for text: '{text}'")
+
+    # Initialize FLUX pipeline here
+    dtype = torch.bfloat16
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    flux_pipe = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=dtype).to(device)
+
+    if randomize_seed:
+        seed = random.randint(0, MAX_SEED)
+    generator = torch.Generator(device=device).manual_seed(seed)  # Specify device for generator
+    image = flux_pipe(
+        prompt=text,
+        width=width,
+        height=height,
+        num_inference_steps=num_inference_steps,
+        generator=generator,
+        guidance_scale=0.0
+    ).images[0]
+    print("DEBUG: Image generated successfully.")
+    return image
+
 # --------- End of MinDalle Functions ---------
 # Merge audio files
 
@@ -165,8 +220,18 @@ def merge_audio_files(mp3_names: List[str]) -> str:
 
 
 
+
+
+
+
+
+
+
+
+
+
 # Function to generate video from text
-def get_output_video(text, seed, randomize_seed, width, height, num_inference_steps):
+def get_output_video_old(text, seed, randomize_seed, width, height, num_inference_steps):
     print("DEBUG: Starting get_output_video function...")
 
     # Summarize the input text
@@ -320,6 +385,183 @@ def get_output_video(text, seed, randomize_seed, width, height, num_inference_steps):
     print("DEBUG: get_output_video function completed successfully.")
     return 'result_final.mp4'
 
+
+
+
+
+# Function to generate video from text
+
+
+
+
+
+@spaces.GPU()
+def get_output_video(text, seed, randomize_seed, width, height, num_inference_steps):
+    print("DEBUG: Starting get_output_video function...")
+
+    # Set the device here, inside the GPU-accelerated function
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    # Move the model to the GPU
+    model.to(device)
+
+    # Summarize the input text
+    print("DEBUG: Summarizing text...")
+    inputs = tokenizer(
+        text,
+        max_length=1024,
+        truncation=True,
+        return_tensors="pt"
+    ).to(device)  # Now it's safe to move to the device
+    summary_ids = model.generate(inputs["input_ids"].to(device))  # .to(device) here
+    summary = tokenizer.batch_decode(
+        summary_ids,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False
+    )
+    plot = list(summary[0].split('.'))
+    print(f"DEBUG: Summary generated: {plot}")
+
+    image_system = "Generate a realistic picture about this: "
+
+    # Generate images for each sentence in the plot
+    generated_images = []
+    for i, senten in enumerate(plot[:-1]):
+        print(f"DEBUG: Generating image {i+1} of {len(plot)-1}...")
+        image_dir = f"image_{i}"
+        os.makedirs(image_dir, exist_ok=True)
+        image = generate_image_with_flux(
+            text=image_system + senten,
+            seed=seed,
+            randomize_seed=randomize_seed,
+            width=width,
+            height=height,
+            num_inference_steps=num_inference_steps
+        )
+        generated_images.append(image)
+        image_path = os.path.join(image_dir, "generated_image.png")
+        image.save(image_path)
+        print(f"DEBUG: Image generated and saved to {image_path}")
+
+    #del min_dalle_model  # No need to delete the model here
+    # torch.cuda.empty_cache()  # No need to empty cache here
+    # gc.collect()  # No need to collect garbage here
+
+    # Create subtitles from the plot
+    sentences = plot[:-1]
+    print("DEBUG: Creating subtitles...")
+    assert len(generated_images) == len(sentences), "Mismatch in number of images and sentences."
+    sub_names = [nltk.tokenize.sent_tokenize(sentence) for sentence in sentences]
+
+    # Add subtitles to images with dynamic adjustments
+    def get_dynamic_wrap_width(font, text, image_width, padding):
+        # Estimate the number of characters per line dynamically
+        avg_char_width = sum(font.getbbox(c)[2] for c in text) / len(text)
+        return max(1, (image_width - padding * 2) // avg_char_width)
+
+    def draw_multiple_line_text(image, text, font, text_color, text_start_height, padding=10):
+        draw = ImageDraw.Draw(image)
+        image_width, _ = image.size
+        y_text = text_start_height
+        lines = textwrap.wrap(text, width=get_dynamic_wrap_width(font, text, image_width, padding))
+        for line in lines:
+            line_width, line_height = font.getbbox(line)[2:]
+            draw.text(((image_width - line_width) / 2, y_text), line, font=font, fill=text_color)
+            y_text += line_height + padding
+
+    def add_text_to_img(text1, image_input):
+        print(f"DEBUG: Adding text to image: '{text1}'")
+        # Scale font size dynamically
+        base_font_size = 30
+        image_width, image_height = image_input.size
+        scaled_font_size = max(10, int(base_font_size * (image_width / 800)))
+        path_font = "/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf"
+        if not os.path.exists(path_font):
+            path_font = "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf"
+        font = ImageFont.truetype(path_font, scaled_font_size)
+
+        text_color = (255, 255, 0)
+        padding = 10
+
+        # Estimate starting height dynamically
+        line_height = font.getbbox("A")[3] + padding
+        total_text_height = len(textwrap.wrap(text1, get_dynamic_wrap_width(font, text1, image_width, padding))) * line_height
+        text_start_height = image_height - total_text_height - 20
+
+        draw_multiple_line_text(image_input, text1, font, text_color, text_start_height, padding)
+        return image_input
+
+
+    # Process images with subtitles
+    generated_images_sub = []
+    for k, image in enumerate(generated_images):
+        text_to_add = sub_names[k][0]
+        result = add_text_to_img(text_to_add, image.copy())
+        generated_images_sub.append(result)
+        result.save(f"image_{k}/generated_image_with_subtitles.png")
+
+
+
+    # Generate audio for each subtitle
+    mp3_names = []
+    mp3_lengths = []
+    for k, text_to_add in enumerate(sub_names):
+        print(f"DEBUG: Generating audio for: '{text_to_add[0]}'")
+        f_name = f'audio_{k}.mp3'
+        mp3_names.append(f_name)
+        myobj = gTTS(text=text_to_add[0], lang='en', slow=False)
+        myobj.save(f_name)
+        audio = MP3(f_name)
+        mp3_lengths.append(audio.info.length)
+        print(f"DEBUG: Audio duration: {audio.info.length} seconds")
+
+    # Merge audio files
+    export_path = merge_audio_files(mp3_names)
+
+    # Create video clips from images
+    clips = []
+    for k, img in enumerate(generated_images_sub):
+        duration = mp3_lengths[k]
+        print(f"DEBUG: Creating video clip {k+1} with duration: {duration} seconds")
+        clip = mpy.ImageClip(f"image_{k}/generated_image_with_subtitles.png").set_duration(duration + 0.5)
+        clips.append(clip)
+
+    # Concatenate video clips
+    print("DEBUG: Concatenating video clips...")
+    concat_clip = mpy.concatenate_videoclips(clips, method="compose")
+    concat_clip.write_videofile("result_no_audio.mp4", fps=24, logger=None)
+
+    # Combine video and audio
+    movie_name = 'result_no_audio.mp4'
+    movie_final = 'result_final.mp4'
+
+    def combine_audio(vidname, audname, outname, fps=24):
+        print(f"DEBUG: Combining audio for video: '{vidname}'")
+        my_clip = mpy.VideoFileClip(vidname)
+        audio_background = mpy.AudioFileClip(audname)
+        final_clip = my_clip.set_audio(audio_background)
+        final_clip.write_videofile(outname, fps=fps, logger=None)
+
+    combine_audio(movie_name, export_path, movie_final)
+
+    # Clean up
+    print("DEBUG: Cleaning up files...")
+    for i in range(len(generated_images_sub)):
+        shutil.rmtree(f"image_{i}")
+        os.remove(f"audio_{i}.mp3")
+    os.remove("result.mp3")
+    os.remove("result_no_audio.mp4")
+
+    print("DEBUG: Cleanup complete.")
+    print("DEBUG: get_output_video function completed successfully.")
+    return 'result_final.mp4'
+
+
+
+
+
+
+
 # Example text (can be changed by user in Gradio interface)
 text = 'Once, there was a girl called Laura who went to the supermarket to buy the ingredients to make a cake. Because today is her birthday and her friends come to her house and help her to prepare the cake.'
 
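For orientation, here is a minimal sketch of how the new get_output_video entry point could be exposed through the Gradio interface that the in-file comments reference. The component choices, ranges, and default values below are illustrative assumptions, not part of this commit; only get_output_video, TITLE, and the example text above come from the file itself.

import gradio as gr

# Hypothetical wiring (not part of this commit): assumes get_output_video,
# TITLE, and the example `text` above are in scope, and that the function
# returns the path of the finished mp4 ('result_final.mp4').
demo = gr.Interface(
    fn=get_output_video,
    inputs=[
        gr.Textbox(label="Story text", value=text),
        gr.Number(label="Seed", value=42, precision=0),
        gr.Checkbox(label="Randomize seed", value=True),
        gr.Slider(256, 1024, value=1024, step=64, label="Width"),
        gr.Slider(256, 1024, value=1024, step=64, label="Height"),
        gr.Slider(1, 8, value=4, step=1, label="Inference steps"),
    ],
    outputs=gr.Video(label="Generated story video"),
    title=TITLE,
)
demo.launch()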