import os
import re
from math import isclose

import gradio as gr
import torch
from torch import cuda
import whisper
from PyPDF2 import PdfReader
from PIL import Image
from diffusers import StableDiffusionPipeline
from gtts import gTTS
from moviepy.editor import (
    ImageClip,
    AudioFileClip,
    TextClip,
    CompositeVideoClip,
    concatenate_videoclips,
)
from moviepy.video.fx.all import resize


# Pick the fastest available device for Stable Diffusion inference.
device = "cuda" if cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the Stable Diffusion pipeline once, at import time.
# fp16 halves VRAM usage but is only supported on GPU.
pipe = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
pipe.to(device)

# Reduce peak memory during image generation.
pipe.enable_attention_slicing()
if device == "cuda":
    # Sequential CPU offload shuttles submodules between host RAM and the
    # GPU on demand; it requires CUDA and raises on CPU-only machines, so
    # only enable it when a GPU is actually present. (The original enabled
    # it unconditionally, which broke CPU deployments.)
    pipe.enable_sequential_cpu_offload()

# Whisper speech-to-text model; "small" trades accuracy for load time.
# NOTE(review): nothing in this file calls whisper_model yet — confirm it
# is used elsewhere before removing.
whisper_model = whisper.load_model("small")

# Output directories for generated frames and rendered audio/video.
os.makedirs("images", exist_ok=True)
os.makedirs("videos", exist_ok=True)
def unify_text_no_newlines(text):
    """Collapse every run of whitespace (including newlines) to a single
    space and strip leading/trailing whitespace."""
    return " ".join(text.split())
def split_into_sentences(text):
    """Split *text* into sentences at periods (trailing whitespace absorbed).

    Empty fragments are dropped; each sentence is returned stripped and
    without its terminating period. Adjust the pattern for other punctuation.
    """
    return [chunk.strip() for chunk in re.split(r'\.\s*', text) if chunk.strip()]
def repeating_zoom(t, base=1.0, amplitude=0.1, period=4.0):
    """Triangular-wave zoom factor at time *t* (seconds).

    During each *period*, the scale ramps linearly from *base* up to
    ``base + amplitude`` over the first half, then back down over the
    second half. The result is clamped at 0.01 so the frame never
    collapses to zero size.
    """
    phase = (t % period) / period
    if phase < 0.5:
        fraction = phase * 2.0        # ramping up toward the peak
    else:
        fraction = 2.0 - phase * 2.0  # ramping back down to base
    return max(0.01, base + amplitude * fraction)
def add_subtitles(video_clip, text, duration):
    """Overlay *text* on *video_clip* one word at a time along the bottom.

    *duration* is divided evenly across the words; each word appears as a
    white-on-black caption centered at the bottom edge. If *text* contains
    no words, the clip is returned unchanged.
    """
    tokens = text.split()
    if not tokens:
        return video_clip

    per_word = duration / len(tokens)
    overlays = [
        TextClip(
            token, fontsize=36, color='white',
            font='Arial', bg_color='black', method='caption'
        )
        .set_start(pos * per_word)
        .set_duration(per_word)
        .set_position(("center", "bottom"))
        for pos, token in enumerate(tokens)
    ]
    composed = CompositeVideoClip([video_clip, *overlays])
    return composed.set_duration(duration)
def process_pdf_to_video(pdf_file_path):
    """Turn a PDF into a narrated, subtitled slideshow video.

    Pipeline:
      1. Extract all page text (newlines collapsed to spaces).
      2. Split the text into sentences.
      3. For each sentence: render one Ghibli-style image, synthesize the
         narration with gTTS, and build a zooming, subtitled clip timed to
         the narration length.
      4. Concatenate everything into a 1280x720 MP4.

    Returns the path to the rendered MP4.
    Raises ValueError when the PDF yields no text or no usable clips.
    """
    reader = PdfReader(pdf_file_path)
    pages = [page.extract_text() or "" for page in reader.pages]
    text = unify_text_no_newlines(" ".join(pages))

    sentences = split_into_sentences(text)
    if not sentences:
        raise ValueError("No text found in PDF.")

    base_prompt = "Ghibli-style art, soft lighting, whimsical characters, serene environment"
    clips = []

    for idx, sentence in enumerate(sentences):
        if not sentence:
            continue

        # One Stable Diffusion frame per sentence, themed by the base prompt.
        image = pipe(
            prompt=f"{base_prompt}, {sentence}",
            num_inference_steps=20,
        ).images[0]
        img_path = f"images/clip_{idx + 1}.png"
        image.save(img_path)

        # Synthesize the narration for this sentence.
        audio_path = f"videos/tts_{idx + 1}.mp3"
        gTTS(sentence, lang='en').save(audio_path)

        audio_clip = AudioFileClip(audio_path)
        duration = audio_clip.duration
        if duration < 0.1:
            # Effectively silent narration — skip this sentence entirely.
            continue

        still = ImageClip(img_path).set_duration(duration)

        # Ken Burns-style "breathing" zoom for the full narration length.
        animated = still.fx(
            resize,
            lambda t: repeating_zoom(t, base=1.0, amplitude=0.1, period=4.0),
        ).set_audio(audio_clip)

        clips.append(add_subtitles(animated, sentence, duration))

    if not clips:
        raise ValueError("No valid clips generated.")

    combined = concatenate_videoclips(clips, method="compose")
    # Force a 16:9 output frame. NOTE(review): this stretches frames whose
    # aspect ratio isn't already 16:9 — confirm that is acceptable.
    widescreen = combined.resize((1280, 720))

    final_path = "videos/final_video.mp4"
    widescreen.write_videofile(final_path, fps=24, codec="libx264")
    return final_path
def generate_video_from_pdf(pdf_file):
    """Gradio callback: convert the uploaded PDF into a video.

    Accepts either a tempfile-like object exposing ``.name`` (the classic
    gr.File payload) or a plain filesystem path string (newer Gradio
    ``type="filepath"`` uploads). The original version assumed ``.name``
    and raised AttributeError on string payloads.

    Returns the rendered MP4 path on success, or an error string on any
    failure (Gradio will surface it as a failed video load).
    """
    if not pdf_file:
        return "No PDF uploaded."
    # Support both tempfile objects (.name) and plain path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    try:
        return process_pdf_to_video(pdf_path)
    except Exception as e:
        # Best-effort error reporting back to the UI rather than a crash.
        return f"Error: {str(e)}"
|
|
|
| |
# Build the Gradio UI: a single upload → button → video-preview flow.
with gr.Blocks() as demo:
    gr.Markdown("# PDF to Ghibli-Style Video")
    uploader = gr.File(label="Upload PDF", file_types=[".pdf"])
    render_button = gr.Button("Generate Video")
    result_video = gr.Video(label="Output Video")

    # Wire the button to the PDF -> video pipeline.
    render_button.click(
        fn=generate_video_from_pdf,
        inputs=uploader,
        outputs=result_video,
    )
def start_app():
    """Serve the Gradio app on all interfaces at port 7860."""
    demo.launch(server_name="0.0.0.0", server_port=7860)


if __name__ == "__main__":
    start_app()