# TvApp / app.py
# (Hugging Face Spaces page residue, kept as comments: uploaded by
#  "wahab5763", commit "Create app.py", 22ceee6 verified)
# app.py
import os
import re
import gradio as gr
import torch
from torch import cuda
from math import isclose
import whisper
from PyPDF2 import PdfReader
from PIL import Image
from diffusers import StableDiffusionPipeline
from gtts import gTTS
from moviepy.editor import (
ImageClip,
AudioFileClip,
TextClip,
CompositeVideoClip,
concatenate_videoclips
)
from moviepy.video.fx.all import resize
######################################
# 1) SETUP AND MODEL LOADING
######################################
# Check for GPU
device = "cuda" if cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load Stable Diffusion (fp16 on GPU to halve VRAM, fp32 on CPU).
pipe = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)

# Memory optimization: attention slicing is safe on any device.
pipe.enable_attention_slicing()

if device == "cuda":
    # BUG FIX: sequential CPU offload manages device placement itself, so it
    # must NOT be combined with an explicit pipe.to("cuda"); it also requires
    # CUDA + accelerate, so it would raise on CPU-only machines. The original
    # code unconditionally did both pipe.to(device) and enable offload.
    pipe.enable_sequential_cpu_offload()
else:
    pipe.to(device)

# Load Whisper (not actually used here for transcription, but included if needed)
whisper_model = whisper.load_model("small")

# Make output folders for generated frames and audio/video artifacts.
os.makedirs("images", exist_ok=True)
os.makedirs("videos", exist_ok=True)
######################################
# 2) CORE PDF-TO-VIDEO FUNCTION
######################################
def unify_text_no_newlines(text):
    """Collapse every run of whitespace (including newlines) into one space."""
    # str.split() with no argument splits on arbitrary whitespace runs and
    # drops leading/trailing blanks, so joining with a single space is
    # equivalent to re.sub(r"\s+", " ", text).strip().
    return " ".join(text.split())
def split_into_sentences(text):
    """Split *text* into sentences on periods, discarding empty fragments."""
    # A period plus any trailing whitespace ends a sentence; blank pieces
    # (e.g. from "..", or a trailing period) are filtered out.
    return [piece.strip() for piece in re.split(r'\.\s*', text) if piece.strip()]
def repeating_zoom(t, base=1.0, amplitude=0.1, period=4.0):
    """
    Triangular-wave zoom factor at time *t* (seconds).

    The scale ramps linearly from ``base`` up to ``base + amplitude`` over the
    first half of each ``period``, then back down over the second half, so a
    clip appears to breathe in and out indefinitely. The result is clamped to
    a minimum of 0.01 so it can never collapse to a degenerate scale.
    """
    phase = (t % period) / period  # position within the current cycle, in [0, 1)
    if phase < 0.5:
        fraction = phase * 2.0          # rising edge: 0 -> 1
    else:
        fraction = 2.0 - phase * 2.0    # falling edge: 1 -> 0
    return max(0.01, base + amplitude * fraction)
def add_subtitles(video_clip, text, duration):
    """
    Overlay *text* word-by-word at the bottom of *video_clip*.

    Each word gets an equal share of *duration*; if the text has no words the
    clip is returned untouched.
    """
    words = text.split()
    if not words:
        return video_clip
    per_word = duration / len(words)
    # One TextClip per word, each scheduled at its own start offset.
    overlays = [
        TextClip(
            word, fontsize=36, color='white',
            font='Arial', bg_color='black', method='caption'
        )
        .set_start(position * per_word)
        .set_duration(per_word)
        .set_position(("center", "bottom"))
        for position, word in enumerate(words)
    ]
    return CompositeVideoClip([video_clip] + overlays).set_duration(duration)
def process_pdf_to_video(pdf_file_path):
    """
    Render a narrated, illustrated video from a PDF.

    Pipeline:
      1) Extract text from every page of the PDF (newlines collapsed).
      2) Split the text into sentences (on periods).
      3) For each sentence: generate a Ghibli-style image with Stable
         Diffusion, synthesize narration with gTTS, and combine them into a
         zooming, subtitled clip whose length matches the narration.
      4) Concatenate all clips and resize to 1280x720.
      5) Write and return the path of the final MP4.

    Parameters
    ----------
    pdf_file_path : str
        Filesystem path to the input PDF.

    Returns
    -------
    str
        Path to the rendered MP4 ("videos/final_video.mp4").

    Raises
    ------
    ValueError
        If the PDF yields no text, or no sentence produced a usable clip.
    """
    # 1) Extract text; extract_text() can return None for image-only pages,
    #    hence the `or ""` fallback.
    reader = PdfReader(pdf_file_path)
    raw_text = []
    for page in reader.pages:
        page_text = page.extract_text() or ""
        raw_text.append(page_text)
    text = unify_text_no_newlines(" ".join(raw_text))
    # 2) Split sentences
    sentences = split_into_sentences(text)
    if not sentences:
        raise ValueError("No text found in PDF.")
    # Style prefix prepended to every Stable Diffusion prompt.
    base_prompt = "Ghibli-style art, soft lighting, whimsical characters, serene environment"
    clips = []
    # 3) Generate image, narration, and a composited clip per sentence.
    for idx, sentence in enumerate(sentences):
        if not sentence:
            continue
        # Prompt for Stable Diffusion
        prompt = f"{base_prompt}, {sentence}"
        # Generate image (20 inference steps trades quality for speed).
        image = pipe(
            prompt=prompt,
            num_inference_steps=20
        ).images[0]
        img_path = f"images/clip_{idx+1}.png"
        image.save(img_path)
        # TTS narration for this sentence.
        audio_path = f"videos/tts_{idx+1}.mp3"
        tts = gTTS(sentence, lang='en')
        tts.save(audio_path)
        # Create Clip; the image is shown for the length of the narration.
        audio_clip = AudioFileClip(audio_path)
        duration = audio_clip.duration
        if duration < 0.1:
            # Skip near-silent narrations; a ~0-length clip is useless and
            # can break concatenation.
            continue
        img_clip = ImageClip(img_path).set_duration(duration)
        # Apply indefinite triangular zoom in/out (see repeating_zoom).
        zoom_clip = img_clip.fx(
            resize,
            lambda t: repeating_zoom(t, base=1.0, amplitude=0.1, period=4.0)
        ).set_audio(audio_clip)
        # Add word-by-word subtitles over the zooming image.
        final_clip = add_subtitles(zoom_clip, sentence, duration)
        clips.append(final_clip)
    # 4) Concatenate all; method="compose" tolerates differing clip sizes.
    if not clips:
        raise ValueError("No valid clips generated.")
    combined = concatenate_videoclips(clips, method="compose")
    # Resize to 1280x720 (16:9) for the final output.
    combined_16_9 = combined.resize((1280, 720))
    # 5) Write out final MP4
    final_path = "videos/final_video.mp4"
    combined_16_9.write_videofile(final_path, fps=24, codec="libx264")
    return final_path
######################################
# 3) GRADIO INTERFACE
######################################
def generate_video_from_pdf(pdf_file):
    """
    Gradio callback: turn an uploaded PDF into the final video.

    Parameters
    ----------
    pdf_file : object or str or None
        The value from the ``gr.File`` input. Depending on the Gradio
        version this is either a tempfile-like object with a ``.name``
        attribute pointing at the local path, or a plain path string.

    Returns
    -------
    str
        Path of the generated MP4 on success (Gradio renders it in the
        Video output), or a human-readable error message on failure.
    """
    if not pdf_file:
        return "No PDF uploaded."
    # BUG FIX: newer Gradio versions pass gr.File values as plain path
    # strings, which have no .name attribute; support both shapes.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    try:
        return process_pdf_to_video(pdf_path)
    except Exception as e:
        # Surface the failure as text rather than crashing the UI.
        return f"Error: {str(e)}"
# Build the Gradio UI: one PDF upload, a trigger button, and a video player.
with gr.Blocks() as demo:
    gr.Markdown("# PDF to Ghibli-Style Video")
    # Restrict uploads to PDFs; the component's value is passed to the callback.
    pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
    generate_btn = gr.Button("Generate Video")
    video_output = gr.Video(label="Output Video")
    # When button is clicked, call generate_video_from_pdf; its returned
    # file path is rendered in the Video component.
    generate_btn.click(
        fn=generate_video_from_pdf,
        inputs=pdf_input,
        outputs=video_output
    )
# Launch the Gradio app
def start_app():
    """Start the Gradio server on all interfaces (0.0.0.0) at port 7860."""
    # Note: On Hugging Face Spaces, you typically do 'demo.launch()'
    # without blocking the main thread.
    demo.launch(server_name="0.0.0.0", server_port=7860)

if __name__ == "__main__":
    start_app()