# TvApp / app.py
# (Hugging Face Spaces page residue, kept as comments: uploaded by
#  "wahab5763", commit "Create app.py", 22ceee6 verified)
# app.py
import os
import re
import gradio as gr
import torch
from torch import cuda
from math import isclose
import whisper
from PyPDF2 import PdfReader
from PIL import Image
from diffusers import StableDiffusionPipeline
from gtts import gTTS
from moviepy.editor import (
ImageClip,
AudioFileClip,
TextClip,
CompositeVideoClip,
concatenate_videoclips
)
from moviepy.video.fx.all import resize
######################################
# 1) SETUP AND MODEL LOADING
######################################
# Check for GPU
device = "cuda" if cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load Stable Diffusion (fp16 on GPU to halve VRAM, fp32 on CPU).
pipe = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)

# Memory optimization: attention slicing is safe on any device.
pipe.enable_attention_slicing()

if device == "cuda":
    # BUG FIX: sequential CPU offload manages device placement itself, so it
    # must NOT be combined with an explicit pipe.to("cuda"); it also requires
    # CUDA + accelerate, so it would raise on CPU-only machines. The original
    # code unconditionally did both pipe.to(device) and enable offload.
    pipe.enable_sequential_cpu_offload()
else:
    pipe.to(device)

# Load Whisper (not actually used here for transcription, but included if needed)
whisper_model = whisper.load_model("small")

# Make output folders for generated frames and audio/video artifacts.
os.makedirs("images", exist_ok=True)
os.makedirs("videos", exist_ok=True)
######################################
# 2) CORE PDF-TO-VIDEO FUNCTION
######################################
def unify_text_no_newlines(text):
    """Collapse every run of whitespace (including newlines) into one space."""
    # str.split() with no argument splits on arbitrary whitespace runs and
    # drops leading/trailing blanks, so joining with a single space is
    # equivalent to re.sub(r"\s+", " ", text).strip().
    return " ".join(text.split())
def split_into_sentences(text):
    """Split *text* into sentences on periods, discarding empty fragments."""
    # A period plus any trailing whitespace ends a sentence; blank pieces
    # (e.g. from "..", or a trailing period) are filtered out.
    return [piece.strip() for piece in re.split(r'\.\s*', text) if piece.strip()]
def repeating_zoom(t, base=1.0, amplitude=0.1, period=4.0):
    """
    Triangular-wave zoom factor at time *t* (seconds).

    The scale ramps linearly from ``base`` up to ``base + amplitude`` over the
    first half of each ``period``, then back down over the second half, so a
    clip appears to breathe in and out indefinitely. The result is clamped to
    a minimum of 0.01 so it can never collapse to a degenerate scale.
    """
    phase = (t % period) / period  # position within the current cycle, in [0, 1)
    if phase < 0.5:
        fraction = phase * 2.0          # rising edge: 0 -> 1
    else:
        fraction = 2.0 - phase * 2.0    # falling edge: 1 -> 0
    return max(0.01, base + amplitude * fraction)
def add_subtitles(video_clip, text, duration):
    """
    Overlay *text* word-by-word at the bottom of *video_clip*.

    Each word gets an equal share of *duration*; if the text has no words the
    clip is returned untouched.
    """
    words = text.split()
    if not words:
        return video_clip
    per_word = duration / len(words)
    # One TextClip per word, each scheduled at its own start offset.
    overlays = [
        TextClip(
            word, fontsize=36, color='white',
            font='Arial', bg_color='black', method='caption'
        )
        .set_start(position * per_word)
        .set_duration(per_word)
        .set_position(("center", "bottom"))
        for position, word in enumerate(words)
    ]
    return CompositeVideoClip([video_clip] + overlays).set_duration(duration)
def process_pdf_to_video(pdf_file_path):
    """
    Render a narrated, illustrated video from a PDF.

    Pipeline:
      1) Extract text from every page of the PDF (newlines collapsed).
      2) Split the text into sentences (on periods).
      3) For each sentence: generate a Ghibli-style image with Stable
         Diffusion, synthesize narration with gTTS, and combine them into a
         zooming, subtitled clip whose length matches the narration.
      4) Concatenate all clips and resize to 1280x720.
      5) Write and return the path of the final MP4.

    Parameters
    ----------
    pdf_file_path : str
        Filesystem path to the input PDF.

    Returns
    -------
    str
        Path to the rendered MP4 ("videos/final_video.mp4").

    Raises
    ------
    ValueError
        If the PDF yields no text, or no sentence produced a usable clip.
    """
    # 1) Extract text; extract_text() can return None for image-only pages,
    #    hence the `or ""` fallback.
    reader = PdfReader(pdf_file_path)
    raw_text = []
    for page in reader.pages:
        page_text = page.extract_text() or ""
        raw_text.append(page_text)
    text = unify_text_no_newlines(" ".join(raw_text))
    # 2) Split sentences
    sentences = split_into_sentences(text)
    if not sentences:
        raise ValueError("No text found in PDF.")
    # Style prefix prepended to every Stable Diffusion prompt.
    base_prompt = "Ghibli-style art, soft lighting, whimsical characters, serene environment"
    clips = []
    # 3) Generate image, narration, and a composited clip per sentence.
    for idx, sentence in enumerate(sentences):
        if not sentence:
            continue
        # Prompt for Stable Diffusion
        prompt = f"{base_prompt}, {sentence}"
        # Generate image (20 inference steps trades quality for speed).
        image = pipe(
            prompt=prompt,
            num_inference_steps=20
        ).images[0]
        img_path = f"images/clip_{idx+1}.png"
        image.save(img_path)
        # TTS narration for this sentence.
        audio_path = f"videos/tts_{idx+1}.mp3"
        tts = gTTS(sentence, lang='en')
        tts.save(audio_path)
        # Create Clip; the image is shown for the length of the narration.
        audio_clip = AudioFileClip(audio_path)
        duration = audio_clip.duration
        if duration < 0.1:
            # Skip near-silent narrations; a ~0-length clip is useless and
            # can break concatenation.
            continue
        img_clip = ImageClip(img_path).set_duration(duration)
        # Apply indefinite triangular zoom in/out (see repeating_zoom).
        zoom_clip = img_clip.fx(
            resize,
            lambda t: repeating_zoom(t, base=1.0, amplitude=0.1, period=4.0)
        ).set_audio(audio_clip)
        # Add word-by-word subtitles over the zooming image.
        final_clip = add_subtitles(zoom_clip, sentence, duration)
        clips.append(final_clip)
    # 4) Concatenate all; method="compose" tolerates differing clip sizes.
    if not clips:
        raise ValueError("No valid clips generated.")
    combined = concatenate_videoclips(clips, method="compose")
    # Resize to 1280x720 (16:9) for the final output.
    combined_16_9 = combined.resize((1280, 720))
    # 5) Write out final MP4
    final_path = "videos/final_video.mp4"
    combined_16_9.write_videofile(final_path, fps=24, codec="libx264")
    return final_path
######################################
# 3) GRADIO INTERFACE
######################################
def generate_video_from_pdf(pdf_file):
    """
    Gradio callback: turn an uploaded PDF into the final video.

    Parameters
    ----------
    pdf_file : object or str or None
        The value from the ``gr.File`` input. Depending on the Gradio
        version this is either a tempfile-like object with a ``.name``
        attribute pointing at the local path, or a plain path string.

    Returns
    -------
    str
        Path of the generated MP4 on success (Gradio renders it in the
        Video output), or a human-readable error message on failure.
    """
    if not pdf_file:
        return "No PDF uploaded."
    # BUG FIX: newer Gradio versions pass gr.File values as plain path
    # strings, which have no .name attribute; support both shapes.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    try:
        return process_pdf_to_video(pdf_path)
    except Exception as e:
        # Surface the failure as text rather than crashing the UI.
        return f"Error: {str(e)}"
# Build the Gradio UI: one PDF upload, a trigger button, and a video player.
with gr.Blocks() as demo:
    gr.Markdown("# PDF to Ghibli-Style Video")
    # Restrict uploads to PDFs; the component's value is passed to the callback.
    pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
    generate_btn = gr.Button("Generate Video")
    video_output = gr.Video(label="Output Video")
    # When button is clicked, call generate_video_from_pdf; its returned
    # file path is rendered in the Video component.
    generate_btn.click(
        fn=generate_video_from_pdf,
        inputs=pdf_input,
        outputs=video_output
    )
# Launch the Gradio app
def start_app():
    """Start the Gradio server on all interfaces (0.0.0.0) at port 7860."""
    # Note: On Hugging Face Spaces, you typically do 'demo.launch()'
    # without blocking the main thread.
    demo.launch(server_name="0.0.0.0", server_port=7860)

if __name__ == "__main__":
    start_app()