from flask import Flask, request, jsonify, send_file, render_template_string
import os
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, VitsModel, VitsTokenizer
from diffusers import StableDiffusionPipeline
import cv2
from PIL import Image, ImageDraw, ImageFont
import soundfile as sf
import subprocess
import threading
import uuid
import json
import time
import random
from datetime import datetime
import asyncio
from concurrent.futures import ThreadPoolExecutor
import openai
from openai import OpenAI
import re

app = Flask(__name__)
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16MB max

# Configuration
OPENAI_API_KEY = os.getenv("open_key")  # Add your OpenAI API key
client = OpenAI(api_key=OPENAI_API_KEY)

# Global variables for models
models_loaded = False
story_model = None
story_tokenizer = None
tts_model = None
tts_tokenizer = None
image_pipeline = None
device = None

# Video generation status tracking
video_status = {}
video_files = {}


def load_models():
    """Load all AI models at startup"""
    global models_loaded, story_model, story_tokenizer, tts_model, tts_tokenizer, image_pipeline, device

    if models_loaded:
        return

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"🚀 Loading models on {device}...")

    try:
        # Hindi TTS model
        print("📢 Loading Hindi TTS model...")
        tts_model = VitsModel.from_pretrained("facebook/mms-tts-hin")
        tts_tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-hin")

        # Image generation model - higher quality settings
        print("🎨 Loading image generation model...")
        image_pipeline = StableDiffusionPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5",
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            safety_checker=None,
            requires_safety_checker=False
        )
        image_pipeline.to(device)

        # Optimize for speed and quality
        try:
            if hasattr(image_pipeline, 'enable_xformers_memory_efficient_attention'):
                image_pipeline.enable_xformers_memory_efficient_attention()
        except Exception as e:
            print(f"xformers not available, using standard attention: {e}")

        try:
            if hasattr(image_pipeline, 'enable_model_cpu_offload'):
                image_pipeline.enable_model_cpu_offload()
        except Exception as e:
            print(f"CPU offload not available: {e}")

        models_loaded = True
        print("✅ All models loaded successfully!")
    except Exception as e:
        print(f"❌ Error loading models: {e}")


class AdvancedHindiVideoGenerator:
    def __init__(self):
        self.device = device
        self.executor = ThreadPoolExecutor(max_workers=4)

    def generate_dynamic_hindi_story(self, theme, duration_minutes, style="adventure"):
        """Generate dynamic Hindi story using ChatGPT"""
        try:
            # Calculate number of scenes based on duration (~2 minutes per scene)
            scenes_needed = max(8, duration_minutes // 2)

            prompt = f"""
Create a captivating {duration_minutes}-minute Hindi story about {theme} with exactly {scenes_needed} scenes.

Requirements:
- Theme: {theme}
- Style: {style}
- Each scene should be 1-2 minutes long
- Include dialogue and descriptive narration
- Make it engaging for YouTube audience
- Ensure cultural authenticity

Format your response as JSON with this structure:
{{
    "title": "Story title in Hindi and English",
    "scenes": [
        {{
            "scene_number": 1,
            "hindi_text": "Hindi narration text (2-3 sentences)",
            "english_description": "Scene description for image generation",
            "visual_prompt": "Detailed visual description for AI image generation",
            "mood": "happy/sad/suspense/action/peaceful",
            "duration_estimate": "seconds"
        }}
    ]
}}

Make each scene vivid and cinematic.
The story should flow naturally and be suitable for all ages.
"""

            response = client.chat.completions.create(
                model="gpt-3.5-turbo",  # Changed from gpt-4 to gpt-3.5-turbo for better accessibility
                messages=[
                    {"role": "system", "content": "You are a master storyteller specializing in Hindi stories. Create engaging, family-friendly content perfect for YouTube videos."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=2000,
                temperature=0.8
            )
            story_content = response.choices[0].message.content

            # Parse JSON response
            try:
                story_data = json.loads(story_content)
                return story_data
            except json.JSONDecodeError:
                # Fallback if JSON parsing fails
                return self.create_fallback_story(theme, scenes_needed)
        except Exception as e:
            print(f"ChatGPT Error: {e}")
            return self.create_fallback_story(theme, scenes_needed)

    def create_fallback_story(self, theme, scenes_needed):
        """Fallback story generation if ChatGPT fails"""
        fallback_stories = {
            "adventure": {
                "title": "साहसिक यात्रा - The Great Adventure",
                "scenes": [
                    {
                        "scene_number": 1,
                        "hindi_text": "एक बार एक बहादुर युवक था जो नए रोमांच की तलाश में निकला।",
                        "english_description": "A brave young man starting an adventure",
                        "visual_prompt": "heroic young Indian man with backpack standing at mountain edge, sunrise, cinematic wide shot, epic landscape",
                        "mood": "inspiring",
                        "duration_estimate": "120"
                    },
                    {
                        "scene_number": 2,
                        "hindi_text": "जंगल में उसे एक रहस्यमय गुफा दिखाई दी जिसमें से अजीब रोशनी आ रही थी।",
                        "english_description": "Mysterious glowing cave in forest",
                        "visual_prompt": "mysterious cave entrance glowing with magical blue light, dense forest, atmospheric lighting, fantasy style",
                        "mood": "mysterious",
                        "duration_estimate": "110"
                    }
                ]
            }
        }

        base_story = fallback_stories.get(theme, fallback_stories["adventure"])

        # Extend scenes to match required length (copy first so the template is not mutated)
        scenes = list(base_story["scenes"])
        while len(scenes) < scenes_needed:
            scenes.extend(base_story["scenes"])

        return {
            "title": base_story["title"],
            "scenes": scenes[:scenes_needed]
        }

    def generate_enhanced_scene_audio(self, text, output_path, mood="neutral"):
        """Generate high-quality Hindi audio with mood-based adjustments"""
        try:
            # Preprocess text for better TTS
            processed_text = self.preprocess_hindi_text(text)

            inputs = tts_tokenizer(processed_text, return_tensors="pt")
            with torch.no_grad():
                output = tts_model(**inputs).waveform

            audio_np = output.squeeze().cpu().numpy()

            # Apply mood-based audio processing
            audio_np = self.apply_audio_effects(audio_np, mood)

            # Save high-quality audio
            sf.write(output_path, audio_np, tts_model.config.sampling_rate)

            duration = len(audio_np) / tts_model.config.sampling_rate
            return max(duration, 4.0)  # Minimum 4 seconds per scene
        except Exception as e:
            print(f"TTS Error: {e}")
            # Create silence as fallback
            duration = max(len(text.split()) * 0.7, 4.0)
            silence = np.zeros(int(duration * 22050))
            sf.write(output_path, silence, 22050)
            return duration

    def preprocess_hindi_text(self, text):
        """Preprocess Hindi text for better TTS pronunciation"""
        # Add pauses for better speech rhythm
        text = re.sub(r'([।!?])', r'\1 ', text)
        text = re.sub(r'([,])', r'\1 ', text)
        return text.strip()

    def apply_audio_effects(self, audio_np, mood):
        """Apply mood-based audio effects"""
        try:
            if mood == "suspense":
                # Slightly reduce volume for suspense
                audio_np = audio_np * 0.9
            elif mood == "happy":
                # Slightly boost volume for happiness
                audio_np = audio_np * 1.05
            elif mood == "action":
                # Increase volume and add slight compression
                audio_np = np.tanh(audio_np * 1.2)
            return np.clip(audio_np, -1.0, 1.0)
        except:
            return audio_np
    def generate_high_quality_scene_image(self, visual_prompt, mood, output_path, scene_num):
        """Generate ultra-high quality images for YouTube"""
        try:
            # Enhanced prompt based on mood and YouTube requirements
            mood_styles = {
                "happy": "bright colors, warm lighting, cheerful atmosphere",
                "sad": "muted colors, soft lighting, emotional depth",
                "suspense": "dramatic shadows, mysterious atmosphere, dark tones",
                "action": "dynamic angles, intense lighting, high energy",
                "peaceful": "soft pastels, natural lighting, serene atmosphere",
                "mysterious": "dim lighting, fog, ethereal atmosphere"
            }
            style_addition = mood_styles.get(mood, "cinematic lighting, professional quality")

            enhanced_prompt = f"""
            {visual_prompt}, {style_addition},
            ultra high quality, 8K resolution, professional photography,
            cinematic composition, perfect lighting, sharp focus,
            detailed textures, rich colors, masterpiece quality,
            YouTube thumbnail worthy, award winning photography
            """

            negative_prompt = """
            blurry, low quality, distorted, ugly, bad anatomy, pixelated,
            watermark, text, signature, amateur, poorly lit, overexposed,
            underexposed, noise, artifacts, jpeg artifacts, compression
            """

            # Generate high-resolution image
            image = image_pipeline(
                prompt=enhanced_prompt,
                negative_prompt=negative_prompt,
                num_inference_steps=40,  # Higher steps for better quality
                guidance_scale=8.5,
                width=1920,   # Full HD width
                height=1080,  # Full HD height
                generator=torch.Generator(device=device).manual_seed(
                    scene_num * 123 + hash(visual_prompt) % 1000
                )
            ).images[0]

            # Post-process for maximum quality
            image = self.enhance_image_quality(image)
            image.save(output_path, "PNG", quality=100, optimize=False)
            return True
        except Exception as e:
            print(f"Image generation error for scene {scene_num}: {e}")
            return self.create_fallback_image(output_path, scene_num, mood)

    def enhance_image_quality(self, image):
        """Enhance image quality using PIL"""
        try:
            from PIL import ImageEnhance, ImageFilter

            # Resize to exact YouTube dimensions
            image = image.resize((1920, 1080), Image.Resampling.LANCZOS)

            # Enhance contrast slightly
            enhancer = ImageEnhance.Contrast(image)
            image = enhancer.enhance(1.1)

            # Enhance sharpness
            enhancer = ImageEnhance.Sharpness(image)
            image = enhancer.enhance(1.1)

            # Enhance colors
            enhancer = ImageEnhance.Color(image)
            image = enhancer.enhance(1.05)

            return image
        except Exception as e:
            print(f"Image enhancement error: {e}")
            return image.resize((1920, 1080), Image.Resampling.LANCZOS)

    def create_fallback_image(self, output_path, scene_num, mood):
        """Create high-quality fallback image"""
        try:
            # Create gradient background based on mood
            mood_colors = {
                "happy": [(255, 223, 0), (255, 94, 77)],         # Yellow to red
                "sad": [(74, 144, 226), (80, 80, 80)],           # Blue to gray
                "suspense": [(30, 30, 30), (70, 20, 70)],        # Dark to purple
                "action": [(255, 0, 0), (255, 165, 0)],          # Red to orange
                "peaceful": [(135, 206, 235), (144, 238, 144)],  # Sky blue to light green
                "mysterious": [(25, 25, 112), (72, 61, 139)]     # Dark blue to purple
            }
            colors = mood_colors.get(mood, [(100, 100, 150), (150, 100, 100)])

            img = Image.new('RGB', (1920, 1080), color=colors[0])
            draw = ImageDraw.Draw(img)

            # Create gradient effect
            for y in range(1080):
                ratio = y / 1080
                r = int(colors[0][0] * (1 - ratio) + colors[1][0] * ratio)
                g = int(colors[0][1] * (1 - ratio) + colors[1][1] * ratio)
                b = int(colors[0][2] * (1 - ratio) + colors[1][2] * ratio)
                draw.line([(0, y), (1920, y)], fill=(r, g, b))

            # Add scene information
            try:
                # Use a larger font where a TrueType font is available
                # (the font file name is an assumption; falls back to the default bitmap font)
                font_size = 72
                try:
                    font = ImageFont.truetype("DejaVuSans-Bold.ttf", font_size)
                except Exception:
                    font = ImageFont.load_default()

                # Add scene number
                text = f"Scene {scene_num}"
                bbox = draw.textbbox((0, 0), text, font=font)
                text_width = bbox[2] - bbox[0]
                text_height = bbox[3] - bbox[1]
                x = (1920 - text_width) // 2
                y = (1080 - text_height) // 2

                # Add text with shadow
                draw.text((x + 3, y + 3), text, fill=(0, 0, 0), font=font)  # Shadow
                draw.text((x, y), text, fill=(255, 255, 255), font=font)    # Main text
            except Exception as font_error:
                print(f"Font error: {font_error}")

            img.save(output_path, "PNG", quality=100)
            return True
        except Exception as e:
            print(f"Fallback image creation error: {e}")
            return False

    def create_professional_video_with_ffmpeg(self, image_path, audio_path, text, output_path, duration, mood):
        """Create professional quality video using FFmpeg with advanced effects"""
        try:
            # FFmpeg command for high-quality video with effects
            temp_video = output_path.replace('.mp4', '_temp.mp4')

            # Ken Burns effect parameters based on mood
            zoom_effects = {
                "action": "zoompan=z='min(zoom+0.001,1.5)':x='iw/2-(iw/zoom/2)':y='ih/2-(ih/zoom/2)':d=1:s=1920x1080",
                "peaceful": "zoompan=z='min(zoom+0.0005,1.2)':x='iw/2-(iw/zoom/2)':y='ih/2-(ih/zoom/2)':d=1:s=1920x1080",
                "suspense": "zoompan=z='if(lte(zoom,1.0),1.5,max(1.0,zoom-0.001))':x='iw/2-(iw/zoom/2)':y='ih/2-(ih/zoom/2)':d=1:s=1920x1080"
            }
            zoom_filter = zoom_effects.get(
                mood,
                "zoompan=z='min(zoom+0.0008,1.3)':x='iw/2-(iw/zoom/2)':y='ih/2-(ih/zoom/2)':d=1:s=1920x1080"
            )

            # Create video from image with advanced effects
            ffmpeg_cmd = [
                'ffmpeg', '-y',
                '-loop', '1',
                '-i', image_path,
                '-i', audio_path,
                '-c:v', 'libx264',
                '-preset', 'slow',               # Better quality
                '-crf', '18',                    # High quality (18 is very high quality)
                '-pix_fmt', 'yuv420p',
                '-profile:v', 'high',
                '-level:v', '4.1',
                '-vf', f'{zoom_filter},fps=30',  # 30 FPS for smooth playback
                '-c:a', 'aac',
                '-b:a', '320k',                  # High quality audio
                '-ac', '2',                      # Stereo
                '-ar', '44100',                  # Standard sample rate
                '-shortest',
                '-movflags', '+faststart',       # Optimize for web streaming
                temp_video
            ]

            result = subprocess.run(ffmpeg_cmd, capture_output=True, text=True, timeout=300)
            if result.returncode != 0:
                print(f"FFmpeg error: {result.stderr}")
                return self.fallback_video_creation(image_path, audio_path, output_path, duration)

            # Add subtitles using FFmpeg
            self.add_professional_subtitles(temp_video, text, output_path, duration, mood)

            # Cleanup
            if os.path.exists(temp_video):
                os.remove(temp_video)

            return True
        except subprocess.TimeoutExpired:
            print("FFmpeg timeout - falling back to basic method")
            return self.fallback_video_creation(image_path, audio_path, output_path, duration)
        except Exception as e:
            print(f"FFmpeg video creation error: {e}")
            return self.fallback_video_creation(image_path, audio_path, output_path, duration)

    def add_professional_subtitles(self, video_path, text, output_path, duration, mood):
        """Add professional subtitles using FFmpeg"""
        try:
            # Create subtitle file
            subtitle_file = output_path.replace('.mp4', '.srt')

            # Split text into chunks for better readability
            words = text.split()
            chunks = []
            chunk_size = 8  # Words per subtitle
            for i in range(0, len(words), chunk_size):
                chunk = ' '.join(words[i:i + chunk_size])
                chunks.append(chunk)

            # Create SRT content
            srt_content = ""
            chunk_duration = duration / len(chunks) if chunks else duration
            for i, chunk in enumerate(chunks):
                start_time = i * chunk_duration
                end_time = min((i + 1) * chunk_duration, duration)
                start_srt = self.seconds_to_srt_time(start_time)
                end_srt = self.seconds_to_srt_time(end_time)
                srt_content += f"{i + 1}\n{start_srt} --> {end_srt}\n{chunk}\n\n"

            # Write subtitle file
            with open(subtitle_file, 'w', encoding='utf-8') as f:
                f.write(srt_content)

            # Subtitle style based on mood
            subtitle_styles = {
                "action": "FontSize=24,PrimaryColour=&H00FFFF&,OutlineColour=&H000000&,Outline=2",
                "peaceful": "FontSize=22,PrimaryColour=&HFFFFFF&,OutlineColour=&H000000&,Outline=1",
                "suspense": "FontSize=23,PrimaryColour=&H00FFFF&,OutlineColour=&H800000&,Outline=2"
            }
            style = subtitle_styles.get(mood, "FontSize=22,PrimaryColour=&HFFFFFF&,OutlineColour=&H000000&,Outline=2")

            # Add subtitles to video
            ffmpeg_subtitle_cmd = [
                'ffmpeg', '-y',
                '-i', video_path,
                '-vf', f"subtitles={subtitle_file}:force_style='{style}'",
                '-c:a', 'copy',
                '-c:v', 'libx264',
                '-crf', '18',
                output_path
            ]

            result = subprocess.run(ffmpeg_subtitle_cmd, capture_output=True, text=True, timeout=120)

            # Cleanup subtitle file
            if os.path.exists(subtitle_file):
                os.remove(subtitle_file)

            if result.returncode != 0:
                print(f"Subtitle error: {result.stderr}")
                # Just copy the video without subtitles
                import shutil
                shutil.copy(video_path, output_path)
        except Exception as e:
            print(f"Subtitle addition error: {e}")
            # Fallback: copy video without subtitles
            try:
                import shutil
                shutil.copy(video_path, output_path)
            except:
                pass

    def seconds_to_srt_time(self, seconds):
        """Convert seconds to SRT time format"""
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        secs = int(seconds % 60)
        millis = int((seconds % 1) * 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    def fallback_video_creation(self, image_path, audio_path, output_path, duration):
        """Fallback video creation method"""
        try:
            from moviepy.editor import VideoFileClip, AudioFileClip, ImageClip

            # Create video from image
            image_clip = ImageClip(image_path, duration=duration)
            audio_clip = AudioFileClip(audio_path)

            # Combine
            final_clip = image_clip.set_audio(audio_clip)
            final_clip.write_videofile(
                output_path,
                codec='libx264',
                audio_codec='aac',
                fps=30,
                bitrate='8M'  # High bitrate for quality
            )

            # Cleanup
            image_clip.close()
            audio_clip.close()
            final_clip.close()

            return True
        except Exception as e:
            print(f"Fallback video creation error: {e}")
            return False

    def generate_complete_video(self, video_id, theme, duration):
        """Generate complete high-quality video"""
        try:
            video_status[video_id] = {"status": "initializing", "progress": 5}

            # Create output directory
            output_dir = f"generated_videos/{video_id}"
            os.makedirs(output_dir, exist_ok=True)

            # Generate dynamic story using ChatGPT
            video_status[video_id] = {"status": "generating_story_with_chatgpt", "progress": 10}
            story_data = self.generate_dynamic_hindi_story(theme, duration)

            scenes = story_data["scenes"]
            total_scenes = len(scenes)
            story_title = story_data.get("title", f"Hindi {theme.title()} Story")

            video_status[video_id] = {
                "status": "processing_scenes",
                "progress": 20,
                "total_scenes": total_scenes,
                "story_title": story_title
            }

            scene_videos = []
            total_duration = 0

            # Process each scene
            for i, scene in enumerate(scenes):
                print(f"🎬 Processing scene {i+1}/{total_scenes}: {scene['hindi_text'][:50]}...")

                # Update progress
                progress = 20 + (i / total_scenes) * 65
                video_status[video_id] = {
                    "status": f"processing_scene_{i+1}",
                    "progress": int(progress),
                    "current_scene": i+1,
                    "total_scenes": total_scenes,
                    "story_title": story_title
                }

                # File paths
                audio_path = f"{output_dir}/scene_{i}_audio.wav"
                image_path = f"{output_dir}/scene_{i}_image.png"
                video_path = f"{output_dir}/scene_{i}_video.mp4"

                # Generate enhanced audio
                duration_sec = self.generate_enhanced_scene_audio(
                    scene['hindi_text'],
                    audio_path,
                    scene.get('mood', 'neutral')
                )

                # Generate high-quality image
                success = self.generate_high_quality_scene_image(
                    scene['visual_prompt'],
                    scene.get('mood', 'neutral'),
                    image_path,
                    i
                )

                # Create professional video
                if self.create_professional_video_with_ffmpeg(
                    image_path, audio_path, scene['hindi_text'],
                    video_path, duration_sec, scene.get('mood', 'neutral')
                ):
                    scene_videos.append(video_path)
                    total_duration += duration_sec
                    print(f"✅ Scene {i+1} completed ({duration_sec:.1f}s)")
                else:
                    print(f"❌ Scene {i+1} failed")

            # Combine all scenes with professional transitions
            video_status[video_id] = {"status": "combining_videos", "progress": 90}

            if scene_videos:
                final_path = f"{output_dir}/final_hindi_story_hd.mp4"

                # Use FFmpeg for professional video concatenation
                self.combine_videos_with_ffmpeg(scene_videos, final_path, story_title)

                # Store video info
                video_files[video_id] = {
                    "path": final_path,
                    "title": story_title,
                    "duration": total_duration,
                    "created": datetime.now().isoformat(),
                    "theme": theme,
                    "quality": "1080p",
                    "scenes": total_scenes
                }

                video_status[video_id] = {
                    "status": "completed",
                    "progress": 100,
                    "duration": total_duration,
                    "title": story_title
                }

                print(f"🎉 High-quality video {video_id} generated successfully!")
                return final_path
        except Exception as e:
            video_status[video_id] = {"status": "failed", "progress": 0, "error": str(e)}
            print(f"❌ Video generation failed: {e}")
            return None

    def combine_videos_with_ffmpeg(self, scene_videos, output_path, title):
        """Combine videos using FFmpeg with professional quality"""
        try:
            # Create file list for FFmpeg
            file_list_path = output_path.replace('.mp4', '_filelist.txt')
            with open(file_list_path, 'w') as f:
                for video in scene_videos:
                    f.write(f"file '{os.path.abspath(video)}'\n")

            # FFmpeg concatenation command
            ffmpeg_cmd = [
                'ffmpeg', '-y',
                '-f', 'concat',
                '-safe', '0',
                '-i', file_list_path,
                '-c:v', 'libx264',
                '-preset', 'slow',
                '-crf', '18',  # High quality
                '-c:a', 'aac',
                '-b:a', '320k',
                '-movflags', '+faststart',
                output_path
            ]

            result = subprocess.run(ffmpeg_cmd, capture_output=True, text=True, timeout=600)

            # Cleanup
            if os.path.exists(file_list_path):
                os.remove(file_list_path)

            if result.returncode != 0:
                print(f"FFmpeg concatenation error: {result.stderr}")
                return False

            print(f"✅ Video combined successfully: {output_path}")
            return True
        except Exception as e:
            print(f"Video combination error: {e}")
            return False


# Initialize generator
generator = AdvancedHindiVideoGenerator()

# Enhanced HTML template
HTML_TEMPLATE = """
Create High-Quality YouTube Videos with AI-Powered Stories