garyuzair committed
Commit ad6d387 · verified · 1 Parent(s): eb074a1

Upload 6 files
Files changed (5):
  1. animator.py +37 -13
  2. app.py +175 -51
  3. image_generator.py +208 -279
  4. transcriber.py +33 -17
  5. video_creator.py +17 -6
animator.py CHANGED

@@ -10,11 +10,16 @@ class Animator:
     def __init__(self):
         self.frame_cache = {}
         self.aspect_ratio = "1:1"  # Default aspect ratio
-
+        self.frames_per_animation = 15  # Default number of frames per animation for smoother transitions
+
     def set_aspect_ratio(self, aspect_ratio):
         """Set the aspect ratio for animations"""
         self.aspect_ratio = aspect_ratio
 
+    def set_frames_per_animation(self, frames):
+        """Set the number of frames per animation"""
+        self.frames_per_animation = max(10, min(frames, 20))  # Keep between 10 and 20 frames for balance
+
     def apply_cinematic_effects(self, image):
         """Apply cinematic effects to enhance the frame quality"""
         try:
@@ -65,8 +70,11 @@
             return Image.open(image)
         return image
 
-    def add_zoom_animation(self, image_path, num_frames=10, zoom_factor=1.05, output_dir="temp"):
+    def add_zoom_animation(self, image_path, num_frames=None, zoom_factor=1.05, output_dir="temp"):
         """Add a simple zoom animation to an image with cinematic effects"""
+        if num_frames is None:
+            num_frames = self.frames_per_animation
+
         # Check cache first
         cache_key = f"zoom_{image_path}_{num_frames}_{zoom_factor}_{self.aspect_ratio}"
         if cache_key in self.frame_cache:
@@ -102,8 +110,11 @@
         self.frame_cache[cache_key] = frames
         return frames
 
-    def add_pan_animation(self, image_path, num_frames=10, direction="right", output_dir="temp"):
+    def add_pan_animation(self, image_path, num_frames=None, direction="right", output_dir="temp"):
         """Add a simple panning animation to an image with cinematic effects"""
+        if num_frames is None:
+            num_frames = self.frames_per_animation
+
         # Check cache first
         cache_key = f"pan_{image_path}_{num_frames}_{direction}_{self.aspect_ratio}"
         if cache_key in self.frame_cache:
@@ -165,8 +176,11 @@
         self.frame_cache[cache_key] = frames
         return frames
 
-    def add_fade_animation(self, image_path, num_frames=10, fade_type="in", output_dir="temp"):
+    def add_fade_animation(self, image_path, num_frames=None, fade_type="in", output_dir="temp"):
         """Add a fade in/out animation to an image with cinematic effects"""
+        if num_frames is None:
+            num_frames = self.frames_per_animation
+
         # Check cache first
         cache_key = f"fade_{image_path}_{num_frames}_{fade_type}_{self.aspect_ratio}"
         if cache_key in self.frame_cache:
@@ -207,8 +221,11 @@
         self.frame_cache[cache_key] = frames
         return frames
 
-    def add_ken_burns_effect(self, image_path, num_frames=10, output_dir="temp"):
+    def add_ken_burns_effect(self, image_path, num_frames=None, output_dir="temp"):
         """Add a Ken Burns effect (combination of pan and zoom) with cinematic effects"""
+        if num_frames is None:
+            num_frames = self.frames_per_animation
+
         # Check cache first
         cache_key = f"kenburns_{image_path}_{num_frames}_{self.aspect_ratio}"
         if cache_key in self.frame_cache:
@@ -279,8 +296,11 @@
         self.frame_cache[cache_key] = frames
         return frames
 
-    def animate_single_image(self, img_path, animation_type="random", output_dir="temp"):
+    def animate_single_image(self, img_path, animation_type="random", output_dir="temp", num_frames=None):
         """Animate a single image with cinematic effects"""
+        if num_frames is None:
+            num_frames = self.frames_per_animation
+
         # Choose animation type
         animation_types = ["zoom", "pan_right", "pan_left", "fade_in", "ken_burns"]
 
@@ -302,21 +322,24 @@
 
         # Apply the chosen animation
         if chosen_type == "ken_burns":
-            frames = self.add_ken_burns_effect(img_path, output_dir=output_dir)
+            frames = self.add_ken_burns_effect(img_path, num_frames=num_frames, output_dir=output_dir)
         elif chosen_type.startswith("pan"):
             direction = chosen_type.split("_")[1] if "_" in chosen_type else "right"
-            frames = self.add_pan_animation(img_path, direction=direction, output_dir=output_dir)
+            frames = self.add_pan_animation(img_path, num_frames=num_frames, direction=direction, output_dir=output_dir)
         elif chosen_type.startswith("fade"):
            fade_type = chosen_type.split("_")[1] if "_" in chosen_type else "in"
-            frames = self.add_fade_animation(img_path, fade_type=fade_type, output_dir=output_dir)
+            frames = self.add_fade_animation(img_path, num_frames=num_frames, fade_type=fade_type, output_dir=output_dir)
         else:  # Default to zoom
-            frames = self.add_zoom_animation(img_path, output_dir=output_dir)
+            frames = self.add_zoom_animation(img_path, num_frames=num_frames, output_dir=output_dir)
 
         return frames
 
     def animate_images(self, image_paths, animation_type="random", output_dir="temp",
-                       progress_callback=None, parallel=False, max_workers=4, batch_size=2):
+                       progress_callback=None, parallel=False, max_workers=4, batch_size=2, num_frames=None):
         """Add animations to a list of images with parallel processing and batching"""
+        if num_frames is None:
+            num_frames = self.frames_per_animation
+
         all_animated_frames = []
 
         if parallel and len(image_paths) > 1:
@@ -325,7 +348,8 @@
             # Create a partial function with fixed parameters
             animate_func = partial(self.animate_single_image,
                                    animation_type=animation_type,
-                                   output_dir=output_dir)
+                                   output_dir=output_dir,
+                                   num_frames=num_frames)
 
             # Process images in parallel
             if progress_callback:
@@ -343,7 +367,7 @@
 
                 batch_frames = []
                 for img_path in batch:
-                    frames = self.animate_single_image(img_path, animation_type, output_dir)
+                    frames = self.animate_single_image(img_path, animation_type, output_dir, num_frames)
                     batch_frames.append(frames)
 
                 all_animated_frames.extend(batch_frames)
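
Reviewer note: a minimal usage sketch of the new frame-count plumbing (the default of 15 and the clamp to the 10-20 range come from this diff; the image path is a hypothetical placeholder):

    from animator import Animator

    animator = Animator()
    animator.set_aspect_ratio("16:9")
    animator.set_frames_per_animation(18)  # the setter clamps into [10, 20]

    # num_frames now falls back to animator.frames_per_animation when omitted
    frames = animator.animate_single_image("temp/image_001.png", animation_type="ken_burns")

    # An explicit num_frames still overrides the instance default
    frames = animator.add_zoom_animation("temp/image_001.png", num_frames=12)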
app.py CHANGED

@@ -7,6 +7,7 @@ from functools import partial
 import torch
 import hashlib
 from PIL import Image, ImageDraw
+import gc
 
 from transcriber import AudioTranscriber
 from prompt_generator import PromptGenerator
@@ -128,7 +129,17 @@ def generate_prompt_for_segment(transcription, prompt_generator, aspect_ratio="1:1"):
 def generate_image_for_prompt(prompt, image_generator):
     """Generate an image for a single prompt in parallel"""
     try:
-        return image_generator.generate_image(prompt)
+        # Force garbage collection before generating each image
+        gc.collect()
+        torch.cuda.empty_cache() if torch.cuda.is_available() else None
+
+        image_path = image_generator.generate_image(prompt)
+
+        # Force garbage collection after generating each image
+        gc.collect()
+        torch.cuda.empty_cache() if torch.cuda.is_available() else None
+
+        return image_path
     except Exception as e:
         st.warning(f"Error generating image: {str(e)}. Using fallback image.")
         # Create a fallback image
@@ -140,10 +151,10 @@ def generate_image_for_prompt(prompt, image_generator):
     img.save(path)
     return path
 
-def animate_image(image_path, animator, animation_type="random"):
+def animate_image(image_path, animator, animation_type="random", num_frames=15):
     """Animate a single image in parallel"""
     try:
-        return animator.animate_single_image(image_path, animation_type)
+        return animator.animate_single_image(image_path, animation_type, num_frames=num_frames)
     except Exception as e:
         st.warning(f"Error animating image: {str(e)}. Using static frames.")
         # Create a sequence of identical frames as fallback
@@ -197,17 +208,44 @@ def main():
                                     help="Number of simultaneous tasks (higher values may use more memory)")
             use_caching = st.toggle("Enable result caching", value=True,
                                     help="Save results to speed up repeated conversions")
+
+            # Memory optimization settings
+            memory_optimization = st.toggle("Enable memory optimization", value=True,
+                                            help="Reduce memory usage (recommended for Hugging Face Spaces)")
 
         # Content settings
         st.markdown("### 🎨 Content")
         with st.expander("Segmentation", expanded=True):
-            num_segments = st.slider("Number of segments", min_value=2, max_value=10, value=5,
-                                     help="How many scenes to create in your video")
+            # New setting for maximum segment duration
+            max_segment_duration = st.slider(
+                "Maximum image duration (seconds)",
+                min_value=1.0,
+                max_value=5.0,
+                value=5.0,
+                step=0.5,
+                help="Maximum time each image will stay on screen (5 seconds or less)"
+            )
+
+            # Adjust number of segments based on max duration
+            st.info("More images will be created to ensure each stays under the maximum duration")
+
+            num_segments = st.slider("Minimum number of segments", min_value=2, max_value=20, value=5,
+                                     help="Minimum number of scenes to create in your video")
+
             animation_type = st.selectbox(
                 "Animation style",
                 ["random", "zoom", "pan_right", "pan_left", "fade_in", "ken_burns"],
                 help="Choose how images will animate in your video"
             )
+
+            # Animation frames setting
+            frames_per_animation = st.slider(
+                "Animation smoothness",
+                min_value=10,
+                max_value=20,
+                value=15,
+                help="Higher values create smoother animations but may increase processing time"
+            )
 
         # Advanced settings
         st.markdown("### 🔧 Advanced")
@@ -260,6 +298,7 @@ def main():
 
     Optimized for Hugging Face Spaces with:
     - Multiple video formats (16:9, 1:1, 9:16)
+    - Dynamic image timing (5 seconds or less)
     - Parallel processing
     - Memory-efficient models
     - Result caching
@@ -278,7 +317,7 @@ def main():
 
     # Generate a cache key based on the audio file and settings
     audio_bytes = audio_file.getvalue()
-    settings_str = f"{num_segments}_{animation_type}_{base_image_size}_{inference_steps}_{video_quality}_{selected_aspect_ratio}"
+    settings_str = f"{num_segments}_{max_segment_duration}_{animation_type}_{frames_per_animation}_{base_image_size}_{inference_steps}_{video_quality}_{selected_aspect_ratio}_{memory_optimization}"
     cache_key = hashlib.md5((hashlib.md5(audio_bytes).hexdigest() + settings_str).encode()).hexdigest()
 
     # Process button with better styling
@@ -325,6 +364,11 @@ def main():
         status_message = st.empty()
 
         try:
+            # Force garbage collection before starting
+            if memory_optimization:
+                gc.collect()
+                torch.cuda.empty_cache() if torch.cuda.is_available() else None
+
             # Step 1: Initialize components
             status_text.text("Initializing components...")
             status_message.markdown("🔄 **Setting up AI models...**")
@@ -339,6 +383,13 @@ def main():
             animator.set_aspect_ratio(selected_aspect_ratio)
             video_creator.set_aspect_ratio(selected_aspect_ratio)
 
+            # Set maximum segment duration
+            transcriber.set_max_segment_duration(max_segment_duration)
+            video_creator.set_max_segment_duration(max_segment_duration)
+
+            # Set animation frames
+            animator.set_frames_per_animation(frames_per_animation)
+
             # Calculate actual image size based on aspect ratio
             actual_image_size = image_generator.get_size_for_aspect_ratio(base_image_size, selected_aspect_ratio)
 
@@ -359,7 +410,7 @@ def main():
                 import numpy as np
                 audio_segments = [np.zeros(16000) for _ in range(num_segments)]  # 1-second silent segments
                 total_duration = 5 * num_segments  # Assume 5 seconds per segment
-                timestamps = [(i*5, (i+1)*5) for i in range(num_segments)]
+                timestamps = [(i*5, min((i+1)*5, i*5+max_segment_duration)) for i in range(num_segments)]
 
             progress_bar.progress(15)
 
@@ -382,6 +433,11 @@ def main():
                     st.warning(f"Error transcribing segment: {str(e)}. Using empty transcription.")
                     transcriptions.append("")
 
+            # Force garbage collection after transcription
+            if memory_optimization:
+                gc.collect()
+                torch.cuda.empty_cache() if torch.cuda.is_available() else None
+
             # Display transcriptions with better styling
             progress_bar.progress(30)
             st.markdown("### 📝 Transcriptions")
@@ -423,32 +479,50 @@ def main():
                     </div>
                     """, unsafe_allow_html=True)
 
-            # Step 4: Generate images in parallel
+            # Step 4: Generate images in parallel or batches
             status_text.text("Generating images from prompts...")
             status_message.markdown("🎨 **Creating images...**")
-            if parallel_processing:
-                with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                    # Create a partial function with the image generator
-                    image_func = partial(generate_image_for_prompt, image_generator=image_generator)
-                    # Generate images in parallel
-                    images = list(executor.map(image_func, prompts))
-            else:
+
+            # For memory optimization, process in smaller batches even with parallel processing
+            if memory_optimization:
+                batch_size = 2  # Process only 2 images at a time to conserve memory
                 images = []
-                for i, prompt in enumerate(prompts):
-                    status_text.text(f"Generating image {i+1}/{len(prompts)}...")
-                    try:
-                        img_path = image_generator.generate_image(prompt)
-                        images.append(img_path)
-                    except Exception as e:
-                        st.warning(f"Error generating image: {str(e)}. Using fallback image.")
-                        # Create a fallback image
-                        from PIL import Image, ImageDraw
-                        img = Image.new('RGB', image_generator.target_size, color=(240, 240, 240))
-                        draw = ImageDraw.Draw(img)
-                        draw.text((10, 10), prompt[:50], fill=(0, 0, 0))
-                        path = f"temp/fallback_{int(time.time() * 1000)}.png"
-                        img.save(path)
-                        images.append(path)
+
+                for i in range(0, len(prompts), batch_size):
+                    batch_prompts = prompts[i:i+batch_size]
+                    status_text.text(f"Generating images {i+1}-{min(i+batch_size, len(prompts))}/{len(prompts)}...")
+
+                    if parallel_processing and batch_size > 1:
+                        with concurrent.futures.ThreadPoolExecutor(max_workers=min(batch_size, max_workers)) as executor:
+                            # Create a partial function with the image generator
+                            image_func = partial(generate_image_for_prompt, image_generator=image_generator)
+                            # Generate images in parallel within the batch
+                            batch_images = list(executor.map(image_func, batch_prompts))
+                    else:
+                        batch_images = []
+                        for prompt in batch_prompts:
+                            img_path = generate_image_for_prompt(prompt, image_generator)
+                            batch_images.append(img_path)
+
+                    images.extend(batch_images)
+
+                    # Force garbage collection after each batch
+                    gc.collect()
+                    torch.cuda.empty_cache() if torch.cuda.is_available() else None
+            else:
+                # Standard processing without special memory considerations
+                if parallel_processing:
+                    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+                        # Create a partial function with the image generator
+                        image_func = partial(generate_image_for_prompt, image_generator=image_generator)
+                        # Generate images in parallel
+                        images = list(executor.map(image_func, prompts))
+                else:
+                    images = []
+                    for i, prompt in enumerate(prompts):
+                        status_text.text(f"Generating image {i+1}/{len(prompts)}...")
+                        img_path = generate_image_for_prompt(prompt, image_generator)
+                        images.append(img_path)
 
             # Display images with better styling
             progress_bar.progress(60)
@@ -458,32 +532,73 @@ def main():
                 with image_cols[i % len(image_cols)]:
                     st.image(img_path, caption=f"Image {i+1}", use_column_width=True)
 
-            # Step 5: Add animations in parallel
+            # Force garbage collection after image generation
+            if memory_optimization:
+                gc.collect()
+                torch.cuda.empty_cache() if torch.cuda.is_available() else None
+
+            # Step 5: Add animations in parallel or batches
             status_text.text("Adding animations to images...")
             status_message.markdown("✨ **Adding animations...**")
-            if parallel_processing:
-                with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                    # Create a partial function with the animator and animation type
-                    animate_func = partial(animate_image, animator=animator, animation_type=animation_type)
-                    # Animate images in parallel
-                    animated_frames = list(executor.map(animate_func, images))
-            else:
+
+            # For memory optimization, process in smaller batches
+            if memory_optimization:
+                batch_size = 3  # Process only 3 animations at a time
                 animated_frames = []
-                for i, img_path in enumerate(images):
-                    status_text.text(f"Animating image {i+1}/{len(images)}...")
-                    try:
-                        frames = animator.animate_single_image(img_path, animation_type)
-                        animated_frames.append(frames)
-                    except Exception as e:
-                        st.warning(f"Error animating image: {str(e)}. Using static frames.")
-                        # Create a sequence of identical frames as fallback
-                        frames = []
-                        for _ in range(10):
-                            frames.append(img_path)
+
+                for i in range(0, len(images), batch_size):
+                    batch_images = images[i:i+batch_size]
+                    status_text.text(f"Animating images {i+1}-{min(i+batch_size, len(images))}/{len(images)}...")
+
+                    if parallel_processing and batch_size > 1:
+                        with concurrent.futures.ThreadPoolExecutor(max_workers=min(batch_size, max_workers)) as executor:
+                            # Create a partial function with the animator, animation type, and frames
+                            animate_func = partial(animate_image,
+                                                   animator=animator,
+                                                   animation_type=animation_type,
+                                                   num_frames=frames_per_animation)
+                            # Animate images in parallel within the batch
+                            batch_frames = list(executor.map(animate_func, batch_images))
+                    else:
+                        batch_frames = []
+                        for img_path in batch_images:
+                            frames = animate_image(img_path, animator, animation_type, frames_per_animation)
+                            batch_frames.append(frames)
+
+                    animated_frames.extend(batch_frames)
+
+                    # Force garbage collection after each batch
+                    gc.collect()
+                    torch.cuda.empty_cache() if torch.cuda.is_available() else None
+            else:
+                # Standard processing without special memory considerations
+                if parallel_processing:
+                    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+                        # Create a partial function with the animator, animation type, and frames
+                        animate_func = partial(animate_image,
+                                               animator=animator,
+                                               animation_type=animation_type,
+                                               num_frames=frames_per_animation)
+                        # Animate images in parallel
+                        animated_frames = list(executor.map(animate_func, images))
+                else:
+                    animated_frames = []
+                    for i, img_path in enumerate(images):
+                        status_text.text(f"Animating image {i+1}/{len(images)}...")
+                        frames = animator.animate_single_image(
+                            img_path,
+                            animation_type,
+                            num_frames=frames_per_animation
+                        )
                         animated_frames.append(frames)
 
             progress_bar.progress(80)
 
+            # Force garbage collection before video creation
+            if memory_optimization:
+                gc.collect()
+                torch.cuda.empty_cache() if torch.cuda.is_available() else None
+
             # Step 6: Create video
             status_text.text("Creating final video...")
             status_message.markdown("🎬 **Assembling video...**")
@@ -492,7 +607,7 @@ def main():
                 audio_file,
                 segments=transcriptions,
                 timestamps=timestamps,
-                parallel=parallel_processing,
+                parallel=parallel_processing and not memory_optimization,  # Disable parallel for memory optimization
                 max_workers=max_workers
             )
 
@@ -510,7 +625,7 @@ def main():
             output_video = video_creator.optimize_video(
                 output_video,
                 bitrate=bitrate,
-                threads=max_workers
+                threads=2 if memory_optimization else max_workers  # Use fewer threads for memory optimization
             )
 
             # Cache the result if caching is enabled
@@ -541,7 +656,10 @@ def main():
             st.markdown("### ⏱️ Performance Metrics")
             st.info(f"""
             - Video Format: {aspect_ratio}
+            - Max Image Duration: {max_segment_duration} seconds
+            - Number of Segments: {len(audio_segments)}
             - Parallel Processing: {'Enabled' if parallel_processing else 'Disabled'}
+            - Memory Optimization: {'Enabled' if memory_optimization else 'Disabled'}
             - Workers: {max_workers}
             - Image Size: {actual_image_size[0]}x{actual_image_size[1]}
             - Inference Steps: {inference_steps}
@@ -557,6 +675,11 @@ def main():
             except:
                 pass
 
+            # Final garbage collection
+            if memory_optimization:
+                gc.collect()
+                torch.cuda.empty_cache() if torch.cuda.is_available() else None
+
             status_text.text("All done! Your video is ready for download.")
 
         except Exception as e:
@@ -566,9 +689,10 @@ def main():
             # Provide troubleshooting tips
             st.markdown("### 🔧 Troubleshooting Tips")
             st.info("""
-            - Try reducing the number of segments
+            - Try enabling memory optimization
             - Use a smaller image size
             - Reduce inference steps
+            - Reduce the number of segments
            - Make sure your audio file is in a supported format
            - Clear the cache and try again
            """)
image_generator.py CHANGED

@@ -1,104 +1,53 @@
 import streamlit as st
-import torch
 import os
-import numpy as np
-from PIL import Image, ImageEnhance, ImageFilter
+import tempfile
+from PIL import Image
+import torch
 import time
-from concurrent.futures import ThreadPoolExecutor
-from functools import partial
+import numpy as np
+import gc
 
 class ImageGenerator:
     def __init__(self):
         self.model = None
+        self.processor = None
+        self.target_size = (512, 512)
         self.inference_steps = 20
-        self.target_size = (384, 384)
+        self.guidance_scale = 7.5
         self.aspect_ratio = "1:1"  # Default aspect ratio
+        self.image_cache = {}
 
-    def load_model(self):
-        """Load a lightweight image generation model"""
-        if self.model is None:
-            with st.spinner("Loading image generation model... This may take a moment."):
-                try:
-                    # Using a lightweight model for image generation
-                    from diffusers import StableDiffusionPipeline
-
-                    model_id = "sd-legacy/stable-diffusion-v1-5"
-
-                    # Load with memory optimization settings
-                    self.model = StableDiffusionPipeline.from_pretrained(
-                        model_id,
-                        torch_dtype=torch.float32,
-                        safety_checker=None,
-                        requires_safety_checker=False,
-                        low_cpu_mem_usage=True
-                    )
-
-                    # Use CPU for inference to save memory
-                    self.model = self.model.to("cpu")
-
-                    # Enable memory efficient attention if available
-                    if hasattr(self.model, 'enable_attention_slicing'):
-                        self.model.enable_attention_slicing()
-
-                    # Enable memory efficient attention
-                    if hasattr(self.model, 'enable_vae_slicing'):
-                        self.model.enable_vae_slicing()
-
-                    # Enable xformers memory efficient attention if available
-                    try:
-                        if hasattr(self.model, 'enable_xformers_memory_efficient_attention'):
-                            self.model.enable_xformers_memory_efficient_attention()
-                    except:
-                        pass
-                except Exception as e:
-                    st.warning(f"Error loading image generation model: {str(e)}. Using fallback method.")
-                    self.model = None
-
-        return self.model
-
-    def set_inference_steps(self, steps):
-        """Set the number of inference steps"""
-        self.inference_steps = steps
+    def set_aspect_ratio(self, aspect_ratio):
+        """Set the aspect ratio for image generation"""
+        self.aspect_ratio = aspect_ratio
 
     def set_target_size(self, size):
-        """Set the target image size"""
+        """Set the target size for generated images"""
         self.target_size = size
 
-    def set_aspect_ratio(self, aspect_ratio):
-        """Set the aspect ratio for generated images"""
-        self.aspect_ratio = aspect_ratio
-
-        # Update target size based on aspect ratio while maintaining total pixels
-        base_pixels = self.target_size[0] * self.target_size[1]
-
-        if aspect_ratio == "1:1":
-            # Square format
-            side = int(np.sqrt(base_pixels))
-            self.target_size = (side, side)
-        elif aspect_ratio == "16:9":
-            # Landscape format
-            width = int(np.sqrt(base_pixels * 16 / 9))
-            height = int(width * 9 / 16)
-            self.target_size = (width, height)
-        elif aspect_ratio == "9:16":
-            # Portrait format
-            height = int(np.sqrt(base_pixels * 16 / 9))
-            width = int(height * 9 / 16)
-            self.target_size = (width, height)
-
-    def get_size_for_aspect_ratio(self, base_size, aspect_ratio):
-        """Calculate dimensions for a given aspect ratio while maintaining approximate total pixels"""
+    def set_inference_steps(self, steps):
+        """Set the number of inference steps for image generation"""
+        self.inference_steps = steps
+
+    def get_size_for_aspect_ratio(self, base_size, aspect_ratio=None):
+        """Calculate image dimensions based on aspect ratio"""
+        if aspect_ratio is None:
+            aspect_ratio = self.aspect_ratio
+
+        # Calculate base pixels (total pixels in the image)
         base_pixels = base_size[0] * base_size[1]
 
         if aspect_ratio == "1:1":
             # Square format
             side = int(np.sqrt(base_pixels))
+            # Ensure even dimensions for compatibility
+            side = side if side % 2 == 0 else side + 1
             return (side, side)
         elif aspect_ratio == "16:9":
             # Landscape format
             width = int(np.sqrt(base_pixels * 16 / 9))
             height = int(width * 9 / 16)
-            # Ensure dimensions are even numbers for video compatibility
+            # Ensure even dimensions for compatibility
             width = width if width % 2 == 0 else width + 1
             height = height if height % 2 == 0 else height + 1
             return (width, height)
@@ -106,242 +55,222 @@ class ImageGenerator:
             # Portrait format
             height = int(np.sqrt(base_pixels * 16 / 9))
             width = int(height * 9 / 16)
-            # Ensure dimensions are even numbers for video compatibility
+            # Ensure even dimensions for compatibility
             width = width if width % 2 == 0 else width + 1
             height = height if height % 2 == 0 else height + 1
             return (width, height)
         else:
             # Default to original size
             return base_size
 
-    def apply_cinematic_effects(self, image):
-        """Apply cinematic effects to enhance the image quality"""
-        try:
-            # Enhance contrast slightly
-            enhancer = ImageEnhance.Contrast(image)
-            image = enhancer.enhance(1.2)
-
-            # Enhance color saturation slightly
-            enhancer = ImageEnhance.Color(image)
-            image = enhancer.enhance(1.1)
-
-            # Add subtle vignette effect
-            # Create a radial gradient mask
-            mask = Image.new('L', image.size, 255)
-            draw = ImageDraw.Draw(mask)
-
-            width, height = image.size
-            center_x, center_y = width // 2, height // 2
-            max_radius = min(width, height) // 2
-
-            for y in range(height):
-                for x in range(width):
-                    # Calculate distance from center
-                    distance = np.sqrt((x - center_x)**2 + (y - center_y)**2)
-                    # Create vignette effect (darker at edges)
-                    intensity = int(255 * (1 - 0.3 * (distance / max_radius)**2))
-                    mask.putpixel((x, y), intensity)
-
-            # Apply the mask
-            image = Image.composite(image, Image.new('RGB', image.size, (0, 0, 0)), mask)
-
-            # Add subtle film grain
-            grain = Image.effect_noise((image.width, image.height), 10)
-            grain = grain.convert('L')
-            grain = grain.filter(ImageFilter.GaussianBlur(radius=1))
-            image = Image.blend(image, Image.composite(image, Image.new('RGB', image.size, (128, 128, 128)), grain), 0.05)
-
-            return image
-        except Exception as e:
-            # If effects fail, return original image
-            return image
-
-    def generate_image(self, prompt, output_dir="temp"):
-        """Generate a single image from a prompt"""
+    def load_model(self):
+        """Load the image generation model with optimizations for CPU"""
+        if self.model is None:
+            with st.spinner("Loading image generation model..."):
+                try:
+                    # Force garbage collection before loading model
+                    gc.collect()
+                    torch.cuda.empty_cache() if torch.cuda.is_available() else None
+
+                    from diffusers import StableDiffusionPipeline
+
+                    # Use the correct model ID as specified
+                    model_id = "sd-legacy/stable-diffusion-v1-5"
+
+                    # For CPU-only environments like Hugging Face Spaces free tier
+                    self.model = StableDiffusionPipeline.from_pretrained(
+                        model_id,
+                        torch_dtype=torch.float32,  # Use float32 for CPU
+                        safety_checker=None,  # Disable safety checker for speed
+                        low_cpu_mem_usage=True,  # Optimize for low memory
+                        revision="fp16"  # Use fp16 weights but convert to fp32
+                    )
+
+                    # Optimize for CPU
+                    self.model = self.model.to("cpu")
+
+                    # Enable memory efficient attention
+                    if hasattr(self.model, "enable_attention_slicing"):
+                        self.model.enable_attention_slicing(1)
+
+                    # Enable sequential CPU offload if available
+                    if hasattr(self.model, "enable_sequential_cpu_offload"):
+                        self.model.enable_sequential_cpu_offload()
+
+                    # Enable model CPU offloading if available
+                    if hasattr(self.model, "enable_model_cpu_offload"):
+                        self.model.enable_model_cpu_offload()
+
+                    # Use smaller VAE scale factor for memory efficiency
+                    if hasattr(self.model, "vae") and hasattr(self.model.vae, "config"):
+                        if hasattr(self.model.vae.config, "scaling_factor"):
+                            self.model.vae.config.scaling_factor = 0.18215  # Default value, explicitly set
+
+                except Exception as e:
+                    st.warning(f"Error loading image generation model: {str(e)}. Using fallback method.")
+                    self.model = None
+
+        return self.model
+
+    def generate_image(self, prompt, negative_prompt="blurry, bad quality, distorted, disfigured, low resolution"):
+        """Generate an image from a text prompt"""
+        # Generate a cache key based on the prompt and settings
+        import hashlib
+        cache_key = f"{hashlib.md5(prompt.encode()).hexdigest()}_{self.target_size}_{self.inference_steps}_{self.guidance_scale}_{self.aspect_ratio}"
+
+        # Check if result is in cache
+        if cache_key in self.image_cache:
+            return self.image_cache[cache_key]
+
         # Ensure output directory exists
-        os.makedirs(output_dir, exist_ok=True)
+        os.makedirs("temp", exist_ok=True)
 
         try:
             # Load the model if not already loaded
             model = self.load_model()
 
             if model is not None:
-                # Generate image with minimal inference steps to save resources
-                image = model(
-                    prompt,
-                    num_inference_steps=self.inference_steps,
-                    guidance_scale=7.5
-                ).images[0]
+                # Enhance the prompt with aspect ratio-specific details
+                enhanced_prompt = self.enhance_prompt_for_aspect_ratio(prompt)
 
-                # Resize to target size for consistency and performance
-                if image.size != self.target_size:
-                    image = image.resize(self.target_size, Image.LANCZOS)
-
-                # Apply cinematic effects
-                image = self.apply_cinematic_effects(image)
-            else:
-                # Fallback: Create a colored gradient image with text
-                from PIL import Image, ImageDraw, ImageFilter
+                # Force garbage collection before inference
+                gc.collect()
+                torch.cuda.empty_cache() if torch.cuda.is_available() else None
 
-                # Create a base image with gradient background
-                image = Image.new('RGB', self.target_size, color=(240, 240, 240))
-                draw = ImageDraw.Draw(image)
+                # Generate the image
+                with torch.no_grad():  # Disable gradient calculation for memory efficiency
+                    # Use lower precision during inference
+                    with torch.autocast("cpu"):
+                        image = model(
+                            prompt=enhanced_prompt,
+                            negative_prompt=negative_prompt,
+                            num_inference_steps=self.inference_steps,
+                            guidance_scale=self.guidance_scale,
+                            width=self.target_size[0],
+                            height=self.target_size[1]
+                        ).images[0]
 
-                # Create a gradient background
-                for y in range(image.height):
-                    for x in range(image.width):
-                        # Create a simple gradient
-                        r = int(200 + (x * 55 / image.width))
-                        g = int(200 + (y * 55 / image.height))
-                        b = 240
-                        draw.point((x, y), fill=(r, g, b))
+                # Save the image to a temporary file
+                output_path = f"temp/image_{int(time.time() * 1000)}.png"
+                image.save(output_path)
 
-                # Add some noise/texture
-                image = image.filter(ImageFilter.GaussianBlur(radius=1))
+                # Force garbage collection after inference
+                gc.collect()
+                torch.cuda.empty_cache() if torch.cuda.is_available() else None
 
-                # Add text from prompt (truncated)
-                draw = ImageDraw.Draw(image)
-                text = prompt[:50] + "..." if len(prompt) > 50 else prompt
+                # Cache the result
+                self.image_cache[cache_key] = output_path
 
-                # Position text
-                text_width = draw.textlength(text, font=None)
-                text_position = ((image.width - text_width) / 2, image.height / 2)
-
-                # Draw text
-                draw.text(text_position, text, fill=(0, 0, 0))
-
+                return output_path
+            else:
+                # Fallback: Create a simple image with text
+                return self.create_fallback_image(prompt)
         except Exception as e:
             st.warning(f"Error generating image: {str(e)}. Using fallback method.")
-
-            # Fallback: Create a colored gradient image with text
-            from PIL import Image, ImageDraw, ImageFilter
-
-            # Create a base image with gradient background
-            image = Image.new('RGB', self.target_size, color=(240, 240, 240))
-            draw = ImageDraw.Draw(image)
-
-            # Create a gradient background
-            for y in range(image.height):
-                for x in range(image.width):
-                    # Create a simple gradient
-                    r = int(200 + (x * 55 / image.width))
-                    g = int(200 + (y * 55 / image.height))
-                    b = 240
-                    draw.point((x, y), fill=(r, g, b))
-
-            # Add some noise/texture
-            image = image.filter(ImageFilter.GaussianBlur(radius=1))
-
-            # Add text from prompt (truncated)
-            draw = ImageDraw.Draw(image)
-            text = prompt[:50] + "..." if len(prompt) > 50 else prompt
-
-            # Position text
-            text_width = draw.textlength(text, font=None)
-            text_position = ((image.width - text_width) / 2, image.height / 2)
-
-            # Draw text
-            draw.text(text_position, text, fill=(0, 0, 0))
-
-        # Save the image
-        image_path = f"{output_dir}/image_{int(time.time() * 1000)}.png"
-        image.save(image_path)
-
-        return image_path
-
-    def generate_images(self, prompts, output_dir="temp", progress_callback=None, parallel=False, max_workers=4):
-        """Generate images from the prompts"""
-        # Ensure output directory exists
-        os.makedirs(output_dir, exist_ok=True)
-
-        if parallel and len(prompts) > 1:
-            # Generate images in parallel
-            with ThreadPoolExecutor(max_workers=max_workers) as executor:
-                # Create a partial function with fixed parameters
-                generate_func = partial(self.generate_image, output_dir=output_dir)
-
-                # Process prompts in parallel and collect results
-                if progress_callback:
-                    progress_callback("Generating images in parallel...")
-
-                images = list(executor.map(generate_func, prompts))
-        else:
-            # Generate images sequentially
-            images = []
-            for i, prompt in enumerate(prompts):
-                if progress_callback:
-                    progress_callback(f"Generating image {i+1}/{len(prompts)}...")
-
-                image_path = self.generate_image(prompt, output_dir)
-                images.append(image_path)
-
-        return images
-
-    def optimize_image(self, image_path, target_size=None):
-        """Optimize image size for video creation"""
-        if target_size is None:
-            target_size = self.target_size
-
-        img = Image.open(image_path)
-
-        # Resize to target size
-        img = img.resize(target_size, Image.LANCZOS)
-
-        # Apply cinematic effects
-        img = self.apply_cinematic_effects(img)
-
-        # Save optimized image
-        img.save(image_path)
-
-        return image_path
-
-    def optimize_all_images(self, image_paths, target_size=None, parallel=False, max_workers=4):
-        """Optimize all images for video creation"""
-        if target_size is None:
-            target_size = self.target_size
-
-        if parallel and len(image_paths) > 1:
-            # Optimize images in parallel
-            with ThreadPoolExecutor(max_workers=max_workers) as executor:
-                # Create a partial function with fixed parameters
-                optimize_func = partial(self.optimize_image, target_size=target_size)
-
-                # Process images in parallel
-                optimized_paths = list(executor.map(optimize_func, image_paths))
-        else:
-            # Optimize images sequentially
-            optimized_paths = []
-            for path in image_paths:
-                optimized_path = self.optimize_image(path, target_size)
-                optimized_paths.append(optimized_path)
-
-        return optimized_paths
-
-    def batch_generate_images(self, prompts, batch_size=2, output_dir="temp", progress_callback=None):
-        """Generate images in batches to optimize memory usage"""
-        # Ensure output directory exists
-        os.makedirs(output_dir, exist_ok=True)
-
-        images = []
-
-        # Process prompts in batches
-        for i in range(0, len(prompts), batch_size):
-            batch_prompts = prompts[i:i+batch_size]
-
-            if progress_callback:
-                progress_callback(f"Generating batch {i//batch_size + 1}/{(len(prompts) + batch_size - 1)//batch_size}...")
-
-            # Generate images for this batch
-            batch_images = []
-            for j, prompt in enumerate(batch_prompts):
-                image_path = self.generate_image(prompt, output_dir)
-                batch_images.append(image_path)
-
-            # Add batch results to overall results
-            images.extend(batch_images)
-
-            # Clear CUDA cache if using GPU
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-
-        return images
+            return self.create_fallback_image(prompt)
+
+    def enhance_prompt_for_aspect_ratio(self, prompt):
+        """Enhance the prompt based on the selected aspect ratio"""
+        # Base enhancement for all prompts
+        base_enhancement = "hyper realistic, photo realistic, ultra detailed, hyper detailed textures, 8K resolution"
+
+        # Add cinematic lighting
+        lighting_options = [
+            "golden hour glow", "moody overcast", "dramatic lighting",
+            "soft natural light", "cinematic lighting", "film noir shadows"
+        ]
+
+        # Add camera effects
+        camera_effects = [
+            "shallow depth of field", "motion blur", "film grain",
+            "professional photography", "award winning photograph"
+        ]
+
+        # Add environmental details
+        environmental_details = [
+            "atmospheric", "detailed environment", "rich textures",
+            "detailed background", "immersive scene"
+        ]
+
+        # Select enhancements based on aspect ratio
+        import random
+        random.seed(hash(prompt))  # Use prompt as seed for deterministic selection
+
+        selected_lighting = random.choice(lighting_options)
+        selected_effect = random.choice(camera_effects)
+        selected_detail = random.choice(environmental_details)
+
+        # Aspect ratio specific enhancements
+        if self.aspect_ratio == "16:9":
+            # Landscape format - cinematic, wide view
+            aspect_enhancement = "cinematic wide shot, landscape composition, panoramic view"
+        elif self.aspect_ratio == "9:16":
+            # Portrait format - vertical composition
+            aspect_enhancement = "vertical composition, portrait framing, tall perspective"
+        else:
+            # Square format - balanced composition
+            aspect_enhancement = "balanced composition, centered framing, square format"
+
+        # Combine all enhancements
+        enhanced_prompt = f"{prompt}, {base_enhancement}, {selected_lighting}, {selected_effect}, {selected_detail}, {aspect_enhancement}"
+
+        return enhanced_prompt
+
+    def create_fallback_image(self, prompt):
+        """Create a fallback image when model generation fails"""
+        from PIL import Image, ImageDraw, ImageFont
+
+        # Create a gradient background
+        width, height = self.target_size
+        image = Image.new('RGB', (width, height), color=(240, 240, 240))
+        draw = ImageDraw.Draw(image)
+
+        # Add a gradient
+        for y in range(height):
+            r = int(240 * (1 - y / height))
+            g = int(240 * (1 - y / height))
+            b = int(255 * (1 - y / height * 0.5))
+            for x in range(width):
+                draw.point((x, y), fill=(r, g, b))
+
+        # Add text
+        try:
+            # Try to use a nice font if available
+            font = ImageFont.truetype("Arial", 20)
+        except:
+            # Fallback to default font
+            font = ImageFont.load_default()
+
+        # Wrap text to fit width
+        words = prompt.split()
+        lines = []
+        current_line = []
+
+        for word in words:
+            test_line = ' '.join(current_line + [word])
+            # Estimate text width (approximate method)
+            if len(test_line) * 10 < width - 40:  # 10 pixels per character, 20 pixel margin on each side
+                current_line.append(word)
+            else:
+                lines.append(' '.join(current_line))
+                current_line = [word]
+
+        if current_line:
+            lines.append(' '.join(current_line))
+
+        # Draw text
+        y_position = height // 4
+        for line in lines[:8]:  # Limit to 8 lines
+            draw.text((20, y_position), line, fill=(0, 0, 0), font=font)
+            y_position += 30
+
+        # Save the image
+        output_path = f"temp/fallback_{int(time.time() * 1000)}.png"
+        image.save(output_path)
+
+        return output_path
+
+    def clear_cache(self):
+        """Clear the image cache"""
+        self.image_cache = {}
+        return True
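
Reviewer note: one caveat on the new enhance_prompt_for_aspect_ratio. Python's built-in hash() for strings is salted per interpreter process (PYTHONHASHSEED), so random.seed(hash(prompt)) is only deterministic within a single run; across app restarts the same prompt can pick different enhancements (within a run the new image cache already makes repeats moot). If cross-run stability ever matters, a sketch of a process-independent seed:

    import hashlib
    import random

    def stable_choice(options, prompt, salt=""):
        """Pick from options deterministically across processes, seeding from an MD5 digest."""
        seed = int(hashlib.md5((salt + prompt).encode("utf-8")).hexdigest(), 16)
        return random.Random(seed).choice(options)

    # e.g. selected_lighting = stable_choice(lighting_options, prompt, salt="lighting")

The stable_choice helper is hypothetical, not part of this commit; it just shows the digest-seeded alternative.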
transcriber.py CHANGED

@@ -12,6 +12,11 @@ class AudioTranscriber:
         self.model = None
         self.processor = None
         self.transcription_cache = {}
+        self.max_segment_duration = 5.0  # Maximum segment duration in seconds
+
+    def set_max_segment_duration(self, duration):
+        """Set the maximum duration for any segment in seconds"""
+        self.max_segment_duration = duration
 
     def load_model(self):
         """Load a lightweight transcription model"""
@@ -33,8 +38,8 @@
 
         return self.model
 
-    def segment_audio(self, audio_file, num_segments=5, min_segment_duration=3.0):
-        """Segment the audio file into chunks for processing"""
+    def segment_audio(self, audio_file, num_segments=5, min_segment_duration=1.0):
+        """Segment the audio file into chunks for processing with maximum duration limit"""
         # Save the uploaded audio to a temporary file
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
             tmp_file.write(audio_file.getvalue())
@@ -47,21 +52,25 @@
             # Get total duration
             duration = librosa.get_duration(y=y, sr=sr)
 
+            # Calculate ideal number of segments based on max_segment_duration
+            # We want to create enough segments so that each is <= max_segment_duration
+            ideal_segments = max(num_segments, int(duration / self.max_segment_duration) + 1)
+
             # Ensure we don't create segments that are too short
-            actual_segments = min(num_segments, int(duration / min_segment_duration))
-            if actual_segments < 1:
-                actual_segments = 1
+            actual_segments = max(ideal_segments, int(duration / min_segment_duration))
 
             # Calculate segment duration
-            segment_duration = duration / actual_segments
+            segment_duration = min(duration / actual_segments, self.max_segment_duration)
 
             # Create segments
             segments = []
             timestamps = []
 
-            for i in range(actual_segments):
-                start_time = i * segment_duration
-                end_time = min((i + 1) * segment_duration, duration)
+            # Create more segments to ensure each is under max_segment_duration
+            current_time = 0
+            while current_time < duration:
+                start_time = current_time
+                end_time = min(start_time + segment_duration, duration)
 
                 # Convert time to samples
                 start_sample = int(start_time * sr)
@@ -71,6 +80,8 @@
                 segment = y[start_sample:end_sample]
                 segments.append(segment)
                 timestamps.append((start_time, end_time))
+
+                current_time = end_time
 
             return segments, timestamps
 
@@ -82,21 +93,24 @@
                 y, sr = sf.read(audio_path)
                 duration = len(y) / sr
 
+                # Calculate ideal number of segments based on max_segment_duration
+                ideal_segments = max(num_segments, int(duration / self.max_segment_duration) + 1)
+
                 # Ensure we don't create segments that are too short
-                actual_segments = min(num_segments, int(duration / min_segment_duration))
-                if actual_segments < 1:
-                    actual_segments = 1
+                actual_segments = max(ideal_segments, int(duration / min_segment_duration))
 
                 # Calculate segment duration
-                segment_duration = duration / actual_segments
+                segment_duration = min(duration / actual_segments, self.max_segment_duration)
 
                 # Create segments
                 segments = []
                 timestamps = []
 
-                for i in range(actual_segments):
-                    start_time = i * segment_duration
-                    end_time = min((i + 1) * segment_duration, duration)
+                # Create more segments to ensure each is under max_segment_duration
+                current_time = 0
+                while current_time < duration:
+                    start_time = current_time
+                    end_time = min(start_time + segment_duration, duration)
 
                     # Convert time to samples
                     start_sample = int(start_time * sr)
@@ -106,6 +120,8 @@
                     segment = y[start_sample:end_sample]
                     segments.append(segment)
                    timestamps.append((start_time, end_time))
+
+                    current_time = end_time
 
                 return segments, timestamps
 
@@ -113,7 +129,7 @@
             st.error(f"Critical error in audio segmentation: {str(inner_e)}")
             # Last resort: Create dummy segments
             segments = [np.zeros(16000) for _ in range(num_segments)]  # 1-second silent segments
-            timestamps = [(i, i+1) for i in range(num_segments)]
+            timestamps = [(i, min(i+1, i+self.max_segment_duration)) for i in range(num_segments)]
             return segments, timestamps
         finally:
            # Clean up temporary file
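
Reviewer note: a worked example of the new segmentation arithmetic under the defaults in this diff (max_segment_duration=5.0, min_segment_duration=1.0; the 47-second duration is hypothetical):

    duration = 47.0      # seconds of audio
    num_segments = 5     # UI minimum

    ideal_segments = max(num_segments, int(duration / 5.0) + 1)   # max(5, 10) = 10
    actual_segments = max(ideal_segments, int(duration / 1.0))    # max(10, 47) = 47
    segment_duration = min(duration / actual_segments, 5.0)       # min(1.0, 5.0) = 1.0
    # The while-loop then walks 0 -> 47 s in 1.0 s steps, so every segment is <= 5 s.

Note that taking max(...) against duration / min_segment_duration drives the count all the way up to one segment per second, even though the comment above it says the opposite ("don't create segments that are too short"); whether that is intended may be worth a second look.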
video_creator.py CHANGED

@@ -12,11 +12,16 @@ class VideoCreator:
         os.makedirs("outputs", exist_ok=True)
         self.video_cache = {}
         self.aspect_ratio = "1:1"  # Default aspect ratio
+        self.max_segment_duration = 5.0  # Maximum duration for any segment in seconds
 
     def set_aspect_ratio(self, aspect_ratio):
         """Set the aspect ratio for video creation"""
         self.aspect_ratio = aspect_ratio
 
+    def set_max_segment_duration(self, duration):
+        """Set the maximum duration for any segment in seconds"""
+        self.max_segment_duration = duration
+
     def get_video_dimensions(self, base_size=None):
         """Get video dimensions based on aspect ratio"""
         if base_size is None:
@@ -62,6 +67,9 @@
     def create_segment_clip(self, frames, segment_duration, segment_text=None):
         """Create a video clip from frames with optional text overlay"""
         try:
+            # Limit segment duration to max_segment_duration
+            segment_duration = min(segment_duration, self.max_segment_duration)
+
             # Calculate frame duration based on segment duration
             frame_duration = segment_duration / len(frames)
 
@@ -128,7 +136,7 @@
         """Create a video from animated frames synchronized with audio using parallel processing"""
         # Generate a cache key based on inputs
         import hashlib
-        cache_key = f"{hashlib.md5(audio_file.getvalue()).hexdigest()}_{len(animated_frames)}_{self.aspect_ratio}"
+        cache_key = f"{hashlib.md5(audio_file.getvalue()).hexdigest()}_{len(animated_frames)}_{self.aspect_ratio}_{self.max_segment_duration}"
 
         # Check if result is in cache
         if cache_key in self.video_cache:
@@ -146,11 +154,11 @@
 
         # Calculate segment durations
         if timestamps:
-            # Use provided timestamps
-            segment_durations = [end - start for start, end in timestamps]
+            # Use provided timestamps but limit to max_segment_duration
+            segment_durations = [min(end - start, self.max_segment_duration) for start, end in timestamps]
         else:
-            # Distribute evenly
-            segment_durations = [total_duration / len(animated_frames)] * len(animated_frames)
+            # Distribute evenly but limit to max_segment_duration
+            segment_durations = [min(total_duration / len(animated_frames), self.max_segment_duration)] * len(animated_frames)
 
         # Create video clips for each animated segment
         video_clips = []
@@ -182,7 +190,7 @@
             # Fallback: Create a simple clip for each segment
             video_clips = []
             for i, _ in enumerate(animated_frames):
-                segment_duration = segment_durations[min(i, len(segment_durations)-1)]
+                segment_duration = min(segment_durations[min(i, len(segment_durations)-1)], self.max_segment_duration)
                 from moviepy.editor import ColorClip
                 clip = ColorClip(self.get_video_dimensions(), color=(0, 0, 0), duration=segment_duration)
                 video_clips.append(clip)
@@ -192,6 +200,9 @@
         final_clip = concatenate_videoclips(video_clips)
 
         # Set the audio
+        # If the video is shorter than the audio due to max_segment_duration,
+        # we need to trim the audio to match the video duration
+        audio_clip = audio_clip.subclip(0, min(final_clip.duration, audio_clip.duration))
        final_clip = final_clip.set_audio(audio_clip)
 
        # Get target dimensions based on aspect ratio
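
Reviewer note: a sketch of the duration-clamping behavior this diff introduces, using the same moviepy 1.x API the file already imports (subclip/set_audio); "audio.wav" and the clip sizes are hypothetical placeholders:

    from moviepy.editor import AudioFileClip, ColorClip, concatenate_videoclips

    # Each segment is clamped to max_segment_duration (5.0 s here)
    clips = [ColorClip((640, 360), color=(0, 0, 0), duration=min(d, 5.0)) for d in [7.2, 3.1, 6.0]]
    video = concatenate_videoclips(clips)          # 5.0 + 3.1 + 5.0 = 13.1 s of video

    audio = AudioFileClip("audio.wav")             # say 16.3 s long
    audio = audio.subclip(0, min(video.duration, audio.duration))  # trimmed to 13.1 s
    video = video.set_audio(audio)

One consequence worth flagging: clamping segments and then trimming the audio means any narration past the clamped video length is cut off rather than re-timed across extra images.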