topcoderkz committed on
Commit b1bee74 · 1 Parent(s): e4d57c9
Refactor code, remove deepseek integration

- .env.example +0 -1
- config/api_keys.yaml +0 -4
- requirements.txt +23 -0
- src/api_clients.py +80 -149
- src/asset_selector.py +122 -95
- src/automation.py +12 -88
- src/somira_video_library.csv +51 -0
- src/utils.py +0 -129
- src/video_renderer.py +442 -114
.env.example
CHANGED
@@ -1,7 +1,6 @@
 # API Keys
 GEMINI_API_KEY=your_gemini_api_key_here
 RUNWAYML_API_KEY=your_runwayml_api_key_here
-DEEPSEEK_API_KEY=your_deepseek_api_key_here
 GOOGLE_APPLICATION_CREDENTIALS=/path/to/your/service-account-key.json
 
 # Cloud Storage
config/api_keys.yaml
CHANGED
@@ -6,10 +6,6 @@ runwayml:
   base_url: "https://api.runwayml.com/v1"
   timeout: 300
 
-deepseek:
-  base_url: "https://api.deepseek.com/v1"
-  model: "deepseek-chat"
-
 tts:
   provider: "google"
   voice: "en-US-Neural2-F"
requirements.txt
CHANGED
@@ -3,8 +3,11 @@ aiohttp==3.9.5
 aiosignal==1.4.0
 annotated-types==0.7.0
 attrs==25.3.0
+audioop-lts==0.2.2
+audioread==3.0.1
 cachetools==5.5.2
 certifi==2025.8.3
+cffi==2.0.0
 charset-normalizer==3.4.3
 decorator==4.4.2
 frozenlist==1.7.0
@@ -26,19 +29,31 @@ httplib2==0.31.0
 idna==3.10
 imageio==2.37.0
 imageio-ffmpeg==0.6.0
+joblib==1.5.2
+lazy_loader==0.4
+librosa==0.11.0
+llvmlite==0.45.1
 moviepy==1.0.3
+msgpack==1.1.2
 multidict==6.6.4
+mutagen==1.47.0
+numba==0.62.1
 numpy==1.26.4
+packaging==25.0
 pandas==2.3.3
 pillow==11.3.0
+platformdirs==4.5.0
+pooch==1.8.2
 proglog==0.1.12
 propcache==0.4.0
 proto-plus==1.26.1
 protobuf==5.29.5
 pyasn1==0.6.1
 pyasn1_modules==0.4.2
+pycparser==2.23
 pydantic==2.11.10
 pydantic_core==2.33.2
+pydub==0.25.1
 pyparsing==3.2.5
 python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
@@ -46,7 +61,15 @@ pytz==2025.2
 PyYAML==6.0.3
 requests==2.32.5
 rsa==4.9.1
+scikit-learn==1.7.2
+scipy==1.16.2
 six==1.17.0
+soundfile==0.13.1
+soxr==1.0.0
+standard-aifc==3.13.0
+standard-chunk==3.13.0
+standard-sunau==3.13.0
+threadpoolctl==3.6.0
 tqdm==4.67.1
 typing-inspection==0.4.2
 typing_extensions==4.15.0
src/api_clients.py
CHANGED
@@ -17,7 +17,7 @@ class APIClients:
 
         # Initialize Gemini client
         self.gemini_client = genai
-        genai.configure(api_key=config.get('gemini_api_key')
+        genai.configure(api_key=config.get('gemini_api_key'))
 
         # Initialize GCS client
         self.gcs_client = storage.Client()
@@ -41,7 +41,7 @@ class APIClients:
         Enhanced prompt optimized for video generation
         """
         try:
-            logger.info(f"Enhancing prompt with Gemini: {prompt[:
+            logger.info(f"Enhancing prompt with Gemini: {prompt[:300]}...")
 
             enhancement_instruction = f"""
             You are a prompt enhancement specialist for video generation AI.
@@ -61,7 +61,7 @@ class APIClients:
             response = model.generate_content(enhancement_instruction)
 
             enhanced_prompt = response.text.strip()
-            logger.info(f"Enhanced prompt: {enhanced_prompt[:
+            logger.info(f"Enhanced prompt: {enhanced_prompt[:300]}...")
             return enhanced_prompt
 
         except Exception as e:
@@ -71,70 +71,93 @@ class APIClients:
 
     async def generate_video(self, prompt: str, duration: int = 10) -> Dict:
         """
-        Generate video using RunwayML
+        Generate video using RunwayML API
         """
         try:
-            logger.info(f"Generating video with RunwayML: {prompt[:
+            logger.info(f"Generating video with RunwayML: {prompt[:1000]}...")
 
             headers = {
                 "Authorization": f"Bearer {self.runway_api_key}",
                 "Content-Type": "application/json",
-                "X-Runway-Version": "
+                "X-Runway-Version": "2024-11-06"
             }
 
             payload = {
-                "promptText": prompt,
-                "model": "
+                "promptText": prompt[:1000],
+                "model": "veo3",
                 "duration": duration,
-                "ratio": "
-                "
+                "ratio": "1280:720",  # Standard HD ratio
+                # "seed": 42  # Optional: for reproducibility
             }
 
             async with aiohttp.ClientSession() as session:
-                # Create generation task
+                # Create video generation task
                 async with session.post(
-                    
+                    "https://api.dev.runwayml.com/v1/text_to_video",
                     headers=headers,
                     json=payload
                 ) as response:
                     if response.status != 200:
                         error_text = await response.text()
-                        raise Exception(f"RunwayML API error: {error_text}")
+                        raise Exception(f"RunwayML API error ({response.status}): {error_text}")
 
                     task_data = await response.json()
                     task_id = task_data['id']
                     logger.info(f"Video generation task created: {task_id}")
 
                     # Poll for completion
-                    max_attempts = 
+                    max_attempts = 120  # 20 minutes max (video generation can take time)
                     attempt = 0
 
                     while attempt < max_attempts:
-                        await asyncio.sleep(
+                        await asyncio.sleep(10)  # Check every 10 seconds
 
                         async with session.get(
-                            f"
+                            f"https://api.dev.runwayml.com/v1/tasks/{task_id}",  # Correct tasks endpoint
                             headers=headers
                         ) as status_response:
+                            if status_response.status != 200:
+                                error_text = await status_response.text()
+                                raise Exception(f"Task status check failed: {error_text}")
+
                            status_data = await status_response.json()
                            status = status_data['status']
 
                            if status == 'SUCCEEDED':
-                                
+                                video_urls = status_data['output']  # Returns array of URLs
+                                video_url = video_urls[0] if video_urls else None
+
+                                if not video_url:
+                                    raise Exception("No video URL in successful response")
+
                                 logger.info(f"Video generated successfully: {video_url}")
                                 return {
                                     'video_url': video_url,
                                     'task_id': task_id,
                                     'duration': duration,
-                                    'prompt': prompt
+                                    'prompt': prompt,
+                                    'status': status,
+                                    'created_at': status_data.get('createdAt')
                                 }
+
                            elif status == 'FAILED':
-                                
+                                failure_msg = status_data.get('failure', 'Unknown error')
+                                failure_code = status_data.get('failureCode', 'UNKNOWN')
+                                raise Exception(f"Video generation failed: {failure_msg} (Code: {failure_code})")
+                            elif status == 'THROTTLED':
+                                logger.warning("Video generation throttled, retrying...")
+                            elif status == 'PENDING':
+                                logger.info("Video generation pending...")
+                            elif status == 'RUNNING':
+                                # Still processing
+                                progress = status_data.get('progress', 0)
+                                logger.info(f"Video generation {status.lower()}: {progress*100:.1f}% complete")
+                            else:
+                                logger.warning(f"Unknown status: {status}")
 
                            attempt += 1
-                            logger.info(f"Video generation in progress... ({status})")
 
-            raise Exception("Video generation timeout")
+            raise Exception(f"Video generation timeout after {max_attempts * 5} seconds")
 
         except Exception as e:
             logger.error(f"Error generating video with RunwayML: {e}")
@@ -145,7 +168,7 @@ class APIClients:
         Generate TTS audio using Google Cloud TTS
         """
         try:
-            logger.info(f"Generating TTS for text: {text[:
+            logger.info(f"Generating TTS for text: {text[:300]}...")
 
             if not voice_name:
                 voice_name = self.config.get('default_voice', 'en-US-Neural2-F')
@@ -168,12 +191,10 @@ class APIClients:
                 pitch=0.0
             )
 
-            # Remove TimePointingType as it's not available in this version
             response = self.tts_client.synthesize_speech(
                 input=synthesis_input,
                 voice=voice,
                 audio_config=audio_config
-                # Remove: enable_time_pointing=[texttospeech.TimePointingType.SSML_MARK]
             )
 
             # Save audio to temporary file
@@ -184,18 +205,37 @@ class APIClients:
             with open(audio_path, "wb") as out:
                 out.write(response.audio_content)
 
+            # Get actual audio duration using mutagen or pydub
+            try:
+                from mutagen.mp3 import MP3
+                audio = MP3(audio_path)
+                duration = audio.info.length
+                logger.info(f"Audio duration: {duration:.2f}s")
+            except ImportError:
+                # Fallback: use pydub if mutagen not available
+                try:
+                    from pydub import AudioSegment
+                    audio = AudioSegment.from_mp3(audio_path)
+                    duration = len(audio) / 1000.0  # Convert milliseconds to seconds
+                    logger.info(f"Audio duration: {duration:.2f}s (via pydub)")
+                except ImportError:
+                    # Last resort: estimate based on text length
+                    # Average speaking rate: ~150 words per minute
+                    word_count = len(text.split())
+                    duration = (word_count / 150) * 60
+                    logger.warning(f"⚠️ Could not determine exact duration, estimating: {duration:.2f}s")
+
             # Upload to GCS
             audio_url = await self.store_in_gcs(audio_path, 'audio')
 
-            # Remove lip sync data extraction
             logger.info(f"TTS generated successfully: {audio_url}")
 
             return {
                 'audio_url': audio_url,
-                'duration': 
+                'duration': duration,
                 'voice': voice_name,
                 'text': text,
-                'local_path': audio_path
+                'local_path': audio_path
             }
 
         except Exception as e:
@@ -279,91 +319,23 @@ class APIClients:
         else:
             logger.error(" ❌ TTS API: Not configured")
 
-        
-        deepseek_key = self.config.get('deepseek_api_key')
-        if deepseek_key and len(deepseek_key) > 10:
-            logger.info(" ✅ DeepSeek API: Configured")
-        else:
-            logger.warning(" ⚠️ DeepSeek API: Not configured")
-
+
         all_healthy = all(health.values())
         status = "✅ All systems operational!" if all_healthy else "⚠️ Some services have issues"
         logger.info(f"\n{status}")
 
         return health
 
-    async def select_videos(self, tts_script: str, count: int = 3) -> List[Dict]:
-        """
-        AI agent selects videos based on script using Gemini
-
-        Args:
-            tts_script: The TTS script to analyze
-            count: Number of videos to select (max 3)
-
-        Returns:
-            List of selected video metadata
-        """
-        try:
-            logger.info(f"Selecting {count} videos for script...")
-
-            # Use Gemini to analyze script and suggest video keywords
-            analysis_prompt = f"""
-            Analyze this product advertisement script and identify {count} key visual moments
-            that should be represented with video clips. For each moment, provide:
-            1. A descriptive keyword/phrase
-            2. The timing (start-end seconds if mentioned)
-            3. Visual style preference (product closeup, lifestyle, abstract, etc.)
-
-            Script: {tts_script}
-
-            Return as JSON array with format:
-            [{{"keyword": "...", "timing": "0-5", "style": "..."}}, ...]
-            """
-            model = genai.GenerativeModel('gemini-2.0-flash-exp')
-            response = model.generate_content(analysis_prompt)
-
-            # Parse Gemini response
-            try:
-                suggestions = json.loads(response.text.strip())
-            except:
-                # Fallback to keyword extraction
-                keywords = self._extract_keywords(tts_script)
-                suggestions = [
-                    {"keyword": kw, "timing": f"{i*5}-{(i+1)*5}", "style": "general"}
-                    for i, kw in enumerate(keywords[:count])
-                ]
-
-            # Select videos from library based on suggestions
-            selected_videos = []
-            for i, suggestion in enumerate(suggestions[:count]):
-                video_id = (hash(suggestion['keyword']) + i) % self.config['video_library_size'] + 1
-                selected_videos.append({
-                    'id': video_id,
-                    'url': f"gs://{self.config['gcs_bucket_name']}/library/video{video_id}.mp4",
-                    'keyword': suggestion['keyword'],
-                    'timing': suggestion.get('timing', f"{i*5}-{(i+1)*5}"),
-                    'style': suggestion.get('style', 'general'),
-                    'reason': f"Matches: {suggestion['keyword']}"
-                })
-
-            logger.info(f"Selected {len(selected_videos)} videos")
-            return selected_videos
-
-        except Exception as e:
-            logger.error(f"Error selecting videos: {e}")
-            # Fallback selection
-            return self._fallback_video_selection(tts_script, count)
-
     async def store_in_gcs(self, file_path: str, content_type: str = 'video') -> str:
         """
-        Store file in Google Cloud Storage
+        Store file in Google Cloud Storage with signed URL
 
         Args:
             file_path: Local file path
            content_type: Type of content ('video', 'audio', etc.)
 
         Returns:
-            
+            Signed URL with temporary access
         """
         try:
             logger.info(f"Storing file in GCS: {file_path}")
@@ -386,59 +358,18 @@ class APIClients:
             # Upload file
             blob.upload_from_filename(file_path)
 
-            # 
-            
+            # Generate signed URL (valid for 7 days)
+            from datetime import timedelta
+            signed_url = blob.generate_signed_url(
+                version="v4",
+                expiration=timedelta(days=7),
+                method="GET"
+            )
 
-            
-            logger.info(f"File uploaded to: {gcs_url}")
+            logger.info(f"File uploaded with signed URL: {signed_url[:100]}...")
 
-            return 
+            return signed_url
 
         except Exception as e:
             logger.error(f"Error storing file in GCS: {e}")
             raise
-
-    def _extract_keywords(self, text: str) -> List[str]:
-        """Extract keywords from TTS script"""
-        text_lower = text.lower()
-        keywords = []
-
-        key_phrases = [
-            'somira massager', 'neck pain', 'product', 'massager',
-            'solution', 'comfort', 'using the product', 'relaxation',
-            'relief', 'wellness', 'ergonomic', 'design'
-        ]
-
-        for phrase in key_phrases:
-            if phrase in text_lower:
-                keywords.append(phrase)
-
-        return keywords if keywords else ['general', 'product', 'lifestyle']
-
-    def _extract_timing_data(self, tts_response) -> Dict:
-        """Extract timing data from TTS response for lip sync"""
-        # This would parse the timepoints from Azure TTS response
-        # Simplified version
-        return {
-            'timestamps': [],
-            'phonemes': [],
-            'words': []
-        }
-
-    def _fallback_video_selection(self, text: str, count: int) -> List[Dict]:
-        """Fallback video selection if AI selection fails"""
-        keywords = self._extract_keywords(text)
-        selected_videos = []
-
-        for i in range(min(count, 3)):
-            video_id = (hash(text) + i) % self.config['video_library_size'] + 1
-            selected_videos.append({
-                'id': video_id,
-                'url': f"gs://{self.config['gcs_bucket_name']}/library/video{video_id}.mp4",
-                'keyword': keywords[i % len(keywords)] if keywords else "general",
-                'timing': f"{i*5}-{(i+1)*5}",
-                'style': 'general',
-                'reason': f'Fallback selection for: {keywords[i % len(keywords)] if keywords else "general"}'
-            })
-
-        return selected_videos
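
Note on the TTS change above: the new duration-detection chain (mutagen first, then pydub, then a words-per-minute estimate) is what the mutagen and pydub entries added to requirements.txt support. A standalone sketch of the same fallback logic, independent of the APIClients class — the file path and script text below are placeholders:

    def audio_duration_seconds(audio_path: str, text: str) -> float:
        # Try mutagen first: it reads the MP3 header without decoding the audio.
        try:
            from mutagen.mp3 import MP3
            return MP3(audio_path).info.length
        except Exception:
            pass
        # Fall back to pydub, which decodes via ffmpeg and reports length in milliseconds.
        try:
            from pydub import AudioSegment
            return len(AudioSegment.from_mp3(audio_path)) / 1000.0
        except Exception:
            # Last resort: estimate from the script at roughly 150 words per minute.
            return (len(text.split()) / 150) * 60

    print(audio_duration_seconds("/tmp/tts_output.mp3", "Meet the Somira massager."))
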
src/asset_selector.py
CHANGED
@@ -1,11 +1,11 @@
-"""
-AI-powered asset selection using DeepSeek for contextual video matching
-"""
 import pandas as pd
 import aiohttp
 import json
 from typing import List, Dict, Optional
 from utils import logger
+import google.generativeai as genai
+import os
+import re
 
 
 class AssetSelector:
@@ -15,39 +15,58 @@ class AssetSelector:
         self.audio_library = self._load_audio_library()
 
     def _load_video_library(self) -> pd.DataFrame:
-        """Load video library from CSV
+        """Load video library from specific CSV file"""
         try:
-            # 
-            # Add more videos as needed for testing
-            ]
-
-            return pd.DataFrame(video_data)
+            # Use path relative to this file
+            current_dir = os.path.dirname(os.path.abspath(__file__))
+            csv_filename = os.path.join(current_dir, "somira_video_library.csv")
+
+            if not os.path.exists(csv_filename):
+                logger.error(f"CSV file not found: {csv_filename}")
+                return pd.DataFrame()
+
+            # Load the CSV
+            df = pd.read_csv(csv_filename)
+
+            # Parse energy scores if the column exists
+            if 'Energy Score (0-100)' in df.columns:
+                df['energy_score'] = df['Energy Score (0-100)'].apply(self._parse_energy_score)
+
+            # Parse durations - convert to integers
+            if 'Duration' in df.columns:
+                df['duration'] = df['Duration'].apply(self._parse_duration)
+            elif 'duration' in df.columns:
+                df['duration'] = df['duration'].apply(self._parse_duration)
+
+            logger.info(f"Successfully loaded video library with {len(df)} entries")
+            return df
 
         except Exception as e:
-            logger.error(f"Failed to load video library: {e}")
+            logger.error(f"Failed to load video library from CSV: {e}")
             return pd.DataFrame()
+
+    def _parse_duration(self, duration_str: str) -> int:
+        """
+        Parse duration from various string formats to integer seconds.
+        Handles formats like: "2 seconds", "3 seconds", "1.5 seconds", "2s", etc.
+        """
+        try:
+            if pd.isna(duration_str) or duration_str == "":
+                return 0
+
+            # Convert to string and lowercase
+            duration_str = str(duration_str).lower().strip()
+
+            # Extract numbers - handle decimals too
+            numbers = re.findall(r'(\d+\.?\d*)', duration_str)
+            if numbers:
+                # Convert to float first to handle decimals, then round to int
+                return int(float(numbers[0]))
+
+            return 0
+        except (ValueError, TypeError) as e:
+            logger.warning(f"Failed to parse duration '{duration_str}': {e}")
+            return 0
 
     def _load_audio_library(self) -> List[str]:
         """Load audio library URLs"""
@@ -65,16 +84,21 @@ class AssetSelector:
         List of selected video metadata
         """
         try:
-            logger.info(f"🤖 AI video selection for script: {tts_script[:
+            logger.info(f"🤖 AI video selection for script: {tts_script[:300]}...")
 
-            # Use 
-            selected_videos = await self.
+            # Use Gemini for intelligent selection
+            selected_videos = await self._analyze_with_gemini(tts_script, max_duration)
 
             if not selected_videos:
                 logger.warning("⚠️ AI selection failed, using fallback")
                 selected_videos = self._fallback_selection(tts_script, max_duration)
 
-            
+            # Ensure all durations are integers before summing
+            for video in selected_videos:
+                if isinstance(video.get('duration'), str):
+                    video['duration'] = self._parse_duration(video['duration'])
+
+            total_duration = sum(int(v.get('duration', 0)) for v in selected_videos)
             logger.info(f"✅ Selected {len(selected_videos)} videos, total: {total_duration}s")
 
             return selected_videos
@@ -83,12 +107,36 @@ class AssetSelector:
             logger.error(f"❌ Video selection failed: {e}")
             return self._fallback_selection(tts_script, max_duration)
 
-    
-        """
+    def _parse_energy_score(self, energy_score_str: str) -> int:
+        """
+        Parse energy score from string format to integer.
+        Handles formats like: "5 out of 100", "35 out of 100", "40 out of 100"
+        """
+        try:
+            if pd.isna(energy_score_str) or energy_score_str == "":
+                return 0
+
+            # Extract the first number from strings like "5 out of 100"
+            match = re.search(r'(\d+)\s*out of\s*\d+', str(energy_score_str))
+            if match:
+                return int(match.group(1))
+
+            # Try to extract just a number if no "out of" pattern
+            numbers = re.findall(r'\d+', str(energy_score_str))
+            if numbers:
+                return int(numbers[0])
+
+            return 0
+        except (ValueError, TypeError) as e:
+            logger.warning(f"Failed to parse energy score '{energy_score_str}': {e}")
+            return 0
+
+    async def _analyze_with_gemini(self, tts_script: str, max_duration: int) -> List[Dict]:
+        """Use Gemini API for contextual video selection"""
         try:
             # Prepare video library context
             video_context = "\n".join([
-                f"{i}. {row
+                f"{i}. {row.get('Full Video Description Summary', row.get('description', ''))} - {row.get('duration', 0)}s - Alignment: {row.get('Video Alignment with the TTS Script', row.get('alignment', ''))}"
                 for i, row in self.video_library.iterrows()
             ])
@@ -104,7 +152,7 @@ class AssetSelector:
             - Total duration under {max_duration} seconds
             - Energy level appropriateness
 
-            Return JSON format:
+            Return ONLY valid JSON in this exact format (no markdown, no extra text):
             {{
                 "selected_videos": [
                     {{
@@ -118,52 +166,44 @@ class AssetSelector:
                 }}
             """
 
-            # 
-                "Content-Type": "application/json"
-            }
+            # Gemini API call
+            model = genai.GenerativeModel('gemini-2.0-flash-exp')
+            response = model.generate_content(prompt)
 
-                "messages": [
-                    {"role": "system", "content": "You are a video editor AI that selects the most relevant videos for advertising content."},
-                    {"role": "user", "content": prompt}
-                ],
-                "temperature": 0.3,
-                "max_tokens": 2000
-            }
+            # Extract and parse JSON response
+            response_text = response.text.strip()
 
-            else:
-                logger.error(f"DeepSeek API error: {response.status}")
-                return []
+            # Remove markdown code blocks if present
+            if response_text.startswith('```'):
+                response_text = response_text.split('```')[1]
+                if response_text.startswith('json'):
+                    response_text = response_text[4:]
+                response_text = response_text.strip()
+
+            selection = json.loads(response_text)
+
+            # Map to actual video data
+            selected = []
+            for item in selection['selected_videos']:
+                if item['index'] < len(self.video_library):
+                    video = self.video_library.iloc[item['index']]
+                    selected.append({
+                        'url': video.get('Video URL (No Audio)', video.get('url', '')),
+                        'duration': video.get('duration', 0),
+                        'reason': item['reason'],
+                        'alignment': video.get('Video Alignment with the TTS Script', video.get('alignment', '')),
+                        'energy': video.get('energy_score', 0)
+                    })
+
+            logger.info(f"✅ Gemini selected {len(selected)} videos: {selection.get('rationale', '')}")
+            return selected
 
+        except json.JSONDecodeError as e:
+            logger.error(f"Failed to parse Gemini JSON response: {e}")
+            logger.debug(f"Raw response: {response_text[:500]}")
+            return []
         except Exception as e:
-            logger.error(f"
+            logger.error(f"Gemini analysis failed: {e}")
             return []
 
     def _fallback_selection(self, tts_script: str, max_duration: int) -> List[Dict]:
@@ -212,19 +252,6 @@ class AssetSelector:
 
         return selected[:3]  # Max 3 videos
 
-    def _find_video_for_category(self, category: str) -> Optional[Dict]:
-        """Find best video for a category"""
-        for _, row in self.video_library.iterrows():
-            if category in str(row['alignment']).lower():
-                return {
-                    'url': row['url'],
-                    'duration': row['duration'],
-                    'reason': f"Matches {category} category",
-                    'alignment': row['alignment'],
-                    'energy': row['energy']
-                }
-        return None
-
     def select_background_music(self) -> str:
         """Select background music using round-robin"""
        import random
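
The two parsing helpers introduced above are plain regex extractions over the CSV's free-text fields. A standalone sketch of the same approach, with illustrative inputs matching the formats in somira_video_library.csv:

    import re

    def parse_duration(value: str) -> int:
        # "2 seconds", "1.5 seconds", "2s" -> integer seconds (decimals truncated toward zero)
        numbers = re.findall(r'(\d+\.?\d*)', str(value).lower().strip())
        return int(float(numbers[0])) if numbers else 0

    def parse_energy_score(value: str) -> int:
        # "35 out of 100" -> 35; otherwise fall back to the first bare number
        match = re.search(r'(\d+)\s*out of\s*\d+', str(value))
        if match:
            return int(match.group(1))
        numbers = re.findall(r'\d+', str(value))
        return int(numbers[0]) if numbers else 0

    print(parse_duration("1.5 seconds"))        # 1
    print(parse_energy_score("35 out of 100"))  # 35
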
src/automation.py
CHANGED
@@ -114,61 +114,6 @@ class ContentAutomation:
             logger.error(f"π Debug: {traceback.format_exc()}")
             return False
 
-    async def local_test(self):
-        """Run a local test without external APIs"""
-        logger.info("🧪 Running local functionality test...")
-
-        try:
-            # Test 1: Check if we can create basic video clips
-            logger.info("1. Testing video clip creation...")
-            from moviepy.editor import ColorClip
-            test_clip = ColorClip(size=(100, 100), color=(255, 0, 0), duration=1)
-            test_clip = test_clip.set_fps(24)  # Add FPS
-            test_clip.write_videofile('/tmp/test_color.mp4', verbose=False, logger=None)
-            test_clip.close()
-            logger.info(" ✅ Video clip creation: OK")
-
-            # Test 2: Check if we can create audio clips
-            logger.info("2. Testing audio clip creation...")
-            from moviepy.editor import AudioClip
-            import numpy as np
-
-            def make_tone(duration):
-                return lambda t: 0.1 * np.sin(440 * 2 * np.pi * t)
-
-            test_audio = AudioClip(make_tone(1), duration=1)
-            test_audio.write_audiofile('/tmp/test_audio.mp3', verbose=False, logger=None)
-            test_audio.close()
-            logger.info(" ✅ Audio clip creation: OK")
-
-            # Test 3: Check video rendering with simple assets
-            logger.info("3. Testing video rendering pipeline...")
-            test_assets = {
-                'selected_videos': [
-                    {
-                        'local_path': '/tmp/test_color.mp4',
-                        'duration': 1,
-                        'reason': 'Test video'
-                    }
-                ],
-                'tts_audio': {
-                    'local_path': '/tmp/test_audio.mp3',
-                    'duration': 1
-                },
-                'tts_script': 'Test script.',
-                'background_music_local': '/tmp/test_audio.mp3'
-            }
-
-            output_path = await self.video_renderer.render_video(test_assets)
-            logger.info(f" ✅ Video rendering: OK - {output_path}")
-
-            logger.info("\nπ Local functionality test passed!")
-            return True
-
-        except Exception as e:
-            logger.error(f"❌ Local test failed: {e}")
-            return False
-
     async def execute_pipeline(self, content_strategy: Dict[str, str], tts_script: str) -> Dict[str, Any]:
         """
         Execute complete production video pipeline with better error handling
@@ -182,8 +127,8 @@ class ContentAutomation:
             assets = await self._generate_assets_parallel(content_strategy, tts_script)
 
             # Check if we have minimum required assets
-            if not assets.get('selected_videos') or not assets.get('tts_audio'):
-                raise ValueError("Missing critical assets: videos or TTS audio")
+            if not assets.get('selected_videos') or not assets.get('tts_audio') or not assets.get('hook_video'):
+                raise ValueError("Missing critical assets: hook video or library videos or TTS audio")
 
             # Step 2: Download all remote assets
             logger.info("\n⬇️ STEP 2: Downloading Remote Assets")
@@ -261,8 +206,17 @@ class ContentAutomation:
             # Generate video
             video_data = await self.api_clients.generate_video(
                 enhanced_prompt,
-                duration=
+                duration=8
             )
+            # TODO: Mocking video generation for now
+            # return {
+            #     'video_url': 'https://dnznrvs05pmza.cloudfront.net/veo3/projects/vertex-ai-claude-431722/locations/us-central1/publishers/google/models/veo-3.0-generate-001/operations/12d22a72-16b2-4767-a9f4-edc8589bb199/A_slow__deliberate_dolly_in_shot_focuses_on_a_blonde_woman_in_her_early_30s__positioned_within_the_p.mp4?_jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJrZXlIYXNoIjoiYTJkMzQzOTlkZDM3YWU1ZCIsImJ1Y2tldCI6InJ1bndheS10YXNrLWFydGlmYWN0cyIsInN0YWdlIjoicHJvZCIsImV4cCI6MTc2MDE0MDgwMH0.pWG8lw7CE4No8VeRTxPuHSHin6sANds6ScnmoCydzmA',
+            #     'task_id': '0328498f-7ea8-46a5-9c6d-f997770abeb6',
+            #     'duration': 8,
+            #     'prompt': prompt,
+            #     'status': 'SUCCEEDED',
+            #     'created_at': '2025-10-08T20:52:09.879Z',
+            # }
 
             return video_data
 
@@ -376,33 +330,3 @@ class ContentAutomation:
             print("❌ System has significant issues that need attention")
 
         return health_status
-
-    async def basic_test(self):
-        """Basic test without external APIs"""
-        logger.info("🧪 Running basic pipeline test...")
-
-        # Use local test assets
-        test_assets = {
-            'selected_videos': [
-                {
-                    'url': 'https://example.com/video1.mp4',
-                    'duration': 2,
-                    'reason': 'Test video 1',
-                    'local_path': '/tmp/test_video1.mp4'  # You'd need to create this
-                }
-            ],
-            'tts_audio': {
-                'local_path': '/tmp/test_audio.mp3',  # You'd need to create this
-                'duration': 10
-            },
-            'background_music_local': '/tmp/test_music.mp3',
-            'tts_script': 'Test script for video generation.'
-        }
-
-        try:
-            final_video_path = await self.video_renderer.render_video(test_assets)
-            logger.info(f"✅ Basic test passed: {final_video_path}")
-            return True
-        except Exception as e:
-            logger.error(f"❌ Basic test failed: {e}")
-            return False
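
For orientation, a minimal sketch of how the trimmed-down pipeline might be driven. The ContentAutomation constructor arguments and the content_strategy keys are not shown in this diff, so both are assumptions here:

    import asyncio
    from automation import ContentAutomation  # assumes the script is run from src/

    async def main():
        automation = ContentAutomation()  # assumed no-argument construction
        result = await automation.execute_pipeline(
            content_strategy={"product": "Somira Massager", "angle": "neck pain relief"},  # illustrative keys
            tts_script="Tired of neck pain? Meet the Somira massager.",
        )
        print(result)

    asyncio.run(main())
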
src/somira_video_library.csv
ADDED
@@ -0,0 +1,51 @@
Video URL (No Audio),Full Video Description Summary,Duration,Video Alignment with the TTS Script,Product Visibility,Energy Score (0-100)
https://storage.googleapis.com/somira/Somira%20Massager.mp4,"The video clearly displays the Somira Massager product from a front-view perspective against a professional white and light beige background. It features a static image with a slow zoom-in effect. The video contains no people, no significant action, and minimal visual excitement. Its primary purpose is to quickly and simply show the Somira Massager product itself. Therefore, this video is ideal for use whenever the script explicitly mentions terms like ""Somira Massager,"" ""the product,"" ""massage product,"" or any indirect or direct references to this specific product. The focal point of the image is centered, and the product image appears slightly black-and-white for a professional appearance. ",2 seconds,"This video can be used when the TTS script explicitly mentions words or phrases such as ""Somira Massager,"" ""product,"" or ""massager."" Additionally, the video fits well when the script discusses a solution to a problem, highlights product features, or indirectly refers to the product without naming it explicitly. This video is versatile and serves as a reliable fallback option whenever clear product imagery is required.",The product is fully visible. ,"5 out of 100.
This video is calm and only contains a slight zoom with little motion. "
https://storage.googleapis.com/somira/FemaleWomenPuttingOnNeckMassagerr.mp4,"The video shows a blonde-haired woman seated upright on a couch at home, directly facing the camera at eye level, in a brightly lit setting with white and light beige colors. During the first 5 seconds, she calmly lifts the Somira neck massager over her head, places it around her neck, and inserts her arms into its built-in armrests. In the final 3 seconds, she clearly presses the power button on the massager, starting the massage function. The camera remains completely still throughout, with no movement or distraction, ensuring a consistent focus on the woman and the product. Her facial expression appears comfortable and satisfied, resembling a happy customer. The video is instructional, clean, and clearly emphasizes product usage.",2 seconds,"Use this video when the TTS script explicitly mentions actions like ""using the product,"" ""turning on the massager,"" or references to ""easy product operation."" It is also suitable whenever the script discusses product comfort, ease-of-use, user satisfaction, or demonstrates how the Somira neck massager works. Additionally, this video aligns well when highlighting a positive customer experience, comfort benefits, or general product functionality.",The product is visible. ,"35 out of 100.
The camera does not move, but the women performs movement with her upper body. "
https://storage.googleapis.com/somira/PersonUsingTheMassagerProductt.mp4,"The video shows a person seated upright and directly facing the camera at eye level, already wearing the Somira neck massager around their neck. The massager is actively running throughout the entire 6-second duration, clearly demonstrating its massage functionality. The person remains mostly still with minimal movement, visibly enjoying the massage. The video does not display the heat feature. Due to limited action and engagement, it's recommended to trim the video length to approximately the first 1β3 seconds for optimal use. This clip effectively demonstrates normal usage and user satisfaction with the product.",2 seconds,"Use this video when the TTS script refers explicitly or implicitly to ""massager usage,"" ""relaxation,"" ""comfort,"" ""customer satisfaction,"" or ""product in action."" It is suitable for many scenarios, including general product benefits, demonstrating product effectiveness, showcasing user enjoyment, or emphasizing the massage functionality. Avoid this clip when specifically discussing or emphasizing the heat feature, as it is not displayed here. Due to its versatility, this video can act as a reliable fallback for showcasing the product in use during many scenarios. ",The product is visible. ,"20 out of 100.
The camera does not move, but the women wears the activated massager slightly moves her upper body. "
https://storage.googleapis.com/somira/MassagerMassageAndHeatFeatureWhiteBackgroundd.mp4,"The video provides a close-up view of the Somira Massager, clearly showing it powered on with both the massage function and the red-light heat therapy feature activated. It gradually moves closer towards the massage knots to highlight key product details. The original background has been digitally replaced with plain white to prominently showcase the massager without distraction. This handheld mobile shot is smooth but engaging, focusing exclusively on the product with no visible people. The video should be trimmed in length for optimal engagement. It is ideal as a general or fallback clip for clearly demonstrating product features.",2 seconds,"Use this video when the TTS script explicitly mentions or implies ""massage knots,"" ""heat therapy,"" ""red-light,"" ""product features,"" ""massager functionality,"" or ""product close-up."" Additionally, it's suitable when discussing detailed product benefits, highlighting massage effectiveness, or referring generally to the Somira Massager. Due to its clarity, versatility, and focus on specific product functions, it also works effectively as a fallback or filler video in many scenarios.",The product is partially visible. ,"40 out of 100.
The camera moves and the massager is activated and moving."
https://storage.googleapis.com/somira/PersonWearingSomiraMassagerr.mp4,"A blonde-haired woman is standing upright against a soft white and light beige wall. She is already wearing the Somira neck massager on her neck. The scene is lit with bright, professional lighting and framed with a static, eye-level camera. Throughout the clip, the woman slowly rotates her upper body left and right in a smooth, calm motion to present the massager from multiple angles. Her facial expression is relaxed and content, reinforcing a feeling of comfort and product satisfaction. The setting is minimal and distraction-free, maintaining focus on the product and its wearability. No additional movement occurs. For better viewer engagement, trimming the video is recommended.",2 seconds,"Use this video when the TTS script explicitly mentions the design, comfort, style, or wearability of the Somira neck massager. It is particularly suitable for emphasizing themes like relaxation, comfort while wearing, product aesthetics, or phrases highlighting a calming experience. Additionally, use this video to visually reinforce narratives related to user satisfaction, ease of wear, or ergonomic design, as the gentle rotations and relaxed expressions effectively communicate these aspects to the viewer.",The product is visible. ,35 out of 100.
https://storage.googleapis.com/somira/PersonEnjoyingTheNeckMassager.mp4,"This selfie video depicts a person seated comfortably on a couch in a relaxed home environment, positioned centrally in frame. The individual leans gently backward, resting their head and neck fully onto the Somira neck massager, which is draped securely behind the neck and over the shoulders in the standard usage position. Their eyes are closed, and the expression is visibly calm, serene, and deeply contented, clearly conveying comfort and satisfaction derived from the massagerβs relaxing effects. The clip is very short with only 1.5 seconds in length. The neutral background and clear framing emphasize the personβs peaceful facial expression and the immediate benefits of using the Somira massager.",1.5 seconds,"Use this video clip whenever the TTS script emphasizes comfort, relaxation, immediate relief from neck pain, tension relief, or product satisfaction. Itβs especially suitable for phrases like ""experience soothing relief,"" ""relax your neck muscles,"" ""comfortable massage,"" or when highlighting general satisfaction and wellness from using the Somira massager. Its brief, highly focused nature makes it a versatile visual insert when quickly underscoring the relaxing benefits of the product.",The product is visible. ,40 out of 100.
https://storage.googleapis.com/somira/PersonCuriouslyEnjoyingNeckMassager.mp4,"This selfie video shows a person comfortably seated on a couch in a calm, homely setting, with the Somira neck massager positioned securely behind their neck and over their shoulders in the standard usage position. The individual's facial expression is notably curious and slightly skeptical, with eyes partially open and an expression suggesting a combination of mild discomfort and pleasure. The nuanced expression effectively communicates the sensation of deep-tissue massage, capturing the essence of therapeutic pressure in targeting muscle knots. The brief 1.5-second clip highlights a genuine reaction, reflecting an authentic ""hurts-so-good"" experience typical of effective, targeted massage therapy.",1.5 seconds,"Use this clip specifically when the TTS script references therapeutic massage effects, deep-tissue relief, or phrases such as ""deep massage,"" ""working out tough knots,"" or ""therapeutic sensation."" It's particularly suitable in scenarios emphasizing realistic user reactions to initial discomfort combined with beneficial relief from deep massage. Avoid using in generalized relaxation contexts, and instead align with scripts highlighting the intensity or effectiveness of targeted therapeutic massages.",The product is visible. ,40 out of 100.
https://storage.googleapis.com/somira/PersonPainfullyEnjoyingNeckMassage.mp4,"The selfie video features a person seated comfortably on a couch within a relaxed home environment, using the Somira neck massager placed around their neck and shoulders. The individual's facial expression vividly conveys an experience of intense massage therapy, characterized by slightly furrowed eyebrows, partially squinted eyes, and a subtly tense mouth indicating discomfort coupled with therapeutic relief. Despite the visible intensity, the expression clearly communicates beneficial outcomes and appreciation of the massageβs deep effectiveness. The brief, expressive 1.5-second clip authentically portrays the experience of intense massage pressure effectively relieving stubborn muscle tension.",1.5 seconds,"Align this video carefully with TTS narratives mentioning intense therapeutic massages, deep muscle relief, phrases such as ""intensive massage,"" ""deep therapeutic effects,"" or explicit descriptions of initial discomfort transforming into beneficial relief. Ideal for illustrating realistic, impactful massage experiences, use this clip selectively to reinforce messaging on deep massage benefits, effectively targeting muscle tension, knots, and substantial relief from severe neck discomfort or stiffness.",The product is visible. ,40 out of 100.
https://storage.googleapis.com/somira/WeirdChiropractorTreatmentNeckPain.mp4,"The video is a brief, unusual chiropractic treatment for neck pain, lasting approximately one second. It features a chiropractor holding a towel wrapped under the patient's neck, pulling upward while pushing down firmly on the patient's shoulders with his bare feet. Only the chiropractor's hands, feet, towel, and the patient's uncomfortable facial expression are visible, creating a strikingly peculiar visual. This video strongly communicates a unique, niche treatment method rather than conventional chiropractic care.",1 second,"Use this video specifically when the TTS script mentions unusual or alternative chiropractic techniques, unconventional neck pain treatments, or emphasizes unique therapeutic experiences that are a bad solution compared to using the Somira massager product. Avoid using this clip in generalized wellness or standard chiropractic care contexts due to its peculiar nature. Only use this clip when it matches the context specifically.Β ",The product is not visible.,60 out of 100.
https://storage.googleapis.com/somira/NeckMassage.mp4,"This three-second video presents a serene, professional point-of-view (POV) of an expert masseuse performing a relaxing massage on a person's upper back. The setting emphasizes tranquility, with a soothing atmosphere enhanced by soft lighting, flowers, and candles visible in the background. The massage technique demonstrated is gentle, rhythmic, and intended to induce calmness and relaxation.",3 seconds,"Use this video whenever the TTS script discusses relaxation, professional massage therapy, stress relief, or tranquil and soothing massage experiences. It is ideal for scripts highlighting comfort, wellness practices, spa-like environments, or peaceful therapeutic care. It is also ideal to show that the Somira massager feels like a professional massage!",The product is not visible.,40 out of 100.
https://storage.googleapis.com/somira/PersonEnjoyingMassagerInBed.mp4,"This three-second clip captures a medium close-up of a person comfortably lying in bed, visibly relaxed with closed eyes, enjoying the soothing effects of the Somira neck massager. The massager is positioned snugly behind the neck, sandwiched between the pillow and neck, clearly demonstrating the activated red-light heat function. The person's tranquil expression and comfortable setting strongly convey a moment of peaceful self-care and wellness.",3 seconds,"Ideal for TTS scripts emphasizing relaxation, bedtime routines, comfort, wellness, stress relief, or self-care rituals. Use this clip particularly when mentioning the heat feature of the Somira massager or illustrating calming, end-of-day relaxation scenarios.",The product is visible. ,35 out of 100.
|
| 17 |
+
https://storage.googleapis.com/somira/OldManWearingSomira.mp4,"An older man, approximately fifty years old, sits on a couch at home, visibly experiencing intense but beneficial discomfort from using the Somira massager. His expression vividly conveys a combination of pain and relief, indicating deep tissue or therapeutic massage effects, particularly effective for tension relief in the shoulders and upper back.",3 seconds,"Use this video exclusively when the script explicitly references older users, mature adults, or hard workers and highlights deep, intense, therapeutic massages. It is particularly suitable for discussing targeted pain relief, overcoming muscular tension, or demonstrating the product's efficacy for older adults experiencing chronic muscle stiffness. It can also be used to show the product is used by a diverse group of people.",The product is visible. ,50 out of 100.
|
| 18 |
+
https://storage.googleapis.com/somira/PersonShowingButtons.mp4,"This brief, focused video prominently shows a personβs face and hands as they hold a specific part of the Somira massager's armrest close to the camera, clearly displaying the four navigation buttons. The setting is a casual home environment, conveying a personal and authentic feel. The buttons are held prominently at chest-level, highlighting ease of access and control. The person has a calm, pleasant expression, enhancing the product's approachable and user-friendly impression.",3 seconds,"Use this video specifically when the TTS script references the Somira massager's navigation buttons, ease of control, or various available functions. It's particularly suitable for emphasizing product usability, demonstrating ease of switching modes, or when explaining the convenience of accessing multiple massage options through simple controls.",The product is visible. ,40 out of 100.
|
| 19 |
+
https://storage.googleapis.com/somira/TurningOnMassager.mp4,"This short video clearly illustrates the Somira massagerβs primary massage function. The video captures the person's face and upper body, with hands holding the massager closely at chest-level. After pressing the top navigation button to turn on the device, the focus shifts directly onto the massage knots, clearly visible as they begin gently moving in a rhythmic motion. The scene is calm, warm, and personal, showcasing the massagerβs core massage functionality without activating the heat option.",3 seconds,"Ideal for TTS scripts that explicitly mention activating the massage function, general massaging capabilities, product functionality demonstrations, or emphasizing the soothing movement of massage knots. Avoid this clip when specifically highlighting the heat or red-light function. This is also a great general video of the massage product that can be used in a large number of circumstances or as fallback. ",The product is visible. ,40 out of 100.
|
| 20 |
+
https://storage.googleapis.com/somira/ShowingMassageFunctionWithHeatRedLight.mp4,"In this concise video, the person clearly demonstrates the Somira massager's combined massage and heat functions. The frame prominently features the individual's calm, satisfied expression and their hands holding the massager. The massage knots visibly move, illuminated distinctly by the red lights indicating the heat therapy function. The setting remains warm and personal, emphasizing comfort and product effectiveness in delivering heated massage therapy. Red lights and heat are turned on.",3 seconds,"Use this video when the script explicitly mentions the combination of massage with heat therapy, red-light features, or the massager's multifunctional capabilities. It is a perfect fit when the script talks about the red-light and heat function. It's especially suitable for highlighting therapeutic warmth, enhanced relaxation, or the effectiveness of heat in relieving tension and soothing muscles. This is also a great general video of the massage product that can be used in a large number of circumstances or as a fallback.",The product is visible. ,35 out of 100.
|
| 21 |
+
https://storage.googleapis.com/somira/PersonWearingTheSomiraMassager.mp4,"This straightforward video features a person wearing the Somira massager around their neck in a comfortable home setting. The medium-shot frames the person from the waist up, presenting a calm, content facial expression to convey ease and satisfaction. The massager is worn naturally and visibly, without the heat or red-light function activated, emphasizing comfort, wearability, and everyday usage.",3 seconds,"Use this video whenever the TTS script discusses comfort, ease of use, or wearable convenience of the Somira massager. It's especially appropriate for general descriptions of product usage, daily wellness routines, or emphasizing the natural comfort and simplicity of incorporating the massager into everyday life. Avoid using when specifically mentioning the heat or red-light features.",The product is visible. ,35 out of 100.
|
| 22 |
+
https://storage.googleapis.com/somira/PersonWearingMassagerWithRedLightHeat.mp4,"This medium-shot video captures a person comfortably wearing the Somira massager around their neck in a relaxed home environment. The person's facial expression indicates satisfaction and tranquility. The massager is visibly activated, showcasing the glowing red-light feature signifying the therapeutic heat function. The video effectively communicates relaxation, enhanced comfort, and the added benefits of warmth for tension relief.",3 seconds,"Ideal for scripts specifically highlighting the red-light heat therapy feature, relaxation through warmth, or enhanced comfort and stress relief provided by the massager. Perfect for emphasizing scenarios involving deeper muscle relaxation, therapeutic heat benefits, or soothing, end-of-day wellness routines.",The product is visible. ,40 out of 100.
|
| 23 |
+
https://storage.googleapis.com/somira/Short3DanimationSomiraMassager.mp4,"This concise yet elegant video provides a professional 3D animated visualization of the Somira massager, displayed against a single attractive and calm background. The product slowly rotates in a smooth, slow-motion animation, effectively showcasing its key design elements and refined aesthetic.",3 seconds,"Ideal for scripts that introduce the product, emphasize its attractive design, or describe general benefits and features. Because it shows the entire product, it's suitable for many use cases, particularly as an introductory visual or a clean, professional representation of the Somira massager.",The product is fully visible. ,40 out of 100.
|
| 24 |
+
https://storage.googleapis.com/somira/Long3DanimationSomiraMassagerr.mp4,"This video features a polished, professional 3D animation showcasing the Somira massager product. The massager is presented against three distinct, calm, and aesthetically pleasing backgrounds, gently rotating in slow-motion to display its sleek design and features comprehensively. The animation effectively highlights the product's shape, texture, and overall appeal without distractions.",1 second,"Use this video when the script emphasizes product design, aesthetic appeal, general features, or introduces the Somira massager to viewers. It's particularly suitable for visually engaging presentations or as a versatile fallback clip due to its clean and professional appearance.",The product is fully visible. ,40 out of 100.
|
| 25 |
+
https://storage.googleapis.com/somira/MassagerMassageAndHeatFeatureDarkBackgroundd.mp4,"The video provides a close-up view of the Somira Massager, clearly showing it powered on with both the massage function and the red-light heat therapy feature activated. It gradually moves closer towards the massage knots to highlight key product details. The original background has been digitally replaced with a dark blue backdrop, prominently showcasing the massager without distraction. This handheld mobile shot is smooth but engaging, focusing exclusively on the product with no visible people. The video should be trimmed in length for optimal engagement. It is ideal as a general or fallback clip for clearly demonstrating product features.",1 second,"Use this video when the TTS script explicitly mentions or implies ""massage knots,"" ""heat therapy,"" ""red-light,"" ""product features,"" ""massager functionality,"" or ""product close-up."" Additionally, it's suitable when discussing detailed product benefits, highlighting massage effectiveness, or referring generally to the Somira Massager. Due to its clarity, versatility, and focus on specific product functions, it also works effectively as a fallback or filler video in many scenarios.",The product is visible. ,40 out of 100.
|
| 26 |
+
https://storage.googleapis.com/somira/3D1.mp4,"This animation presents the Somira neck massager in a slow, floating rotation against a neutral grey background. The product is fully visible and remains centered throughout the clip, allowing for a clean, uninterrupted view. The darker tones give the scene a polished, professional atmosphere that feels serious and refined. The subtle floating motion highlights the productβs design and shape without distraction, creating a premium visual experience that communicates sophistication.",2 seconds,"This clip is best used when the TTS script introduces the product in a professional tone, emphasizes its premium design, or highlights general product features. Its clear full-frame view of the massager makes it a versatile option when showcasing the product itself. It works particularly well in moments where the narration is focused on credibility, quality, or premium appeal.",The product is fully visible. ,35 out of 100.
|
| 27 |
+
https://storage.googleapis.com/somira/3D3.mp4,"This stylish 3D animation showcases the Somira neck massager as it floats into view, rotating gently on a warm beige background that reflects the brandβs primary color palette. The product is visible from behind and at an angle, creating a more dynamic and engaging perspective compared to a static front view. The floating entrance from the top adds energy and intrigue, while the soft beige tones keep the presentation aligned with brand aesthetics.",2 seconds,"This clip works best when the narration focuses on brand style, product design identity, or when adding variety to a sequence of product animations. Since the product is only partially shown, itβs not ideal for general showcases but is effective for moments emphasizing mood, brand sophistication, or visual flair. Use it to create engaging transitions or to highlight design language in a more subtle, stylish way.",The product is partially visible. ,35 out of 100.
|
| 28 |
+
https://storage.googleapis.com/somira/3D4.mp4,"In this animation, the Somira massager floats down from above and moves smoothly into the center of the frame, fully visible against a dark brown background. The reveal effect adds an engaging element, as the product appears with motion rather than remaining static. The darker color scheme lends a sense of richness and depth, while the centered product presentation ensures a clear focus on design and shape.",2 seconds,"This clip is ideal when the narration highlights a reveal, discovery, or introduction to the product. It pairs effectively with lines that position the massager as a solution or innovation being presented to the viewer. Its smooth floating reveal makes it a good choice for energetic or engaging script moments that call for a touch of drama while keeping the product clearly visible.",The product is visible. ,50 out of 100.
|
| 29 |
+
https://storage.googleapis.com/somira/3dAnimationControls.mp4,"This close-up 3D animation highlights the Somira neck massagerβs smart control buttons with precision and clarity. Against a sleek black background, the camera smoothly pans across the armrest, focusing exclusively on the navigation buttons. The darker backdrop and close framing create a professional, technical mood, emphasizing the usability and sophistication of the controls.",3 seconds,"This video is best aligned with TTS narration that discusses smart controls, ease of navigation, or multiple massage modes. The serious, professional aesthetic works well when emphasizing advanced product functionality, precision, or innovation. Itβs particularly effective when highlighting technical aspects of usability or showcasing the quality of design.",The product is partially visible. ,40 out of 100.
|
| 30 |
+
https://storage.googleapis.com/somira/3Dsmartcontrols.mp4,"This 3D animation highlights the Somira neck massagerβs smart control buttons in a premium, cinematic style. Set against a beige-golden background, the camera smoothly zooms in with a soft, lens-blur effect that creates depth and a realistic sense of focus. The lighting and movement emphasize the design quality of the controls while maintaining a bright, modern atmosphere. The polished visuals communicate luxury, sophistication, and intuitive usability.",2 seconds,"This video is best suited for narration that highlights the massagerβs smart controls, ease of operation, or advanced functionality. The warm tones and cinematic blur create a lighter, inviting mood, making it especially effective when emphasizing convenience, accessibility, and lifestyle benefits. It reinforces the premium design of the product while showing how simple and elegant the control functions are to use.",The product is partially visible. ,40 out of 100.
|
| 31 |
+
https://storage.googleapis.com/somira/3DlightEffectReveal.mp4,"This elegant animation showcases the Somira neck massager as it remains fully centered and still, while light sweeps gracefully across the product. The shifting illumination reveals its contours, textures, and refined design in dramatic fashion. The interplay of light and shadow creates a luxurious reveal effect that positions the product as premium and aspirational.",2 seconds,"This video pairs well with narration that emphasizes luxury, design, craftsmanship, or premium quality. It is especially suitable for introducing the product in a dramatic or stylish way, or when highlighting elegance and sophistication. Use it to create visual impact during moments where the TTS positions the Somira massager as a high-end, beautifully designed product.",The product is visible. ,40 out of 100.
|
| 32 |
+
https://storage.googleapis.com/somira/Animation.mp4,"This animation presents the Somira massager at a slight front-facing angle, slowly zooming in to provide a full, unobstructed view of the product. Against a sleek black background, the entire product remains clearly visible throughout the clip. The lack of flashy effects or distractions makes it a clean, versatile, and highly usable animation, with a premium yet straightforward style.",2 seconds,"This clip is highly flexible and can be used in a wide range of scenarios whenever the narration discusses the product itself. Its clarity and simplicity make it a strong fallback option, ideal for moments that require a direct product showcase. It works well with general mentions of the Somira massager, as well as segments that highlight design, usability, or overall appeal.",The product is fully visible. ,35 out of 100.
|
| 33 |
+
https://storage.googleapis.com/somira/GirlWearingSomiraNeckMassagerCalmSettingWarm.mp4,"This video shows a woman seated comfortably with the Somira neck massager around her neck. The front view captures her calm, content expression as she leans her head slightly, reinforcing a sense of ease and satisfaction. The background features a warm, elegant night scene with the Eiffel Tower in view, paired with beige furniture that adds to the cozy, stylish atmosphere. The soft lighting and warm tones highlight both the product and the luxurious setting, creating a polished, inviting mood.",2 seconds,"This video is best aligned with narration that emphasizes personal comfort, stylish relaxation, or the feeling of satisfaction when using the Somira massager. The premium background and calm demeanor make it ideal when the script highlights lifestyle benefits, wellness, or the elegance of incorporating the massager into daily routines. It visually reinforces comfort in a high-end, aspirational context.",The product is visible. ,40 out of 100.
|
| 34 |
+
https://storage.googleapis.com/somira/HappyPersonWearingSomiraMassager.mp4,"This video captures a person seated at home with the Somira neck massager worn naturally around the neck. The individual looks directly toward the camera with a bright, joyful smile and visible teeth, radiating happiness and satisfaction. The home setting feels casual yet inviting, creating a relatable, everyday environment. The combination of genuine expression and the clear view of the product emphasizes both enjoyment and authenticity.",2 seconds,"This video works perfectly when the narration highlights customer happiness, satisfaction, or the positive emotional impact of using the Somira massager. The clear smile conveys authenticity, making it highly effective for lines that reinforce trust, customer experience, or overall joy from the product. It is especially strong for emphasizing real, happy results.",The product is visible. ,40 out of 100.
|
| 35 |
+
https://storage.googleapis.com/somira/ModelGirlWearingSomira.mp4,"This video presents a model standing confidently with the Somira massager worn around her neck. The stylish background features the Eiffel Tower in a luxurious urban setting, elevating the overall presentation. The modelβs composed posture and fashionable appearance frame the product as a premium lifestyle accessory. The combination of professional lighting, elegant scenery, and centered focus communicates refinement and exclusivity.",2 seconds,"This video is best suited for narration that highlights elegance, premium lifestyle, or the fashionable appeal of the Somira massager. It pairs well with lines that connect the product to luxury, sophistication, or high-end self-care. The stylish setting makes it effective for aspirational messaging, where the product is positioned not just as a tool but as part of an elevated lifestyle.",The product is visible. ,40 out of 100.
|
| 36 |
+
https://storage.googleapis.com/somira/PersonWearingSomiraInSpace.mp4,"This entertaining video shows a person inside a futuristic spaceship environment while wearing the Somira neck massager. The setting is vibrant and imaginative, contrasting advanced technology with personal relaxation. The individualβs calm demeanor while seated highlights the ease of use and enjoyment of the massager, even in a highly unconventional location. The combination of the sci-fi background and the product in use creates a unique and playful presentation.",2 seconds,"This video works best when the narration leans into futuristic, innovative, or entertaining themes. It is highly effective for highlighting the idea that the Somira massager can bring comfort and relaxation anywhere, even in unexpected environments. Use it to add a fun, eye-catching twist to the message while still showcasing the product in use.",The product is visible. ,40 out of 100.
|
| 37 |
+
https://storage.googleapis.com/somira/BusinessPersonExperiencingNeckPain.mp4,"This video captures a stylish business professional in an office setting, visibly experiencing neck pain. Shot from a medium close-up that moves smoothly into a close-up, the handheld camera adds a dynamic, authentic feel. The person looks stressed yet composed, maintaining a professional charm while placing a hand on their neck to clearly signal discomfort. The modern office environment reinforces the relatability of workplace strain, while the body language makes the pain immediately recognizable.",1 second,"This clip works best when the narration addresses relatable daily struggles such as work-related stress, neck stiffness, or discomfort from long hours at a desk. The professional setting and expressive gesture make it highly effective for illustrating common lifestyle challenges that the Somira massager helps resolve. It is ideal for connecting with viewers who experience tension from work and need practical, stylish solutions.",The product is not visible.,45 out of 100.
|
| 38 |
+
https://storage.googleapis.com/somira/neckpain.mp4,"This high-end 3D animation presents a technical and professional visualization of neck pain. The animation depicts a semi-transparent human figure in shades of blue, showing both skeletal and muscular structures. Against a sleek black background, the camera zooms in on the neck area to emphasize tension and discomfort. The polished, anatomical rendering provides a scientific and precise way of communicating pain, stiffness, or strain in the neck.",3 seconds,"This video aligns best when the TTS script mentions neck pain and muscle stiffness. The high-end 3D anatomical style makes it different from live-action clips, giving it a credible and highly professional tone. It is most effective when the TTS highlights neck pain discomfort and needs a clear, accurate visualization of neck pain before presenting the Somira massager as the solution.",The product is not visible.,45 out of 100.
|
| 39 |
+
https://storage.googleapis.com/somira/StrongFaceExpressionNeckPain.mp4,"This video portrays a person working on a laptop while visibly struggling with neck pain. The individualβs exaggerated facial expressions and hand placed firmly on the neck create a playful yet instantly recognizable depiction of discomfort. The casual home or work setting keeps the scene relatable, while the strong acting style ensures that the pain is clear and attention-grabbing. The lighthearted exaggeration gives the clip an entertaining edge while still communicating the problem.",2 seconds,"This clip is highly effective when the narration emphasizes relatable tension, stress, or pain from everyday activities such as working at a computer. Its exaggerated style makes it ideal for more playful or entertaining edits while still conveying the key issue of neck discomfort. Use it when the script aims to quickly grab attention, connect with viewers through humor, and set the stage for showing how the Somira massager delivers relief.",The product is not visible.,50 out of 100.
|
| 40 |
+
https://storage.googleapis.com/somira/PersonExperiencingStressOffice.mp4,"This video shows a stylish business professional in an office setting, dealing with visible stress from work. Captured from a medium close-up that moves toward a close-up, the lighting uses strong contrasts to highlight the mood while keeping a polished and modern look. The individualβs expression and gestures convey fatigue and pressure but still maintain a sense of style and composure. The overall tone is serious yet relatable, presenting the challenges of office life in a way that feels authentic and professional.",2 seconds,"This video aligns best when the TTS script mentions office stress, long hours, or the strain of demanding work. The balanced tone of stress combined with the individualβs composed presence makes it effective for connecting with viewers in relatable professional scenarios. ",The product is not visible.,45 out of 100.
|
| 41 |
+
https://storage.googleapis.com/somira/DogPosingAsHumanInOfficeWorkingDesk.mp4,"This playful video features a dog dressed in a shirt and tie, seated at an office desk and typing on a laptop as if he were a human professional. The warm tone and lighting add charm while highlighting the humorous contrast of a dog acting in a role that only people normally do. The dogβs serious, focused expression mimics the stress and concentration of office work, turning a common theme of workplace pressure into an entertaining, impossible, and eye-catching scenario.",2 seconds,"This video aligns best when the TTS script talks about stress, office life, or workplace responsibilities in a fun and relatable way. The unique humor of a dog posing as a human professional makes it effective for playful edits that still connect to the themes of long hours, desk jobs, and the pressures of modern work.",The product is not visible.,45 out of 100.
|
| 42 |
+
https://storage.googleapis.com/somira/TiredDogPosingAsHomanInOfficeJob.mp4,"This entertaining video shows a dog wearing a suit and seated at an office desk, surrounded by multiple computer screens, working like a human professional. The cooler lighting tone emphasizes a slightly more serious and tired atmosphere, contrasting the humor of the setup. The dogβs concentrated, weary expression mimics the exhaustion of office workers, creating a funny and exaggerated take on stress and overwork in a corporate environment.",1 second,"This video aligns best when the TTS script emphasizes stress, long hours, or the strain of demanding office jobs. The humor of a dog posing as a human worker makes it memorable and engaging, while the cooler tone underscores the tiredness and intensity of workplace stress in an entertaining way.",The product is not visible.,45 out of 100.
|
| 43 |
+
https://storage.googleapis.com/somira/BackFlop.mp4,"This short, entertaining clip shows a parkour athlete attempting a backflip but landing flat on his back on the concrete. The video cuts right before the impact, creating a funny, exaggerated fail moment that hooks attention instantly. The sudden cutoff adds suspense and humor, making it highly engaging and shareable.",1 second,"This video aligns best when the TTS script references sudden pain, unexpected mistakes, or back and neck discomfort in a humorous way. Its exaggerated fail makes it an ideal viral-style hook to grab attention and inject energy.",The product is not visible.,90 out of 100.
|
| 44 |
+
https://storage.googleapis.com/somira/BumpySlide.mp4,"This playful video shows an adult sliding down a childrenβs slide at high speed. The oversized rider bumps along the way and the clip cuts just before hitting the ground, leaving the viewer laughing at the exaggerated, chaotic outcome. The comedic timing and relatable playground setting make it instantly engaging.",2 seconds,"This video aligns best when the TTS script mentions relatable discomforts, funny struggles, or moments of tension such as neck pain, back strain, or daily stress. The exaggerated slide fail makes it versatile as a hook or an entertaining insert that keeps attention high and adds humor, no matter where it appears in the edit.",The product is not visible.,90 out of 100.
|
| 45 |
+
https://storage.googleapis.com/somira/Car-Flip-Over.mp4,"This shocking short clip shows a person struck at the legs by a moving car and flipping into the air. The video cuts before the fall lands, creating suspense and making the exaggerated fail more attention-grabbing. The dramatic action makes it both funny and unforgettable as a viral-style moment.",2 seconds,"This video aligns best when the TTS script mentions discomfort, unexpected strain, or exaggerated situations like muscle pain or body stress. The dramatic action makes it a versatile choice as a hook or entertaining insert, keeping the audience engaged and adding humor and surprise at different points in the edit.",The product is not visible.,90 out of 100.
|
| 46 |
+
https://storage.googleapis.com/somira/Construction-Water.mp4,"This entertaining video shows two construction workers handling a large pipe when water suddenly bursts out uncontrollably. One of them slips during the chaos, adding slapstick humor to the scene. The surprise and exaggerated reaction create a funny, lighthearted viral moment.",3 seconds,"This video aligns best when the TTS script highlights unexpected challenges, stressful moments, or chaotic situations. Its playful style makes it a versatile insert to add humor, boost engagement, and keep the audience entertained throughout the edit.",The product is not visible.,85 out of 100.
|
| 47 |
+
https://storage.googleapis.com/somira/DonkeyTakesTumble.mp4,"This goofy clip shows a donkey attempting to hop over a fence but getting stuck in a hilariously awkward way. The clumsy and unexpected result creates laughter and surprise, making it a memorable and entertaining viral-style fail.",1 second,"This video aligns best when the TTS script refers to awkward moments, clumsy mistakes, or humorous discomforts. Its exaggerated tumble makes it a versatile choice for injecting humor, keeping energy high, and grabbing attention in playful edits.",The product is not visible.,90 out of 100.
|
| 48 |
+
https://storage.googleapis.com/somira/FunnyPersonDoingHulaHoop.mp4,This lively clip shows a person in a gym exaggerating their hula hoop moves in a funny and playful way. The over-the-top energy and unexpected expressions make the video entertaining and instantly engaging for viewers.,1 second,"This video aligns best when the TTS script emphasizes playful exaggeration, bursts of energy, or lighthearted moments. Its humor and upbeat tone make it a flexible option to keep the edit fun, engaging, and entertaining for the audience. It also fits narration about positive outcomes such as success, relief, relaxation, or wellness.",The product is not visible.,85 out of 100.
|
| 49 |
+
https://storage.googleapis.com/somira/PeopleTurningLookingEngagingFace.mp4,"This playful video shows three people dramatically turning their heads with wide-eyed, exaggerated expressions of surprise and amazement. The synchronized reactions are humorous and create intrigue, making viewers curious about what they are looking at.",2 seconds,"This video aligns best when the TTS script introduces something impressive, highlights a surprising benefit, or builds curiosity. The exaggerated reactions make it ideal for playful reveals, adding humor and keeping the audience engaged.",The product is not visible.,85 out of 100.
|
| 50 |
+
https://storage.googleapis.com/somira/Tube-Launch.mp4,"This high-energy clip shows a person on an inflatable tube being launched at high speed across a lake. The video cuts before the landing, amplifying suspense and leaving viewers laughing at the chaotic ride. The extreme motion and timing make it instantly engaging and attention-grabbing.",2 seconds,"This video aligns best when the TTS script highlights problems or pain. Its fast-paced style makes it a versatile mid-edit hook insert, adding humor, surprise, and entertainment to keep the video lively.",The product is not visible.,100 out of 100.
|
| 51 |
+
https://storage.googleapis.com/somira/PersonWearingMassagerRelaxedd.mp4,"This video shows a man sitting on the couch with his eyes closed, deeply relaxed, while the Somira neck massager is turned on behind his neck. Calm, natural at-home setting.",2 seconds,"Use this clip specifically when the TTS script references therapeutic massage effects, deep-tissue relief, or the general relaxation benefits of using the Somira neck massager. It is particularly suitable for showing strong comfort and relaxation while the massager is in use.",The product is visible. ,35 out of 100.
|
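For orientation, the rows above are the clip metadata that the selection step draws from. Below is a minimal sketch of how such a library CSV could be loaded and ranked against a TTS script; the column names, the keyword-overlap heuristic, and the CSV path are assumptions for illustration only, since the real selection logic lives in src/asset_selector.py, which is not shown here.

import pandas as pd

# Assumed column order for src/somira_video_library.csv (six fields per row).
COLUMNS = ["url", "description", "duration", "usage_guidance", "product_visibility", "score"]

def load_library(csv_path: str) -> pd.DataFrame:
    df = pd.read_csv(csv_path)
    df.columns = COLUMNS  # hypothetical names; the file's real header row is not shown above
    # "1.5 seconds" -> 1.5, "3 seconds" -> 3.0
    df["duration_s"] = df["duration"].str.extract(r"([\d.]+)", expand=False).astype(float)
    # "40 out of 100." -> 40
    df["score_num"] = df["score"].str.extract(r"(\d+)", expand=False).astype(int)
    return df

def rank_clips(df: pd.DataFrame, tts_script: str, top_n: int = 5) -> pd.DataFrame:
    # Naive keyword overlap between the narration and each clip's usage guidance,
    # weighted by the clip's library score.
    script_words = set(tts_script.lower().split())
    overlap = df["usage_guidance"].map(lambda t: len(script_words & set(str(t).lower().split())))
    ranked = df.assign(rank_score=overlap * df["score_num"])
    return ranked.sort_values("rank_score", ascending=False).head(top_n)

if __name__ == "__main__":
    library = load_library("src/somira_video_library.csv")
    script = "Long hours at a desk cause neck pain; the heat and red light bring deep relief."
    print(rank_clips(library, script)[["url", "duration_s", "rank_score"]])

A real selector would likely also respect the per-clip constraints spelled out in the guidance column (fallback clips, heat-only clips, humor-only hooks) rather than relying on raw keyword overlap.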
src/utils.py
CHANGED
|
@@ -77,132 +77,3 @@ def setup_logger(name='ContentAutomation', level=logging.INFO, log_file=None):
|
|
| 77 |
|
| 78 |
# Create global logger instance
|
| 79 |
logger = setup_logger()
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
def format_duration(seconds: float) -> str:
|
| 83 |
-
"""
|
| 84 |
-
Format duration in seconds to human-readable string
|
| 85 |
-
|
| 86 |
-
Args:
|
| 87 |
-
seconds: Duration in seconds
|
| 88 |
-
|
| 89 |
-
Returns:
|
| 90 |
-
Formatted string (e.g., "1m 23s" or "45s")
|
| 91 |
-
"""
|
| 92 |
-
if seconds < 60:
|
| 93 |
-
return f"{seconds:.1f}s"
|
| 94 |
-
|
| 95 |
-
minutes = int(seconds // 60)
|
| 96 |
-
remaining_seconds = seconds % 60
|
| 97 |
-
|
| 98 |
-
if minutes < 60:
|
| 99 |
-
return f"{minutes}m {remaining_seconds:.0f}s"
|
| 100 |
-
|
| 101 |
-
hours = int(minutes // 60)
|
| 102 |
-
remaining_minutes = minutes % 60
|
| 103 |
-
return f"{hours}h {remaining_minutes}m"
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
def format_file_size(size_bytes: int) -> str:
|
| 107 |
-
"""
|
| 108 |
-
Format file size in bytes to human-readable string
|
| 109 |
-
|
| 110 |
-
Args:
|
| 111 |
-
size_bytes: Size in bytes
|
| 112 |
-
|
| 113 |
-
Returns:
|
| 114 |
-
Formatted string (e.g., "1.5 MB")
|
| 115 |
-
"""
|
| 116 |
-
for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
|
| 117 |
-
if size_bytes < 1024.0:
|
| 118 |
-
return f"{size_bytes:.1f} {unit}"
|
| 119 |
-
size_bytes /= 1024.0
|
| 120 |
-
return f"{size_bytes:.1f} PB"
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
def validate_video_config(config: dict) -> bool:
|
| 124 |
-
"""
|
| 125 |
-
Validate video configuration parameters
|
| 126 |
-
|
| 127 |
-
Args:
|
| 128 |
-
config: Video configuration dictionary
|
| 129 |
-
|
| 130 |
-
Returns:
|
| 131 |
-
True if valid, False otherwise
|
| 132 |
-
"""
|
| 133 |
-
valid_aspect_ratios = ['16:9', '9:16', '1:1', '4:5']
|
| 134 |
-
valid_styles = ['commercial', 'minimal', 'cinematic', 'social']
|
| 135 |
-
|
| 136 |
-
if 'aspect_ratio' in config:
|
| 137 |
-
if config['aspect_ratio'] not in valid_aspect_ratios:
|
| 138 |
-
logger.warning(f"Invalid aspect ratio: {config['aspect_ratio']}")
|
| 139 |
-
return False
|
| 140 |
-
|
| 141 |
-
if 'style' in config:
|
| 142 |
-
if config['style'] not in valid_styles:
|
| 143 |
-
logger.warning(f"Invalid style: {config['style']}")
|
| 144 |
-
return False
|
| 145 |
-
|
| 146 |
-
if 'duration' in config:
|
| 147 |
-
if not (1 <= config['duration'] <= 60):
|
| 148 |
-
logger.warning(f"Invalid duration: {config['duration']}s (must be 1-60)")
|
| 149 |
-
return False
|
| 150 |
-
|
| 151 |
-
return True
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
def sanitize_filename(filename: str) -> str:
|
| 155 |
-
"""
|
| 156 |
-
Sanitize filename by removing invalid characters
|
| 157 |
-
|
| 158 |
-
Args:
|
| 159 |
-
filename: Original filename
|
| 160 |
-
|
| 161 |
-
Returns:
|
| 162 |
-
Sanitized filename
|
| 163 |
-
"""
|
| 164 |
-
import re
|
| 165 |
-
# Remove invalid characters
|
| 166 |
-
filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
|
| 167 |
-
# Remove leading/trailing spaces and dots
|
| 168 |
-
filename = filename.strip('. ')
|
| 169 |
-
return filename
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
def generate_video_id() -> str:
|
| 173 |
-
"""
|
| 174 |
-
Generate unique video ID based on timestamp
|
| 175 |
-
|
| 176 |
-
Returns:
|
| 177 |
-
Unique video ID string
|
| 178 |
-
"""
|
| 179 |
-
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
|
| 180 |
-
return f"video_{timestamp}"
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
class ProgressTracker:
|
| 184 |
-
"""Track progress of multi-step operations"""
|
| 185 |
-
|
| 186 |
-
def __init__(self, total_steps: int, description: str = "Processing"):
|
| 187 |
-
self.total_steps = total_steps
|
| 188 |
-
self.current_step = 0
|
| 189 |
-
self.description = description
|
| 190 |
-
self.start_time = datetime.now()
|
| 191 |
-
|
| 192 |
-
def update(self, step_name: str):
|
| 193 |
-
"""Update progress to next step"""
|
| 194 |
-
self.current_step += 1
|
| 195 |
-
progress = (self.current_step / self.total_steps) * 100
|
| 196 |
-
elapsed = (datetime.now() - self.start_time).total_seconds()
|
| 197 |
-
|
| 198 |
-
logger.info(
|
| 199 |
-
f"[{progress:.0f}%] Step {self.current_step}/{self.total_steps}: "
|
| 200 |
-
f"{step_name} (Elapsed: {format_duration(elapsed)})"
|
| 201 |
-
)
|
| 202 |
-
|
| 203 |
-
def complete(self):
|
| 204 |
-
"""Mark progress as complete"""
|
| 205 |
-
elapsed = (datetime.now() - self.start_time).total_seconds()
|
| 206 |
-
logger.info(
|
| 207 |
-
f"β {self.description} completed in {format_duration(elapsed)}"
|
| 208 |
-
)
|
|
|
|
| 77 |
|
| 78 |
# Create global logger instance
|
| 79 |
logger = setup_logger()
|
src/video_renderer.py
CHANGED
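The hunks below rework the renderer so the cut is driven by the TTS narration: the narration's duration becomes the target length, background music is trimmed and ducked under the voice, and the hook video is split at its midpoint so its second half opens the video and its first half closes it, which makes the finished clip loop seamlessly. A minimal standalone sketch of that loop-split idea, with hypothetical durations (the real implementation uses moviepy's subclip in the `+` lines further down):

def seamless_order(hook_duration: float, library_durations: list) -> list:
    """Return (label, start, end) segments in playback order for a seamless loop."""
    mid = hook_duration / 2
    timeline = [("hook 2nd half", mid, hook_duration)]  # opens the video
    timeline += [(f"library clip {i}", 0.0, d) for i, d in enumerate(library_durations)]
    timeline.append(("hook 1st half", 0.0, mid))        # closes the video
    # When the platform loops the video, "hook 1st half" runs straight into
    # "hook 2nd half", so the original hook plays through without a visible cut.
    return timeline

if __name__ == "__main__":
    for label, start, end in seamless_order(8.0, [3.0, 2.0, 3.0]):
        print(f"{label}: {start:.1f}s -> {end:.1f}s")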
|
@@ -8,6 +8,7 @@ if not hasattr(PIL.Image, 'ANTIALIAS'):
|
|
| 8 |
|
| 9 |
import os
|
| 10 |
import tempfile
|
|
|
|
| 11 |
from typing import List, Dict, Optional
|
| 12 |
from pathlib import Path
|
| 13 |
|
|
@@ -16,7 +17,7 @@ from moviepy.editor import VideoFileClip, AudioFileClip, CompositeVideoClip, con
|
|
| 16 |
import numpy as np
|
| 17 |
import textwrap
|
| 18 |
|
| 19 |
-
from utils import logger
|
| 20 |
|
| 21 |
|
| 22 |
class VideoRenderer:
|
|
@@ -43,17 +44,21 @@ class VideoRenderer:
|
|
| 43 |
if not self._validate_assets(assets):
|
| 44 |
raise ValueError("Invalid assets provided for video rendering")
|
| 45 |
|
| 46 |
-
#
|
| 47 |
-
|
| 48 |
-
|
| 49 |
|
| 50 |
-
#
|
| 51 |
-
|
|
|
|
| 52 |
|
| 53 |
-
#
|
|
|
|
|
|
|
|
|
|
| 54 |
final_video = await self._add_audio_track(final_video, audio_clips)
|
| 55 |
|
| 56 |
-
# Add subtitles if script provided
|
| 57 |
if assets.get('tts_script'):
|
| 58 |
final_video = await self._add_subtitles(final_video, assets['tts_script'])
|
| 59 |
|
|
@@ -70,17 +75,51 @@ class VideoRenderer:
|
|
| 70 |
logger.error(f"β Video rendering failed: {e}")
|
| 71 |
raise
|
| 72 |
|
| 73 |
-
async def
|
| 74 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
clips = []
|
| 76 |
|
| 77 |
try:
|
| 78 |
-
# Load
|
| 79 |
if assets.get('hook_video'):
|
| 80 |
hook_clip = VideoFileClip(assets['hook_video']['local_path'])
|
|
|
|
| 81 |
hook_clip = hook_clip.without_audio()
|
| 82 |
-
|
| 83 |
-
logger.info(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
# Load library videos
|
| 86 |
for i, lib_video in enumerate(assets.get('selected_videos', [])):
|
|
@@ -99,35 +138,38 @@ class VideoRenderer:
|
|
| 99 |
clip.close()
|
| 100 |
raise
|
| 101 |
|
| 102 |
-
async def _prepare_audio_clips(self, assets: Dict) -> List[AudioFileClip]:
|
| 103 |
-
"""Load
|
| 104 |
clips = []
|
| 105 |
|
| 106 |
try:
|
| 107 |
-
# Load TTS audio
|
| 108 |
if assets.get('tts_audio') and assets['tts_audio'].get('local_path'):
|
| 109 |
try:
|
| 110 |
tts_clip = AudioFileClip(assets['tts_audio']['local_path'])
|
| 111 |
-
# Ensure the clip has proper duration
|
| 112 |
if tts_clip.duration > 0:
|
|
|
|
| 113 |
clips.append(('tts', tts_clip))
|
| 114 |
-
logger.info(f"β Loaded TTS audio: {tts_clip.duration:.2f}s")
|
| 115 |
else:
|
| 116 |
logger.warning("β οΈ TTS audio has zero duration")
|
| 117 |
tts_clip.close()
|
| 118 |
except Exception as e:
|
| 119 |
logger.error(f"β Failed to load TTS audio: {e}")
|
| 120 |
|
| 121 |
-
# Load background music
|
| 122 |
if assets.get('background_music_local'):
|
| 123 |
try:
|
| 124 |
bg_clip = AudioFileClip(assets['background_music_local'])
|
| 125 |
-
# Ensure the clip has proper duration
|
| 126 |
if bg_clip.duration > 0:
|
| 127 |
-
#
|
| 128 |
-
bg_clip
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
clips.append(('background', bg_clip))
|
| 130 |
-
logger.info(f"β Loaded background music: {bg_clip.duration:.2f}s")
|
| 131 |
else:
|
| 132 |
logger.warning("β οΈ Background music has zero duration")
|
| 133 |
bg_clip.close()
|
|
@@ -146,28 +188,133 @@ class VideoRenderer:
|
|
| 146 |
pass
|
| 147 |
raise
|
| 148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
async def _create_video_sequence(self, video_clips: List[VideoFileClip],
|
| 150 |
-
video_config: Optional[Dict]) -> VideoFileClip:
|
| 151 |
-
"""Create
|
| 152 |
try:
|
| 153 |
if not video_clips:
|
| 154 |
raise ValueError("No video clips available for sequence")
|
| 155 |
|
| 156 |
-
#
|
| 157 |
-
|
| 158 |
-
|
|
|
|
| 159 |
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
#
|
| 169 |
-
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
return final_sequence
|
| 173 |
|
|
@@ -177,6 +324,83 @@ class VideoRenderer:
|
|
| 177 |
clip.close()
|
| 178 |
raise
|
| 179 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
def _resize_for_vertical(self, clip: VideoFileClip, target_size: tuple) -> VideoFileClip:
|
| 181 |
"""Resize clip to fit vertical 9:16 aspect ratio"""
|
| 182 |
target_w, target_h = target_size
|
|
@@ -190,9 +414,8 @@ class VideoRenderer:
|
|
| 190 |
# Clip is taller, fit to width and crop height
|
| 191 |
new_clip = clip.resize(width=target_w)
|
| 192 |
|
| 193 |
-
# Center crop to exact size
|
| 194 |
try:
|
| 195 |
-
# Try the new method first
|
| 196 |
new_clip = new_clip.crop(
|
| 197 |
x_center=new_clip.w / 2,
|
| 198 |
y_center=new_clip.h / 2,
|
|
@@ -200,41 +423,20 @@ class VideoRenderer:
|
|
| 200 |
height=target_h
|
| 201 |
)
|
| 202 |
except Exception:
|
| 203 |
-
# Fallback method
|
| 204 |
x1 = (new_clip.w - target_w) // 2
|
| 205 |
y1 = (new_clip.h - target_h) // 2
|
| 206 |
new_clip = new_clip.crop(x1=x1, y1=y1, x2=x1+target_w, y2=y1+target_h)
|
| 207 |
|
| 208 |
return new_clip
|
| 209 |
|
| 210 |
-
def _trim_clips_to_fit(self, clips: List[VideoFileClip], max_duration: float) -> List[VideoFileClip]:
|
| 211 |
-
"""Trim video clips to fit within max duration"""
|
| 212 |
-
trimmed_clips = []
|
| 213 |
-
remaining_duration = max_duration
|
| 214 |
-
|
| 215 |
-
for clip in clips:
|
| 216 |
-
if remaining_duration <= 0:
|
| 217 |
-
break
|
| 218 |
-
|
| 219 |
-
use_duration = min(clip.duration, remaining_duration)
|
| 220 |
-
if use_duration < clip.duration:
|
| 221 |
-
trimmed_clip = clip.subclip(0, use_duration)
|
| 222 |
-
trimmed_clips.append(trimmed_clip)
|
| 223 |
-
logger.info(f"Trimmed clip from {clip.duration:.1f}s to {use_duration:.1f}s")
|
| 224 |
-
else:
|
| 225 |
-
trimmed_clips.append(clip)
|
| 226 |
-
|
| 227 |
-
remaining_duration -= use_duration
|
| 228 |
-
|
| 229 |
-
return trimmed_clips
|
| 230 |
-
|
| 231 |
async def _add_audio_track(self, video_clip: VideoFileClip, audio_clips: List[AudioFileClip]) -> VideoFileClip:
|
| 232 |
-
"""Add audio track
|
| 233 |
if not audio_clips:
|
| 234 |
return video_clip
|
| 235 |
|
| 236 |
try:
|
| 237 |
-
# Filter
|
| 238 |
valid_audio_clips = []
|
| 239 |
for clip in audio_clips:
|
| 240 |
if clip.duration > 0:
|
|
@@ -246,24 +448,26 @@ class VideoRenderer:
|
|
| 246 |
if not valid_audio_clips:
|
| 247 |
return video_clip
|
| 248 |
|
| 249 |
-
# Mix all
|
| 250 |
mixed_audio = CompositeAudioClip(valid_audio_clips)
|
| 251 |
|
| 252 |
-
# Ensure audio doesn't exceed video duration
|
| 253 |
video_duration = video_clip.duration
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
|
| 258 |
# Add audio to video
|
| 259 |
video_with_audio = video_clip.set_audio(mixed_audio)
|
| 260 |
-
logger.info(f"
|
| 261 |
|
| 262 |
return video_with_audio
|
| 263 |
|
| 264 |
except Exception as e:
|
| 265 |
logger.error(f"β Failed to add audio track: {e}")
|
| 266 |
-
# Cleanup audio clips
|
| 267 |
for clip in audio_clips:
|
| 268 |
try:
|
| 269 |
clip.close()
|
|
@@ -272,82 +476,206 @@ class VideoRenderer:
|
|
| 272 |
return video_clip
|
| 273 |
|
| 274 |
async def _add_subtitles(self, video_clip: VideoFileClip, script: str) -> CompositeVideoClip:
|
| 275 |
-
"""Add animated subtitles
|
| 276 |
try:
|
| 277 |
-
|
|
|
|
| 278 |
text_clips = []
|
| 279 |
|
| 280 |
total_duration = video_clip.duration
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
duration_per_phrase = total_duration / len(phrases)
|
| 282 |
-
fade_duration = 0.3
|
| 283 |
|
| 284 |
-
|
| 285 |
|
| 286 |
for i, phrase in enumerate(phrases):
|
| 287 |
start_time = i * duration_per_phrase
|
|
|
|
| 288 |
|
| 289 |
-
|
| 290 |
-
max_chars_per_line = 25
|
| 291 |
-
wrapped_text = '\n'.join(textwrap.wrap(phrase, width=max_chars_per_line))
|
| 292 |
-
|
| 293 |
-
# Create text clip
|
| 294 |
-
text_clip = TextClip(
|
| 295 |
-
txt=wrapped_text,
|
| 296 |
-
fontsize=65,
|
| 297 |
-
color='yellow' if i % 2 == 1 else 'white',
|
| 298 |
-
font='Helvetica',
|
| 299 |
-
stroke_color='black',
|
| 300 |
-
stroke_width=4,
|
| 301 |
-
method='caption',
|
| 302 |
-
size=(int(target_width * 0.85), None)
|
| 303 |
-
)
|
| 304 |
|
| 305 |
-
#
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
text_clip = text_clip.set_start(start_time)
|
| 309 |
-
text_clip = text_clip.set_duration(duration_per_phrase)
|
| 310 |
|
| 311 |
-
#
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
|
| 316 |
# Combine video with subtitles
|
| 317 |
final_video = CompositeVideoClip([video_clip] + text_clips)
|
| 318 |
-
logger.info(f"
|
| 319 |
|
| 320 |
return final_video
|
| 321 |
|
| 322 |
except Exception as e:
|
| 323 |
logger.error(f"β Failed to add subtitles: {e}")
|
|
|
|
|
|
|
| 324 |
return video_clip
|
| 325 |
|
| 326 |
-
def
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 331 |
|
| 332 |
async def _render_final_video(self, video_clip: VideoFileClip) -> str:
|
| 333 |
-
"""Render final video
|
| 334 |
-
|
|
|
|
|
|
|
|
|
|
| 335 |
|
| 336 |
try:
|
| 337 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
|
| 339 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
str(output_path),
|
| 341 |
codec='libx264',
|
| 342 |
audio_codec='aac',
|
| 343 |
-
temp_audiofile=str(self.temp_dir / '
|
| 344 |
remove_temp=True,
|
| 345 |
fps=24,
|
| 346 |
verbose=False,
|
| 347 |
-
logger=None
|
| 348 |
)
|
| 349 |
|
| 350 |
-
logger.info(f"
|
| 351 |
return str(output_path)
|
| 352 |
|
| 353 |
except Exception as e:
|
|
@@ -358,7 +686,7 @@ class VideoRenderer:
|
|
| 358 |
|
| 359 |
def _validate_assets(self, assets: Dict) -> bool:
|
| 360 |
"""Validate that required assets are present"""
|
| 361 |
-
required = ['selected_videos', 'tts_audio']
|
| 362 |
|
| 363 |
for req in required:
|
| 364 |
if not assets.get(req):
|
|
|
|
| 8 |
|
| 9 |
import os
|
| 10 |
import tempfile
|
| 11 |
+
import uuid
|
| 12 |
from typing import List, Dict, Optional
|
| 13 |
from pathlib import Path
|
| 14 |
|
|
|
|
| 17 |
import numpy as np
|
| 18 |
import textwrap
|
| 19 |
|
| 20 |
+
from utils import logger
|
| 21 |
|
| 22 |
|
| 23 |
class VideoRenderer:
|
|
|
|
| 44 |
if not self._validate_assets(assets):
|
| 45 |
raise ValueError("Invalid assets provided for video rendering")
|
| 46 |
|
| 47 |
+
# Get TTS audio duration as the base duration
|
| 48 |
+
tts_duration = await self._get_audio_duration(assets.get('tts_audio', {}).get('local_path'))
|
| 49 |
+
logger.info(f"π TTS audio duration: {tts_duration:.2f}s - this will be our target video duration")
|
| 50 |
|
| 51 |
+
# Load and prepare all assets with TTS duration as target
|
| 52 |
+
video_clips = await self._prepare_video_clips(assets, tts_duration)
|
| 53 |
+
audio_clips = await self._prepare_audio_clips(assets, tts_duration)
|
| 54 |
|
| 55 |
+
# Create video sequence that matches TTS duration exactly
|
| 56 |
+
final_video = await self._create_video_sequence(video_clips, tts_duration, video_config)
|
| 57 |
+
|
| 58 |
+
# Add audio without any trimming
|
| 59 |
final_video = await self._add_audio_track(final_video, audio_clips)
|
| 60 |
|
| 61 |
+
# Add improved subtitles if script provided
|
| 62 |
if assets.get('tts_script'):
|
| 63 |
final_video = await self._add_subtitles(final_video, assets['tts_script'])
|
| 64 |
|
|
|
|
| 75 |
logger.error(f"β Video rendering failed: {e}")
|
| 76 |
raise
|
| 77 |
|
| 78 |
+
async def _get_audio_duration(self, audio_path: str) -> float:
|
| 79 |
+
"""Get the duration of the TTS audio file"""
|
| 80 |
+
try:
|
| 81 |
+
if not audio_path or not os.path.exists(audio_path):
|
| 82 |
+
logger.warning("β οΈ TTS audio path not found, using default duration")
|
| 83 |
+
return 12.0 # Fallback duration
|
| 84 |
+
|
| 85 |
+
audio_clip = AudioFileClip(audio_path)
|
| 86 |
+
duration = audio_clip.duration
|
| 87 |
+
audio_clip.close()
|
| 88 |
+
return duration
|
| 89 |
+
except Exception as e:
|
| 90 |
+
logger.error(f"β Failed to get audio duration: {e}")
|
| 91 |
+
return 12.0 # Fallback duration
|
| 92 |
+
|
| 93 |
+
async def _prepare_video_clips(self, assets: Dict, target_duration: float) -> List[VideoFileClip]:
|
| 94 |
+
"""Load and prepare all video clips - create seamless loop from hook video"""
|
| 95 |
clips = []
|
| 96 |
|
| 97 |
try:
|
| 98 |
+
# Load hook video for seamless looping
|
| 99 |
if assets.get('hook_video'):
|
| 100 |
hook_clip = VideoFileClip(assets['hook_video']['local_path'])
|
| 101 |
+
hook_duration = hook_clip.duration
|
| 102 |
hook_clip = hook_clip.without_audio()
|
| 103 |
+
|
| 104 |
+
logger.info(f"π Creating seamless loop from {hook_duration:.2f}s hook video")
|
| 105 |
+
|
| 106 |
+
# For seamless loop: Use SECOND HALF at start, FIRST HALF at end
|
| 107 |
+
# This creates: [second_half] -> [library videos] -> [first_half]
|
| 108 |
+
# When looped: [first_half][second_half] appears continuous
|
| 109 |
+
|
| 110 |
+
mid_point = hook_duration / 2
|
| 111 |
+
|
| 112 |
+
# Second half for beginning (e.g., 4-8s of an 8s video)
|
| 113 |
+
hook_start = hook_clip.subclip(mid_point, hook_duration)
|
| 114 |
+
clips.append(('hook_start', hook_start))
|
| 115 |
+
logger.info(f"β Hook start (second half): {hook_start.duration:.2f}s ({mid_point:.2f}s - {hook_duration:.2f}s)")
|
| 116 |
+
|
| 117 |
+
# First half for ending (e.g., 0-4s of an 8s video)
|
| 118 |
+
hook_end = hook_clip.subclip(0, mid_point)
|
| 119 |
+
clips.append(('hook_end', hook_end))
|
| 120 |
+
logger.info(f"β Hook end (first half): {hook_end.duration:.2f}s (0s - {mid_point:.2f}s)")
|
| 121 |
+
|
| 122 |
+
hook_clip.close()
|
| 123 |
|
| 124 |
# Load library videos
|
| 125 |
for i, lib_video in enumerate(assets.get('selected_videos', [])):
|
|
|
|
| 138 |
clip.close()
|
| 139 |
raise
|
| 140 |
|
| 141 |
+
async def _prepare_audio_clips(self, assets: Dict, target_duration: float) -> List[AudioFileClip]:
|
| 142 |
+
"""Load audio clips and prepare for speed adjustment"""
|
| 143 |
clips = []
|
| 144 |
|
| 145 |
try:
|
| 146 |
+
# Load TTS audio - KEEP ORIGINAL VOLUME (no reduction)
|
| 147 |
if assets.get('tts_audio') and assets['tts_audio'].get('local_path'):
|
| 148 |
try:
|
| 149 |
tts_clip = AudioFileClip(assets['tts_audio']['local_path'])
|
|
|
|
| 150 |
if tts_clip.duration > 0:
|
| 151 |
+
# Keep TTS at full volume (1.0x) - no volumex applied
|
| 152 |
clips.append(('tts', tts_clip))
|
| 153 |
+
logger.info(f"β Loaded TTS audio at FULL volume: {tts_clip.duration:.2f}s")
|
| 154 |
else:
|
| 155 |
logger.warning("β οΈ TTS audio has zero duration")
|
| 156 |
tts_clip.close()
|
| 157 |
except Exception as e:
|
| 158 |
logger.error(f"β Failed to load TTS audio: {e}")
|
| 159 |
|
| 160 |
+
# Load background music - VERY LOW volume to not compete with TTS
|
| 161 |
if assets.get('background_music_local'):
|
| 162 |
try:
|
| 163 |
bg_clip = AudioFileClip(assets['background_music_local'])
|
|
|
|
| 164 |
if bg_clip.duration > 0:
|
| 165 |
+
# Trim background music to match TTS duration
|
| 166 |
+
if bg_clip.duration > target_duration:
|
| 167 |
+
bg_clip = bg_clip.subclip(0, target_duration)
|
| 168 |
+
logger.info(f"β Trimmed background music to {target_duration:.2f}s")
|
| 169 |
+
# Reduce volume significantly - TTS should be dominant
|
| 170 |
+
bg_clip = bg_clip.volumex(0.08) # Reduced from 0.15 to 0.08 (very subtle)
|
| 171 |
clips.append(('background', bg_clip))
|
| 172 |
+
logger.info(f"β Loaded background music at 8% volume: {bg_clip.duration:.2f}s")
|
| 173 |
else:
|
| 174 |
logger.warning("β οΈ Background music has zero duration")
|
| 175 |
bg_clip.close()
|
|
|
|
| 188 |
pass
|
| 189 |
raise
|
| 190 |
|
| 191 |
+
async def _speed_up_audio_with_pitch_correction(self, audio_clip, speed_factor: float) -> AudioFileClip:
|
| 192 |
+
"""Speed up audio while preserving pitch using librosa (pitch-preserving time stretch)"""
|
| 193 |
+
try:
|
| 194 |
+
import numpy as np
|
| 195 |
+
try:
|
| 196 |
+
import librosa
|
| 197 |
+
import soundfile as sf
|
| 198 |
+
has_librosa = True
|
| 199 |
+
except ImportError:
|
| 200 |
+
has_librosa = False
|
| 201 |
+
logger.warning("β οΈ librosa not available, using simple speed-up (pitch will change)")
|
| 202 |
+
|
| 203 |
+
if not has_librosa:
|
| 204 |
+
# Fallback: use simple speedx (will change pitch)
|
| 205 |
+
logger.warning("β οΈ Using simple speedx - voice pitch will be higher")
|
| 206 |
+
# For audio, we can't use speedx directly, so we'll return the original
|
| 207 |
+
# and let the video speed handle it
|
| 208 |
+
return audio_clip
|
| 209 |
+
|
| 210 |
+
# Create temp paths
|
| 211 |
+
temp_input = str(self.temp_dir / f"audio_input_{uuid.uuid4().hex[:8]}.wav")
|
| 212 |
+
temp_output = str(self.temp_dir / f"audio_output_{uuid.uuid4().hex[:8]}.wav")
|
| 213 |
+
|
| 214 |
+
# Write original audio to temp file
|
| 215 |
+
logger.info(f"π΅ Exporting audio for pitch-preserving speed adjustment...")
|
| 216 |
+
audio_clip.write_audiofile(
|
| 217 |
+
temp_input,
|
| 218 |
+
fps=44100,
|
| 219 |
+
nbytes=2,
|
| 220 |
+
codec='pcm_s16le',
|
| 221 |
+
verbose=False,
|
| 222 |
+
logger=None
|
| 223 |
+
)
|
| 224 |
+
|
| 225 |
+
# Load audio with librosa
|
| 226 |
+
y, sr = librosa.load(temp_input, sr=44100)
|
| 227 |
+
|
| 228 |
+
# Time-stretch without changing pitch
|
| 229 |
+
logger.info(f"π΅ Applying pitch-preserving time stretch {speed_factor}x...")
|
| 230 |
+
y_stretched = librosa.effects.time_stretch(y, rate=speed_factor)
|
| 231 |
+
|
| 232 |
+
# Save the stretched audio
|
| 233 |
+
sf.write(temp_output, y_stretched, sr)
|
| 234 |
+
|
| 235 |
+
# Clean up input file
|
| 236 |
+
if os.path.exists(temp_input):
|
| 237 |
+
os.remove(temp_input)
|
| 238 |
+
|
| 239 |
+
# Load back as AudioFileClip
|
| 240 |
+
stretched_clip = AudioFileClip(temp_output)
|
| 241 |
+
|
| 242 |
+
logger.info(f"β
Audio sped up {speed_factor}x with preserved pitch using librosa")
|
| 243 |
+
return stretched_clip
|
| 244 |
+
|
| 245 |
+
except Exception as e:
|
| 246 |
+
logger.error(f"β Failed to speed up audio with pitch correction: {e}")
|
| 247 |
+
import traceback
|
| 248 |
+
logger.error(traceback.format_exc())
|
| 249 |
+
# Return original audio as fallback
|
| 250 |
+
logger.warning("β οΈ Returning original audio without speed adjustment")
|
| 251 |
+
return audio_clip
|
| 252 |
+
|
     async def _create_video_sequence(self, video_clips: List[VideoFileClip],
+                                     target_duration: float, video_config: Optional[Dict]) -> VideoFileClip:
+        """Create video sequence matching TTS audio duration exactly"""
         try:
             if not video_clips:
                 raise ValueError("No video clips available for sequence")

+            # Identify clips by matching against expected structure
+            hook_start = None
+            hook_end = None
+            library_clips = []

+            # First pass: identify hook clips (they should be equal duration, around 4s each for 8s video)
+            hook_candidates = []
+            for clip in video_clips:
+                if 3.0 <= clip.duration <= 5.0:  # Hook segments are typically 3-5 seconds
+                    hook_candidates.append(clip)
+                else:
+                    library_clips.append(clip)
+
+            # Assign hook clips if we found exactly 2
+            if len(hook_candidates) == 2:
+                hook_start = hook_candidates[0]
+                hook_end = hook_candidates[1]
+                logger.info(f"✅ Identified hook clips: start={hook_start.duration:.2f}s, end={hook_end.duration:.2f}s")
+            elif len(hook_candidates) > 0:
+                # Use what we have
+                hook_start = hook_candidates[0]
+                if len(hook_candidates) > 1:
+                    hook_end = hook_candidates[1]
+                logger.info(f"⚠️ Found {len(hook_candidates)} hook candidates, using available")
+
+            # Build sequence: [hook_start] -> [library_clips] -> [hook_end]
+            sequence_clips = []
+
+            if hook_start:
+                sequence_clips.append(hook_start)
+                logger.info(f"  Added hook_start: {hook_start.duration:.2f}s")
+
+            for i, clip in enumerate(library_clips):
+                sequence_clips.append(clip)
+                logger.info(f"  Added library_{i}: {clip.duration:.2f}s")
+
+            if hook_end:
+                sequence_clips.append(hook_end)
+                logger.info(f"  Added hook_end: {hook_end.duration:.2f}s")
+
+            # Calculate current total
+            current_duration = sum(clip.duration for clip in sequence_clips)
+            logger.info(f"📊 Current sequence: {current_duration:.2f}s, Target: {target_duration:.2f}s")
+
+            # Adjust to exact target duration
+            sequence_clips = self._adjust_clips_to_duration(sequence_clips, target_duration)
+
+            # Verify adjusted duration
+            adjusted_duration = sum(clip.duration for clip in sequence_clips)
+            logger.info(f"📊 Adjusted sequence: {adjusted_duration:.2f}s")
+
+            # Resize all clips to 9:16 vertical
+            target_size = (1080, 1920)
+            resized_clips = [self._resize_for_vertical(clip, target_size) for clip in sequence_clips]
+
+            # Concatenate with no gap (method='compose' ensures smooth transitions)
+            final_sequence = concatenate_videoclips(resized_clips, method="compose")
+            logger.info(f"✅ Created seamless video sequence: {final_sequence.duration:.2f}s")

             return final_sequence

...
                     clip.close()
             raise

+    def _adjust_clips_to_duration(self, clips: List[VideoFileClip], target_duration: float) -> List[VideoFileClip]:
+        """Adjust video clips to match target duration by speeding up or extending library clips"""
+        current_duration = sum(clip.duration for clip in clips)
+        duration_diff = target_duration - current_duration
+
+        logger.info(f"✂️ Adjusting: {current_duration:.2f}s -> {target_duration:.2f}s (diff: {duration_diff:.2f}s)")
+
+        if abs(duration_diff) < 0.1:  # Close enough
+            return clips
+
+        # Identify which clips are library clips (not hook clips)
+        # Hook clips are typically shorter (3-5s), library clips are longer
+        library_indices = []
+        for i, clip in enumerate(clips):
+            if clip.duration > 5.0:  # Likely a library clip
+                library_indices.append(i)
+
+        if not library_indices:
+            # If no library clips identified, adjust all clips proportionally
+            library_indices = list(range(len(clips)))
+
+        if duration_diff > 0:
+            # Need to extend - slow down or loop library clips
+            return self._extend_clips(clips, library_indices, duration_diff)
+        else:
+            # Need to shorten - speed up or trim library clips
+            return self._shorten_clips(clips, library_indices, abs(duration_diff))
+
+    def _extend_clips(self, clips: List[VideoFileClip], library_indices: List[int], extra_duration: float) -> List[VideoFileClip]:
+        """Extend duration by slowing down library clips"""
+        if not library_indices:
+            return clips
+
+        adjusted_clips = []
+        duration_per_clip = extra_duration / len(library_indices)
+
+        for i, clip in enumerate(clips):
+            if i in library_indices:
+                # Calculate speed factor to extend this clip
+                target_clip_duration = clip.duration + duration_per_clip
+                speed_factor = clip.duration / target_clip_duration
+
+                # Slow down the clip (speed < 1.0 means slower)
+                slowed_clip = clip.fx(lambda c: c.speedx(speed_factor))
+                adjusted_clips.append(slowed_clip)
+                logger.info(f"  Extended clip {i}: {clip.duration:.2f}s -> {slowed_clip.duration:.2f}s (speed: {speed_factor:.2f}x)")
+            else:
+                adjusted_clips.append(clip)
+
+        return adjusted_clips
+
+    def _shorten_clips(self, clips: List[VideoFileClip], library_indices: List[int], reduce_duration: float) -> List[VideoFileClip]:
+        """Shorten duration by speeding up library clips"""
+        if not library_indices:
+            return clips
+
+        adjusted_clips = []
+        duration_per_clip = reduce_duration / len(library_indices)
+
+        for i, clip in enumerate(clips):
+            if i in library_indices:
+                # Calculate speed factor to shorten this clip
+                target_clip_duration = clip.duration - duration_per_clip
+                if target_clip_duration < 1.0:
+                    target_clip_duration = 1.0  # Minimum duration
+
+                speed_factor = clip.duration / target_clip_duration
+
+                # Speed up the clip (speed > 1.0 means faster)
+                sped_clip = clip.fx(lambda c: c.speedx(speed_factor))
+                adjusted_clips.append(sped_clip)
+                logger.info(f"  Shortened clip {i}: {clip.duration:.2f}s -> {sped_clip.duration:.2f}s (speed: {speed_factor:.2f}x)")
+            else:
+                adjusted_clips.append(clip)
+
+        return adjusted_clips
+
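To make the speed-factor arithmetic in these helpers concrete, here is a small worked example with illustrative numbers (not taken from the commit):

# One 10.0s library clip must absorb a 1.5s shortfall, so it is slowed down.
clip_duration = 10.0
extra_per_clip = 1.5
target_clip_duration = clip_duration + extra_per_clip   # 11.5s
speed_factor = clip_duration / target_clip_duration     # ~0.87 -> slower playback, clip runs longer
# Shortening works the same way with subtraction, giving a factor above 1.0.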
     def _resize_for_vertical(self, clip: VideoFileClip, target_size: tuple) -> VideoFileClip:
         """Resize clip to fit vertical 9:16 aspect ratio"""
         target_w, target_h = target_size
...
             # Clip is taller, fit to width and crop height
             new_clip = clip.resize(width=target_w)

+            # Center crop to exact size
             try:
                 new_clip = new_clip.crop(
                     x_center=new_clip.w / 2,
                     y_center=new_clip.h / 2,
                     width=target_w,
                     height=target_h
                 )
             except Exception:
+                # Fallback cropping method
                 x1 = (new_clip.w - target_w) // 2
                 y1 = (new_clip.h - target_h) // 2
                 new_clip = new_clip.crop(x1=x1, y1=y1, x2=x1+target_w, y2=y1+target_h)

         return new_clip

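The center-crop math in _resize_for_vertical can be sanity-checked with plain numbers; the source resolution below is an assumption for illustration only:

# A 1080x2400 source is already at the target width, so only the height is cropped.
src_w, src_h = 1080, 2400
target_w, target_h = 1080, 1920

y1 = (src_h - target_h) // 2          # 240 px removed from the top
y2 = y1 + target_h                    # and symmetrically from the bottom
# crop box: x from 0 to 1080, y from 240 to 2160 -> an exact 9:16 frame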
     async def _add_audio_track(self, video_clip: VideoFileClip, audio_clips: List[AudioFileClip]) -> VideoFileClip:
+        """Add full audio track - video duration matches TTS, so no trimming needed"""
         if not audio_clips:
             return video_clip

         try:
+            # Filter valid audio clips
             valid_audio_clips = []
             for clip in audio_clips:
                 if clip.duration > 0:
...
             if not valid_audio_clips:
                 return video_clip

+            # Mix all audio clips
             mixed_audio = CompositeAudioClip(valid_audio_clips)

             video_duration = video_clip.duration
+            audio_duration = mixed_audio.duration
+
+            logger.info(f"📊 Audio: {audio_duration:.2f}s, Video: {video_duration:.2f}s")
+
+            # Video should already match audio duration, but verify
+            if abs(video_duration - audio_duration) > 0.5:
+                logger.warning(f"⚠️ Duration mismatch: Video={video_duration:.2f}s, Audio={audio_duration:.2f}s")

             # Add audio to video
             video_with_audio = video_clip.set_audio(mixed_audio)
+            logger.info(f"✅ Added full audio track (no trimming)")

             return video_with_audio

         except Exception as e:
             logger.error(f"❌ Failed to add audio track: {e}")
             for clip in audio_clips:
                 try:
                     clip.close()
...
             return video_clip

     async def _add_subtitles(self, video_clip: VideoFileClip, script: str) -> CompositeVideoClip:
+        """Add word-by-word animated subtitles synced with TTS timing"""
         try:
+            # Split script into words for better timing sync
+            words = self._split_script_into_words(script)
             text_clips = []

             total_duration = video_clip.duration
+            target_width, target_height = video_clip.size
+
+            logger.info(f"📝 Script has {len(words)} words, video duration: {total_duration:.2f}s")
+
+            # Group words into small phrases (2-4 words) for better readability
+            phrases = self._group_words_into_phrases(words, max_words=3)
+
+            logger.info(f"📝 Grouped into {len(phrases)} phrases:")
+            for idx, p in enumerate(phrases):
+                logger.info(f"  Phrase {idx}: '{p}'")
+
+            # Calculate equal timing for all phrases to ensure full coverage
             duration_per_phrase = total_duration / len(phrases)

+            logger.info(f"⏱️ Each phrase gets {duration_per_phrase:.2f}s")

             for i, phrase in enumerate(phrases):
                 start_time = i * duration_per_phrase
+                phrase_duration = duration_per_phrase

+                logger.info(f"  Phrase {i} at {start_time:.2f}s for {phrase_duration:.2f}s: '{phrase}'")

+                # Smart word wrapping - ensure text fits screen
+                max_chars_per_line = 18
+                wrapped_lines = textwrap.wrap(phrase, width=max_chars_per_line)

+                # If wrapping creates too many lines, split into smaller phrases
+                if len(wrapped_lines) > 3:
+                    # Split phrase in half and create two separate text clips
+                    mid_point = len(phrase.split()) // 2
+                    phrase_words = phrase.split()
+                    first_half = ' '.join(phrase_words[:mid_point])
+                    second_half = ' '.join(phrase_words[mid_point:])
+
+                    half_duration = phrase_duration / 2
+
+                    # Process first half
+                    self._add_single_subtitle(
+                        first_half, start_time, half_duration,
+                        target_width, target_height, text_clips
+                    )
+
+                    # Process second half
+                    self._add_single_subtitle(
+                        second_half, start_time + half_duration, half_duration,
+                        target_width, target_height, text_clips
+                    )
+                else:
+                    # Normal single subtitle
+                    self._add_single_subtitle(
+                        phrase, start_time, phrase_duration,
+                        target_width, target_height, text_clips
+                    )
+
+            logger.info(f"📝 Created {len(text_clips)} subtitle clips covering full {total_duration:.2f}s")

             # Combine video with subtitles
             final_video = CompositeVideoClip([video_clip] + text_clips)
+            logger.info(f"✅ Added {len(text_clips)} synced subtitle segments")

             return final_video

         except Exception as e:
             logger.error(f"❌ Failed to add subtitles: {e}")
+            import traceback
+            logger.error(traceback.format_exc())
             return video_clip

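The equal-slice subtitle timing above is easy to verify by hand; the numbers below are illustrative only, not from the commit:

# A 30.0s video with 15 phrases gives each phrase a 2.0s window.
total_duration = 30.0
phrase_count = 15
duration_per_phrase = total_duration / phrase_count          # 2.0s
start_times = [i * duration_per_phrase for i in range(3)]    # [0.0, 2.0, 4.0]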
+    def _add_single_subtitle(self, text: str, start_time: float, duration: float,
+                             target_width: int, target_height: int, text_clips: List):
+        """Add a single subtitle clip with proper formatting"""
+        try:
+            # Word wrap with reduced character limit
+            max_chars_per_line = 18
+            wrapped_text = '\n'.join(textwrap.wrap(text, width=max_chars_per_line))
+
+            # Dynamic font size based on text length
+            line_count = len(wrapped_text.split('\n'))
+            if line_count > 2:
+                fontsize = 70  # Smaller for 3+ lines
+            elif line_count > 1:
+                fontsize = 75  # Medium for 2 lines
+            else:
+                fontsize = 85  # Larger for single line
+
+            # Create text clip
+            text_clip = TextClip(
+                txt=wrapped_text,
+                fontsize=fontsize,
+                color='white',
+                font='Arial-Bold',
+                stroke_color='black',
+                stroke_width=4,
+                method='caption',
+                size=(int(target_width * 0.85), None),  # Reduced from 0.90 to 0.85
+                align='center'
+            )
+
+            # Position in lower third of screen (safe area)
+            vertical_position = int(target_height * 0.72)  # Slightly higher
+            text_clip = text_clip.set_position(('center', vertical_position))
+            text_clip = text_clip.set_start(start_time)
+            text_clip = text_clip.set_duration(duration)
+
+            # Add smooth fade effects
+            fade_duration = min(0.2, duration / 4)  # Adaptive fade
+            text_clip = text_clip.crossfadein(fade_duration).crossfadeout(fade_duration)
+
+            text_clips.append(text_clip)
+
+        except Exception as e:
+            logger.error(f"❌ Failed to create subtitle: {e}")
+
+    def _split_script_into_words(self, script: str) -> List[str]:
+        """Split script into individual words"""
+        # Remove extra punctuation but keep sentence structure
+        import re
+        # Remove multiple spaces and clean up
+        script = re.sub(r'\s+', ' ', script).strip()
+        words = script.split()
+        return words
+
+    def _group_words_into_phrases(self, words: List[str], max_words: int = 3) -> List[str]:
+        """Group words into small readable phrases"""
+        phrases = []
+        current_phrase = []
+
+        for word in words:
+            current_phrase.append(word)
+
+            # Create phrase break at punctuation or max word count
+            has_punctuation = any(p in word for p in ['.', ',', '!', '?', ';'])
+
+            if len(current_phrase) >= max_words or has_punctuation:
+                phrases.append(' '.join(current_phrase))
+                current_phrase = []
+
+        # Add remaining words
+        if current_phrase:
+            phrases.append(' '.join(current_phrase))
+
+        return phrases

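A quick, hypothetical check of how _group_words_into_phrases splits on punctuation or the max_words limit (renderer stands in for an instance of this class; the sentence is made up):

words = "Stop scrolling, this changes everything you know".split()
renderer._group_words_into_phrases(words, max_words=3)
# -> ['Stop scrolling,', 'this changes everything', 'you know']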
     async def _render_final_video(self, video_clip: VideoFileClip) -> str:
+        """Render final video with 1.3x speed - video sped up, audio pitch-preserved"""
+        # Generate unique filename using UUID
+        unique_id = uuid.uuid4().hex[:8]
+        filename = f"final_video_{unique_id}.mp4"
+        output_path = self.temp_dir / filename

         try:
+            original_duration = video_clip.duration
+            speed_factor = 1.3
+
+            logger.info(f"📹 Rendering final video: {filename}")
+            logger.info(f"⚡ Speeding up: {original_duration:.2f}s -> {original_duration/speed_factor:.2f}s")

+            # Extract audio for pitch-preserving speed up
+            audio_clip = video_clip.audio
+
+            if audio_clip:
+                # Speed up audio with pitch correction using librosa
+                sped_audio_clip = await self._speed_up_audio_with_pitch_correction(audio_clip, speed_factor)
+
+                # Speed up video only (no audio yet)
+                video_only = video_clip.without_audio()
+                sped_video = video_only.fx(lambda c: c.speedx(speed_factor))
+
+                # Combine sped-up video with pitch-corrected audio
+                final_clip = sped_video.set_audio(sped_audio_clip)
+
+                # Ensure audio and video match in duration
+                final_duration = min(sped_video.duration, sped_audio_clip.duration)
+                final_clip = final_clip.set_duration(final_duration)
+
+                logger.info(f"✅ Video with pitch-preserved audio: {final_duration:.2f}s")
+            else:
+                # No audio - just speed up video
+                final_clip = video_clip.fx(lambda c: c.speedx(speed_factor))
+
+            # Render final video
+            final_clip.write_videofile(
                 str(output_path),
                 codec='libx264',
                 audio_codec='aac',
+                temp_audiofile=str(self.temp_dir / f'temp_audio_{unique_id}.m4a'),
                 remove_temp=True,
                 fps=24,
                 verbose=False,
+                logger=None
             )

+            logger.info(f"✅ Final video rendered: {output_path}")
             return str(output_path)

         except Exception as e:
...

     def _validate_assets(self, assets: Dict) -> bool:
         """Validate that required assets are present"""
+        required = ['selected_videos', 'tts_audio', 'hook_video']

         for req in required:
             if not assets.get(req):