Spaces:

Elvoro
/

Tools

Running

jebin2 committed on Jan 12

Commit

ea33c8c

1 Parent(s): 659fbdb

refactor: create asset_manager module with singleton classes

- Add VideoLib singleton for video library loading from GSheet
- Add AudioLib singleton for audio library and music selection
- Add AssetDownloader singleton for centralized downloads
- Add AssetProcessor for AI video selection via Gemini
- Simplify AssetSelector to thin wrapper using singletons
- Add public HTTP/HTTPS URL support in FileDownloader
- Fix video processing order: remove_black_padding -> resize_video
- Remove SHARED_ASSET_SELECTOR (singletons handle caching)
- Use FileDownloader directly instead of wrapper methods

Files changed (9) hide show

src/asset_manager/__init__.py +20 -0
src/asset_manager/asset_downloader.py +230 -0
src/asset_manager/asset_processor.py +177 -0
src/asset_manager/audio_lib.py +205 -0
src/asset_manager/video_lib.py +109 -0
src/asset_selector.py +55 -363
src/automation.py +22 -137
src/file_downloader.py +56 -2
src/process_csv.py +23 -43

src/asset_manager/__init__.py ADDED Viewed

	@@ -0,0 +1,20 @@

+"""
+Asset Manager Module
+Provides singleton classes for managing video/audio libraries and asset downloads.
+"""
+from .video_lib import get_video_lib, VideoLib
+from .audio_lib import get_audio_lib, AudioLib
+from .asset_downloader import get_asset_downloader, AssetDownloader
+from .asset_processor import AssetProcessor
+# Public API of the package: the names exported by `from asset_manager import *`.
+# Each entry is imported from its submodule by the import lines above.
+__all__ = [
+    "get_video_lib",
+    "VideoLib",
+    "get_audio_lib",
+    "AudioLib",
+    "get_asset_downloader",
+    "AssetDownloader",
+    "AssetProcessor",
+]

src/asset_manager/asset_downloader.py ADDED Viewed

	@@ -0,0 +1,230 @@

+"""
+AssetDownloader - Singleton class for downloading video and visual assets
+Uses FileDownloader for all download operations.
+"""
+import asyncio
+from pathlib import Path
+from typing import Dict, List, Optional, Any
+from utils import logger, is_valid_video, resize_video, remove_black_padding
+from file_downloader import FileDownloader
+from .video_lib import get_video_lib, VideoLib
+class AssetDownloader:
+    """
+    Downloads library videos and per-item visual assets via FileDownloader.
+    Intended to be shared through get_asset_downloader(); the list produced by
+    download_all_videos() is cached on the instance and returned by later calls.
+    Usage:
+        downloader = get_asset_downloader()
+        videos = await downloader.download_all_videos()
+        await downloader.download_all_visual_assets(data_holder)
+    """
+    def __init__(self):
+        self.file_downloader = FileDownloader()
+        # Cache filled by download_all_videos() and cleared by reset().
+        self._downloaded_videos: List[Dict] = []
+        self._videos_downloaded = False
+        logger.info("✓ AssetDownloader initialized")
+    @property
+    def downloaded_videos(self) -> List[Dict]:
+        """Get list of downloaded library videos ('url' / 'local_path' dicts)."""
+        return self._downloaded_videos
+    async def download_all_videos(self, video_lib: Optional[VideoLib] = None) -> List[Dict]:
+        """
+        Download every library video once, post-process it, and cache the result.
+        Rows without a VIDEO_LINK, failed downloads and files rejected by
+        is_valid_video() are skipped. A post-processing failure is logged and
+        the video is still kept in the result.
+        Args:
+            video_lib: VideoLib instance (the singleton is used when None)
+        Returns:
+            List of dicts with 'url' and 'local_path' keys
+        """
+        # Only a non-empty cached list short-circuits; an empty result is retried.
+        if self._videos_downloaded and self._downloaded_videos:
+            logger.info("✅ All videos already downloaded — returning cached.")
+            return self._downloaded_videos
+        # Compare against None explicitly: VideoLib defines __len__, so an empty
+        # library passed by the caller is falsy and `or` would silently swap it
+        # for the singleton.
+        if video_lib is None:
+            video_lib = get_video_lib()
+        # The directory is created here but is not passed to safe_download() below.
+        download_path = "testData/video_for_workflow"
+        Path(download_path).mkdir(parents=True, exist_ok=True)
+        videos = []
+        for _, row in video_lib.video_library.iterrows():
+            url = str(row.get("VIDEO_LINK", "")).strip()
+            if not url:
+                continue
+            local_path = self.file_downloader.safe_download(url=url)
+            if not local_path or not is_valid_video(local_path):
+                logger.warning(f"⚠️ Skipped invalid video: {url}")
+                continue
+            # Process video: first remove padding, then resize
+            try:
+                remove_black_padding(local_path, overwrite=True)
+                resize_video(local_path, overwrite=True)
+            except Exception as e:
+                # Best effort: keep the downloaded file even if processing failed.
+                logger.warning(f"⚠️ Could not process {local_path}: {e}")
+            videos.append({
+                "url": url,
+                "local_path": str(local_path),
+            })
+        self._downloaded_videos = videos
+        self._videos_downloaded = True
+        logger.info(f"✅ Downloaded {len(videos)} library videos")
+        return videos
+    async def download_all_visual_assets(
+        self,
+        data_holder: Any,
+        api_clients: Any = None,  # Kept for backward compatibility, not used
+        resize: bool = True
+    ) -> None:
+        """
+        Download the hook video, selected videos and remaining library videos.
+        Each successful download writes its local path back into the asset dict
+        it came from. Individual failures are logged and do not stop the others.
+        Args:
+            data_holder: Object exposing a dict-like `visual_assets` attribute
+            api_clients: Deprecated, kept for backward compatibility
+            resize: Whether to resize hook/selected videos after download
+        """
+        download_tasks = []
+        assets = data_holder.visual_assets
+        # Coroutines are only created in this section; none of them runs until
+        # gather() below, so every local_path check sees the pre-download state.
+        hook_video = assets.get("hook_video")
+        if hook_video and hook_video.get("video_url"):
+            download_tasks.append(
+                self._download_file(
+                    hook_video["video_url"], "hook_video.mp4",
+                    hook_video, "local_path", resize=resize
+                )
+            )
+            # VEO variant of the hook video (also gets black padding removed)
+            veo_video = hook_video.get("veo_video_data")
+            if veo_video and veo_video.get("video_url"):
+                download_tasks.append(
+                    self._download_file(
+                        veo_video["video_url"], "veo_hook_url.mp4",
+                        veo_video, "local_path",
+                        resize=resize, remove_padding=True
+                    )
+                )
+        # `or []` also covers a key that is present but set to None.
+        for i, video in enumerate(assets.get("selected_videos") or []):
+            if video.get("url"):
+                download_tasks.append(
+                    self._download_file(
+                        video["url"], f"library_video_{i}.mp4",
+                        video, "local_path", resize=resize
+                    )
+                )
+            if video.get("alternate_url"):
+                download_tasks.append(
+                    self._download_file(
+                        video["alternate_url"], f"library_all_video_alternate_url_{i}.mp4",
+                        video, "alternate_url_local_path", resize=resize
+                    )
+                )
+        # Library videos that do not have a local copy yet (downloaded without resize)
+        for i, video in enumerate(assets.get("all_videos") or []):
+            if video.get("url") and not video.get("local_path"):
+                download_tasks.append(
+                    self._download_file(
+                        video["url"], f"library_all_video_{i}.mp4",
+                        video, "local_path"
+                    )
+                )
+        if not download_tasks:
+            return
+        # NOTE: _download_file contains no await, so the tasks execute one after another.
+        results = await asyncio.gather(*download_tasks, return_exceptions=True)
+        for i, result in enumerate(results):
+            if isinstance(result, Exception):
+                logger.error(f"❌ Download task {i} failed: {result}")
+    async def _download_file(
+        self,
+        url: str,
+        filename: str,
+        target_dict: Dict,
+        key: str = "local_path",
+        resize: bool = False,
+        remove_padding: bool = False
+    ) -> Optional[str]:
+        """
+        Download one file using FileDownloader and record its path in target_dict.
+        Args:
+            url: URL to download from
+            filename: Label used in log messages; it is not passed to FileDownloader
+            target_dict: Dict that receives the local path
+            key: Key to use in target_dict (default: 'local_path')
+            resize: Whether to resize video after download
+            remove_padding: Whether to remove black padding after download
+        Returns:
+            Local path to the downloaded file
+        Raises:
+            Exception: Any download/processing error is logged and re-raised;
+                a falsy download result is reported as RuntimeError.
+        """
+        try:
+            local_path = self.file_downloader.safe_download(url=url)
+            if not local_path:
+                raise RuntimeError(f"Download returned None for {url}")
+            # Padding removal runs before resize so the resize sees the cropped frame.
+            if remove_padding:
+                remove_black_padding(str(local_path), overwrite=True)
+            if resize:
+                resize_video(str(local_path), overwrite=True)
+            target_dict[key] = str(local_path)
+            logger.info(f"✓ Downloaded {filename}")
+            return str(local_path)
+        except Exception as e:
+            logger.error(f"❌ Failed to download {filename}: {e}")
+            raise
+    def reset(self) -> None:
+        """Reset the downloader state (useful for testing)"""
+        self._downloaded_videos = []
+        self._videos_downloaded = False
+        logger.info("🔄 AssetDownloader reset")
+# Shared instance; stays None until get_asset_downloader() is first called.
+_asset_downloader: Optional[AssetDownloader] = None
+def get_asset_downloader() -> AssetDownloader:
+    """
+    Return the shared AssetDownloader, creating it on first use.
+    Returns:
+        AssetDownloader: The singleton instance
+    """
+    global _asset_downloader
+    if _asset_downloader is not None:
+        return _asset_downloader
+    _asset_downloader = AssetDownloader()
+    return _asset_downloader
+def reset_asset_downloader() -> None:
+    """Drop the shared instance so the next get_asset_downloader() call builds a new one (useful for testing)"""
+    global _asset_downloader
+    _asset_downloader = None

src/asset_manager/asset_processor.py ADDED Viewed

	@@ -0,0 +1,177 @@

+"""
+AssetProcessor - Handles video selection and processing using AI
+"""
+import json
+import re
+import random
+from typing import List, Dict, Optional, Tuple
+import pandas as pd
+import json_repair
+from moviepy.editor import VideoFileClip
+import gemini_sdk
+from utils import logger
+from .video_lib import get_video_lib
+class AssetProcessor:
+    """
+    Picks library videos for a TTS script with Gemini, plus a random picker.
+    Library rows come from the VideoLib singleton; download state and usage
+    counts are read from the data_holder passed to the constructor.
+    Usage:
+        processor = AssetProcessor(data_holder)
+        videos = await processor.select_videos(tts_script, timed_transcript)
+    """
+    # Library column that holds the video URL used as the lookup key.
+    _URL_COLUMN = "Video URL (No Audio)"
+    def __init__(self, data_holder):
+        self.data_holder = data_holder
+        self._video_lib = get_video_lib()
+    @property
+    def video_library(self) -> pd.DataFrame:
+        """Get video library DataFrame from the VideoLib singleton"""
+        return self._video_lib.video_library
+    def _parse_duration(self, duration_str: str) -> int:
+        """
+        Extract whole seconds from a loosely formatted duration value.
+        The first number found in the text is used and truncated to int
+        (e.g. "12.7 sec" -> 12). Missing, empty or unparsable values give 0.
+        """
+        try:
+            if pd.isna(duration_str) or duration_str == "":
+                return 0
+            numbers = re.findall(r"(\d+\.?\d*)", str(duration_str).lower().strip())
+            return int(float(numbers[0])) if numbers else 0
+        except (ValueError, TypeError) as e:
+            logger.warning(f"Failed to parse duration '{duration_str}': {e}")
+            return 0
+    def _normalize_duration(self, value):
+        """Return value unchanged when it is a real number, otherwise parse it to whole seconds."""
+        if isinstance(value, (int, float)) and not pd.isna(value):
+            return value
+        return self._parse_duration(value)
+    def _find_library_row(self, url):
+        """Return the first library row whose URL column equals url, or None when there is no match."""
+        library = self.video_library
+        matches = library[library[self._URL_COLUMN] == url]
+        return None if matches.empty else matches.iloc[0]
+    async def select_videos(self, tts_script: str, timed_transcript, max_duration: int = 12) -> List[Dict]:
+        """
+        Select videos using AI analysis of the TTS script.
+        Args:
+            tts_script: Script text sent to Gemini
+            timed_transcript: Forwarded to the Gemini step
+            max_duration: Currently unused; kept for interface compatibility
+        Returns:
+            List of selection dicts with numeric 'duration' values
+        Raises:
+            Exception: When Gemini fails or yields no usable selection
+        """
+        try:
+            logger.info(f"🤖 AI video selection for script: {tts_script[:300]}...")
+            selected_videos = await self._analyze_with_gemini(
+                tts_script=tts_script,
+                timed_transcript=timed_transcript,
+            )
+            if not selected_videos:
+                raise Exception("⚠️ AI selection failed")
+            # Sheet cells may be text, None or NaN; int() below needs real numbers.
+            for video in selected_videos:
+                video["duration"] = self._normalize_duration(video.get("duration", 0))
+            total_duration = sum(int(v["duration"]) for v in selected_videos)
+            logger.info(f"✓ Selected {len(selected_videos)} videos, total: {total_duration}s")
+            return selected_videos
+        except Exception as e:
+            logger.error(f"❌ Video selection failed: {e}")
+            raise
+    async def _analyze_with_gemini(self, tts_script: str, timed_transcript) -> List[Dict]:
+        """
+        Use Gemini for contextual video selection.
+        Builds the prompt from the prompt file and the video context, parses the
+        reply with json_repair and maps each returned URL back to a library row.
+        Entries whose URL is not in the library are skipped with a warning.
+        Raises:
+            Exception: Any prompt, API or parsing failure is logged and re-raised
+        """
+        # Bound up front so the JSON error handler can always log it.
+        response_text = ""
+        try:
+            video_context = await self.prepare_video_context()
+            with open("src/prompt/best_matches_two_video_tracking.md", "r", encoding="utf-8") as file:
+                system_prompt = file.read()
+            model_input = f"""SYSTEM INSTRUCTION::
+{system_prompt}
+USER PROMPT:
+TTS Script: {tts_script}
+Video Options: {video_context}
+"""
+            response_text = gemini_sdk.generate(model_input).strip()
+            selection = json_repair.loads(response_text)
+            if not isinstance(selection, list):
+                raise ValueError(f"Gemini response is not a JSON list: {type(selection).__name__}")
+            library_size = len(self.video_library)
+            selected = []
+            for item in selection:
+                if item["video_index"] >= library_size:
+                    continue
+                video = self._find_library_row(item["video_url"])
+                if video is None:
+                    # The model can return a URL that is not in the sheet; skip it
+                    # instead of failing the whole selection with IndexError.
+                    logger.warning(f"⚠️ Gemini returned a video URL that is not in the library: {item['video_url']}")
+                    continue
+                entry = {
+                    "url": video.get(self._URL_COLUMN, video.get("url", "")),
+                    "alternate_url": None,
+                    "alternate_url_local_path": None,
+                    "video_summary": video.get("Full Video Description Summary"),
+                    "tts_script_segment": item["tts_script_segment"],
+                    "duration": video.get("duration", 0),
+                    "reason": item["reason"],
+                    "alignment": video.get("Video Alignment with the TTS Script", video.get("alignment", "")),
+                    "energy": video.get("energy_score", 0),
+                }
+                # The alternate is looked up by URL, so require the URL key itself
+                # rather than only the index key.
+                alternate_url = item.get("alternate_video_url")
+                if "alternate_video_index" in item and alternate_url:
+                    alternate = self._find_library_row(alternate_url)
+                    if alternate is None:
+                        logger.warning(f"⚠️ Alternate video URL is not in the library: {alternate_url}")
+                    else:
+                        entry["alternate_url"] = alternate.get(self._URL_COLUMN, alternate.get("url", ""))
+                selected.append(entry)
+            logger.info(f"✓ Gemini selected {len(selected)}")
+            return selected
+        except json.JSONDecodeError as e:
+            logger.error(f"Failed to parse Gemini JSON response: {e}")
+            logger.debug(f"Raw response: {response_text[:500]}")
+            raise
+        except Exception as e:
+            logger.error(f"Gemini analysis failed: {e}")
+            import traceback
+            traceback.print_exc()
+            raise
+    async def prepare_video_context(self) -> str:
+        """
+        Build the numbered video list sent to Gemini.
+        Side effect: rewrites 'duration' on every entry of
+        data_holder.visual_assets['all_videos'] from the local file (0 when the
+        file is missing or unreadable).
+        Returns:
+            One line per library row: position, URL, summary, duration,
+            alignment and usage count.
+        """
+        all_videos = self.data_holder.visual_assets["all_videos"]
+        for video in all_videos:
+            local_path = video.get("local_path")
+            if not local_path:
+                video["duration"] = 0
+                continue
+            try:
+                with VideoFileClip(local_path) as clip:
+                    video["duration"] = round(clip.duration, 2)
+            except Exception as e:
+                logger.warning(f"⚠️ Error reading duration for {local_path}: {e}")
+                video["duration"] = 0
+        # One pass instead of scanning all_videos for every library row; the
+        # first entry per URL wins.
+        duration_by_url = {}
+        for video in all_videos:
+            url = video.get("url")
+            if url is not None:
+                duration_by_url.setdefault(url, video.get("duration", 0))
+        usage_count = self.data_holder.video_usage_count
+        lines = []
+        # Number rows by position: iterrows() yields index labels, which need not
+        # be contiguous integers once rows were filtered out of the sheet data.
+        for position, (_, row) in enumerate(self.video_library.iterrows(), start=1):
+            url = row.get(self._URL_COLUMN)
+            lines.append(
+                f"{position}. {url} - "
+                f"{row.get('Full Video Description Summary', row.get('description', ''))} - "
+                f"{duration_by_url.get(url, 0)}s - "
+                f"Alignment: {row.get('Video Alignment with the TTS Script', row.get('alignment', ''))} - "
+                f"Usage Count: {usage_count.get(url, 0)}"
+            )
+        return "\n".join(lines)
+    def select_random_videos(self, count: int) -> List[str]:
+        """
+        Select random videos from the downloaded library.
+        Args:
+            count: Number of videos wanted; reduced (with a warning) when fewer
+                downloaded videos are available
+        Returns:
+            Local paths of the chosen videos
+        """
+        all_videos = self.data_holder.visual_assets.get("all_videos") or []
+        available_videos = [v for v in all_videos if v.get("local_path")]
+        if len(available_videos) < count:
+            logger.warning(f"⚠️ Not enough videos to select {count} random videos. Selecting {len(available_videos)} instead.")
+            count = len(available_videos)
+        return [v["local_path"] for v in random.sample(available_videos, count)]

src/asset_manager/audio_lib.py ADDED Viewed

	@@ -0,0 +1,205 @@

+"""
+AudioLib - Singleton class for managing audio library from Google Sheets
+"""
+import os
+import re
+import pandas as pd
+from typing import Optional, List
+from utils import logger, clean_and_drop_empty
+from google_sheet_reader import GoogleSheetReader
+from google_src import get_default_wrapper, GCloudWrapper
+import setup_config
+class AudioLib:
+    """
+    Audio library backed by a Google Sheet worksheet.
+    Loads the sheet once at construction, hands out background-music URLs in
+    sheet order, and converts a track's beat markers into seconds.
+    Usage:
+        audio_lib = get_audio_lib()
+        music_url = audio_lib.select_background_music()
+        beats = audio_lib.get_audio_beats(music_url)
+    """
+    def __init__(self, gcloud_wrapper: Optional[GCloudWrapper] = None, initial_audio_index: int = 0):
+        self._gcloud_wrapper = gcloud_wrapper or get_default_wrapper()
+        self._audio_library: pd.DataFrame = self._load_from_gsheet()
+        track_count = len(self._audio_library)
+        if track_count == 0:
+            raise ValueError("Audio library is empty! Check AUDIO_LIBRARY_GSHEET_WORKSHEET env var and Google Sheet access.")
+        # Wrap the requested start position into the valid range.
+        self._current_audio_index = initial_audio_index % track_count
+        logger.info(f"✓ AudioLib initialized with {track_count} audio tracks, starting at index {self._current_audio_index}")
+    @property
+    def audio_library(self) -> pd.DataFrame:
+        """DataFrame of audio tracks loaded from the sheet"""
+        return self._audio_library
+    @property
+    def current_audio_index(self) -> int:
+        """Position of the track the next selection will return"""
+        return self._current_audio_index
+    @current_audio_index.setter
+    def current_audio_index(self, value: int) -> None:
+        """Store value wrapped into the library range; ignored while the library is empty"""
+        track_count = len(self._audio_library)
+        if not track_count:
+            return
+        self._current_audio_index = value % track_count
+    def _load_from_gsheet(self, account_id: str = "test_data") -> pd.DataFrame:
+        """
+        Read the audio worksheet named by AUDIO_LIBRARY_GSHEET_WORKSHEET.
+        Any failure is logged and reported as an empty DataFrame.
+        Args:
+            account_id: Which account to use ('final_data' or 'test_data')
+        """
+        try:
+            worksheet_name = os.getenv("AUDIO_LIBRARY_GSHEET_WORKSHEET")
+            if not worksheet_name:
+                logger.error("AUDIO_LIBRARY_GSHEET_WORKSHEET env var not set!")
+                return pd.DataFrame()
+            logger.info(f"Loading audio library using account: {account_id}")
+            reader = GoogleSheetReader(
+                worksheet_name=worksheet_name,
+                gcloud_wrapper=self._gcloud_wrapper,
+                account_id=account_id,
+            )
+            audio_df = reader.get_filtered_dataframe()
+            # In beats_cut mode only tracks that carry beat timings are kept.
+            beats_required = setup_config.get_str("setup_type") == "beats_cut"
+            if beats_required:
+                audio_df = clean_and_drop_empty(audio_df, "Beats Timing(SS:FF) AT 25FPS")
+            return clean_and_drop_empty(audio_df, "AUDIO_LINK")
+        except Exception as e:
+            # Fall back to the exception class name when the message is empty.
+            error_msg = str(e) or type(e).__name__
+            lowered = error_msg.lower()
+            permission_problem = "403" in error_msg or "permission" in lowered or "forbidden" in lowered
+            missing_sheet = "404" in error_msg or "not found" in lowered
+            if permission_problem:
+                logger.error(f"❌ PERMISSION ERROR loading audio library: {error_msg}")
+                logger.error("Share the Google Sheet with the service account email as Editor!")
+            elif missing_sheet:
+                logger.error(f"❌ WORKSHEET NOT FOUND: '{os.getenv('AUDIO_LIBRARY_GSHEET_WORKSHEET')}'")
+            else:
+                logger.error(f"Failed to load audio library from Google Sheet: {error_msg}")
+            return pd.DataFrame()
+    def inc_audio_index(self) -> None:
+        """Advance to the next track, wrapping back to the first after the last"""
+        next_index = self._current_audio_index + 1
+        self._current_audio_index = next_index % len(self._audio_library)
+    def select_background_music(self) -> str:
+        """
+        Return the AUDIO_LINK at the current index, then advance the index.
+        Selection is sequential (not random), so consecutive calls walk the
+        library in order and wrap around at the end.
+        Returns:
+            URL of the selected audio track ("" when the library is empty)
+        """
+        library = self._audio_library
+        if library.empty:
+            logger.error("❌ Audio library is empty")
+            return ""
+        position = self._current_audio_index
+        selected = library.iloc[position]["AUDIO_LINK"]
+        logger.info(f"🎵 Selected background music #{position + 1}/{len(library)}: {selected}")
+        # Move on for the next call (loops back to the start if needed)
+        self._current_audio_index = (position + 1) % len(library)
+        return selected
+    def get_audio_beats(self, audio_link: str) -> Optional[List[float]]:
+        """
+        Convert a track's beat markers from SS:FF (25 FPS) to seconds.
+        Example:
+            "01:12" → 1 + 12/25 = 1.48
+        Comma-separated tokens without a ':' are ignored.
+        Args:
+            audio_link: URL of the audio track
+        Returns:
+            List of beat times in seconds, or None when the track, its beat
+            data, or any parsable marker is missing, or on any error
+        """
+        try:
+            library = self._audio_library
+            if library.empty:
+                logger.error("Audio library is empty")
+                return None
+            matches = library.loc[library["AUDIO_LINK"] == audio_link]
+            if matches.empty:
+                logger.error(f"No audio entry found for: {audio_link}")
+                return None
+            beats_raw = matches.iloc[0]["Beats Timing(SS:FF) AT 25FPS"]
+            if pd.isna(beats_raw) or not str(beats_raw).strip():
+                logger.warning(f"No beat data for audio: {audio_link}")
+                return None
+            pairs = (
+                token.strip().split(":", 1)
+                for token in str(beats_raw).split(",")
+                if ":" in token
+            )
+            beats: List[float] = [
+                round(int(sec) + (int(frame) / 25.0), 2)
+                for sec, frame in pairs
+            ]
+            return beats or None
+        except Exception as e:
+            logger.error(
+                f"Failed to compute audio beats map for {audio_link}: {e}"
+            )
+            return None
+    def reset_audio_index(self) -> None:
+        """Point the selection back at the first track (useful for batch processing)"""
+        self._current_audio_index = 0
+        logger.info("🔄 Reset background music index to 0")
+    def __len__(self) -> int:
+        return len(self._audio_library)
+# Shared instance; stays None until get_audio_lib() is first called.
+_audio_lib: Optional[AudioLib] = None
+def get_audio_lib(initial_audio_index: int = 0) -> AudioLib:
+    """
+    Return the shared AudioLib, creating it on first use.
+    Args:
+        initial_audio_index: Starting index for audio selection (only used on first call)
+    Returns:
+        AudioLib: The singleton instance
+    """
+    global _audio_lib
+    if _audio_lib is not None:
+        return _audio_lib
+    _audio_lib = AudioLib(initial_audio_index=initial_audio_index)
+    return _audio_lib
+def reset_audio_lib() -> None:
+    """Drop the shared instance so the next get_audio_lib() call builds a new one (useful for testing)"""
+    global _audio_lib
+    _audio_lib = None

src/asset_manager/video_lib.py ADDED Viewed

	@@ -0,0 +1,109 @@

+"""
+VideoLib - Singleton class for managing video library from Google Sheets
+"""
+import os
+import pandas as pd
+from typing import Optional, List, Dict
+from utils import logger, clean_and_drop_empty
+from google_sheet_reader import GoogleSheetReader
+from google_src import get_default_wrapper, GCloudWrapper
+class VideoLib:
+    """
+    Video library backed by a Google Sheet worksheet.
+    The sheet is read once at construction; a failed load leaves an empty
+    DataFrame rather than raising.
+    Usage:
+        video_lib = get_video_lib()
+        for url in video_lib.get_video_urls():
+            print(url)
+    """
+    def __init__(self, gcloud_wrapper: Optional[GCloudWrapper] = None):
+        self._gcloud_wrapper = gcloud_wrapper or get_default_wrapper()
+        self._video_library: pd.DataFrame = self._load_from_gsheet()
+        logger.info(f"✓ VideoLib initialized with {len(self._video_library)} videos")
+    @property
+    def video_library(self) -> pd.DataFrame:
+        """Get the video library DataFrame"""
+        return self._video_library
+    def _load_from_gsheet(self, account_id: str = "test_data") -> pd.DataFrame:
+        """
+        Read the video worksheet named by VIDEO_LIBRARY_GSHEET_WORKSHEET.
+        Any failure is logged and reported as an empty DataFrame.
+        Args:
+            account_id: Which account to use ('final_data' or 'test_data')
+        """
+        try:
+            worksheet_name = os.getenv("VIDEO_LIBRARY_GSHEET_WORKSHEET")
+            if not worksheet_name:
+                logger.error("VIDEO_LIBRARY_GSHEET_WORKSHEET env var not set!")
+                return pd.DataFrame()
+            logger.info(f"Loading video library using account: {account_id}")
+            googleSheetReader = GoogleSheetReader(
+                worksheet_name=worksheet_name,
+                gcloud_wrapper=self._gcloud_wrapper,
+                account_id=account_id,
+            )
+            video_df = googleSheetReader.get_filtered_dataframe()
+            return clean_and_drop_empty(video_df, "VIDEO_LINK")
+        except Exception as e:
+            # Fall back to the exception class name when the message is empty.
+            error_msg = str(e) or type(e).__name__
+            lowered = error_msg.lower()
+            if "403" in error_msg or "permission" in lowered or "forbidden" in lowered:
+                logger.error(f"❌ PERMISSION ERROR loading video library: {error_msg}")
+                logger.error("Share the Google Sheet with the service account email as Editor!")
+            elif "404" in error_msg or "not found" in lowered:
+                logger.error(f"❌ WORKSHEET NOT FOUND: '{os.getenv('VIDEO_LIBRARY_GSHEET_WORKSHEET')}'")
+            else:
+                logger.error(f"Failed to load video library from Google Sheet: {error_msg}")
+            return pd.DataFrame()
+    def get_video_urls(self) -> List[str]:
+        """
+        Get list of all non-blank video URLs, stripped of surrounding whitespace.
+        Cells are converted with str() before stripping, so None/NaN cells are
+        skipped and numeric cells no longer raise AttributeError.
+        """
+        library = self._video_library
+        if library.empty or "VIDEO_LINK" not in library.columns:
+            return []
+        urls = []
+        for raw in library["VIDEO_LINK"]:
+            if pd.isna(raw):
+                continue
+            url = str(raw).strip()
+            if url:
+                urls.append(url)
+        return urls
+    def get_video_by_url(self, url: str) -> Optional[Dict]:
+        """Get the first library row whose VIDEO_LINK equals url as a dict, or None when absent"""
+        if self._video_library.empty:
+            return None
+        matches = self._video_library[self._video_library["VIDEO_LINK"] == url]
+        if matches.empty:
+            return None
+        return matches.iloc[0].to_dict()
+    def __len__(self) -> int:
+        return len(self._video_library)
+# Shared instance; stays None until get_video_lib() is first called.
+_video_lib: Optional[VideoLib] = None
+def get_video_lib() -> VideoLib:
+    """
+    Return the shared VideoLib, creating it on first use.
+    Returns:
+        VideoLib: The singleton instance
+    """
+    global _video_lib
+    if _video_lib is not None:
+        return _video_lib
+    _video_lib = VideoLib()
+    return _video_lib
+def reset_video_lib() -> None:
+    """Drop the shared instance so the next get_video_lib() call builds a new one (useful for testing)"""
+    global _video_lib
+    _video_lib = None

src/asset_selector.py CHANGED Viewed

@@ -1,387 +1,79 @@
 import pandas as pd
-import utils
-import json
 from typing import List, Dict, Optional, Tuple
 from utils import logger
-import os
-import re
-import json_repair
-from moviepy.editor import VideoFileClip, AudioFileClip
 from data_holder import DataHolder
-import gemini_sdk
-from google_sheet_reader import GoogleSheetReader
-from google_src import GCloudWrapper, GCloudAccount, get_default_wrapper
-import setup_config
 class AssetSelector:
-    def __init__(self, config: Dict, data_holder: DataHolder = None, gcloud_wrapper: GCloudWrapper = None):
         self.config = config
         self.data_holder = data_holder
-        # Setup GCloud wrapper with two accounts: final_data and test_data
-        self._gcloud_wrapper = gcloud_wrapper or get_default_wrapper()
-        self.video_library = self._load_video_library_from_gsheet()
-        self.audio_library = self._load_audio_library_from_gsheet()
-        # Track current background music index for sequential selection
-        self.current_audio_index = 0 if "current_audio_index" not in self.config else self.config["current_audio_index"]
-        if len(self.audio_library) == 0:
-            raise ValueError("Audio library is empty! Check AUDIO_LIBRARY_GSHEET_WORKSHEET env var and Google Sheet access.")
-        self.current_audio_index = (self.current_audio_index) % len(self.audio_library)
     def inc_audio_index(self):
-        """Increment and save current audio index"""
-        self.current_audio_index = (self.current_audio_index + 1) % len(self.audio_library)
-        self.config["current_audio_index"] = self.current_audio_index
-    def _parse_duration(self, duration_str: str) -> int:
-        """Parse duration from various string formats to integer seconds"""
-        try:
-            if pd.isna(duration_str) or duration_str == "":
-                return 0
-            duration_str = str(duration_str).lower().strip()
-            numbers = re.findall(r"(\d+\.?\d*)", duration_str)
-            if numbers:
-                return int(float(numbers[0]))
-            return 0
-        except (ValueError, TypeError) as e:
-            logger.warning(f"Failed to parse duration '{duration_str}': {e}")
-            return 0
     def get_audio_beats(self, audio_link: str) -> Optional[List[float]]:
-        """
-        Load audio beats timing from audio_library and convert
-        SS:FF (25 FPS) → seconds (float)
-        Example:
-            "01:12" → 1 + 12/25 = 1.48
-        """
-        try:
-            if self.audio_library.empty:
-                logger.error("Audio library is empty")
-                return None
-            # Find matching row
-            row = self.audio_library.loc[
-                self.audio_library["AUDIO_LINK"] == audio_link
-            ]
-            if row.empty:
-                logger.error(f"No audio entry found for: {audio_link}")
-                return None
-            beats_raw = row.iloc[0]["Beats Timing(SS:FF) AT 25FPS"]
-            if pd.isna(beats_raw) or not str(beats_raw).strip():
-                logger.warning(f"No beat data for audio: {audio_link}")
-                return None
-            beats: List[float] = []
-            for token in str(beats_raw).split(","):
-                token = token.strip()
-                if ":" not in token:
-                    continue
-                sec, frame = token.split(":", 1)
-                beats.append(
-                    round(int(sec) + (int(frame) / 25.0), 2)
-                )
-            return beats if beats else None
-        except Exception as e:
-            logger.error(
-                f"Failed to compute audio beats map for {audio_link}: {e}"
-            )
-            return None
-    def _load_audio_library_from_gsheet(self, account_id: str = "test_data") -> pd.DataFrame:
-        """
-        Load audio library from Google Sheet.
-        Args:
-            account_id: Which account to use ('final_data' or 'test_data')
-        """
-        try:
-            worksheet_name = os.getenv("AUDIO_LIBRARY_GSHEET_WORKSHEET")
-            if not worksheet_name:
-                logger.error("AUDIO_LIBRARY_GSHEET_WORKSHEET env var not set!")
-                return pd.DataFrame()
-            logger.info(f"Loading audio library using account: {account_id}")
-            googleSheetReader = GoogleSheetReader(
-                worksheet_name=worksheet_name,
-                gcloud_wrapper=self._gcloud_wrapper,
-                account_id=account_id,
-            )
-            audio_df = googleSheetReader.get_filtered_dataframe()
-            if setup_config.get_str("setup_type") == "beats_cut":
-                audio_df = utils.clean_and_drop_empty(audio_df, "Beats Timing(SS:FF) AT 25FPS")
-            return utils.clean_and_drop_empty(audio_df, "AUDIO_LINK")
-        except Exception as e:
-            error_msg = str(e) if str(e) else type(e).__name__
-            if "403" in error_msg or "permission" in error_msg.lower() or "forbidden" in error_msg.lower():
-                logger.error(f"❌ PERMISSION ERROR loading audio library: {error_msg}")
-                logger.error("Share the Google Sheet with the service account email as Editor!")
-            elif "404" in error_msg or "not found" in error_msg.lower():
-                logger.error(f"❌ WORKSHEET NOT FOUND: '{os.getenv('AUDIO_LIBRARY_GSHEET_WORKSHEET')}'")
-            else:
-                logger.error(f"Failed to load audio library from Google Sheet: {error_msg}")
-            return pd.DataFrame()
-    def _load_video_library_from_gsheet(self, account_id: str = "test_data") -> pd.DataFrame:
-        """
-        Load video library from Google Sheet.
-        Args:
-            account_id: Which account to use ('final_data' or 'test_data')
-        """
-        try:
-            worksheet_name = os.getenv("VIDEO_LIBRARY_GSHEET_WORKSHEET")
-            if not worksheet_name:
-                logger.error("VIDEO_LIBRARY_GSHEET_WORKSHEET env var not set!")
-                return pd.DataFrame()
-            logger.info(f"Loading video library using account: {account_id}")
-            googleSheetReader = GoogleSheetReader(
-                worksheet_name=worksheet_name,
-                gcloud_wrapper=self._gcloud_wrapper,
-                account_id=account_id,
-            )
-            video_df = googleSheetReader.get_filtered_dataframe()
-            return utils.clean_and_drop_empty(video_df, "VIDEO_LINK")
-        except Exception as e:
-            error_msg = str(e) if str(e) else type(e).__name__
-            if "403" in error_msg or "permission" in error_msg.lower() or "forbidden" in error_msg.lower():
-                logger.error(f"❌ PERMISSION ERROR loading video library: {error_msg}")
-                logger.error("Share the Google Sheet with the service account email as Editor!")
-            elif "404" in error_msg or "not found" in error_msg.lower():
-                logger.error(f"❌ WORKSHEET NOT FOUND: '{os.getenv('VIDEO_LIBRARY_GSHEET_WORKSHEET')}'")
-            else:
-                logger.error(f"Failed to load video library from Google Sheet: {error_msg}")
-            return pd.DataFrame()
-    async def select_videos(self, tts_script, timed_transcript, max_duration: int = 12) -> Tuple[List[Dict], str]:
-        """Select videos using AI analysis of TTS script"""
-        try:
-            logger.info(f"🤖 AI video selection for script: {tts_script[:300]}...")
-            selected_videos = await self._analyze_with_gemini(
-                tts_script=tts_script,
-                timed_transcript=timed_transcript
-            )
-            if not selected_videos:
-                logger.warning("⚠️ AI selection failed, using fallback")
-                selected_videos = self._fallback_selection(tts_script, max_duration)
-            for video in selected_videos:
-                if isinstance(video.get("duration"), str):
-                    video["duration"] = self._parse_duration(video["duration"])
-            total_duration = sum(int(v.get("duration", 0)) for v in selected_videos)
-            logger.info(f"✓ Selected {len(selected_videos)} videos, total: {total_duration}s")
-            return selected_videos
-        except Exception as e:
-            logger.error(f"❌ Video selection failed: {e}")
-            raise
-            # return self._fallback_selection(self.data_holder.tts_script, max_duration)
-    def _parse_energy_score(self, energy_score_str: str) -> int:
-        """Parse energy score from string format to integer"""
-        try:
-            if pd.isna(energy_score_str) or energy_score_str == "":
-                return 0
-            match = re.search(r"(\d+)\s*out of\s*\d+", str(energy_score_str))
-            if match:
-                return int(match.group(1))
-            numbers = re.findall(r"\d+", str(energy_score_str))
-            if numbers:
-                return int(numbers[0])
-            return 0
-        except (ValueError, TypeError) as e:
-            logger.warning(f"Failed to parse energy score '{energy_score_str}': {e}")
-            return 0
-    async def _analyze_with_gemini(self, tts_script, timed_transcript) -> List[Dict]:
-        """Use Gemini API for contextual video selection"""
-        try:
-            video_context = await self.prepare_video_context()
-            # with open("src/prompt/best_matches_video.md", "r", encoding="utf-8") as file:
-            # with open("src/prompt/best_matches_video_with_timestamp.md", "r", encoding="utf-8") as file:
-            with open("src/prompt/best_matches_two_video_tracking.md", "r", encoding="utf-8") as file:
-                system_prompt = file.read()
-            model_input = f"""SYSTEM INSTRUCTION::
-{system_prompt}
-USER PROMPT:
-TTS Script: {tts_script}
-Video Options: {video_context}
-"""
-            response = gemini_sdk.generate(model_input)
-            response_text = response.strip()
-            selection = json_repair.loads(response_text)
-            selected = []
-            for item in selection:
-                video_index = item["video_index"]
-                if video_index < len(self.video_library):
-                    video_row = self.video_library[self.video_library["Video URL (No Audio)"] == item["video_url"]]
-                    video = video_row.iloc[0]
-                    selected.append(
-                        {
-                            "url": video.get("Video URL (No Audio)", video.get("url", "")),
-                            "alternate_url": None,
-                            "alternate_url_local_path": None,
-                            "video_summary": video.get("Full Video Description Summary"),
-                            "tts_script_segment":item["tts_script_segment"],
-                            "duration": video.get("duration", 0),
-                            "reason": item["reason"],
-                            "alignment": video.get("Video Alignment with the TTS Script", video.get("alignment", "")),
-                            "energy": video.get("energy_score", 0),
-                        }
-                    )
-                    if "alternate_video_index" in item:
-                        video_row = self.video_library[self.video_library["Video URL (No Audio)"] == item["alternate_video_url"]]
-                        video = video_row.iloc[0]
-                        selected[-1]["alternate_url"] = video.get("Video URL (No Audio)", video.get("url", ""))
-            logger.info(f"✓ Gemini selected {len(selected)}")
-            return selected
-        except json.JSONDecodeError as e:
-            logger.error(f"Failed to parse Gemini JSON response: {e}")
-            logger.debug(f"Raw response: {response_text[:500]}")
-            raise
-        except Exception as e:
-            logger.error(f"Gemini analysis failed: {e}")
-            import traceback
-            traceback.print_exc()
-            raise
-    async def prepare_video_context(self):
-        # STEP 3: Update durations using actual local files
-        for video in self.data_holder.visual_assets["all_videos"]:
-            local_path = video.get("local_path")
-            if local_path:
-                try:
-                    with VideoFileClip(local_path) as clip:
-                        video["duration"] = round(clip.duration, 2)
-                except Exception as e:
-                    logger.warning(f"⚠️ Error reading duration for {local_path}: {e}")
-                    video["duration"] = 0
-            else:
-                video["duration"] = 0
-        # STEP 4: Form video_context string (using actual durations)
-        video_context = "\n".join(
-            [
-                f"{i+1}. {row.get('Video URL (No Audio)')} - "
-                f"{row.get('Full Video Description Summary', row.get('description', ''))} - "
-                f"{next((v.get('duration', 0) for v in self.data_holder.visual_assets['all_videos'] if v['url'] == row.get('Video URL (No Audio)')), 0)}s - "
-                f"Alignment: {row.get('Video Alignment with the TTS Script', row.get('alignment', ''))} - "
-                f"Usage Count: {self.data_holder.video_usage_count.get(row.get('Video URL (No Audio)'), 0)}"
-                for i, row in self.video_library.iterrows()
-            ]
-        )
-        return video_context
-    def _fallback_selection(self, tts_script: str, max_duration: int) -> List[Dict]:
-        """Fallback selection based on keyword matching"""
-        script_lower = tts_script.lower()
-        selected = []
-        total_duration = 0
-        fallback_videos = [
-            {
-                "url": "https://storage.googleapis.com/somira/Somira%20Massager.mp4",
-                "duration": 2,
-                "reason": "Product showcase",
-                "alignment": "product",
-                "energy": 5,
-            },
-            {
-                "url": "https://storage.googleapis.com/somira/FemaleWomenPuttingOnNeckMassagerr.mp4",
-                "duration": 2,
-                "reason": "Usage demonstration",
-                "alignment": "usage",
-                "energy": 35,
-            },
-            {
-                "url": "https://storage.googleapis.com/somira/PersonEnjoyingTheNeckMassager.mp4",
-                "duration": 1.5,
-                "reason": "User satisfaction",
-                "alignment": "satisfaction",
-                "energy": 40,
-            },
-        ]
-        for video in fallback_videos:
-            if total_duration + video["duration"] <= max_duration:
-                selected.append(video)
-                total_duration += video["duration"]
-        return selected[:3]
     def select_background_music(self) -> str:
-        """
-        Select background music SEQUENTIALLY (not random)
-        Each call increments the index to ensure different music for each video
-        """
-        if self.audio_library.empty:
-            logger.error("❌ Audio library is empty")
-            return ""
-        # Select current index
-        selected = self.audio_library.iloc[self.current_audio_index]["AUDIO_LINK"]
-        logger.info(
-            f"🎵 Selected background music #{self.current_audio_index + 1}/{len(self.audio_library)}: {selected}"
-        )
-        # Increment index for next call (loop back to start if needed)
-        self.current_audio_index = (self.current_audio_index + 1) % len(self.audio_library)
         return selected
     def reset_audio_index(self):
-        """Reset audio index to start from beginning (useful for batch processing)"""
-        self.current_audio_index = 0
-        logger.info("🔄 Reset background music index to 0")
     def select_random_videos(self, count: int) -> List[str]:
-        import random
-        all_videos = self.data_holder.visual_assets.get("all_videos", [])
-        available_videos = [v for v in all_videos if v.get("local_path")]
-        if len(available_videos) < count:
-            logger.warning(f"⚠️ Not enough videos to select {count} random videos. Selecting {len(available_videos)} instead.")
-            count = len(available_videos)
-        selected_videos = random.sample(available_videos, count)
-        return [v["local_path"] for v in selected_videos]

+"""
+AssetSelector - Thin wrapper for backward compatibility
+Use asset_manager classes directly for new code.
+"""
 import pandas as pd
 from typing import List, Dict, Optional, Tuple
 from utils import logger
 from data_holder import DataHolder
+from asset_manager import get_video_lib, get_audio_lib, AssetProcessor
 class AssetSelector:
+    """
+    Wrapper class for backward compatibility.
+    New code should use asset_manager classes directly:
+    - get_video_lib() for video library
+    - get_audio_lib() for audio library
+    - AssetProcessor(data_holder) for video selection
+    """
+    def __init__(self, config: Dict, data_holder: DataHolder = None, gcloud_wrapper=None):
+        """
+        Bind this wrapper to the shared video/audio libraries.
+        Args:
+            config: Settings dict; "current_audio_index" is read here (default 0)
+                and written back whenever the index changes through this wrapper.
+            data_holder: Handed to AssetProcessor; when falsy the processor is
+                created later, on first use.
+            gcloud_wrapper: Not used in this method.
+        """
         self.config = config
         self.data_holder = data_holder
+        # Use singletons from asset_manager
+        self._video_lib = get_video_lib()
+        initial_audio_index = config.get("current_audio_index", 0)
+        self._audio_lib = get_audio_lib(initial_audio_index)
+        # NOTE(review): the index is assigned again explicitly; presumably
+        # get_audio_lib ignores its argument once the singleton exists - confirm.
+        self._audio_lib.current_audio_index = initial_audio_index
+        # Processor for video selection (only create when data_holder available)
+        self._processor = AssetProcessor(data_holder) if data_holder else None
+    @property
+    def video_library(self) -> pd.DataFrame:
+        """Video library exposed by the shared VideoLib."""
+        return self._video_lib.video_library
+    @property
+    def audio_library(self) -> pd.DataFrame:
+        """Audio library exposed by the shared AudioLib."""
+        return self._audio_lib.audio_library
+    @property
+    def current_audio_index(self) -> int:
+        """Current audio index as held by the shared AudioLib."""
+        return self._audio_lib.current_audio_index
+    @current_audio_index.setter
+    def current_audio_index(self, value: int):
+        """Set the index on the shared AudioLib and mirror it into config."""
+        self._audio_lib.current_audio_index = value
+        self.config["current_audio_index"] = value
     def inc_audio_index(self):
+        """Advance the shared AudioLib's index and mirror the result into config."""
+        self._audio_lib.inc_audio_index()
+        self.config["current_audio_index"] = self._audio_lib.current_audio_index
     def get_audio_beats(self, audio_link: str) -> Optional[List[float]]:
+        """Delegate to AudioLib.get_audio_beats for the given audio link."""
+        return self._audio_lib.get_audio_beats(audio_link)
+    async def select_videos(self, tts_script, timed_transcript, max_duration: int = 12) -> List[Dict]:
+        """Delegate to AssetProcessor"""
+        # Created on first use when __init__ received no data_holder
+        if not self._processor:
+            self._processor = AssetProcessor(self.data_holder)
+        return await self._processor.select_videos(tts_script, timed_transcript, max_duration)
     def select_background_music(self) -> str:
+        """Return AudioLib's music selection and store its resulting index in config."""
+        selected = self._audio_lib.select_background_music()
+        self.config["current_audio_index"] = self._audio_lib.current_audio_index
         return selected
     def reset_audio_index(self):
+        """Reset the shared AudioLib's index and record 0 in config."""
+        self._audio_lib.reset_audio_index()
+        self.config["current_audio_index"] = 0
     def select_random_videos(self, count: int) -> List[str]:
+        """Delegate to AssetProcessor"""
+        # Created on first use when __init__ received no data_holder
+        if not self._processor:
+            self._processor = AssetProcessor(self.data_holder)
+        return self._processor.select_random_videos(count)

src/automation.py CHANGED Viewed

@@ -27,6 +27,8 @@ import numpy as np
 from file_downloader import FileDownloader
 from data_holder import DataHolder
 import setup_config
 class ContentAutomation:
     def __init__(self, config: Dict[str, Any], data_holder: DataHolder = None, asset_selector: 'AssetSelector' = None, api_clients: 'APIClients' = None):
@@ -37,6 +39,7 @@ class ContentAutomation:
         self.video_renderer = VideoRenderer(config, self.data_holder)
         # Reuse provided asset_selector or create new one
         self.asset_selector = asset_selector or AssetSelector(config, self.data_holder)
         self.file_downloader = FileDownloader()
         self.pipeline_start_time = None
@@ -78,11 +81,17 @@ class ContentAutomation:
                     "speaking_rate": 1.2
                 }
                 self.data_holder.visual_assets["a2e_video_url"] = video_url
-                await self._download_to_local(audio_url, f'a2e_tts.mp3', self.data_holder.visual_assets["tts_audio"])
                 with AudioFileClip(self.data_holder.visual_assets["tts_audio"]["local_path"]) as audio:
                     self.data_holder.visual_assets["tts_audio"]["duration"] = audio.duration
-                await self._download_to_local(video_url, f'a2e_video.mp4', self.data_holder.visual_assets, "a2e_video_local_path")
                 # await self.api_clients.upload_to_temp_gcs(self.data_holder.visual_assets["tts_audio"]["local_path"], "audio")
                 # await self.api_clients.upload_to_temp_gcs(self.data_holder.visual_assets["a2e_video_local_path"], "video")
             else:
@@ -94,7 +103,8 @@ class ContentAutomation:
             await self.create_audio()
             logger.info("\n STEP 4: Download all the video assets.")
-            await self._download_all_video()
             # STEP 3: Generate visual assets
             logger.info("\n📦 STEP 3: Generate Visual Assets")
@@ -106,7 +116,7 @@ class ContentAutomation:
             # STEP 2: Download ALL visual assets with proper error handling
             logger.info("\n⬇️ STEP 4: Download Visual Assets")
-            await self._download_all_visual_assets()
             # STEP 3: Render video WITHOUT audio (natural speed)
             logger.info("\n🎬 STEP 5: Render Video (Natural Speed, No Audio)")
@@ -135,9 +145,9 @@ class ContentAutomation:
             logger.info("\n🎵 STEP 7: Background Music")
             self.data_holder.visual_assets["background_music_url"] = self.asset_selector.select_background_music()
-            await self._download_to_local(
-                self.data_holder.visual_assets["background_music_url"], "background_music.mp3", self.data_holder.visual_assets, "background_music_local"
-            )
             # STEP 7: Add audio to video
             logger.info("\n🔊 STEP 8: Add Audio to Video")
@@ -198,7 +208,8 @@ class ContentAutomation:
     async def execute_random_pipeline(self, content_strategy: Dict[str, str], tts_script: str) -> Dict[str, Any]:
         try:
-            await self._download_all_video()
             music_duration = None
@@ -350,9 +361,9 @@ class ContentAutomation:
             self.asset_selector.inc_audio_index()
         self.data_holder.visual_assets["background_music_url"] = self.asset_selector.select_background_music()
-        await self._download_to_local(
-            self.data_holder.visual_assets["background_music_url"], "background_music.mp3", self.data_holder.visual_assets, "background_music_local"
-        )
     async def create_audio(self):
         try_again = False
@@ -393,42 +404,6 @@ class ContentAutomation:
         return tts_audio, timed_words
-    async def _download_all_video(self):
-        all_videos = self.data_holder.visual_assets.get("all_videos", [])
-        # ✅ Skip downloading if all have local_path
-        if all_videos and all(v.get("local_path") for v in all_videos):
-            logger.info("✅ All videos already have local_path — skipping download.")
-            return
-        download_path = "testData/video_for_workflow"
-        Path(download_path).mkdir(parents=True, exist_ok=True)
-        videos = []
-        for _, row in self.asset_selector.video_library.iterrows():
-            url = str(row.get("VIDEO_LINK", "")).strip()
-            if not url:
-                continue
-            local_path = self.file_downloader.safe_download(url=url)
-            if not local_path or not utils.is_valid_video(local_path):
-                continue
-            # Resize and remove padding (handle potential errors)
-            try:
-                utils.resize_video(local_path, overwrite=True)
-                utils.remove_black_padding(local_path, overwrite=True)
-            except Exception as e:
-                logger.warning(f"⚠️ Could not process {local_path}: {e}")
-                # Continue anyway - video is still usable
-            videos.append({
-                "url": url,
-                "local_path": str(local_path),
-            })
-        self.data_holder.visual_assets["all_videos"] = videos
     async def _generate_visual_assets_parallel(self, content_strategy: Dict) -> Dict:
         """Generate visual assets in parallel (hook video + library videos)"""
         tasks = {
@@ -493,96 +468,6 @@ class ContentAutomation:
             traceback.print_exc()
             raise
-    async def _download_all_visual_assets(self):
-        """Download ALL visual assets with proper error handling"""
-        download_tasks = []
-        # Download hook video with explicit local_path assignment
-        assets = self.data_holder.visual_assets
-        if assets.get("hook_video") and assets["hook_video"].get("video_url"):
-            hook_url = assets["hook_video"]["video_url"]
-            download_tasks.append(
-                self._download_with_fallback(hook_url, "hook_video.mp4", assets["hook_video"], "local_path", resize=True)
-            )
-            # VEO library videos
-            if assets["hook_video"].get("veo_video_data") and assets["hook_video"].get("veo_video_data").get("video_url"):
-                veo_hook_url = assets["hook_video"]["veo_video_data"]["video_url"]
-                download_tasks.append(
-                    self._download_with_fallback(veo_hook_url, "veo_hook_url.mp4", assets["hook_video"]["veo_video_data"], "local_path", resize=True, remove_black_padding=True)
-                )
-        # Download library videos
-        for i, video in enumerate(assets.get("selected_videos", [])):
-            if video.get("url"):
-                download_tasks.append(
-                    self._download_with_fallback(video["url"], f"library_video_{i}.mp4", video, "local_path", resize=True)
-                )
-            if video.get("alternate_url"):
-                download_tasks.append(
-                    self._download_with_fallback(video["alternate_url"], f"library_all_video_alternate_url_{i}.mp4", video, "alternate_url_local_path", resize=True)
-                )
-        # Download library videos
-        for i, video in enumerate(assets.get("all_videos", [])):
-            if video.get("url") and not video.get("local_path", None):
-                download_tasks.append(
-                    self._download_with_fallback(video["url"], f"library_all_video_{i}.mp4", video, "local_path")
-                )
-        # Wait for all downloads to complete
-        if download_tasks:
-            results = await asyncio.gather(*download_tasks, return_exceptions=True)
-            # Check for failures
-            for i, result in enumerate(results):
-                if isinstance(result, Exception):
-                    logger.error(f"❌ Download task {i} failed: {result}")
-        # Verify all required assets have local_path
-        self._verify_assets_downloaded(assets)
-    async def _download_with_fallback(self, url: str, filename: str, target_dict: Dict, key: str = "local_path", resize: bool = False, remove_black_padding: bool = False):
-        """Download file with fallback to ensure local_path is always set"""
-        try:
-            local_path = await self.api_clients.download_file(url, filename)
-            if remove_black_padding:
-                utils.remove_black_padding(local_path, overwrite=True)
-            if resize:
-                utils.resize_video(local_path, overwrite=True)
-            target_dict[key] = local_path
-            logger.info(f"✓ Downloaded {filename}")
-            return local_path
-        except Exception as e:
-            logger.error(f"❌ Failed to download {filename}: {e}")
-            raise
-    def _verify_assets_downloaded(self, assets: Dict):
-        """Verify that all required assets have local_path"""
-        missing_assets = []
-        # Check hook video
-        if assets.get("hook_video") and not assets["hook_video"].get("local_path"):
-            missing_assets.append("hook_video")
-        # Check library videos
-        for i, video in enumerate(assets.get("selected_videos", [])):
-            if not video.get("local_path"):
-                missing_assets.append(f"library_video_{i}")
-        if missing_assets:
-            logger.warning(f"⚠️ Missing local_path for: {', '.join(missing_assets)}")
-            # Don't raise exception here, let the pipeline continue with fallbacks
-    async def _download_to_local(self, url: str, filename: str, target_dict: Dict, key: str = "local_path"):
-        """Download file from URL and store local path in target dictionary"""
-        try:
-            local_path = await self.api_clients.download_file(url, filename)
-            target_dict[key] = local_path
-            logger.info(f"✓ Downloaded {filename}")
-        except Exception as e:
-            logger.error(f"❌ Failed to download {filename}: {e}")
-            raise
     async def health_check(self) -> Dict[str, bool]:
         """Comprehensive health check of all components"""
         logger.info("🏥 Running comprehensive health check...")

 from file_downloader import FileDownloader
 from data_holder import DataHolder
 import setup_config
+from asset_manager import get_asset_downloader
+from file_downloader import FileDownloader
 class ContentAutomation:
     def __init__(self, config: Dict[str, Any], data_holder: DataHolder = None, asset_selector: 'AssetSelector' = None, api_clients: 'APIClients' = None):
         self.video_renderer = VideoRenderer(config, self.data_holder)
         # Reuse provided asset_selector or create new one
         self.asset_selector = asset_selector or AssetSelector(config, self.data_holder)
+        self.asset_downloader = get_asset_downloader()
         self.file_downloader = FileDownloader()
         self.pipeline_start_time = None
                     "speaking_rate": 1.2
                 }
                 self.data_holder.visual_assets["a2e_video_url"] = video_url
+                # Download audio using file_downloader directly
+                local_path = self.file_downloader.safe_download(audio_url)
+                if local_path:
+                    self.data_holder.visual_assets["tts_audio"]["local_path"] = str(local_path)
                 with AudioFileClip(self.data_holder.visual_assets["tts_audio"]["local_path"]) as audio:
                     self.data_holder.visual_assets["tts_audio"]["duration"] = audio.duration
+                # Download video using file_downloader directly
+                local_path = self.file_downloader.safe_download(video_url)
+                if local_path:
+                    self.data_holder.visual_assets["a2e_video_local_path"] = str(local_path)
                 # await self.api_clients.upload_to_temp_gcs(self.data_holder.visual_assets["tts_audio"]["local_path"], "audio")
                 # await self.api_clients.upload_to_temp_gcs(self.data_holder.visual_assets["a2e_video_local_path"], "video")
             else:
             await self.create_audio()
             logger.info("\n STEP 4: Download all the video assets.")
+            videos = await self.asset_downloader.download_all_videos()
+            self.data_holder.visual_assets["all_videos"] = videos
             # STEP 3: Generate visual assets
             logger.info("\n📦 STEP 3: Generate Visual Assets")
             # STEP 2: Download ALL visual assets with proper error handling
             logger.info("\n⬇️ STEP 4: Download Visual Assets")
+            await self.asset_downloader.download_all_visual_assets(self.data_holder)
             # STEP 3: Render video WITHOUT audio (natural speed)
             logger.info("\n🎬 STEP 5: Render Video (Natural Speed, No Audio)")
             logger.info("\n🎵 STEP 7: Background Music")
             self.data_holder.visual_assets["background_music_url"] = self.asset_selector.select_background_music()
+            local_path = self.file_downloader.safe_download(self.data_holder.visual_assets["background_music_url"])
+            if local_path:
+                self.data_holder.visual_assets["background_music_local"] = str(local_path)
             # STEP 7: Add audio to video
             logger.info("\n🔊 STEP 8: Add Audio to Video")
     async def execute_random_pipeline(self, content_strategy: Dict[str, str], tts_script: str) -> Dict[str, Any]:
         try:
+            videos = await self.asset_downloader.download_all_videos()
+            self.data_holder.visual_assets["all_videos"] = videos
             music_duration = None
             self.asset_selector.inc_audio_index()
         self.data_holder.visual_assets["background_music_url"] = self.asset_selector.select_background_music()
+        local_path = self.file_downloader.safe_download(self.data_holder.visual_assets["background_music_url"])
+        if local_path:
+            self.data_holder.visual_assets["background_music_local"] = str(local_path)
     async def create_audio(self):
         try_again = False
         return tts_audio, timed_words
     async def _generate_visual_assets_parallel(self, content_strategy: Dict) -> Dict:
         """Generate visual assets in parallel (hook video + library videos)"""
         tasks = {
             traceback.print_exc()
             raise
     async def health_check(self) -> Dict[str, bool]:
         """Comprehensive health check of all components"""
         logger.info("🏥 Running comprehensive health check...")

src/file_downloader.py CHANGED Viewed

@@ -159,12 +159,14 @@ class FileDownloader:
     def _detect_url_type(self, url: str) -> str:
         """
-        Detect URL type: 'drive', 'gcs', or 'unknown'
         """
         if "drive.google.com" in url:
             return "drive"
         elif url.startswith("gs://") or "storage.googleapis.com" in url or "storage.cloud.google.com" in url:
             return "gcs"
         else:
             return "unknown"
@@ -338,6 +340,55 @@ class FileDownloader:
         logger.info("Downloaded from GCS to %s", output_path)
         return output_path
     def safe_download(self, url: str, output_path: Path | None = None, account_id: str | None = None) -> Path | None:
         """
         Safe download wrapper to handle exceptions.
@@ -400,8 +451,11 @@ class FileDownloader:
                     return self.download_from_gcs(bucket_name, blob_name, output_path=output_path, public=False, account_id=account_id)
                 raise
         else:
-            raise ValueError(f"Unknown URL type. Expected Google Drive or GCS URL, got: {url}")
     # ------------------ Batch download ------------------

     def _detect_url_type(self, url: str) -> str:
         """
+        Detect URL type: 'drive', 'gcs', 'public', or 'unknown'
         """
         if "drive.google.com" in url:
             return "drive"
         elif url.startswith("gs://") or "storage.googleapis.com" in url or "storage.cloud.google.com" in url:
             return "gcs"
+        elif url.startswith("http://") or url.startswith("https://"):
+            return "public"
         else:
             return "unknown"
         logger.info("Downloaded from GCS to %s", output_path)
         return output_path
+    def download_from_url(
+        self,
+        url: str,
+        output_path: Path | None = None,
+        filename: str | None = None,
+    ) -> Path:
+        """
+        Download a file from a regular HTTP/HTTPS URL.
+        Args:
+            url: Public HTTP/HTTPS URL
+            output_path: Full path where to save the file (optional)
+            filename: Filename to use if output_path not specified (optional)
+        Returns:
+            Path to the downloaded file
+        """
+        logger.info("Downloading from public URL: %s", url)
+        # Determine filename from URL if not provided
+        if filename is None and output_path is None:
+            from urllib.parse import urlparse, unquote
+            parsed = urlparse(url)
+            filename = Path(unquote(parsed.path)).name
+            if not filename:
+                filename = "downloaded_file"
+        # Determine output path
+        if output_path is None:
+            output_path = self.temp_dir / filename
+        # Check if file already exists
+        if self.skip_existing and output_path.exists():
+            logger.info("File already exists, skipping download: %s", output_path)
+            return output_path
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        # Download via HTTP
+        response = requests.get(url, stream=True)
+        response.raise_for_status()
+        with output_path.open("wb") as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                f.write(chunk)
+        logger.info("Downloaded from URL to %s", output_path)
+        return output_path
     def safe_download(self, url: str, output_path: Path | None = None, account_id: str | None = None) -> Path | None:
         """
         Safe download wrapper to handle exceptions.
                     return self.download_from_gcs(bucket_name, blob_name, output_path=output_path, public=False, account_id=account_id)
                 raise
+        elif url_type == "public":
+            return self.download_from_url(url, output_path=output_path)
         else:
+            raise ValueError(f"Unknown URL type. Expected Google Drive, GCS, or public HTTP URL, got: {url}")
     # ------------------ Batch download ------------------

src/process_csv.py CHANGED Viewed

@@ -12,17 +12,15 @@ from automation import ContentAutomation
 from api_clients import APIClients
 from utils import logger
 from data_holder import DataHolder
-from asset_selector import AssetSelector
 from google_sheet_reader import GoogleSheetReader
 import argparse
 import uuid
 from cleanup_manager import process_delete_entries
 from google_src.gcs_utils import list_gcs_files
 import setup_config
 DATA_DIR = Path("data")
-ALL_VIDEO_FILE_INFO = None
-SHARED_ASSET_SELECTOR = None  # Shared instance to avoid redundant sheet loads
 SHARED_API_CLIENTS = None  # Shared instance to avoid redundant GCS/TTS client initialization
@@ -99,7 +97,7 @@ def log_progress_to_gsheet(tts_script: str, result: dict, job_index: int, commit
 async def process_row(row, config: dict):
     """Process one CSV row using the main pipeline."""
-    global ALL_VIDEO_FILE_INFO, SHARED_ASSET_SELECTOR, SHARED_API_CLIENTS
     tts_script = row.get("TTS Script (AI Avatar)", "")
     if os.getenv("ON_SCREEN_TEXT", "false").lower() == "true":
         tts_script = row.get("On-Screen Text", "").strip()
@@ -107,18 +105,17 @@ async def process_row(row, config: dict):
     logger.info(f"▶️ Executing: {tts_script}...")
     dataHolder = DataHolder()
-    dataHolder.visual_assets["all_videos"] = ALL_VIDEO_FILE_INFO
-    # Update shared instances with current dataHolder before use
-    if SHARED_ASSET_SELECTOR:
-        SHARED_ASSET_SELECTOR.data_holder = dataHolder
     if SHARED_API_CLIENTS:
         SHARED_API_CLIENTS.data_holder = dataHolder
-    # Reuse shared AssetSelector and APIClients to avoid redundant initialization
     automation = ContentAutomation(
         config, dataHolder,
-        asset_selector=SHARED_ASSET_SELECTOR,
         api_clients=SHARED_API_CLIENTS
     )
@@ -141,41 +138,24 @@ async def process_row(row, config: dict):
 async def download_all_video(config: dict):
-    """Download all library videos once and cache them. Creates shared instances."""
-    global ALL_VIDEO_FILE_INFO, SHARED_ASSET_SELECTOR, SHARED_API_CLIENTS
-    if ALL_VIDEO_FILE_INFO is None:
-        logger.info("📥 Pre-downloading all library videos...")
-        # Create the shared AssetSelector once - this loads video/audio libraries from sheets
-        if SHARED_ASSET_SELECTOR is None:
-            SHARED_ASSET_SELECTOR = AssetSelector(config)
-        video_urls = [
-            row.get("Video URL (No Audio)", "").strip()
-            for _, row in SHARED_ASSET_SELECTOR.video_library.iterrows()
-            if row.get("Video URL (No Audio)", "").strip()
-        ]
         dataHolder = DataHolder()
-        dataHolder.visual_assets["all_videos"] = [{"url": url} for url in video_urls]
-        # Create the shared APIClients once - this initializes GCS/TTS clients
-        if SHARED_API_CLIENTS is None:
-            SHARED_API_CLIENTS = APIClients(config, dataHolder)
-        # Pass the shared instances to avoid creating new ones
-        automation = ContentAutomation(
-            config, dataHolder,
-            asset_selector=SHARED_ASSET_SELECTOR,
-            api_clients=SHARED_API_CLIENTS
-        )
-        await automation._download_all_visual_assets()
-        ALL_VIDEO_FILE_INFO = dataHolder.visual_assets.get("all_videos", [])
-        logger.info(f"✓ Downloaded {len(ALL_VIDEO_FILE_INFO)} library videos")
-    return ALL_VIDEO_FILE_INFO
 async def process_all_csvs(config, commit=False, job_index=None, total_jobs=None):
     """Process all CSVs in data directory."""

 from api_clients import APIClients
 from utils import logger
 from data_holder import DataHolder
 from google_sheet_reader import GoogleSheetReader
 import argparse
 import uuid
 from cleanup_manager import process_delete_entries
 from google_src.gcs_utils import list_gcs_files
 import setup_config
+from asset_manager import get_video_lib, get_audio_lib, get_asset_downloader
 DATA_DIR = Path("data")
 SHARED_API_CLIENTS = None  # Shared instance to avoid redundant GCS/TTS client initialization
 async def process_row(row, config: dict):
     """Process one CSV row using the main pipeline."""
+    global SHARED_API_CLIENTS
     tts_script = row.get("TTS Script (AI Avatar)", "")
     if os.getenv("ON_SCREEN_TEXT", "false").lower() == "true":
         tts_script = row.get("On-Screen Text", "").strip()
     logger.info(f"▶️ Executing: {tts_script}...")
     dataHolder = DataHolder()
+    # Get downloaded videos from singleton
+    asset_downloader = get_asset_downloader()
+    dataHolder.visual_assets["all_videos"] = asset_downloader.downloaded_videos
+    # Update shared APIClients with current dataHolder
     if SHARED_API_CLIENTS:
         SHARED_API_CLIENTS.data_holder = dataHolder
+    # AssetSelector uses singletons internally, no need to share
     automation = ContentAutomation(
         config, dataHolder,
         api_clients=SHARED_API_CLIENTS
     )
 async def download_all_video(config: dict):
     """Fetch every library video through the AssetDownloader singleton.

     On the first call (while SHARED_API_CLIENTS is still None) this also
     builds the shared APIClients instance, seeded with a DataHolder whose
     ``visual_assets["all_videos"]`` holds the downloaded list.

     Returns whatever ``download_all_videos()`` produced.
     """
     global SHARED_API_CLIENTS

     downloader = get_asset_downloader()

     logger.info("📥 Pre-downloading all library videos...")
     library_videos = await downloader.download_all_videos()

     # Build the shared APIClients only once; later calls reuse it.
     if SHARED_API_CLIENTS is None:
         holder = DataHolder()
         holder.visual_assets["all_videos"] = library_videos
         SHARED_API_CLIENTS = APIClients(config, holder)

     logger.info(f"✓ Downloaded {len(library_videos)} library videos")
     return library_videos
 async def process_all_csvs(config, commit=False, job_index=None, total_jobs=None):
     """Process all CSVs in data directory."""