oneblackmage committed on
Commit
7849935
·
verified ·
1 Parent(s): ac9bb45

Upload folder using huggingface_hub

Browse files
utils/__init__.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Shared AI utilities for training and dataset pipelines.
3
+ """
4
+
5
+ from .ngc_cli import (
6
+ NGCCLI,
7
+ NGCCLIAuthError,
8
+ NGCCLIDownloadError,
9
+ NGCCLIError,
10
+ NGCCLINotFoundError,
11
+ NGCConfig,
12
+ ensure_ngc_cli_configured,
13
+ get_ngc_cli,
14
+ )
15
+
16
+ __all__ = [
17
+ "NGCCLI",
18
+ "NGCCLIAuthError",
19
+ "NGCCLIDownloadError",
20
+ "NGCCLIError",
21
+ "NGCCLINotFoundError",
22
+ "NGCConfig",
23
+ "ensure_ngc_cli_configured",
24
+ "get_ngc_cli",
25
+ ]
utils/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (584 Bytes). View file
 
utils/__pycache__/llm_capabilities.cpython-311.pyc ADDED
Binary file (3.35 kB). View file
 
utils/__pycache__/ngc_cli.cpython-311.pyc ADDED
Binary file (17.8 kB). View file
 
utils/__pycache__/ngc_resources.cpython-311.pyc ADDED
Binary file (6.86 kB). View file
 
utils/__pycache__/s3_dataset_loader.cpython-311.pyc ADDED
Binary file (22.3 kB). View file
 
utils/__pycache__/transcript_corrector.cpython-311.pyc ADDED
Binary file (7.8 kB). View file
 
utils/llm_capabilities.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from google import genai
4
+
5
+ _WORKING_MODEL_CACHE = None
6
+
7
+
8
def get_best_available_gemini_model(client: genai.Client) -> str:
    """
    Dynamically interrogates the Gemini API to find the best functioning
    model available for the current API key's tier/region. This prevents
    hardcoded models from throwing 404s if they are restricted.

    Args:
        client: An authenticated ``genai.Client``.

    Returns:
        A ``models/``-prefixed model identifier that both appears in the
        listing and answers a trivial generate call. The first verified
        model is cached in ``_WORKING_MODEL_CACHE`` for the process lifetime.
    """
    global _WORKING_MODEL_CACHE
    if _WORKING_MODEL_CACHE:
        return _WORKING_MODEL_CACHE

    # Ordered by preference; earlier entries are tried first.
    target_models = [
        "models/gemini-2.0-flash-001",
        "models/gemini-2.0-flash-lite-001",
        "models/gemini-flash-latest",
        "models/gemini-pro-latest",
        "models/gemini-2.5-flash",
        "models/gemini-2.5-pro",
    ]

    try:
        available_models = [m.name for m in client.models.list()]
        print(f"DISCOVERED MODELS on this key: {available_models}")
    except Exception as e:
        print(f"Failed to list models: {e}")
        # Fix: previously returned the bare id "gemini-1.5-flash", which was
        # inconsistent with every other (models/-prefixed) return value and
        # not in the preferred list; use the documented fallback instead.
        return "models/gemini-flash-latest"

    for target in target_models:
        # Fix: the old nested loop retried the *same* invoke call (always
        # with model=target) once per matching listing entry; a single
        # membership test is sufficient.
        if not any(
            target == name or name.endswith(target) for name in available_models
        ):
            continue
        # Double check that we can actually invoke it
        # (some show up in list but 404 on invoke due to constraints)
        try:
            client.models.generate_content(model=target, contents="ping")
        except Exception as eval_e:
            print(f"Model {target} is listed but uninvokeable: {eval_e}")
            continue
        _WORKING_MODEL_CACHE = target
        print(f"Dynamically locked to functioning Gemini model: {target}")
        return target

    print(
        "CRITICAL WARNING: No preferred Gemini models available on this API Key. "
        "Falling back to gemini-flash-latest."
    )
    return "models/gemini-flash-latest"
53
+
54
+
55
def ensure_valid_key() -> str:
    """Validates that the Gemini API key provided is a REST key, not an OAuth token."""
    # GOOGLE_CLOUD_API_KEY takes precedence; GEMINI_API_KEY is the fallback.
    key = next(
        (
            candidate
            for candidate in (
                os.environ.get("GOOGLE_CLOUD_API_KEY"),
                os.environ.get("GEMINI_API_KEY"),
            )
            if candidate
        ),
        None,
    )
    if key is None:
        raise ValueError(
            "Neither GOOGLE_CLOUD_API_KEY nor GEMINI_API_KEY are configured."
        )
    # OAuth bearer tokens start with "AQ"; REST keys start with "AIza".
    if key.startswith("AQ"):
        raise ValueError(
            "Provided GEMINI_API_KEY is an OAuth token (AQ...). "
            "The AI engine requires a Google Cloud REST API key (AIza...). "
            "Please update your .env file."
        )
    return key
utils/ngc_cli.py ADDED
@@ -0,0 +1,400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ NGC CLI Utility Module
3
+
4
+ Provides utilities for working with NVIDIA GPU Cloud (NGC) CLI to download
5
+ NeMo resources, datasets, and other NGC catalog resources.
6
+
7
+ This module handles:
8
+ - NGC CLI detection and installation
9
+ - Resource download from NGC catalog
10
+ - Configuration management
11
+ - Error handling and retry logic
12
+ """
13
+
14
+ import logging
15
+ import os
16
+ import shutil
17
+ import subprocess
18
+ from dataclasses import dataclass
19
+ from pathlib import Path
20
+ from typing import Any
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
@dataclass
class NGCConfig:
    """NGC CLI configuration"""

    # NGC API key (obtained from https://catalog.ngc.nvidia.com); optional.
    api_key: str | None = None
    # Optional NGC organization name.
    org: str | None = None
    # Optional NGC team name.
    team: str | None = None
+ team: str | None = None
32
+
33
+
34
class NGCCLIError(Exception):
    """Base exception for NGC CLI operations; all errors in this module derive from it."""
36
+
37
+
38
class NGCCLINotFoundError(NGCCLIError):
    """Raised when the NGC CLI binary cannot be located (not installed or not on PATH)."""
40
+
41
+
42
class NGCCLIAuthError(NGCCLIError):
    """Raised when NGC CLI authentication/configuration is missing or fails."""
44
+
45
+
46
class NGCCLIDownloadError(NGCCLIError):
    """Raised when an NGC CLI download command exits with a non-zero status."""
48
+
49
+
50
+ class NGCCLI:
51
+ """
52
+ NGC CLI wrapper for downloading resources from NVIDIA GPU Cloud.
53
+
54
+ Supports multiple installation methods:
55
+ 1. System-installed ngc in PATH
56
+ 2. Local installation at ~/ngc-cli/ngc
57
+ 3. Python package via uv (ngc-python-cli)
58
+ """
59
+
60
+ def __init__(self, use_uv: bool = True):
61
+ """
62
+ Initialize NGC CLI wrapper.
63
+
64
+ Args:
65
+ use_uv: If True, prefer uv-based installation if ngc not in PATH
66
+ """
67
+ self.use_uv = use_uv
68
+ self.ngc_cmd: str | None = None
69
+ self.uv_cmd: str | None = None
70
+ self._detect_ngc_cli()
71
+
72
+ def _detect_ngc_cli(self) -> None:
73
+ """Detect and set up NGC CLI command"""
74
+ # Method 1: Check if ngc is in PATH
75
+ if shutil.which("ngc"):
76
+ self.ngc_cmd = "ngc"
77
+ logger.info("Found NGC CLI in PATH")
78
+ return
79
+
80
+ # Method 2: Check common installation location
81
+ home_ngc = Path.home() / "ngc-cli" / "ngc"
82
+ if home_ngc.exists():
83
+ self.ngc_cmd = str(home_ngc)
84
+ # Add to PATH for subprocess calls
85
+ env_path = os.environ.get("PATH", "")
86
+ os.environ["PATH"] = f"{home_ngc.parent}:{env_path}"
87
+ logger.info(f"Found NGC CLI at {home_ngc}")
88
+ return
89
+
90
+ # Method 3: Use uv to run ngc (if enabled)
91
+ if self.use_uv:
92
+ self._setup_uv_ngc()
93
+
94
+ def _setup_uv_ngc(self) -> None:
95
+ """Set up NGC CLI via uv"""
96
+ # Find uv
97
+ if shutil.which("uv"):
98
+ self.uv_cmd = "uv"
99
+ elif (Path.home() / ".local" / "bin" / "uv").exists():
100
+ self.uv_cmd = str(Path.home() / ".local" / "bin" / "uv")
101
+ elif (Path.home() / ".cargo" / "bin" / "uv").exists():
102
+ self.uv_cmd = str(Path.home() / ".cargo" / "bin" / "uv")
103
+ else:
104
+ logger.warning("uv not found, cannot use uv-based NGC CLI")
105
+ return
106
+
107
+ # Check if ngc is installed via uv
108
+ try:
109
+ result = subprocess.run(
110
+ [self.uv_cmd, "pip", "list"],
111
+ capture_output=True,
112
+ text=True,
113
+ check=False,
114
+ )
115
+ if "ngc" in result.stdout.lower():
116
+ self.ngc_cmd = f"{self.uv_cmd} run ngc"
117
+ logger.info("Found NGC CLI via uv")
118
+ return
119
+ except Exception as e:
120
+ logger.debug(f"Error checking uv packages: {e}")
121
+
122
+ # Note: NGC CLI is not a Python package on PyPI
123
+ # It must be downloaded from https://catalog.ngc.nvidia.com
124
+ # We can only check if it's available in PATH or local installation
125
+ # The uv method here is for running Python-based NGC SDK if available
126
+ logger.debug("NGC CLI must be installed separately from NVIDIA website")
127
+
128
+ def is_available(self) -> bool:
129
+ """Check if NGC CLI is available"""
130
+ return self.ngc_cmd is not None
131
+
132
+ def ensure_available(self) -> None:
133
+ """Ensure NGC CLI is available, raise error if not"""
134
+ if not self.is_available():
135
+ raise NGCCLINotFoundError(
136
+ "NGC CLI not found. Please install it:\n"
137
+ " 1. Download from https://catalog.ngc.nvidia.com\n"
138
+ " 2. Or install to ~/ngc-cli/ directory\n"
139
+ " 3. Or add to system PATH\n"
140
+ "\n"
141
+ "Note: NGC CLI is not available as a PyPI package.\n"
142
+ "You must download it directly from NVIDIA."
143
+ )
144
+
145
+ def check_config(self) -> dict[str, Any]:
146
+ """
147
+ Check NGC CLI configuration.
148
+
149
+ Returns:
150
+ Configuration dictionary with API key status, org, team, etc.
151
+
152
+ Raises:
153
+ NGCCLINotFoundError: If NGC CLI is not available
154
+ NGCCLIAuthError: If authentication is not configured
155
+ """
156
+ self.ensure_available()
157
+
158
+ if self.ngc_cmd is None:
159
+ raise NGCCLINotFoundError("NGC CLI command not set")
160
+ try:
161
+ result = subprocess.run(
162
+ [*self.ngc_cmd.split(), "config", "current"],
163
+ capture_output=True,
164
+ text=True,
165
+ check=True,
166
+ )
167
+
168
+ config = {}
169
+ # Parse the table format output
170
+ lines = result.stdout.strip().split("\n")
171
+ current_key = None
172
+
173
+ for line in lines:
174
+ if "|" in line and "| key " not in line.lower() and "---" not in line:
175
+ parts = [part.strip() for part in line.split("|") if part.strip()]
176
+ if len(parts) >= 3: # key | value | source
177
+ key, value, source = parts[0], parts[1], parts[2]
178
+ if key: # New key
179
+ current_key = key
180
+ config[key] = value
181
+ elif current_key and value: # Continuation of previous key
182
+ config[current_key] += value
183
+ elif len(parts) == 1 and current_key: # Just a value continuation
184
+ config[current_key] += parts[0]
185
+
186
+ # Check if API key is configured (it will be masked with asterisks)
187
+ # If we have any config and apikey exists (even masked), consider it configured
188
+ if config and ("apikey" in config or "API key" in config):
189
+ return config
190
+
191
+ raise NGCCLIAuthError(
192
+ "NGC CLI not configured. Run: ngc config set\n"
193
+ "Get your API key from: https://catalog.ngc.nvidia.com"
194
+ )
195
+
196
+ return config
197
+ except subprocess.CalledProcessError as e:
198
+ raise NGCCLIAuthError(f"Failed to check NGC config: {e.stderr}") from e
199
+
200
+ def set_config(
201
+ self, api_key: str, _org: str | None = None, _team: str | None = None
202
+ ) -> None:
203
+ """
204
+ Configure NGC CLI with API key.
205
+
206
+ Args:
207
+ api_key: NGC API key from https://catalog.ngc.nvidia.com
208
+ _org: Optional organization name (reserved for future use)
209
+ _team: Optional team name (reserved for future use)
210
+ """
211
+ self.ensure_available()
212
+
213
+ if self.ngc_cmd is None:
214
+ raise NGCCLINotFoundError("NGC CLI command not set")
215
+ # Set API key
216
+ try:
217
+ subprocess.run(
218
+ [*self.ngc_cmd.split(), "config", "set"],
219
+ input=f"{api_key}\n",
220
+ text=True,
221
+ check=True,
222
+ capture_output=True,
223
+ )
224
+ logger.info("NGC CLI configured successfully")
225
+ except subprocess.CalledProcessError as e:
226
+ raise NGCCLIAuthError(f"Failed to configure NGC CLI: {e.stderr}") from e
227
+
228
+ def download_resource(
229
+ self,
230
+ resource_path: str,
231
+ version: str | None = None,
232
+ output_dir: Path | None = None,
233
+ extract: bool = True, # noqa: ARG002
234
+ ) -> Path:
235
+ """
236
+ Download a resource from NGC catalog.
237
+
238
+ Args:
239
+ resource_path: Resource path in format "org/team/resource" or "nvidia/nemo-microservices/nemo-microservices-quickstart"
240
+ version: Optional version tag (e.g., "25.10")
241
+ output_dir: Optional output directory (defaults to current directory)
242
+ extract: Whether to extract downloaded archive
243
+
244
+ Returns:
245
+ Path to downloaded/extracted resource
246
+
247
+ Raises:
248
+ NGCCLINotFoundError: If NGC CLI is not available
249
+ NGCCLIAuthError: If authentication failed
250
+ NGCCLIDownloadError: If download failed
251
+ """
252
+ self.ensure_available()
253
+
254
+ # Check config first
255
+ try:
256
+ self.check_config()
257
+ except NGCCLIAuthError:
258
+ logger.warning("NGC CLI not configured. Attempting download anyway...")
259
+
260
+ if output_dir is None:
261
+ output_dir = Path.cwd()
262
+ else:
263
+ output_dir = Path(output_dir)
264
+ output_dir.mkdir(parents=True, exist_ok=True)
265
+
266
+ if self.ngc_cmd is None:
267
+ raise NGCCLINotFoundError("NGC CLI command not set")
268
+ # Build download command
269
+ cmd = [*self.ngc_cmd.split(), "registry", "resource", "download-version"]
270
+
271
+ resource_spec = f"{resource_path}:{version}" if version else resource_path
272
+
273
+ cmd.append(resource_spec)
274
+
275
+ # Change to output directory for download
276
+ original_cwd = Path.cwd()
277
+ try:
278
+ return self._execute_download_in_directory(output_dir, resource_spec, cmd)
279
+ finally:
280
+ os.chdir(original_cwd)
281
+
282
+ def _execute_download_in_directory(
283
+ self, output_dir: Path, resource_spec: str, cmd: list[str]
284
+ ) -> Path:
285
+ """
286
+ Execute download command in the specified directory and locate the downloaded resource.
287
+
288
+ Args:
289
+ output_dir: Directory to download into
290
+ resource_spec: Resource specification string for logging
291
+ cmd: Command to execute
292
+
293
+ Returns:
294
+ Path to the downloaded resource (most recently modified item, or output_dir if empty)
295
+
296
+ Raises:
297
+ NGCCLIDownloadError: If download fails
298
+ """
299
+ os.chdir(output_dir)
300
+ logger.info(f"Downloading {resource_spec} to {output_dir}...")
301
+
302
+ result = subprocess.run(cmd, capture_output=True, text=True, check=False)
303
+
304
+ if result.returncode != 0:
305
+ error_msg = result.stderr or result.stdout
306
+ raise NGCCLIDownloadError(
307
+ f"Failed to download {resource_spec}:\n{error_msg}"
308
+ )
309
+
310
+ logger.info(f"Successfully downloaded {resource_spec}")
311
+
312
+ if downloaded_items := list(output_dir.iterdir()):
313
+ # Return the most recently modified item
314
+ return max(downloaded_items, key=lambda p: p.stat().st_mtime)
315
+
316
+ return output_dir
317
+
318
+ def list_resources(
319
+ self, org: str | None = None, team: str | None = None
320
+ ) -> list[dict[str, Any]]:
321
+ """
322
+ List available resources in NGC catalog.
323
+
324
+ Args:
325
+ org: Optional organization filter
326
+ team: Optional team filter
327
+
328
+ Returns:
329
+ List of resource dictionaries
330
+ """
331
+ self.ensure_available()
332
+
333
+ if self.ngc_cmd is None:
334
+ raise NGCCLINotFoundError("NGC CLI command not set")
335
+ cmd = [*self.ngc_cmd.split(), "registry", "resource", "list"]
336
+
337
+ if org:
338
+ cmd.extend(["--org", org])
339
+ if team:
340
+ cmd.extend(["--team", team])
341
+
342
+ try:
343
+ subprocess.run(cmd, capture_output=True, text=True, check=True)
344
+
345
+ # Parse output (format may vary)
346
+ # TODO: Implement proper parsing based on actual NGC CLI output format
347
+ return []
348
+ except subprocess.CalledProcessError as e:
349
+ logger.warning(f"Failed to list resources: {e.stderr}")
350
+ return []
351
+
352
+
353
def get_ngc_cli(use_uv: bool = True) -> NGCCLI:
    """
    Get an NGC CLI instance.

    Args:
        use_uv: If True, prefer uv-based installation

    Returns:
        NGCCLI instance
    """
    # Thin factory; construction runs CLI auto-detection (NGCCLI.__init__
    # calls _detect_ngc_cli), so the returned instance may still be
    # unavailable — callers should check is_available().
    return NGCCLI(use_uv=use_uv)
364
+
365
+
366
def ensure_ngc_cli_configured(api_key: str | None = None) -> NGCCLI:
    """
    Ensure NGC CLI is available and configured.

    Args:
        api_key: Optional API key to configure (if not already configured)

    Returns:
        Configured NGCCLI instance

    Raises:
        NGCCLINotFoundError: If NGC CLI cannot be found or installed
        NGCCLIAuthError: If configuration fails
    """
    cli = get_ngc_cli()

    if not cli.is_available():
        # Fix: the previous message suggested `uv pip install ... ngc-python-cli`,
        # which contradicts this module's own note that NGC CLI is not
        # distributed on PyPI; point users at the official download instead.
        raise NGCCLINotFoundError(
            "NGC CLI not available.\n"
            "Download it from: https://catalog.ngc.nvidia.com\n"
            "(NGC CLI is not distributed as a PyPI package.)"
        )

    # Check if already configured; fall back to configuring with the
    # provided key, and fail loudly when neither is possible.
    try:
        cli.check_config()
        return cli
    except NGCCLIAuthError as err:
        if api_key:
            cli.set_config(api_key)
            return cli
        raise NGCCLIAuthError(
            "NGC CLI not configured. Provide API key or run: ngc config set\n"
            "Get API key from: https://catalog.ngc.nvidia.com"
        ) from err
utils/ngc_resources.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ NGC Resources Downloader for Training Ready
3
+
4
+ Downloads NeMo resources and training-related assets from NGC catalog.
5
+ Integrates with training_ready pipeline for automated resource acquisition.
6
+ """
7
+
8
+ import logging
9
+ import os
10
+ import sys
11
+ from pathlib import Path
12
+
13
+ # Add parent directory to path for imports
14
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent))
15
+
16
+ from ai.utils.ngc_cli import (
17
+ NGCCLIAuthError,
18
+ NGCCLINotFoundError,
19
+ ensure_ngc_cli_configured,
20
+ )
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ class NGCResourceDownloader:
26
+ """
27
+ Downloads NeMo and training resources from NGC catalog.
28
+ """
29
+
30
+ # Common NeMo resources used in training
31
+ NEMO_RESOURCES = {
32
+ "nemo-microservices-quickstart": {
33
+ "path": "nvidia/nemo-microservices/nemo-microservices-quickstart",
34
+ "default_version": "25.10",
35
+ "description": "NeMo Microservices quickstart package",
36
+ },
37
+ "nemo-framework": {
38
+ "path": "nvidia/nemo/nemo",
39
+ "default_version": None, # Use latest
40
+ "description": "NeMo framework for training",
41
+ },
42
+ "nemo-megatron": {
43
+ "path": "nvidia/nemo/nemo-megatron",
44
+ "default_version": None,
45
+ "description": "NeMo Megatron for large-scale training",
46
+ },
47
+ }
48
+
49
+ def __init__(self, output_base: Path | None = None, api_key: str | None = None):
50
+ """
51
+ Initialize NGC resource downloader.
52
+
53
+ Args:
54
+ output_base: Base directory for downloads (defaults to training_ready/resources/)
55
+ api_key: Optional NGC API key (if not set, will check environment or prompt)
56
+ """
57
+ if output_base is None:
58
+ output_base = Path(__file__).parent.parent / "resources"
59
+ self.output_base = Path(output_base)
60
+ self.output_base.mkdir(parents=True, exist_ok=True)
61
+
62
+ # Get API key from environment if not provided
63
+ if api_key is None:
64
+ api_key = os.environ.get("NGC_API_KEY")
65
+
66
+ try:
67
+ self.cli = ensure_ngc_cli_configured(api_key=api_key)
68
+ except (NGCCLINotFoundError, NGCCLIAuthError) as e:
69
+ logger.warning(f"NGC CLI not available: {e}")
70
+
71
+ self.cli = None
72
+
73
+ def download_nemo_quickstart(
74
+ self, version: str | None = None, output_dir: Path | None = None
75
+ ) -> Path:
76
+ """
77
+ Download NeMo Microservices quickstart package.
78
+
79
+ Args:
80
+ version: Version to download (defaults to 25.10)
81
+ output_dir: Output directory (defaults to resources/nemo-microservices/)
82
+
83
+ Returns:
84
+ Path to downloaded/extracted quickstart directory
85
+ """
86
+ if not self.cli:
87
+ raise NGCCLINotFoundError("NGC CLI not available")
88
+
89
+ if version is None:
90
+ version = self.NEMO_RESOURCES["nemo-microservices-quickstart"][
91
+ "default_version"
92
+ ]
93
+
94
+ if output_dir is None:
95
+ output_dir = self.output_base / "nemo-microservices"
96
+
97
+ resource_path = self.NEMO_RESOURCES["nemo-microservices-quickstart"]["path"]
98
+
99
+ logger.info(f"Downloading NeMo Microservices quickstart v{version}...")
100
+ return self.cli.download_resource(
101
+ resource_path=resource_path,
102
+ version=version,
103
+ output_dir=output_dir,
104
+ extract=True,
105
+ )
106
+
107
+ def download_nemo_framework(
108
+ self, version: str | None = None, output_dir: Path | None = None
109
+ ) -> Path:
110
+ """
111
+ Download NeMo framework.
112
+
113
+ Args:
114
+ version: Version to download
115
+ output_dir: Output directory
116
+
117
+ Returns:
118
+ Path to downloaded framework
119
+ """
120
+ if not self.cli:
121
+ raise NGCCLINotFoundError("NGC CLI not available")
122
+
123
+ if output_dir is None:
124
+ output_dir = self.output_base / "nemo-framework"
125
+
126
+ resource_path = self.NEMO_RESOURCES["nemo-framework"]["path"]
127
+
128
+ logger.info("Downloading NeMo framework...")
129
+ return self.cli.download_resource(
130
+ resource_path=resource_path,
131
+ version=version,
132
+ output_dir=output_dir,
133
+ extract=True,
134
+ )
135
+
136
+ def download_custom_resource(
137
+ self,
138
+ resource_path: str,
139
+ version: str | None = None,
140
+ output_dir: Path | None = None,
141
+ ) -> Path:
142
+ """
143
+ Download a custom resource from NGC catalog.
144
+
145
+ Args:
146
+ resource_path: Resource path (e.g., "nvidia/nemo-microservices/nemo-microservices-quickstart")
147
+ version: Optional version tag
148
+ output_dir: Optional output directory
149
+
150
+ Returns:
151
+ Path to downloaded resource
152
+ """
153
+ if not self.cli:
154
+ raise NGCCLINotFoundError("NGC CLI not available")
155
+
156
+ if output_dir is None:
157
+ # Create directory from resource name
158
+ resource_name = resource_path.split("/")[-1]
159
+ output_dir = self.output_base / resource_name
160
+
161
+ logger.info(f"Downloading {resource_path}...")
162
+ return self.cli.download_resource(
163
+ resource_path=resource_path,
164
+ version=version,
165
+ output_dir=output_dir,
166
+ extract=True,
167
+ )
168
+
169
+
170
def download_nemo_quickstart(
    version: str | None = None, output_dir: Path | None = None
) -> Path:
    """
    Convenience function to download NeMo Microservices quickstart.

    Note:
        Instantiates a fresh NGCResourceDownloader with default settings
        (default resource directory; NGC_API_KEY read from the environment).

    Args:
        version: Version to download (defaults to 25.10)
        output_dir: Output directory

    Returns:
        Path to downloaded quickstart directory
    """
    downloader = NGCResourceDownloader()
    return downloader.download_nemo_quickstart(version=version, output_dir=output_dir)
utils/s3_dataset_loader.py ADDED
@@ -0,0 +1,495 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ S3 Dataset Loader - Streaming JSON/JSONL loader for S3 training data
4
+ S3 is the training mecca - all training data should be loaded from S3
5
+ """
6
+
7
+ import contextlib
8
+ import json
9
+ import logging
10
+ import os
11
+ from collections.abc import Iterator
12
+ from io import BytesIO
13
+ from pathlib import Path
14
+ from typing import TYPE_CHECKING, Any
15
+
16
+ try:
17
+ import boto3
18
+ from botocore.exceptions import ClientError as _BotocoreClientError
19
+ except ImportError:
20
+ # Keep runtime behavior (error on use) while making type checkers happy.
21
+ boto3 = None # type: ignore[assignment]
22
+ _BotocoreClientError = None # type: ignore[assignment]
23
+
24
+ if TYPE_CHECKING:
25
+ # Minimal shape we rely on in this module.
26
+ class ClientError(Exception):
27
+ response: dict[str, Any]
28
+ else:
29
+ ClientError = (
30
+ _BotocoreClientError if _BotocoreClientError is not None else Exception
31
+ ) # type: ignore[assignment]
32
+
33
+ BOTO3_AVAILABLE = boto3 is not None
34
+
35
+ # Load .env file if available
36
+ with contextlib.suppress(ImportError):
37
+ from dotenv import load_dotenv
38
+
39
+ # Try loading from ai/ directory first (where .env actually is), then project root
40
+ # Module is at: ai/training_ready/utils/s3_dataset_loader.py
41
+ # So parents[0] = ai/training_ready/utils/, parents[1] = ai/training_ready/,
42
+ # parents[2] = ai/, parents[3] = project root
43
+ module_path = Path(__file__).resolve()
44
+ env_paths = []
45
+ try:
46
+ env_paths.append(module_path.parents[2] / ".env") # ai/.env
47
+ env_paths.append(module_path.parents[3] / ".env") # project root/.env
48
+ except IndexError:
49
+ # Fallback for shallow/flattened structures
50
+ env_paths.append(module_path.parent / ".env")
51
+ if module_path.parent.name != "ai":
52
+ env_paths.append(module_path.parent.parent / ".env")
53
+
54
+ for env_path in env_paths:
55
+ try:
56
+ if env_path.exists() and env_path.is_file():
57
+ load_dotenv(env_path, override=False)
58
+ break
59
+ except Exception:
60
+ continue
61
+
62
+ logger = logging.getLogger(__name__)
63
+
64
+
65
+ class S3DatasetLoader:
66
+ """
67
+ Load datasets from S3 with streaming support for large files.
68
+ S3 is the canonical training data location.
69
+ """
70
+
71
+ def __init__(
72
+ self,
73
+ bucket: str = "pixel-data",
74
+ endpoint_url: str | None = None,
75
+ aws_access_key_id: str | None = None,
76
+ aws_secret_access_key: str | None = None,
77
+ region_name: str = "us-east-va",
78
+ ):
79
+ """
80
+ Initialize S3 client for dataset loading.
81
+
82
+ Args:
83
+ bucket: S3 bucket name (default: pixel-data)
84
+ endpoint_url: S3 endpoint URL (default: OVH S3 endpoint)
85
+ aws_access_key_id: AWS access key (from env if not provided)
86
+ aws_secret_access_key: AWS secret key (from env if not provided)
87
+ region_name: AWS region (default: us-east-va for OVH)
88
+ """
89
+ if boto3 is None:
90
+ raise ImportError(
91
+ "boto3 is required for S3 dataset loading. "
92
+ "Install with: uv pip install boto3"
93
+ )
94
+
95
+ # Always allow env to override bucket for OVH S3
96
+ # This ensures OVH_S3_BUCKET is always used when set
97
+ self.bucket = os.getenv("OVH_S3_BUCKET", bucket)
98
+ print(
99
+ f"[DEBUG] S3Loader: env OVH_S3_BUCKET={os.getenv('OVH_S3_BUCKET')}, "
100
+ f"input bucket={bucket}, final={self.bucket}",
101
+ flush=True,
102
+ )
103
+ self.endpoint_url = endpoint_url or os.getenv(
104
+ "OVH_S3_ENDPOINT", "https://s3.us-east-va.io.cloud.ovh.us"
105
+ )
106
+
107
+ # Get credentials from params or environment
108
+ access_key = (
109
+ aws_access_key_id
110
+ or os.getenv("OVH_S3_ACCESS_KEY")
111
+ or os.getenv("OVH_ACCESS_KEY")
112
+ or os.getenv("AWS_ACCESS_KEY_ID")
113
+ )
114
+ secret_key = (
115
+ aws_secret_access_key
116
+ or os.getenv("OVH_S3_SECRET_KEY")
117
+ or os.getenv("OVH_SECRET_KEY")
118
+ or os.getenv("AWS_SECRET_ACCESS_KEY")
119
+ )
120
+
121
+ if not access_key or not secret_key:
122
+ raise ValueError(
123
+ "S3 credentials not found. Set OVH_S3_ACCESS_KEY/OVH_S3_SECRET_KEY "
124
+ "(or OVH_ACCESS_KEY/OVH_SECRET_KEY, "
125
+ "or AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY)."
126
+ )
127
+
128
+ # Initialize S3 client (OVH S3 compatible)
129
+ # OVH uses self-signed certificates, so verify=False is required
130
+ # Initialize S3 client (OVH S3 compatible)
131
+ # OVH uses self-signed certificates, so verify=False is required
132
+ verify_ssl = os.getenv("OVH_S3_CA_BUNDLE", True)
133
+ # Handle string "False" or "0" from env
134
+ if str(verify_ssl).lower() in {"false", "0", "no"}:
135
+ verify_ssl = False
136
+
137
+ if verify_ssl is False:
138
+ logger.warning(
139
+ "Initializing S3 client with SSL verification DISABLED (insecure)"
140
+ )
141
+
142
+ self.s3_client = boto3.client(
143
+ "s3",
144
+ endpoint_url=self.endpoint_url,
145
+ aws_access_key_id=access_key,
146
+ aws_secret_access_key=secret_key,
147
+ region_name=region_name or os.getenv("OVH_S3_REGION", "us-east-va"),
148
+ verify=verify_ssl,
149
+ )
150
+
151
+ logger.info(f"S3DatasetLoader initialized for bucket: {bucket}")
152
+
153
+ def _parse_s3_path(self, s3_path: str) -> tuple[str, str]:
154
+ """
155
+ Parse S3 path into bucket and key.
156
+
157
+ Args:
158
+ s3_path: S3 path (s3://bucket/key or just key)
159
+
160
+ Returns:
161
+ Tuple of (bucket, key)
162
+ """
163
+ # If path starts with s3://, it includes bucket
164
+ if s3_path.startswith("s3://"):
165
+ s3_path = s3_path.removeprefix("s3://")
166
+ if "/" in s3_path:
167
+ parts = s3_path.split("/", 1)
168
+ return parts[0], parts[1]
169
+ # s3://bucket-only (no key)
170
+ return s3_path, ""
171
+
172
+ # Otherwise, it's just a key - use configured bucket
173
+ return self.bucket, s3_path
174
+
175
+ def object_exists(self, s3_path: str) -> bool:
176
+ """Check if S3 object exists"""
177
+ try:
178
+ bucket, key = self._parse_s3_path(s3_path)
179
+ self.s3_client.head_object(Bucket=bucket, Key=key)
180
+ return True
181
+ except ClientError as e:
182
+ if e.response["Error"]["Code"] == "404":
183
+ return False
184
+ raise
185
+
186
+ def load_json(
187
+ self,
188
+ s3_path: str,
189
+ cache_local: Path | None = None,
190
+ ) -> dict[str, Any]:
191
+ """
192
+ Load JSON dataset from S3.
193
+
194
+ Args:
195
+ s3_path: S3 path (s3://bucket/key or just key)
196
+ cache_local: Optional local cache path
197
+
198
+ Returns:
199
+ Parsed JSON data
200
+ """
201
+ bucket, key = self._parse_s3_path(s3_path)
202
+
203
+ # Check local cache first
204
+ if cache_local and cache_local.exists():
205
+ logger.info(f"Loading from local cache: {cache_local}")
206
+ with open(cache_local) as f:
207
+ return json.load(f)
208
+
209
+ # Load from S3
210
+ logger.info(f"Loading from S3: s3://{bucket}/{key}")
211
+ try:
212
+ response = self.s3_client.get_object(Bucket=bucket, Key=key)
213
+ data = json.loads(response["Body"].read())
214
+
215
+ # Cache locally if requested
216
+ if cache_local:
217
+ cache_local.parent.mkdir(parents=True, exist_ok=True)
218
+ with open(cache_local, "w") as f:
219
+ json.dump(data, f)
220
+ logger.info(f"Cached to: {cache_local}")
221
+
222
+ return data
223
+ except ClientError as e:
224
+ if e.response["Error"]["Code"] == "NoSuchKey":
225
+ raise FileNotFoundError(
226
+ f"Dataset not found in S3: s3://{bucket}/{key}"
227
+ ) from e
228
+ raise
229
+
230
+ def load_bytes(self, s3_path: str) -> bytes:
231
+ """
232
+ Load raw bytes from S3.
233
+
234
+ Args:
235
+ s3_path: S3 path (s3://bucket/key or just key)
236
+
237
+ Returns:
238
+ Raw bytes of the object body
239
+ """
240
+ bucket, key = self._parse_s3_path(s3_path)
241
+ logger.info(f"Loading bytes from S3: s3://{bucket}/{key}")
242
+
243
+ try:
244
+ response = self.s3_client.get_object(Bucket=bucket, Key=key)
245
+ return response["Body"].read()
246
+ except ClientError as e:
247
+ if e.response["Error"]["Code"] == "NoSuchKey":
248
+ raise FileNotFoundError(
249
+ f"Dataset not found in S3: s3://{bucket}/{key}"
250
+ ) from e
251
+ raise
252
+
253
+ def load_text(
254
+ self,
255
+ s3_path: str,
256
+ *,
257
+ encoding: str = "utf-8",
258
+ errors: str = "replace",
259
+ ) -> str:
260
+ """
261
+ Load a text object from S3.
262
+
263
+ This is primarily for transcript corpora (e.g. .txt) that need to be
264
+ converted into ChatML examples.
265
+ """
266
+ data = self.load_bytes(s3_path)
267
+ return data.decode(encoding, errors=errors)
268
+
269
+ def _parse_jsonl_line(self, line: bytes) -> dict[str, Any] | None:
270
+ """
271
+ Parse a single JSONL line with robust error handling.
272
+
273
+ Args:
274
+ line: Raw bytes of a JSONL line
275
+
276
+ Returns:
277
+ Parsed JSON object or None if parsing failed
278
+ """
279
+ try:
280
+ return json.loads(line.decode("utf-8"))
281
+ except UnicodeDecodeError:
282
+ try:
283
+ return json.loads(line.decode("utf-8", errors="replace"))
284
+ except json.JSONDecodeError as e:
285
+ logger.warning(f"Failed to parse JSONL line: {e}")
286
+ except json.JSONDecodeError as e:
287
+ logger.warning(f"Failed to parse JSONL line: {e}")
288
+ return None
289
+
290
+ def _stream_with_iter_lines(self, body) -> Iterator[dict[str, Any]]:
291
+ """
292
+ Stream JSONL using iter_lines() method.
293
+
294
+ Args:
295
+ body: S3 response body with iter_lines capability
296
+
297
+ Yields:
298
+ Parsed JSON objects
299
+ """
300
+ for raw_line in body.iter_lines():
301
+ if not raw_line:
302
+ continue
303
+ parsed = self._parse_jsonl_line(raw_line)
304
+ if parsed is not None:
305
+ yield parsed
306
+
307
+ def _stream_with_manual_buffering(self, body) -> Iterator[dict[str, Any]]:
308
+ """
309
+ Stream JSONL using manual buffering as fallback.
310
+
311
+ Args:
312
+ body: S3 response body
313
+
314
+ Yields:
315
+ Parsed JSON objects
316
+ """
317
+ buffer = BytesIO()
318
+ for chunk in body.iter_chunks(chunk_size=8192):
319
+ buffer.write(chunk)
320
+ while True:
321
+ buffer.seek(0)
322
+ line = buffer.readline()
323
+ if not line:
324
+ buffer = BytesIO()
325
+ break
326
+ if not line.endswith(b"\n"):
327
+ # Keep incomplete tail in buffer
328
+ rest = buffer.read()
329
+ buffer = BytesIO(line + rest)
330
+ break
331
+
332
+ parsed = self._parse_jsonl_line(line)
333
+ if parsed is not None:
334
+ yield parsed
335
+
336
+ rest = buffer.read()
337
+ buffer = BytesIO(rest)
338
+
339
+ def stream_jsonl(self, s3_path: str) -> Iterator[dict[str, Any]]:
340
+ """
341
+ Stream JSONL dataset from S3 (memory-efficient for large files).
342
+
343
+ Args:
344
+ s3_path: S3 path (s3://bucket/key or just key)
345
+
346
+ Yields:
347
+ Parsed JSON objects (one per line)
348
+ """
349
+ bucket, key = self._parse_s3_path(s3_path)
350
+
351
+ logger.info(f"Streaming JSONL from S3: s3://{bucket}/{key}")
352
+ try:
353
+ response = self.s3_client.get_object(Bucket=bucket, Key=key)
354
+ body = response["Body"]
355
+
356
+ with contextlib.closing(body):
357
+ # Prefer iter_lines() which handles chunk boundaries robustly
358
+ iter_lines = getattr(body, "iter_lines", None)
359
+ if callable(iter_lines):
360
+ yield from self._stream_with_iter_lines(body)
361
+ else:
362
+ # Fallback to manual buffering
363
+ yield from self._stream_with_manual_buffering(body)
364
+
365
+ except ClientError as e:
366
+ if e.response["Error"]["Code"] == "NoSuchKey":
367
+ raise FileNotFoundError(
368
+ f"Dataset not found in S3: s3://{bucket}/{key}"
369
+ ) from e
370
+ raise
371
+
372
+ def list_datasets(self, prefix: str = "gdrive/processed/") -> list[str]:
373
+ """
374
+ List available datasets in S3.
375
+
376
+ Args:
377
+ prefix: S3 prefix to search (default: gdrive/processed/)
378
+
379
+ Returns:
380
+ List of S3 paths
381
+ """
382
+ logger.info(f"Listing datasets with prefix: {prefix}")
383
+ datasets: list[str] = []
384
+
385
+ try:
386
+ paginator = self.s3_client.get_paginator("list_objects_v2")
387
+ pages = paginator.paginate(Bucket=self.bucket, Prefix=prefix)
388
+
389
+ for page in pages:
390
+ if "Contents" in page:
391
+ datasets.extend(
392
+ f"s3://{self.bucket}/{obj['Key']}"
393
+ for obj in page["Contents"]
394
+ if obj["Key"].endswith((".json", ".jsonl"))
395
+ )
396
+
397
+ except ClientError:
398
+ logger.exception("Failed to list S3 objects")
399
+ raise
400
+ return datasets
401
+
402
+ def download_file(self, s3_path: str, local_path: Path | str) -> None:
403
+ """Download a file from S3 to local path"""
404
+ try:
405
+ bucket, key = self._parse_s3_path(s3_path)
406
+ logger.info(f"Downloading s3://{bucket}/{key} to {local_path}")
407
+ self.s3_client.download_file(bucket, key, str(local_path))
408
+ except Exception:
409
+ logger.exception(f"Failed to download {s3_path} to {local_path}")
410
+ raise
411
+
412
+ def upload_file(self, local_path: Path | str, s3_key: str) -> None:
413
+ """Upload a local file to S3"""
414
+ try:
415
+ if not isinstance(local_path, Path):
416
+ local_path = Path(local_path)
417
+
418
+ bucket, key = self._parse_s3_path(s3_key)
419
+
420
+ logger.info(f"Uploading {local_path} to s3://{bucket}/{key}")
421
+ self.s3_client.upload_file(str(local_path), bucket, key)
422
+ except Exception:
423
+ logger.exception(f"Failed to upload {local_path} to {s3_key}")
424
+ raise
425
+
426
+
427
def get_s3_dataset_path(
    dataset_name: str,
    category: str | None = None,
    bucket: str = "pixel-data",
    prefer_processed: bool = True,
) -> str:
    """
    Get S3 path for dataset - S3 is canonical training data location.

    Args:
        dataset_name: Name of the dataset file
        category: Optional category (cot_reasoning, professional_therapeutic, etc.)
        bucket: S3 bucket name
        prefer_processed: Prefer processed/canonical structure over raw

    Returns:
        S3 path (s3://bucket/path)
    """
    loader = S3DatasetLoader(bucket=bucket)

    # Probe candidate locations in priority order: canonical processed,
    # then raw, then the legacy "acquired" prefix.
    candidates: list[str] = []
    if prefer_processed:
        if category:
            candidates.append(
                f"s3://{bucket}/gdrive/processed/{category}/{dataset_name}"
            )
        candidates.append(f"s3://{bucket}/gdrive/raw/{dataset_name}")
    candidates.append(f"s3://{bucket}/acquired/{dataset_name}")

    for path in candidates:
        if loader.object_exists(path):
            return path

    # Nothing exists yet: return the canonical location where it would live.
    if category:
        return f"s3://{bucket}/gdrive/processed/{category}/{dataset_name}"
    return f"s3://{bucket}/gdrive/raw/{dataset_name}"
469
+
470
+
471
def load_dataset_from_s3(
    dataset_name: str,
    category: str | None = None,
    cache_local: Path | None = None,
    bucket: str = "pixel-data",
) -> dict[str, Any]:
    """
    Load dataset from S3 with automatic path resolution.

    Args:
        dataset_name: Name of the dataset file
        category: Optional category for canonical structure
        cache_local: Optional local cache path (applies to JSON datasets)
        bucket: S3 bucket name

    Returns:
        Dataset data; JSONL files are wrapped as {"conversations": [...]}.
    """
    loader = S3DatasetLoader(bucket=bucket)
    s3_path = get_s3_dataset_path(dataset_name, category, bucket)

    if not dataset_name.endswith(".jsonl"):
        return loader.load_json(s3_path, cache_local)
    # JSONL streams are materialized into a list for a uniform return shape.
    return {"conversations": list(loader.stream_jsonl(s3_path))}
utils/subtitle_processor.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from pathlib import Path
3
+
4
+
5
class SubtitleProcessor:
    """Utility for cleaning and formatting YouTube VTT subtitles."""

    # 00:00:00.000 --> 00:00:00.000 (plus any trailing cue settings).
    # Compiled once at class level instead of on every clean_vtt call.
    _TIMESTAMP_RE = re.compile(
        r"\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}.*"
    )
    # Inline cue tags such as <00:00:00.000> and <c>...</c>
    _TAG_RE = re.compile(r"<[^>]+>")
    # Metadata lines emitted by YouTube (Kind:, Language:, etc.)
    _METADATA_PREFIXES = ("Kind:", "Language:", "align:", "position:")

    @staticmethod
    def clean_vtt(vtt_content: str) -> str:
        """
        Clean VTT content by removing timestamps, tags, and duplicates.

        YouTube automatic captions often repeat lines with incremental
        words, so exact-duplicate lines are dropped to keep each unique
        segment once.

        Args:
            vtt_content: Raw contents of a .vtt file.

        Returns:
            The transcript as a single space-joined string.
        """
        lines = vtt_content.split("\n")
        # Drop the WEBVTT header line if present.
        if lines and lines[0].startswith("WEBVTT"):
            lines = lines[1:]

        seen: set[str] = set()
        segments: list[str] = []

        for raw in lines:
            line = raw.strip()
            if not line:
                continue
            # Skip metadata lines; tuple form of startswith checks all
            # prefixes in one call.
            if line.startswith(SubtitleProcessor._METADATA_PREFIXES):
                continue
            if SubtitleProcessor._TIMESTAMP_RE.match(line):
                continue

            text = SubtitleProcessor._TAG_RE.sub("", line).strip()
            if not text or text in seen:
                continue
            seen.add(text)
            segments.append(text)

        # NOTE: fully removing partially-overlapping repeats (e.g.
        # "Hello world" / "Hello world there") would need NLP; only exact
        # duplicate lines are removed here.
        return " ".join(segments)

    @staticmethod
    def format_as_markdown(text: str, metadata: dict) -> str:
        """
        Format cleaned transcript text as a structured Markdown document.

        Args:
            text: Cleaned transcript text.
            metadata: Mapping with optional "title", "channel", "url",
                and "date" keys; sensible defaults are used when missing.

        Returns:
            Markdown with a metadata header and the transcript split into
            paragraphs of roughly six sentences.
        """
        title = metadata.get("title", "Unknown Title")
        channel = metadata.get("channel", "Unknown Channel")
        video_url = metadata.get("url", "")
        date = metadata.get("date", "")

        header = (
            f"# {title}\n\n"
            f"**Channel:** {channel}\n"
            f"**Source:** {video_url}\n"
            f"**Date:** {date}\n\n"
            "## Transcript\n\n"
        )

        # Group sentences into paragraphs of six for readability.
        sentences = re.split(r"(?<=[.!?])\s+", text)
        paragraphs = [
            " ".join(sentences[i : i + 6]) for i in range(0, len(sentences), 6)
        ]
        return header + "\n\n".join(paragraphs) + "\n"
+ return md
utils/transcript_corrector.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import re
4
+ from pathlib import Path
5
+ from typing import Any, Dict
6
+
7
+ # Configure logger
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
class TranscriptCorrector:
    """
    Utility class for correcting transcripts using a multi-pass approach:
    1. Therapeutic Terminology Validation
    2. LLM-based Contextual Correction (Mocked for now)
    3. Structural Alignment (Basic regex cleanup)
    """

    # Filler words removed during structural cleanup, optionally followed
    # by a comma. NOTE(review): this also strips legitimate uses of words
    # like "like" / "I mean" -- confirm that is acceptable for this corpus.
    _FILLER_RE = re.compile(
        r"\b(um|uh|err|ah|like|you know|I mean)\b,?\s*", re.IGNORECASE
    )

    def __init__(self, config_path: str = "ai/config/therapeutic_terminology.json"):
        """
        Initialize the TranscriptCorrector with terminology configuration.

        Args:
            config_path: Path to the JSON configuration file containing
                therapeutic terms.
        """
        self.config_path = Path(config_path)
        # Keys: cptsd_terms, medical_terms, common_misinterpretations.
        self.terms: dict[str, Any] = self._load_terminology()

    @staticmethod
    def _empty_terms() -> dict[str, Any]:
        """Return a fresh, empty terminology config (safe fallback)."""
        return {
            "cptsd_terms": [],
            "medical_terms": [],
            "common_misinterpretations": {},
        }

    def _load_terminology(self) -> dict[str, Any]:
        """
        Load therapeutic terminology from the JSON config.

        Falls back to an empty config (with a logged warning/error) rather
        than raising, so transcript correction degrades gracefully.
        """
        try:
            if not self.config_path.exists():
                # Resolve relative to the package layout: this file lives at
                # ai/utils/..., the config at ai/config/..., so go up 2 levels.
                alt_path = (
                    Path(__file__).parent.parent
                    / "config"
                    / "therapeutic_terminology.json"
                )
                if not alt_path.exists():
                    logger.warning(
                        f"Terminology config not found at {self.config_path} or "
                        f"{alt_path}. Using empty config."
                    )
                    return self._empty_terms()
                self.config_path = alt_path

            with open(self.config_path, "r", encoding="utf-8") as f:
                return json.load(f)
        except Exception:
            # logger.exception keeps the traceback; the previous logger.error
            # call discarded it.
            logger.exception("Failed to load terminology config")
            return self._empty_terms()

    def correct_transcript(self, text: str, context: str = "therapy_session") -> str:
        """
        Main entry point for transcript correction.

        Args:
            text: Single string containing the transcript text to correct.
            context: Context hint for LLM correction.

        Returns:
            Corrected transcript text ("" for empty/blank input).
        """
        if not text or not text.strip():
            return ""

        # Pass 1: Basic Structural Cleanup
        text = self._clean_structure(text)
        # Pass 2: Terminology Replacement
        text = self._apply_terminology_fixes(text)
        # Pass 3: LLM Contextual Correction (Mocked)
        return self._llm_contextual_correction(text, context)

    def _clean_structure(self, text: str) -> str:
        """Remove filler words and normalize whitespace."""
        cleaned = self._FILLER_RE.sub("", text)
        # Collapse whitespace runs left behind by filler removal.
        return re.sub(r"\s+", " ", cleaned).strip()

    def _apply_terminology_fixes(self, text: str) -> str:
        """
        Apply deterministic, case-insensitive terminology fixes from config.

        Config entries are escaped and matched as literal substrings (the
        previous comment claimed word-boundary matching, which the code
        never did).
        """
        misinterpretations = self.terms.get("common_misinterpretations", {})
        for bad_term, good_term in misinterpretations.items():
            pattern = re.compile(re.escape(bad_term), re.IGNORECASE)
            text = pattern.sub(good_term, text)
        return text

    def _llm_contextual_correction(self, text: str, context: str) -> str:
        """
        Mock function for LLM-based correction.

        TODO: Implement the actual LLM call (external service or local
        model) to fix grammar and CPTSD-specific nuance; until then the
        text is returned unchanged.
        """
        return text

    def validate_term_coverage(self, text: str) -> dict[str, float]:
        """
        Calculate naive metrics on how well the transcript uses domain
        terminology. Useful for a validation pass.

        Returns:
            Dict with cptsd_term_count / medical_term_count (ints) and
            domain_coverage_score (fraction of known terms present).
        """
        cptsd_terms = {t.lower() for t in self.terms.get("cptsd_terms", [])}
        medical_terms = {t.lower() for t in self.terms.get("medical_terms", [])}

        haystack = text.lower()
        found_cptsd = sum(term in haystack for term in cptsd_terms)
        found_medical = sum(term in haystack for term in medical_terms)

        total_domain_terms = len(cptsd_terms) + len(medical_terms)
        found_total = found_cptsd + found_medical

        coverage_score = (
            found_total / total_domain_terms if total_domain_terms > 0 else 0.0
        )

        return {
            "cptsd_term_count": found_cptsd,
            "medical_term_count": found_medical,
            "domain_coverage_score": round(coverage_score, 4),
        }
+ }