astronolan committed on
Commit
b944de3
·
1 Parent(s): cf36170

Cleaned up

Browse files
clip/evaluation/__init__.py DELETED
@@ -1,5 +0,0 @@
1
- """Evaluation utilities for CLIP model."""
2
-
3
- from .inference import ClipInferenceModel
4
-
5
- __all__ = ["ClipInferenceModel"]
 
 
 
 
 
 
clip/evaluation/inference.py DELETED
@@ -1,82 +0,0 @@
1
- """
2
- Inference utilities for trained CLIP model.
3
- """
4
-
5
- import torch
6
- import torch.nn.functional as F
7
- import numpy as np
8
- from pathlib import Path
9
- from typing import Union, List, Dict, Tuple
10
- import logging
11
-
12
- from ..models import GalaxyClipModel
13
-
14
- logger = logging.getLogger(__name__)
15
-
16
-
17
class ClipInferenceModel:
    """Wrapper for using trained CLIP model for inference and search.

    Loads a checkpoint containing 'model_config' and 'model_state_dict',
    rebuilds the GalaxyClipModel with the stored configuration, and exposes
    helpers that project raw image/text embeddings into the shared space.
    """

    def __init__(self, model_path: str, device: str = "cpu"):
        """
        Initialize inference model.

        Args:
            model_path: Path to saved model (.pt file)
            device: Device to use for inference
        """
        self.device = torch.device(device)

        # Load checkpoint; it must carry both the architecture config and
        # the trained weights.
        checkpoint = torch.load(model_path, map_location=self.device)
        model_config = checkpoint['model_config']

        # Recreate the model with the same architecture used at train time.
        self.model = GalaxyClipModel(
            image_input_dim=model_config['image_input_dim'],
            text_input_dim=model_config['text_input_dim'],
            embedding_dim=model_config['embedding_dim']
        )

        # Load weights and freeze into inference mode (disables dropout /
        # batch-norm updates).
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.to(self.device)
        self.model.eval()

        self.config = model_config
        logger.info(f"Loaded CLIP model on {device}")
        logger.info(f"Model config: {model_config}")

    def _project(self, embeddings, projector):
        """Shared projection path for encode_images/encode_texts.

        Accepts a single embedding (1-D) or a batch (2-D). A 1-D input is
        temporarily batched and the batch axis is removed again on return,
        so output shape mirrors input shape. Result is returned on CPU.
        """
        tensor = torch.as_tensor(embeddings, dtype=torch.float, device=self.device)

        # Remember whether the caller passed a single vector so we can
        # restore the original rank at the end.
        squeeze = tensor.ndim == 1
        if squeeze:
            tensor = tensor.unsqueeze(0)

        with torch.no_grad():
            # NOTE(review): earlier comments claimed the output is
            # normalized, but no normalization happens here — confirm
            # whether the projector normalizes internally before relying
            # on unit-length outputs.
            out = projector(tensor)

        return out.squeeze(0).cpu() if squeeze else out.cpu()

    def encode_images(self, image_embeddings):
        """Encode image embeddings to shared space."""
        return self._project(image_embeddings, self.model.image_projector)

    def encode_texts(self, text_embeddings):
        """Encode text embeddings to shared space."""
        return self._project(text_embeddings, self.model.text_projector)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
clip/utils/__init__.py DELETED
@@ -1,10 +0,0 @@
1
- """Utility functions for CLIP training and evaluation."""
2
-
3
- from .logging_utils import setup_logging
4
- from .io_utils import save_clip_embeddings_hdf5, inspect_generated_files
5
-
6
- __all__ = [
7
- "setup_logging",
8
- "save_clip_embeddings_hdf5",
9
- "inspect_generated_files"
10
- ]
 
 
 
 
 
 
 
 
 
 
 
clip/utils/data_loader.py DELETED
@@ -1,250 +0,0 @@
1
- """
2
- Data loader for multi-text training using unified parquet file with nested text embeddings.
3
- This loader handles the new unified format from 05_generate_unified_embeddings.py.
4
- """
5
-
6
- import numpy as np
7
- import pandas as pd
8
- import torch
9
- from torch.utils.data import Dataset, DataLoader
10
- import logging
11
- from pathlib import Path
12
- import random
13
-
14
- logger = logging.getLogger(__name__)
15
-
16
-
17
class UnifiedMultiTextDataset(Dataset):
    """Dataset for unified parquet file with multiple text embeddings per galaxy.

    Each parquet row is expected to hold one 'aion_embedding', one original
    'text_embedding', a list of 'augmented_embeddings', and a parallel
    'description_sources' list whose first entry labels the original text.
    The train/val split is derived deterministically from 'galaxy_index'.
    """

    def __init__(self, parquet_path, split="train", train_ratio=0.8,
                 text_sampling_strategy="random", epoch=0, max_train_samples=None,
                 num_embedding=None):
        # Record configuration; `epoch` feeds the per-epoch text sampling
        # seeds and can be updated later via set_epoch().
        self.parquet_path = Path(parquet_path)
        self.split = split
        self.train_ratio = train_ratio
        self.text_sampling_strategy = text_sampling_strategy
        self.epoch = epoch
        self.max_train_samples = max_train_samples
        self.num_embedding = num_embedding

        # Load the full parquet file into memory.
        logger.info(f"Loading unified embeddings from {self.parquet_path}")
        self.df = pd.read_parquet(self.parquet_path)

        # Create train/val split based on galaxy_index
        n_samples = len(self.df)
        indices = np.arange(n_samples)
        # Fixed seed so the split is the same for the train and val
        # instances built from the same file.
        self.seed = 42

        # Deterministic split based on galaxy_index: hash each galaxy to a
        # pseudo-uniform value in [0, 1) and compare against train_ratio.
        # NOTE(review): relies on the built-in hash(); that is stable for
        # int galaxy indices, but if 'galaxy_index' is a string the split
        # changes between interpreter runs (PYTHONHASHSEED) — confirm the
        # column's dtype.
        split_mask = []
        for idx in range(n_samples):
            galaxy_idx = self.df.iloc[idx]['galaxy_index']
            # Hash the galaxy index for deterministic assignment
            sample_hash = hash((galaxy_idx, self.seed)) % 10000 / 10000.0
            is_train = sample_hash < self.train_ratio
            split_mask.append(is_train)

        split_mask = np.array(split_mask)

        if split == "train":
            self.indices = indices[split_mask]
            # Limit training samples if specified (data-scaling experiments);
            # selection is seeded, so the subset is reproducible.
            if self.max_train_samples is not None and len(self.indices) > self.max_train_samples:
                rng = np.random.RandomState(self.seed)
                selected_indices = rng.choice(self.indices, size=self.max_train_samples, replace=False)
                self.indices = np.sort(selected_indices)  # Sort for reproducibility
                logger.info(f"Limited training set to {self.max_train_samples} samples")
        else:
            # Validation split is the complement of the train mask.
            self.indices = indices[~split_mask]

        logger.info(f"Dataset initialized: {len(self.indices)} samples for {split} split")
        logger.info(f"Text sampling strategy: {text_sampling_strategy}")

        # Validate num_embedding parameter for specific_summary strategy.
        if text_sampling_strategy == "specific_summary" and num_embedding is None:
            raise ValueError("num_embedding parameter is required when using 'specific_summary' strategy")

        # Probe the first row to learn how many text variants each galaxy
        # has (assumes every row has the same count — TODO confirm).
        sample_row = self.df.iloc[0]
        n_augmented = len(sample_row['augmented_embeddings'])
        logger.info(f"Each galaxy has 1 original + {n_augmented} augmented embeddings = {1 + n_augmented} total")

        # Validate num_embedding is within valid range for the probed count.
        if text_sampling_strategy == "specific_summary":
            total_embeddings = 1 + n_augmented
            if num_embedding < 0 or num_embedding >= total_embeddings:
                raise ValueError(f"num_embedding must be between 0 and {total_embeddings-1}, got {num_embedding}")
            logger.info(f"Using specific embedding at index {num_embedding}")

    def __len__(self):
        # Length of the active split, not of the whole file.
        return len(self.indices)

    def set_epoch(self, epoch):
        """Set current epoch for round-robin sampling."""
        self.epoch = epoch

    def _get_all_embeddings_and_sources(self, row):
        """Combine original and augmented embeddings into single lists.

        Returns parallel lists: index 0 is the original embedding/source,
        followed by the augmented ones in stored order.
        """
        # Start with original embedding
        all_embeddings = [np.array(row['text_embedding'], dtype=np.float32)]
        all_sources = [row['description_sources'][0]]  # 'original'

        # Add augmented embeddings; sources[1:] are assumed to be aligned
        # one-to-one with augmented_embeddings — TODO confirm upstream.
        for aug_emb, aug_source in zip(row['augmented_embeddings'], row['description_sources'][1:]):
            all_embeddings.append(np.array(aug_emb, dtype=np.float32))
            all_sources.append(aug_source)

        return all_embeddings, all_sources

    def _sample_text_embedding(self, text_embeddings, text_sources, galaxy_idx):
        """Sample one text embedding from multiple options.

        Returns (embedding, source_label, chosen_index). Randomized
        strategies are seeded with galaxy_idx + epoch so a given epoch is
        reproducible while different epochs see different choices.
        """
        n_texts = len(text_embeddings)

        if self.text_sampling_strategy == "original":
            # Always use original text (index 0)
            idx = 0
        elif self.text_sampling_strategy == "summaries-only":
            # Only use summaries (exclude original at index 0)
            if n_texts > 1:
                rng = random.Random(galaxy_idx + self.epoch * 1000000)
                idx = rng.randint(1, n_texts - 1)  # Start from 1 to exclude original
            else:
                # Fallback to original if no summaries available
                idx = 0
        elif self.text_sampling_strategy == "specific_summary":
            # Use the specific embedding index provided
            if self.num_embedding < n_texts:
                idx = self.num_embedding
            else:
                # Fallback to original if index out of range
                logger.warning(f"Requested embedding index {self.num_embedding} out of range for {n_texts} embeddings, using original")
                idx = 0
        elif self.text_sampling_strategy == "random":
            # Random sampling with seed based on galaxy_idx and epoch
            rng = random.Random(galaxy_idx + self.epoch * 1000000)
            idx = rng.randint(0, n_texts - 1)
        elif self.text_sampling_strategy == "round-robin":
            # Cycle through texts based on epoch
            idx = (self.epoch + galaxy_idx) % n_texts
        elif self.text_sampling_strategy == "weighted":
            # Weight towards original (50%) and summaries (50% / n_summaries each)
            rng = random.Random(galaxy_idx + self.epoch * 1000000)
            n_summaries = n_texts - 1
            if n_summaries > 0:
                summary_weight = 0.5 / n_summaries
                weights = [0.5] + [summary_weight] * n_summaries
            else:
                weights = [1.0]
            idx = rng.choices(range(n_texts), weights=weights)[0]
        else:
            idx = 0  # Default to original

        return text_embeddings[idx], text_sources[idx], idx

    def __getitem__(self, idx):
        """Get a single sample with randomly selected text embedding."""
        # Map split-local index to a row position in the full dataframe.
        actual_idx = self.indices[idx]
        row = self.df.iloc[actual_idx]

        # Get AION embedding
        aion_embedding = np.array(row['aion_embedding'], dtype=np.float32)

        # Get all text embeddings and sources
        text_embeddings, text_sources = self._get_all_embeddings_and_sources(row)

        # Sample one text embedding according to the configured strategy.
        galaxy_idx = row['galaxy_index']
        selected_text, selected_source, text_idx = self._sample_text_embedding(
            text_embeddings, text_sources, galaxy_idx
        )

        # Log selection details periodically (every 100th sample)
        if idx % 100 == 0:
            logger.debug(f"Galaxy {galaxy_idx}: Selected {selected_source} (index {text_idx}) from {len(text_sources)} options")

        return {
            'aion_embedding': torch.from_numpy(aion_embedding),
            'text_embedding': torch.from_numpy(selected_text),
            'galaxy_index': galaxy_idx,
            'text_source': selected_source,
            'text_index': text_idx,
            'object_id': row['object_id']
        }
175
-
176
-
177
def create_unified_multi_text_loaders(
    unified_embeddings_path,
    batch_size=64,
    train_ratio=0.8,
    pin_memory=True,
    text_sampling_strategy="random",
    num_workers=4,
    max_train_samples=None,
    num_embedding=None,
    **kwargs
):
    """
    Build the train and validation DataLoaders from the unified parquet file.

    Args:
        unified_embeddings_path: Path to unified parquet file
        batch_size: Batch size for training
        train_ratio: Fraction of samples for training
        pin_memory: Whether to pin memory for GPU transfer
        text_sampling_strategy: How to sample text embeddings ("original",
            "summaries-only", "specific_summary", "random", "round-robin",
            "weighted")
        num_workers: Number of data loading workers
        max_train_samples: Maximum number of training samples (for data
            scaling experiments; applies to the train split only)
        num_embedding: When using "specific_summary" strategy, the index of
            the embedding to use
        **kwargs: Additional arguments (accepted and ignored)

    Returns:
        Tuple of (train_loader, val_loader).

    Raises:
        ValueError: If the parquet file does not exist.
    """
    data_path = Path(unified_embeddings_path)
    if not data_path.exists():
        raise ValueError(f"Unified embeddings file not found: {data_path}")

    logger.info(f"Creating unified multi-text data loaders from {data_path}")
    logger.info(f"Batch size: {batch_size}, Workers: {num_workers}")
    logger.info(f"Text sampling strategy: {text_sampling_strategy}")

    # One dataset per split; both share the same deterministic split logic,
    # and only the training split honors max_train_samples.
    splits = {}
    for split_name, extra in (
        ("train", {"max_train_samples": max_train_samples}),
        ("val", {}),
    ):
        splits[split_name] = UnifiedMultiTextDataset(
            parquet_path=data_path,
            split=split_name,
            train_ratio=train_ratio,
            text_sampling_strategy=text_sampling_strategy,
            num_embedding=num_embedding,
            **extra,
        )

    # Training shuffles and drops the trailing partial batch for stable
    # batch statistics; validation keeps every sample, in order.
    training_loader = DataLoader(
        splits["train"],
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=pin_memory,
        drop_last=True,
    )
    validation_loader = DataLoader(
        splits["val"],
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=pin_memory,
        drop_last=False,
    )

    return training_loader, validation_loader
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
clip/utils/io_utils.py DELETED
@@ -1,103 +0,0 @@
1
- """
2
- I/O utilities for saving and loading CLIP embeddings.
3
- """
4
-
5
- import h5py
6
- import numpy as np
7
- from pathlib import Path
8
- from datetime import datetime
9
- import logging
10
-
11
- logger = logging.getLogger(__name__)
12
-
13
-
14
def save_clip_embeddings_hdf5(
    object_ids,
    galaxy_data,
    text_data,
    aion_clip_embeddings,
    text_clip_embeddings,
    output_dir="data/processed"
):
    """Save CLIP embeddings to separate HDF5 files.

    Writes two files with standardized names inside ``output_dir``: one for
    the AION-side CLIP embeddings (coordinates taken from ``galaxy_data``)
    and one for the text-side CLIP embeddings (coordinates taken from
    ``text_data``).

    Args:
        object_ids: Ordered object identifiers; defines row order everywhere.
        galaxy_data: Mapping object_id -> dict with 'ra', 'dec', 'healpix'.
        text_data: Same structure as galaxy_data, used for the text file.
        aion_clip_embeddings: 2-D array (num_objects, embedding_dim).
        text_clip_embeddings: 2-D array (num_objects, embedding_dim).
        output_dir: Directory for the output files (created if missing).

    Returns:
        Tuple (aion_clip_path, text_clip_path) of the written file paths.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # File paths (standardized names)
    aion_clip_path = output_dir / "galaxy_aion_clip_embeddings.hdf5"
    text_clip_path = output_dir / "galaxy_text_clip_embeddings.hdf5"

    logger.info(f"Saving AION CLIP embeddings to: {aion_clip_path}")
    _write_clip_embeddings_file(
        aion_clip_path, object_ids, galaxy_data,
        'AION_clip_embedding', aion_clip_embeddings,
        'AION embeddings encoded through trained CLIP model'
    )

    logger.info(f"Saving text CLIP embeddings to: {text_clip_path}")
    _write_clip_embeddings_file(
        text_clip_path, object_ids, text_data,
        'text_clip_embedding', text_clip_embeddings,
        'Text embeddings encoded through trained CLIP model'
    )

    return aion_clip_path, text_clip_path


def _write_clip_embeddings_file(path, object_ids, coord_data, dataset_name,
                                embeddings, description):
    """Write one HDF5 embeddings file (shared layout for both outputs).

    Args:
        path: Destination HDF5 file path (overwritten).
        object_ids: Ordered object identifiers.
        coord_data: Mapping object_id -> dict with 'ra', 'dec', 'healpix'.
        dataset_name: Name of the embeddings dataset inside the file.
        embeddings: 2-D array (num_objects, embedding_dim).
        description: Value for the file-level 'description' attribute.
    """
    with h5py.File(path, 'w') as f:
        # Object IDs stored as variable-length strings.
        dt = h5py.special_dtype(vlen=str)
        f.create_dataset('object_id', data=[str(oid) for oid in object_ids], dtype=dt)

        # Coordinates and metadata, row-aligned with object_ids.
        ra_values = np.array([coord_data[oid]['ra'] for oid in object_ids])
        dec_values = np.array([coord_data[oid]['dec'] for oid in object_ids])
        healpix_values = np.array([coord_data[oid]['healpix'] for oid in object_ids])

        f.create_dataset('ra', data=ra_values, dtype=np.float64)
        f.create_dataset('dec', data=dec_values, dtype=np.float64)
        f.create_dataset('healpix', data=healpix_values, dtype=np.int64)

        # The CLIP embeddings themselves.
        f.create_dataset(dataset_name, data=embeddings, dtype=np.float32)

        # File-level metadata.
        f.attrs['description'] = description
        f.attrs['embedding_dim'] = embeddings.shape[1]
        f.attrs['num_objects'] = len(object_ids)
        f.attrs['created'] = datetime.now().isoformat()
83
-
84
-
85
def inspect_generated_files(aion_clip_path, text_clip_path):
    """Inspect the generated HDF5 files.

    Logs, for each of the two files, the dataset names, each dataset's
    shape/dtype, and the file-level attributes.
    """
    # (path, lower-case label for the intro line, title-case label for the
    # datasets line) — labels reproduce the historical log wording exactly.
    targets = (
        (aion_clip_path, "AION", "AION"),
        (text_clip_path, "text", "Text"),
    )

    for path, intro_label, list_label in targets:
        logger.info(f"Inspecting generated {intro_label} CLIP embeddings file...")

        with h5py.File(path, 'r') as handle:
            logger.info(f"{list_label} file datasets: {list(handle.keys())}")
            for name in handle.keys():
                dataset = handle[name]
                logger.info(f"  {name}: shape={dataset.shape}, dtype={dataset.dtype}")
            logger.info(f"  Attributes: {dict(handle.attrs)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
clip/utils/logging_utils.py DELETED
@@ -1,42 +0,0 @@
1
- """Logging utilities."""
2
-
3
- import logging
4
- import sys
5
- from pathlib import Path
6
-
7
-
8
def setup_logging(log_level: str = "INFO", log_file: str = None):
    """
    Configure the root logger for console (and optional file) output.

    Any previously installed handlers are removed first, so calling this
    repeatedly does not duplicate output.

    Args:
        log_level: Logging level name (DEBUG, INFO, WARNING, ERROR)
        log_file: Optional path to log file; parent directories are created

    Returns:
        The configured root logger.
    """
    root = logging.getLogger()
    root.handlers.clear()
    root.setLevel(getattr(logging, log_level.upper()))

    # Single shared format for every handler.
    fmt = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    # Console output goes to stdout rather than the default stderr.
    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setFormatter(fmt)
    root.addHandler(stream_handler)

    # Optionally mirror everything to a file as well.
    if log_file:
        target = Path(log_file)
        target.parent.mkdir(parents=True, exist_ok=True)

        file_handler = logging.FileHandler(target)
        file_handler.setFormatter(fmt)
        root.addHandler(file_handler)

    return root
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main.py DELETED
@@ -1,6 +0,0 @@
1
def main():
    """Entry point: print the project greeting."""
    greeting = "Hello from aion-search!"
    print(greeting)


if __name__ == "__main__":
    main()