Spaces:

Smith42
/

perihelion

Sleeping

App Files Community

Smith42 committed on Mar 13

Commit

346f507

1 Parent(s): 7fefd3b

Fix error

Browse files

Files changed (4) hide show

app.py +26 -10
dataset_config.yaml +1 -1
src/elo.py +20 -2
src/galaxy_data_loader.py +40 -4

app.py CHANGED Viewed

@@ -45,16 +45,32 @@ def create_app() -> dash.Dash:
     # Initialize tournament
     logger.info("Loading tournament state...")
     loaded = elo.load_tournament_state()
-    if not loaded:
-        logger.info("No existing tournament found. Streaming new pool...")
-        try:
-            logger.info("Streaming pool of %d galaxies from HF dataset...", POOL_SIZE)
-            pool, metadata_map = sample_pool_streaming(POOL_SIZE)
-            register_metadata(metadata_map)
-            elo.initialize_tournament(pool)
-        except Exception as e:
-            logger.error("Failed to initialize tournament: %s", e)
-            raise
     # Layout and callbacks
     app.layout = create_layout()

     # Initialize tournament
     logger.info("Loading tournament state...")
     loaded = elo.load_tournament_state()
+    # Always re-stream the pool to populate the image + metadata caches.
+    # On reload we reuse the saved seed so the same galaxies are sampled in the
+    # same order, keeping ELO rankings consistent across restarts.
+    seed = elo.get_pool_seed() if loaded else None
+    logger.info(
+        "Streaming pool of %d galaxies (seed=%s)...",
+        POOL_SIZE,
+        seed if seed is not None else "random",
+    )
+    try:
+        pool, metadata_map, used_seed = sample_pool_streaming(POOL_SIZE, seed=seed)
+        register_metadata(metadata_map)
+        if not loaded:
+            elo.initialize_tournament(pool, pool_seed=used_seed)
+        else:
+            # Persist seed into existing state so future reloads can reuse it
+            elo.set_pool_seed(used_seed)
+            logger.info(
+                "Tournament state restored: round %d, %d active galaxies",
+                elo.get_tournament_info().get("current_round", 1),
+                len(pool),
+            )
+    except Exception as e:
+        logger.error("Failed to stream galaxy pool: %s", e)
+        raise
     # Layout and callbacks
     app.layout = create_layout()

dataset_config.yaml CHANGED Viewed

@@ -3,7 +3,7 @@ config: "default"
 split: "train"
 image_column: "image"
 id_column: "id_str"
-pool_size: 3000
 min_comparisons_per_round: 3
 max_comparisons_per_round: 5
 elimination_fraction: 0.5

 split: "train"
 image_column: "image"
 id_column: "id_str"
+pool_size: 1000
 min_comparisons_per_round: 3
 max_comparisons_per_round: 5
 elimination_fraction: 0.5

src/elo.py CHANGED Viewed

@@ -44,6 +44,7 @@ class TournamentState:
         eliminated: list[int] | None = None,
         total_comparisons: int = 0,
         tournament_complete: bool = False,
     ):
         self.active_pool = list(active_pool)
         self.elo_ratings = elo_ratings or {idx: DEFAULT_ELO for idx in active_pool}
@@ -52,6 +53,7 @@ class TournamentState:
         self.eliminated = eliminated or []
         self.total_comparisons = total_comparisons
         self.tournament_complete = tournament_complete
     def to_dict(self) -> dict:
         return {
@@ -62,6 +64,7 @@ class TournamentState:
             "eliminated": self.eliminated,
             "total_comparisons": self.total_comparisons,
             "tournament_complete": self.tournament_complete,
         }
     @classmethod
@@ -74,6 +77,7 @@ class TournamentState:
             eliminated=d.get("eliminated", []),
             total_comparisons=d.get("total_comparisons", 0),
             tournament_complete=d.get("tournament_complete", False),
         )
@@ -94,11 +98,11 @@ def _init_scheduler():
     logger.info("ELO state scheduler initialized (repo=%s)", HF_LOG_REPO_ID)
-def initialize_tournament(pool_indices: list[int]):
     """Create a fresh tournament with the given pool."""
     global _state
     with _lock:
-        _state = TournamentState(active_pool=pool_indices)
     _save_state()
     _init_scheduler()
     logger.info("Tournament initialized with %d galaxies", len(pool_indices))
@@ -333,6 +337,20 @@ def select_pair(seen_pairs: set[tuple[int, int]]) -> tuple[int, int] | None:
     return (pair[0], pair[1])
 def get_tournament_info() -> dict:
     """Return a snapshot of tournament state for the progress dashboard."""
     with _lock:

         eliminated: list[int] | None = None,
         total_comparisons: int = 0,
         tournament_complete: bool = False,
+        pool_seed: int | None = None,
     ):
         self.active_pool = list(active_pool)
         self.elo_ratings = elo_ratings or {idx: DEFAULT_ELO for idx in active_pool}
         self.eliminated = eliminated or []
         self.total_comparisons = total_comparisons
         self.tournament_complete = tournament_complete
+        self.pool_seed = pool_seed
     def to_dict(self) -> dict:
         return {
             "eliminated": self.eliminated,
             "total_comparisons": self.total_comparisons,
             "tournament_complete": self.tournament_complete,
+            "pool_seed": self.pool_seed,
         }
     @classmethod
             eliminated=d.get("eliminated", []),
             total_comparisons=d.get("total_comparisons", 0),
             tournament_complete=d.get("tournament_complete", False),
+            pool_seed=d.get("pool_seed"),
         )
     logger.info("ELO state scheduler initialized (repo=%s)", HF_LOG_REPO_ID)
+def initialize_tournament(pool_indices: list[int], pool_seed: int | None = None):
     """Create a fresh tournament with the given pool."""
     global _state
     with _lock:
+        _state = TournamentState(active_pool=pool_indices, pool_seed=pool_seed)
     _save_state()
     _init_scheduler()
     logger.info("Tournament initialized with %d galaxies", len(pool_indices))
     return (pair[0], pair[1])
+def get_pool_seed() -> int | None:
+    """Return the shuffle seed used when the current pool was sampled."""
+    with _lock:
+        return _state.pool_seed if _state else None
+def set_pool_seed(seed: int):
+    """Store the pool seed into the current tournament state and save."""
+    with _lock:
+        if _state is not None:
+            _state.pool_seed = seed
+    _save_state()
 def get_tournament_info() -> dict:
     """Return a snapshot of tournament state for the progress dashboard."""
     with _lock:

src/galaxy_data_loader.py CHANGED Viewed

@@ -60,6 +60,29 @@ def sample_pool_indices(total: int, pool_size: int) -> list[int]:
 # Row / image fetching
 # ---------------------------------------------------------------------------
 def fetch_rows(offsets: list[int]) -> dict[int, dict]:
     """Fetch rows by offset via the HF dataset-viewer /rows endpoint.
@@ -206,20 +229,33 @@ image_cache = ImageCache()
 # Streaming pool sampler
 # ---------------------------------------------------------------------------
-def sample_pool_streaming(pool_size: int) -> tuple[list[int], dict[int, dict]]:
     """Stream pool_size shuffled galaxies from HF Datasets, pre-caching images.
     Returns:
         ids: sequential ints 0..N-1 used as galaxy IDs throughout the app
         metadata_map: {id -> row_dict (without image column)} for display names
     """
     from datasets import load_dataset
     from datasets import Image as HFImage
     logger.info(
-        "Streaming %d galaxies from %s (shuffle buffer=10000)...",
         pool_size,
         DATASET_ID,
     )
     ds = load_dataset(
@@ -235,7 +271,7 @@ def sample_pool_streaming(pool_size: int) -> tuple[list[int], dict[int, dict]]:
     if features and IMAGE_COLUMN in features:
         ds = ds.cast_column(IMAGE_COLUMN, HFImage(decode=False))
-    ds = ds.shuffle(seed=random.randint(0, 2**32 - 1), buffer_size=10_000)
     ds = ds.take(pool_size)
     ids: list[int] = []
@@ -259,4 +295,4 @@ def sample_pool_streaming(pool_size: int) -> tuple[list[int], dict[int, dict]]:
             logger.info("Streamed %d/%d galaxies", i + 1, pool_size)
     logger.info("Finished streaming %d galaxies", len(ids))
-    return ids, metadata_map

 # Row / image fetching
 # ---------------------------------------------------------------------------
+def fetch_image_bytes(row_index: int) -> bytes | None:
+    """Fetch raw image bytes for a single row via the dataset-viewer API.
+    Uses fetch_rows to get the signed image URL, then downloads the image.
+    Returns None on any failure.
+    """
+    rows = fetch_rows([row_index])
+    row = rows.get(row_index)
+    if row is None:
+        return None
+    img_url = _extract_image_url(row)
+    if not img_url:
+        logger.warning("No image URL in row %d", row_index)
+        return None
+    try:
+        resp = requests.get(img_url, headers=_hf_headers(), timeout=30)
+        resp.raise_for_status()
+        return resp.content
+    except Exception as e:
+        logger.warning("Failed to download image for row %d: %s", row_index, e)
+        return None
 def fetch_rows(offsets: list[int]) -> dict[int, dict]:
     """Fetch rows by offset via the HF dataset-viewer /rows endpoint.
 # Streaming pool sampler
 # ---------------------------------------------------------------------------
+def sample_pool_streaming(
+    pool_size: int, seed: int | None = None
+) -> tuple[list[int], dict[int, dict], int]:
     """Stream pool_size shuffled galaxies from HF Datasets, pre-caching images.
+    Args:
+        pool_size: Number of galaxies to include in the pool.
+        seed: Shuffle seed. If None, a random seed is generated. Pass the same
+              seed on subsequent startups to reproduce the exact same pool order
+              so that saved ELO state remains valid across restarts.
     Returns:
         ids: sequential ints 0..N-1 used as galaxy IDs throughout the app
         metadata_map: {id -> row_dict (without image column)} for display names
+        seed: the seed that was used (store in tournament state for reuse)
     """
     from datasets import load_dataset
     from datasets import Image as HFImage
+    if seed is None:
+        seed = random.randint(0, 2**32 - 1)
     logger.info(
+        "Streaming %d galaxies from %s (shuffle seed=%d)...",
         pool_size,
         DATASET_ID,
+        seed,
     )
     ds = load_dataset(
     if features and IMAGE_COLUMN in features:
         ds = ds.cast_column(IMAGE_COLUMN, HFImage(decode=False))
+    ds = ds.shuffle(seed=seed, buffer_size=10_000)
     ds = ds.take(pool_size)
     ids: list[int] = []
             logger.info("Streamed %d/%d galaxies", i + 1, pool_size)
     logger.info("Finished streaming %d galaxies", len(ids))
+    return ids, metadata_map, seed