Spaces:
Running
Running
Update PolyAgent/orchestrator.py
Browse files- PolyAgent/orchestrator.py +58 -12
PolyAgent/orchestrator.py
CHANGED
|
@@ -20,6 +20,7 @@ import sys
|
|
| 20 |
from pathlib import Path
|
| 21 |
from typing import Dict, Any, List, Optional, Tuple
|
| 22 |
from urllib.parse import urlparse
|
|
|
|
| 23 |
|
| 24 |
import numpy as np
|
| 25 |
import torch
|
|
@@ -76,24 +77,69 @@ SELFIES_AVAILABLE = sf is not None
|
|
| 76 |
# =============================================================================
|
| 77 |
class PathsConfig:
|
| 78 |
"""
|
| 79 |
-
Centralized
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
"""
|
| 81 |
-
# CL weights
|
| 82 |
-
cl_weights_path = "/path/to/multimodal_output_5M/best/pytorch_model.bin"
|
| 83 |
|
| 84 |
-
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
spm_vocab_path = "/path/to/spm_5M.vocab"
|
| 90 |
|
| 91 |
-
|
| 92 |
-
|
| 93 |
|
| 94 |
-
|
| 95 |
-
|
|
|
|
|
|
|
| 96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
# =============================================================================
|
| 99 |
# DOI NORMALIZATION / RESOLUTION HELPERS
|
|
|
|
| 20 |
from pathlib import Path
|
| 21 |
from typing import Dict, Any, List, Optional, Tuple
|
| 22 |
from urllib.parse import urlparse
|
| 23 |
+
from huggingface_hub import snapshot_download
|
| 24 |
|
| 25 |
import numpy as np
|
| 26 |
import torch
|
|
|
|
| 77 |
# =============================================================================
|
| 78 |
class PathsConfig:
    """
    Centralized paths for Spaces/local runs.

    On Hugging Face Spaces:
      - Downloads required artifacts from a HF Model repo (weights) into a local cache dir
      - Exposes stable local filesystem paths used by the rest of orchestrator.py

    Attributes set by __init__:
      hf_repo_id / hf_repo_type       -- source weights repo (env-overridable)
      local_weights_root              -- download target directory
      hf_token                        -- optional HF token (private repos only)
      cl_weights_path                 -- CL checkpoint (pytorch_model.bin)
      spm_model_path / spm_vocab_path -- SentencePiece tokenizer files
      downstream_bestweights_5m_dir   -- downstream heads directory
      inverse_design_5m_dir           -- inverse-design weights directory
      chroma_db_path                  -- local Chroma DB folder (env-overridable)

    Raises:
      RuntimeError       -- if the snapshot download fails (network/auth/repo-id).
      FileNotFoundError  -- if a required file is missing after download.
    """

    def __init__(self):
        # 1) HF model repo where the staged weight bundle was uploaded.
        #    Example: "kaurm43/PolyFusionAgent-weights-5m" (override via env).
        self.hf_repo_id = os.getenv("POLYFUSION_WEIGHTS_REPO", "kaurm43/PolyFusionAgent-weights-5m")
        self.hf_repo_type = os.getenv("POLYFUSION_WEIGHTS_REPO_TYPE", "model")  # usually "model"

        # 2) Where to store downloaded files.
        #    Prefer /data on Spaces with persistent storage; else a user cache folder.
        default_root = (
            "/data/polyfusion_cache"
            if os.path.isdir("/data")
            else os.path.expanduser("~/.cache/polyfusion_cache")
        )
        self.local_weights_root = os.getenv("POLYFUSION_WEIGHTS_DIR", default_root)

        # 3) Optional token (only needed if the weights repo is private).
        self.hf_token = os.getenv("HF_TOKEN", None)

        # 4) Download (cached) and resolve the local folder path.
        #    allow_patterns keeps the download smaller/faster (only what orchestrator needs).
        allow = [
            "tokenizer_spm_5m/**",
            "polyfusion_cl_5m/**",
            "downstream_heads_5m/**",
            "inverse_design_5m/**",
            "MANIFEST.txt",
        ]

        # NOTE: `local_dir_use_symlinks` is deprecated in huggingface_hub >= 0.23
        # and removed in 1.x (passing it raises TypeError there). With `local_dir`
        # set, real files in that directory are the default behavior, so the
        # argument is intentionally omitted.
        try:
            self._weights_dir = snapshot_download(
                repo_id=self.hf_repo_id,
                repo_type=self.hf_repo_type,
                local_dir=self.local_weights_root,
                token=self.hf_token,
                allow_patterns=allow,
            )
        except Exception as err:
            # Fail early with actionable guidance instead of a raw hub traceback.
            raise RuntimeError(
                f"Failed to download weights from '{self.hf_repo_id}' "
                f"(repo_type='{self.hf_repo_type}'). Check POLYFUSION_WEIGHTS_REPO, "
                f"network access, and HF_TOKEN (if the repo is private)."
            ) from err

        # 5) Map to the exact files the existing code expects.
        #    (Only path wiring changes; no behavior changes elsewhere.)
        self.cl_weights_path = os.path.join(self._weights_dir, "polyfusion_cl_5m", "pytorch_model.bin")

        # If the Space also includes a local Chroma DB folder in the Space repo,
        # keep this as-is. Otherwise, Chroma DB can also be hosted as a dataset/model repo.
        self.chroma_db_path = os.getenv("CHROMA_DB_PATH", "chroma_polymer_db_big")

        self.spm_model_path = os.path.join(self._weights_dir, "tokenizer_spm_5m", "spm.model")
        self.spm_vocab_path = os.path.join(self._weights_dir, "tokenizer_spm_5m", "spm.vocab")

        self.downstream_bestweights_5m_dir = os.path.join(self._weights_dir, "downstream_heads_5m")
        self.inverse_design_5m_dir = os.path.join(self._weights_dir, "inverse_design_5m")

        # 6) Sanity-check required files (fail early with a clear message).
        self._assert_exists(self.cl_weights_path, "CL weights")
        self._assert_exists(self.spm_model_path, "SentencePiece model")
        self._assert_exists(self.spm_vocab_path, "SentencePiece vocab")

    @staticmethod
    def _assert_exists(p: str, label: str) -> None:
        """Raise FileNotFoundError with a labeled message if path *p* does not exist."""
        if not os.path.exists(p):
            raise FileNotFoundError(f"{label} not found at: {p}")
|
| 143 |
|
| 144 |
# =============================================================================
|
| 145 |
# DOI NORMALIZATION / RESOLUTION HELPERS
|