Spaces:

Hafnium49
/

crystal-embedder

Sleeping

App Files Files Community

Hafnium13 committed on Feb 3

Commit

668ae63

1 Parent(s): de0ab6c

Initial deployment: Tri-Fusion Crystal Embedder

Browse files

Files changed (4) hide show

Dockerfile +50 -0
README.md +94 -4
main.py +172 -0
requirements.txt +19 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,50 @@

+FROM python:3.10-slim
+# CRITICAL: Set legacy Keras before any TensorFlow imports.
+# Safeguard for MEGNet compatibility should TensorFlow 2.16+ (Keras 3.x) ever be
+# installed; the final RUN step below pins TensorFlow to 2.15.x and installs tf_keras.
+ENV TF_USE_LEGACY_KERAS=1
+WORKDIR /app
+# Install system dependencies (compiler toolchain, OpenMP runtime, and git for
+# the pip "git+https" install below); the apt lists are removed in the same layer.
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    libgomp1 \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+# CRITICAL: Install CPU-only versions to save space and avoid GPU conflicts
+# Install PyTorch CPU first (from special index)
+RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu
+# Install orb-models (required for ORBFeaturizer)
+RUN pip install --no-cache-dir orb-models
+# Install MatterVial (straight from GitHub) plus the scientific and web stack.
+# TensorFlow is pinned to 2.15.x in the next RUN step, not here.
+RUN pip install --no-cache-dir \
+    "mattervial @ git+https://github.com/rogeriog/MatterVial.git" \
+    pymatgen \
+    scikit-learn \
+    pandas \
+    fastapi \
+    uvicorn \
+    python-multipart
+# CRITICAL: Force TensorFlow 2.15 and compatible Keras AFTER all other installs
+# so that nothing installed earlier leaves a newer version in place.
+# TensorFlow 2.16+ uses Keras 3.x which breaks MEGNet's Trainer.compile()
+# tf_keras is required when TF_USE_LEGACY_KERAS=1
+RUN pip install --no-cache-dir --force-reinstall \
+    "tensorflow-cpu>=2.15,<2.16" \
+    "keras>=2.15,<2.16" \
+    "tf_keras>=2.15,<2.16" \
+    "tensorboard>=2.15,<2.16" \
+    "ml-dtypes>=0.3.1,<0.4"
+# Copy the application source (main.py etc.) into /app
+COPY . .
+# HF Spaces requires port 7860
+EXPOSE 7860
+# Launch uvicorn on the Spaces port. --timeout-keep-alive 120 keeps idle HTTP
+# keep-alive connections open for 120 s.
+# NOTE(review): this flag does not extend startup time; model loading happens in
+# main.py's startup hook before requests are served.
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860", "--timeout-keep-alive", "120"]

README.md CHANGED Viewed

@@ -1,10 +1,100 @@
 ---
 title: Crystal Embedder
-emoji: 🌖
-colorFrom: gray
-colorTo: red
 sdk: docker
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Crystal Embedder
+emoji: 🔮
+colorFrom: purple
+colorTo: blue
 sdk: docker
 pinned: false
+license: mit
 ---
+# Crystal-Embedder
+Compute **Tri-Fusion physics embeddings** for crystal structures.
+## What it does
+This service takes a crystal structure in CIF format and returns a 2738-dimensional embedding vector that captures:
+| Component | Dimensions | Description |
+|-----------|------------|-------------|
+| **Orb-v3** | 1792 | Force field features (PyTorch) |
+| **l-MM** | 758 | Electronic structure features (TensorFlow/MEGNet) |
+| **l-OFM** | 188 | Orbital field matrix features (TensorFlow/MEGNet) |
+| **Total** | **2738** | Each component L2-normalized, then concatenated |
+## API
+### `POST /embed`
+Compute embedding for a CIF structure.
+**Request:**
+```json
+{
+  "cif": "data_Si\n_cell_length_a 5.43..."
+}
+```
+**Response:**
+```json
+{
+  "vector": [0.123, -0.456, ...],
+  "dims": 2738
+}
+```
+### `GET /health`
+Health check endpoint.
+**Response:**
+```json
+{
+  "status": "healthy",
+  "models_loaded": true,
+  "vector_dims": 2738
+}
+```
+## Example Usage
+```python
+import httpx
+cif_content = open("structure.cif").read()
+response = httpx.post(
+    "https://hafnium49-crystal-embedder.hf.space/embed",
+    json={"cif": cif_content},
+    timeout=60
+)
+embedding = response.json()["vector"]
+print(f"Embedding shape: {len(embedding)}")  # 2738
+```
+## Performance
+- **Latency:** ~15 seconds per structure (CPU-only)
+- **Memory:** ~8GB peak during inference
+- **Cold start:** ~2 minutes (model loading)
+## Local Development
+```bash
+# Build
+docker build -t crystal-embedder .
+# Run
+docker run -p 7860:7860 crystal-embedder
+# Test
+curl http://localhost:7860/health
+```
+## Powered By
+- [MatterVial](https://github.com/rogeriog/MatterVial) - Physics embeddings
+- [pymatgen](https://pymatgen.org/) - Crystal structure parsing
+- [FastAPI](https://fastapi.tiangolo.com/) - Web framework

main.py ADDED Viewed

	@@ -0,0 +1,172 @@

+"""
+Crystal-Embedder Microservice
+Computes Tri-Fusion embeddings (Orb-v3 + l-MM + l-OFM) for crystal structures.
+Deployed on Hugging Face Spaces (CPU-only, 16GB RAM).
+API:
+    POST /embed
+    Request: {"cif": "<CIF content>"}
+    Response: {"vector": [...], "dims": 2738}
+"""
+import os
+# CRITICAL: Set TensorFlow to use legacy Keras before any imports
+# Required for MEGNet compatibility with TensorFlow 2.16+
+os.environ["TF_USE_LEGACY_KERAS"] = "1"
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import normalize
+from pymatgen.core import Structure
+# FastAPI application object; the route handlers below register on it via decorators.
+app = FastAPI(
+    title="Crystal-Embedder",
+    description="Tri-Fusion physics embeddings for crystal structures",
+    version="1.0.0"
+)
+# Global model instances (loaded at startup)
+# They start out empty and are assigned by load_models(); models_loaded flips to
+# True only after all three featurizers have been constructed.
+orb_model = None
+mm_model = None
+ofm_model = None
+models_loaded = False
+# Request body for POST /embed.
+class CifRequest(BaseModel):
+    # Full text of the CIF file describing the crystal structure.
+    cif: str
+# Response body for POST /embed.
+class EmbedResponse(BaseModel):
+    # Concatenated embedding (Orb-v3, then l-MM, then l-OFM).
+    vector: list[float]
+    # Length of `vector`.
+    dims: int
+# Response body for GET /health.
+class HealthResponse(BaseModel):
+    # "healthy" once the models are loaded, otherwise "loading".
+    status: str
+    # Mirrors the module-level models_loaded flag.
+    models_loaded: bool
+    # Size of the embedding vector the service produces.
+    vector_dims: int
+def load_models():
+    """Load all three featurizer models at startup."""
+    global orb_model, mm_model, ofm_model, models_loaded
+    print("Loading Physics Engines (Orb-v3 + l-MM + l-OFM)...")
+    print("This may take a few minutes on first load...")
+    try:
+        # Import MatterVial featurizers
+        from mattervial.featurizers import ORBFeaturizer, DescriptorMEGNetFeaturizer
+        # Load Orb-v3 (PyTorch, CPU)
+        print("  Loading Orb-v3 (1792-dim)...")
+        orb_model = ORBFeaturizer(model_name="ORB_v3", device="cpu")
+        # Load l-MM (TensorFlow/MEGNet)
+        print("  Loading l-MM (758-dim)...")
+        mm_model = DescriptorMEGNetFeaturizer(base_descriptor='l-MM_v1')
+        # Load l-OFM (TensorFlow/MEGNet)
+        print("  Loading l-OFM (188-dim)...")
+        ofm_model = DescriptorMEGNetFeaturizer(base_descriptor='l-OFM_v1')
+        models_loaded = True
+        print("All models loaded successfully!")
+    except Exception as e:
+        print(f"Error loading models: {e}")
+        raise
+@app.on_event("startup")
+async def startup_event():
+    """Load models when the server starts."""
+    load_models()
+@app.get("/health", response_model=HealthResponse)
+async def health_check():
+    """Health check endpoint."""
+    return HealthResponse(
+        status="healthy" if models_loaded else "loading",
+        models_loaded=models_loaded,
+        vector_dims=2738
+    )
+@app.post("/embed", response_model=EmbedResponse)
+async def embed_structure(req: CifRequest):
+    """
+    Compute Tri-Fusion embedding for a CIF structure.
+    The embedding is a concatenation of three L2-normalized vectors:
+    - Orb-v3: 1792-dim (force field features)
+    - l-MM: 758-dim (electronic structure features)
+    - l-OFM: 188-dim (orbital field matrix features)
+    Total: 2738 dimensions
+    """
+    if not models_loaded:
+        raise HTTPException(status_code=503, detail="Models still loading, please retry")
+    try:
+        # 1. Parse CIF to pymatgen Structure
+        struct = Structure.from_str(req.cif, fmt="cif")
+        s_series = pd.Series([struct])
+        # 2. Compute features (sequential on CPU)
+        print("Computing Orb-v3 features...")
+        vec_orb = orb_model.get_features(s_series).values[0]  # ~10s
+        print("Computing l-MM features...")
+        vec_mm = mm_model.get_features(s_series).values[0]    # ~2s
+        print("Computing l-OFM features...")
+        vec_ofm = ofm_model.get_features(s_series).values[0]  # ~0.5s
+        # 3. Handle NaN values (replace with 0)
+        vec_orb = np.nan_to_num(vec_orb, nan=0.0)
+        vec_mm = np.nan_to_num(vec_mm, nan=0.0)
+        vec_ofm = np.nan_to_num(vec_ofm, nan=0.0)
+        # 4. L2 Normalize each vector (prevents magnitude dominance)
+        v1 = normalize([vec_orb])[0]
+        v2 = normalize([vec_mm])[0]
+        v3 = normalize([vec_ofm])[0]
+        # 5. Concatenate
+        final_vector = np.concatenate([v1, v2, v3])
+        print(f"Embedding complete: {len(final_vector)} dimensions")
+        return EmbedResponse(
+            vector=final_vector.tolist(),
+            dims=len(final_vector)
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Embedding failed: {str(e)}")
+@app.get("/")
+async def root():
+    """Root endpoint with API info."""
+    return {
+        "service": "Crystal-Embedder",
+        "version": "1.0.0",
+        "description": "Tri-Fusion physics embeddings for crystal structures",
+        "endpoints": {
+            "/embed": "POST - Compute embedding from CIF",
+            "/health": "GET - Health check"
+        },
+        "vector_dimensions": {
+            "orb_v3": 1792,
+            "l_mm": 758,
+            "l_ofm": 188,
+            "total": 2738
+        }
+    }

requirements.txt ADDED Viewed

	@@ -0,0 +1,19 @@

+# NOTE(review): the Dockerfile in this commit installs its packages with explicit
+# `pip install` commands and never runs `pip install -r requirements.txt`, so this
+# file is informational for the Docker build. It also differs from the Dockerfile:
+# orb-models is not listed here and TensorFlow is not pinned to 2.15.x.
+# CPU-only PyTorch (installed separately in Dockerfile via --index-url)
+# torch
+# CPU-only TensorFlow
+tensorflow-cpu
+# MatterVial (physics embeddings)
+mattervial @ git+https://github.com/rogeriog/MatterVial.git
+# Scientific stack
+pymatgen
+scikit-learn
+pandas
+numpy
+# Web framework
+fastapi
+uvicorn[standard]
+python-multipart