Spaces:

asif45
/

llmprop-api

Sleeping

App Files Files Community

Mdasif45 committed 14 days ago

Commit

e620469

1 Parent(s): 316e7a7

Initial FastAPI deployment

Browse files

Files changed (10) hide show

Dockerfile +13 -0
main.py +105 -0
predict_all.py +25 -0
predict_band_gap.py +185 -0
predict_e_above_hull.py +185 -0
predict_epa.py +184 -0
predict_fepa.py +185 -0
predict_is_gap_direct.py +114 -0
predict_volume.py +185 -0
requirements.txt +13 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,13 @@

+# Slim Python 3.10 base image.
+FROM python:3.10-slim
+WORKDIR /app
+# Install the requirements before copying the rest of the sources, so this
+# layer is rebuilt only when requirements.txt changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . .
+# Port the uvicorn server below listens on.
+EXPOSE 7860
+# Serve the FastAPI instance named `app` from main.py on all interfaces.
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

main.py ADDED Viewed

	@@ -0,0 +1,105 @@

+import os
+import shutil
+from pathlib import Path
+from typing import Callable, Dict, Optional
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from huggingface_hub import hf_hub_download
+from pydantic import BaseModel, Field
+# Classification threshold forwarded to predict_all by the /predict endpoint.
+THRESHOLD = 0.33
+# Set by the startup hook; stays None until the predictor modules have been imported.
+predict_all_fn: Optional[Callable[..., Dict[str, object]]] = None
+def _resolve_project_dir() -> Path:
+    """Return the directory the predict_*.py modules treat as the project root.
+
+    Those modules use the first of <this dir>, <parent dir> and
+    <parent dir>/LLM-Prop that contains llmprop_model.py, and read their
+    checkpoints from below it. Checkpoints must be downloaded into that same
+    directory, otherwise the predictors raise "Checkpoint not found" even
+    though the download succeeded (e.g. when every file sits in one flat
+    folder such as /app).
+
+    Returns:
+        The first matching candidate, or the parent of this file's directory
+        (the previous hard-coded value) when no candidate matches.
+    """
+    script_dir = Path(__file__).resolve().parent
+    for candidate in (script_dir, script_dir.parent, script_dir.parent / "LLM-Prop"):
+        if (candidate / "llmprop_model.py").exists():
+            return candidate
+    return script_dir.parent
+PROJECT_DIR = _resolve_project_dir()
+HF_REPO_ID = "asif45/LLM-PROP"
+# Optional access token; None when the HF_TOKEN environment variable is unset.
+HF_TOKEN = os.getenv("HF_TOKEN")
+# Maps the path of each checkpoint relative to PROJECT_DIR to its path inside the Hub repo.
+CHECKPOINT_FILES = {
+    "checkpoints/samples/classification/best_checkpoint_for_is_gap_direct.pt": "checkpoints/samples/classification/best_checkpoint_for_is_gap_direct.pt",
+    "checkpoints/samples/regression/best_checkpoint_for_band_gap.pt": "checkpoints/samples/regression/best_checkpoint_for_band_gap.pt",
+    "checkpoints/samples/regression/best_checkpoint_for_energy_per_atom.pt": "checkpoints/samples/regression/best_checkpoint_for_energy_per_atom.pt",
+    "checkpoints/samples/regression/best_checkpoint_for_e_above_hull.pt": "checkpoints/samples/regression/best_checkpoint_for_e_above_hull.pt",
+    "checkpoints/samples/regression/best_checkpoint_for_fepa.pt": "checkpoints/samples/regression/best_checkpoint_for_fepa.pt",
+    "checkpoints/samples/regression/best_checkpoint_for_volume.pt": "checkpoints/samples/regression/best_checkpoint_for_volume.pt",
+}
+# Request body accepted by POST /predict.
+class PredictRequest(BaseModel):
+    # Required free-text description (the Field default is Ellipsis, i.e. no default value).
+    text: str = Field(..., description="Crystal description text")
+# Response body returned by POST /predict. The endpoint fills every field from
+# the entry of the same name in the dictionary returned by predict_all.
+class PredictResponse(BaseModel):
+    is_gap_direct: str  # classifier label; its possible values are defined in predict_is_gap_direct (not visible here)
+    energy_per_atom: float
+    formation_energy_per_atom: float
+    band_gap: float
+    e_above_hull: float
+    volume: float
+# ASGI application object; the Dockerfile starts it as "main:app".
+app = FastAPI(title="Crystal Property Predictor API")
+# CORS: cross-origin browser requests are accepted only from localhost / 127.0.0.1
+# on ports 8080 and 5173, with credentials and with any method or header.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=[
+        "http://localhost:8080",
+        "http://127.0.0.1:8080",
+        "http://localhost:5173",
+        "http://127.0.0.1:5173",
+    ],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+def ensure_checkpoint_files() -> None:
+    """Download every checkpoint in CHECKPOINT_FILES that is missing under PROJECT_DIR.
+
+    Files that already exist are left untouched. A missing file is fetched
+    with hf_hub_download and copied to a temporary ".part" sibling, which is
+    then renamed onto the final path. An interrupted copy therefore never
+    leaves a truncated checkpoint that a later start-up would skip because
+    the path exists.
+
+    Raises:
+        Whatever hf_hub_download or the file copy raises; nothing is caught here.
+    """
+    for local_relative_path, repo_file_path in CHECKPOINT_FILES.items():
+        local_path = PROJECT_DIR / local_relative_path
+        if local_path.exists():
+            continue
+        local_path.parent.mkdir(parents=True, exist_ok=True)
+        downloaded_path = hf_hub_download(
+            repo_id=HF_REPO_ID,
+            filename=repo_file_path,
+            repo_type="model",
+            token=HF_TOKEN,
+        )
+        partial_path = local_path.with_name(local_path.name + ".part")
+        try:
+            shutil.copy2(downloaded_path, partial_path)
+            # Rename within the same directory, so the final path only ever holds a complete file.
+            os.replace(partial_path, local_path)
+        finally:
+            # Does nothing after a successful rename; removes the leftover after a failed copy.
+            partial_path.unlink(missing_ok=True)
+@app.on_event("startup")
+def load_model_once() -> None:
+    """Startup hook: fetch missing checkpoints, then bind the predictor entry point.
+
+    The import is deferred to this point because the predictor modules read
+    their checkpoint files while they are being imported, so the files have
+    to be on disk first.
+    """
+    global predict_all_fn
+    ensure_checkpoint_files()
+    import predict_all as predictor_module
+    predict_all_fn = predictor_module.predict_all
+@app.get("/health")
+def health() -> Dict[str, object]:
+    """Report liveness and whether the startup hook has bound the predictor yet."""
+    model_ready = predict_all_fn is not None
+    return {"status": "ok", "model_loaded": model_ready}
+@app.post("/predict", response_model=PredictResponse)
+def predict(payload: PredictRequest) -> PredictResponse:
+    """Run all property predictors on the submitted text.
+
+    Raises:
+        HTTPException: 503 while the predictor has not been loaded, 400 when
+            the text is empty after stripping surrounding whitespace.
+    """
+    predictor = predict_all_fn
+    if predictor is None:
+        raise HTTPException(status_code=503, detail="Model is not loaded yet")
+    cleaned_text = payload.text.strip()
+    if cleaned_text == "":
+        raise HTTPException(status_code=400, detail="Text input cannot be empty")
+    raw_predictions = predictor(cleaned_text, threshold=THRESHOLD)
+    # Only these entries are exposed; anything else the predictor returns is dropped.
+    exposed_fields = (
+        "is_gap_direct",
+        "energy_per_atom",
+        "formation_energy_per_atom",
+        "band_gap",
+        "e_above_hull",
+        "volume",
+    )
+    return PredictResponse(**{name: raw_predictions[name] for name in exposed_fields})

predict_all.py ADDED Viewed

	@@ -0,0 +1,25 @@

+from predict_is_gap_direct import predict
+from predict_epa import predict_epa
+from predict_fepa import predict_fepa
+from predict_band_gap import predict_band_gap
+from predict_e_above_hull import predict_e_above_hull
+from predict_volume import predict_volume
+def predict_all(text, threshold=0.33, max_length=256):
+    """Run the classifier and the five regressors on ``text``.
+
+    ``threshold`` is forwarded to the is_gap_direct classifier only and
+    ``max_length`` to each regressor only. Returns a dict with the keys
+    is_gap_direct, confidence, energy_per_atom, formation_energy_per_atom,
+    band_gap, e_above_hull and volume.
+    """
+    label, score = predict(text, threshold=threshold)
+    results = {"is_gap_direct": label, "confidence": score}
+    # Same call order as the result keys.
+    regressors = (
+        ("energy_per_atom", predict_epa),
+        ("formation_energy_per_atom", predict_fepa),
+        ("band_gap", predict_band_gap),
+        ("e_above_hull", predict_e_above_hull),
+        ("volume", predict_volume),
+    )
+    for key, regressor in regressors:
+        results[key] = regressor(text, max_length=max_length)
+    return results

predict_band_gap.py ADDED Viewed

	@@ -0,0 +1,185 @@

+import argparse
+import contextlib
+import io
+import logging
+import os
+import sys
+import pandas as pd
+import torch
+from transformers import AutoTokenizer, T5EncoderModel
+from transformers.utils import logging as transformers_logging
+# Locate the project root: the first candidate directory that holds llmprop_model.py.
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+PROJECT_CANDIDATES = [
+    SCRIPT_DIR,
+    os.path.dirname(SCRIPT_DIR),
+    os.path.join(os.path.dirname(SCRIPT_DIR), "LLM-Prop"),
+]
+for candidate in PROJECT_CANDIDATES:
+    if os.path.exists(os.path.join(candidate, "llmprop_model.py")):
+        PROJECT_DIR = candidate
+        break
+else:
+    # Every candidate was checked without a match.
+    raise FileNotFoundError(
+        "Could not locate project root containing llmprop_model.py. "
+        "Expected near the deployment folder."
+    )
+# Put the project root on the import path before importing the model class from it.
+if os.path.isdir(PROJECT_DIR):
+    if PROJECT_DIR not in sys.path:
+        sys.path.insert(0, PROJECT_DIR)
+from llmprop_model import T5Predictor
+def z_denormalize(scaled_labels, labels_mean, labels_std):
+    """Undo z-score scaling: multiply by ``labels_std`` and add ``labels_mean``."""
+    rescaled = scaled_labels * labels_std
+    return rescaled + labels_mean
+# -------------------------
+# CONFIG
+# -------------------------
+# Fine-tuned checkpoint for this property, located below the project root.
+MODEL_PATH = os.path.join(
+    PROJECT_DIR,
+    "checkpoints",
+    "samples",
+    "regression",
+    "best_checkpoint_for_band_gap.pt",
+)
+# Tokenizer directory that is later passed to AutoTokenizer.from_pretrained.
+TOKENIZER_PATH = os.path.join(
+    PROJECT_DIR,
+    "tokenizers",
+    "t5_tokenizer_trained_on_modified_part_of_C4_and_textedge",
+)
+# CSV whose PROPERTY_NAME column supplies the mean/std used to de-normalize predictions.
+TRAIN_DATA_PATH = os.path.join(PROJECT_DIR, "data", "samples", "train_data.csv")
+# Column of the training CSV that this module predicts.
+PROPERTY_NAME = "band_gap"
+# Inference is pinned to the CPU.
+DEVICE = torch.device("cpu")
+# Silence HF/Transformers startup logs for cleaner terminal output.
+# setdefault keeps any value that is already set in the environment.
+os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
+os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
+transformers_logging.set_verbosity_error()
+logging.getLogger("huggingface_hub").setLevel(logging.ERROR)
+# -------------------------
+# PATH CHECKS
+# -------------------------
+# Fail at import time with a specific message for whichever artifact is missing.
+if not os.path.exists(MODEL_PATH):
+    raise FileNotFoundError(f"Checkpoint not found: {MODEL_PATH}")
+if not os.path.exists(TOKENIZER_PATH):
+    raise FileNotFoundError(f"Tokenizer path not found: {TOKENIZER_PATH}")
+if not os.path.exists(TRAIN_DATA_PATH):
+    raise FileNotFoundError(f"Training data not found: {TRAIN_DATA_PATH}")
+# -------------------------
+# LOAD TRAIN LABEL STATS (z_norm)
+# -------------------------
+train_df = pd.read_csv(TRAIN_DATA_PATH)
+if PROPERTY_NAME not in train_df.columns:
+    raise ValueError(f"Column '{PROPERTY_NAME}' not found in {TRAIN_DATA_PATH}")
+# Missing labels are dropped before the statistics are computed.
+train_labels = torch.tensor(
+    train_df[PROPERTY_NAME].dropna().to_numpy(),
+    dtype=torch.float32,
+)
+if train_labels.numel() == 0:
+    raise ValueError(f"No non-null values found for '{PROPERTY_NAME}' in {TRAIN_DATA_PATH}")
+TRAIN_LABEL_MEAN = torch.mean(train_labels)
+TRAIN_LABEL_STD = torch.std(train_labels)
+# torch.std applies Bessel's correction by default, so a single label (or any
+# infinite label) gives NaN. NaN passes the equality check against 0.0 below
+# and would silently turn every prediction into NaN, so reject it explicitly.
+if not bool(torch.isfinite(TRAIN_LABEL_STD)):
+    raise ValueError(
+        f"Standard deviation for '{PROPERTY_NAME}' is not finite; "
+        "z_norm de-normalization needs at least two finite training labels"
+    )
+if float(TRAIN_LABEL_STD) == 0.0:
+    raise ValueError(
+        f"Standard deviation for '{PROPERTY_NAME}' is 0.0; z_norm de-normalization is undefined"
+    )
+def _quiet_call(fn, *args, **kwargs):
+    """Call ``fn(*args, **kwargs)`` with stdout and stderr discarded; return its result."""
+    discarded_out = io.StringIO()
+    discarded_err = io.StringIO()
+    with contextlib.redirect_stdout(discarded_out):
+        with contextlib.redirect_stderr(discarded_err):
+            return fn(*args, **kwargs)
+# -------------------------
+# LOAD TOKENIZER
+# -------------------------
+tokenizer = _quiet_call(AutoTokenizer.from_pretrained, TOKENIZER_PATH)
+# -------------------------
+# LOAD MODEL
+# -------------------------
+# The base encoder is identified by its Hub name; the fine-tuned weights are applied further down.
+base_model = _quiet_call(T5EncoderModel.from_pretrained, "google/t5-v1_1-small")
+base_model_output_size = 512
+# Match embedding matrix size to the tokenizer used during training.
+base_model.resize_token_embeddings(len(tokenizer), mean_resizing=False)
+model = T5Predictor(
+    base_model,
+    base_model_output_size,
+    drop_rate=0.1,
+    pooling="mean",
+)
+# -------------------------
+# LOAD WEIGHTS
+# -------------------------
+# SECURITY NOTE(review): torch.load unpickles the file, so only checkpoints from a
+# trusted source may be loaded here. Consider weights_only=True once it is
+# confirmed that the checkpoint holds nothing but tensors.
+state_dict = _quiet_call(torch.load, MODEL_PATH, map_location=DEVICE)
+# Some checkpoints were trained with an extra tokenizer token; align embedding size to checkpoint.
+checkpoint_vocab_size = state_dict["model.shared.weight"].shape[0]
+if model.model.shared.weight.shape[0] != checkpoint_vocab_size:
+    model.model.resize_token_embeddings(checkpoint_vocab_size, mean_resizing=False)
+# strict=False accepts checkpoints whose keys do not match the model. Report any
+# mismatch instead of silently predicting with weights that were never loaded.
+_load_result = model.load_state_dict(state_dict, strict=False)
+if _load_result.missing_keys or _load_result.unexpected_keys:
+    logging.getLogger(__name__).warning(
+        "Checkpoint %s does not match the model exactly (missing keys: %s; unexpected keys: %s)",
+        MODEL_PATH,
+        _load_result.missing_keys,
+        _load_result.unexpected_keys,
+    )
+model.to(DEVICE)
+model.eval()
+# -------------------------
+# PREDICT FUNCTION
+# -------------------------
+def predict_band_gap(text, max_length=256):
+    """Return the predicted band_gap for ``text`` as a Python number.
+
+    The text is tokenized (truncated to ``max_length`` tokens) and passed
+    through the module-level model with gradient tracking disabled. The second
+    element of the model output is then mapped back to label units with the
+    training-set mean and standard deviation.
+    """
+    encoded = tokenizer(
+        text,
+        max_length=max_length,
+        truncation=True,
+        padding=True,
+        return_tensors="pt",
+    )
+    token_ids = encoded["input_ids"].to(DEVICE)
+    mask = encoded["attention_mask"].to(DEVICE)
+    with torch.no_grad():
+        _, scaled_prediction = model(token_ids, mask)
+        restored = z_denormalize(scaled_prediction.squeeze(), TRAIN_LABEL_MEAN, TRAIN_LABEL_STD)
+        return restored.item()
+# -------------------------
+# TEST
+# -------------------------
+if __name__ == "__main__":
+    # Command-line smoke test: predicts for --text, which defaults to a built-in sample description.
+    cli = argparse.ArgumentParser(description="Predict band_gap from text")
+    cli.add_argument("--max_length", type=int, default=256, help="Tokenizer max length")
+    cli.add_argument(
+        "--text",
+        type=str,
+        default="A simple cubic crystalLiAl(MoO₄)₂ crystallizes in the triclinic P̅1 space group. Li¹⁺ is bonded in a 5-coordinate geometry to five O²⁻ atoms. There are a spread of Li–O bond distances ranging from 1.98–2.25 Å. There are two inequivalent Mo⁶⁺ sites. In the first Mo⁶⁺ site, Mo⁶⁺ is bonded in a 4-coordinate geometry to five O²⁻ atoms. There are a spread of Mo–O bond distances ranging from 1.74–2.46 Å. In the second Mo⁶⁺ site, Mo⁶⁺ is bonded to four O²⁻ atoms to form MoO₄ tetrahedra that share corners with three equivalent AlO₆ octahedra. The corner-sharing octahedral tilt angles range from 15–44°. There are a spread of Mo–O bond distances ranging from 1.77–1.82 Å. Al³⁺ is bonded to six O²⁻ atoms to form AlO₆ octahedra that share corners with three equivalent MoO₄ tetrahedra and  an edgeedge with one AlO₆ octahedra. There are a spread of Al–O bond distances ranging from 1.88–1.95 Å. There are eight inequivalent O²⁻ sites. In the first O²⁻ site, O²⁻ is bonded in a distorted trigonal planar geometry to one Li¹⁺, one Mo⁶⁺, and one Al³⁺ atom. In the second O²⁻ site, O²⁻ is bonded in a distorted trigonal planar geometry to one Mo⁶⁺ and two equivalent Al³⁺ atoms. In the third O²⁻ site, O²⁻ is bonded in a bent 150 degrees geometry to one Li¹⁺ and one Mo⁶⁺ atom. In the fourth O²⁻ site, O²⁻ is bonded in a linear geometry to one Li¹⁺ and one Mo⁶⁺ atom. In the fifth O²⁻ site, O²⁻ is bonded in a linear geometry to one Mo⁶⁺ and one Al³⁺ atom. In the sixth O²⁻ site, O²⁻ is bonded in a bent 150 degrees geometry to one Li¹⁺ and one Mo⁶⁺ atom. In the seventh O²⁻ site, O²⁻ is bonded in a 4-coordinate geometry to one Li¹⁺, two equivalent Mo⁶⁺, and one Al³⁺ atom. In the eighth O²⁻ site, O²⁻ is bonded in a bent 150 degrees geometry to one Mo⁶⁺ and one Al³⁺ atom. with atoms arranged periodically and stable at room temperature.",
+        help="Input text to predict band gap",
+    )
+    options = cli.parse_args()
+    predicted = predict_band_gap(options.text, max_length=options.max_length)
+    print(f"Predicted band_gap: {predicted:.6f}")

predict_e_above_hull.py ADDED Viewed

	@@ -0,0 +1,185 @@

+import argparse
+import contextlib
+import io
+import logging
+import os
+import sys
+import pandas as pd
+import torch
+from transformers import AutoTokenizer, T5EncoderModel
+from transformers.utils import logging as transformers_logging
+# Locate the project root: the first candidate directory that holds llmprop_model.py.
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+PROJECT_CANDIDATES = [
+    SCRIPT_DIR,
+    os.path.dirname(SCRIPT_DIR),
+    os.path.join(os.path.dirname(SCRIPT_DIR), "LLM-Prop"),
+]
+for candidate in PROJECT_CANDIDATES:
+    if os.path.exists(os.path.join(candidate, "llmprop_model.py")):
+        PROJECT_DIR = candidate
+        break
+else:
+    # Every candidate was checked without a match.
+    raise FileNotFoundError(
+        "Could not locate project root containing llmprop_model.py. "
+        "Expected near the deployment folder."
+    )
+# Put the project root on the import path before importing the model class from it.
+if os.path.isdir(PROJECT_DIR):
+    if PROJECT_DIR not in sys.path:
+        sys.path.insert(0, PROJECT_DIR)
+from llmprop_model import T5Predictor
+def z_denormalize(scaled_labels, labels_mean, labels_std):
+    """Undo z-score scaling: multiply by ``labels_std`` and add ``labels_mean``."""
+    rescaled = scaled_labels * labels_std
+    return rescaled + labels_mean
+# -------------------------
+# CONFIG
+# -------------------------
+# Fine-tuned checkpoint for this property, located below the project root.
+MODEL_PATH = os.path.join(
+    PROJECT_DIR,
+    "checkpoints",
+    "samples",
+    "regression",
+    "best_checkpoint_for_e_above_hull.pt",
+)
+# Tokenizer directory that is later passed to AutoTokenizer.from_pretrained.
+TOKENIZER_PATH = os.path.join(
+    PROJECT_DIR,
+    "tokenizers",
+    "t5_tokenizer_trained_on_modified_part_of_C4_and_textedge",
+)
+# CSV whose PROPERTY_NAME column supplies the mean/std used to de-normalize predictions.
+TRAIN_DATA_PATH = os.path.join(PROJECT_DIR, "data", "samples", "train_data.csv")
+# Column of the training CSV that this module predicts.
+PROPERTY_NAME = "e_above_hull"
+# Inference is pinned to the CPU.
+DEVICE = torch.device("cpu")
+# Silence HF/Transformers startup logs for cleaner terminal output.
+# setdefault keeps any value that is already set in the environment.
+os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
+os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
+transformers_logging.set_verbosity_error()
+logging.getLogger("huggingface_hub").setLevel(logging.ERROR)
+# -------------------------
+# PATH CHECKS
+# -------------------------
+# Fail at import time with a specific message for whichever artifact is missing.
+if not os.path.exists(MODEL_PATH):
+    raise FileNotFoundError(f"Checkpoint not found: {MODEL_PATH}")
+if not os.path.exists(TOKENIZER_PATH):
+    raise FileNotFoundError(f"Tokenizer path not found: {TOKENIZER_PATH}")
+if not os.path.exists(TRAIN_DATA_PATH):
+    raise FileNotFoundError(f"Training data not found: {TRAIN_DATA_PATH}")
+# -------------------------
+# LOAD TRAIN LABEL STATS (z_norm)
+# -------------------------
+train_df = pd.read_csv(TRAIN_DATA_PATH)
+if PROPERTY_NAME not in train_df.columns:
+    raise ValueError(f"Column '{PROPERTY_NAME}' not found in {TRAIN_DATA_PATH}")
+# Missing labels are dropped before the statistics are computed.
+train_labels = torch.tensor(
+    train_df[PROPERTY_NAME].dropna().to_numpy(),
+    dtype=torch.float32,
+)
+if train_labels.numel() == 0:
+    raise ValueError(f"No non-null values found for '{PROPERTY_NAME}' in {TRAIN_DATA_PATH}")
+TRAIN_LABEL_MEAN = torch.mean(train_labels)
+TRAIN_LABEL_STD = torch.std(train_labels)
+# torch.std applies Bessel's correction by default, so a single label (or any
+# infinite label) gives NaN. NaN passes the equality check against 0.0 below
+# and would silently turn every prediction into NaN, so reject it explicitly.
+if not bool(torch.isfinite(TRAIN_LABEL_STD)):
+    raise ValueError(
+        f"Standard deviation for '{PROPERTY_NAME}' is not finite; "
+        "z_norm de-normalization needs at least two finite training labels"
+    )
+if float(TRAIN_LABEL_STD) == 0.0:
+    raise ValueError(
+        f"Standard deviation for '{PROPERTY_NAME}' is 0.0; z_norm de-normalization is undefined"
+    )
+def _quiet_call(fn, *args, **kwargs):
+    """Call ``fn(*args, **kwargs)`` with stdout and stderr discarded; return its result."""
+    discarded_out = io.StringIO()
+    discarded_err = io.StringIO()
+    with contextlib.redirect_stdout(discarded_out):
+        with contextlib.redirect_stderr(discarded_err):
+            return fn(*args, **kwargs)
+# -------------------------
+# LOAD TOKENIZER
+# -------------------------
+tokenizer = _quiet_call(AutoTokenizer.from_pretrained, TOKENIZER_PATH)
+# -------------------------
+# LOAD MODEL
+# -------------------------
+# The base encoder is identified by its Hub name; the fine-tuned weights are applied further down.
+base_model = _quiet_call(T5EncoderModel.from_pretrained, "google/t5-v1_1-small")
+base_model_output_size = 512
+# Match embedding matrix size to the tokenizer used during training.
+base_model.resize_token_embeddings(len(tokenizer), mean_resizing=False)
+model = T5Predictor(
+    base_model,
+    base_model_output_size,
+    drop_rate=0.1,
+    pooling="mean",
+)
+# -------------------------
+# LOAD WEIGHTS
+# -------------------------
+# SECURITY NOTE(review): torch.load unpickles the file, so only checkpoints from a
+# trusted source may be loaded here. Consider weights_only=True once it is
+# confirmed that the checkpoint holds nothing but tensors.
+state_dict = _quiet_call(torch.load, MODEL_PATH, map_location=DEVICE)
+# Some checkpoints were trained with an extra tokenizer token; align embedding size to checkpoint.
+checkpoint_vocab_size = state_dict["model.shared.weight"].shape[0]
+if model.model.shared.weight.shape[0] != checkpoint_vocab_size:
+    model.model.resize_token_embeddings(checkpoint_vocab_size, mean_resizing=False)
+# strict=False accepts checkpoints whose keys do not match the model. Report any
+# mismatch instead of silently predicting with weights that were never loaded.
+_load_result = model.load_state_dict(state_dict, strict=False)
+if _load_result.missing_keys or _load_result.unexpected_keys:
+    logging.getLogger(__name__).warning(
+        "Checkpoint %s does not match the model exactly (missing keys: %s; unexpected keys: %s)",
+        MODEL_PATH,
+        _load_result.missing_keys,
+        _load_result.unexpected_keys,
+    )
+model.to(DEVICE)
+model.eval()
+# -------------------------
+# PREDICT FUNCTION
+# -------------------------
+def predict_e_above_hull(text, max_length=256):
+    """Return the predicted e_above_hull for ``text`` as a Python number.
+
+    The text is tokenized (truncated to ``max_length`` tokens) and passed
+    through the module-level model with gradient tracking disabled. The second
+    element of the model output is then mapped back to label units with the
+    training-set mean and standard deviation.
+    """
+    encoded = tokenizer(
+        text,
+        max_length=max_length,
+        truncation=True,
+        padding=True,
+        return_tensors="pt",
+    )
+    token_ids = encoded["input_ids"].to(DEVICE)
+    mask = encoded["attention_mask"].to(DEVICE)
+    with torch.no_grad():
+        _, scaled_prediction = model(token_ids, mask)
+        restored = z_denormalize(scaled_prediction.squeeze(), TRAIN_LABEL_MEAN, TRAIN_LABEL_STD)
+        return restored.item()
+# -------------------------
+# TEST
+# -------------------------
+if __name__ == "__main__":
+    # Command-line smoke test: predicts for --text, which defaults to a built-in sample description.
+    cli = argparse.ArgumentParser(description="Predict e_above_hull from text")
+    cli.add_argument("--max_length", type=int, default=256, help="Tokenizer max length")
+    cli.add_argument(
+        "--text",
+        type=str,
+        default="A simple cubic crystalLiAl(MoO₄)₂ crystallizes in the triclinic P̅1 space group. Li¹⁺ is bonded in a 5-coordinate geometry to five O²⁻ atoms. There are a spread of Li-O bond distances ranging from 1.98-2.25 A. There are two inequivalent Mo⁶⁺ sites. In the first Mo⁶⁺ site, Mo⁶⁺ is bonded in a 4-coordinate geometry to five O²⁻ atoms. There are a spread of Mo-O bond distances ranging from 1.74-2.46 A. In the second Mo⁶⁺ site, Mo⁶⁺ is bonded to four O²⁻ atoms to form MoO₄ tetrahedra that share corners with three equivalent AlO₆ octahedra. The corner-sharing octahedral tilt angles range from 15-44 degrees. There are a spread of Mo-O bond distances ranging from 1.77-1.82 A. Al³⁺ is bonded to six O²⁻ atoms to form AlO₆ octahedra that share corners with three equivalent MoO₄ tetrahedra and an edgeedge with one AlO₆ octahedra. There are a spread of Al-O bond distances ranging from 1.88-1.95 A. There are eight inequivalent O²⁻ sites. In the first O²⁻ site, O²⁻ is bonded in a distorted trigonal planar geometry to one Li¹⁺, one Mo⁶⁺, and one Al³⁺ atom. In the second O²⁻ site, O²⁻ is bonded in a distorted trigonal planar geometry to one Mo⁶⁺ and two equivalent Al³⁺ atoms. In the third O²⁻ site, O²⁻ is bonded in a bent 150 degrees geometry to one Li¹⁺ and one Mo⁶⁺ atom. In the fourth O²⁻ site, O²⁻ is bonded in a linear geometry to one Li¹⁺ and one Mo⁶⁺ atom. In the fifth O²⁻ site, O²⁻ is bonded in a linear geometry to one Mo⁶⁺ and one Al³⁺ atom. In the sixth O²⁻ site, O²⁻ is bonded in a bent 150 degrees geometry to one Li¹⁺ and one Mo⁶⁺ atom. In the seventh O²⁻ site, O²⁻ is bonded in a 4-coordinate geometry to one Li¹⁺, two equivalent Mo⁶⁺, and one Al³⁺ atom. In the eighth O²⁻ site, O²⁻ is bonded in a bent 150 degrees geometry to one Mo⁶⁺ and one Al³⁺ atom. with atoms arranged periodically and stable at room temperature.",
+        help="Input text to predict e_above_hull",
+    )
+    options = cli.parse_args()
+    predicted = predict_e_above_hull(options.text, max_length=options.max_length)
+    print(f"Predicted e_above_hull: {predicted:.6f}")

predict_epa.py ADDED Viewed

	@@ -0,0 +1,184 @@

+import argparse
+import os
+import sys
+import logging
+import io
+import contextlib
+import pandas as pd
+import torch
+from transformers import AutoTokenizer, T5EncoderModel
+from transformers.utils import logging as transformers_logging
+# Locate the project root: the first candidate directory that holds llmprop_model.py.
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+PROJECT_CANDIDATES = [
+    SCRIPT_DIR,
+    os.path.dirname(SCRIPT_DIR),
+    os.path.join(os.path.dirname(SCRIPT_DIR), "LLM-Prop"),
+]
+for candidate in PROJECT_CANDIDATES:
+    if os.path.exists(os.path.join(candidate, "llmprop_model.py")):
+        PROJECT_DIR = candidate
+        break
+else:
+    # Every candidate was checked without a match.
+    raise FileNotFoundError(
+        "Could not locate project root containing llmprop_model.py. "
+        "Expected near the deployment folder."
+    )
+# Put the project root on the import path before importing the model class from it.
+if os.path.isdir(PROJECT_DIR):
+    if PROJECT_DIR not in sys.path:
+        sys.path.insert(0, PROJECT_DIR)
+from llmprop_model import T5Predictor
+def z_denormalize(scaled_labels, labels_mean, labels_std):
+    """Undo z-score scaling: multiply by ``labels_std`` and add ``labels_mean``."""
+    rescaled = scaled_labels * labels_std
+    return rescaled + labels_mean
+# -------------------------
+# CONFIG
+# -------------------------
+# Fine-tuned checkpoint for this property, located below the project root.
+MODEL_PATH = os.path.join(
+    PROJECT_DIR,
+    "checkpoints",
+    "samples",
+    "regression",
+    "best_checkpoint_for_energy_per_atom.pt",
+)
+# Tokenizer directory that is later passed to AutoTokenizer.from_pretrained.
+TOKENIZER_PATH = os.path.join(
+    PROJECT_DIR,
+    "tokenizers",
+    "t5_tokenizer_trained_on_modified_part_of_C4_and_textedge",
+)
+# CSV whose PROPERTY_NAME column supplies the mean/std used to de-normalize predictions.
+TRAIN_DATA_PATH = os.path.join(PROJECT_DIR, "data", "samples", "train_data.csv")
+# Column of the training CSV that this module predicts.
+PROPERTY_NAME = "energy_per_atom"
+# Inference is pinned to the CPU.
+DEVICE = torch.device("cpu")
+# Silence HF/Transformers startup logs for cleaner terminal output.
+# setdefault keeps any value that is already set in the environment.
+os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
+os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
+transformers_logging.set_verbosity_error()
+logging.getLogger("huggingface_hub").setLevel(logging.ERROR)
+# -------------------------
+# PATH CHECKS
+# -------------------------
+# Fail at import time with a specific message for whichever artifact is missing.
+if not os.path.exists(MODEL_PATH):
+    raise FileNotFoundError(f"Checkpoint not found: {MODEL_PATH}")
+if not os.path.exists(TOKENIZER_PATH):
+    raise FileNotFoundError(f"Tokenizer path not found: {TOKENIZER_PATH}")
+if not os.path.exists(TRAIN_DATA_PATH):
+    raise FileNotFoundError(f"Training data not found: {TRAIN_DATA_PATH}")
+# -------------------------
+# LOAD TRAIN LABEL STATS (z_norm)
+# -------------------------
+train_df = pd.read_csv(TRAIN_DATA_PATH)
+if PROPERTY_NAME not in train_df.columns:
+    raise ValueError(f"Column '{PROPERTY_NAME}' not found in {TRAIN_DATA_PATH}")
+# Missing labels are dropped before the statistics are computed.
+train_labels = torch.tensor(
+    train_df[PROPERTY_NAME].dropna().to_numpy(),
+    dtype=torch.float32,
+)
+if train_labels.numel() == 0:
+    raise ValueError(f"No non-null values found for '{PROPERTY_NAME}' in {TRAIN_DATA_PATH}")
+TRAIN_LABEL_MEAN = torch.mean(train_labels)
+TRAIN_LABEL_STD = torch.std(train_labels)
+# torch.std applies Bessel's correction by default, so a single label (or any
+# infinite label) gives NaN. NaN passes the equality check against 0.0 below
+# and would silently turn every prediction into NaN, so reject it explicitly.
+if not bool(torch.isfinite(TRAIN_LABEL_STD)):
+    raise ValueError(
+        f"Standard deviation for '{PROPERTY_NAME}' is not finite; "
+        "z_norm de-normalization needs at least two finite training labels"
+    )
+if float(TRAIN_LABEL_STD) == 0.0:
+    raise ValueError(
+        f"Standard deviation for '{PROPERTY_NAME}' is 0.0; z_norm de-normalization is undefined"
+    )
+def _quiet_call(fn, *args, **kwargs):
+    """Call ``fn(*args, **kwargs)`` with stdout and stderr discarded; return its result."""
+    discarded_out = io.StringIO()
+    discarded_err = io.StringIO()
+    with contextlib.redirect_stdout(discarded_out):
+        with contextlib.redirect_stderr(discarded_err):
+            return fn(*args, **kwargs)
+# -------------------------
+# LOAD TOKENIZER
+# -------------------------
+tokenizer = _quiet_call(AutoTokenizer.from_pretrained, TOKENIZER_PATH)
+# -------------------------
+# LOAD MODEL
+# -------------------------
+# The base encoder is identified by its Hub name; the fine-tuned weights are applied further down.
+base_model = _quiet_call(T5EncoderModel.from_pretrained, "google/t5-v1_1-small")
+base_model_output_size = 512
+# Match embedding matrix size to the tokenizer used during training.
+base_model.resize_token_embeddings(len(tokenizer), mean_resizing=False)
+model = T5Predictor(
+    base_model,
+    base_model_output_size,
+    drop_rate=0.1,
+    pooling="mean",
+)
+# -------------------------
+# LOAD WEIGHTS
+# -------------------------
+# SECURITY NOTE(review): torch.load unpickles the file, so only checkpoints from a
+# trusted source may be loaded here. Consider weights_only=True once it is
+# confirmed that the checkpoint holds nothing but tensors.
+state_dict = _quiet_call(torch.load, MODEL_PATH, map_location=DEVICE)
+# Some checkpoints were trained with an extra tokenizer token; align embedding size to checkpoint.
+checkpoint_vocab_size = state_dict["model.shared.weight"].shape[0]
+if model.model.shared.weight.shape[0] != checkpoint_vocab_size:
+    model.model.resize_token_embeddings(checkpoint_vocab_size, mean_resizing=False)
+# strict=False accepts checkpoints whose keys do not match the model. Report any
+# mismatch instead of silently predicting with weights that were never loaded.
+_load_result = model.load_state_dict(state_dict, strict=False)
+if _load_result.missing_keys or _load_result.unexpected_keys:
+    logging.getLogger(__name__).warning(
+        "Checkpoint %s does not match the model exactly (missing keys: %s; unexpected keys: %s)",
+        MODEL_PATH,
+        _load_result.missing_keys,
+        _load_result.unexpected_keys,
+    )
+model.to(DEVICE)
+model.eval()
+# -------------------------
+# PREDICT FUNCTION
+# -------------------------
+def predict_epa(text, max_length=256):
+    """Return the predicted energy_per_atom for ``text`` as a Python number.
+
+    The text is tokenized (truncated to ``max_length`` tokens) and passed
+    through the module-level model with gradient tracking disabled. The second
+    element of the model output is then mapped back to label units with the
+    training-set mean and standard deviation.
+    """
+    encoded = tokenizer(
+        text,
+        max_length=max_length,
+        truncation=True,
+        padding=True,
+        return_tensors="pt",
+    )
+    token_ids = encoded["input_ids"].to(DEVICE)
+    mask = encoded["attention_mask"].to(DEVICE)
+    with torch.no_grad():
+        _, scaled_prediction = model(token_ids, mask)
+        restored = z_denormalize(scaled_prediction.squeeze(), TRAIN_LABEL_MEAN, TRAIN_LABEL_STD)
+        return restored.item()
+# -------------------------
+# TEST
+# -------------------------
+if __name__ == "__main__":
+    # Command-line smoke test: predicts for --text, which defaults to a built-in sample description.
+    cli = argparse.ArgumentParser(description="Predict energy_per_atom from text")
+    cli.add_argument("--max_length", type=int, default=256, help="Tokenizer max length")
+    cli.add_argument(
+        "--text",
+        type=str,
+        default="A simple cubic crystalLiAl(MoO₄)₂ crystallizes in the triclinic P̅1 space group. Li¹⁺ is bonded in a 5-coordinate geometry to five O²⁻ atoms. There are a spread of Li–O bond distances ranging from 1.98–2.25 Å. There are two inequivalent Mo⁶⁺ sites. In the first Mo⁶⁺ site, Mo⁶⁺ is bonded in a 4-coordinate geometry to five O²⁻ atoms. There are a spread of Mo–O bond distances ranging from 1.74–2.46 Å. In the second Mo⁶⁺ site, Mo⁶⁺ is bonded to four O²⁻ atoms to form MoO₄ tetrahedra that share corners with three equivalent AlO₆ octahedra. The corner-sharing octahedral tilt angles range from 15–44°. There are a spread of Mo–O bond distances ranging from 1.77–1.82 Å. Al³⁺ is bonded to six O²⁻ atoms to form AlO₆ octahedra that share corners with three equivalent MoO₄ tetrahedra and  an edgeedge with one AlO₆ octahedra. There are a spread of Al–O bond distances ranging from 1.88–1.95 Å. There are eight inequivalent O²⁻ sites. In the first O²⁻ site, O²⁻ is bonded in a distorted trigonal planar geometry to one Li¹⁺, one Mo⁶⁺, and one Al³⁺ atom. In the second O²⁻ site, O²⁻ is bonded in a distorted trigonal planar geometry to one Mo⁶⁺ and two equivalent Al³⁺ atoms. In the third O²⁻ site, O²⁻ is bonded in a bent 150 degrees geometry to one Li¹⁺ and one Mo⁶⁺ atom. In the fourth O²⁻ site, O²⁻ is bonded in a linear geometry to one Li¹⁺ and one Mo⁶⁺ atom. In the fifth O²⁻ site, O²⁻ is bonded in a linear geometry to one Mo⁶⁺ and one Al³⁺ atom. In the sixth O²⁻ site, O²⁻ is bonded in a bent 150 degrees geometry to one Li¹⁺ and one Mo⁶⁺ atom. In the seventh O²⁻ site, O²⁻ is bonded in a 4-coordinate geometry to one Li¹⁺, two equivalent Mo⁶⁺, and one Al³⁺ atom. In the eighth O²⁻ site, O²⁻ is bonded in a bent 150 degrees geometry to one Mo⁶⁺ and one Al³⁺ atom. with atoms arranged periodically and stable at room temperature.",
+        help="Input text to predict EPA",
+    )
+    options = cli.parse_args()
+    predicted = predict_epa(options.text, max_length=options.max_length)
+    print(f"Predicted energy_per_atom: {predicted:.6f}")

predict_fepa.py ADDED Viewed

	@@ -0,0 +1,185 @@

+import argparse
+import contextlib
+import io
+import logging
+import os
+import sys
+import pandas as pd
+import torch
+from transformers import AutoTokenizer, T5EncoderModel
+from transformers.utils import logging as transformers_logging
+# Locate the project root: the first candidate directory that holds llmprop_model.py.
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+PROJECT_CANDIDATES = [
+    SCRIPT_DIR,
+    os.path.dirname(SCRIPT_DIR),
+    os.path.join(os.path.dirname(SCRIPT_DIR), "LLM-Prop"),
+]
+for candidate in PROJECT_CANDIDATES:
+    if os.path.exists(os.path.join(candidate, "llmprop_model.py")):
+        PROJECT_DIR = candidate
+        break
+else:
+    # Every candidate was checked without a match.
+    raise FileNotFoundError(
+        "Could not locate project root containing llmprop_model.py. "
+        "Expected near the deployment folder."
+    )
+# Put the project root on the import path before importing the model class from it.
+if os.path.isdir(PROJECT_DIR):
+    if PROJECT_DIR not in sys.path:
+        sys.path.insert(0, PROJECT_DIR)
+from llmprop_model import T5Predictor
+def z_denormalize(scaled_labels, labels_mean, labels_std):
+    """Undo z-score scaling: multiply by ``labels_std`` and add ``labels_mean``."""
+    rescaled = scaled_labels * labels_std
+    return rescaled + labels_mean
+# -------------------------
+# CONFIG
+# -------------------------
+# Fine-tuned checkpoint for this property, located below the project root.
+MODEL_PATH = os.path.join(
+    PROJECT_DIR,
+    "checkpoints",
+    "samples",
+    "regression",
+    "best_checkpoint_for_fepa.pt",
+)
+# Tokenizer directory that is later passed to AutoTokenizer.from_pretrained.
+TOKENIZER_PATH = os.path.join(
+    PROJECT_DIR,
+    "tokenizers",
+    "t5_tokenizer_trained_on_modified_part_of_C4_and_textedge",
+)
+# CSV whose PROPERTY_NAME column supplies the mean/std used to de-normalize predictions.
+TRAIN_DATA_PATH = os.path.join(PROJECT_DIR, "data", "samples", "train_data.csv")
+# Column of the training CSV that this module predicts.
+PROPERTY_NAME = "formation_energy_per_atom"
+# Inference is pinned to the CPU.
+DEVICE = torch.device("cpu")
+# Silence HF/Transformers startup logs for cleaner terminal output.
+# setdefault keeps any value that is already set in the environment.
+os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
+os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
+transformers_logging.set_verbosity_error()
+logging.getLogger("huggingface_hub").setLevel(logging.ERROR)
+# -------------------------
+# PATH CHECKS
+# -------------------------
+# Fail at import time with a specific message for whichever artifact is missing.
+if not os.path.exists(MODEL_PATH):
+    raise FileNotFoundError(f"Checkpoint not found: {MODEL_PATH}")
+if not os.path.exists(TOKENIZER_PATH):
+    raise FileNotFoundError(f"Tokenizer path not found: {TOKENIZER_PATH}")
+if not os.path.exists(TRAIN_DATA_PATH):
+    raise FileNotFoundError(f"Training data not found: {TRAIN_DATA_PATH}")
+# -------------------------
+# LOAD TRAIN LABEL STATS (z_norm)
+# -------------------------
+train_df = pd.read_csv(TRAIN_DATA_PATH)
+if PROPERTY_NAME not in train_df.columns:
+    raise ValueError(f"Column '{PROPERTY_NAME}' not found in {TRAIN_DATA_PATH}")
+# Missing labels are dropped before the statistics are computed.
+train_labels = torch.tensor(
+    train_df[PROPERTY_NAME].dropna().to_numpy(),
+    dtype=torch.float32,
+)
+if train_labels.numel() == 0:
+    raise ValueError(f"No non-null values found for '{PROPERTY_NAME}' in {TRAIN_DATA_PATH}")
+TRAIN_LABEL_MEAN = torch.mean(train_labels)
+TRAIN_LABEL_STD = torch.std(train_labels)
+# torch.std applies Bessel's correction by default, so a single label (or any
+# infinite label) gives NaN. NaN passes the equality check against 0.0 below
+# and would silently turn every prediction into NaN, so reject it explicitly.
+if not bool(torch.isfinite(TRAIN_LABEL_STD)):
+    raise ValueError(
+        f"Standard deviation for '{PROPERTY_NAME}' is not finite; "
+        "z_norm de-normalization needs at least two finite training labels"
+    )
+if float(TRAIN_LABEL_STD) == 0.0:
+    raise ValueError(
+        f"Standard deviation for '{PROPERTY_NAME}' is 0.0; z_norm de-normalization is undefined"
+    )
+def _quiet_call(fn, *args, **kwargs):
+    """Call ``fn(*args, **kwargs)`` with stdout and stderr discarded; return its result."""
+    discarded_out = io.StringIO()
+    discarded_err = io.StringIO()
+    with contextlib.redirect_stdout(discarded_out):
+        with contextlib.redirect_stderr(discarded_err):
+            return fn(*args, **kwargs)
+# -------------------------
+# LOAD TOKENIZER
+# -------------------------
+tokenizer = _quiet_call(AutoTokenizer.from_pretrained, TOKENIZER_PATH)
+# -------------------------
+# LOAD MODEL
+# -------------------------
+# The base encoder is identified by its Hub name; the fine-tuned weights are applied further down.
+base_model = _quiet_call(T5EncoderModel.from_pretrained, "google/t5-v1_1-small")
+base_model_output_size = 512
+# Match embedding matrix size to the tokenizer used during training.
+base_model.resize_token_embeddings(len(tokenizer), mean_resizing=False)
+model = T5Predictor(
+    base_model,
+    base_model_output_size,
+    drop_rate=0.1,
+    pooling="mean",
+)
+# -------------------------
+# LOAD WEIGHTS
+# -------------------------
+# SECURITY NOTE(review): torch.load unpickles the file, so only checkpoints from a
+# trusted source may be loaded here. Consider weights_only=True once it is
+# confirmed that the checkpoint holds nothing but tensors.
+state_dict = _quiet_call(torch.load, MODEL_PATH, map_location=DEVICE)
+# Some checkpoints were trained with an extra tokenizer token; align embedding size to checkpoint.
+checkpoint_vocab_size = state_dict["model.shared.weight"].shape[0]
+if model.model.shared.weight.shape[0] != checkpoint_vocab_size:
+    model.model.resize_token_embeddings(checkpoint_vocab_size, mean_resizing=False)
+# strict=False accepts checkpoints whose keys do not match the model. Report any
+# mismatch instead of silently predicting with weights that were never loaded.
+_load_result = model.load_state_dict(state_dict, strict=False)
+if _load_result.missing_keys or _load_result.unexpected_keys:
+    logging.getLogger(__name__).warning(
+        "Checkpoint %s does not match the model exactly (missing keys: %s; unexpected keys: %s)",
+        MODEL_PATH,
+        _load_result.missing_keys,
+        _load_result.unexpected_keys,
+    )
+model.to(DEVICE)
+model.eval()
+# -------------------------
+# PREDICT FUNCTION
+# -------------------------
+def predict_fepa(text, max_length=256):
+    inputs = tokenizer(
+        text,
+        return_tensors="pt",
+        truncation=True,
+        padding=True,
+        max_length=max_length,
+    )
+    input_ids = inputs["input_ids"].to(DEVICE)
+    attention_mask = inputs["attention_mask"].to(DEVICE)
+    with torch.no_grad():
+        _, prediction_norm = model(input_ids, attention_mask)
+        prediction_fepa = z_denormalize(
+            prediction_norm.squeeze(),
+            TRAIN_LABEL_MEAN,
+            TRAIN_LABEL_STD,
+        ).item()
+    return prediction_fepa
+# -------------------------
+# TEST
+# -------------------------
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Predict formation_energy_per_atom from text")
+    parser.add_argument("--max_length", type=int, default=256, help="Tokenizer max length")
+    parser.add_argument(
+        "--text",
+        type=str,
+        default="A simple cubic crystalLiAl(MoO₄)₂ crystallizes in the triclinic P̅1 space group. Li¹⁺ is bonded in a 5-coordinate geometry to five O²⁻ atoms. There are a spread of Li–O bond distances ranging from 1.98–2.25 Å. There are two inequivalent Mo⁶⁺ sites. In the first Mo⁶⁺ site, Mo⁶⁺ is bonded in a 4-coordinate geometry to five O²⁻ atoms. There are a spread of Mo–O bond distances ranging from 1.74–2.46 Å. In the second Mo⁶⁺ site, Mo⁶⁺ is bonded to four O²⁻ atoms to form MoO₄ tetrahedra that share corners with three equivalent AlO₆ octahedra. The corner-sharing octahedral tilt angles range from 15–44°. There are a spread of Mo–O bond distances ranging from 1.77–1.82 Å. Al³⁺ is bonded to six O²⁻ atoms to form AlO₆ octahedra that share corners with three equivalent MoO₄ tetrahedra and  an edgeedge with one AlO₆ octahedra. There are a spread of Al–O bond distances ranging from 1.88–1.95 Å. There are eight inequivalent O²⁻ sites. In the first O²⁻ site, O²⁻ is bonded in a distorted trigonal planar geometry to one Li¹⁺, one Mo⁶⁺, and one Al³⁺ atom. In the second O²⁻ site, O²⁻ is bonded in a distorted trigonal planar geometry to one Mo⁶⁺ and two equivalent Al³⁺ atoms. In the third O²⁻ site, O²⁻ is bonded in a bent 150 degrees geometry to one Li¹⁺ and one Mo⁶⁺ atom. In the fourth O²⁻ site, O²⁻ is bonded in a linear geometry to one Li¹⁺ and one Mo⁶⁺ atom. In the fifth O²⁻ site, O²⁻ is bonded in a linear geometry to one Mo⁶⁺ and one Al³⁺ atom. In the sixth O²⁻ site, O²⁻ is bonded in a bent 150 degrees geometry to one Li¹⁺ and one Mo⁶⁺ atom. In the seventh O²⁻ site, O²⁻ is bonded in a 4-coordinate geometry to one Li¹⁺, two equivalent Mo⁶⁺, and one Al³⁺ atom. In the eighth O²⁻ site, O²⁻ is bonded in a bent 150 degrees geometry to one Mo⁶⁺ and one Al³⁺ atom. with atoms arranged periodically and stable at room temperature.",
+        help="Input text to predict FEPA",
+    )
+    args = parser.parse_args()
+    value = predict_fepa(args.text, max_length=args.max_length)
+    print(f"Predicted formation_energy_per_atom: {value:.6f}")

predict_is_gap_direct.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import torch
+import os
+import sys
+import argparse
+from transformers import AutoTokenizer, T5EncoderModel
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+PROJECT_CANDIDATES = [
+    SCRIPT_DIR,
+    os.path.dirname(SCRIPT_DIR),
+    os.path.join(os.path.dirname(SCRIPT_DIR), "LLM-Prop"),
+]
+PROJECT_DIR = None
+for candidate in PROJECT_CANDIDATES:
+    if os.path.exists(os.path.join(candidate, "llmprop_model.py")):
+        PROJECT_DIR = candidate
+        break
+if PROJECT_DIR is None:
+    raise FileNotFoundError(
+        "Could not locate project root containing llmprop_model.py. "
+        "Expected near the deployment folder."
+    )
+if os.path.isdir(PROJECT_DIR) and PROJECT_DIR not in sys.path:
+    sys.path.insert(0, PROJECT_DIR)
+from llmprop_model import T5Predictor
+# -------------------------
+# CONFIG
+# -------------------------
+MODEL_PATH = os.path.join(PROJECT_DIR, "checkpoints", "samples", "classification", "best_checkpoint_for_is_gap_direct.pt")
+TOKENIZER_PATH = os.path.join(PROJECT_DIR, "tokenizers", "t5_tokenizer_trained_on_modified_part_of_C4_and_textedge")
+DEVICE = torch.device("cpu")
+# -------------------------
+# LOAD TOKENIZER
+# -------------------------
+tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
+# -------------------------
+# LOAD MODEL
+# -------------------------
+base_model = T5EncoderModel.from_pretrained("google/t5-v1_1-small")
+base_model_output_size = 512
+# Match embedding matrix size to the tokenizer used during training.
+base_model.resize_token_embeddings(len(tokenizer), mean_resizing=False)
+model = T5Predictor(
+    base_model,
+    base_model_output_size,
+    drop_rate=0.1,
+    pooling="mean"   # ✅ confirmed from your command
+)
+# -------------------------
+# LOAD WEIGHTS
+# -------------------------
+state_dict = torch.load(MODEL_PATH, map_location=DEVICE)
+# Some checkpoints were trained with an extra tokenizer token; align embedding size to checkpoint.
+checkpoint_vocab_size = state_dict["model.shared.weight"].shape[0]
+if model.model.shared.weight.shape[0] != checkpoint_vocab_size:
+    model.model.resize_token_embeddings(checkpoint_vocab_size, mean_resizing=False)
+model.load_state_dict(state_dict, strict=False)
+model.to(DEVICE)
+model.eval()
+# -------------------------
+# PREDICT FUNCTION
+# -------------------------
+def predict(text, threshold=0.33):
+    # ❌ NO preprocessing (important)
+    inputs = tokenizer(
+        text,
+        return_tensors="pt",
+        truncation=True,
+        padding=True,
+        max_length=256   # ✅ from your command
+    )
+    input_ids = inputs["input_ids"].to(DEVICE)
+    attention_mask = inputs["attention_mask"].to(DEVICE)
+    with torch.no_grad():
+        _, predictions = model(input_ids, attention_mask)
+        prob = torch.sigmoid(predictions).item()
+        if prob > threshold:
+            return "TRUE", prob
+        else:
+            return "FALSE", prob
+# -------------------------
+# TEST
+# -------------------------
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Predict is_gap_direct from text")
+    parser.add_argument("--threshold", type=float, default=0.33, help="Decision threshold for TRUE/FALSE")
+    parser.add_argument("--text", type=str, default="Rb₂NaPrCl₆ is (Cubic) Perovskite-derived structured and crystallizes in the cubic Fm̅3m space group. Rb¹⁺ is bonded to twelve equivalent Cl¹⁻ atoms to form RbCl₁₂ cuboctahedra that share corners with twelve equivalent RbCl₁₂ cuboctahedra, faces with six equivalent RbCl₁₂ cuboctahedra, faces with four equivalent NaCl₆ octahedra, and faces with four equivalent PrCl₆ octahedra. All Rb–Cl bond lengths are 3.90 Å. Na¹⁺ is bonded to six equivalent Cl¹⁻ atoms to form NaCl₆ octahedra that share corners with six equivalent PrCl₆ octahedra and faces with eight equivalent RbCl₁₂ cuboctahedra. The corner-sharing octahedra are not tilted. All Na–Cl bond lengths are 2.76 Å. Pr³⁺ is bonded to six equivalent Cl¹⁻ atoms to form PrCl₆ octahedra that share corners with six equivalent NaCl₆ octahedra and faces with eight equivalent RbCl₁₂ cuboctahedra. The corner-sharing octahedra are not tilted. All Pr–Cl bond lengths are 2.75 Å. Cl¹⁻ is bonded in a distorted linear geometry to four equivalent Rb¹⁺, one Na¹⁺, and one Pr³⁺ atom.", help="Input text to classify")
+    args = parser.parse_args()
+    result, prob = predict(args.text, threshold=args.threshold)
+    print(result, prob)

predict_volume.py ADDED Viewed

	@@ -0,0 +1,185 @@

+import argparse
+import contextlib
+import io
+import logging
+import os
+import sys
+import pandas as pd
+import torch
+from transformers import AutoTokenizer, T5EncoderModel
+from transformers.utils import logging as transformers_logging
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+PROJECT_CANDIDATES = [
+    SCRIPT_DIR,
+    os.path.dirname(SCRIPT_DIR),
+    os.path.join(os.path.dirname(SCRIPT_DIR), "LLM-Prop"),
+]
+PROJECT_DIR = None
+for candidate in PROJECT_CANDIDATES:
+    if os.path.exists(os.path.join(candidate, "llmprop_model.py")):
+        PROJECT_DIR = candidate
+        break
+if PROJECT_DIR is None:
+    raise FileNotFoundError(
+        "Could not locate project root containing llmprop_model.py. "
+        "Expected near the deployment folder."
+    )
+if os.path.isdir(PROJECT_DIR) and PROJECT_DIR not in sys.path:
+    sys.path.insert(0, PROJECT_DIR)
+from llmprop_model import T5Predictor
+def z_denormalize(scaled_labels, labels_mean, labels_std):
+    return (scaled_labels * labels_std) + labels_mean
+# -------------------------
+# CONFIG
+# -------------------------
+MODEL_PATH = os.path.join(
+    PROJECT_DIR,
+    "checkpoints",
+    "samples",
+    "regression",
+    "best_checkpoint_for_volume.pt",
+)
+TOKENIZER_PATH = os.path.join(
+    PROJECT_DIR,
+    "tokenizers",
+    "t5_tokenizer_trained_on_modified_part_of_C4_and_textedge",
+)
+TRAIN_DATA_PATH = os.path.join(PROJECT_DIR, "data", "samples", "train_data.csv")
+PROPERTY_NAME = "volume"
+DEVICE = torch.device("cpu")
+# Silence HF/Transformers startup logs for cleaner terminal output.
+os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
+os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
+transformers_logging.set_verbosity_error()
+logging.getLogger("huggingface_hub").setLevel(logging.ERROR)
+# -------------------------
+# PATH CHECKS
+# -------------------------
+if not os.path.exists(MODEL_PATH):
+    raise FileNotFoundError(f"Checkpoint not found: {MODEL_PATH}")
+if not os.path.exists(TOKENIZER_PATH):
+    raise FileNotFoundError(f"Tokenizer path not found: {TOKENIZER_PATH}")
+if not os.path.exists(TRAIN_DATA_PATH):
+    raise FileNotFoundError(f"Training data not found: {TRAIN_DATA_PATH}")
+# -------------------------
+# LOAD TRAIN LABEL STATS (z_norm)
+# -------------------------
+train_df = pd.read_csv(TRAIN_DATA_PATH)
+if PROPERTY_NAME not in train_df.columns:
+    raise ValueError(f"Column '{PROPERTY_NAME}' not found in {TRAIN_DATA_PATH}")
+train_labels = torch.tensor(
+    train_df[PROPERTY_NAME].dropna().to_numpy(),
+    dtype=torch.float32,
+)
+if train_labels.numel() == 0:
+    raise ValueError(f"No non-null values found for '{PROPERTY_NAME}' in {TRAIN_DATA_PATH}")
+TRAIN_LABEL_MEAN = torch.mean(train_labels)
+TRAIN_LABEL_STD = torch.std(train_labels)
+if float(TRAIN_LABEL_STD) == 0.0:
+    raise ValueError(
+        f"Standard deviation for '{PROPERTY_NAME}' is 0.0; z_norm de-normalization is undefined"
+    )
+def _quiet_call(fn, *args, **kwargs):
+    with contextlib.redirect_stdout(io.StringIO()), contextlib.redirect_stderr(io.StringIO()):
+        return fn(*args, **kwargs)
+# -------------------------
+# LOAD TOKENIZER
+# -------------------------
+tokenizer = _quiet_call(AutoTokenizer.from_pretrained, TOKENIZER_PATH)
+# -------------------------
+# LOAD MODEL
+# -------------------------
+base_model = _quiet_call(T5EncoderModel.from_pretrained, "google/t5-v1_1-small")
+base_model_output_size = 512
+# Match embedding matrix size to the tokenizer used during training.
+base_model.resize_token_embeddings(len(tokenizer), mean_resizing=False)
+model = T5Predictor(
+    base_model,
+    base_model_output_size,
+    drop_rate=0.1,
+    pooling="mean",
+)
+# -------------------------
+# LOAD WEIGHTS
+# -------------------------
+state_dict = _quiet_call(torch.load, MODEL_PATH, map_location=DEVICE)
+# Some checkpoints were trained with an extra tokenizer token; align embedding size to checkpoint.
+checkpoint_vocab_size = state_dict["model.shared.weight"].shape[0]
+if model.model.shared.weight.shape[0] != checkpoint_vocab_size:
+    model.model.resize_token_embeddings(checkpoint_vocab_size, mean_resizing=False)
+model.load_state_dict(state_dict, strict=False)
+model.to(DEVICE)
+model.eval()
+# -------------------------
+# PREDICT FUNCTION
+# -------------------------
+def predict_volume(text, max_length=256):
+    inputs = tokenizer(
+        text,
+        return_tensors="pt",
+        truncation=True,
+        padding=True,
+        max_length=max_length,
+    )
+    input_ids = inputs["input_ids"].to(DEVICE)
+    attention_mask = inputs["attention_mask"].to(DEVICE)
+    with torch.no_grad():
+        _, prediction_norm = model(input_ids, attention_mask)
+        prediction_volume = z_denormalize(
+            prediction_norm.squeeze(),
+            TRAIN_LABEL_MEAN,
+            TRAIN_LABEL_STD,
+        ).item()
+    return prediction_volume
+# -------------------------
+# TEST
+# -------------------------
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Predict volume from text")
+    parser.add_argument("--max_length", type=int, default=256, help="Tokenizer max length")
+    parser.add_argument(
+        "--text",
+        type=str,
+        default="A simple cubic crystalLiAl(MoO₄)₂ crystallizes in the triclinic P̅1 space group. Li¹⁺ is bonded in a 5-coordinate geometry to five O²⁻ atoms. There are a spread of Li-O bond distances ranging from 1.98-2.25 A. There are two inequivalent Mo⁶⁺ sites. In the first Mo⁶⁺ site, Mo⁶⁺ is bonded in a 4-coordinate geometry to five O²⁻ atoms. There are a spread of Mo-O bond distances ranging from 1.74-2.46 A. In the second Mo⁶⁺ site, Mo⁶⁺ is bonded to four O²⁻ atoms to form MoO₄ tetrahedra that share corners with three equivalent AlO₆ octahedra. The corner-sharing octahedral tilt angles range from 15-44 degrees. There are a spread of Mo-O bond distances ranging from 1.77-1.82 A. Al³⁺ is bonded to six O²⁻ atoms to form AlO₆ octahedra that share corners with three equivalent MoO₄ tetrahedra and an edgeedge with one AlO₆ octahedra. There are a spread of Al-O bond distances ranging from 1.88-1.95 A. There are eight inequivalent O²⁻ sites. In the first O²⁻ site, O²⁻ is bonded in a distorted trigonal planar geometry to one Li¹⁺, one Mo⁶⁺, and one Al³⁺ atom. In the second O²⁻ site, O²⁻ is bonded in a distorted trigonal planar geometry to one Mo⁶⁺ and two equivalent Al³⁺ atoms. In the third O²⁻ site, O²⁻ is bonded in a bent 150 degrees geometry to one Li¹⁺ and one Mo⁶⁺ atom. In the fourth O²⁻ site, O²⁻ is bonded in a linear geometry to one Li¹⁺ and one Mo⁶⁺ atom. In the fifth O²⁻ site, O²⁻ is bonded in a linear geometry to one Mo⁶⁺ and one Al³⁺ atom. In the sixth O²⁻ site, O²⁻ is bonded in a bent 150 degrees geometry to one Li¹⁺ and one Mo⁶⁺ atom. In the seventh O²⁻ site, O²⁻ is bonded in a 4-coordinate geometry to one Li¹⁺, two equivalent Mo⁶⁺, and one Al³⁺ atom. In the eighth O²⁻ site, O²⁻ is bonded in a bent 150 degrees geometry to one Mo⁶⁺ and one Al³⁺ atom. with atoms arranged periodically and stable at room temperature.",
+        help="Input text to predict volume",
+    )
+    args = parser.parse_args()
+    value = predict_volume(args.text, max_length=args.max_length)
+    print(f"Predicted volume: {value:.6f}")

requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+fastapi>=0.110.0
+huggingface_hub>=0.23.0
+# Cap below NumPy 2: the pinned pandas/torch/scikit-learn wheels were built against the NumPy 1.x ABI.
+numpy>=1.24.0,<2
+pydantic>=2.7.0
+uvicorn>=0.29.0
+torch==2.1.0
+pandas==2.0.1
+transformers==4.23.1
+sentencepiece==0.1.97
+tokenizers==0.13.1
+torchmetrics>=1.4.0
+scikit-learn==1.2.2
+tqdm==4.66.1