TS-Proof-Ranker-v0
This is a small TS proof-ranker trained on synthetic first-order Horn proof tasks. It scores whether a goal is likely derivable from a list of axioms.
The model is not a standalone theorem prover. It is designed to sit above the repo's symbolic TS proof kernel, which verifies final derivations.
Metrics
{
"task": "synthetic_first_order_horn_derivability",
"train_examples": 10000,
"val_examples": 1500,
"test_examples": 1500,
"history": [
{
"epoch": 1,
"train_loss": 0.6953035289314902,
"val_loss": 0.6947557258605958,
"val_accuracy": 0.48,
"val_precision": 0.0,
"val_recall": 0.0,
"val_f1": 0.0,
"val_tp": 0,
"val_fp": 0,
"val_tn": 720,
"val_fn": 780
},
{
"epoch": 2,
"train_loss": 0.690956760743621,
"val_loss": 0.6783425331115722,
"val_accuracy": 0.5633333333333334,
"val_precision": 0.5511038430089943,
"val_recall": 0.8641025641025641,
"val_f1": 0.6729905142286571,
"val_tp": 674,
"val_fp": 549,
"val_tn": 171,
"val_fn": 106
},
{
"epoch": 3,
"train_loss": 0.595336615469805,
"val_loss": 0.4245245539347331,
"val_accuracy": 0.7926666666666666,
"val_precision": 0.7270087124878993,
"val_recall": 0.9628205128205128,
"val_f1": 0.8284611141753999,
"val_tp": 751,
"val_fp": 282,
"val_tn": 438,
"val_fn": 29
},
{
"epoch": 4,
"train_loss": 0.23917718022871928,
"val_loss": 0.04427426568667094,
"val_accuracy": 0.9846666666666667,
"val_precision": 0.9833971902937421,
"val_recall": 0.9871794871794872,
"val_f1": 0.9852847088931542,
"val_tp": 770,
"val_fp": 13,
"val_tn": 707,
"val_fn": 10
},
{
"epoch": 5,
"train_loss": 0.03695377022262283,
"val_loss": 0.020305812527736028,
"val_accuracy": 0.9946666666666667,
"val_precision": 0.9961439588688946,
"val_recall": 0.9935897435897436,
"val_f1": 0.994865211810013,
"val_tp": 775,
"val_fp": 3,
"val_tn": 717,
"val_fn": 5
},
{
"epoch": 6,
"train_loss": 0.0044971378269875245,
"val_loss": 0.0014470961211870113,
"val_accuracy": 1.0,
"val_precision": 1.0,
"val_recall": 1.0,
"val_f1": 1.0,
"val_tp": 780,
"val_fp": 0,
"val_tn": 720,
"val_fn": 0
},
{
"epoch": 7,
"train_loss": 0.0011997800953441543,
"val_loss": 0.00039825337364648777,
"val_accuracy": 1.0,
"val_precision": 1.0,
"val_recall": 1.0,
"val_f1": 1.0,
"val_tp": 780,
"val_fp": 0,
"val_tn": 720,
"val_fn": 0
},
{
"epoch": 8,
"train_loss": 0.0006471918863168351,
"val_loss": 0.00029759583280732234,
"val_accuracy": 1.0,
"val_precision": 1.0,
"val_recall": 1.0,
"val_f1": 1.0,
"val_tp": 780,
"val_fp": 0,
"val_tn": 720,
"val_fn": 0
},
{
"epoch": 9,
"train_loss": 0.0004827459261908009,
"val_loss": 0.00019198915533100565,
"val_accuracy": 1.0,
"val_precision": 1.0,
"val_recall": 1.0,
"val_f1": 1.0,
"val_tp": 780,
"val_fp": 0,
"val_tn": 720,
"val_fn": 0
},
{
"epoch": 10,
"train_loss": 0.00032637073736681207,
"val_loss": 0.00014338239538483322,
"val_accuracy": 1.0,
"val_precision": 1.0,
"val_recall": 1.0,
"val_f1": 1.0,
"val_tp": 780,
"val_fp": 0,
"val_tn": 720,
"val_fn": 0
},
{
"epoch": 11,
"train_loss": 0.0006662533271928566,
"val_loss": 0.00029968379534936197,
"val_accuracy": 1.0,
"val_precision": 1.0,
"val_recall": 1.0,
"val_f1": 1.0,
"val_tp": 780,
"val_fp": 0,
"val_tn": 720,
"val_fn": 0
},
{
"epoch": 12,
"train_loss": 0.00037879797566498023,
"val_loss": 8.06617513881065e-05,
"val_accuracy": 1.0,
"val_precision": 1.0,
"val_recall": 1.0,
"val_f1": 1.0,
"val_tp": 780,
"val_fp": 0,
"val_tn": 720,
"val_fn": 0
},
{
"epoch": 13,
"train_loss": 0.00021325066544687518,
"val_loss": 9.499855366690705e-05,
"val_accuracy": 1.0,
"val_precision": 1.0,
"val_recall": 1.0,
"val_f1": 1.0,
"val_tp": 780,
"val_fp": 0,
"val_tn": 720,
"val_fn": 0
},
{
"epoch": 14,
"train_loss": 0.00017575904239724676,
"val_loss": 5.698425287846476e-05,
"val_accuracy": 1.0,
"val_precision": 1.0,
"val_recall": 1.0,
"val_f1": 1.0,
"val_tp": 780,
"val_fp": 0,
"val_tn": 720,
"val_fn": 0
}
],
"test": {
"loss": 0.0009200108467290798,
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"tp": 791,
"fp": 0,
"tn": 709,
"fn": 0
}
}
Usage
import torch
from solm.ts_neural_prover import load_hf_artifact, score_task
model, tokenizer, metrics = load_hf_artifact(".", torch.device("cpu"))
score = score_task(
model,
tokenizer,
[
"Human(socrates)",
"forall x. Human(x) -> Mortal(x)",
"forall x. Mortal(x) -> Finite(x)",
],
"Finite(socrates)",
torch.device("cpu"),
)
print(score)
Limitations
This v0 model is trained on compact synthetic Horn-chain tasks. It should be used as a search/ranking signal, not as a proof certificate. Checked proofs must compile through the symbolic kernel.
- Downloads last month
- 17