Spaces:
Sleeping
Sleeping
Stable small-N semantic scaling in the demo (fixed population bounds)
#2
by Auenchanters - opened
- app/app.py +16 -8
- artifacts/precompute_meta.json +13 -0
app/app.py
CHANGED
|
@@ -32,14 +32,22 @@ st.set_page_config(page_title="Lighthouse — Candidate Ranker", page_icon="🔦
|
|
| 32 |
def _facets_and_rubric():
|
| 33 |
rubric = json.load(open(os.path.join(ART, "jd_rubric.json"), encoding="utf-8"))
|
| 34 |
facet_emb = np.load(os.path.join(ART, "jd_facet_emb.npy"))
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
dim = facet_emb.shape[1]
|
| 41 |
return {"rubric": rubric, "ids": [], "id_to_row": {},
|
| 42 |
-
"cand_emb": np.zeros((0, dim), dtype=np.float32), "facet_emb": facet_emb
|
|
|
|
| 43 |
|
| 44 |
|
| 45 |
def _parse_jsonl(text: str):
|
|
@@ -68,7 +76,7 @@ with st.sidebar:
|
|
| 68 |
st.markdown("Ranking runs on **CPU, no hosted LLM**. The five-component score is gated by "
|
| 69 |
"JD hard-negatives and a behavioral modifier; honeypots are zeroed.")
|
| 70 |
|
| 71 |
-
rubric, facet_emb = _facets_and_rubric()
|
| 72 |
|
| 73 |
raws = []
|
| 74 |
if mode.startswith("Preloaded"):
|
|
@@ -87,7 +95,7 @@ raws = raws[:100]
|
|
| 87 |
|
| 88 |
if raws and st.button("🔦 Rank candidates", type="primary"):
|
| 89 |
with st.spinner(f"Encoding + scoring {len(raws)} candidates on CPU ..."):
|
| 90 |
-
art = _build_art(rubric, facet_emb)
|
| 91 |
records = ranker.score_all(raws, art)
|
| 92 |
mx = max((r["final_score"] for r in records), default=0.0)
|
| 93 |
if mx > 0:
|
|
|
|
| 32 |
def _facets_and_rubric():
    """Load the JD rubric, the facet embeddings and the fixed semantic bounds.

    Reads ``jd_rubric.json`` and ``jd_facet_emb.npy`` from the ``ART``
    directory. When ``precompute_meta.json`` is also present there, its
    ``semantic_p5`` / ``semantic_p95`` entries are returned as the lower /
    upper semantic bounds; a missing file (or a missing key) yields ``None``
    for the corresponding bound.

    Returns:
        Tuple ``(rubric, facet_emb, sem_lo, sem_hi)``.
    """
    # Context managers close the file handles deterministically instead of
    # leaving them open until garbage collection.
    with open(os.path.join(ART, "jd_rubric.json"), encoding="utf-8") as fh:
        rubric = json.load(fh)
    facet_emb = np.load(os.path.join(ART, "jd_facet_emb.npy"))

    # fixed population semantic bounds -> stable scores regardless of upload size
    sem_lo = sem_hi = None
    meta_path = os.path.join(ART, "precompute_meta.json")
    if os.path.exists(meta_path):
        with open(meta_path, encoding="utf-8") as fh:
            meta = json.load(fh)
        sem_lo, sem_hi = meta.get("semantic_p5"), meta.get("semantic_p95")
    return rubric, facet_emb, sem_lo, sem_hi
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _build_art(rubric, facet_emb, sem_lo=None, sem_hi=None):
    """Assemble the artifact dict passed to the ranker.

    The precomputed candidate set is left empty (no ids, no id-to-row map,
    a zero-row float32 embedding matrix as wide as ``facet_emb``), so every
    candidate is encoded on the fly. The fixed population semantic bounds
    ``sem_lo`` / ``sem_hi`` are carried along so small uploads score stably.
    """
    embedding_width = facet_emb.shape[1]
    no_candidates = np.zeros((0, embedding_width), dtype=np.float32)

    art = {}
    art["rubric"] = rubric
    art["ids"] = []
    art["id_to_row"] = {}
    art["cand_emb"] = no_candidates
    art["facet_emb"] = facet_emb
    art["sem_lo"] = sem_lo
    art["sem_hi"] = sem_hi
    return art
|
| 51 |
|
| 52 |
|
| 53 |
def _parse_jsonl(text: str):
|
|
|
|
| 76 |
st.markdown("Ranking runs on **CPU, no hosted LLM**. The five-component score is gated by "
|
| 77 |
"JD hard-negatives and a behavioral modifier; honeypots are zeroed.")
|
| 78 |
|
| 79 |
+
rubric, facet_emb, sem_lo, sem_hi = _facets_and_rubric()
|
| 80 |
|
| 81 |
raws = []
|
| 82 |
if mode.startswith("Preloaded"):
|
|
|
|
| 95 |
|
| 96 |
if raws and st.button("🔦 Rank candidates", type="primary"):
|
| 97 |
with st.spinner(f"Encoding + scoring {len(raws)} candidates on CPU ..."):
|
| 98 |
+
art = _build_art(rubric, facet_emb, sem_lo, sem_hi)
|
| 99 |
records = ranker.score_all(raws, art)
|
| 100 |
mx = max((r["final_score"] for r in records), default=0.0)
|
| 101 |
if mx > 0:
|
artifacts/precompute_meta.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "BAAI/bge-small-en-v1.5",
|
| 3 |
+
"dim": 384,
|
| 4 |
+
"n_candidates": 100000,
|
| 5 |
+
"n_facets": 10,
|
| 6 |
+
"seed": 1729,
|
| 7 |
+
"max_seq": 256,
|
| 8 |
+
"built_at": "2026-06-06T18:56:26",
|
| 9 |
+
"elapsed_sec": 4148.6,
|
| 10 |
+
"emb_dtype": "float16",
|
| 11 |
+
"semantic_p5": 0.5745536088943481,
|
| 12 |
+
"semantic_p95": 0.6435830593109131
|
| 13 |
+
}
|