Stable small-N semantic scaling in the demo (fixed population bounds)

#2
Files changed (2)
  1. app/app.py +16 -8
  2. artifacts/precompute_meta.json +13 -0
app/app.py CHANGED
@@ -32,14 +32,22 @@ st.set_page_config(page_title="Lighthouse — Candidate Ranker", page_icon="🔦
def _facets_and_rubric():
    """Load the JD rubric and its facet embeddings from the ART directory.

    Returns:
        A ``(rubric, facet_emb)`` tuple: the parsed contents of
        ``jd_rubric.json`` and the array stored in ``jd_facet_emb.npy``.
    """
    # Use a context manager so the rubric file handle is closed right away
    # instead of being left for the garbage collector.
    with open(os.path.join(ART, "jd_rubric.json"), encoding="utf-8") as fh:
        rubric = json.load(fh)
    facet_emb = np.load(os.path.join(ART, "jd_facet_emb.npy"))
    return rubric, facet_emb
36
-
37
-
38
- def _build_art(rubric, facet_emb):
39
- """Empty precomputed set -> every candidate is encoded on the fly."""
 
 
 
 
 
 
 
40
  dim = facet_emb.shape[1]
41
  return {"rubric": rubric, "ids": [], "id_to_row": {},
42
- "cand_emb": np.zeros((0, dim), dtype=np.float32), "facet_emb": facet_emb}
 
43
 
44
 
45
  def _parse_jsonl(text: str):
@@ -68,7 +76,7 @@ with st.sidebar:
68
  st.markdown("Ranking runs on **CPU, no hosted LLM**. The five-component score is gated by "
69
  "JD hard-negatives and a behavioral modifier; honeypots are zeroed.")
70
 
71
- rubric, facet_emb = _facets_and_rubric()
72
 
73
  raws = []
74
  if mode.startswith("Preloaded"):
@@ -87,7 +95,7 @@ raws = raws[:100]
87
 
88
  if raws and st.button("🔦 Rank candidates", type="primary"):
89
  with st.spinner(f"Encoding + scoring {len(raws)} candidates on CPU ..."):
90
- art = _build_art(rubric, facet_emb)
91
  records = ranker.score_all(raws, art)
92
  mx = max((r["final_score"] for r in records), default=0.0)
93
  if mx > 0:
 
def _facets_and_rubric():
    """Load the JD rubric, facet embeddings and optional semantic bounds.

    Reads ``jd_rubric.json`` and ``jd_facet_emb.npy`` from the ART directory.
    When ``precompute_meta.json`` is also present there, its ``semantic_p5``
    and ``semantic_p95`` entries are picked up as fixed population bounds so
    that scores stay stable regardless of upload size.

    Returns:
        A ``(rubric, facet_emb, sem_lo, sem_hi)`` tuple. ``sem_lo`` and
        ``sem_hi`` are ``None`` when the meta file is absent or does not
        contain the corresponding key.
    """
    # Context managers close each file handle right away instead of leaving
    # it for the garbage collector.
    with open(os.path.join(ART, "jd_rubric.json"), encoding="utf-8") as fh:
        rubric = json.load(fh)
    facet_emb = np.load(os.path.join(ART, "jd_facet_emb.npy"))

    # fixed population semantic bounds -> stable scores regardless of upload size
    sem_lo = sem_hi = None
    meta_path = os.path.join(ART, "precompute_meta.json")
    if os.path.exists(meta_path):
        with open(meta_path, encoding="utf-8") as fh:
            meta = json.load(fh)
        sem_lo = meta.get("semantic_p5")
        sem_hi = meta.get("semantic_p95")
    return rubric, facet_emb, sem_lo, sem_hi
42
+
43
+
44
+ def _build_art(rubric, facet_emb, sem_lo=None, sem_hi=None):
45
+ """Empty precomputed set -> every candidate is encoded on the fly.
46
+ Carries the fixed population semantic bounds so small uploads score stably."""
47
  dim = facet_emb.shape[1]
48
  return {"rubric": rubric, "ids": [], "id_to_row": {},
49
+ "cand_emb": np.zeros((0, dim), dtype=np.float32), "facet_emb": facet_emb,
50
+ "sem_lo": sem_lo, "sem_hi": sem_hi}
51
 
52
 
53
  def _parse_jsonl(text: str):
 
76
  st.markdown("Ranking runs on **CPU, no hosted LLM**. The five-component score is gated by "
77
  "JD hard-negatives and a behavioral modifier; honeypots are zeroed.")
78
 
79
+ rubric, facet_emb, sem_lo, sem_hi = _facets_and_rubric()
80
 
81
  raws = []
82
  if mode.startswith("Preloaded"):
 
95
 
96
  if raws and st.button("🔦 Rank candidates", type="primary"):
97
  with st.spinner(f"Encoding + scoring {len(raws)} candidates on CPU ..."):
98
+ art = _build_art(rubric, facet_emb, sem_lo, sem_hi)
99
  records = ranker.score_all(raws, art)
100
  mx = max((r["final_score"] for r in records), default=0.0)
101
  if mx > 0:
artifacts/precompute_meta.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "BAAI/bge-small-en-v1.5",
3
+ "dim": 384,
4
+ "n_candidates": 100000,
5
+ "n_facets": 10,
6
+ "seed": 1729,
7
+ "max_seq": 256,
8
+ "built_at": "2026-06-06T18:56:26",
9
+ "elapsed_sec": 4148.6,
10
+ "emb_dtype": "float16",
11
+ "semantic_p5": 0.5745536088943481,
12
+ "semantic_p95": 0.6435830593109131
13
+ }