| from __future__ import annotations
|
|
|
| import json
|
| import logging
|
| import os
|
| import pickle
|
| import sys
|
| import time
|
|
|
| _SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__))
|
| _PROJECT_ROOT = os.path.dirname(_SCRIPTS_DIR)
|
| _SRC_DIR = os.path.join(_PROJECT_ROOT, "src")
|
| for _p in [_SRC_DIR, _PROJECT_ROOT]:
|
| if _p not in sys.path:
|
| sys.path.insert(0, _p)
|
|
|
| import numpy as np
|
|
|
| logging.basicConfig(
|
| level=logging.INFO,
|
| format="%(asctime)s %(levelname)s %(message)s",
|
| datefmt="%H:%M:%S",
|
| )
|
| logger = logging.getLogger(__name__)
|
|
|
| BASE_DIR = _PROJECT_ROOT
|
| PRECOMPUTED_DIR = os.path.join(BASE_DIR, "precomputed")
|
| CANDIDATES_PATH = os.path.join(BASE_DIR, "candidates.jsonl")
|
|
|
|
|
|
|
| def build_numpy_bm25_artifacts(bm25, precomputed_dir: str) -> None:
|
| """
|
| Build scipy sparse BM25 score matrix from an existing BM25Okapi object.
|
|
|
| Saves:
|
| vocab.pkl - {term: row_index} mapping (tiny, fast to load)
|
| bm25_matrix.npz - scipy sparse CSR (vocab_size × n_docs), float32
|
| Each entry [term_idx, doc_idx] = precomputed
|
| idf(term) × bm25_tf_adjusted(term, doc)
|
|
|
| Scoring at runtime:
|
| q_vec (1 × vocab_size) @ bm25_matrix (vocab_size × n_docs)
|
| → (1 × n_docs) dense result in a single scipy sparse op (<10 ms).
|
| """
|
| try:
|
| from scipy.sparse import coo_matrix, save_npz
|
| except ImportError:
|
| import subprocess
|
| subprocess.check_call([sys.executable, "-m", "pip", "install", "scipy"])
|
| from scipy.sparse import coo_matrix, save_npz
|
|
|
| logger.info("Building NumPy sparse BM25 matrix …")
|
| t0 = time.perf_counter()
|
|
|
| k1: float = getattr(bm25, "k1", 1.5)
|
| b: float = getattr(bm25, "b", 0.75)
|
| avgdl: float = float(bm25.avgdl)
|
| doc_len_arr = np.array(bm25.doc_len, dtype=np.float32)
|
| n_docs: int = int(bm25.corpus_size)
|
|
|
|
|
| vocab: dict = {term: idx for idx, term in enumerate(bm25.idf.keys())}
|
| idf_array = np.array([bm25.idf[term] for term in vocab], dtype=np.float32)
|
| n_vocab: int = len(vocab)
|
|
|
| logger.info(" vocab_size=%d n_docs=%d", n_vocab, n_docs)
|
|
|
| rows_list: list = []
|
| cols_list: list = []
|
| data_list: list = []
|
|
|
| checkpoint = max(1, n_docs // 10)
|
| for doc_idx, doc_freq_dict in enumerate(bm25.doc_freqs):
|
| dl = float(doc_len_arr[doc_idx])
|
| denom_k = k1 * (1.0 - b + b * dl / avgdl)
|
| for term, tf in doc_freq_dict.items():
|
| term_idx = vocab.get(term)
|
| if term_idx is None:
|
| continue
|
| tf_f = float(tf)
|
| tf_adj = (tf_f * (k1 + 1.0)) / (tf_f + denom_k)
|
| rows_list.append(term_idx)
|
| cols_list.append(doc_idx)
|
| data_list.append(float(idf_array[term_idx]) * tf_adj)
|
| if doc_idx % checkpoint == 0 and doc_idx > 0:
|
| logger.info(" … %d / %d docs processed", doc_idx, n_docs)
|
|
|
| nnz = len(data_list)
|
| logger.info(" COO built: nnz=%d (%.1f s)", nnz, time.perf_counter() - t0)
|
|
|
| bm25_matrix = coo_matrix(
|
| (
|
| np.array(data_list, dtype=np.float32),
|
| (np.array(rows_list, dtype=np.int32),
|
| np.array(cols_list, dtype=np.int32)),
|
| ),
|
| shape=(n_vocab, n_docs),
|
| ).tocsr()
|
|
|
| elapsed = time.perf_counter() - t0
|
| logger.info(" CSR matrix: shape=%s nnz=%d (%.1f s total)",
|
| bm25_matrix.shape, bm25_matrix.nnz, elapsed)
|
|
|
| vocab_path = os.path.join(precomputed_dir, "vocab.pkl")
|
| matrix_path = os.path.join(precomputed_dir, "bm25_matrix.npz")
|
|
|
| with open(vocab_path, "wb") as f:
|
| pickle.dump(vocab, f, protocol=pickle.HIGHEST_PROTOCOL)
|
| save_npz(matrix_path, bm25_matrix)
|
|
|
| logger.info(" Saved vocab.pkl (%d terms)", n_vocab)
|
| logger.info(" Saved bm25_matrix.npz (%.1f MB)",
|
| os.path.getsize(matrix_path) / 1e6)
|
|
|
|
|
|
|
| def build_candidate_offset_index(candidates_path: str, precomputed_dir: str) -> None:
|
| """
|
| Scan candidates.jsonl once in binary mode and record the byte offset of
|
| each candidate_id.
|
|
|
| Saves candidate_offsets.pkl: {candidate_id: byte_offset}
|
|
|
| At runtime Stage 2 uses f.seek(offset) + f.readline() for each of the
|
| ~8500 stage-1 candidates instead of streaming all 487 MB. Reduces
|
| Stage 2 from ~4 s to ~0.1–0.3 s.
|
| """
|
| logger.info("Building candidate byte-offset index …")
|
| t0 = time.perf_counter()
|
|
|
| offsets: dict = {}
|
| size_bytes = os.path.getsize(candidates_path)
|
|
|
| with open(candidates_path, "rb") as f:
|
| while True:
|
| offset = f.tell()
|
| raw_line = f.readline()
|
| if not raw_line:
|
| break
|
| stripped = raw_line.strip()
|
| if not stripped:
|
| continue
|
| try:
|
| cid = json.loads(stripped).get("candidate_id")
|
| if cid:
|
| offsets[cid] = offset
|
| except json.JSONDecodeError:
|
| pass
|
|
|
| if len(offsets) % 10_000 == 0 and len(offsets) > 0:
|
| pct = f.tell() / size_bytes * 100
|
| logger.info(" … %d candidates indexed (%.0f%% of file)", len(offsets), pct)
|
|
|
| elapsed = time.perf_counter() - t0
|
| logger.info(" Offset index: %d candidates in %.1f s", len(offsets), elapsed)
|
|
|
| out_path = os.path.join(precomputed_dir, "candidate_offsets.pkl")
|
| with open(out_path, "wb") as f:
|
| pickle.dump(offsets, f, protocol=pickle.HIGHEST_PROTOCOL)
|
| logger.info(" Saved candidate_offsets.pkl (%.1f MB)",
|
| os.path.getsize(out_path) / 1e6)
|
|
|
|
|
|
|
| def export_lgbm_native(precomputed_dir: str) -> None:
|
| """
|
| Re-save lgbm_model.pkl in LightGBM's native text format.
|
| lgb.Booster(model_file=...) loads ~10-20x faster than pickle.
|
| """
|
| import lightgbm as lgb
|
|
|
| pkl_path = os.path.join(precomputed_dir, "lgbm_model.pkl")
|
| txt_path = os.path.join(precomputed_dir, "lgbm_model.txt")
|
|
|
| logger.info("Exporting LightGBM model to native text format …")
|
| t0 = time.perf_counter()
|
|
|
| with open(pkl_path, "rb") as f:
|
| model = pickle.load(f)
|
|
|
| model.save_model(txt_path)
|
| logger.info(" Saved lgbm_model.txt (%.1f MB) in %.2f s",
|
| os.path.getsize(txt_path) / 1e6,
|
| time.perf_counter() - t0)
|
|
|
| def main() -> None:
|
| logger.info("=" * 60)
|
| logger.info("REBUILD FAST ARTIFACTS")
|
| logger.info("=" * 60)
|
| t_total = time.perf_counter()
|
|
|
| bm25_pkl = os.path.join(PRECOMPUTED_DIR, "bm25_index.pkl")
|
| logger.info("Loading bm25_index.pkl (%.1f MB) …",
|
| os.path.getsize(bm25_pkl) / 1e6)
|
| t0 = time.perf_counter()
|
| with open(bm25_pkl, "rb") as f:
|
| bm25 = pickle.load(f)
|
| logger.info(" Loaded in %.2f s", time.perf_counter() - t0)
|
|
|
| build_numpy_bm25_artifacts(bm25, PRECOMPUTED_DIR)
|
|
|
|
|
| build_candidate_offset_index(CANDIDATES_PATH, PRECOMPUTED_DIR)
|
|
|
| export_lgbm_native(PRECOMPUTED_DIR)
|
|
|
| logger.info("=" * 60)
|
| logger.info("ALL ARTIFACTS BUILT in %.1f s", time.perf_counter() - t_total)
|
| logger.info("New files in precomputed/:")
|
| for fname in ["vocab.pkl", "bm25_matrix.npz", "candidate_offsets.pkl", "lgbm_model.txt"]:
|
| fpath = os.path.join(PRECOMPUTED_DIR, fname)
|
| if os.path.isfile(fpath):
|
| logger.info(" %-30s %.1f MB", fname, os.path.getsize(fpath) / 1e6)
|
| logger.info("rank.py will auto-detect these and use the fast paths.")
|
| logger.info("=" * 60)
|
|
|
|
|
| if __name__ == "__main__":
|
| main()
|
|
|