# scifact-relevance-classifier / scifact_features.py
# Repository page header: andreiaalexa, "upload scifact_features.py",
# commit 37804fb (verified).
"""Embedding feature builders for claim-document relevance classification."""
from __future__ import annotations
import numpy as np
def e5_queries(texts: list[str]) -> list[str]:
    """Return a new list with ``"query: "`` prepended to every text.

    Order and length of ``texts`` are preserved; the input is not modified.
    The function name suggests the E5 embedding convention of tagging the
    query side of a pair -- confirm the encoder in use expects this prefix.
    """
    return [f"query: {entry}" for entry in texts]
def e5_passages(texts: list[str]) -> list[str]:
    """Return a new list with ``"passage: "`` prepended to every text.

    Order and length of ``texts`` are preserved; the input is not modified.
    The function name suggests the E5 embedding convention of tagging the
    document side of a pair -- confirm the encoder in use expects this prefix.
    """
    return [f"passage: {entry}" for entry in texts]
def pair_features(
    model,
    claims: list[str],
    documents: list[str],
    show_progress_bar: bool = False,
) -> np.ndarray:
    """Build standard sentence-pair features from two embedding vectors.

    The claim at position ``i`` is paired with the document at position
    ``i``. Both sides are encoded with ``normalize_embeddings=True`` and the
    resulting embeddings ``q`` (claims) and ``d`` (documents) are combined:

    * ``q`` and ``d`` alone give the classifier raw semantic position.
    * ``abs(q - d)`` exposes distance dimensions.
    * ``q * d`` exposes alignment dimensions.
    * ``cosine`` gives a single retrieval-style similarity signal. It is
      computed as the row-wise dot product, which equals cosine similarity
      when the embeddings are unit length.

    Args:
        model: Encoder exposing ``encode(texts, normalize_embeddings=...,
            show_progress_bar=...)``. Presumably a sentence-transformers
            model returning one ``(n, dim)`` NumPy array per call -- confirm
            against the caller.
        claims: Claim texts; passed through ``e5_queries`` before encoding.
        documents: Document texts aligned one-to-one with ``claims``; passed
            through ``e5_passages`` before encoding.
        show_progress_bar: Forwarded unchanged to both ``encode`` calls.

    Returns:
        The horizontal stack ``[q, d, |q - d|, q * d, cosine]`` with one row
        per pair. For ``(n, dim)`` embeddings the width is ``4 * dim + 1``.

    Raises:
        ValueError: If ``claims`` and ``documents`` differ in length.
    """
    # Materialise once so the length check also works for lazy iterables.
    claims = list(claims)
    documents = list(documents)

    # Fail fast: a mismatch would otherwise surface only after both (costly)
    # encode calls, as an opaque NumPy shape error -- and a length-1 side
    # would silently broadcast in the element-wise operations first.
    if len(claims) != len(documents):
        raise ValueError(
            "claims and documents must have the same length, "
            f"got {len(claims)} claims and {len(documents)} documents"
        )

    q = model.encode(
        e5_queries(claims),
        normalize_embeddings=True,
        show_progress_bar=show_progress_bar,
    )
    d = model.encode(
        e5_passages(documents),
        normalize_embeddings=True,
        show_progress_bar=show_progress_bar,
    )

    # Compute the element-wise product once; it is both a feature block and
    # the summand of the cosine column.
    product = q * d
    cosine = np.sum(product, axis=1, keepdims=True)
    return np.hstack([q, d, np.abs(q - d), product, cosine])