| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| from transformers import AutoTokenizer |
| from huggingface_hub import hf_hub_download |
| import numpy as np, requests, torch, torch.nn.functional as F, json |
|
|
|
|
# Model + tokenizer, and the ColBERT sentence-transformers config shipped
# with the checkpoint (prefixes, lengths, and the punctuation skiplist).
model_id = "LiquidAI/LFM2-ColBert-350M"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Fix: the original `json.load(open(...))` never closed the file handle.
with open(hf_hub_download(model_id, "config_sentence_transformers.json")) as f:
    config = json.load(f)

# Token ids of skiplist words (punctuation etc.) that ColBERT masks out of
# document embeddings before scoring.
skiplist = {
    t
    for w in config["skiplist_words"]
    for t in tokenizer.encode(w, add_special_tokens=False)
}
|
|
|
|
def maxsim(q, d):
    """Late-interaction MaxSim score.

    For every query token vector in `q` (Q x D), take its best dot-product
    match over the document token vectors in `d` (T x D), then sum those
    maxima over the query tokens.
    """
    sim = torch.matmul(q, d.transpose(0, 1))  # Q x T token-pair similarities
    return sim.amax(dim=1).sum().item()
|
|
|
|
def preprocess(text, is_query):
    """Tokenize `text` with the ColBERT query/document prefix.

    Returns (token_ids, mask):
      - queries are truncated to `query_length` then right-padded with pad
        tokens up to that length (ColBERT query augmentation); mask is None.
      - documents are truncated to `document_length`; mask flags the tokens
        that are NOT in the punctuation skiplist.
    """
    prefix = config["query_prefix"] if is_query else config["document_prefix"]
    toks = tokenizer.encode(prefix + text)
    max_len = config["query_length"] if is_query else config["document_length"]
    # Fix: truncate in BOTH branches. The original only padded queries, so an
    # over-long query silently exceeded the query_length budget (and the
    # negative pad count added nothing).
    toks = toks[:max_len]
    if is_query:
        toks += [tokenizer.pad_token_id] * (max_len - len(toks))
        mask = None
    else:
        mask = [t not in skiplist for t in toks]
    return toks, mask
|
|
|
|
def embed(content, mask=None):
    """Fetch token embeddings for `content` from the local llama.cpp
    `/embedding` endpoint and return them L2-normalized as a (1, T, D) tensor.

    content: token ids to embed (sent as the request's "content" field).
    mask: optional per-token boolean keep-list; rows where the mask is False
          (skiplist tokens) are dropped before normalization.
    """
    resp = requests.post(
        "http://localhost:8080/embedding",
        json={"content": content},
    )
    # Fix: fail loudly on an HTTP error instead of crashing later with a
    # confusing KeyError/TypeError while indexing the error payload.
    resp.raise_for_status()
    emb = np.array(resp.json()[0]["embedding"])
    # Fix: test identity, not truthiness — an all-False (empty-after-filter)
    # mask is still a mask and must be applied.
    if mask is not None:
        emb = emb[mask]
    emb = torch.from_numpy(emb)
    emb = F.normalize(emb, p=2, dim=-1)
    return emb.unsqueeze(0)
|
|
|
|
docs = [
    "hi",
    "it is a bear",
    "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.",
]
query = "What is panda?"


# Embed the query once and each document once, then print MaxSim scores.
q = embed(*preprocess(query, True))
doc_embs = [embed(*preprocess(doc, False)) for doc in docs]
for doc_text, d_emb in zip(docs, doc_embs):
    score = maxsim(q.squeeze(), d_emb.squeeze())
    print(f"Score: {score:.2f} | Q: {query} | D: {doc_text}")
|
|