| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| from transformers import AutoTokenizer |
| from huggingface_hub import hf_hub_download |
| import numpy as np, requests, torch, torch.nn.functional as F, json |
|
|
|
|
# Model + tokenizer, and the ColBERT sentence-transformers config shipped
# with the checkpoint (prefixes, lengths, and the punctuation skiplist).
model_id = "LiquidAI/LFM2-ColBert-350M"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Fix: the original `json.load(open(...))` never closed the file handle.
with open(hf_hub_download(model_id, "config_sentence_transformers.json")) as f:
    config = json.load(f)

# Token ids of skiplist words (punctuation etc.) that ColBERT masks out of
# document embeddings before scoring.
skiplist = {
    t
    for w in config["skiplist_words"]
    for t in tokenizer.encode(w, add_special_tokens=False)
}
|
|
|
|
def maxsim(q, d):
    """Late-interaction MaxSim score.

    For every query token vector in `q` (Q x D), take its best dot-product
    match over the document token vectors in `d` (T x D), then sum those
    maxima over the query tokens.
    """
    sim = torch.matmul(q, d.transpose(0, 1))  # Q x T token-pair similarities
    return sim.amax(dim=1).sum().item()
|
|
|
|
def preprocess(text, is_query):
    """Tokenize `text` with the ColBERT query/document prefix.

    Returns (token_ids, mask):
      - queries are truncated to `query_length` then right-padded with pad
        tokens up to that length (ColBERT query augmentation); mask is None.
      - documents are truncated to `document_length`; mask flags the tokens
        that are NOT in the punctuation skiplist.
    """
    prefix = config["query_prefix"] if is_query else config["document_prefix"]
    toks = tokenizer.encode(prefix + text)
    max_len = config["query_length"] if is_query else config["document_length"]
    # Fix: truncate in BOTH branches. The original only padded queries, so an
    # over-long query silently exceeded the query_length budget (and the
    # negative pad count added nothing).
    toks = toks[:max_len]
    if is_query:
        toks += [tokenizer.pad_token_id] * (max_len - len(toks))
        mask = None
    else:
        mask = [t not in skiplist for t in toks]
    return toks, mask
|
|
|
|
def embed(content, mask=None):
    """Fetch token embeddings for `content` from the local llama.cpp
    `/embedding` endpoint and return them L2-normalized as a (1, T, D) tensor.

    content: token ids to embed (sent as the request's "content" field).
    mask: optional per-token boolean keep-list; rows where the mask is False
          (skiplist tokens) are dropped before normalization.
    """
    resp = requests.post(
        "http://localhost:8080/embedding",
        json={"content": content},
    )
    # Fix: fail loudly on an HTTP error instead of crashing later with a
    # confusing KeyError/TypeError while indexing the error payload.
    resp.raise_for_status()
    emb = np.array(resp.json()[0]["embedding"])
    # Fix: test identity, not truthiness — an all-False (empty-after-filter)
    # mask is still a mask and must be applied.
    if mask is not None:
        emb = emb[mask]
    emb = torch.from_numpy(emb)
    emb = F.normalize(emb, p=2, dim=-1)
    return emb.unsqueeze(0)
|
|
|
|
docs = [
    "hi",
    "it is a bear",
    "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.",
]
query = "What is panda?"


# Embed the query once and each document once, then print MaxSim scores.
q = embed(*preprocess(query, True))
doc_embs = [embed(*preprocess(doc, False)) for doc in docs]
for doc_text, d_emb in zip(docs, doc_embs):
    score = maxsim(q.squeeze(), d_emb.squeeze())
    print(f"Score: {score:.2f} | Q: {query} | D: {doc_text}")
|
|