LunaLan07 committed on
Commit
2b8c0e4
·
verified ·
1 Parent(s): 9ca21f2

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +17 -39
README.md CHANGED
@@ -35,52 +35,30 @@ BioHiCL aligns:
35
 
36
  ---
37
 
38
- ## 🚀 Usage — Text Similarity
39
 
40
  ```python
41
- from transformers import AutoTokenizer, AutoModel
42
- import torch
43
- import torch.nn.functional as F
 
 
44
 
45
- model_name = "LunaLan07/BioHiCL-base"
46
-
47
- tokenizer = AutoTokenizer.from_pretrained(model_name)
48
- model = AutoModel.from_pretrained(model_name)
49
-
50
- def encode(texts):
51
- inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
52
- outputs = model(**inputs)
53
- embeddings = outputs.last_hidden_state[:, 0] # CLS token
54
- return F.normalize(embeddings, p=2, dim=1)
55
-
56
- # Example
57
- query = encode(["What are treatments for COPD?"])
58
- doc = encode(["Chronic obstructive pulmonary disease is treated with bronchodilators."])
59
-
60
- similarity = (query @ doc.T).item()
61
- print(similarity)
62
 
63
-
64
-
65
- ```python
66
- from transformers import AutoTokenizer, AutoModel
67
- import torch
68
- import torch.nn.functional as F
69
 
70
  model_name = "LunaLan07/BioHiCL-base"
 
 
 
 
71
 
72
- tokenizer = AutoTokenizer.from_pretrained(model_name)
73
- model = AutoModel.from_pretrained(model_name)
 
74
 
75
- def encode(texts):
76
- inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
77
- outputs = model(**inputs)
78
- embeddings = outputs.last_hidden_state[:, 0] # CLS token
79
- return F.normalize(embeddings, p=2, dim=1)
80
 
81
- # Example
82
- query = encode(["What are treatments for COPD?"])
83
- doc = encode(["Chronic obstructive pulmonary disease is treated with bronchodilators."])
84
 
85
- similarity = (query @ doc.T).item()
86
- print(similarity)
 
35
 
36
  ---
37
 
38
+ ## 🚀 Usage — Evaluation on BEIR Benchmark
39
 
40
  ```python
41
+ from beir import util
42
+ from beir.datasets.data_loader import GenericDataLoader
43
+ from beir.retrieval.models import SentenceBERT
44
+ from beir.retrieval.search.dense import DenseRetrievalExactSearch
45
+ from beir.retrieval.evaluation import EvaluateRetrieval
46
 
47
+ dataset = "scifact"
48
+ url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/scifact.zip"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
+ data_path = util.download_and_unzip(url, "datasets")
51
+ corpus, queries, qrels = GenericDataLoader(data_path).load(split="test")
 
 
 
 
52
 
53
  model_name = "LunaLan07/BioHiCL-base"
54
+ model = SentenceBERT(model_name)
55
+
56
+ retriever = DenseRetrievalExactSearch(model, batch_size=16)
57
+ results = retriever.search(corpus, queries, top_k=10, score_function="cos_sim")
58
 
59
+ ndcg, _map, recall, precision = EvaluateRetrieval.evaluate(
60
+ qrels, results, k_values=[1, 3, 5, 10]
61
+ )
62
 
 
 
 
 
 
63
 
 
 
 
64