livctr committed on
Commit
71a1c38
·
1 Parent(s): e40e7ff

add core recommendation logic

Browse files
Files changed (2) hide show
  1. core/__init__.py +0 -0
  2. core/recommender.py +128 -0
core/__init__.py ADDED
File without changes
core/recommender.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import Counter, defaultdict
2
+ import json
3
+ from operator import itemgetter
4
+ from typing import List
5
+
6
+ from datasets import Dataset
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from transformers import AutoTokenizer, AutoModel
10
+
11
+
12
class EmbeddingProcessor:
    """Computes sentence embeddings for paper titles/abstracts with an SBERT-style model."""

    def __init__(self,
                 model_name: str = 'sentence-transformers/all-mpnet-base-v2',
                 custom_model_name: str = 'salsabiilashifa11/sbert-paper'):
        # NOTE(review): the tokenizer comes from `model_name` while the weights come
        # from `custom_model_name`; this assumes the fine-tuned checkpoint kept the
        # base model's tokenizer/vocabulary — confirm the two are compatible.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(custom_model_name)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        self.model.to(self.device)
        # Only touch the CUDA allocator when a GPU is actually in use.
        if self.device.type == "cuda":
            torch.cuda.empty_cache()

    @staticmethod
    def mean_pooling(model_output, attention_mask):
        """Mask-aware mean pooling over token embeddings.

        model_output: model forward output; element 0 holds the token embeddings
            of shape (batch, seq_len, hidden).
        attention_mask: (batch, seq_len) mask where padded positions are 0.
        Returns a (batch, hidden) tensor of per-sequence averaged embeddings.
        """
        # First element of model_output contains all token embeddings
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # Clamp avoids division by zero for an all-padding row.
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def get_embeddings(self, batch):
        """Return L2-normalized embeddings (list of lists of floats) for a batch
        dict with parallel "title" and "abstract" string lists."""
        title_tkn, abstract_tkn = " [TITLE] ", " [ABSTRACT] "
        texts = [title_tkn + t + abstract_tkn + a
                 for t, a in zip(batch["title"], batch["abstract"])]

        # Tokenize sentences and move the tensors to the model's device.
        encoded_input = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
        encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}

        # Compute token embeddings without autograd bookkeeping.
        with torch.no_grad():
            model_output = self.model(**encoded_input)

        # Perform pooling, then normalize so dot products equal cosine similarity.
        embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
        embeddings = F.normalize(embeddings, p=2, dim=1)

        # Move embeddings to CPU and convert to plain lists so the result is
        # serializable by `datasets`.
        return embeddings.cpu().numpy().tolist()

    def process_dataset(self, dataset_path: str, save_path: str, batch_size: int = 128):
        """Load a saved `datasets.Dataset`, add an "embeddings" column, and save it.

        dataset_path: on-disk dataset with "title" and "abstract" columns.
        save_path: destination directory for the augmented dataset.
        batch_size: number of rows embedded per forward pass.
        """
        ds = Dataset.load_from_disk(dataset_path)

        # Compute embeddings and add them as a new column.
        ds_with_embeddings = ds.map(
            lambda x: {"embeddings": self.get_embeddings(x)},
            batched=True,
            batch_size=batch_size,
        )

        # Save the updated dataset. (Removed a redundant `save_path = save_path`
        # self-assignment that was here.)
        ds_with_embeddings.save_to_disk(save_path)
        print(f"Dataset with embeddings saved to {save_path}")
65
+
66
# NOTE(review): this import belongs in the import block at the top of the file;
# kept here so this change stands alone.
import os


class Recommender:
    """Recommends professors by embedding a free-text query and ranking the
    authors of the most similar papers."""

    def __init__(self,
                 embedding_processor: EmbeddingProcessor,
                 frontend_embds_path: str = "data/frontend_data/all-mpnet-base-v2-embds",
                 frontend_id2professor_path: str = "data/frontend_data/arxiv_id2professor.json",
                 frontend_us_professor_path: str = "data/frontend_data/us_professor.json",
                 ):
        self.embedding_processor = embedding_processor
        # Paper metadata (id/title/authors), assumed row-aligned with `self.embds`
        # — TODO confirm the two artifacts were produced together.
        self.ita = Dataset.load_from_disk(os.path.join(frontend_embds_path, "id_title_author"))
        # Precomputed paper embeddings; weights_only=True refuses arbitrary-code
        # pickle payloads.
        self.embds = torch.load(os.path.join(frontend_embds_path, "weights.pt"), weights_only=True)

        # with open(frontend_id2professor_path, 'r') as f:
        #     self.id2professors = json.load(f)
        with open(frontend_us_professor_path, 'r') as f:
            # dictionary with professor names as keys and their metadata as values
            self.us_professor_profiles = json.load(f)

    def get_top_k(self, query: str, top_k: int = 5):
        """Returns the top indices of papers most similar to the query."""
        query_batch = {'title': [query], 'abstract': [""]}
        query_embd = torch.Tensor(self.embedding_processor.get_embeddings(query_batch)[0])
        # Embeddings are L2-normalized by EmbeddingProcessor, so a dot product
        # is cosine similarity.
        sim = self.embds @ query_embd
        return torch.argsort(sim, descending=True)[:top_k]

    def get_recommended_data(self, top_indices: torch.Tensor):
        """Returns a list of dictionaries with professors corresponding to their information."""
        selected = self.ita.select(top_indices)
        professors = [prof for paper_authors in selected["authors"] for prof in paper_authors]

        # Rank professors first by number of appearances (descending), then by
        # order of first appearance. Record first-occurrence positions in one
        # pass instead of calling list.index() inside the sort key, which made
        # the ranking O(n^2).
        first_seen = {}
        for pos, name in enumerate(professors):
            first_seen.setdefault(name, pos)
        counts = Counter(professors)
        ranked_professors = sorted(counts, key=lambda name: (-counts[name], first_seen[name]))

        # Map each professor to the (paper id, paper title) pairs they authored.
        professor2ids = defaultdict(list)
        for pid_, pt, pauthors in zip(selected['id'], selected['title'], selected['authors']):
            for prof in pauthors:
                professor2ids[prof].append((pid_, pt))

        # Build professor metadata.
        # NOTE(review): raises KeyError if an author is absent from
        # us_professor.json — presumably the datasets are kept in sync; verify.
        data = []
        for prof in ranked_professors:
            profile = self.us_professor_profiles[prof]
            data.append({
                "name": prof,
                "title": profile["title"],
                "department": profile["department"],
                "university": profile["university"],
                "papers": professor2ids[prof],
            })
        return data
120
+
121
+
122
if __name__ == "__main__":
    # Smoke-test the pipeline end to end: embed a query, retrieve the ten most
    # similar papers, and print the ranked professor metadata.
    processor = EmbeddingProcessor()
    rec = Recommender(processor)

    query = "What is the most important aspect of machine learning in computer science?"
    top_indices = rec.get_top_k(query, top_k=10)
    print(rec.get_recommended_data(top_indices))