Andreas99 committed
Commit 10d949e · verified · 1 Parent(s): 8dd388d

Delete retriever

Files changed (1)
  1. retriever/retriever.py +0 -129
retriever/retriever.py DELETED
@@ -1,129 +0,0 @@
-from transformers import AutoTokenizer, AutoModel
-from sklearn.metrics.pairwise import cosine_similarity
-import json
-import torch
-from tqdm import tqdm
-import os
-import pandas as pd
-import numpy as np
-from datasets import load_dataset
-from utils.utils import read_yaml_file
-
-
-def generate_topic_level_embeddings(model, tokenizer, paper_list, tmp_id_2_abs):
-    id2topics = {
-        entry["paper_id"]: [entry["Level 1"], entry["Level 2"], entry["Level 3"]]
-        for entry in tmp_id_2_abs['train']
-    }
-
-    for topic_level in ['Level 1', 'Level 2', 'Level 3']:
-        i = 0
-        batch_size = 2048
-        candidate_emb_list = []
-        pbar = tqdm(total=len(paper_list))
-        while i < len(paper_list):
-            yield i / len(paper_list) / 3 if topic_level == 'Level 1' else (0.33 + i / len(paper_list) / 3 if topic_level == 'Level 2' else 0.66 + i / len(paper_list) / 3)
-            paper_batch = paper_list[i:i+batch_size]
-            paper_text_batch = []
-            for paper_id in paper_batch:
-                topics = id2topics[paper_id][int(topic_level[6]) - 1]
-                topic_text = ''
-                for t in topics:
-                    topic_text += t + ','
-                paper_text_batch.append(topic_text)
-            inputs = tokenizer(paper_text_batch, return_tensors='pt', padding=True, truncation=True)
-            with torch.no_grad():
-                outputs = model(**inputs.to('cuda'))
-            candidate_embeddings = outputs.last_hidden_state[:, 0, :].cpu()
-            candidate_embeddings = candidate_embeddings.reshape(-1, 1024)
-            candidate_emb_list.append(candidate_embeddings)
-
-            i += len(candidate_embeddings)
-            pbar.update(len(candidate_embeddings))
-
-        all_candidate_embs = torch.cat(candidate_emb_list, 0)
-
-        df = pd.DataFrame({
-            "paper_id": paper_list,
-            "embedding": list(all_candidate_embs.numpy())
-        })
-
-        if not os.path.exists('datasets/topic_level_embeds'):
-            os.makedirs('datasets/topic_level_embeds')
-
-        df.to_parquet(f'datasets/topic_level_embeds/{topic_level}_emb.parquet', engine='pyarrow', compression='snappy')
-
-    all_candidate_embs_L1 = torch.tensor(np.array(pd.read_parquet('datasets/topic_level_embeds/Level 1_emb.parquet')['embedding'].tolist()))
-    all_candidate_embs_L2 = torch.tensor(np.array(pd.read_parquet('datasets/topic_level_embeds/Level 2_emb.parquet')['embedding'].tolist()))
-    all_candidate_embs_L3 = torch.tensor(np.array(pd.read_parquet('datasets/topic_level_embeds/Level 3_emb.parquet')['embedding'].tolist()))
-    all_candidate_embs = all_candidate_embs_L1 + all_candidate_embs_L2 + all_candidate_embs_L3
-
-    df = pd.DataFrame({
-        "paper_id": paper_list,
-        "embedding": list(all_candidate_embs.numpy())
-    })
-
-    df.to_parquet('datasets/topic_level_embeds/arxiv_papers_embeds.parquet', engine='pyarrow', compression='snappy')
-
-
-
-def retriever(query, retrieval_nodes_path):
-    yield 0
-    config = read_yaml_file('configs/config.yaml')
-
-    # Load the model and tokenizer to generate the embeddings
-    embedder_name = config['retriever']['embedder']
-    tokenizer = AutoTokenizer.from_pretrained(embedder_name)
-    model = AutoModel.from_pretrained(embedder_name).to(device='cuda', dtype=torch.float16)
-
-
-    # Load the arXiv dataset
-    tmp_id_2_abs = load_dataset("AliMaatouk/arXiv_Topics", cache_dir="datasets/arxiv_topics")
-    paper_list = list(tmp_id_2_abs['train']['paper_id'])
-
-
-    # Generate the query embeddings
-    inputs = tokenizer([query], return_tensors='pt', padding=True, truncation=True)
-    with torch.no_grad():
-        outputs = model(**inputs.to('cuda'))
-    query_embeddings = outputs.last_hidden_state[:, 0, :].cpu()
-
-    # Generate the candidate embeddings
-    # Load the embeddings from the dataset, otherwise generate the embeddings and save them
-    if config['retriever']['load_arxiv_embeds']:
-        dataset = load_dataset("AliMaatouk/arXiv-Topics-Embeddings", cache_dir="datasets/topic_level_embeds")
-        table = dataset["train"].data  # Get PyArrow Table
-        all_candidate_embs = table.column("embedding").to_numpy()
-    else:
-        # If the file does not exist, generate the embeddings; otherwise, load the embeddings
-        if not os.path.exists('datasets/topic_level_embeds/arxiv_papers_embeds.parquet'):
-            yield from generate_topic_level_embeddings(model, tokenizer, paper_list, tmp_id_2_abs)
-
-        all_candidate_embs = torch.tensor(np.array(pd.read_parquet('datasets/topic_level_embeds/arxiv_papers_embeds.parquet')['embedding'].tolist()))
-        all_candidate_embs = all_candidate_embs.cpu().numpy()
-
-    all_candidate_embs = np.stack(all_candidate_embs)
-
-
-    # Calculate the cosine similarity between the query and all candidate embeddings
-    query_embeddings = np.array(query_embeddings)
-    similarity_scores = cosine_similarity(query_embeddings, all_candidate_embs)[0]
-
-
-    # Sort the papers by similarity scores and select the top K papers
-    id_score_list = []
-    for i in range(len(paper_list)):
-        id_score_list.append([paper_list[i], similarity_scores[i]])
-
-    sorted_scores = sorted(id_score_list, key=lambda pair: pair[-1], reverse=True)
-    top_K_paper = [sample[0] for sample in sorted_scores[:config['retriever']['num_retrievals']]]
-
-    papers_results = {
-        paper: True
-        for paper in top_K_paper
-    }
-
-    with open(retrieval_nodes_path, 'w') as f:
-        json.dump(papers_results, f)
-
-    yield 1.0
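
Note that the deleted retriever() was written as a generator: each yielded value is a completion fraction in [0, 1] intended to drive a progress bar, and by the time it yields 1.0 the top-K paper IDs have been written to retrieval_nodes_path as a {paper_id: true} JSON map. A minimal sketch of how a caller might have consumed it, assuming the pre-deletion module path; the query string and output path below are illustrative, not taken from the repo:

import json

from retriever.retriever import retriever  # module path as it existed before this commit

query = "LLMs for wireless networks"         # hypothetical query
nodes_path = "outputs/retrieval_nodes.json"  # hypothetical output path

# Drain the generator; each yielded value is a progress fraction in [0, 1].
for progress in retriever(query, nodes_path):
    print(f"retrieval progress: {progress:.0%}")

# On completion, the top-K paper IDs are stored as {paper_id: true}.
with open(nodes_path) as f:
    retrieved = json.load(f)
print(list(retrieved.keys())[:5])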