Perunio committed on
Commit
f2f3107
·
verified ·
1 Parent(s): 4029143

Upload paper_similarity.py

Browse files
Files changed (1) hide show
  1. model/paper_similarity.py +315 -0
model/paper_similarity.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataset.ogbn_link_pred_dataset import OGBNLinkPredDataset
2
+ from sklearn.feature_extraction.text import TfidfVectorizer
3
+ from sklearn.metrics.pairwise import cosine_similarity
4
+ import numpy as np
5
+ from sentence_transformers import SentenceTransformer
6
+ from typing import List, Tuple
7
+ import re
8
+ import os
9
+
10
+
11
class PaperSimilarityFinder:
    """Find the most similar papers in a corpus based on title and abstract.

    Supports two similarity back-ends — TF-IDF vectors and sentence-transformer
    embeddings — and exposes the citation graph of the underlying dataset.
    """

    def __init__(
        self,
        dataset,
        method="tfidf",
        model_name="all-MiniLM-L6-v2",
        embeddings_cache_path=".",
    ):
        """
        Initialize the similarity finder.

        Args:
            dataset: An OGBNLinkPredDataset-like object exposing ``corpus``
                (list of paper texts) and ``data.edge_index`` (citation edges).
            method: 'tfidf' or 'sentence_transformer'.
            model_name: Sentence-transformer model name (used when
                method == 'sentence_transformer', or lazily by
                ``compare_methods``).
            embeddings_cache_path: Directory used for caching embeddings.

        Raises:
            ValueError: If ``method`` is not one of the supported values.
        """
        self.dataset = dataset
        self.method = method
        self.corpus = dataset.corpus
        self.model_name = model_name
        self.embeddings_cache_path = embeddings_cache_path

        self._load_citations()

        if method == "tfidf":
            self._setup_tfidf()
        elif method == "sentence_transformer":
            self.model = SentenceTransformer(model_name)
            self._setup_sentence_embeddings()
        else:
            raise ValueError("Method must be 'tfidf' or 'sentence_transformer'")

    def _load_citations(self):
        """Build citing/cited-by adjacency dicts from the dataset edge index."""
        self.citations = {}  # paper index -> list of papers it cites
        self.cited_by = {}   # paper index -> list of papers citing it

        edge_index = self.dataset.data.edge_index

        # Each column of edge_index is one directed edge: citing -> cited.
        for i in range(edge_index.shape[1]):
            citing_paper = edge_index[0, i].item()
            cited_paper = edge_index[1, i].item()

            self.citations.setdefault(citing_paper, []).append(cited_paper)
            self.cited_by.setdefault(cited_paper, []).append(citing_paper)

    @staticmethod
    def _preprocess_text(text: str) -> str:
        """Collapse whitespace and strip bracketed citation markers like ``[12]``."""
        text = re.sub(r"\s+", " ", text.strip())
        text = re.sub(r"\[\d+]", "", text)
        return text

    def _setup_tfidf(self):
        """Fit a TF-IDF vectorizer on the corpus and store the corpus vectors."""
        print("Setting up TF-IDF vectorization...")

        processed_corpus = [self._preprocess_text(doc) for doc in self.corpus]

        self.vectorizer = TfidfVectorizer(
            max_features=10000,
            stop_words="english",
            ngram_range=(1, 2),
            min_df=2,   # ignore terms in fewer than 2 documents
            max_df=0.8,  # ignore terms in more than 80% of documents
        )

        self.corpus_vectors = self.vectorizer.fit_transform(processed_corpus)
        print(f"TF-IDF setup complete. Corpus shape: {self.corpus_vectors.shape}")

    def _setup_sentence_embeddings(self):
        """Compute (or load cached) sentence-transformer embeddings for the corpus."""

        os.makedirs(self.embeddings_cache_path, exist_ok=True)

        # Cache file is keyed by model name so different models don't collide.
        cache_filename = f"corpus_embeddings_{self.model_name.replace('/', '_')}.npy"
        cache_filepath = os.path.join(self.embeddings_cache_path, cache_filename)

        if os.path.exists(cache_filepath):
            print(f"Loading sentence embeddings from cache: {cache_filepath}")
            self.corpus_embeddings = np.load(cache_filepath)
        else:
            print("Computing sentence embeddings for corpus...")

            batch_size = 100
            embeddings = []

            # Encode in batches to bound memory use on large corpora.
            for i in range(0, len(self.corpus), batch_size):
                batch = self.corpus[i : i + batch_size]
                batch_embeddings = self.model.encode(batch, show_progress_bar=True)
                embeddings.append(batch_embeddings)

            self.corpus_embeddings = np.vstack(embeddings)

            # Save the embeddings to the cache file for future runs.
            np.save(cache_filepath, self.corpus_embeddings)
            print(f"Sentence embeddings computed and saved to cache: {cache_filepath}")

        print(f"Sentence embeddings complete. Shape: {self.corpus_embeddings.shape}")

    def find_similar_papers(
        self, title: str, abstract: str, top_k: int = 10
    ) -> List[Tuple[int, float, str]]:
        """
        Find most similar papers to given title and abstract.

        Args:
            title: Title of your paper
            abstract: Abstract of your paper
            top_k: Number of top similar papers to return

        Returns:
            List of tuples: (paper_index, similarity_score, paper_text)
        """
        query_text = f"{title}\n {abstract}"

        if self.method == "tfidf":
            return self._find_similar_tfidf(query_text, top_k)
        elif self.method == "sentence_transformer":
            return self._find_similar_sentence_transformer(query_text, top_k)

    def _top_k_results(self, similarities, top_k: int) -> List[Tuple[int, float, str]]:
        """Return (index, score, text) triples for the top_k highest scores,
        best first. numpy scalars are converted to plain Python int/float."""
        top_indices = np.argsort(similarities)[-top_k:][::-1]
        return [
            (int(idx), float(similarities[idx]), self.corpus[idx])
            for idx in top_indices
        ]

    def _find_similar_tfidf(
        self, query_text: str, top_k: int
    ) -> List[Tuple[int, float, str]]:
        """Rank corpus papers against the query using TF-IDF cosine similarity."""
        processed_query = self._preprocess_text(query_text)

        query_vector = self.vectorizer.transform([processed_query])

        similarities = cosine_similarity(query_vector, self.corpus_vectors).flatten()

        return self._top_k_results(similarities, top_k)

    def _find_similar_sentence_transformer(
        self, query_text: str, top_k: int
    ) -> List[Tuple[int, float, str]]:
        """Rank corpus papers against the query using sentence embeddings."""
        query_embedding = self.model.encode([query_text])

        similarities = cosine_similarity(
            query_embedding, self.corpus_embeddings
        ).flatten()

        return self._top_k_results(similarities, top_k)

    def get_paper_citations(self, paper_idx: int) -> Tuple[List[int], List[int]]:
        """
        Get citations for a specific paper.

        Args:
            paper_idx: Index of the paper in the dataset

        Returns:
            Tuple of (papers_this_cites, papers_that_cite_this); either list
            is empty when the paper has no edges in that direction.
        """
        papers_cited = self.citations.get(paper_idx, [])
        papers_citing = self.cited_by.get(paper_idx, [])

        return papers_cited, papers_citing

    def find_most_similar_with_citations(self, title: str, abstract: str) -> dict:
        """
        Find the most similar paper and return its citation information.

        Args:
            title: Title of your paper
            abstract: Abstract of your paper

        Returns:
            Dictionary with similarity info and citations, or an ``error`` key
            when no similar paper could be found.
        """
        similar_papers = self.find_similar_papers(title, abstract, top_k=1)

        if not similar_papers:
            return {"error": "No similar papers found"}

        most_similar_idx, similarity_score, paper_text = similar_papers[0]

        papers_cited, papers_citing = self.get_paper_citations(most_similar_idx)

        # Preview up to 5 cited papers (first 200 chars each).
        cited_papers_text = []
        for cited_idx in papers_cited[:5]:
            if cited_idx < len(self.corpus):
                cited_papers_text.append(
                    {
                        "index": cited_idx,
                        "text": self.corpus[cited_idx][:200] + "...",
                    }
                )

        return {
            "most_similar_paper": {
                "index": most_similar_idx,
                "similarity_score": float(similarity_score),
                "text": paper_text,
            },
            "citation_stats": {
                "num_papers_this_cites": len(papers_cited),
                "num_papers_citing_this": len(papers_citing),
                "total_citation_network_size": len(papers_cited) + len(papers_citing),
            },
            "papers_this_cites": papers_cited,
            "papers_citing_this": papers_citing,
            "sample_cited_papers": cited_papers_text,
        }

    def compare_methods(self, title: str, abstract: str, top_k: int = 5):
        """Compare TF-IDF vs sentence embeddings on the same query.

        Lazily sets up whichever back-end was not initialized in ``__init__``.
        BUG FIX: previously this crashed with AttributeError when the finder
        was constructed with method='tfidf', because ``self.model`` only
        exists after the sentence-transformer init branch.
        """
        if not hasattr(self, 'corpus_vectors'):
            self._setup_tfidf()
        if not hasattr(self, 'corpus_embeddings'):
            # Create the encoder on demand for TF-IDF-initialized finders.
            if not hasattr(self, 'model'):
                self.model = SentenceTransformer(self.model_name)
            self._setup_sentence_embeddings()

        query = f"{title}\n{abstract}"

        tfidf_results = self._find_similar_tfidf(query, top_k)
        sent_results = self._find_similar_sentence_transformer(query, top_k)

        return {
            'tfidf': tfidf_results,
            'sentence_transformer': sent_results
        }
253
+
254
if __name__ == "__main__":
    dataset = OGBNLinkPredDataset()

    # Demo configuration: a sentence-transformer backend with a disk cache,
    # so the second finder below can reuse the embeddings computed here.
    model_name = "all-mpnet-base-v2"
    method = "sentence_transformer"
    embeddings_dir = "../embeddings_cache"

    similarity_finder = PaperSimilarityFinder(
        dataset,
        method=method,
        model_name=model_name,
        embeddings_cache_path=embeddings_dir,
    )

    my_title = "Polynomial Implicit Neural Representations For Large Diverse Datasets"
    my_abstract = """
    Implicit neural representations (INR) have gained significant popularity for signal and image representation for
    many end-tasks, such as superresolution, 3D modeling, and
    more. Most INR architectures rely on sinusoidal positional
    encoding, which accounts for high-frequency information in
    data. However, the finite encoding size restricts the model’s
    representational power. Higher representational power is
    needed to go from representing a single given image to representing large and diverse datasets. Our approach addresses
    this gap by representing an image with a polynomial function
    and eliminates the need for positional encodings. Therefore,
    to achieve a progressively higher degree of polynomial representation, we use element-wise multiplications between
    features and affine-transformed coordinate locations after
    every ReLU layer. The proposed method is evaluated qualitatively and quantitatively on large datasets like ImageNet.
    The proposed Poly-INR model performs comparably to stateof-the-art generative models without any convolution,
    normalization, or self-attention layers, and with far fewer trainable parameters. With much fewer training parameters and
    higher representative power, our approach paves the way
    for broader adoption of INR models for generative modeling tasks in complex domains. The code is available at
    https://github.com/Rajhans0/Poly_INR
    """

    top_k = 5
    print(f"\nTop {top_k} Citation Predictions:\n")

    def show_titles(papers):
        """Print the first line (the title) of each returned corpus entry."""
        for idx, score, text in papers:
            title = text.split("\n")[0].strip()
            print(f"Title: '{title}'")

    show_titles(
        similarity_finder.find_similar_papers(my_title, my_abstract, top_k=top_k)
    )

    # Building a second finder exercises the cache path: embeddings computed
    # above should now be loaded from disk instead of recomputed.
    similarity_finder_cached = PaperSimilarityFinder(
        dataset,
        method=method,
        model_name=model_name,
        embeddings_cache_path=embeddings_dir,
    )

    show_titles(
        similarity_finder_cached.find_similar_papers(my_title, my_abstract, top_k=top_k)
    )