chippyjolly committed on
Commit
8c3dca7
·
verified ·
1 Parent(s): 66dcec5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -70
app.py CHANGED
@@ -248,79 +248,89 @@ Summary:
248
  # -----------------------------------------------------------
249
  # FIND SIMILAR PAPERS (arXiv)
250
  # -----------------------------------------------------------
251
- def find_similar_papers():
252
- global vectorstore
253
-
254
-
255
- if vectorstore is None:
256
- return "Please upload a PDF first."
257
-
258
-
259
- try:
260
- # Get content from PDF
261
- top_chunks = vectorstore.similarity_search("", k=5)
262
- pdf_text = " ".join(doc.page_content for doc in top_chunks)
263
-
264
 
265
- if not pdf_text.strip():
266
- return "PDF content too small."
267
-
268
-
269
- # Extract keywords
270
- keywords = " ".join(pdf_text.split()[:20])
271
- encoded = urllib.parse.quote(keywords)
272
- url = f"http://export.arxiv.org/api/query?search_query=all:{encoded}&start=0&max_results=5"
273
-
274
-
275
- feed = feedparser.parse(url)
276
- entries = feed.entries
277
-
278
-
279
- if not entries:
280
- return "No arXiv results found."
281
-
282
-
283
- # Embeddings for ranking
284
- embedding_model = HuggingFaceEmbeddings(
285
- model_name="sentence-transformers/msmarco-MiniLM-L-12-v3"
286
- )
287
- pdf_emb = embedding_model.embed_query(pdf_text)
288
-
289
-
290
- results = []
291
- for entry in entries:
292
- txt = f"{entry.title} {entry.summary}"
293
- emb = embedding_model.embed_query(txt)
294
- sim = dot(pdf_emb, emb) / (norm(pdf_emb) * norm(emb))
295
-
296
-
297
- results.append({
298
- "title": entry.title,
299
- "summary": entry.summary.replace("\n", " ").strip(),
300
- "link": entry.link,
301
- "similarity": sim
302
- })
303
-
304
-
305
- # Sort by similarity DESC
306
- results.sort(key=lambda x: x["similarity"], reverse=True)
307
-
308
-
309
- formatted = []
310
- for paper in results[:3]:
311
- formatted.append(
312
- f"**{paper['title']}**\n"
313
- f"{paper['summary']}\n"
314
- f"🔗 {paper['link']}\n"
315
- f"Similarity Score: {paper['similarity']:.2f}"
316
- )
317
-
318
-
319
- return "\n\n".join(formatted)
320
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
 
322
- except Exception as e:
323
- return f"Error: {str(e)}"
324
 
325
 
326
 
 
248
  # -----------------------------------------------------------
249
  # FIND SIMILAR PAPERS (arXiv)
250
  # -----------------------------------------------------------
251
def extract_title(text, default="Research Paper"):
    """Return the first non-blank line of ``text`` as a best-guess title.

    Args:
        text: Raw text extracted from a document; may be empty or ``None``.
        default: Value returned when ``text`` contains no non-blank line.

    Returns:
        The first line that is non-empty after stripping surrounding
        whitespace, or ``default`` when there is no such line.
    """
    if not text:
        return default

    # splitlines() also breaks on "\r", form feeds and Unicode line separators.
    # These show up in PDF-extracted text, and split("\n") would leave them
    # embedded in the middle of the returned title.
    for line in text.splitlines():
        line = line.strip()
        if line:
            return line
    return default
 
 
 
 
 
 
258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
 
260
# Embedding model used only to rank arXiv titles against the PDF title.
# Loading it is the slowest step of a search, so it is created lazily on the
# first call and reused afterwards instead of being rebuilt on every request.
_RANKING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
_ranking_embedder = None


def _get_ranking_embedder():
    """Return the shared title-ranking embedding model, creating it on first use."""
    global _ranking_embedder
    if _ranking_embedder is None:
        _ranking_embedder = HuggingFaceEmbeddings(model_name=_RANKING_MODEL_NAME)
    return _ranking_embedder


def _cosine_similarity(vec_a, vec_b):
    """Return the cosine similarity of two vectors, or 0.0 if either has zero norm."""
    denom = norm(vec_a) * norm(vec_b)
    if not denom:
        # Avoid a division by zero (NaN score) for degenerate embeddings.
        return 0.0
    return dot(vec_a, vec_b) / denom


def find_similar_papers():
    """Search arXiv for papers whose titles resemble the uploaded PDF's title.

    Reads the module-level ``vectorstore``, takes the first non-blank line of
    the retrieved text as the title, queries the arXiv API with it (up to 15
    results), ranks the results by cosine similarity between title embeddings
    and formats the three best matches.

    Returns:
        A Markdown-formatted string with up to three papers (title, summary,
        link, similarity score), or a short human-readable message when no PDF
        is loaded, the PDF text is empty, nothing is found, or an error occurs.
    """
    if vectorstore is None:
        return "Please upload a PDF first."

    try:
        # NOTE(review): an empty-query similarity search returns at most 30
        # chunks in the store's own ranking order, which is not necessarily
        # document order. The "first line" used as the title below may
        # therefore come from the middle of the paper -- confirm against the
        # vector store in use.
        docs = vectorstore.similarity_search("", k=30)
        full_pdf_text = " ".join(d.page_content for d in docs)

        if not full_pdf_text.strip():
            return "PDF content too small."

        # Only the title is used, both as the arXiv query and for ranking.
        query_text = extract_title(full_pdf_text)

        encoded_query = urllib.parse.quote(query_text)
        url = (
            "http://export.arxiv.org/api/query"
            f"?search_query=all:{encoded_query}&start=0&max_results=15"
        )

        feed = feedparser.parse(url)
        entries = feed.entries

        if not entries:
            # feedparser does not raise on network/parse failures; it sets
            # the "bozo" flag instead. Report that rather than claiming that
            # arXiv simply had no matches.
            if feed.get("bozo"):
                return f"Error: {feed.get('bozo_exception')}"
            return "No similar papers found on arXiv."

        # arXiv wraps long titles over several lines; collapse the whitespace
        # so the Markdown bold markers in the output stay on one line.
        titles = [" ".join(entry.title.split()) for entry in entries]

        embedder = _get_ranking_embedder()
        query_emb = embedder.embed_query(query_text)
        # One batched call instead of one model invocation per candidate.
        title_embs = embedder.embed_documents(titles)

        ranked = sorted(
            (
                {
                    "title": title,
                    "summary": entry.summary.replace("\n", " ").strip(),
                    "link": entry.link,
                    "similarity": _cosine_similarity(query_emb, emb),
                }
                for entry, title, emb in zip(entries, titles, title_embs)
            ),
            key=lambda paper: paper["similarity"],
            reverse=True,
        )

        return "\n\n".join(
            f"**{p['title']}**\n"
            f"{p['summary']}\n"
            f"🔗 {p['link']}\n"
            f"Similarity Score: {p['similarity']:.2f}"
            for p in ranked[:3]
        )

    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the app.
        return f"Error: {e}"
333
 
 
 
334
 
335
 
336