Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -248,79 +248,89 @@ Summary:
|
|
| 248 |
# -----------------------------------------------------------
|
| 249 |
# FIND SIMILAR PAPERS (arXiv)
|
| 250 |
# -----------------------------------------------------------
|
| 251 |
-
def
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
try:
|
| 260 |
-
# Get content from PDF
|
| 261 |
-
top_chunks = vectorstore.similarity_search("", k=5)
|
| 262 |
-
pdf_text = " ".join(doc.page_content for doc in top_chunks)
|
| 263 |
-
|
| 264 |
|
| 265 |
-
if not pdf_text.strip():
|
| 266 |
-
return "PDF content too small."
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
# Extract keywords
|
| 270 |
-
keywords = " ".join(pdf_text.split()[:20])
|
| 271 |
-
encoded = urllib.parse.quote(keywords)
|
| 272 |
-
url = f"http://export.arxiv.org/api/query?search_query=all:{encoded}&start=0&max_results=5"
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
feed = feedparser.parse(url)
|
| 276 |
-
entries = feed.entries
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
if not entries:
|
| 280 |
-
return "No arXiv results found."
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
# Embeddings for ranking
|
| 284 |
-
embedding_model = HuggingFaceEmbeddings(
|
| 285 |
-
model_name="sentence-transformers/msmarco-MiniLM-L-12-v3"
|
| 286 |
-
)
|
| 287 |
-
pdf_emb = embedding_model.embed_query(pdf_text)
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
results = []
|
| 291 |
-
for entry in entries:
|
| 292 |
-
txt = f"{entry.title} {entry.summary}"
|
| 293 |
-
emb = embedding_model.embed_query(txt)
|
| 294 |
-
sim = dot(pdf_emb, emb) / (norm(pdf_emb) * norm(emb))
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
results.append({
|
| 298 |
-
"title": entry.title,
|
| 299 |
-
"summary": entry.summary.replace("\n", " ").strip(),
|
| 300 |
-
"link": entry.link,
|
| 301 |
-
"similarity": sim
|
| 302 |
-
})
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
# Sort by similarity DESC
|
| 306 |
-
results.sort(key=lambda x: x["similarity"], reverse=True)
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
formatted = []
|
| 310 |
-
for paper in results[:3]:
|
| 311 |
-
formatted.append(
|
| 312 |
-
f"**{paper['title']}**\n"
|
| 313 |
-
f"{paper['summary']}\n"
|
| 314 |
-
f"🔗 {paper['link']}\n"
|
| 315 |
-
f"Similarity Score: {paper['similarity']:.2f}"
|
| 316 |
-
)
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
return "\n\n".join(formatted)
|
| 320 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
|
| 322 |
-
except Exception as e:
|
| 323 |
-
return f"Error: {str(e)}"
|
| 324 |
|
| 325 |
|
| 326 |
|
|
|
|
| 248 |
# -----------------------------------------------------------
|
| 249 |
# FIND SIMILAR PAPERS (arXiv)
|
| 250 |
# -----------------------------------------------------------
|
| 251 |
+
def extract_title(text):
    """Return the first non-empty line of *text*, treated as the paper title.

    Falls back to the generic label "Research Paper" when *text* contains
    no non-blank lines.
    """
    non_blank = (raw.strip() for raw in text.split("\n"))
    return next((line for line in non_blank if line), "Research Paper")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
|
| 260 |
+
def find_similar_papers():
    """Find arXiv papers similar to the currently uploaded PDF.

    Pipeline: reconstruct the PDF text from the vectorstore chunks, take its
    first non-empty line as the title, query the arXiv Atom API with that
    title, rank the candidates by cosine similarity of title embeddings, and
    return the top 3 formatted as a Markdown string.

    Returns:
        str: Markdown-formatted results, or a human-readable status/error
        message. No exception escapes this function (UI boundary).
    """
    global vectorstore

    if vectorstore is None:
        return "Please upload a PDF first."

    try:
        # Reconstruct the PDF text from a generous number of chunks.
        # NOTE(review): an empty query string relies on the vectorstore
        # returning arbitrary chunks — confirm this holds for the store in use.
        docs = vectorstore.similarity_search("", k=30)
        full_pdf_text = " ".join(d.page_content for d in docs)

        if not full_pdf_text.strip():
            return "PDF content too small."

        # 1) Use only the title as the search query — the full text would
        #    dilute the arXiv search terms.
        title = extract_title(full_pdf_text)

        # 2) Query the arXiv Atom API for candidate papers.
        encoded_query = urllib.parse.quote(title)
        url = (
            "http://export.arxiv.org/api/query"
            f"?search_query=all:{encoded_query}&start=0&max_results=15"
        )
        entries = feedparser.parse(url).entries
        if not entries:
            return "No similar papers found on arXiv."

        # 3) Rank candidates by cosine similarity between the PDF title
        #    embedding and each candidate title embedding.
        embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-mpnet-base-v2"
        )
        query_emb = embedding_model.embed_query(title)
        query_norm = norm(query_emb)  # loop-invariant: compute once

        ranked = []
        for entry in entries:
            emb = embedding_model.embed_query(entry.title)
            denom = query_norm * norm(emb)
            # Guard against zero-norm embeddings (degenerate/empty titles)
            # instead of raising ZeroDivisionError.
            sim = dot(query_emb, emb) / denom if denom else 0.0
            ranked.append({
                "title": entry.title,
                "summary": entry.summary.replace("\n", " ").strip(),
                "link": entry.link,
                "similarity": sim,
            })

        ranked.sort(key=lambda x: x["similarity"], reverse=True)

        # 4) Format the top 3 results as Markdown.
        output = [
            f"**{p['title']}**\n"
            f"{p['summary']}\n"
            f"🔗 {p['link']}\n"
            f"Similarity Score: {p['similarity']:.2f}"
            for p in ranked[:3]
        ]
        return "\n\n".join(output)

    except Exception as e:
        # Broad by design: this sits at the UI boundary, so every failure
        # is surfaced to the user as text rather than a traceback.
        return f"Error: {str(e)}"
|
| 333 |
|
|
|
|
|
|
|
| 334 |
|
| 335 |
|
| 336 |
|