Spaces:

chippyjolly
/

Research_Assistant

Sleeping

App Files Files Community

chippyjolly committed on Dec 4, 2025

Commit

8c3dca7

verified ·

1 Parent(s): 66dcec5

Update app.py

Browse files

Files changed (1) hide show

app.py +80 -70

app.py CHANGED Viewed

@@ -248,79 +248,89 @@ Summary:
 # -----------------------------------------------------------
 #               FIND SIMILAR PAPERS (arXiv)
 # -----------------------------------------------------------
-def find_similar_papers():
-   """Return up to three arXiv results ranked by similarity to the uploaded PDF.
-
-   Reads the module-level ``vectorstore``. Returns a Markdown-formatted
-   string, or a plain message when no PDF is loaded, no text was
-   retrieved, arXiv returned no entries, or an exception was raised.
-   """
-   global vectorstore
-   if vectorstore is None:
-       return "Please upload a PDF first."
-   try:
-       # Retrieve up to 5 chunks from the vector store (empty query string)
-       # and join their text with spaces.
-       top_chunks = vectorstore.similarity_search("", k=5)
-       pdf_text = " ".join(doc.page_content for doc in top_chunks)
-       if not pdf_text.strip():
-           return "PDF content too small."
-       # Search query: the first 20 whitespace-separated words of that text,
-       # percent-encoded for use in the URL.
-       keywords = " ".join(pdf_text.split()[:20])
-       encoded = urllib.parse.quote(keywords)
-       url = f"http://export.arxiv.org/api/query?search_query=all:{encoded}&start=0&max_results=5"
-       feed = feedparser.parse(url)
-       entries = feed.entries
-       if not entries:
-           return "No arXiv results found."
-       # Embed the retrieved PDF text once; each result is embedded below.
-       embedding_model = HuggingFaceEmbeddings(
-           model_name="sentence-transformers/msmarco-MiniLM-L-12-v3"
-       )
-       pdf_emb = embedding_model.embed_query(pdf_text)
-       results = []
-       for entry in entries:
-           # Each candidate is represented by its title plus summary.
-           txt = f"{entry.title} {entry.summary}"
-           emb = embedding_model.embed_query(txt)
-           # Cosine similarity between the PDF text and the candidate.
-           sim = dot(pdf_emb, emb) / (norm(pdf_emb) * norm(emb))
-           results.append({
-               "title": entry.title,
-               "summary": entry.summary.replace("\n", " ").strip(),
-               "link": entry.link,
-               "similarity": sim
-           })
-       # Sort by similarity DESC
-       results.sort(key=lambda x: x["similarity"], reverse=True)
-       # Render the three highest-scoring results as Markdown blocks.
-       formatted = []
-       for paper in results[:3]:
-           formatted.append(
-               f"**{paper['title']}**\n"
-               f"{paper['summary']}\n"
-               f"🔗 {paper['link']}\n"
-               f"Similarity Score: {paper['similarity']:.2f}"
-           )
-       return "\n\n".join(formatted)
-   except Exception as e:
-       # Any failure is reported to the caller as text instead of raising.
-       return f"Error: {str(e)}"

 # -----------------------------------------------------------
 #               FIND SIMILAR PAPERS (arXiv)
 # -----------------------------------------------------------
+def extract_title(text):
+    """Return the first non-blank line of *text*, stripped, as the title.
+
+    Args:
+        text: Raw document text; ``None`` or an empty string is accepted.
+
+    Returns:
+        The first line that contains non-whitespace characters, with
+        surrounding whitespace removed, or ``"Research Paper"`` when there
+        is no such line.
+    """
+    # ``splitlines`` breaks on "\n" as well as "\r", form feeds and the
+    # other line boundaries Python recognises, so text that uses those as
+    # line breaks does not end up as one giant "title".
+    for line in (text or "").splitlines():
+        stripped = line.strip()
+        if stripped:
+            return stripped
+    return "Research Paper"  # fallback if empty
+def find_similar_papers():
+    """Find arXiv papers related to the uploaded PDF and return the top three.
+
+    Reads the module-level ``vectorstore``. The search query is the first
+    non-empty line of the retrieved text (see ``extract_title``), with
+    whitespace collapsed and capped in length. The entries returned by the
+    arXiv API are re-ranked by cosine similarity between the embedding of
+    that query and the embedding of each entry's title.
+
+    Returns:
+        A Markdown-formatted string with up to three papers (title, summary,
+        link, similarity score), or a plain message when no PDF is loaded,
+        no text could be retrieved, arXiv returned nothing, or an error
+        occurred.
+    """
+    global vectorstore
+    if vectorstore is None:
+        return "Please upload a PDF first."
+    # The chunks are joined with spaces, so the "first line" can be very
+    # long when the text contains no line breaks; keep the URL bounded.
+    max_query_chars = 300
+    try:
+        # Retrieve up to 30 chunks using an empty query string.
+        # NOTE(review): the order of the returned chunks is decided by the
+        # vector store, so the first line is presumably, but not certainly,
+        # the paper title - confirm against the store in use.
+        docs = vectorstore.similarity_search("", k=30)
+        full_pdf_text = " ".join(d.page_content for d in docs)
+        if not full_pdf_text.strip():
+            return "PDF content too small."
+        # ----------------------------
+        # 1. Build the query from the title only
+        # ----------------------------
+        title = extract_title(full_pdf_text)
+        query_text = " ".join(title.split())  # collapse runs of whitespace
+        if len(query_text) > max_query_chars:
+            # Cut at the last word boundary inside the limit.
+            query_text = query_text[:max_query_chars].rsplit(" ", 1)[0]
+        # ----------------------------
+        # 2. Search arXiv
+        # ----------------------------
+        encoded_query = urllib.parse.quote(query_text)
+        url = f"http://export.arxiv.org/api/query?search_query=all:{encoded_query}&start=0&max_results=15"
+        feed = feedparser.parse(url)
+        entries = feed.entries
+        if not entries:
+            # feedparser reports fetch/parse failures through ``bozo``
+            # instead of raising, so tell a failure apart from "no hits".
+            if feed.get("bozo"):
+                return f"Error: {feed.get('bozo_exception')}"
+            return "No similar papers found on arXiv."
+        # ----------------------------
+        # 3. Rank the candidates by title similarity
+        # ----------------------------
+        embedding_model = HuggingFaceEmbeddings(
+            model_name="sentence-transformers/all-mpnet-base-v2"
+        )
+        query_emb = embedding_model.embed_query(query_text)
+        query_norm = norm(query_emb)  # loop-invariant, compute once
+        ranked = []
+        for entry in entries:
+            emb = embedding_model.embed_query(entry.title)
+            denom = query_norm * norm(emb)
+            # A zero-norm vector would give NaN and break the sort below.
+            sim = float(dot(query_emb, emb) / denom) if denom else 0.0
+            ranked.append({
+                "title": entry.title,
+                # ``.get`` keeps one incomplete entry from failing the run.
+                "summary": entry.get("summary", "").replace("\n", " ").strip(),
+                "link": entry.get("link", ""),
+                "similarity": sim
+            })
+        # Highest similarity first
+        ranked.sort(key=lambda x: x["similarity"], reverse=True)
+        # ----------------------------
+        # 4. Format top 3 results
+        # ----------------------------
+        output = [
+            f"**{p['title']}**\n"
+            f"{p['summary']}\n"
+            f"🔗 {p['link']}\n"
+            f"Similarity Score: {p['similarity']:.2f}"
+            for p in ranked[:3]
+        ]
+        return "\n\n".join(output)
+    except Exception as e:
+        # UI-facing boundary: report the failure as text instead of raising.
+        return f"Error: {str(e)}"