chippyjolly committed on
Commit
a961c7a
·
verified ·
1 Parent(s): 91df8d8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +125 -98
app.py CHANGED
@@ -4,38 +4,39 @@ from PyPDF2 import PdfReader
4
  from langchain_text_splitters import RecursiveCharacterTextSplitter
5
  from langchain_community.vectorstores import FAISS
6
  from langchain_community.embeddings import HuggingFaceEmbeddings
 
7
  from langchain.chains.retrieval_qa.base import RetrievalQA
8
  from langchain.prompts import PromptTemplate
9
  from langchain_core.language_models.llms import LLM
10
  from langchain_core.callbacks import CallbackManagerForLLMRun
 
11
  from typing import Optional, List, Dict, Any
12
- import requests
13
  from dotenv import load_dotenv
14
  from groq import Groq
 
15
  import urllib.parse
16
- import feedparser # Added for the new function
17
 
18
  from numpy import dot
19
- from numpy.linalg import norm #newly added - to let the similar paper work
20
-
21
 
22
  # Load environment variables
23
  load_dotenv()
24
-
25
-
26
  GROQ_API_KEY = os.getenv("GROQ_API_KEY")
27
 
28
 
29
- # Custom wrapper for Groq to make it LangChain compatible
 
 
30
  class GroqWrapper(LLM):
31
  client: Any
32
  model_name: str = "llama-3.3-70b-versatile"
33
  temperature: float = 0.7
34
-
35
  @property
36
  def _llm_type(self) -> str:
37
  return "groq"
38
-
39
  def _call(
40
  self,
41
  prompt: str,
@@ -44,18 +45,22 @@ class GroqWrapper(LLM):
44
  **kwargs: Any,
45
  ) -> str:
46
  response = self.client.chat.completions.create(
47
- messages=[{"role": "user", "content": prompt}],
48
  model=self.model_name,
 
49
  temperature=self.temperature,
50
- **kwargs
51
  )
52
  return response.choices[0].message.content
53
 
54
- # Initialize global variables
 
55
  vectorstore = None
56
  qa_chain = None
57
  groq_llm = None
58
 
 
 
 
 
59
  def upload_pdf(file):
60
  global vectorstore, qa_chain, groq_llm
61
 
@@ -64,176 +69,202 @@ def upload_pdf(file):
64
  if groq_llm is None:
65
  groq_llm = GroqWrapper(client=Groq(api_key=GROQ_API_KEY))
66
 
67
- # Extract text
68
  text = "".join(page.extract_text() or "" for page in PdfReader(file).pages)
69
  if not text.strip():
70
  return "Error: No readable text found in PDF"
71
 
72
- # Chunk text
73
- text_splitter = RecursiveCharacterTextSplitter(
74
  chunk_size=1000,
75
  chunk_overlap=150,
76
  separators=["\n\n", "\n", ".", "?", "!"]
77
  )
78
- texts = text_splitter.split_text(text)
79
 
80
- # Embeddings
81
  embeddings = HuggingFaceEmbeddings(
82
  model_name="sentence-transformers/msmarco-MiniLM-L-12-v3"
83
  )
 
84
 
85
- vectorstore = FAISS.from_texts(texts, embeddings)
 
 
 
 
 
 
 
86
 
87
- # Custom prompt
88
- prompt_template = """
89
- Use only the following context to answer the question.
90
- Do NOT make up information. If the answer is not present, say "I don't know."
91
  Context:
92
  {context}
 
93
  Question: {question}
94
- Answer:
 
95
  """
96
- custom_prompt = PromptTemplate(
97
- template=prompt_template,
98
- input_variables=["context", "question"]
99
  )
100
 
101
- # QA chain with custom prompt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  qa_chain = RetrievalQA.from_chain_type(
103
  llm=groq_llm,
104
- chain_type="refine",
105
  retriever=vectorstore.as_retriever(),
 
106
  return_source_documents=True,
107
- chain_type_kwargs={"prompt": custom_prompt} # pass prompt here
 
 
 
108
  )
109
 
110
  return "PDF processed successfully!"
 
111
  except Exception as e:
112
  return f"Error: {str(e)}"
113
 
114
 
115
- # --- Ask questions ---
 
 
116
  def ask_question(query):
117
  global qa_chain
 
118
  if qa_chain is None:
119
  return "Please upload a PDF first.", ""
120
 
121
  try:
122
- # Simply call the chain, no need to override prompt
123
- result = qa_chain({"query": query}, return_only_outputs=False)
124
  answer = result["result"]
125
- sources = result.get("source_documents", [])
126
 
 
 
127
  if sources:
128
- source_text = "\n\n---\n".join([
129
- f"Source {i+1} (excerpt):\n{doc.page_content[:500]}{'...' if len(doc.page_content) > 500 else ''}"
130
  for i, doc in enumerate(sources)
131
- ])
132
  else:
133
- source_text = "No sources cited"
134
 
135
  return answer, source_text
136
 
137
  except Exception as e:
138
- return f"Error processing your question: {str(e)}", ""
139
 
140
-
141
 
142
- # --- Summarize PDF ---
143
- def summarize_pdf(num_points: int = 6):
144
- global vectorstore, groq_llm
 
 
145
  if vectorstore is None:
146
  return "Please upload a PDF first."
147
 
148
  try:
149
  docs = vectorstore.similarity_search("summary", k=5)
150
- context = "\n\n".join([doc.page_content for doc in docs])
151
 
152
  prompt = f"""
153
- Imagine you are a passionate science communicator.
154
- Summarize the following research paper in {num_points} bullet points.
155
- Highlight core discoveries and significance. Keep it engaging, insightful, and clear.
156
 
157
- Paper Content:
158
  {context}
159
 
160
  Summary:
161
  """
 
162
  if groq_llm is None:
163
  groq_llm = GroqWrapper(client=Groq(api_key=GROQ_API_KEY))
164
 
165
- summary = groq_llm(prompt)
166
- return summary.strip()
167
 
168
  except Exception as e:
169
- return f"Error during summarization: {str(e)}"
170
-
171
- # summary = groq_llm(prompt)
172
- # return summary.strip()
173
-
174
- # except Exception as e:
175
- # return f"Error during summarization: {str(e)}"
176
-
177
- # *** Modified find_similar_papers function ONLY ***
178
 
179
 
180
- # --- Find similar papers (with embedding rerank) ---
 
 
181
  def find_similar_papers():
 
 
182
  if vectorstore is None:
183
  return "Please upload a PDF first."
184
 
185
  try:
186
- # Get top chunks from uploaded PDF
187
  top_chunks = vectorstore.similarity_search("", k=5)
188
- paper_text = " ".join([doc.page_content for doc in top_chunks])
189
- if not paper_text.strip():
190
- return "PDF content too small for similarity search."
191
-
192
- # Extract keywords for arXiv query
193
- keywords = " ".join(paper_text.split()[:20])
194
- encoded_query = urllib.parse.quote(keywords)
195
- arxiv_url = f"http://export.arxiv.org/api/query?search_query=all:{encoded_query}&start=0&max_results=5"
196
- feed = feedparser.parse(arxiv_url)
 
 
197
  entries = feed.entries
 
198
  if not entries:
199
- return "No similar papers found on arXiv."
200
 
201
- # Embeddings for reranking
202
- embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/msmarco-MiniLM-L-12-v3")
203
- paper_embedding = embeddings_model.embed_query(paper_text)
 
 
204
 
205
- ranked_results = []
206
  for entry in entries:
207
- arxiv_text = f"{entry.title} {entry.summary}"
208
- arxiv_embedding = embeddings_model.embed_query(arxiv_text)
209
- similarity = dot(paper_embedding, arxiv_embedding) / (norm(paper_embedding) * norm(arxiv_embedding))
210
- ranked_results.append({
 
211
  "title": entry.title,
212
  "summary": entry.summary.replace("\n", " ").strip(),
213
  "link": entry.link,
214
- "similarity": similarity
215
  })
216
 
217
- # Sort by similarity
218
- ranked_results.sort(key=lambda x: x["similarity"], reverse=True)
219
 
220
- # Format top 3
221
- results = []
222
- for paper in ranked_results[:3]:
223
- results.append(
224
- f"**{paper['title']}**\n{paper['summary']}\n🔗 {paper['link']}\nSimilarity: {paper['similarity']:.2f}"
 
 
225
  )
226
 
227
- return "\n\n".join(results)
228
 
229
  except Exception as e:
230
- return f"Error fetching similar papers: {str(e)}"
231
- # results.append(f"**{title}**\n{summary}\n🔗 {link}")
232
-
233
- # return "\n\n".join(results)
234
 
235
- # except Exception as e:
236
- # return f"Error fetching similar papers: {str(e)}"
237
 
238
 
239
 
@@ -534,8 +565,4 @@ with gr.Blocks(css=css) as demo:
534
  similar_btn.click(find_similar_papers, outputs=similar_output)
535
 
536
  if __name__ == "__main__":
537
- demo.launch()
538
-
539
-
540
-
541
-
 
4
  from langchain_text_splitters import RecursiveCharacterTextSplitter
5
  from langchain_community.vectorstores import FAISS
6
  from langchain_community.embeddings import HuggingFaceEmbeddings
7
+
8
  from langchain.chains.retrieval_qa.base import RetrievalQA
9
  from langchain.prompts import PromptTemplate
10
  from langchain_core.language_models.llms import LLM
11
  from langchain_core.callbacks import CallbackManagerForLLMRun
12
+
13
  from typing import Optional, List, Dict, Any
 
14
  from dotenv import load_dotenv
15
  from groq import Groq
16
+
17
  import urllib.parse
18
+ import feedparser
19
 
20
  from numpy import dot
21
+ from numpy.linalg import norm
 
22
 
23
  # Load environment variables
24
  load_dotenv()
 
 
25
  GROQ_API_KEY = os.getenv("GROQ_API_KEY")
26
 
27
 
28
+ # -----------------------------------------------------------
29
+ # GROQ WRAPPER
30
+ # -----------------------------------------------------------
31
  class GroqWrapper(LLM):
32
  client: Any
33
  model_name: str = "llama-3.3-70b-versatile"
34
  temperature: float = 0.7
35
+
36
  @property
37
  def _llm_type(self) -> str:
38
  return "groq"
39
+
40
  def _call(
41
  self,
42
  prompt: str,
 
45
  **kwargs: Any,
46
  ) -> str:
47
  response = self.client.chat.completions.create(
 
48
  model=self.model_name,
49
+ messages=[{"role": "user", "content": prompt}],
50
  temperature=self.temperature,
 
51
  )
52
  return response.choices[0].message.content
53
 
54
+
55
+ # Globals
56
  vectorstore = None
57
  qa_chain = None
58
  groq_llm = None
59
 
60
+
61
+ # -----------------------------------------------------------
62
+ # PROCESS PDF
63
+ # -----------------------------------------------------------
64
  def upload_pdf(file):
65
  global vectorstore, qa_chain, groq_llm
66
 
 
69
  if groq_llm is None:
70
  groq_llm = GroqWrapper(client=Groq(api_key=GROQ_API_KEY))
71
 
72
+ # Extract text from PDF
73
  text = "".join(page.extract_text() or "" for page in PdfReader(file).pages)
74
  if not text.strip():
75
  return "Error: No readable text found in PDF"
76
 
77
+ # Chunk the text
78
+ splitter = RecursiveCharacterTextSplitter(
79
  chunk_size=1000,
80
  chunk_overlap=150,
81
  separators=["\n\n", "\n", ".", "?", "!"]
82
  )
83
+ chunks = splitter.split_text(text)
84
 
85
+ # Create Vectorstore
86
  embeddings = HuggingFaceEmbeddings(
87
  model_name="sentence-transformers/msmarco-MiniLM-L-12-v3"
88
  )
89
+ vectorstore = FAISS.from_texts(chunks, embeddings)
90
 
91
+ # --- CUSTOM REFINE PROMPTS ---
92
+ initial_prompt = PromptTemplate(
93
+ input_variables=["context", "question"],
94
+ template="""
95
+ You are an expert researcher.
96
+
97
+ Use ONLY the given context to answer the question.
98
+ If the answer is not in the context, say "I don't know".
99
 
 
 
 
 
100
  Context:
101
  {context}
102
+
103
  Question: {question}
104
+
105
+ Initial Answer:
106
  """
 
 
 
107
  )
108
 
109
+ refine_prompt = PromptTemplate(
110
+ input_variables=["context", "question", "existing_answer"],
111
+ template="""
112
+ We have an existing answer:
113
+ {existing_answer}
114
+
115
+ Using the additional context below, refine the answer.
116
+
117
+ Additional Context:
118
+ {context}
119
+
120
+ Question: {question}
121
+
122
+ Refined Answer:
123
+ """
124
+ )
125
+
126
+ # --- BUILD QA CHAIN ---
127
  qa_chain = RetrievalQA.from_chain_type(
128
  llm=groq_llm,
 
129
  retriever=vectorstore.as_retriever(),
130
+ chain_type="refine",
131
  return_source_documents=True,
132
+ chain_type_kwargs={
133
+ "initial_response_prompt": initial_prompt,
134
+ "refine_prompt": refine_prompt
135
+ }
136
  )
137
 
138
  return "PDF processed successfully!"
139
+
140
  except Exception as e:
141
  return f"Error: {str(e)}"
142
 
143
 
144
# -----------------------------------------------------------
# QUESTION ANSWERING
# -----------------------------------------------------------
def ask_question(query):
    """Answer a question about the uploaded PDF via the RetrievalQA chain.

    Parameters:
        query: The user's natural-language question.

    Returns:
        A ``(answer, source_text)`` tuple of plain strings suitable for
        the two Gradio outputs. On any failure the first element is an
        error message and the second is empty.
    """
    global qa_chain

    # Guard clause: the chain is only built after a PDF is processed.
    if qa_chain is None:
        return "Please upload a PDF first.", ""

    try:
        result = qa_chain({"query": query})
        answer = result["result"]

        # Format the retrieved source chunks for display.
        sources = result.get("source_documents", [])
        if sources:
            source_text = "\n\n---\n".join(
                # FIX: append the ellipsis only when the excerpt was
                # actually truncated; the previous code added "..." to
                # every excerpt, even ones shorter than 500 characters.
                f"Source {i+1}:\n{doc.page_content[:500]}"
                + ("..." if len(doc.page_content) > 500 else "")
                for i, doc in enumerate(sources)
            )
        else:
            source_text = "No sources found."

        return answer, source_text

    except Exception as e:
        return f"Error: {str(e)}", ""
171
 
 
172
 
173
# -----------------------------------------------------------
# SUMMARIZE PDF
# -----------------------------------------------------------
def summarize_pdf(num_points=6):
    """Produce a bullet-point summary of the uploaded PDF.

    Parameters:
        num_points: How many bullet points to request from the LLM.

    Returns:
        The summary string, or a status/error message string.
    """
    global groq_llm, vectorstore

    # Guard clause: nothing to summarize until a PDF has been indexed.
    if vectorstore is None:
        return "Please upload a PDF first."

    try:
        # Pull a handful of representative chunks to ground the summary.
        retrieved = vectorstore.similarity_search("summary", k=5)
        context = "\n\n".join(chunk.page_content for chunk in retrieved)

        prompt = f"""
        Summarize the research paper in {num_points} bullet points.
        Make it clear, meaningful, and highlight key contributions.

        Content:
        {context}

        Summary:
        """

        # Lazily create the Groq LLM wrapper on first use.
        if groq_llm is None:
            groq_llm = GroqWrapper(client=Groq(api_key=GROQ_API_KEY))

        return groq_llm(prompt).strip()

    except Exception as e:
        return f"Error: {str(e)}"
 
 
 
 
 
 
 
 
202
 
203
 
204
# -----------------------------------------------------------
# FIND SIMILAR PAPERS (arXiv)
# -----------------------------------------------------------
def find_similar_papers():
    """Search arXiv for papers similar to the uploaded PDF.

    Builds a keyword query from the PDF's top chunks, fetches up to
    five arXiv entries, reranks them by embedding cosine similarity,
    and returns the top three formatted as a markdown string.

    Returns:
        A markdown string of the top matches, or a status/error message.
    """
    global vectorstore

    # Guard clause: requires an indexed PDF.
    if vectorstore is None:
        return "Please upload a PDF first."

    try:
        # Gather representative content from the indexed PDF.
        top_chunks = vectorstore.similarity_search("", k=5)
        pdf_text = " ".join(doc.page_content for doc in top_chunks)

        if not pdf_text.strip():
            return "PDF content too small."

        # The first 20 words serve as the arXiv search keywords.
        keywords = " ".join(pdf_text.split()[:20])
        encoded = urllib.parse.quote(keywords)
        url = f"http://export.arxiv.org/api/query?search_query=all:{encoded}&start=0&max_results=5"

        feed = feedparser.parse(url)
        entries = feed.entries

        if not entries:
            return "No arXiv results found."

        # Embed the PDF text once; candidates are embedded per entry.
        embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/msmarco-MiniLM-L-12-v3"
        )
        pdf_emb = embedding_model.embed_query(pdf_text)

        results = []
        for entry in entries:
            emb = embedding_model.embed_query(f"{entry.title} {entry.summary}")
            results.append({
                "title": entry.title,
                "summary": entry.summary.replace("\n", " ").strip(),
                "link": entry.link,
                # Cosine similarity between PDF and candidate embeddings.
                "similarity": dot(pdf_emb, emb) / (norm(pdf_emb) * norm(emb)),
            })

        # Best matches first.
        results.sort(key=lambda item: item["similarity"], reverse=True)

        formatted = [
            f"**{paper['title']}**\n"
            f"{paper['summary']}\n"
            f"🔗 {paper['link']}\n"
            f"Similarity Score: {paper['similarity']:.2f}"
            for paper in results[:3]
        ]

        return "\n\n".join(formatted)

    except Exception as e:
        return f"Error: {str(e)}"
 
 
 
267
 
 
 
268
 
269
 
270
 
 
565
  similar_btn.click(find_similar_papers, outputs=similar_output)
566
 
567
  if __name__ == "__main__":
568
+ demo.launch()