chippyjolly committed · verified
Commit 66dcec5 · 1 Parent(s): a661314

Update app.py

Files changed (1):
  app.py  +213 -187
app.py CHANGED
@@ -1,3 +1,4 @@
 import os
 import gradio as gr
 from PyPDF2 import PdfReader
@@ -5,51 +6,62 @@ from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings

 from langchain.chains.retrieval_qa.base import RetrievalQA
 from langchain.prompts import PromptTemplate
 from langchain_core.language_models.llms import LLM
 from langchain_core.callbacks import CallbackManagerForLLMRun

 from typing import Optional, List, Dict, Any
 from dotenv import load_dotenv
 from groq import Groq

 import urllib.parse
 import feedparser

 from numpy import dot
 from numpy.linalg import norm

 # Load environment variables
 load_dotenv()
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")


 # -----------------------------------------------------------
 # GROQ WRAPPER
 # -----------------------------------------------------------
 class GroqWrapper(LLM):
-    client: Any
-    model_name: str = "llama-3.3-70b-versatile"
-    temperature: float = 0.7
-
-    @property
-    def _llm_type(self) -> str:
-        return "groq"
-
-    def _call(
-        self,
-        prompt: str,
-        stop: Optional[List[str]] = None,
-        run_manager: Optional[CallbackManagerForLLMRun] = None,
-        **kwargs: Any,
-    ) -> str:
-        response = self.client.chat.completions.create(
-            model=self.model_name,
-            messages=[{"role": "user", "content": prompt}],
-            temperature=self.temperature,
-        )
-        return response.choices[0].message.content


 # Globals
@@ -58,40 +70,47 @@ qa_chain = None
 groq_llm = None


 # -----------------------------------------------------------
 # PROCESS PDF
 # -----------------------------------------------------------
 def upload_pdf(file):
-    global vectorstore, qa_chain, groq_llm
-
-    try:
-        # Initialize Groq LLM
-        if groq_llm is None:
-            groq_llm = GroqWrapper(client=Groq(api_key=GROQ_API_KEY))
-
-        # Extract text from PDF
-        text = "".join(page.extract_text() or "" for page in PdfReader(file).pages)
-        if not text.strip():
-            return "Error: No readable text found in PDF"
-
-        # Chunk the text
-        splitter = RecursiveCharacterTextSplitter(
-            chunk_size=1000,
-            chunk_overlap=150,
-            separators=["\n\n", "\n", ".", "?", "!"]
-        )
-        chunks = splitter.split_text(text)
-
-        # Create Vectorstore
-        embeddings = HuggingFaceEmbeddings(
-            model_name="sentence-transformers/msmarco-MiniLM-L-12-v3"
-        )
-        vectorstore = FAISS.from_texts(chunks, embeddings)
-
-        # --- CUSTOM REFINE PROMPTS ---
-        initial_prompt = PromptTemplate(
-            input_variables=["context", "question"],
-            template="""
 You are an expert researcher.

 Use ONLY the given context to answer the question.
@@ -104,12 +123,13 @@ Question: {question}

 Initial Answer:
 """
-        )

-        refine_prompt = PromptTemplate(
-            input_variables=["context", "question", "existing_answer"],
-            template="""
-We have an existing answer:
 {existing_answer}

 Using the additional context below, refine the answer.
@@ -121,71 +141,87 @@ Question: {question}

 Refined Answer:
 """
-        )
-
-        # --- BUILD QA CHAIN ---
-        qa_chain = RetrievalQA.from_chain_type(
-            llm=groq_llm,
-            retriever=vectorstore.as_retriever(),
-            chain_type="refine",
-            return_source_documents=True,
-            chain_type_kwargs={
-                "question_prompt": initial_prompt,
-                "refine_prompt": refine_prompt,
-                "document_variable_name": "context"  # <-- ADD THIS LINE
-            }
         )



-        return "PDF processed successfully!"

-    except Exception as e:
-        return f"Error: {str(e)}"


 # -----------------------------------------------------------
 # QUESTION ANSWERING
 # -----------------------------------------------------------
 def ask_question(query):
-    global qa_chain

-    if qa_chain is None:
-        return "Please upload a PDF first.", ""

-    try:
-        result = qa_chain({"query": query})
-        answer = result["result"]

-        # Format sources
-        sources = result.get("source_documents", [])
-        if sources:
-            source_text = "\n\n---\n".join(
-                f"Source {i+1}:\n{doc.page_content[:500]}..."
-                for i, doc in enumerate(sources)
-            )
-        else:
-            source_text = "No sources found."

-        return answer, source_text

-    except Exception as e:
-        return f"Error: {str(e)}", ""


 # -----------------------------------------------------------
 # SUMMARIZE PDF
 # -----------------------------------------------------------
 def summarize_pdf(num_points=6):
-    global groq_llm, vectorstore
-    if vectorstore is None:
-        return "Please upload a PDF first."

-    try:
-        docs = vectorstore.similarity_search("summary", k=5)
-        context = "\n\n".join([d.page_content for d in docs])

-        prompt = f"""
 Summarize the research paper in {num_points} bullet points.
 Make it clear, meaningful, and highlight key contributions.
@@ -195,107 +231,97 @@ Content:
 Summary:
 """

-        if groq_llm is None:
-            groq_llm = GroqWrapper(client=Groq(api_key=GROQ_API_KEY))

-        return groq_llm(prompt).strip()

-    except Exception as e:
-        return f"Error: {str(e)}"


 # -----------------------------------------------------------
 # FIND SIMILAR PAPERS (arXiv)
 # -----------------------------------------------------------
-def extract_title_and_abstract(text):
-    lines = text.split("\n")
-    title = lines[0].strip()

-    abstract = ""
-    for i, line in enumerate(lines):
-        if "abstract" in line.lower():
-            # Take next 8–12 lines as abstract
-            abstract = " ".join(lines[i+1:i+10])
-            break

-    return title, abstract


-def find_similar_papers():
-    global vectorstore
-
-    if vectorstore is None:
-        return "Please upload a PDF first."
-
-    try:
-        # Get full PDF text from all chunks
-        docs = vectorstore.similarity_search("", k=30)
-        full_pdf_text = " ".join(d.page_content for d in docs)
-
-        if not full_pdf_text.strip():
-            return "PDF content too small."
-
-        # ----------------------------
-        # 1️⃣ Extract title + abstract
-        # ----------------------------
-        title, abstract = extract_title_and_abstract(full_pdf_text)
-        query_text = f"{title}. {abstract}"
-
-        # ----------------------------
-        # 2️⃣ Search arXiv with a real query
-        # ----------------------------
-        encoded_query = urllib.parse.quote(query_text)
-        url = f"http://export.arxiv.org/api/query?search_query=all:{encoded_query}&start=0&max_results=15"
-
-        feed = feedparser.parse(url)
-        entries = feed.entries
-
-        if not entries:
-            return "No similar papers found on arXiv."
-
-        # ----------------------------
-        # 3️⃣ Better embeddings for similarity
-        # ----------------------------
-        embedding_model = HuggingFaceEmbeddings(
-            model_name="sentence-transformers/all-mpnet-base-v2"
-        )
-
-        query_emb = embedding_model.embed_query(query_text)
-
-        ranked = []
-        for entry in entries:
-            candidate_text = f"{entry.title} {entry.summary}"
-            emb = embedding_model.embed_query(candidate_text)
-
-            sim = dot(query_emb, emb) / (norm(query_emb) * norm(emb))
-
-            ranked.append({
-                "title": entry.title,
-                "summary": entry.summary.replace("\n", " ").strip(),
-                "link": entry.link,
-                "similarity": sim
-            })
-
-        # Sort by similarity
-        ranked.sort(key=lambda x: x["similarity"], reverse=True)
-
-        # ----------------------------
-        # 4️⃣ Format top 3 results
-        # ----------------------------
-        output = []
-        for p in ranked[:3]:
-            out = (
-                f"**{p['title']}**\n"
-                f"{p['summary']}\n"
-                f"🔗 {p['link']}\n"
-                f"Similarity Score: {p['similarity']:.2f}"
-            )
-            output.append(out)
-
-        return "\n\n".join(output)
-
-    except Exception as e:
-        return f"Error: {str(e)}"
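Both the removed find_similar_papers above and the rewritten version in the new revision of app.py (listed below, with added lines marked "+") rank arXiv candidates the same way: embed the paper text and each arXiv title plus abstract with a sentence-transformers model, then score every candidate with plain cosine similarity, dot(a, b) / (norm(a) * norm(b)). A minimal, self-contained sketch of just that scoring step, using toy vectors rather than real embeddings:

# Illustration only -- toy 3-d vectors stand in for sentence-transformer embeddings.
from numpy import dot
from numpy.linalg import norm

def cosine_similarity(a, b):
    # 1.0 means identical direction; values near 0 mean unrelated.
    return dot(a, b) / (norm(a) * norm(b))

query_emb = [0.2, 0.8, 0.1]       # pretend embedding of the uploaded paper
candidate_emb = [0.25, 0.7, 0.0]  # pretend embedding of an arXiv title + abstract
print(f"Similarity Score: {cosine_similarity(query_emb, candidate_emb):.2f}")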
 
+
 import os
 import gradio as gr
 from PyPDF2 import PdfReader

 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings

+
 from langchain.chains.retrieval_qa.base import RetrievalQA
 from langchain.prompts import PromptTemplate
 from langchain_core.language_models.llms import LLM
 from langchain_core.callbacks import CallbackManagerForLLMRun

+
 from typing import Optional, List, Dict, Any
 from dotenv import load_dotenv
 from groq import Groq

+
 import urllib.parse
 import feedparser

+
 from numpy import dot
 from numpy.linalg import norm

+
 # Load environment variables
 load_dotenv()
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")


+
+
 # -----------------------------------------------------------
 # GROQ WRAPPER
 # -----------------------------------------------------------
 class GroqWrapper(LLM):
+    client: Any
+    model_name: str = "llama-3.3-70b-versatile"
+    temperature: float = 0.7
+
+
+    @property
+    def _llm_type(self) -> str:
+        return "groq"
+
+
+    def _call(
+        self,
+        prompt: str,
+        stop: Optional[List[str]] = None,
+        run_manager: Optional[CallbackManagerForLLMRun] = None,
+        **kwargs: Any,
+    ) -> str:
+        response = self.client.chat.completions.create(
+            model=self.model_name,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=self.temperature,
+        )
+        return response.choices[0].message.content
+
+


 # Globals
 groq_llm = None


+
+
 # -----------------------------------------------------------
 # PROCESS PDF
 # -----------------------------------------------------------
 def upload_pdf(file):
+    global vectorstore, qa_chain, groq_llm
+
+
+    try:
+        # Initialize Groq LLM
+        if groq_llm is None:
+            groq_llm = GroqWrapper(client=Groq(api_key=GROQ_API_KEY))
+
+
+        # Extract text from PDF
+        text = "".join(page.extract_text() or "" for page in PdfReader(file).pages)
+        if not text.strip():
+            return "Error: No readable text found in PDF"
+
+
+        # Chunk the text
+        splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000,
+            chunk_overlap=150,
+            separators=["\n\n", "\n", ".", "?", "!"]
+        )
+        chunks = splitter.split_text(text)
+
+
+        # Create Vectorstore
+        embeddings = HuggingFaceEmbeddings(
+            model_name="sentence-transformers/msmarco-MiniLM-L-12-v3"
+        )
+        vectorstore = FAISS.from_texts(chunks, embeddings)
+
+
+        # --- CUSTOM REFINE PROMPTS ---
+        initial_prompt = PromptTemplate(
+            input_variables=["context", "question"],
+            template="""
 You are an expert researcher.

 Use ONLY the given context to answer the question.

 Initial Answer:
 """
+        )
+

+        refine_prompt = PromptTemplate(
+            input_variables=["context", "question", "existing_answer"],
+            template="""
+We have an existing answer:
 {existing_answer}

 Using the additional context below, refine the answer.

 Refined Answer:
 """
+        )
+
+
+        # --- BUILD QA CHAIN ---
+        qa_chain = RetrievalQA.from_chain_type(
+            llm=groq_llm,
+            retriever=vectorstore.as_retriever(),
+            chain_type="refine",
+            return_source_documents=True,
+            chain_type_kwargs={
+                "question_prompt": initial_prompt,
+                "refine_prompt": refine_prompt,
+                "document_variable_name": "context"  # <-- ADD THIS LINE
+            }
         )




+
+
+        return "PDF processed successfully!"
+
+
+    except Exception as e:
+        return f"Error: {str(e)}"
+
+


 # -----------------------------------------------------------
 # QUESTION ANSWERING
 # -----------------------------------------------------------
 def ask_question(query):
+    global qa_chain
+
+
+    if qa_chain is None:
+        return "Please upload a PDF first.", ""
+

+    try:
+        result = qa_chain({"query": query})
+        answer = result["result"]


+        # Format sources
+        sources = result.get("source_documents", [])
+        if sources:
+            source_text = "\n\n---\n".join(
+                f"Source {i+1}:\n{doc.page_content[:500]}..."
+                for i, doc in enumerate(sources)
+            )
+        else:
+            source_text = "No sources found."
+
+
+        return answer, source_text
+
+
+    except Exception as e:
+        return f"Error: {str(e)}", ""




 # -----------------------------------------------------------
 # SUMMARIZE PDF
 # -----------------------------------------------------------
 def summarize_pdf(num_points=6):
+    global groq_llm, vectorstore
+    if vectorstore is None:
+        return "Please upload a PDF first."
+

+    try:
+        docs = vectorstore.similarity_search("summary", k=5)
+        context = "\n\n".join([d.page_content for d in docs])

+
+        prompt = f"""
 Summarize the research paper in {num_points} bullet points.
 Make it clear, meaningful, and highlight key contributions.

 Summary:
 """


+        if groq_llm is None:
+            groq_llm = GroqWrapper(client=Groq(api_key=GROQ_API_KEY))
+
+
+        return groq_llm(prompt).strip()
+
+
+    except Exception as e:
+        return f"Error: {str(e)}"
+



 # -----------------------------------------------------------
 # FIND SIMILAR PAPERS (arXiv)
 # -----------------------------------------------------------
+def find_similar_papers():
+    global vectorstore


+    if vectorstore is None:
+        return "Please upload a PDF first."


+    try:
+        # Get content from PDF
+        top_chunks = vectorstore.similarity_search("", k=5)
+        pdf_text = " ".join(doc.page_content for doc in top_chunks)
+
+
+        if not pdf_text.strip():
+            return "PDF content too small."
+
+
+        # Extract keywords
+        keywords = " ".join(pdf_text.split()[:20])
+        encoded = urllib.parse.quote(keywords)
+        url = f"http://export.arxiv.org/api/query?search_query=all:{encoded}&start=0&max_results=5"
+
+
+        feed = feedparser.parse(url)
+        entries = feed.entries
+
+
+        if not entries:
+            return "No arXiv results found."
+
+
+        # Embeddings for ranking
+        embedding_model = HuggingFaceEmbeddings(
+            model_name="sentence-transformers/msmarco-MiniLM-L-12-v3"
+        )
+        pdf_emb = embedding_model.embed_query(pdf_text)
+
+
+        results = []
+        for entry in entries:
+            txt = f"{entry.title} {entry.summary}"
+            emb = embedding_model.embed_query(txt)
+            sim = dot(pdf_emb, emb) / (norm(pdf_emb) * norm(emb))
+
+
+            results.append({
+                "title": entry.title,
+                "summary": entry.summary.replace("\n", " ").strip(),
+                "link": entry.link,
+                "similarity": sim
+            })
+
+
+        # Sort by similarity DESC
+        results.sort(key=lambda x: x["similarity"], reverse=True)
+
+
+        formatted = []
+        for paper in results[:3]:
+            formatted.append(
+                f"**{paper['title']}**\n"
+                f"{paper['summary']}\n"
+                f"🔗 {paper['link']}\n"
+                f"Similarity Score: {paper['similarity']:.2f}"
+            )
+
+
+        return "\n\n".join(formatted)
+
+
+    except Exception as e:
+        return f"Error: {str(e)}"
+



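The visible hunks end before any interface code, but app.py imports gradio and exposes upload_pdf, ask_question, summarize_pdf, and find_similar_papers, so the file presumably finishes by wiring those callbacks into a Gradio UI. A minimal sketch of what that wiring could look like, assuming it sits after the functions above; the component labels, layout, and launch call are illustrative guesses, not part of this commit:

# Hypothetical UI wiring -- not shown in this commit's visible diff.
import gradio as gr  # already imported at the top of app.py

with gr.Blocks(title="Research Paper Assistant") as demo:
    pdf_input = gr.File(label="Upload a research paper (PDF)")
    status = gr.Textbox(label="Status")
    pdf_input.upload(upload_pdf, inputs=pdf_input, outputs=status)

    question = gr.Textbox(label="Ask a question about the paper")
    answer = gr.Textbox(label="Answer")
    sources = gr.Textbox(label="Sources")
    question.submit(ask_question, inputs=question, outputs=[answer, sources])

    summarize_btn = gr.Button("Summarize")
    summary = gr.Textbox(label="Summary")
    summarize_btn.click(summarize_pdf, inputs=None, outputs=summary)

    similar_btn = gr.Button("Find similar papers")
    similar = gr.Markdown()
    similar_btn.click(find_similar_papers, inputs=None, outputs=similar)

demo.launch()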