import os import gradio as gr from PyPDF2 import PdfReader from langchain_text_splitters import RecursiveCharacterTextSplitter from langchain_community.vectorstores import FAISS from langchain_community.embeddings import HuggingFaceEmbeddings from langchain.chains.retrieval_qa.base import RetrievalQA from langchain.prompts import PromptTemplate from langchain_core.language_models.llms import LLM from langchain_core.callbacks import CallbackManagerForLLMRun from typing import Optional, List, Dict, Any from dotenv import load_dotenv from groq import Groq import urllib.parse import feedparser from numpy import dot from numpy.linalg import norm # Load environment variables load_dotenv() GROQ_API_KEY = os.getenv("GROQ_API_KEY") # ----------------------------------------------------------- # GROQ WRAPPER # ----------------------------------------------------------- class GroqWrapper(LLM): client: Any model_name: str = "llama-3.3-70b-versatile" temperature: float = 0.7 @property def _llm_type(self) -> str: return "groq" def _call( self, prompt: str, stop: Optional[List[str]] = None, run_manager: Optional[CallbackManagerForLLMRun] = None, **kwargs: Any, ) -> str: response = self.client.chat.completions.create( model=self.model_name, messages=[{"role": "user", "content": prompt}], temperature=self.temperature, ) return response.choices[0].message.content # Globals vectorstore = None qa_chain = None groq_llm = None # ----------------------------------------------------------- # PROCESS PDF # ----------------------------------------------------------- def upload_pdf(file): global vectorstore, qa_chain, groq_llm try: # Initialize Groq LLM if groq_llm is None: groq_llm = GroqWrapper(client=Groq(api_key=GROQ_API_KEY)) # Extract text from PDF text = "".join(page.extract_text() or "" for page in PdfReader(file).pages) if not text.strip(): return "Error: No readable text found in PDF" # Chunk the text splitter = RecursiveCharacterTextSplitter( chunk_size=1000, chunk_overlap=150, separators=["\n\n", "\n", ".", "?", "!"] ) chunks = splitter.split_text(text) # Create Vectorstore embeddings = HuggingFaceEmbeddings( model_name="sentence-transformers/msmarco-MiniLM-L-12-v3" ) vectorstore = FAISS.from_texts(chunks, embeddings) # --- CUSTOM REFINE PROMPTS --- initial_prompt = PromptTemplate( input_variables=["context", "question"], template=""" You are an expert researcher. Use ONLY the given context to answer the question. If the answer is not in the context, say "I don't know". Context: {context} Question: {question} Initial Answer: """ ) refine_prompt = PromptTemplate( input_variables=["context", "question", "existing_answer"], template=""" We have an existing answer: {existing_answer} Using the additional context below, refine the answer. Additional Context: {context} Question: {question} Refined Answer: """ ) # --- BUILD QA CHAIN --- qa_chain = RetrievalQA.from_chain_type( llm=groq_llm, retriever=vectorstore.as_retriever(), chain_type="refine", return_source_documents=True, chain_type_kwargs={ "question_prompt": initial_prompt, "refine_prompt": refine_prompt, "document_variable_name": "context" # <-- ADD THIS LINE } ) return "PDF processed successfully!" except Exception as e: return f"Error: {str(e)}" # ----------------------------------------------------------- # QUESTION ANSWERING # ----------------------------------------------------------- def ask_question(query): global qa_chain if qa_chain is None: return "Please upload a PDF first.", "" try: result = qa_chain({"query": query}) answer = result["result"] # Format sources sources = result.get("source_documents", []) if sources: source_text = "\n\n---\n".join( f"Source {i+1}:\n{doc.page_content[:500]}..." for i, doc in enumerate(sources) ) else: source_text = "No sources found." return answer, source_text except Exception as e: return f"Error: {str(e)}", "" # ----------------------------------------------------------- # SUMMARIZE PDF # ----------------------------------------------------------- def summarize_pdf(num_points=6): global groq_llm, vectorstore if vectorstore is None: return "Please upload a PDF first." try: docs = vectorstore.similarity_search("summary", k=5) context = "\n\n".join([d.page_content for d in docs]) prompt = f""" Summarize the research paper in {num_points} bullet points. Make it clear, meaningful, and highlight key contributions. Content: {context} Summary: """ if groq_llm is None: groq_llm = GroqWrapper(client=Groq(api_key=GROQ_API_KEY)) return groq_llm(prompt).strip() except Exception as e: return f"Error: {str(e)}" # ----------------------------------------------------------- # FIND SIMILAR PAPERS (arXiv) # ----------------------------------------------------------- def extract_title(text): # Take the first non-empty line as the title for line in text.split("\n"): line = line.strip() if line: return line return "Research Paper" # fallback if empty def find_similar_papers(): global vectorstore if vectorstore is None: return "Please upload a PDF first." try: # Get full PDF text from all chunks docs = vectorstore.similarity_search("", k=30) full_pdf_text = " ".join(d.page_content for d in docs) if not full_pdf_text.strip(): return "PDF content too small." # ---------------------------- # 1️⃣ Extract only the title # ---------------------------- title = extract_title(full_pdf_text) query_text = title # Use only the title for arXiv search # ---------------------------- # 2️⃣ Search arXiv # ---------------------------- encoded_query = urllib.parse.quote(query_text) url = f"http://export.arxiv.org/api/query?search_query=all:{encoded_query}&start=0&max_results=15" feed = feedparser.parse(url) entries = feed.entries if not entries: return "No similar papers found on arXiv." # ---------------------------- # 3️⃣ Use embeddings for ranking # ---------------------------- embedding_model = HuggingFaceEmbeddings( model_name="sentence-transformers/all-mpnet-base-v2" ) query_emb = embedding_model.embed_query(query_text) ranked = [] for entry in entries: candidate_text = entry.title # only title for similarity emb = embedding_model.embed_query(candidate_text) sim = dot(query_emb, emb) / (norm(query_emb) * norm(emb)) ranked.append({ "title": entry.title, "summary": entry.summary.replace("\n", " ").strip(), "link": entry.link, "similarity": sim }) # Sort by similarity ranked.sort(key=lambda x: x["similarity"], reverse=True) # ---------------------------- # 4️⃣ Format top 3 results # ---------------------------- output = [] for p in ranked[:3]: out = ( f"**{p['title']}**\n" f"{p['summary']}\n" f"🔗 {p['link']}\n" f"Similarity Score: {p['similarity']:.2f}" ) output.append(out) return "\n\n".join(output) except Exception as e: return f"Error: {str(e)}" css = ''' :root { --primary: #6e48aa; --secondary: #9d50bb; --accent: #4776e6; --dark: #1a1a2e; --darker: #16213e; --light: #f8f9fa; --success: #4caf50; --warning: #ff9800; --danger: #f44336; } body, .gradio-container { margin: 0; padding: 0; font-family: 'Segoe UI', 'Roboto', sans-serif; background: linear-gradient(135deg, var(--dark), var(--darker)); color: var(--light); min-height: 100vh; } .header { text-align: center; padding: 1.5rem 0; margin-bottom: 2rem; color: white; /* Make text white */ font-size: 3rem; font-weight: 800; letter-spacing: 1px; font-style: italic; /* Make it italic */ text-shadow: 0 2px 10px rgba(0,0,0,0.2); } .nav-tabs { display: flex; justify-content: center; margin-bottom: 2rem; gap: 1rem; } .tab-button { background: rgba(255,255,255,0.1); border: none; padding: 0.8rem 1.5rem; border-radius: 50px; color: white; font-weight: 600; cursor: pointer; transition: all 0.3s ease; box-shadow: 0 4px 6px rgba(0,0,0,0.1); } .tab-button:hover { background: rgba(255,255,255,0.2); transform: translateY(-2px); } .tab-button.active { background: linear-gradient(45deg, var(--primary), var(--accent)); box-shadow: 0 4px 15px rgba(110, 72, 170, 0.4); } .tab-content { display: none; animation: fadeIn 0.5s ease-out; } .tab-content.active { display: block; } .panel { background: rgba(255,255,255,0.05); border-radius: 16px; padding: 2rem; margin: 1rem auto; max-width: 900px; backdrop-filter: blur(10px); border: 1px solid rgba(255,255,255,0.1); box-shadow: 0 8px 32px rgba(0,0,0,0.2); } .panel-header { font-size: 1.5rem; font-weight: 700; margin-bottom: 1.5rem; color: white; display: flex; align-items: center; gap: 0.8rem; } .panel-header svg { width: 1.5rem; height: 1.5rem; } button { background: linear-gradient(45deg, var(--primary), var(--secondary)); color: white; border: none; padding: 0.8rem 1.5rem; border-radius: 50px; font-weight: 600; cursor: pointer; transition: all 0.3s ease; box-shadow: 0 4px 15px rgba(110, 72, 170, 0.3); margin: 0.5rem 0; } button:hover { transform: translateY(-2px); box-shadow: 0 6px 20px rgba(110, 72, 170, 0.4); } button:active { transform: translateY(0); } button.secondary { background: rgba(255,255,255,0.1); } button.secondary:hover { background: rgba(255,255,255,0.2); } textarea, input[type="text"] { background: rgba(255,255,255,0.1); border: 1px solid rgba(255,255,255,0.2); color: white; border-radius: 8px; padding: 0.8rem; width: 100%; margin-bottom: 1rem; } textarea:focus, input[type="text"]:focus { outline: none; border-color: var(--accent); box-shadow: 0 0 0 2px rgba(71, 118, 230, 0.3); } .output-box { background: rgba(0,0,0,0.3); border-radius: 8px; padding: 1rem; margin-top: 1rem; border-left: 4px solid var(--accent); } .output-label { font-weight: 600; margin-bottom: 0.5rem; display: block; color: #ddd; } @keyframes fadeIn { from { opacity: 0; transform: translateY(10px); } to { opacity: 1; transform: translateY(0); } } .slide-in { animation: slideIn 0.5s ease-out forwards; } @keyframes slideIn { from { transform: translateX(100%); opacity: 0; } to { transform: translateX(0); opacity: 1; } } .file-upload { border: 2px dashed rgba(255,255,255,0.3); border-radius: 8px; padding: 2rem; text-align: center; margin-bottom: 1rem; transition: all 0.3s ease; } .file-upload:hover { border-color: var(--accent); background: rgba(71, 118, 230, 0.1); } .progress-bar { height: 6px; background: rgba(255,255,255,0.1); border-radius: 3px; margin-top: 1rem; overflow: hidden; } .progress { height: 100%; background: linear-gradient(90deg, var(--primary), var(--accent)); width: 0%; transition: width 0.3s ease; } ''' with gr.Blocks(css=css) as demo: gr.Markdown("""