Spaces:

sdmadhav
/

PaperChat

Sleeping

App Files Files Community

PaperChat / app.py

sdmadhav

Update app.py

fddd403 verified 5 months ago

raw

history blame contribute delete

7.86 kB

	import streamlit as st
	import os
	import tempfile
	from dotenv import load_dotenv

	# Load environment variables
	load_dotenv()

	# Import required libraries
	try:
	from langchain_community.document_loaders import PyPDFLoader
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from langchain_core.documents import Document
	from langchain_community.retrievers import BM25Retriever
	from smolagents import Tool, CodeAgent, InferenceClientModel
	except ImportError as e:
	st.error(f"Missing dependency: {e}. Please install all requirements.")
	st.stop()

	# Custom Retriever Tool
	class RetrieverTool(Tool):
	name = "retriever"
	description = "Uses semantic search to retrieve the parts of the research paper that could be most relevant to answer your query."
	inputs = {
	"query": {
	"type": "string",
	"description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
	}
	}
	output_type = "string"

	def __init__(self, docs, **kwargs):
	super().__init__(**kwargs)
	self.retriever = BM25Retriever.from_documents(docs, k=10)

	def forward(self, query: str) -> str:
	"""Execute the retrieval based on the provided query."""
	assert isinstance(query, str), "Your search query must be a string"

	docs = self.retriever.invoke(query)

	return "\nRetrieved documents:\n" + "".join([
	f"\n\n===== Document {str(i)} (Page {doc.metadata.get('page', 'N/A')}) =====\n" + doc.page_content
	for i, doc in enumerate(docs)
	])

	# Function to load and process PDF
	@st.cache_resource
	def load_and_process_pdf(pdf_path):
	"""Load PDF and split into chunks for retrieval."""
	try:
	# Load PDF
	loader = PyPDFLoader(pdf_path)
	pages = loader.load()

	# Split into chunks
	text_splitter = RecursiveCharacterTextSplitter(
	chunk_size=500,
	chunk_overlap=50,
	add_start_index=True,
	strip_whitespace=True,
	separators=["\n\n", "\n", ".", " ", ""],
	)
	docs_processed = text_splitter.split_documents(pages)

	return docs_processed, len(pages)
	except Exception as e:
	st.error(f"Error processing PDF: {e}")
	return None, 0

	# Function to create agent
	@st.cache_resource
	def create_agent(_docs):
	"""Create the RAG agent with retriever tool."""
	retriever_tool = RetrieverTool(_docs)

	# Use FREE Hugging Face model (Qwen 2.5 72B via serverless inference)
	agent = CodeAgent(
	tools=[retriever_tool],
	model=InferenceClientModel(
	model_id="Qwen/Qwen2.5-72B-Instruct",
	token=os.getenv("HF_TOKEN")
	),
	max_steps=4,
	verbosity_level=0,
	)

	return agent

	# Streamlit UI
	def main():
	st.set_page_config(
	page_title="PaperChat",
	page_icon="📄",
	layout="wide"
	)

	# Header
	st.title("📄 PaperChat - Research Paper Q&A Assistant")
	st.markdown("""
	Upload any research paper (PDF) and ask questions about it.
	Powered by Agentic RAG with retrieval capabilities.
	""")

	# Sidebar
	with st.sidebar:
	st.header("📤 Upload Paper")
	uploaded_file = st.file_uploader(
	"Choose a PDF file",
	type="pdf",
	help="Upload a research paper in PDF format"
	)

	st.markdown("---")
	st.subheader("📚 Example Questions")
	st.markdown("""
	- What is the main contribution of this paper?
	- What methodology was used?
	- What are the key results?
	- What datasets were used?
	- What are the limitations mentioned?
	""")

	st.markdown("---")
	st.subheader("ℹ️ How it works")
	st.markdown("""
	1. Upload your paper
	2. The system chunks and indexes it
	3. Ask questions naturally
	4. Get answers with source citations
	""")

	# Main content area
	if uploaded_file is not None:
	# Save uploaded file to temporary location
	with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
	tmp_file.write(uploaded_file.read())
	tmp_path = tmp_file.name

	# Process PDF
	with st.spinner("🔄 Processing your paper... This may take a moment."):
	docs, num_pages = load_and_process_pdf(tmp_path)

	if docs:
	st.success(f"✅ Paper loaded successfully! ({num_pages} pages, {len(docs)} chunks)")

	# Create agent
	with st.spinner("🤖 Initializing AI agent..."):
	agent = create_agent(docs)

	st.success("✅ Agent ready! You can now ask questions.")

	# Chat interface
	st.markdown("---")
	st.subheader("💬 Ask Questions")

	# Initialize chat history
	if "messages" not in st.session_state:
	st.session_state.messages = []

	# Display chat history
	for message in st.session_state.messages:
	with st.chat_message(message["role"]):
	st.markdown(message["content"])

	# Chat input
	if question := st.chat_input("Ask a question about the paper..."):
	# Add user message to chat history
	st.session_state.messages.append({"role": "user", "content": question})

	# Display user message
	with st.chat_message("user"):
	st.markdown(question)

	# Generate response
	with st.chat_message("assistant"):
	with st.spinner("🤔 Thinking..."):
	try:
	answer = agent.run(question)
	st.markdown(answer)

	# Add assistant response to chat history
	st.session_state.messages.append({"role": "assistant", "content": answer})
	except Exception as e:
	error_msg = f"Error generating answer: {str(e)}"
	st.error(error_msg)
	st.session_state.messages.append({"role": "assistant", "content": error_msg})

	# Clear chat button
	if st.button("🗑️ Clear Chat History"):
	st.session_state.messages = []
	st.rerun()

	# Cleanup temp file
	try:
	os.unlink(tmp_path)
	except:
	pass

	else:
	# Welcome message when no file is uploaded
	st.info("👈 Please upload a research paper PDF from the sidebar to get started.")

	st.markdown("### 🎯 What can you do with PaperChat?")
	col1, col2, col3 = st.columns(3)

	with col1:
	st.markdown("""
	#### 📖 Understand Papers
	- Get summaries of complex papers
	- Understand methodology
	- Learn about key findings
	""")

	with col2:
	st.markdown("""
	#### 🔍 Extract Information
	- Find specific details
	- Locate datasets used
	- Identify citations
	""")

	with col3:
	st.markdown("""
	#### 💡 Learn Faster
	- Ask follow-up questions
	- Clarify concepts
	- Compare approaches
	""")

	if __name__ == "__main__":
	main()