Spaces:

wangoes-dev
/

Wangoes_PDF_Analyzer_and_Summarizer

Sleeping

App Files Files Community

Wangoes_PDF_Analyzer_and_Summarizer / app.py

wangoes-dev

update

86a3842 verified 10 months ago

raw

history blame contribute delete

11.7 kB

	import os
	import time
	import matplotlib.pyplot as plt
	from wordcloud import WordCloud
	from dotenv import load_dotenv
	from PyPDF2 import PdfReader
	import streamlit as st
	from pptx import Presentation
	from pptx.util import Inches
	from langchain.text_splitter import CharacterTextSplitter
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from langchain_community.vectorstores import FAISS
	from langchain.chains import RetrievalQA, ConversationalRetrievalChain
	from langchain_groq import ChatGroq
	from langchain.memory import ConversationBufferMemory

	from gtts import gTTS

	def load_groq_api_key():
	    """Return the Groq API key read from the ``GROQ_API_KEY`` environment variable.

	    Raises:
	        ValueError: If the variable is unset or holds an empty string.
	    """
	    api_key = os.environ.get("GROQ_API_KEY")
	    if api_key:
	        return api_key
	    raise ValueError("Error: GROQ_API_KEY not found in environment variables.")


	# 🔹 Process Text (Split & Embed)
	def process_text(text):
	    """Split *text* into overlapping chunks and index them in a FAISS vector store.

	    The splitter cuts on newlines with ``chunk_size=3000`` and
	    ``chunk_overlap=500`` (measured with ``len``); the chunks are embedded
	    with the ``sentence-transformers/all-MiniLM-L6-v2`` model.

	    Returns:
	        The FAISS store built from the embedded chunks.
	    """
	    splitter_options = {
	        "separator": "\n",
	        "chunk_size": 3000,
	        "chunk_overlap": 500,
	        "length_function": len,
	    }
	    pieces = CharacterTextSplitter(**splitter_options).split_text(text)

	    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
	    return FAISS.from_texts(pieces, embedder)


	# 🔹 Generate Structured Summary
	def generate_summary(knowledgeBase):
	    """Ask the LLM for a structured, bullet-point summary of the indexed paper.

	    Builds a ``RetrievalQA`` chain over ``knowledgeBase`` using the
	    ``llama3-8b-8192`` Groq model (temperature 0.1) and returns the
	    ``'result'`` field of the chain's response.
	    """
	    prompt = "Summarize the research paper in a structured format, covering objective, proposed model, methods, evaluation, comparison, and key results. Keep it concise and clear, using bullet points."

	    doc_retriever = knowledgeBase.as_retriever()
	    groq_llm = ChatGroq(
	        model_name="llama3-8b-8192",
	        groq_api_key=os.getenv("GROQ_API_KEY"),
	        temperature=0.1,
	    )
	    qa_chain = RetrievalQA.from_chain_type(llm=groq_llm, retriever=doc_retriever)

	    return qa_chain.invoke({"query": prompt})['result']


	# 🔹 Generate Importance Analysis
	def generate_importance_analysis(knowledgeBase):
	    """Ask the LLM why the indexed paper matters and what readers should learn.

	    Builds a ``RetrievalQA`` chain over ``knowledgeBase`` using the
	    ``llama3-70b-8192`` Groq model (temperature 0.2) and returns the
	    ``'result'`` field of the chain's response.
	    """
	    focus_points = (
	        "1. The global significance of this research",
	        "2. Potential real-world applications",
	        "3. Key takeaways for readers",
	        "4. How it advances the field",
	    )
	    # Joined with newlines so the prompt text matches the single-string form exactly.
	    prompt = "\n".join((
	        "Analyze why this research paper is important for the world and what readers should learn from it. Focus on:",
	        *focus_points,
	        "Present in clear, concise bullet points with emojis for better readability.",
	    ))

	    doc_retriever = knowledgeBase.as_retriever()
	    groq_llm = ChatGroq(
	        model_name="llama3-70b-8192",
	        groq_api_key=os.getenv("GROQ_API_KEY"),
	        temperature=0.2,
	    )
	    qa_chain = RetrievalQA.from_chain_type(llm=groq_llm, retriever=doc_retriever)

	    return qa_chain.invoke({"query": prompt})['result']


	# 🔹 Initialize Document Chatbot
	def init_document_chatbot(knowledgeBase):
	    """Build a conversational retrieval chain over ``knowledgeBase``.

	    The chain uses the ``llama3-8b-8192`` Groq model (temperature 0.2), the
	    "stuff" chain type, and a fresh ``ConversationBufferMemory`` stored under
	    the ``chat_history`` key.
	    """
	    chat_memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
	    groq_llm = ChatGroq(
	        model_name="llama3-8b-8192",
	        groq_api_key=os.getenv("GROQ_API_KEY"),
	        temperature=0.2,
	    )
	    doc_retriever = knowledgeBase.as_retriever()

	    chatbot = ConversationalRetrievalChain.from_llm(
	        llm=groq_llm,
	        retriever=doc_retriever,
	        memory=chat_memory,
	        chain_type="stuff",
	    )
	    return chatbot




	def text_to_speech(text):
	    """Convert *text* to an English MP3 with gTTS and return the file path.

	    Best-effort: any failure (including the file not existing after saving)
	    is printed and ``None`` is returned instead of raising.
	    """
	    audio_path = "/tmp/summary_audio.mp3"  # Use /tmp/ for Hugging Face Spaces
	    try:
	        gTTS(text=text, lang='en').save(audio_path)

	        # Confirm the file really exists before handing the path back.
	        if not os.path.exists(audio_path):
	            raise Exception("Audio file not created")
	    except Exception as err:
	        print(f"Error in gTTS: {err}")
	        return None
	    return audio_path


	# 🔹 Generate WordCloud
	def generate_wordcloud(text):
	    """Render a word cloud of *text*, save it as ``wordcloud.png`` and display it.

	    The image is written to the current working directory and then shown
	    with ``st.image``. The matplotlib figure is always closed afterwards:
	    pyplot keeps every figure alive until it is explicitly closed, so without
	    this each Streamlit rerun would leak another figure.
	    """
	    wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text)

	    fig = plt.figure(figsize=(10, 5))
	    try:
	        plt.imshow(wordcloud, interpolation="bilinear")
	        plt.axis("off")
	        fig.savefig("wordcloud.png", bbox_inches="tight")
	    finally:
	        # Release the figure even if rendering or saving fails.
	        plt.close(fig)

	    st.image("wordcloud.png", caption="🔠 WordCloud of Important Keywords", use_container_width=True)


	# 🔹 Convert Summary to PowerPoint
	def generate_ppt(summary):
	    """Turn *summary* into a PowerPoint deck and return the saved file name.

	    The deck starts with a title slide, followed by one "Title and Content"
	    slide per section of *summary*. Sections are separated by blank lines;
	    the first line of a section becomes the slide title and the remaining
	    lines become its body text.

	    Sections that are empty or whitespace-only are skipped, and surrounding
	    whitespace is stripped, so stray blank lines in the summary no longer
	    produce empty slides or slides with an empty title.

	    Returns:
	        ``"summary_presentation.pptx"``, written to the current working directory.
	    """
	    prs = Presentation()

	    # Add Title Slide
	    title_slide = prs.slides.add_slide(prs.slide_layouts[0])
	    title_slide.shapes.title.text = "Research Paper Summary"

	    # Add Content Slides
	    content_layout = prs.slide_layouts[1]  # Title and Content Layout
	    for raw_section in summary.split("\n\n"):
	        section = raw_section.strip()
	        if not section:
	            continue

	        # First line is the slide title; everything after it is the body.
	        heading, _, body = section.partition("\n")
	        slide = prs.slides.add_slide(content_layout)
	        slide.shapes.title.text = heading
	        slide.shapes.placeholders[1].text = body

	    # Save PowerPoint File
	    ppt_filename = "summary_presentation.pptx"
	    prs.save(ppt_filename)
	    return ppt_filename


	# 🔹 Display PDF Info for Engagement
	def display_pdf_info(text, pdf_reader):
	    """Show basic statistics about the uploaded PDF in the Streamlit page.

	    Displays the page count of ``pdf_reader``, the whitespace-separated word
	    count of *text*, a preview made of its first 50 words, and an expander
	    with general tips.
	    """
	    page_total = len(pdf_reader.pages)
	    words = text.split()
	    preview = " ".join(words[:50]) + "..."

	    st.subheader("📄 PDF Insights")
	    for line in (
	        f"📝 Total Pages: {page_total}",
	        f"🔢 Word Count: {len(words)}",
	        f"📌 First Few Lines: {preview}",
	    ):
	        st.write(line)

	    with st.expander("🔍 View More Insights"):
	        st.write("💡 Pro Tip: LLaMA-3 can summarize large documents in seconds! 🚀")
	        st.info(
	            "📖 Research papers are typically structured into sections like Abstract, Introduction, Methods, and Results. AI captures these key elements!"
	        )


	# 🔹 Document Chatbot Interface
	def document_chatbot_interface(conversation_chain):
	    """Render the chat UI and answer questions through ``conversation_chain``.

	    The visible history lives in ``st.session_state.messages`` and is seeded
	    with a welcome message the first time the interface is shown. Each user
	    prompt is sent to the chain with ``invoke`` (the same call style the
	    other chains in this file use). Answers that say the model does not
	    know, or that the topic is not mentioned, are replaced by a fixed
	    "not covered" message; both phrases are matched case-insensitively.

	    Errors from the chain are printed for the server log and reported to the
	    user with a generic message rather than raised.
	    """
	    st.subheader("💬 Document Chatbot")
	    st.warning(
	        "This chatbot only answers questions about the uploaded document. It won't respond to general questions.")

	    # Initialize chat history
	    if "messages" not in st.session_state:
	        st.session_state.messages = [{
	            "role": "assistant",
	            "content": "Ask me anything about the research paper you uploaded! For example:\n\n"
	                       "• What is the main objective of this research?\n"
	                       "• Can you explain the methodology used?\n"
	                       "• What were the key findings?\n"
	                       "• How does this compare to previous work?"
	        }]

	    # Display chat messages from history
	    for message in st.session_state.messages:
	        with st.chat_message(message["role"]):
	            st.markdown(message["content"])

	    # Accept user input
	    prompt = st.chat_input("Ask about the research paper...")
	    if not prompt:
	        return

	    # Record and echo the user's message
	    st.session_state.messages.append({"role": "user", "content": prompt})
	    with st.chat_message("user"):
	        st.markdown(prompt)

	    # Display assistant response in chat message container
	    with st.chat_message("assistant"):
	        with st.spinner("Thinking..."):
	            try:
	                response = conversation_chain.invoke({"question": prompt})
	                answer = response["answer"]
	            except Exception as e:
	                # Keep the UI message generic, but do not lose the cause.
	                print(f"Error in document chatbot: {e}")
	                st.error("Sorry, I encountered an error processing your question. Please try again.")
	                st.session_state.messages.append({"role": "assistant", "content": "Error processing request"})
	                return

	            # Check if answer is relevant to document
	            lowered = answer.lower()
	            if "i don't know" in lowered or "not mentioned" in lowered:
	                answer = "This information is not covered in the document. Please ask questions specifically about the research paper content."

	            st.markdown(answer)
	            st.session_state.messages.append({"role": "assistant", "content": answer})


	# 🔹 Main Streamlit App
	def main():
	    """Run the app: upload a PDF, then show summary, significance and chat tabs.

	    Streamlit re-executes this function on every interaction, including each
	    chat message. The expensive per-document artefacts (extracted text,
	    vector store, LLM outputs, audio, slide deck and the chat chain with its
	    memory) are therefore kept in ``st.session_state["doc_cache"]`` and
	    rebuilt only when a different file is uploaded. Uploading a new file also
	    clears the chat history of the previous document.

	    LLM output is rendered as plain Markdown (no raw HTML), because it is
	    derived from the contents of an uploaded, untrusted document.
	    """
	    # Pick up GROQ_API_KEY from a local .env file when one exists.
	    load_dotenv()

	    st.title("📄 Advanced Research Paper Analyzer")
	    st.write("🚀 Powered by LLaMA-3 on Groq - Understand why research matters and what you should learn")
	    st.divider()

	    try:
	        os.environ["GROQ_API_KEY"] = load_groq_api_key()
	    except ValueError as e:
	        st.error(str(e))
	        return

	    pdf = st.file_uploader("📤 Upload your Research Paper (PDF)", type="pdf")
	    if pdf is None:
	        return

	    # Identify the document by name and content so a rerun reuses cached work.
	    doc_key = (pdf.name, hash(pdf.getvalue()))
	    cache = st.session_state["doc_cache"] if "doc_cache" in st.session_state else None

	    with st.spinner("🔄 Extracting text & analyzing PDF... Please wait!"):
	        pdf_reader = PdfReader(pdf)
	        if cache is None or cache["key"] != doc_key:
	            text = _extract_pdf_text(pdf_reader)
	            if not text.strip():
	                # Nothing to index (e.g. a scanned, image-only PDF).
	                st.error("❌ No extractable text was found in this PDF. Please upload a text-based PDF.")
	                return
	            cache = {"key": doc_key, "text": text, "knowledge_base": process_text(text)}
	            st.session_state["doc_cache"] = cache
	            # A new document starts a new conversation.
	            if "messages" in st.session_state:
	                del st.session_state["messages"]

	    display_pdf_info(cache["text"], pdf_reader)

	    st.success("✅ PDF processed successfully! Now generating insights...")

	    # Create tabs for different analysis sections
	    tab1, tab2, tab3 = st.tabs(["📜 Summary", "🌍 Why This Matters", "💬 Chat with Paper"])

	    with tab1:
	        if "summary" not in cache:
	            with st.spinner("🧠 Generating comprehensive summary..."):
	                cache["summary"] = generate_summary(cache["knowledge_base"])
	        _render_summary_tab(cache)

	    with tab2:
	        if "importance" not in cache:
	            with st.spinner("🔍 Analyzing global significance and key learnings..."):
	                cache["importance"] = generate_importance_analysis(cache["knowledge_base"])
	        _render_importance_tab(cache["importance"])

	    with tab3:
	        # Reuse one chain per document so its conversation memory survives reruns.
	        if "conversation_chain" not in cache:
	            cache["conversation_chain"] = init_document_chatbot(cache["knowledge_base"])
	        document_chatbot_interface(cache["conversation_chain"])


	def _extract_pdf_text(pdf_reader):
	    """Return the text of every page that yields any, joined by newlines."""
	    page_texts = (page.extract_text() for page in pdf_reader.pages)
	    return "\n".join(page_text for page_text in page_texts if page_text)


	def _render_summary_tab(cache):
	    """Show the cached summary plus its audio, word cloud and PPT download."""
	    summary = cache["summary"]
	    st.subheader("📜 Structured Summary:")
	    st.markdown(summary)

	    # Audio Conversion (done once per document; the bytes are kept in the cache)
	    if "audio" not in cache:
	        audio_path = text_to_speech(summary)
	        if audio_path:
	            with open(audio_path, "rb") as audio_file:
	                cache["audio"] = audio_file.read()
	        else:
	            cache["audio"] = None
	    if cache["audio"]:
	        st.audio(cache["audio"], format="audio/mp3")
	    else:
	        st.warning("🔇 Audio narration could not be generated for this summary.")

	    # WordCloud Generation
	    generate_wordcloud(summary)

	    # PowerPoint Conversion (done once per document)
	    if "ppt" not in cache:
	        with open(generate_ppt(summary), "rb") as file:
	            cache["ppt"] = file.read()
	    st.download_button(label="📥 Download Summary PPT", data=cache["ppt"], file_name="Research_Summary.pptx")


	def _render_importance_tab(importance):
	    """Show the importance analysis between the static intro and advice panels."""
	    st.subheader("🌍 Why This Research Matters")
	    st.markdown("""
	    <style>
	    .big-font {
	    font-size:18px !important;
	    color: #2e86de;
	    }
	    .highlight {
	    background-color: #f5f6fa;
	    padding: 10px;
	    border-radius: 5px;
	    border-left: 4px solid #4b7bec;
	    }
	    </style>
	    """, unsafe_allow_html=True)

	    st.markdown("""
	    <div class="highlight">
	    <p class="big-font">This analysis explains why the paper you uploaded is important and what you should learn from it.</p>
	    </div>
	    """, unsafe_allow_html=True)

	    # Model output: rendered as Markdown only, never as raw HTML.
	    st.markdown(importance)

	    st.markdown("""
	    <div style="margin-top: 20px; padding: 10px; background-color: #f8f9fa; border-radius: 5px;">
	    <h4>💡 How to Apply This Knowledge</h4>
	    <ul>
	    <li>Consider how these findings might impact your work or studies</li>
	    <li>Think about potential applications in your field</li>
	    <li>Identify areas for further research or implementation</li>
	    </ul>
	    </div>
	    """, unsafe_allow_html=True)




	# Entry point: start the app when this module is run as the main program.
	if __name__ == "__main__":
	main()