wangoes-dev's picture
update
86a3842 verified
import os
import time
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from dotenv import load_dotenv
from PyPDF2 import PdfReader
import streamlit as st
from pptx import Presentation
from pptx.util import Inches
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain_groq import ChatGroq
from langchain.memory import ConversationBufferMemory
from gtts import gTTS
def load_groq_api_key():
    """Fetch the Groq API key from the process environment.

    Returns:
        The value of the ``GROQ_API_KEY`` environment variable.

    Raises:
        ValueError: If the variable is unset or empty.
    """
    api_key = os.environ.get("GROQ_API_KEY")
    if api_key:
        return api_key
    raise ValueError("Error: GROQ_API_KEY not found in environment variables.")
# 🔹 Process Text (Split & Embed)
def process_text(text):
    """Split *text* into overlapping chunks and index them in a FAISS store.

    Args:
        text: Raw document text to index.

    Returns:
        The FAISS vector store built from the chunks, embedded with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 3000,
        "chunk_overlap": 500,
        "length_function": len,
    }
    pieces = CharacterTextSplitter(**splitter_options).split_text(text)
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return FAISS.from_texts(pieces, embedder)
# 🔹 Generate Structured Summary
def generate_summary(knowledgeBase):
    """Ask the LLM for a structured, bulleted summary of the indexed paper.

    Args:
        knowledgeBase: Vector store exposing ``as_retriever()``.

    Returns:
        The ``'result'`` entry of the RetrievalQA response.
    """
    summary_prompt = "Summarize the research paper in a structured format, covering objective, proposed model, methods, evaluation, comparison, and key results. Keep it concise and clear, using bullet points."
    doc_retriever = knowledgeBase.as_retriever()
    groq_llm = ChatGroq(
        model_name="llama3-8b-8192",
        groq_api_key=os.getenv("GROQ_API_KEY"),
        temperature=0.1,
    )
    qa_chain = RetrievalQA.from_chain_type(llm=groq_llm, retriever=doc_retriever)
    return qa_chain.invoke({"query": summary_prompt})['result']
# 🔹 Generate Importance Analysis
def generate_importance_analysis(knowledgeBase):
    """Ask the LLM why the indexed paper matters and what readers should learn.

    Args:
        knowledgeBase: Vector store exposing ``as_retriever()``.

    Returns:
        The ``'result'`` entry of the RetrievalQA response.
    """
    prompt_parts = (
        "Analyze why this research paper is important for the world and what readers should learn from it. ",
        "Focus on:\n",
        "1. The global significance of this research\n",
        "2. Potential real-world applications\n",
        "3. Key takeaways for readers\n",
        "4. How it advances the field\n",
        "Present in clear, concise bullet points with emojis for better readability.",
    )
    analysis_prompt = "".join(prompt_parts)
    doc_retriever = knowledgeBase.as_retriever()
    groq_llm = ChatGroq(
        model_name="llama3-70b-8192",
        groq_api_key=os.getenv("GROQ_API_KEY"),
        temperature=0.2,
    )
    qa_chain = RetrievalQA.from_chain_type(llm=groq_llm, retriever=doc_retriever)
    return qa_chain.invoke({"query": analysis_prompt})['result']
# 🔹 Initialize Document Chatbot
def init_document_chatbot(knowledgeBase):
    """Build a conversational retrieval chain over the indexed document.

    Args:
        knowledgeBase: Vector store exposing ``as_retriever()``.

    Returns:
        A ``ConversationalRetrievalChain`` wired to a Groq LLM, the store's
        retriever, and a buffer memory stored under ``chat_history``.
    """
    chat_memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    groq_llm = ChatGroq(
        model_name="llama3-8b-8192",
        groq_api_key=os.getenv("GROQ_API_KEY"),
        temperature=0.2,
    )
    chain_options = {
        "llm": groq_llm,
        "retriever": knowledgeBase.as_retriever(),
        "memory": chat_memory,
        "chain_type": "stuff",
    }
    return ConversationalRetrievalChain.from_llm(**chain_options)
def text_to_speech(text):
    """Convert *text* to an English MP3 with gTTS and return the file path.

    Each call writes to its own uniquely named file in the system temp
    directory, so concurrent sessions cannot overwrite each other's audio
    (a single fixed path would be shared by every user of the app).

    Args:
        text: The text to narrate.

    Returns:
        Path of the generated MP3 file, or ``None`` when *text* is empty or
        the conversion fails. Failures are printed, not raised, so the caller
        can treat audio as a best-effort extra.
    """
    import tempfile

    # Nothing to narrate: skip the network round-trip entirely.
    if not text or not text.strip():
        print("Error in gTTS: no text to convert")
        return None

    audio_path = None
    try:
        fd, audio_path = tempfile.mkstemp(prefix="summary_audio_", suffix=".mp3")
        os.close(fd)  # gTTS reopens the path itself; only the unique name is needed
        gTTS(text=text, lang='en').save(audio_path)
        # mkstemp always creates the file, so check that audio was actually written.
        if os.path.getsize(audio_path) == 0:
            raise RuntimeError("Audio file not created")
        return audio_path
    except Exception as e:
        print(f"Error in gTTS: {e}")
        # Do not leave an empty or partial temp file behind on failure.
        if audio_path and os.path.exists(audio_path):
            os.remove(audio_path)
        return None
# 🔹 Generate WordCloud
def generate_wordcloud(text):
    """Render a word cloud of *text* into the current Streamlit container.

    The image is rendered into an in-memory PNG rather than a shared file in
    the working directory, and the matplotlib figure is always closed so that
    repeated Streamlit reruns do not accumulate open figures.

    Args:
        text: Source text for the word cloud.
    """
    import io

    # WordCloud.generate raises on text without any words; show a notice instead.
    if not text or not text.strip():
        st.warning("Not enough text to build a word cloud.")
        return

    wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text)
    fig = plt.figure(figsize=(10, 5))
    try:
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        png_buffer = io.BytesIO()
        fig.savefig(png_buffer, format="png", bbox_inches="tight")
    finally:
        # Figures created through pyplot stay registered until closed explicitly.
        plt.close(fig)
    png_buffer.seek(0)
    st.image(png_buffer, caption="πŸ”  WordCloud of Important Keywords", use_container_width=True)
# 🔹 Convert Summary to PowerPoint
def generate_ppt(summary):
    """Turn a text summary into a PowerPoint deck and return its filename.

    The deck starts with a title slide, followed by one "Title and Content"
    slide per blank-line-separated section of *summary*: the section's first
    non-blank line becomes the slide title and the remaining lines the body.
    Sections that contain no text are skipped so no empty slides are produced.

    Args:
        summary: Summary text; sections are separated by blank lines.

    Returns:
        The filename the presentation was saved under.
    """
    import re

    prs = Presentation()
    title_slide_layout = prs.slide_layouts[0]
    slide_layout = prs.slide_layouts[1]  # Title and Content Layout

    # Add Title Slide
    cover = prs.slides.add_slide(title_slide_layout)
    cover.shapes.title.text = "Research Paper Summary"

    # Add Content Slides. Splitting on "newline, optional whitespace, newline"
    # also handles \r\n line endings and separator lines holding only spaces.
    for section in re.split(r"\n\s*\n", summary or ""):
        lines = [line for line in section.splitlines() if line.strip()]
        if not lines:
            continue  # blank section: would otherwise yield an empty slide
        slide = prs.slides.add_slide(slide_layout)
        slide.shapes.title.text = lines[0].strip()  # First line as title
        slide.shapes.placeholders[1].text = "\n".join(lines[1:])  # Remaining as bullet points

    # Save PowerPoint File
    ppt_filename = "summary_presentation.pptx"
    prs.save(ppt_filename)
    return ppt_filename
# 🔹 Display PDF Info for Engagement
def display_pdf_info(text, pdf_reader):
    """Show page count, word count and a short preview of the document.

    Args:
        text: Extracted document text.
        pdf_reader: Reader object whose ``pages`` length gives the page count.
    """
    page_total = len(pdf_reader.pages)
    words = text.split()
    preview = " ".join(words[:50]) + "..."

    st.subheader("πŸ“„ PDF Insights")
    st.write(f"πŸ“ **Total Pages:** {page_total}")
    st.write(f"πŸ”’ **Word Count:** {len(words)}")
    st.write(f"πŸ“Œ **First Few Lines:** {preview}")

    with st.expander("πŸ” **View More Insights**"):
        st.write("πŸ’‘ **Pro Tip:** LLaMA-3 can summarize large documents in seconds! πŸš€")
        st.info(
            "πŸ“– Research papers are typically structured into sections like Abstract, Introduction, Methods, and Results. AI captures these key elements!"
        )
# 🔹 Document Chatbot Interface
def document_chatbot_interface(conversation_chain):
    """Render the chat UI and answer questions with *conversation_chain*.

    Chat history is kept in ``st.session_state.messages`` so it survives
    Streamlit reruns. Answers in which the model signals that it does not
    know are replaced by a fixed "not covered in the document" notice.

    Args:
        conversation_chain: Chain that accepts ``{"question": ...}`` and
            returns a mapping with an ``"answer"`` entry.
    """
    st.subheader("πŸ’¬ Document Chatbot")
    st.warning(
        "This chatbot only answers questions about the uploaded document. It won't respond to general questions.")

    # Initialize chat history
    if "messages" not in st.session_state:
        st.session_state.messages = []
        st.session_state.messages.append({
            "role": "assistant",
            "content": "Ask me anything about the research paper you uploaded! For example:\n\n"
                       "β€’ What is the main objective of this research?\n"
                       "β€’ Can you explain the methodology used?\n"
                       "β€’ What were the key findings?\n"
                       "β€’ How does this compare to previous work?"
        })

    # Display chat messages from history
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Accept user input
    prompt = st.chat_input("Ask about the research paper...")
    if not prompt:
        return

    # Add user message to chat history and echo it
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    # Display assistant response in chat message container
    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            try:
                # Use .invoke like the other chains in this file; calling the
                # chain object directly is the deprecated form.
                response = conversation_chain.invoke({"question": prompt})
                answer = response["answer"]
            except Exception as e:
                # Keep the UI message generic but leave a trace in the server log.
                print(f"Error in document chatbot: {e}")
                st.error("Sorry, I encountered an error processing your question. Please try again.")
                st.session_state.messages.append({"role": "assistant", "content": "Error processing request"})
                return

        # Check if answer is relevant to document (case-insensitive for both phrases)
        normalized = answer.lower()
        if "i don't know" in normalized or "not mentioned" in normalized:
            answer = "This information is not covered in the document. Please ask questions specifically about the research paper content."
        st.markdown(answer)
        st.session_state.messages.append({"role": "assistant", "content": answer})
# 🔹 Main Streamlit App
# Static HTML fragments for the "Why This Matters" tab. They are kept at module
# level so their lines start at column 0: Markdown treats lines indented by
# four or more spaces as a code block.
_IMPORTANCE_STYLE_HTML = """
<style>
.big-font {
font-size:18px !important;
color: #2e86de;
}
.highlight {
background-color: #f5f6fa;
padding: 10px;
border-radius: 5px;
border-left: 4px solid #4b7bec;
}
</style>
"""

_IMPORTANCE_INTRO_HTML = """
<div class="highlight">
<p class="big-font">This analysis explains why the paper you uploaded is important and what you should learn from it.</p>
</div>
"""

_IMPORTANCE_FOOTER_HTML = """
<div style="margin-top: 20px; padding: 10px; background-color: #f8f9fa; border-radius: 5px;">
<h4>πŸ’‘ How to Apply This Knowledge</h4>
<ul>
<li>Consider how these findings might impact your work or studies</li>
<li>Think about potential applications in your field</li>
<li>Identify areas for further research or implementation</li>
</ul>
</div>
"""


def _extract_pdf_text(pdf_reader):
    """Return the text of all pages that yield any, joined by newlines."""
    page_texts = []
    for page in pdf_reader.pages:
        page_text = page.extract_text()  # extract once per page; it is costly
        if page_text:
            page_texts.append(page_text)
    # A newline between pages keeps the last word of one page from fusing with
    # the first word of the next, and gives the "\n" splitter a boundary.
    return "\n".join(page_texts)


def main():
    """Run the Streamlit app: upload a PDF, then summarize, analyze and chat.

    Streamlit re-executes this function on every interaction (chat message,
    download click, ...). All expensive artefacts for the uploaded document
    (extracted text, vector store, LLM outputs, audio, slides and the
    conversation chain with its memory) are therefore cached in
    ``st.session_state`` and rebuilt only when a different file is uploaded.
    """
    import hashlib

    st.title("πŸ“„ Advanced Research Paper Analyzer")
    st.write("πŸš€ Powered by LLaMA-3 on Groq - Understand why research matters and what you should learn")
    st.divider()

    # Pick up a local .env file if present; existing environment variables win.
    load_dotenv()
    try:
        os.environ["GROQ_API_KEY"] = load_groq_api_key()
    except ValueError as e:
        st.error(str(e))
        return

    pdf = st.file_uploader("πŸ“€ Upload your Research Paper (PDF)", type="pdf")
    if pdf is None:
        return

    # Identify the upload by content so a new file invalidates the cache.
    doc_key = hashlib.sha256(pdf.getvalue()).hexdigest()
    if st.session_state.get("doc_key") != doc_key:
        with st.spinner("πŸ”„ Extracting text & analyzing PDF... Please wait!"):
            try:
                pdf_reader = PdfReader(pdf)
                text = _extract_pdf_text(pdf_reader)
            except Exception as e:
                st.error(f"Could not read the uploaded PDF: {e}")
                return
            if not text.strip():
                # e.g. scanned PDFs without a text layer; indexing would fail.
                st.error("No extractable text was found in this PDF.")
                return
            knowledge_base = process_text(text)
        st.session_state["doc"] = {
            "text": text,
            "pdf_reader": pdf_reader,
            "knowledge_base": knowledge_base,
        }
        st.session_state["doc_key"] = doc_key
        # Start a fresh chat history for the new document.
        if "messages" in st.session_state:
            del st.session_state["messages"]
    doc = st.session_state["doc"]

    display_pdf_info(doc["text"], doc["pdf_reader"])
    st.success("βœ… PDF processed successfully! Now generating insights...")

    # Create tabs for different analysis sections
    tab1, tab2, tab3 = st.tabs(["πŸ“œ Summary", "🌍 Why This Matters", "πŸ’¬ Chat with Paper"])

    with tab1:
        if "summary" not in doc:
            with st.spinner("🧠 Generating comprehensive summary..."):
                doc["summary"] = generate_summary(doc["knowledge_base"])
        summary = doc["summary"]
        st.subheader("πŸ“œ Structured Summary:")
        # Model output is derived from an untrusted PDF: render it as plain
        # Markdown, without raw-HTML passthrough.
        st.markdown(summary)

        # Audio Conversion (bytes are cached so the file on disk is read once)
        if "audio" not in doc:
            audio_file = text_to_speech(summary)
            if audio_file:
                with open(audio_file, "rb") as file:
                    doc["audio"] = file.read()
            else:
                doc["audio"] = None
        if doc["audio"]:
            st.audio(doc["audio"], format="audio/mp3")
        else:
            st.warning("Audio narration is unavailable for this summary.")

        # WordCloud Generation
        generate_wordcloud(summary)

        # PowerPoint Conversion
        if "ppt" not in doc:
            ppt_file = generate_ppt(summary)
            with open(ppt_file, "rb") as file:
                doc["ppt"] = file.read()
        st.download_button(label="πŸ“₯ Download Summary PPT", data=doc["ppt"], file_name="Research_Summary.pptx")

    with tab2:
        if "importance" not in doc:
            with st.spinner("πŸ” Analyzing global significance and key learnings..."):
                doc["importance"] = generate_importance_analysis(doc["knowledge_base"])
        st.subheader("🌍 Why This Research Matters")
        st.markdown(_IMPORTANCE_STYLE_HTML, unsafe_allow_html=True)
        st.markdown(_IMPORTANCE_INTRO_HTML, unsafe_allow_html=True)
        st.markdown(doc["importance"])  # model output: no raw-HTML passthrough
        st.markdown(_IMPORTANCE_FOOTER_HTML, unsafe_allow_html=True)

    with tab3:
        # Reuse one chain per document so its conversation memory persists
        # across reruns instead of being reset on every message.
        if "conversation_chain" not in doc:
            doc["conversation_chain"] = init_document_chatbot(doc["knowledge_base"])
        document_chatbot_interface(doc["conversation_chain"])
# Invoke main() only when this file is executed as the entry script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()