# Hugging Face Space: wangoes-dev / Wangoes_PDF_Analyzer_and_Summarizer
# Space status when captured: Sleeping
# File size: 11,678 Bytes

import hashlib
import io
import os
import tempfile
import time

import matplotlib.pyplot as plt
import streamlit as st
from dotenv import load_dotenv
from gtts import gTTS
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from pptx import Presentation
from pptx.util import Inches
from PyPDF2 import PdfReader
from wordcloud import WordCloud

def load_groq_api_key():
    """Fetch the Groq API key from the process environment.

    Returns:
        The value of the ``GROQ_API_KEY`` environment variable.

    Raises:
        ValueError: If the variable is unset or empty.
    """
    api_key = os.environ.get("GROQ_API_KEY")
    if api_key:
        return api_key
    raise ValueError("Error: GROQ_API_KEY not found in environment variables.")


# 🔹 Process Text (Split & Embed)
def process_text(text):
    """Split document text into overlapping chunks and index them in FAISS.

    Args:
        text: Full text extracted from the document.

    Returns:
        A FAISS vector store built from the MiniLM embeddings of the chunks.

    Raises:
        ValueError: If ``text`` is empty/whitespace-only or yields no chunks.
            Failing here gives a clear message instead of an opaque error
            from building an index over zero vectors.
    """
    if not text or not text.strip():
        raise ValueError("No text available to index; the document appears to be empty.")

    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=3000,
        chunk_overlap=500,
        length_function=len,
    )
    chunks = text_splitter.split_text(text)
    if not chunks:
        raise ValueError("No text available to index; the document appears to be empty.")

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return FAISS.from_texts(chunks, embeddings)


# 🔹 Generate Structured Summary
def generate_summary(knowledgeBase):
    """Ask the LLM for a structured, bullet-point summary of the indexed paper.

    Args:
        knowledgeBase: Vector store exposing ``as_retriever()``.

    Returns:
        The ``'result'`` entry of the RetrievalQA chain's response.
    """
    summary_prompt = (
        "Summarize the research paper in a structured format, covering objective, proposed model, methods, evaluation, comparison, and key results. Keep it concise and clear, using bullet points."
    )

    doc_retriever = knowledgeBase.as_retriever()
    groq_llm = ChatGroq(
        model_name="llama3-8b-8192",
        groq_api_key=os.getenv("GROQ_API_KEY"),
        temperature=0.1,
    )
    qa_chain = RetrievalQA.from_chain_type(llm=groq_llm, retriever=doc_retriever)

    return qa_chain.invoke({"query": summary_prompt})['result']


# 🔹 Generate Importance Analysis
def generate_importance_analysis(knowledgeBase):
    """Ask the LLM why the indexed paper matters and what readers should learn.

    Args:
        knowledgeBase: Vector store exposing ``as_retriever()``.

    Returns:
        The ``'result'`` entry of the RetrievalQA chain's response.
    """
    importance_prompt = (
        "Analyze why this research paper is important for the world and what readers should learn from it. "
        "Focus on:\n"
        "1. The global significance of this research\n"
        "2. Potential real-world applications\n"
        "3. Key takeaways for readers\n"
        "4. How it advances the field\n"
        "Present in clear, concise bullet points with emojis for better readability."
    )

    doc_retriever = knowledgeBase.as_retriever()
    groq_llm = ChatGroq(
        model_name="llama3-70b-8192",
        groq_api_key=os.getenv("GROQ_API_KEY"),
        temperature=0.2,
    )
    qa_chain = RetrievalQA.from_chain_type(llm=groq_llm, retriever=doc_retriever)

    return qa_chain.invoke({"query": importance_prompt})['result']


# 🔹 Initialize Document Chatbot
def init_document_chatbot(knowledgeBase):
    """Build a conversational retrieval chain over the indexed document.

    Args:
        knowledgeBase: Vector store exposing ``as_retriever()``.

    Returns:
        A ``ConversationalRetrievalChain`` using the "stuff" chain type, with a
        fresh ``ConversationBufferMemory`` stored under ``"chat_history"``.
    """
    chat_memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    chat_llm = ChatGroq(model_name="llama3-8b-8192", groq_api_key=os.getenv("GROQ_API_KEY"), temperature=0.2)

    chatbot = ConversationalRetrievalChain.from_llm(
        llm=chat_llm,
        retriever=knowledgeBase.as_retriever(),
        memory=chat_memory,
        chain_type="stuff",
    )
    return chatbot




def text_to_speech(text):
    """Convert ``text`` to an English MP3 with gTTS and return the file path.

    The file name is derived from a hash of the text, so different texts
    (for example from concurrent sessions) never overwrite each other, and a
    repeated request for the same text reuses the existing file instead of
    calling gTTS again. The audio is written to a scratch file first and then
    moved into place, so readers never observe a half-written MP3.

    Args:
        text: The text to narrate.

    Returns:
        Path to the MP3 in the system temp directory, or ``None`` when the
        text is blank or the audio could not be produced.
    """
    partial_path = None
    try:
        # Nothing to narrate: skip the gTTS call entirely.
        if not text or not text.strip():
            return None

        digest = hashlib.sha256(text.encode("utf-8")).hexdigest()[:16]
        audio_path = os.path.join(tempfile.gettempdir(), f"summary_audio_{digest}.mp3")

        # Same text was already converted: reuse the finished file.
        if os.path.isfile(audio_path) and os.path.getsize(audio_path) > 0:
            return audio_path

        # Create audio file with gTTS
        tts = gTTS(text=text, lang='en')
        fd, partial_path = tempfile.mkstemp(suffix=".mp3.part", dir=os.path.dirname(audio_path))
        os.close(fd)
        tts.save(partial_path)

        # Verify audio was actually written before publishing it
        if os.path.getsize(partial_path) == 0:
            raise Exception("Audio file not created")

        os.replace(partial_path, audio_path)
        partial_path = None
        return audio_path
    except Exception as e:
        # Best-effort feature: report and let the caller continue without audio.
        print(f"Error in gTTS: {e}")
        return None
    finally:
        # Remove the scratch file if it was never moved into place.
        if partial_path is not None and os.path.exists(partial_path):
            os.remove(partial_path)


# 🔹 Generate WordCloud
def generate_wordcloud(text):
    """Render a word cloud of ``text`` in the Streamlit page.

    The image is rendered into an in-memory PNG rather than a fixed file in
    the working directory, so sessions cannot overwrite each other's image,
    and the matplotlib figure is always closed so figures do not pile up
    across Streamlit reruns.

    Args:
        text: Text whose words are plotted.
    """
    try:
        wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text)
    except ValueError:
        # NOTE(review): WordCloud is expected to raise ValueError when the text
        # contains no usable words; show a notice instead of crashing the page.
        st.info("Not enough text to build a WordCloud.")
        return

    image_buffer = io.BytesIO()
    fig, ax = plt.subplots(figsize=(10, 5))
    try:
        ax.imshow(wordcloud, interpolation="bilinear")
        ax.axis("off")
        fig.savefig(image_buffer, format="png", bbox_inches="tight")
    finally:
        plt.close(fig)

    image_buffer.seek(0)
    st.image(image_buffer, caption="🔠 WordCloud of Important Keywords", use_container_width=True)


# 🔹 Convert Summary to PowerPoint
def generate_ppt(summary):
    """Convert a text summary into a PowerPoint deck and save it to disk.

    The deck starts with a title slide, followed by one "Title and Content"
    slide per blank-line-separated section of ``summary``. The first line of a
    section becomes the slide title and the remaining lines become the body.
    Sections that contain only whitespace are skipped so they do not produce
    empty slides.

    Args:
        summary: Summary text with sections separated by blank lines.

    Returns:
        The file name of the saved presentation ("summary_presentation.pptx").
    """
    prs = Presentation()
    slide_layout = prs.slide_layouts[1]  # Title and Content Layout

    # Add Title Slide
    title_slide = prs.slides.add_slide(prs.slide_layouts[0])
    title_slide.shapes.title.text = "Research Paper Summary"

    # Add Content Slides
    for section in summary.split("\n\n"):  # Break summary into sections
        # Runs of 3+ newlines leave stray leading/trailing newlines or wholly
        # blank sections; trim the former and skip the latter.
        section = section.strip("\n")
        if not section.strip():
            continue

        heading, _, body = section.partition("\n")
        slide = prs.slides.add_slide(slide_layout)
        slide.shapes.title.text = heading  # First line as title
        slide.shapes.placeholders[1].text = body  # Remaining as bullet points

    # Save PowerPoint File
    ppt_filename = "summary_presentation.pptx"
    prs.save(ppt_filename)
    return ppt_filename


# 🔹 Display PDF Info for Engagement
def display_pdf_info(text, pdf_reader):
    """Show basic statistics about the uploaded PDF in the Streamlit page.

    Args:
        text: Text extracted from the PDF.
        pdf_reader: Reader object whose ``pages`` sequence gives the page count.
    """
    page_total = len(pdf_reader.pages)
    words = text.split()
    preview = " ".join(words[:50]) + "..."

    st.subheader("📄 PDF Insights")
    for line in (
        f"📝 **Total Pages:** {page_total}",
        f"🔢 **Word Count:** {len(words)}",
        f"📌 **First Few Lines:** {preview}",
    ):
        st.write(line)

    with st.expander("🔍 **View More Insights**"):
        st.write("💡 **Pro Tip:** LLaMA-3 can summarize large documents in seconds! 🚀")
        st.info(
            "📖 Research papers are typically structured into sections like Abstract, Introduction, Methods, and Results. AI captures these key elements!")


# 🔹 Document Chatbot Interface
def document_chatbot_interface(conversation_chain):
    """Render the chat UI and answer questions through ``conversation_chain``.

    The transcript is kept in ``st.session_state.messages`` so it is redrawn
    on every Streamlit rerun. Answers that look like "I don't know" /
    "not mentioned" are replaced with a fixed out-of-scope message.

    Args:
        conversation_chain: Chain that accepts ``{"question": ...}`` and
            returns a mapping with an ``"answer"`` entry.
    """
    st.subheader("💬 Document Chatbot")
    st.warning(
        "This chatbot only answers questions about the uploaded document. It won't respond to general questions.")

    # Initialize chat history
    if "messages" not in st.session_state:
        st.session_state.messages = []
        st.session_state.messages.append({
            "role": "assistant",
            "content": "Ask me anything about the research paper you uploaded! For example:\n\n"
                       "• What is the main objective of this research?\n"
                       "• Can you explain the methodology used?\n"
                       "• What were the key findings?\n"
                       "• How does this compare to previous work?"
        })

    # Display chat messages from history
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Accept user input
    if prompt := st.chat_input("Ask about the research paper..."):
        # Add user message to chat history
        st.session_state.messages.append({"role": "user", "content": prompt})
        # Display user message in chat message container
        with st.chat_message("user"):
            st.markdown(prompt)

        # Display assistant response in chat message container
        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                try:
                    # Use .invoke like the other chains in this file.
                    response = conversation_chain.invoke({"question": prompt})
                    answer = response["answer"]

                    # Check if answer is relevant to document; both markers
                    # are matched case-insensitively.
                    lowered = answer.lower()
                    if "i don't know" in lowered or "not mentioned" in lowered:
                        answer = "This information is not covered in the document. Please ask questions specifically about the research paper content."

                    st.markdown(answer)
                    st.session_state.messages.append({"role": "assistant", "content": answer})
                except Exception as e:
                    # Keep the UI alive, but leave a trace of what went wrong.
                    print(f"Error in document chatbot: {e}")
                    st.error("Sorry, I encountered an error processing your question. Please try again.")
                    st.session_state.messages.append({"role": "assistant", "content": "Error processing request"})


# 🔹 Main Streamlit App
def _get_document_cache(doc_key):
    """Return this session's result cache for the current upload, reset when the file changes."""
    cache = st.session_state["doc_cache"] if "doc_cache" in st.session_state else None
    if cache is None or cache.get("doc_key") != doc_key:
        cache = {"doc_key": doc_key}
        st.session_state["doc_cache"] = cache
        # The stored chat transcript belongs to the previous paper.
        if "messages" in st.session_state:
            del st.session_state["messages"]
    return cache


def _read_file_bytes(path):
    """Return the full binary contents of ``path``."""
    with open(path, "rb") as file:
        return file.read()


def main():
    """Run the Streamlit app: upload a PDF, then summarize, analyze and chat.

    Streamlit re-executes this function on every interaction (including each
    chat message). Expensive results — extracted text, the vector store, both
    LLM outputs, the audio/PPT bytes and the chat chain with its memory — are
    therefore kept in ``st.session_state`` per uploaded file, so they are
    computed once and the chatbot keeps its conversation history.
    """
    st.title("📄 Advanced Research Paper Analyzer")
    st.write("🚀 Powered by LLaMA-3 on Groq - Understand why research matters and what you should learn")
    st.divider()

    # Pick up GROQ_API_KEY from a local .env file when one exists.
    load_dotenv()
    try:
        os.environ["GROQ_API_KEY"] = load_groq_api_key()
    except ValueError as e:
        st.error(str(e))
        return

    pdf = st.file_uploader("📤 Upload your Research Paper (PDF)", type="pdf")

    if pdf is not None:
        cache = _get_document_cache((pdf.name, pdf.size))

        with st.spinner("🔄 Extracting text & analyzing PDF... Please wait!"):
            pdf_reader = PdfReader(pdf)
            if "knowledgeBase" not in cache:
                # Extract each page once; join with newlines so words at page
                # boundaries do not fuse together.
                page_texts = (page.extract_text() for page in pdf_reader.pages)
                text = "\n".join(page_text for page_text in page_texts if page_text)
                if not text.strip():
                    st.error("No extractable text was found in this PDF. "
                             "Scanned or image-only documents are not supported.")
                    return
                cache["text"] = text
                cache["knowledgeBase"] = process_text(text)
            text = cache["text"]
            knowledgeBase = cache["knowledgeBase"]

            display_pdf_info(text, pdf_reader)

        st.success("✅ PDF processed successfully! Now generating insights...")

        # Create tabs for different analysis sections
        tab1, tab2, tab3 = st.tabs(["📜 Summary", "🌍 Why This Matters", "💬 Chat with Paper"])

        with tab1:
            with st.spinner("🧠 Generating comprehensive summary..."):
                if "summary" not in cache:
                    cache["summary"] = generate_summary(knowledgeBase)
                response = cache["summary"]
                st.subheader("📜 Structured Summary:")
                # The summary is LLM output derived from an untrusted upload,
                # so it is rendered as Markdown without raw HTML.
                st.markdown(response)

                # Audio Conversion
                if "audio_bytes" not in cache:
                    audio_file = text_to_speech(response)
                    cache["audio_bytes"] = _read_file_bytes(audio_file) if audio_file else None
                if cache["audio_bytes"]:
                    st.audio(cache["audio_bytes"], format="audio/mp3")
                else:
                    st.warning("🔇 Audio narration could not be generated for this summary.")

                # WordCloud Generation
                generate_wordcloud(response)

                # PowerPoint Conversion
                if "ppt_bytes" not in cache:
                    cache["ppt_bytes"] = _read_file_bytes(generate_ppt(response))
                st.download_button(label="📥 Download Summary PPT", data=cache["ppt_bytes"],
                                   file_name="Research_Summary.pptx")

        with tab2:
            with st.spinner("🔍 Analyzing global significance and key learnings..."):
                if "importance" not in cache:
                    cache["importance"] = generate_importance_analysis(knowledgeBase)
                importance = cache["importance"]
                st.subheader("🌍 Why This Research Matters")
                st.markdown("""
                <style>
                    .big-font {
                        font-size:18px !important;
                        color: #2e86de;
                    }
                    .highlight {
                        background-color: #f5f6fa;
                        padding: 10px;
                        border-radius: 5px;
                        border-left: 4px solid #4b7bec;
                    }
                </style>
                """, unsafe_allow_html=True)

                st.markdown("""
                <div class="highlight">
                    <p class="big-font">This analysis explains why the paper you uploaded is important and what you should learn from it.</p>
                </div>
                """, unsafe_allow_html=True)

                # LLM output: Markdown only, no raw HTML.
                st.markdown(importance)

                st.markdown("""
                <div style="margin-top: 20px; padding: 10px; background-color: #f8f9fa; border-radius: 5px;">
                    <h4>💡 How to Apply This Knowledge</h4>
                    <ul>
                        <li>Consider how these findings might impact your work or studies</li>
                        <li>Think about potential applications in your field</li>
                        <li>Identify areas for further research or implementation</li>
                    </ul>
                </div>
                """, unsafe_allow_html=True)

        with tab3:
            # One chain per document, reused across reruns, so its
            # conversation memory is not recreated on every chat message.
            if "conversation_chain" not in cache:
                cache["conversation_chain"] = init_document_chatbot(knowledgeBase)
            document_chatbot_interface(cache["conversation_chain"])




# Start the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()