# Source: Hugging Face Spaces - raz-135 / PDF-Summarizer (status: Sleeping)
# File size: 5,835 Bytes

from dotenv import load_dotenv
import os
import fitz  # PyMuPDF
import nltk
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.units import inch
import streamlit as st
from groq import Groq
# Run python-dotenv's loader before any environment variable is read below.
load_dotenv()

# Fetch the NLTK 'punkt' resource at start-up.
nltk.download('punkt')

# Shared Groq client; its API key is taken from the GROQ_API_KEY environment
# variable (None when the variable is unset).
client = Groq(api_key=os.environ.get('GROQ_API_KEY'))

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_file: Binary file-like object exposing ``read()`` whose bytes are
            a PDF document (the Streamlit upload in this app).

    Returns:
        str: The text of all pages in page order, with nothing inserted
        between pages.
    """
    # The context manager closes the PyMuPDF document once the text has been
    # collected; previously the handle was never released.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        # join() avoids rebuilding the string once per page.
        return "".join(page.get_text() for page in doc)

# Function to segment text into topics
def segment_text_into_topics(text):
    """Split text into topic segments separated by a blank line.

    The split is a simple one on double newlines. Segments that are empty or
    contain only whitespace are dropped so callers never process (or send to
    the LLM) a topic with no content; kept segments are returned unmodified.

    Args:
        text: The full document text.

    Returns:
        list[str]: Non-blank segments in document order; ``[]`` when ``text``
        is empty or contains only whitespace.
    """
    return [segment for segment in text.split('\n\n') if segment.strip()]

# Function to summarize text using LLM
def summarize_text(topic):
    """Ask the Groq chat model to summarize ``topic`` and define its terms.

    Args:
        topic: Text to summarize; it is appended verbatim to the prompt.

    Returns:
        The content of the first completion choice, or a string starting with
        "An error occurred: " when anything in the request raises.
    """
    system_message = {
        "role": "system",
        "content": "You are an expert summarizer and technical writer who provides concise and clear summaries of topics, and defines any technical terms with relevance to the context.",
    }
    user_message = {
        "role": "user",
        "content": f"Summarize the following text and define any technical terms used. Provide clear and contextually relevant definitions for the terms, especially those related to AI and machine learning:\n\n{topic}",
    }
    try:
        response = client.chat.completions.create(
            messages=[system_message, user_message],
            model="llama-3.1-70b-versatile",
        )
        return response.choices[0].message.content
    except Exception as exc:
        # Best-effort: the failure text is returned in place of a summary.
        return f"An error occurred: {exc!s}"

# Function to define technical terms using LLM
def define_technical_terms(terms):
    """Look up a definition for each term with one Groq chat request per term.

    Args:
        terms: Iterable of term strings.

    Returns:
        dict: Maps each term to the stripped content of the first completion
        choice, or to a "Definition not found due to an error: ..." message
        when that term's request raises. Keys follow the iteration order of
        ``terms``.
    """
    system_prompt = "You are an expert in AI and machine learning. Provide clear and contextually relevant definitions for technical terms."
    glossary = {}
    for term in terms:
        question = f"Define the technical term '{term}' in the context of AI and machine learning."
        try:
            reply = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question},
                ],
                model="llama-3.1-70b-versatile",
            )
            glossary[term] = reply.choices[0].message.content.strip()
        except Exception as exc:
            # A failure on one term must not abort the remaining lookups.
            glossary[term] = f"Definition not found due to an error: {exc!s}"
    return glossary

# Function to process the entire PDF and generate summaries
def process_pdf(pdf_file):
    """Summarize each topic of a PDF and define the acronyms found in it.

    The PDF text is extracted, split into topics, and every non-blank topic is
    summarized. All-uppercase alphabetic words of two or more letters are
    treated as technical terms and defined per topic.

    Args:
        pdf_file: Binary file-like PDF, passed to ``extract_text_from_pdf``.

    Returns:
        str: For each topic, a ``"Summary:\\n<summary>\\n\\n"`` section,
        optionally followed by a ``"Technical Terms and Definitions:\\n"``
        section with one ``"<term>: <definition>"`` line per term and a
        trailing blank line.
    """
    import string  # local import: only the term heuristic below needs it

    text = extract_text_from_pdf(pdf_file)
    topics = segment_text_into_topics(text)
    parts = []

    for topic in topics:
        # A blank segment has nothing to summarize; skip the LLM round-trip.
        if not topic.strip():
            continue

        summary = summarize_text(topic)
        parts.append(f"Summary:\n{summary}\n\n")

        # Extract and define technical terms.
        # Surrounding punctuation is stripped so "(CNN)" or "GPU," still match.
        candidates = {word.strip(string.punctuation) for word in topic.split()}
        # len > 1 keeps ordinary capitals such as "A" and "I" out of the list;
        # sorting makes the output order deterministic (set order is not).
        technical_terms = sorted(
            word
            for word in candidates
            if len(word) > 1 and word.isalpha() and word.isupper()
        )
        if technical_terms:
            definitions = define_technical_terms(technical_terms)
            parts.append("Technical Terms and Definitions:\n")
            parts.extend(
                f"{term}: {definition}\n"
                for term, definition in definitions.items()
            )
            parts.append("\n")

    # Joined once at the end instead of repeated string concatenation.
    return "".join(parts)

# Function to create a PDF from the summary with improved formatting
def create_summary_pdf(output_text, output_pdf_path):
    """Render summary text as a formatted, letter-size PDF.

    ``output_text`` is split on blank lines into chunks:

    * a chunk starting with ``"Summary:"`` becomes a "Summary" sub-heading
      followed by the text after the first colon;
    * a chunk containing ``"Technical Terms and Definitions:"`` becomes a
      sub-heading followed by one indented blue paragraph per remaining line;
    * any other non-blank chunk becomes a plain body paragraph.

    Args:
        output_text: Text in the layout produced by ``process_pdf``.
        output_pdf_path: Destination passed to ``SimpleDocTemplate``.
    """
    from xml.sax.saxutils import escape

    def to_markup(raw):
        # Paragraph parses its text as XML-like markup, so a literal "&" or
        # "<" in LLM output must be escaped; newlines become explicit breaks
        # because Paragraph otherwise collapses them into spaces.
        return escape(raw).replace("\n", "<br/>")

    doc = SimpleDocTemplate(output_pdf_path, pagesize=letter)
    story = []

    # Define styles
    styles = getSampleStyleSheet()
    subheading_style = styles['Heading2']
    para_style = styles['BodyText']
    tech_term_style = ParagraphStyle(
        'TechTerm',
        parent=styles['BodyText'],
        textColor=colors.blue,
        spaceBefore=10,
        leftIndent=20
    )

    # Process the text for PDF
    for chunk in output_text.split('\n\n'):
        # Trailing separators leave empty chunks; do not emit blank paragraphs.
        if not chunk.strip():
            continue

        if chunk.startswith("Summary:"):
            body = chunk.split(":", 1)[1].strip()
            story.append(Paragraph("Summary", subheading_style))
            story.append(Spacer(1, 0.1 * inch))
            story.append(Paragraph(to_markup(body), para_style))
            story.append(Spacer(1, 0.2 * inch))
        elif "Technical Terms and Definitions:" in chunk:
            story.append(Paragraph("Technical Terms and Definitions", subheading_style))
            story.append(Spacer(1, 0.1 * inch))
            # The first line is the section label; the rest are the entries.
            for entry in chunk.split("\n")[1:]:
                if not entry.strip():
                    continue
                story.append(Paragraph(to_markup(entry), tech_term_style))
                story.append(Spacer(1, 0.1 * inch))
        else:
            story.append(Paragraph(to_markup(chunk), para_style))
            story.append(Spacer(1, 0.2 * inch))

    doc.build(story)

# Streamlit Interface
st.title("PDF Summarizer with Technical Definitions")

uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

# Everything below depends on an upload; `summary` only exists inside this
# branch, so the on-page rendering has to live here too.
if uploaded_file is not None:
    st.write("Processing...")
    summary = process_pdf(uploaded_file)

    # NOTE(review): this fixed path is relative to the working directory and
    # would be shared by concurrent sessions - consider a per-session temp file.
    output_pdf_path = "summary_output.pdf"
    create_summary_pdf(summary, output_pdf_path)

    with open(output_pdf_path, "rb") as file:
        st.download_button(
            label="Download Summary PDF",
            data=file,
            file_name="summary_output.pdf",
            mime="application/pdf"
        )

    # Show the summary on the page as well. The previous call contained a bare
    # `*` copied from the function signature, which is a syntax error.
    st.markdown(summary, unsafe_allow_html=False)