File size: 5,835 Bytes
993f311
981e1c6
 
 
 
 
 
 
 
 
 
993f311
981e1c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c02a2d0
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
from dotenv import load_dotenv
import os
import fitz  # PyMuPDF
import nltk
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.units import inch
import streamlit as st
from groq import Groq
# Load environment variables via python-dotenv before they are read below.
load_dotenv()

# Fetch the NLTK 'punkt' resource.
nltk.download('punkt')

# Build the Groq client; its API key comes from the GROQ_API_KEY environment variable.
client = Groq(api_key=os.environ.get('GROQ_API_KEY'))

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: A binary file-like object whose ``read()`` returns the raw
            PDF bytes.

    Returns:
        One string made of each page's ``get_text()`` output, joined with no
        separator.
    """
    # Open through a context manager so the document is closed even when a
    # page fails to extract; the original never closed it.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)

# Function to segment text into topics
def segment_text_into_topics(text):
    """Split text into topic segments wherever a blank line occurs.

    Segments are separated by a double newline. Segments that are empty or
    contain only whitespace are dropped so callers never process (or send to
    the LLM) a topic with no content. Surviving segments are returned
    unmodified.

    Args:
        text: The full text to segment.

    Returns:
        A list of non-blank segments in their original order; an empty list
        when ``text`` has no non-whitespace content.
    """
    return [segment for segment in text.split('\n\n') if segment.strip()]

# Function to summarize text using LLM
def summarize_text(topic):
    """Request a summary of ``topic``, with term definitions, from the chat model.

    Args:
        topic: The text segment to summarize.

    Returns:
        The content of the first completion choice, or a string starting with
        "An error occurred: " when anything raised during the request.
    """
    user_prompt = f"Summarize the following text and define any technical terms used. Provide clear and contextually relevant definitions for the terms, especially those related to AI and machine learning:\n\n{topic}"
    system_prompt = "You are an expert summarizer and technical writer who provides concise and clear summaries of topics, and defines any technical terms with relevance to the context."
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    try:
        response = client.chat.completions.create(
            messages=conversation,
            model="llama-3.1-70b-versatile",
        )
        first_choice = response.choices[0]
        return first_choice.message.content
    except Exception as error:
        # Best effort: report the failure as the summary text instead of raising.
        return f"An error occurred: {str(error)}"

# Function to define technical terms using LLM
def define_technical_terms(terms):
    """Look up a definition for each term with one chat request per term.

    Args:
        terms: An iterable of term strings.

    Returns:
        A dict mapping each term to the stripped model response, or to a
        string starting with "Definition not found due to an error: " when
        the request for that term raised.
    """

    def _lookup(term):
        # One request per term; a failure is recorded as that term's value.
        question = f"Define the technical term '{term}' in the context of AI and machine learning."
        try:
            response = client.chat.completions.create(
                messages=[
                    {
                        "role": "system",
                        "content": "You are an expert in AI and machine learning. Provide clear and contextually relevant definitions for technical terms.",
                    },
                    {"role": "user", "content": question},
                ],
                model="llama-3.1-70b-versatile",
            )
            return response.choices[0].message.content.strip()
        except Exception as error:
            return f"Definition not found due to an error: {str(error)}"

    return {term: _lookup(term) for term in terms}

# Function to process the entire PDF and generate summaries
def _find_technical_terms(topic):
    """Return the all-uppercase alphabetic words in ``topic``, sorted and de-duplicated."""
    import string

    # Strip surrounding punctuation so "(GPU)" or "CNN," are still recognised.
    candidates = {word.strip(string.punctuation) for word in topic.split()}
    # Require at least two letters so ordinary words like "A" and "I" are not
    # treated as acronyms; sort so the order is stable between runs (iterating
    # a set of strings is not).
    return sorted(
        term for term in candidates
        if len(term) > 1 and term.isalpha() and term.isupper()
    )


def process_pdf(pdf_file):
    """Extract a PDF's text and build a summary report, topic by topic.

    For every topic segment the report gets a "Summary:" section. When the
    segment contains all-uppercase alphabetic words (treated as technical
    terms), a "Technical Terms and Definitions:" section listing
    ``term: definition`` lines follows it.

    Args:
        pdf_file: A binary file-like object holding the PDF.

    Returns:
        The report as one string, with sections separated by blank lines.
    """
    text = extract_text_from_pdf(pdf_file)
    sections = []

    for topic in segment_text_into_topics(text):
        sections.append(f"Summary:\n{summarize_text(topic)}\n\n")

        technical_terms = _find_technical_terms(topic)
        if technical_terms:
            definitions = define_technical_terms(technical_terms)
            sections.append("Technical Terms and Definitions:\n")
            sections.extend(
                f"{term}: {definition}\n"
                for term, definition in definitions.items()
            )
            sections.append("\n")

    return "".join(sections)

# Function to create a PDF from the summary with improved formatting
def create_summary_pdf(output_text, output_pdf_path):
    """Render the summary report text into a letter-size PDF.

    The text is split on blank lines into chunks. A chunk starting with
    "Summary:" becomes a "Summary" subheading followed by its body; a chunk
    containing "Technical Terms and Definitions:" becomes a subheading
    followed by one indented blue paragraph per remaining line; any other
    chunk becomes a plain body paragraph. Blank chunks and blank term lines
    are skipped.

    Args:
        output_text: The report text to lay out.
        output_pdf_path: Destination passed to ``SimpleDocTemplate``.
    """
    # Paragraph parses its text as XML-like markup, so raw "<", ">" and "&"
    # in model output must be escaped or the build can fail or drop text.
    from xml.sax.saxutils import escape

    doc = SimpleDocTemplate(output_pdf_path, pagesize=letter)

    # Define styles
    styles = getSampleStyleSheet()
    subheading_style = styles['Heading2']
    para_style = styles['BodyText']
    tech_term_style = ParagraphStyle(
        'TechTerm',
        parent=para_style,
        textColor=colors.blue,
        spaceBefore=10,
        leftIndent=20,
    )

    story = []
    for chunk in output_text.split('\n\n'):
        if not chunk.strip():
            # e.g. the empty tail left by the report's trailing blank line.
            continue

        if chunk.startswith("Summary:"):
            body = chunk.split(":", 1)[1].strip()
            story.append(Paragraph("Summary", subheading_style))
            story.append(Spacer(1, 0.1 * inch))
            story.append(Paragraph(escape(body), para_style))
            story.append(Spacer(1, 0.2 * inch))
        elif "Technical Terms and Definitions:" in chunk:
            story.append(Paragraph("Technical Terms and Definitions", subheading_style))
            story.append(Spacer(1, 0.1 * inch))
            # The first line is the section heading; the rest are term lines.
            for term_line in chunk.split("\n")[1:]:
                if not term_line.strip():
                    continue
                story.append(Paragraph(escape(term_line), tech_term_style))
                story.append(Spacer(1, 0.1 * inch))
        else:
            story.append(Paragraph(escape(chunk), para_style))
            story.append(Spacer(1, 0.2 * inch))

    doc.build(story)

# Streamlit Interface
st.title("PDF Summarizer with Technical Definitions")

uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

if uploaded_file is not None:
    st.write("Processing...")
    summary = process_pdf(uploaded_file)

    # Write the report to disk so it can be offered as a download.
    output_pdf_path = "summary_output.pdf"
    create_summary_pdf(summary, output_pdf_path)

    with open(output_pdf_path, "rb") as file:
        st.download_button(
            label="Download Summary PDF",
            data=file,
            file_name="summary_output.pdf",
            mime="application/pdf",
        )

    # Show the summary on the page. This call lives inside the upload branch
    # because `summary` only exists once a file has been processed; the stray
    # bare `*` argument that made the original call a SyntaxError is removed.
    st.markdown(summary, unsafe_allow_html=False)