Spaces:
Sleeping
Sleeping
File size: 5,835 Bytes
993f311 981e1c6 993f311 981e1c6 c02a2d0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
from dotenv import load_dotenv
import os
import fitz # PyMuPDF
import nltk
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.units import inch
import streamlit as st
from groq import Groq
# Load settings from a local .env file into the process environment.
load_dotenv()

# Fetch the NLTK "punkt" resource when the module is first executed.
nltk.download("punkt")

# Shared Groq client used by every LLM call in this file; the API key is
# read from the GROQ_API_KEY environment variable.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: Binary file-like object exposing ``read()``; in this app it
            is the object returned by ``st.file_uploader``.

    Returns:
        The text of all pages joined in page order, with no extra separator
        inserted between pages.
    """
    # Open the document as a context manager so it is closed even when a page
    # fails to parse; previously the document handle was never released.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)
# Function to segment text into topics
def segment_text_into_topics(text):
    """Split *text* into topic segments on blank lines.

    Segments are separated by a double newline. Each segment is stripped of
    surrounding whitespace, and segments that are empty after stripping are
    dropped so callers never spend an LLM request on a blank topic.

    Args:
        text: The full document text; ``None`` or ``""`` is accepted.

    Returns:
        A list of non-empty segment strings in document order (possibly empty).
    """
    if not text:
        return []
    stripped_segments = (segment.strip() for segment in text.split("\n\n"))
    return [segment for segment in stripped_segments if segment]
# Function to summarize text using LLM
def summarize_text(topic):
    """Ask the Groq chat model to summarise *topic* and define its terms.

    Returns the text of the first choice in the model's reply. Any exception
    raised while making the request or reading the reply is caught and
    reported as an ``"An error occurred: ..."`` string instead of propagating.
    """
    user_message = {
        "role": "user",
        "content": (
            "Summarize the following text and define any technical terms used. "
            "Provide clear and contextually relevant definitions for the terms, "
            "especially those related to AI and machine learning:\n\n"
            f"{topic}"
        ),
    }
    system_message = {
        "role": "system",
        "content": (
            "You are an expert summarizer and technical writer who provides "
            "concise and clear summaries of topics, and defines any technical "
            "terms with relevance to the context."
        ),
    }
    try:
        response = client.chat.completions.create(
            messages=[system_message, user_message],
            model="llama-3.1-70b-versatile",
        )
        return response.choices[0].message.content
    except Exception as exc:
        return f"An error occurred: {exc!s}"
# Function to define technical terms using LLM
def define_technical_terms(terms):
    """Request an AI/ML-oriented definition for each term from the Groq model.

    One chat request is made per term, in iteration order. Returns a dict
    mapping each term to the stripped reply text, or to a
    ``"Definition not found due to an error: ..."`` message when the request
    for that term raised.
    """

    def lookup(term):
        # Build the two-message conversation for a single term and query it.
        conversation = [
            {
                "role": "system",
                "content": "You are an expert in AI and machine learning. Provide clear and contextually relevant definitions for technical terms.",
            },
            {
                "role": "user",
                "content": f"Define the technical term '{term}' in the context of AI and machine learning.",
            },
        ]
        try:
            response = client.chat.completions.create(
                messages=conversation,
                model="llama-3.1-70b-versatile",
            )
            return response.choices[0].message.content.strip()
        except Exception as exc:
            return f"Definition not found due to an error: {exc!s}"

    return {term: lookup(term) for term in terms}
# Function to process the entire PDF and generate summaries
def _find_technical_terms(topic):
    """Return the sorted, de-duplicated all-caps words (2+ letters) in *topic*."""
    import string

    # Strip surrounding punctuation so "(CNN)" and "GPU," are recognised.
    candidates = {word.strip(string.punctuation) for word in topic.split()}
    # Require at least two letters: "A" and "I" are ordinary words, not terms.
    # Sorting makes the output order stable across runs (set order is not).
    return sorted(
        term
        for term in candidates
        if len(term) > 1 and term.isalpha() and term.isupper()
    )


def process_pdf(pdf_file):
    """Summarise every topic of a PDF and define the acronyms found in it.

    The PDF text is extracted, split into topics, and each non-blank topic is
    summarised. All-caps words of two or more letters in a topic are treated
    as technical terms and defined.

    Args:
        pdf_file: Binary file-like object holding the PDF, passed through to
            ``extract_text_from_pdf``.

    Returns:
        A single string made of ``"Summary:\\n<summary>\\n\\n"`` sections, each
        optionally followed by a ``"Technical Terms and Definitions:\\n"``
        section with one ``"<term>: <definition>"`` line per term and a
        trailing blank line. This layout is what ``create_summary_pdf`` parses.
    """
    text = extract_text_from_pdf(pdf_file)
    parts = []
    for topic in segment_text_into_topics(text):
        # Skip blank segments instead of spending an LLM request on them.
        if not topic.strip():
            continue
        parts.append(f"Summary:\n{summarize_text(topic)}\n\n")

        technical_terms = _find_technical_terms(topic)
        if not technical_terms:
            continue
        definitions = define_technical_terms(technical_terms)
        parts.append("Technical Terms and Definitions:\n")
        parts.extend(
            f"{term}: {definition}\n" for term, definition in definitions.items()
        )
        parts.append("\n")
    return "".join(parts)
# Function to create a PDF from the summary with improved formatting
def create_summary_pdf(output_text, output_pdf_path):
    """Render the summary text produced by ``process_pdf`` into a PDF file.

    ``output_text`` is split on blank lines. A chunk starting with
    ``"Summary:"`` becomes a "Summary" sub-heading followed by its text; a
    chunk containing ``"Technical Terms and Definitions:"`` becomes a
    sub-heading followed by one indented blue paragraph per term line; any
    other chunk is written as a plain body paragraph.

    Args:
        output_text: The text to lay out.
        output_pdf_path: Destination passed to ``SimpleDocTemplate``.
    """
    from xml.sax.saxutils import escape

    doc = SimpleDocTemplate(output_pdf_path, pagesize=letter)
    story = []

    # Define styles
    styles = getSampleStyleSheet()
    subheading_style = styles['Heading2']
    para_style = styles['BodyText']
    tech_term_style = ParagraphStyle(
        'TechTerm',
        parent=styles['BodyText'],
        textColor=colors.blue,
        spaceBefore=10,
        leftIndent=20,
    )

    # Paragraph interprets its text as XML-like markup, so model-generated
    # text is escaped first; a stray "<", ">" or "&" would otherwise be read
    # as markup and could abort the build.
    for chunk in output_text.split('\n\n'):
        if chunk.startswith("Summary:"):
            body = chunk.split(":", 1)[1].strip()
            story.append(Paragraph("Summary", subheading_style))
            story.append(Spacer(1, 0.1 * inch))
            story.append(Paragraph(escape(body), para_style))
            story.append(Spacer(1, 0.2 * inch))
        elif "Technical Terms and Definitions:" in chunk:
            story.append(Paragraph("Technical Terms and Definitions", subheading_style))
            story.append(Spacer(1, 0.1 * inch))
            # The first line is the section header; the rest are term lines.
            for term_line in chunk.split("\n")[1:]:
                if not term_line.strip():
                    continue
                story.append(Paragraph(escape(term_line), tech_term_style))
                story.append(Spacer(1, 0.1 * inch))
        else:
            story.append(Paragraph(escape(chunk), para_style))
            story.append(Spacer(1, 0.2 * inch))

    doc.build(story)
# Streamlit Interface
# Page flow: upload a PDF, summarise it, offer the summary as a downloadable
# PDF, and show the same summary text on the page.
st.title("PDF Summarizer with Technical Definitions")
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
if uploaded_file is not None:
    st.write("Processing...")
    summary = process_pdf(uploaded_file)

    # The summary PDF is written to the working directory and then served
    # back through the download button.
    output_pdf_path = "summary_output.pdf"
    create_summary_pdf(summary, output_pdf_path)
    with open(output_pdf_path, "rb") as file:
        btn = st.download_button(
            label="Download Summary PDF",
            data=file,
            file_name="summary_output.pdf",
            mime="application/pdf",
        )

    # The previous call contained a bare "*" copied from the function
    # signature, which is a syntax error in a call expression.
    st.markdown(summary, unsafe_allow_html=False)