"""Streamlit app: summarize an uploaded PDF and define its technical terms.

The uploaded PDF is converted to text with PyMuPDF, split into segments on
blank lines, and each segment is summarized by a Groq-hosted chat model.
All-uppercase words found in a segment are additionally sent to the model to
be defined. The combined result is rendered into a PDF with ReportLab and
offered for download.
"""
import os
import string
from xml.sax.saxutils import escape

from dotenv import load_dotenv
import fitz  # PyMuPDF
import nltk
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.units import inch
import streamlit as st
from groq import Groq

load_dotenv()

# Download NLTK resources
nltk.download('punkt')

# Initialize Groq Client using the environment variable
client = Groq(
    api_key=os.getenv('GROQ_API_KEY')
)

# Model used for every chat completion in this module.
MODEL_NAME = "llama-3.1-70b-versatile"


def _chat(system_prompt, user_prompt):
    """Send one system + user message pair to the model and return the reply text.

    Exceptions raised by the client propagate to the caller, which decides how
    to report them.
    """
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        model=MODEL_NAME,
    )
    return chat_completion.choices[0].message.content


# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in ``pdf_file``.

    Args:
        pdf_file: A binary file-like object; its ``read()`` result is handed
            to PyMuPDF as an in-memory PDF stream.

    Returns:
        The text of all pages joined in page order.
    """
    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    try:
        return "".join(page.get_text() for page in doc)
    finally:
        # Release the document even if text extraction fails part-way.
        doc.close()


# Function to segment text into topics
def segment_text_into_topics(text):
    """Split ``text`` into topic segments on blank lines.

    Segments that are empty or contain only whitespace are dropped so that
    they are not sent to the model for summarization.

    Args:
        text: The full document text.

    Returns:
        A list of non-blank segments, in document order.
    """
    # Simple split by double newline; can be customized
    return [segment for segment in text.split('\n\n') if segment.strip()]


# Function to summarize text using LLM
def summarize_text(topic):
    """Ask the model to summarize ``topic`` and define its technical terms.

    Args:
        topic: The text segment to summarize.

    Returns:
        The model's reply, or a string starting with ``"An error occurred: "``
        if the request raised an exception.
    """
    prompt = f"Summarize the following text and define any technical terms used. Provide clear and contextually relevant definitions for the terms, especially those related to AI and machine learning:\n\n{topic}"
    try:
        return _chat(
            "You are an expert summarizer and technical writer who provides concise and clear summaries of topics, and defines any technical terms with relevance to the context.",
            prompt,
        )
    except Exception as e:
        # Best-effort: a failed request is reported in-line in the output
        # rather than aborting the whole document.
        return f"An error occurred: {str(e)}"


# Function to define technical terms using LLM
def define_technical_terms(terms):
    """Ask the model for a definition of each term.

    Args:
        terms: An iterable of term strings.

    Returns:
        A dict mapping each term to the stripped model reply, or to a
        ``"Definition not found due to an error: ..."`` message if the request
        for that term raised an exception.
    """
    definitions = {}
    for term in terms:
        prompt = f"Define the technical term '{term}' in the context of AI and machine learning."
        try:
            definitions[term] = _chat(
                "You are an expert in AI and machine learning. Provide clear and contextually relevant definitions for technical terms.",
                prompt,
            ).strip()
        except Exception as e:
            # One failing term must not prevent the others from being defined.
            definitions[term] = f"Definition not found due to an error: {str(e)}"
    return definitions


def _find_technical_terms(topic):
    """Return the sorted, de-duplicated all-uppercase words found in ``topic``.

    Surrounding punctuation is stripped first so that forms such as "(CNN)"
    or "GPU," are recognized, and single letters such as "A" or "I" are
    ignored. Sorting keeps the output order stable between runs.
    """
    candidates = {word.strip(string.punctuation) for word in topic.split()}
    return sorted(
        term for term in candidates
        if len(term) > 1 and term.isalpha() and term.isupper()
    )


# Function to process the entire PDF and generate summaries
def process_pdf(pdf_file):
    """Summarize every topic of a PDF and define the uppercase terms in it.

    Args:
        pdf_file: A binary file-like object containing the PDF.

    Returns:
        A single string. Each topic contributes a ``"Summary:"`` section and,
        when uppercase terms were found, a ``"Technical Terms and
        Definitions:"`` section with one ``term: definition`` line per term.
        Sections are separated by blank lines.
    """
    text = extract_text_from_pdf(pdf_file)
    pieces = []
    for topic in segment_text_into_topics(text):
        pieces.append(f"Summary:\n{summarize_text(topic)}\n\n")

        # Extract and define technical terms
        technical_terms = _find_technical_terms(topic)
        if technical_terms:
            definitions = define_technical_terms(technical_terms)
            pieces.append("Technical Terms and Definitions:\n")
            pieces.extend(
                f"{term}: {definition}\n"
                for term, definition in definitions.items()
            )
            pieces.append("\n")
    return "".join(pieces)


# Function to create a PDF from the summary with improved formatting
def create_summary_pdf(output_text, output_pdf_path):
    """Render the text produced by ``process_pdf`` into a PDF file.

    The text is split on blank lines. Chunks starting with ``"Summary:"`` and
    chunks containing ``"Technical Terms and Definitions:"`` get a sub-heading;
    every other chunk is written as a plain body paragraph.

    Args:
        output_text: The summary text to render.
        output_pdf_path: Destination passed to ``SimpleDocTemplate``.
    """
    doc = SimpleDocTemplate(output_pdf_path, pagesize=letter)
    story = []

    # Define styles
    styles = getSampleStyleSheet()
    subheading_style = styles['Heading2']
    para_style = styles['BodyText']
    tech_term_style = ParagraphStyle(
        'TechTerm',
        parent=styles['BodyText'],
        textColor=colors.blue,
        spaceBefore=10,
        leftIndent=20
    )

    # Process the text for PDF. Paragraph interprets its text as XML-like
    # markup, so free text is escaped before being handed to it.
    for chunk in output_text.split('\n\n'):
        if chunk.startswith("Summary:"):
            body = chunk.partition(":")[2].strip()
            story.append(Paragraph("Summary", subheading_style))
            story.append(Spacer(1, 0.1 * inch))
            story.append(Paragraph(escape(body), para_style))
            story.append(Spacer(1, 0.2 * inch))
        elif "Technical Terms and Definitions:" in chunk:
            story.append(Paragraph("Technical Terms and Definitions", subheading_style))
            story.append(Spacer(1, 0.1 * inch))
            # The first line is the section header; the rest are term lines.
            for term_line in chunk.split("\n")[1:]:
                story.append(Paragraph(escape(term_line), tech_term_style))
                story.append(Spacer(1, 0.1 * inch))
        else:
            story.append(Paragraph(escape(chunk), para_style))
            story.append(Spacer(1, 0.2 * inch))

    doc.build(story)


# Streamlit Interface
st.title("PDF Summarizer with Technical Definitions")

uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

if uploaded_file is not None:
    st.write("Processing...")
    summary = process_pdf(uploaded_file)
    output_pdf_path = "summary_output.pdf"
    create_summary_pdf(summary, output_pdf_path)

    with open(output_pdf_path, "rb") as file:
        st.download_button(
            label="Download Summary PDF",
            data=file,
            file_name="summary_output.pdf",
            mime="application/pdf"
        )

    st.markdown(summary, unsafe_allow_html=False)