raz-135 committed on
Commit
981e1c6
·
verified ·
1 Parent(s): e33c1b5

Upload 5 files

Browse files
Files changed (5) hide show
  1. README.md +4 -4
  2. app (1).py +159 -0
  3. env +1 -0
  4. gitignore +1 -0
  5. requirements.txt +7 -0
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: PDF Summarizer
3
- emoji: 🌖
4
- colorFrom: pink
5
- colorTo: indigo
6
  sdk: streamlit
7
  sdk_version: 1.38.0
8
  app_file: app.py
 
1
  ---
2
+ title: DocsSummarizer
3
+ emoji: 🚀
4
+ colorFrom: indigo
5
+ colorTo: green
6
  sdk: streamlit
7
  sdk_version: 1.38.0
8
  app_file: app.py
app (1).py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ import os
3
+ import fitz # PyMuPDF
4
+ import nltk
5
+ from reportlab.lib.pagesizes import letter
6
+ from reportlab.lib import colors
7
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
8
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
9
+ from reportlab.lib.units import inch
10
+ import streamlit as st
11
+ from groq import Groq
12
# Load variables from a local .env file into the process environment.
load_dotenv()

# Fetch the NLTK 'punkt' resource at import time.
nltk.download('punkt')

# Module-level Groq client; its API key is read from the GROQ_API_KEY
# environment variable (None is passed through when the variable is unset).
client = Groq(api_key=os.environ.get('GROQ_API_KEY'))
20
+
21
+ # Function to extract text from PDF
22
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_file: Binary file-like object exposing ``read()``; its whole
            content is read into memory and parsed as a PDF.

    Returns:
        The page texts joined in page order, with no extra separator.
    """
    # Opening the document as a context manager guarantees it is closed
    # (and its buffers released) even if text extraction fails midway.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)
28
+
29
+ # Function to segment text into topics
30
def segment_text_into_topics(text):
    """Split *text* into topic segments on blank lines.

    Segments are delimited by a double newline. Segments that are empty or
    contain only whitespace are dropped so callers never spend work (for
    example an LLM request) on a blank topic.

    Args:
        text: The full document text.

    Returns:
        A list of non-blank segments in document order; ``[]`` when *text*
        has no non-whitespace content.
    """
    # Simple split by double newline; can be customized.
    return [segment for segment in text.split('\n\n') if segment.strip()]
33
+
34
+ # Function to summarize text using LLM
35
def summarize_text(topic):
    """Ask the LLM for a summary of *topic* that also defines its technical terms.

    Args:
        topic: The text segment to summarize.

    Returns:
        The content of the first completion choice, or a string starting
        with "An error occurred:" when the request raises.
    """
    system_message = {
        "role": "system",
        "content": "You are an expert summarizer and technical writer who provides concise and clear summaries of topics, and defines any technical terms with relevance to the context.",
    }
    user_message = {
        "role": "user",
        "content": (
            "Summarize the following text and define any technical terms used. "
            "Provide clear and contextually relevant definitions for the terms, "
            "especially those related to AI and machine learning:"
            f"\n\n{topic}"
        ),
    }

    try:
        response = client.chat.completions.create(
            messages=[system_message, user_message],
            model="llama-3.1-70b-versatile",
        )
        return response.choices[0].message.content
    except Exception as exc:
        # Failures are reported in-band so the caller can keep going.
        return f"An error occurred: {exc}"
54
+
55
+ # Function to define technical terms using LLM
56
def define_technical_terms(terms):
    """Look up an LLM-written definition for each term.

    Args:
        terms: Iterable of term strings.

    Returns:
        A dict mapping each term to the stripped completion text, or to a
        "Definition not found due to an error: ..." message when the
        request for that term raises.
    """
    system_message = {
        "role": "system",
        "content": "You are an expert in AI and machine learning. Provide clear and contextually relevant definitions for technical terms.",
    }

    def _lookup(term):
        # One request per term; an error is recorded as that term's value.
        question = f"Define the technical term '{term}' in the context of AI and machine learning."
        try:
            response = client.chat.completions.create(
                messages=[
                    system_message,
                    {"role": "user", "content": question},
                ],
                model="llama-3.1-70b-versatile",
            )
            return response.choices[0].message.content.strip()
        except Exception as exc:
            return f"Definition not found due to an error: {exc}"

    return {term: _lookup(term) for term in terms}
78
+
79
+ # Function to process the entire PDF and generate summaries
80
def _find_acronyms(topic):
    """Return the sorted, de-duplicated all-caps words (2+ letters) in *topic*."""
    # Trim punctuation glued to a word ("GPU," / "(LLM)") so it still counts.
    trim_chars = ".,;:!?()[]{}\"'"
    candidates = {word.strip(trim_chars) for word in topic.split()}
    # Single capitals such as "A" or "I" are ordinary words, not terms.
    # Sorting makes the output independent of set iteration order.
    return sorted(
        word for word in candidates
        if len(word) > 1 and word.isalpha() and word.isupper()
    )


def process_pdf(pdf_file):
    """Summarize every topic of a PDF and define the acronyms found in it.

    The PDF text is extracted and split into topics. Each non-blank topic
    contributes a "Summary:" section and, when it contains all-caps words,
    a "Technical Terms and Definitions:" section with one
    "term: definition" line per term.

    Args:
        pdf_file: Binary file-like object holding the PDF.

    Returns:
        All sections concatenated into one string, each terminated by a
        blank line.
    """
    text = extract_text_from_pdf(pdf_file)
    sections = []

    for topic in segment_text_into_topics(text):
        # Skip blank segments rather than spend an LLM request on them.
        if not topic.strip():
            continue

        sections.append(f"Summary:\n{summarize_text(topic)}\n\n")

        # Extract and define technical terms
        technical_terms = _find_acronyms(topic)
        if technical_terms:
            definitions = define_technical_terms(technical_terms)
            term_lines = "".join(
                f"{term}: {definition}\n"
                for term, definition in definitions.items()
            )
            sections.append(f"Technical Terms and Definitions:\n{term_lines}\n")

    return "".join(sections)
100
+
101
+ # Function to create a PDF from the summary with improved formatting
102
def create_summary_pdf(output_text, output_pdf_path):
    """Render the summary text into a formatted PDF.

    *output_text* is split on blank lines. A chunk starting with "Summary:"
    becomes a "Summary" subheading plus body paragraph; a chunk containing
    "Technical Terms and Definitions:" becomes a subheading followed by one
    indented blue paragraph per following line; any other chunk becomes a
    plain body paragraph.

    Args:
        output_text: Text in the layout described above.
        output_pdf_path: Destination handed to ``SimpleDocTemplate``.
    """
    # Local import keeps this block self-contained (standard library only).
    from xml.sax.saxutils import escape

    def _markup(raw):
        # Paragraph parses its text as XML-like markup, so a literal '<' or
        # '&' in LLM output must be escaped or the build can fail. Single
        # newlines become explicit breaks so list-like text keeps its lines.
        return escape(raw).replace("\n", "<br/>")

    doc = SimpleDocTemplate(output_pdf_path, pagesize=letter)

    # Define styles
    styles = getSampleStyleSheet()
    subheading_style = styles['Heading2']
    para_style = styles['BodyText']
    tech_term_style = ParagraphStyle(
        'TechTerm',
        parent=styles['BodyText'],
        textColor=colors.blue,
        spaceBefore=10,
        leftIndent=20,
    )

    # Process the text for PDF
    story = []
    for chunk in output_text.split('\n\n'):
        if chunk.startswith("Summary:"):
            body = chunk.split(":", 1)[1].strip()
            story.append(Paragraph("Summary", subheading_style))
            story.append(Spacer(1, 0.1 * inch))
            story.append(Paragraph(_markup(body), para_style))
            story.append(Spacer(1, 0.2 * inch))
        elif "Technical Terms and Definitions:" in chunk:
            story.append(Paragraph("Technical Terms and Definitions", subheading_style))
            story.append(Spacer(1, 0.1 * inch))
            # The first line is the section header; the rest are term lines.
            for term_line in chunk.split("\n")[1:]:
                story.append(Paragraph(_markup(term_line), tech_term_style))
            story.append(Spacer(1, 0.1 * inch))
        else:
            story.append(Paragraph(_markup(chunk), para_style))
            story.append(Spacer(1, 0.2 * inch))

    doc.build(story)
140
+
141
+ # Streamlit Interface
142
# Standard-library import kept next to its only use in this script section.
import io

st.title("PDF Summarizer with Technical Definitions")

uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

if uploaded_file is not None:
    # Streamlit re-executes the script on every interaction (including the
    # download click). Cache the rendered PDF per upload so the LLM pipeline
    # runs once per file instead of once per rerun.
    cache_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("summary_key") != cache_key:
        with st.spinner("Processing..."):
            summary = process_pdf(uploaded_file)
            # Render into memory rather than a fixed on-disk path, so
            # concurrent sessions cannot overwrite each other's output.
            pdf_buffer = io.BytesIO()
            create_summary_pdf(summary, pdf_buffer)
        st.session_state["summary_pdf"] = pdf_buffer.getvalue()
        st.session_state["summary_key"] = cache_key

    st.download_button(
        label="Download Summary PDF",
        data=st.session_state["summary_pdf"],
        file_name="summary_output.pdf",
        mime="application/pdf",
    )
env ADDED
@@ -0,0 +1 @@
 
 
1
+ GROQ_API_KEY = <REDACTED: secret was committed in plain text; revoke and rotate this key, and supply it via the host's secret store instead of a tracked file>
gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ pymupdf
2
+ nltk
3
+ reportlab
4
+ requests
5
+ groq
6
+ streamlit
7
+ python-dotenv