PDF_Upload_Vision

Sleeping

zamal committed on Mar 4, 2024

Commit

c289504

verified ·

1 Parent(s): 9efba83

Upload 2 files

Files changed (2) hide show

application.py ADDED Viewed

+import streamlit as st
+from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
+import io
+import base64
+# Initialize your Streamlit app
+st.title("🚀 PDF to Bullet Point Summarizer 🗟 🔏")
+# Initialize the Streamlit app
+# File uploader for the PDF
+uploaded_file = st.file_uploader("Upload your PDF document", type="pdf")
+# Slider for users to select the summarization extent
+summary_scale = st.slider("Select the extent of summarization (%)", min_value=1, max_value=100, value=20)
+if uploaded_file is not None:
+    with st.spinner('Processing...'):
+        # Read the PDF content
+        text = read_pdf(io.BytesIO(uploaded_file.getvalue()))
+        # Extract key phrases from the text
+        key_phrases = extract_key_phrases(text)
+        # Score sentences based on the key phrases
+        sentence_scores = score_sentences(text, key_phrases)
+        # Determine the number of bullet points based on the selected summarization scale
+        total_sentences = len(list(sentence_scores.keys()))
+        num_points = max(1, total_sentences * summary_scale // 100)
+        # Generate the bullet-point summary
+        summary = summarize_text(sentence_scores, num_points=num_points)
+        # Display the summary as bullet points
+        st.subheader("Here's the summary 💯: ")
+        st.markdown(summary)

main.py ADDED Viewed

+import PyPDF2
+import spacy
+from collections import Counter
+import heapq
+import io
+# Load the spaCy "en_core_web_sm" pipeline once at import time; the shared
+# `nlp` object is reused by extract_key_phrases() and score_sentences().
+nlp = spacy.load("en_core_web_sm")
+def read_pdf(file_stream):
+    text = ''
+    reader = PyPDF2.PdfReader(file_stream)
+    for page in reader.pages:
+        text += page.extract_text() + ' '
+    return text.strip()
+def extract_key_phrases(text):
+    doc = nlp(text)
+    # Combine noun chunks and named entities as candidates for key phrases
+    key_phrases = [chunk.text for chunk in doc.noun_chunks] + [ent.text for ent in doc.ents]
+    return key_phrases
+def score_sentences(text, key_phrases):
+    sentence_scores = {}
+    doc = nlp(text)
+    for sent in doc.sents:
+        for phrase in key_phrases:
+            if phrase in sent.text:
+                if sent in sentence_scores:
+                    sentence_scores[sent] += 1
+                else:
+                    sentence_scores[sent] = 1
+    return sentence_scores
+def summarize_text(sentence_scores, num_points=5):
+    summary_sentences = heapq.nlargest(num_points, sentence_scores, key=sentence_scores.get)
+    # Format summary as bullet points
+    summary = '\n'.join([f"- {sent.text}" for sent in summary_sentences])
+    return summary