HaarisIqubal committed on
Commit
c075e70
·
verified ·
1 Parent(s): d6b3a08

Synced repo using 'sync_with_huggingface' Github Action

Browse files
app.py CHANGED
@@ -1,7 +1,8 @@
1
  import streamlit as st
 
2
 
3
  def main():
4
- st.title('Summarize Document')
5
 
6
 
7
  if __name__ == '__main__':
 
1
  import streamlit as st
2
+ from summarize_doc.app.setupview import setup_view
3
 
4
  def main():
5
+ setup_view()
6
 
7
 
8
  if __name__ == '__main__':
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -8,6 +8,10 @@ readme = "README.md"
8
  [tool.poetry.dependencies]
9
  python = "^3.9.21"
10
  streamlit = "^1.41.1"
 
 
 
 
11
 
12
 
13
  [build-system]
 
8
  [tool.poetry.dependencies]
9
  python = "^3.9.21"
10
  streamlit = "^1.41.1"
11
+ pypdf2 = "^3.0.1"
12
+ spacy = "3.7.5"
13
+ langchain = "^0.3.14"
14
+ nltk = "^3.9.1"
15
 
16
 
17
  [build-system]
summarize_doc/app/__init__.py ADDED
File without changes
summarize_doc/app/setupview.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from .sidebar import sidebar_view
3
def setup_view():
    """Build the main page: configuration, heading, sidebar and summary output.

    The page is configured first, then the title and description are drawn.
    ``extracted_text`` is created in the session state (as an empty string)
    when it is missing, the sidebar is rendered, and finally whatever is
    stored under ``extracted_text`` is written to the page.
    """
    st.set_page_config(
        page_title='Summarize Document',
        page_icon='📑',
        layout='centered',
    )
    st.title('Summarize Document 📑')
    st.write("This app summarizes text from a document. You can upload a text or PDF file and get a summary of the text.")

    state = st.session_state
    # Seed the slot so the final write below always has something to show.
    if 'extracted_text' not in state:
        state['extracted_text'] = ""

    # The sidebar is rendered before the output so that a summary it stores
    # during this run is displayed by the write below.
    sidebar_view()
    st.write(state['extracted_text'])
11
+
12
+
13
+
summarize_doc/app/sidebar.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from summarize_doc.src.pdf_summarize.pdf_summarize import PDFSummarize
3
+ from summarize_doc.src.txt_summarize.txt_summarize import TxtSummarize
4
+
5
def _format_summary(summary, summarization_type, bullet_separator):
    """Return *summary* as-is for "Sentences", else break it after each ". "."""
    if summarization_type == "Sentences":
        return summary
    return summary.replace(". ", bullet_separator)


def sidebar_view():
    """Render the sidebar controls and store the resulting summary.

    The sidebar offers a file uploader (txt / pdf), a radio button choosing
    between "Sentences" and "Bullet Points", and a number input for the
    summary length. When "Process" is pressed the uploaded file is summarized
    and the result is saved in ``st.session_state.extracted_text``.

    A warning is shown instead of summarizing when no file has been uploaded
    or when the uploaded file is neither PDF nor plain text.
    """
    with st.sidebar:
        st.subheader("Your Document")
        docs_file = st.file_uploader("Upload a document", type=['txt', 'pdf'])
        summarization_type = st.radio("Set summarization type :", key="visiblity", options=["Sentences", "Bullet Points"])
        summarization_length = st.number_input("Number of Sentences", key="num_sentences", min_value=1, max_value=10, value=5, step=1)

        if st.button('Process'):
            if docs_file is None:
                # The uploader yields None until a file is chosen; reading
                # ``docs_file.type`` in that state would raise AttributeError.
                st.warning("Please upload a document before processing.")
            else:
                with st.spinner("Processing ... "):
                    if docs_file.type == 'application/pdf':
                        summarized_text = PDFSummarize(docs_file, summarization_length=summarization_length).text
                        st.session_state.extracted_text = _format_summary(
                            summarized_text, summarization_type, ".\n\n"
                        )
                    elif docs_file.type == 'text/plain':
                        # NOTE(review): the selected length is not forwarded
                        # here, so text files use TxtSummarize's own default.
                        summarized_text = TxtSummarize(docs_file).text
                        # NOTE(review): this separator differs from the PDF
                        # branch; kept as-is, confirm which layout is intended.
                        st.session_state.extracted_text = _format_summary(
                            summarized_text, summarization_type, ".\n • \n"
                        )
                    else:
                        st.warning("Please upload a valid file type (txt or pdf)")
summarize_doc/src/pdf_summarize/__init__.py ADDED
File without changes
summarize_doc/src/pdf_summarize/pdf_summarize.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ..summarize.helper_function import summarize_text
2
+ from PyPDF2 import PdfReader
3
+
4
class PDFSummarize:
    """Extract the text of a PDF and expose its summary.

    Attributes are documented inline; ``text`` holds the finished summary.
    """

    def __init__(self, file_path, summarization_length=5):
        """Read the PDF and summarize it immediately.

        Args:
            file_path: Source handed to ``PdfReader``. Presumably a path or a
                binary file-like object (the sidebar passes the uploaded
                file) -- confirm against the PyPDF2 documentation.
            summarization_length: Number of sentences requested from
                ``summarize_text``.
        """
        # Source of the PDF, kept for get_pdf_text().
        self.file_path = file_path
        raw_text = self.get_pdf_text()
        # Summary produced from the extracted text.
        self.text = summarize_text(raw_text, num_sentences=summarization_length)

    def get_pdf_text(self):
        """Return the text of every page, one page per line group.

        Pages are joined with a newline so the last word of one page is not
        glued to the first word of the next, which would otherwise hide a
        sentence boundary from the tokenizer. A page that yields no text
        contributes an empty string.
        """
        reader = PdfReader(self.file_path)
        return "\n".join(page.extract_text() or "" for page in reader.pages)
16
+
summarize_doc/src/summarize/__init__.py ADDED
File without changes
summarize_doc/src/summarize/helper_function.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from collections import Counter
3
+ import nltk
4
+ from nltk.tokenize import sent_tokenize, word_tokenize
5
+ from nltk.corpus import stopwords
6
+
7
# Request the NLTK resources at import time: the stop-word corpus and the
# punkt tokenizer data (presumably what the tokenizers below rely on).
for _resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_resource)
10
+
11
def preporcess_text(text):
    """Tokenize *text* and return its lowercase content words.

    A token is kept only when it is purely alphanumeric and its lowercase
    form is not an English stop word.
    """
    english_stops = set(stopwords.words('english'))
    kept = []
    for token in word_tokenize(text):
        if not token.isalnum():
            continue
        lowered = token.lower()
        if lowered not in english_stops:
            kept.append(lowered)
    return kept
16
+
17
def sentence_score(text):
    """Map each sentence of *text* to a frequency-based score.

    The score of a sentence is the sum, over its preprocessed words, of how
    often each word occurs in the preprocessed whole text. Identical
    sentences share a single dictionary entry.
    """
    all_sentences = sent_tokenize(text)
    frequency = Counter(preporcess_text(text))
    return {
        sentence: sum(frequency[token] for token in preporcess_text(sentence))
        for sentence in all_sentences
    }
28
+
29
def summarize_text(text, num_sentences=10):
    """Return the highest-scoring sentences of *text* joined by spaces.

    Args:
        text: The document to summarize.
        num_sentences: Maximum number of sentences in the summary. Zero or a
            negative value yields an empty summary.

    Returns:
        The selected sentences, ordered from highest to lowest score and
        separated by single spaces.
    """
    scores = sentence_score(text)
    ranked_sentences = sorted(scores, key=scores.get, reverse=True)
    # A negative count would slice from the end and silently return "all but
    # the last N" sentences; clamp it so it means "no sentences" instead.
    limit = max(num_sentences, 0)
    return " ".join(ranked_sentences[:limit])
summarize_doc/src/txt_summarize/__init__.py ADDED
File without changes
summarize_doc/src/txt_summarize/txt_summarize.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ..summarize.helper_function import summarize_text
2
+
3
class TxtSummarize:
    """Read an uploaded text file and expose its summary.

    Attributes are documented inline; ``text`` holds the finished summary.
    """

    def __init__(self, file_path, summarization_length=None):
        """Read the file and summarize it immediately.

        Args:
            file_path: Object whose ``read()`` returns the file's bytes
                (despite the name, it is read from directly, not opened).
            summarization_length: Number of sentences to request from
                ``summarize_text``. When ``None`` the helper's own default
                is used, which is the behavior callers had before this
                parameter existed.
        """
        # Source object, kept for get_txt_text().
        self.file_path = file_path
        raw_text = self.get_txt_text()
        # Summary produced from the decoded text.
        if summarization_length is None:
            self.text = summarize_text(raw_text)
        else:
            self.text = summarize_text(raw_text, num_sentences=summarization_length)

    def get_txt_text(self):
        """Return the file content decoded as UTF-8.

        ``utf-8-sig`` is used so a leading byte-order mark is dropped rather
        than ending up inside the first word of the text.

        Raises:
            RuntimeError: If reading or decoding fails; the original error
                is attached as ``__cause__``.
        """
        try:
            return self.file_path.read().decode("utf-8-sig")
        except Exception as e:
            raise RuntimeError(f"Error reading the file at {self.file_path}: {e}") from e
16
+