umerfarooq29 commited on
Commit
ee54519
Β·
verified Β·
1 Parent(s): d66476b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +178 -0
app.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import streamlit as st
3
+ from transformers import BartTokenizer, BartForConditionalGeneration
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.cluster import KMeans
6
+ from nltk.tokenize import sent_tokenize
7
+ import nltk
8
+ import PyPDF2
9
+ from io import BytesIO
10
+
11
# --------------------------- DOWNLOAD NLTK DATA ---------------------------
@st.cache_resource
def download_nltk_data():
    """Ensure the NLTK sentence-tokenizer data is available.

    Checks 'punkt' and 'punkt_tab' independently. Newer NLTK releases
    require 'punkt_tab' for sent_tokenize, and the original all-or-nothing
    check skipped downloading it whenever 'punkt' was already present.
    Cached by Streamlit so the check runs once per server process.
    """
    # Probe each resource on its own; download quietly only what's missing.
    for resource, data_path in (('punkt', 'tokenizers/punkt'),
                                ('punkt_tab', 'tokenizers/punkt_tab')):
        try:
            nltk.data.find(data_path)
        except LookupError:
            nltk.download(resource, quiet=True)

download_nltk_data()
21
+
22
# --------------------------- LOAD MODEL ---------------------------
@st.cache_resource
def load_model():
    """Load the BART-large-CNN tokenizer and model (cached per process)."""
    checkpoint = 'facebook/bart-large-cnn'
    return (
        BartTokenizer.from_pretrained(checkpoint),
        BartForConditionalGeneration.from_pretrained(checkpoint),
    )

tokenizer, model = load_model()
30
+
31
# --------------------------- HELPER FUNCTIONS ---------------------------

def extract_text_from_pdf(file) -> str:
    """Extract text from uploaded PDF file."""
    reader = PyPDF2.PdfReader(BytesIO(file.read()))
    # extract_text() may return None for image-only pages; substitute "".
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return "".join(page_texts).strip()
40
+
41
def summarize(text, max_length=150, min_length=50):
    """Summarize a given text using BART."""
    # Truncate to the model's 1024-token input window.
    encoded = tokenizer([text], max_length=1024, truncation=True, return_tensors='pt')
    output_ids = model.generate(
        encoded['input_ids'],
        num_beams=4,
        max_length=max_length,
        min_length=min_length,
        early_stopping=True
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
52
+
53
def cluster_documents(documents, n_clusters=3):
    """Cluster similar documents using TF-IDF + KMeans."""
    tfidf_matrix = TfidfVectorizer(stop_words='english').fit_transform(documents)
    km = KMeans(n_clusters=n_clusters, random_state=42)
    km.fit(tfidf_matrix)
    return km.labels_
59
+
60
def chunk_text(text, max_words=1000):
    """Split long text into smaller chunks for summarization.

    Returns ``[text]`` unchanged when it already fits in *max_words* words.
    Otherwise splits on sentence boundaries, packing sentences greedily into
    chunks of at most *max_words* words. A single sentence longer than
    *max_words* becomes its own (oversized) chunk; the original code
    appended an empty-string chunk in that case, which would later be fed
    to the summarizer.
    """
    if len(text.split()) <= max_words:
        return [text]

    sentences = sent_tokenize(text)
    chunks, current_chunk, current_word_count = [], [], 0

    for sentence in sentences:
        sentence_words = len(sentence.split())
        if current_word_count + sentence_words <= max_words:
            current_chunk.append(sentence)
            current_word_count += sentence_words
        else:
            # Flush only non-empty chunks (bug fix: the original emitted ""
            # when the first sentence of a chunk exceeded max_words).
            if current_chunk:
                chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_word_count = sentence_words

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
82
+
83
def multi_document_summarize(documents):
    """Summarize multiple related documents using clustering + BART.

    Returns a dict with 'individual_summaries' (one per document),
    'cluster_summaries' (populated only when more than one document is
    given), and 'final_summary' (a summary of all summaries combined).
    """
    def _summarize_long(text):
        # Chunk first so each piece fits the model's input window.
        return " ".join(summarize(piece) for piece in chunk_text(text))

    results = {
        'individual_summaries': [_summarize_long(doc) for doc in documents],
        'cluster_summaries': [],
        'final_summary': None
    }

    if len(documents) > 1:
        # Group related documents, then summarize each group as a whole.
        labels = cluster_documents(documents, n_clusters=min(3, len(documents)))

        for label in np.unique(labels):
            member_indices = [i for i, lbl in enumerate(labels) if lbl == label]
            merged_text = " ".join(documents[i] for i in member_indices)
            results['cluster_summaries'].append({
                'doc_indices': member_indices,
                'summary': _summarize_long(merged_text)
            })

        # Condense everything (per-doc + per-cluster summaries) into one.
        pooled = results['individual_summaries'] + [
            entry['summary'] for entry in results['cluster_summaries']
        ]
        results['final_summary'] = summarize(" ".join(pooled), max_length=200, min_length=100)
    else:
        results['final_summary'] = results['individual_summaries'][0]

    return results
119
+
120
# --------------------------- STREAMLIT UI ---------------------------
# Flat script: configure the page, collect uploads, extract text, and (on
# button press) run the multi-document summarization pipeline.

st.set_page_config(page_title="Multi-Document Summarizer", layout="centered")
st.title("📚 Multi-Document + PDF Summarization App")
st.write("Upload multiple text or PDF files to get summaries, clusters, and a final combined summary.")

uploaded_files = st.file_uploader(
    "📤 Upload text or PDF files",
    type=['txt', 'pdf'],
    accept_multiple_files=True
)

if uploaded_files:
    documents = []
    for file in uploaded_files:
        file_name = file.name
        st.markdown(f"**📄 File:** `{file_name}`")

        # PDFs go through PyPDF2; everything else is decoded as UTF-8 text.
        # (Removed unused `file_type = file.type` local.)
        if file_name.lower().endswith('.pdf'):
            text = extract_text_from_pdf(file)
        else:
            text = file.read().decode("utf-8")

        if not text.strip():
            st.warning(f"⚠️ No text found in `{file_name}` — skipping.")
            continue

        # Short preview so the user can confirm extraction worked.
        st.text_area(f"📝 Preview of {file_name}", text[:700] + "..." if len(text) > 700 else text, height=150)
        documents.append(text)

    if len(documents) == 0:
        st.error("No readable text found in uploaded files.")
    elif st.button("🚀 Generate Summary"):
        with st.spinner("⏳ Summarizing documents... please wait (this may take 1–2 minutes)..."):
            try:
                results = multi_document_summarize(documents)

                # Individual summaries
                st.subheader("📄 Individual Document Summaries")
                for i, summary in enumerate(results['individual_summaries']):
                    with st.expander(f"Document {i+1} Summary"):
                        st.write(summary)

                # Cluster summaries (present only when >1 document uploaded)
                if results['cluster_summaries']:
                    st.subheader("🧩 Cluster Summaries")
                    for i, cluster in enumerate(results['cluster_summaries']):
                        with st.expander(f"Cluster {i+1} (Documents: {', '.join(str(x+1) for x in cluster['doc_indices'])})"):
                            st.write(cluster['summary'])

                # Final combined summary
                st.subheader("🌍 Final Comprehensive Summary")
                st.success(results['final_summary'])

            except Exception as e:
                # Surface pipeline failures in the UI instead of crashing.
                st.error(f"❌ An error occurred: {str(e)}")
else:
    st.info("👆 Please upload one or more `.txt` or `.pdf` files to start summarizing.")