Azidan committed on
Commit
b898d31
·
verified ·
1 Parent(s): 8a7f59e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -232
app.py CHANGED
@@ -1,253 +1,105 @@
1
import io
import math
from typing import List, Tuple, Optional

import gradio as gr
from transformers import AutoTokenizer, pipeline
import PyPDF2
import docx

# ---- Configuration --------------------------------------------------
MODEL_NAME = "sshleifer/distilbart-cnn-12-6"  # lightweight, works on free tier
DEVICE = -1          # -1 selects CPU in the transformers pipeline (Spaces free tier)
CHUNK_STRIDE = 128   # token overlap carried between consecutive chunks
SECOND_PASS = True   # re-summarize the joined chunk summaries at the end

# Generation-length presets (token counts) for the produced summary.
SUMMARY_PRESETS = {
    "short": {"max_length": 60, "min_length": 20},
    "medium": {"max_length": 120, "min_length": 40},
    "long": {"max_length": 200, "min_length": 80},
}

# ---- Tokenizer & pipeline -------------------------------------------
# Load once at import time; fail loudly so misconfiguration shows up
# clearly in the Space startup logs.
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    summarizer = pipeline("summarization", model=MODEL_NAME, tokenizer=tokenizer, device=DEVICE)
    print(f"Loaded model {MODEL_NAME} on {'CPU' if DEVICE == -1 else 'GPU'}")
except Exception as e:
    raise RuntimeError(f"Model load failed: {e}")
36
-
37
- # -----------------------------
38
- # Helpers: file reading
39
- # -----------------------------
40
def read_pdf_bytes(file_bytes: bytes) -> str:
    """Extract text from in-memory PDF bytes.

    Joins the text of every page that yields any; on any failure the
    function returns an "[Error reading PDF: ...]" marker string instead
    of raising, so the caller can surface it in the UI.
    """
    try:
        page_texts = [
            extracted
            for extracted in (
                page.extract_text()
                for page in PyPDF2.PdfReader(io.BytesIO(file_bytes)).pages
            )
            if extracted
        ]
        return "\n".join(page_texts)
    except Exception as e:
        return f"[Error reading PDF: {e}]"
51
-
52
-
53
def read_docx_bytes(file_bytes: bytes) -> str:
    """Extract the non-blank paragraph text from in-memory DOCX bytes.

    Returns an "[Error reading DOCX: ...]" marker string on any failure
    rather than raising, mirroring read_pdf_bytes.
    """
    try:
        document = docx.Document(io.BytesIO(file_bytes))
        kept = []
        for paragraph in document.paragraphs:
            if paragraph.text and paragraph.text.strip():
                kept.append(paragraph.text)
        return "\n".join(kept)
    except Exception as e:
        return f"[Error reading DOCX: {e}]"
60
-
61
-
62
- # -----------------------------
63
- # Helpers: token-aware chunking
64
- # -----------------------------
65
def chunk_text_by_tokens(text: str, max_tokens: Optional[int] = None, stride: int = CHUNK_STRIDE) -> List[str]:
    """Token-aware splitter.

    Breaks *text* into decoded string chunks of at most ``max_tokens``
    tokens each, overlapping consecutive chunks by ``stride`` tokens so
    context carries across boundaries. Blank input yields []; input that
    already fits in one window yields the stripped original text.
    """
    if not text or not text.strip():
        return []

    limit = tokenizer.model_max_length if max_tokens is None else max_tokens

    # Encode without special tokens so slice boundaries are exact.
    ids = tokenizer.encode(text, add_special_tokens=False)
    total = len(ids)
    if total <= limit:
        return [text.strip()]

    pieces: List[str] = []
    cursor = 0
    while cursor < total:
        upper = min(cursor + limit, total)
        decoded = tokenizer.decode(
            ids[cursor:upper],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        pieces.append(decoded.strip())
        if upper == total:
            break
        cursor = upper - stride  # step back so the next chunk overlaps this one
    return pieces
94
 
95
 
96
- # -----------------------------
97
- # Summarization logic
98
- # -----------------------------
99
def summarize_chunks(chunks: List[str], preset: str, progress: Optional[gr.Progress] = None) -> Tuple[List[str], str]:
    """Map-reduce summarization over already-chunked text.

    Map phase: summarize each chunk independently. Reduce phase (only when
    SECOND_PASS is enabled and there are multiple chunks): summarize the
    concatenated chunk summaries into one final summary.

    Returns (chunk_summaries, final_summary). Unknown presets fall back
    to "medium"; per-chunk failures are recorded as marker strings rather
    than aborting the whole run.
    """
    if preset not in SUMMARY_PRESETS:
        preset = "medium"
    # Shared generation settings for every summarizer call below.
    gen_kwargs = dict(
        max_length=SUMMARY_PRESETS[preset]["max_length"],
        min_length=SUMMARY_PRESETS[preset]["min_length"],
        do_sample=False,
        truncation=True,
    )

    chunk_summaries: List[str] = []
    total = len(chunks)
    for idx, chunk in enumerate(chunks, start=1):
        # Each chunk was sized to fit the model window; guard anyway.
        try:
            piece = summarizer(chunk, **gen_kwargs)[0]["summary_text"].strip()
        except Exception as e:
            piece = f"[Chunk summarization error: {e}]"
        chunk_summaries.append(piece)

        if progress:
            # The map phase occupies the first 70% of the progress bar.
            progress((idx / total) * 0.7, desc=f"Summarizing chunk {idx}/{total}...")

    final_summary = ""
    if SECOND_PASS and len(chunk_summaries) > 1:
        joined = "\n\n".join(chunk_summaries)
        # Re-chunk in case the joined summaries still exceed the window.
        joined_chunks = chunk_text_by_tokens(joined, max_tokens=tokenizer.model_max_length, stride=CHUNK_STRIDE)
        try:
            if len(joined_chunks) == 1:
                final_summary = summarizer(joined_chunks[0], **gen_kwargs)[0]["summary_text"].strip()
            else:
                # Reduce twice: compress each joined chunk, then compress
                # the concatenation of those compressions.
                intermediate = [
                    summarizer(jc, **gen_kwargs)[0]["summary_text"].strip()
                    for jc in joined_chunks
                ]
                final_summary = summarizer("\n\n".join(intermediate), **gen_kwargs)[0]["summary_text"].strip()
        except Exception as e:
            final_summary = f"[Final summarization error: {e}]"
    else:
        # Second pass skipped: the final summary is the chunk summaries themselves.
        final_summary = "\n\n".join(chunk_summaries) if len(chunk_summaries) > 1 else (chunk_summaries[0] if chunk_summaries else "")

    if progress:
        progress(1.0, desc="Done")

    return chunk_summaries, final_summary
167
-
168
-
169
- # -----------------------------
170
- # Gradio processing function
171
- # -----------------------------
172
def process(text_input: str, uploaded_file, preset: str, show_intermediate: bool, progress=gr.Progress()):
    """Gradio callback: extract text, chunk it by tokens, summarize, and format outputs.

    Returns a 3-tuple (final_summary, intermediate_markdown, stats).
    Error conditions are reported through the first slot with the other
    two left empty.
    """
    progress(0.0, desc="Extracting text...")

    extracted = ""
    if uploaded_file is not None:
        try:
            payload = uploaded_file.read()
            name = uploaded_file.name.lower()
            if name.endswith(".pdf"):
                extracted = read_pdf_bytes(payload)
            elif name.endswith(".docx"):
                extracted = read_docx_bytes(payload)
            else:
                # Unknown extension: best-effort plain-text decode.
                try:
                    extracted = payload.decode("utf-8", errors="replace")
                except Exception:
                    extracted = "[Unsupported file type]"
        except Exception as e:
            return f"[File read error: {e}]", "", ""

    # File text comes first; pasted text is appended after a blank line.
    combined = (
        (extracted + "\n\n" + text_input.strip()).strip()
        if text_input and text_input.strip()
        else extracted.strip()
    )

    if not combined:
        return "No text found. Paste text or upload a PDF/DOCX file.", "", ""

    progress(0.05, desc="Splitting into chunks...")
    chunks = chunk_text_by_tokens(combined, max_tokens=tokenizer.model_max_length, stride=CHUNK_STRIDE)

    if not chunks:
        return "No text extracted from the file or input.", "", ""

    chunk_summaries, final_summary = summarize_chunks(chunks, preset, progress=progress)

    intermediate_md = "\n".join(
        f"### Chunk {i} Summary\n\n{s}\n" for i, s in enumerate(chunk_summaries, start=1)
    )
    token_total = sum(len(tokenizer.encode(c, add_special_tokens=False)) for c in chunks)
    stats = f"Input tokens (approx): {token_total} | Chunks: {len(chunks)}"

    return (final_summary, intermediate_md, stats) if show_intermediate else (final_summary, "", stats)
226
 
 
227
 
228
# -----------------------------
# Gradio UI
# -----------------------------
# Widgets are built up front so the Interface call stays readable.
_INPUT_WIDGETS = [
    gr.Textbox(lines=12, placeholder="Paste text here (optional)...", label="Paste text (optional)"),
    gr.File(label="Upload PDF or DOCX (optional)"),
    gr.Radio(choices=["short", "medium", "long"], value="medium", label="Summary length (preset)"),
    gr.Checkbox(value=False, label="Show intermediate chunk summaries"),
]
_OUTPUT_WIDGETS = [
    gr.Textbox(label="Final Summary"),
    gr.Markdown(label="Intermediate Chunk Summaries (if enabled)"),
    gr.Textbox(label="Stats"),
]

demo = gr.Interface(
    fn=process,
    inputs=_INPUT_WIDGETS,
    outputs=_OUTPUT_WIDGETS,
    title="Hierarchical Long-Text Summarizer (token-aware, free-tier)",
    description=(
        "Paste text or upload a PDF/DOCX. The system splits long input by tokens, summarizes each chunk,"
        " then optionally performs a 2nd-pass summarization to produce a concise final summary."
    ),
    examples=[],
)

# Launch only when executed directly (Spaces runs app.py as __main__).
if __name__ == "__main__":
    demo.launch()
 
 
 
 
 
1
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pdfplumber

# Distilled BART fine-tuned on CNN/DailyMail: small enough for the
# Spaces free tier while still producing usable abstractive summaries.
MODEL_NAME = "sshleifer/distilbart-cnn-12-6"

# Load the checkpoint once at startup; the free tier has no GPU,
# so the model is pinned to CPU.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
device = "cpu"
model.to(device)
12
+
13
+
14
+ # ---------- Utilities ----------
15
+
16
def extract_text_from_file(file_path: str) -> str:
    """Return the plain text of an uploaded file.

    Supports PDF (extracted page by page with pdfplumber) and plain-text
    files; any other extension returns "". The extension check is
    case-insensitive, so "REPORT.PDF" is treated like "report.pdf"
    (the original only matched lowercase extensions and silently
    returned "" for anything else).
    """
    lowered = file_path.lower()

    if lowered.endswith(".pdf"):
        text = ""
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    # One newline after every page that produced text.
                    text += page_text + "\n"
        return text

    if lowered.endswith(".txt"):
        # errors="ignore" drops undecodable bytes instead of crashing.
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()

    # Unsupported type: signal "no text" to the caller.
    return ""
32
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
def chunk_text(text, max_tokens=900):
    """Split ``text`` into pieces of at most ``max_tokens`` model tokens.

    Encodes once without special tokens — the model adds its own BOS/EOS
    per generate call, and the original `tokenizer.encode(text)` let
    those specials consume the token budget and land mid-chunk — then
    slices the id list into fixed-size windows and decodes each window
    back to a string. Empty input returns [] (previously it produced a
    spurious [""] chunk from the bare special tokens).
    """
    if not text:
        return []

    token_ids = tokenizer.encode(text, add_special_tokens=False)
    chunks = []
    for start in range(0, len(token_ids), max_tokens):
        window = token_ids[start:start + max_tokens]
        # Renamed from `chunk_text` — the original local shadowed this function.
        decoded = tokenizer.decode(window, skip_special_tokens=True)
        chunks.append(decoded)
    return chunks
44
 
45
 
46
def summarize_chunk(text):
    """Run one beam-search summarization pass over ``text``.

    Input is tokenized with truncation at 1024 tokens (the model's
    window); generation uses 4 beams with a 60-180 token target length.
    Returns the decoded summary string.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
    ).to(device)

    output_ids = model.generate(
        **encoded,
        max_length=180,
        min_length=60,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
    )

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
64
+
65
+
66
+ # ---------- Main Logic ----------
67
+
68
def summarize(text_input, file_input):
    """Gradio handler: summarize pasted text or an uploaded file.

    When a file is supplied it takes precedence over pasted text
    (pasted text is ignored in that case, matching the original
    behavior). Inputs under 50 characters are rejected with a message.

    The map phase summarizes each ~900-token chunk; the reduce phase
    repeatedly re-chunks and re-summarizes the combined summaries until
    they fit one model window. The original did a single second pass
    with truncation=True, which silently dropped everything past 1024
    tokens for very long documents.

    NOTE(review): assumes gr.File delivers a filepath string
    (Gradio 4.x default) — verify against the installed Gradio version.
    """
    if file_input:
        text = extract_text_from_file(file_input)
    else:
        text = text_input

    if not text or len(text.strip()) < 50:
        return "Text is too short or empty."

    chunks = chunk_text(text)

    # Map: summarize each chunk independently.
    summaries = [summarize_chunk(chunk) for chunk in chunks]

    # Reduce: keep compressing until the combined summary fits the
    # model window instead of truncating it in a single pass.
    combined_summary = " ".join(summaries)
    while len(tokenizer.encode(combined_summary)) > 900:
        parts = chunk_text(combined_summary)
        combined_summary = " ".join(summarize_chunk(part) for part in parts)
        if len(parts) == 1:
            # A single part was already summarized once; stop to
            # guarantee termination even if it is still long.
            break

    return combined_summary
90
 
 
 
 
 
91
 
92
# ---------- UI ----------

# Widgets are named up front so the Interface call stays compact.
_text_input = gr.Textbox(lines=12, label="Paste Text (optional)")
_file_input = gr.File(label="Upload TXT or PDF (optional)")
_summary_output = gr.Textbox(lines=10, label="Summary")

demo = gr.Interface(
    fn=summarize,
    inputs=[_text_input, _file_input],
    outputs=_summary_output,
    title="Long Text Summarizer (Free Tier Optimized)",
    description="Supports large documents using chunked summarization. Runs on CPU.",
)

# Bind to all interfaces on the port Hugging Face Spaces expects.
demo.launch(server_name="0.0.0.0", server_port=7860)