Spaces:
Sleeping
Sleeping
Update utils/summarizer.py
Browse files- utils/summarizer.py +50 -22
utils/summarizer.py
CHANGED
|
@@ -6,7 +6,13 @@ from typing import List
|
|
| 6 |
# ========== Load Summarization Pipeline ==========
|
| 7 |
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
|
| 8 |
|
| 9 |
-
# ==========
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
def split_text(text: str, max_chunk_len: int = 800) -> List[str]:
|
| 12 |
"""
|
|
@@ -28,33 +34,55 @@ def split_text(text: str, max_chunk_len: int = 800) -> List[str]:
|
|
| 28 |
|
| 29 |
return chunks
|
| 30 |
|
| 31 |
-
|
| 32 |
-
"""
|
| 33 |
-
π§Ή Remove excessive whitespace and line breaks.
|
| 34 |
-
"""
|
| 35 |
-
return text.replace("\n", " ").replace(" ", " ").strip()
|
| 36 |
-
|
| 37 |
-
# ========== Summarization Function ==========
|
| 38 |
|
| 39 |
-
def summarize_text(text: str) -> str:
|
| 40 |
"""
|
| 41 |
-
π Generate
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
"""
|
| 43 |
if not text.strip():
|
| 44 |
return "No input provided."
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
-
|
| 52 |
-
result = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
|
| 53 |
-
summary = result[0]["summary_text"].strip()
|
| 54 |
-
lines = summary.split('. ')
|
| 55 |
-
for line in lines:
|
| 56 |
-
cleaned_line = line.strip().rstrip('.')
|
| 57 |
-
if cleaned_line:
|
| 58 |
-
bullet_points.append(f"β’ {cleaned_line}.")
|
| 59 |
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
# ========== Load Summarization Pipeline ==========
|
| 7 |
# Shared summarization pipeline object, configured for the "summarization"
# task with the "facebook/bart-large-cnn" model name; summarize_text calls it
# once per text chunk.
# NOTE(review): `pipeline` is imported outside this view — presumably
# transformers.pipeline; confirm against the file's import block.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
|
| 8 |
|
| 9 |
+
# ========== Text Helpers ==========
|
| 10 |
+
|
| 11 |
+
def clean_text(text: str) -> str:
    """
    Collapse every run of whitespace (spaces, tabs, line breaks) into a
    single space and trim both ends.

    Params:
    - text: raw input string.

    Returns:
    The normalized single-line string; "" when the input is empty or
    whitespace-only.
    """
    # str.split() with no separator splits on any whitespace run and drops
    # empty pieces, so re-joining with one space normalizes in a single pass.
    # The previous chained replace() swapped a space for a space and so never
    # collapsed repeated blanks.
    return " ".join(text.split())
|
| 16 |
|
| 17 |
def split_text(text: str, max_chunk_len: int = 800) -> List[str]:
|
| 18 |
"""
|
|
|
|
| 34 |
|
| 35 |
return chunks
|
| 36 |
|
| 37 |
+
# ========== Summarization Functions ==========
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
+
def summarize_text(text: str, as_paragraph: bool = False, fallback: bool = True) -> str:
    """
    Generate an executive summary of ``text``.

    The input is normalized with ``clean_text``, cut into chunks with
    ``split_text``, and every chunk is summarized by the module-level
    ``summarizer`` pipeline.

    Params:
    - text: raw input to summarize.
    - as_paragraph: True -> chunk summaries joined as paragraphs (one per
      chunk); False -> one bullet point per sentence of each chunk summary.
    - fallback: True -> if summarization raises, return
      ``fallback_summary(text)``; False -> return an error message string.

    Returns:
    The formatted summary, "No input provided." for blank input, or the
    fallback / error text when summarization fails.
    """
    if not text.strip():
        return "No input provided."

    # Only the cleaning/chunking/model calls can realistically fail, so the
    # try body is limited to them; formatting happens afterwards.
    try:
        chunks = split_text(clean_text(text))
        summaries = []
        for chunk in chunks:
            result = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
            summaries.append(result[0]["summary_text"].strip())
    except Exception as e:
        # Best-effort boundary: keep returning a string to the caller, but
        # record the traceback so the failure is not silently lost when the
        # fallback path hides it.
        import logging

        logging.getLogger(__name__).exception("Summarization failed")
        if fallback:
            return fallback_summary(text)
        return f"An error occurred: {str(e)}"

    # NOTE(review): the leading "π" in the header strings looks like a
    # mis-decoded emoji whose remaining bytes were lost; the intended symbol
    # cannot be determined from here, so the literals are left unchanged.
    if as_paragraph:
        return "π Executive Summary:\n\n" + "\n\n".join(summaries)

    # Otherwise return one bullet per sentence. Splitting on ". " leaves a
    # trailing period only on the last sentence, so it is stripped and a
    # single period is re-appended uniformly.
    bullet_points = []
    for summary in summaries:
        for sentence in summary.split(". "):
            sentence = sentence.strip().rstrip(".")
            if sentence:
                bullet_points.append(f"• {sentence}.")

    return "π Executive Summary:\n" + "\n".join(bullet_points)
|
| 78 |
+
|
| 79 |
+
# ========== Fallback Summary (manual) ==========
|
| 80 |
+
|
| 81 |
+
def fallback_summary(text: str, max_lines: int = 5) -> str:
    """
    Fallback: return the first few sentences of ``text`` as a pseudo-summary.

    Params:
    - text: raw input; whitespace runs (including line breaks) are collapsed
      to single spaces before splitting.
    - max_lines: maximum number of bullet points to emit.

    Returns:
    A header line followed by up to ``max_lines`` bullets, one per non-empty
    sentence fragment (fragments are separated by ". ").
    """
    # Normalize first so line breaks in the raw input do not end up inside a
    # bullet.
    normalized = " ".join(text.split())

    # Filter before slicing: empty fragments (or fragments that are only
    # periods) must not consume the max_lines budget or yield blank bullets.
    fragments = (part.strip().rstrip(".") for part in normalized.split(". "))
    points = [f"• {fragment}" for fragment in fragments if fragment]

    # NOTE(review): the leading "π" in the header looks like a mis-decoded
    # emoji whose remaining bytes were lost; left unchanged because the
    # intended symbol cannot be determined from here.
    return "π (Fallback Summary)\n" + "\n".join(points[:max(max_lines, 0)])
|