SCBconsulting committed on
Commit
f42b944
·
verified ·
1 Parent(s): bd5c699

Update utils/summarizer.py

Browse files
Files changed (1) hide show
  1. utils/summarizer.py +23 -12
utils/summarizer.py CHANGED
@@ -1,13 +1,16 @@
1
  # utils/summarizer.py
2
 
3
  from transformers import pipeline
 
4
 
5
- # Load summarization pipeline
6
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
7
 
8
- def split_text(text, max_chunk_len=800):
 
 
9
  """
10
- Breaks long documents into smaller chunks for summarization.
11
  """
12
  sentences = text.split('. ')
13
  chunks = []
@@ -25,25 +28,33 @@ def split_text(text, max_chunk_len=800):
25
 
26
  return chunks
27
 
28
def summarize_text(text):
    """
    Generate a readable executive summary using bullet points.

    The input is whitespace-normalised, split into chunks with
    ``split_text``, and each chunk is passed to the module-level
    ``summarizer`` pipeline. Every sentence of every chunk summary becomes
    one bullet line.

    Args:
        text: Raw document text. ``None``, empty and whitespace-only values
            are all treated as "no input".

    Returns:
        ``"No input provided."`` when there is nothing to summarise,
        otherwise a header line followed by one bullet per summary sentence.
    """
    # `not text` covers None (and ""), which would otherwise crash on .strip().
    if not text or not text.strip():
        return "No input provided."

    # str.split() with no separator splits on any whitespace run, so newlines,
    # tabs and repeated spaces all collapse to a single space.
    text = " ".join(text.split())
    chunks = split_text(text)

    bullet_points = []

    for chunk in chunks:
        result = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
        summary = result[0]["summary_text"].strip()
        for line in summary.split('. '):
            # Drop any trailing period so exactly one is re-added below.
            clean = line.strip().rstrip('.')
            if clean:
                bullet_points.append(f"• {clean}.")

    return "📄 Executive Summary:\n" + "\n".join(bullet_points)
 
1
  # utils/summarizer.py
2
 
3
  from transformers import pipeline
4
+ from typing import List
5
 
6
+ # ========== Load Summarization Pipeline ==========
7
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
8
 
9
+ # ========== Helper Functions ==========
10
+
11
+ def split_text(text: str, max_chunk_len: int = 800) -> List[str]:
12
  """
13
+ ✂️ Breaks long text into smaller chunks for safe summarization.
14
  """
15
  sentences = text.split('. ')
16
  chunks = []
 
28
 
29
  return chunks
30
 
31
def clean_text(text: str) -> str:
    """
    🧹 Remove excessive whitespace and line breaks.

    Every run of whitespace (spaces, tabs, newlines, carriage returns) is
    collapsed to a single space, and leading/trailing whitespace is removed.

    Args:
        text: The raw text to normalise.

    Returns:
        The text with words separated by exactly one space; an empty string
        if the input contains no non-whitespace characters.
    """
    # str.split() with no argument splits on any whitespace run and discards
    # empty pieces, so joining with " " normalises in a single pass. Chained
    # .replace() calls cannot collapse runs of arbitrary length.
    return " ".join(text.split())
36
+
37
+ # ========== Summarization Function ==========
38
+
39
def summarize_text(text: str) -> str:
    """
    📄 Generate a readable executive summary using bullet points.

    The input is normalised with ``clean_text``, split into chunks with
    ``split_text``, and each chunk is passed to the module-level
    ``summarizer`` pipeline. Every sentence of every chunk summary becomes
    one bullet line.

    Args:
        text: Raw document text. ``None``, empty and whitespace-only values
            are all treated as "no input".

    Returns:
        ``"No input provided."`` when there is nothing to summarise,
        otherwise a header line followed by one bullet per summary sentence.
    """
    # `not text` covers None (and ""), which would otherwise crash on .strip().
    if not text or not text.strip():
        return "No input provided."

    cleaned = clean_text(text)
    chunks = split_text(cleaned)

    bullet_points = []

    for chunk in chunks:
        result = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
        summary = result[0]["summary_text"].strip()
        for line in summary.split('. '):
            # Drop any trailing period so exactly one is re-added below.
            cleaned_line = line.strip().rstrip('.')
            if cleaned_line:
                bullet_points.append(f"• {cleaned_line}.")

    return "📄 Executive Summary:\n" + "\n".join(bullet_points)