Azidan committed on
Commit
8b8dafc
·
verified ·
1 Parent(s): b91ee99

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -30
app.py CHANGED
@@ -7,7 +7,6 @@ from PyPDF2 import PdfReader
7
  # Model setup (CPU-safe)
8
  # =========================
9
  MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
10
-
11
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
12
  summarizer = pipeline(
13
  "summarization",
@@ -17,8 +16,7 @@ summarizer = pipeline(
17
  )
18
 
19
  MAX_MODEL_TOKENS = 1024
20
- CHUNK_SIZE = 900 # safe margin
21
-
22
 
23
  # =========================
24
  # Utilities
@@ -27,21 +25,17 @@ def clean_text(text: str) -> str:
27
  """Fix quotes, spacing, repetition, and broken punctuation."""
28
  text = text.replace("β€˜", "'").replace("’", "'")
29
  text = text.replace("β€œ", '"').replace("”", '"')
30
-
31
  text = re.sub(r"[.]{2,}", ".", text)
32
  text = re.sub(r"[']{2,}", "'", text)
33
  text = re.sub(r"\s+", " ", text)
34
-
35
  sentences = re.split(r'(?<=[.!?])\s+', text)
36
  seen = set()
37
  result = []
38
-
39
  for s in sentences:
40
  key = s.strip().lower()
41
  if key and key not in seen:
42
  seen.add(key)
43
  result.append(s.strip())
44
-
45
  return " ".join(result)
46
 
47
 
@@ -49,12 +43,10 @@ def chunk_text(text: str):
49
  """Token-aware chunking to avoid model overflow."""
50
  tokens = tokenizer.encode(text, add_special_tokens=False)
51
  chunks = []
52
-
53
  for i in range(0, len(tokens), CHUNK_SIZE):
54
  chunk_tokens = tokens[i:i + CHUNK_SIZE]
55
  chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
56
  chunks.append(chunk_text)
57
-
58
  return chunks
59
 
60
 
@@ -62,10 +54,10 @@ def summarize_long_text(text: str) -> str:
62
  """Summarize arbitrarily long text safely."""
63
  if not text or len(text.strip()) == 0:
64
  return "No text provided."
65
-
66
  chunks = chunk_text(text)
67
  summaries = []
68
-
69
  for chunk in chunks:
70
  summary = summarizer(
71
  chunk,
@@ -73,9 +65,8 @@ def summarize_long_text(text: str) -> str:
73
  min_length=40,
74
  do_sample=False
75
  )[0]["summary_text"]
76
-
77
  summaries.append(summary)
78
-
79
  merged = " ".join(summaries)
80
  return clean_text(merged)
81
 
@@ -96,42 +87,93 @@ def read_pdf(file) -> str:
96
  def process_input(text, file):
97
  if file is not None:
98
  text = read_pdf(file)
99
-
100
  return summarize_long_text(text)
101
 
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  # =========================
104
  # Gradio UI
105
  # =========================
106
- with gr.Blocks() as demo:
107
- gr.Markdown("# πŸ“„ Long Text Summarizer (Free-Tier Safe)")
108
  gr.Markdown(
109
- "β€’ Handles **thousands of words**\n"
110
- "β€’ Supports **PDF upload**\n"
111
- "β€’ Optimized for **CPU / free tier**"
112
  )
113
-
114
- text_input = gr.Textbox(
115
- lines=15,
116
- label="Paste text (optional)"
 
117
  )
118
 
119
- file_input = gr.File(
120
- label="Upload PDF (optional)",
121
- file_types=[".pdf"]
122
- )
 
 
 
 
 
 
 
 
 
 
123
 
124
  output = gr.Textbox(
125
  lines=10,
126
- label="Summary"
 
127
  )
128
 
129
- summarize_btn = gr.Button("Summarize")
130
-
131
  summarize_btn.click(
132
  fn=process_input,
133
  inputs=[text_input, file_input],
134
  outputs=output
135
  )
136
 
 
137
  demo.launch()
 
 
7
  # Model setup (CPU-safe)
8
  # =========================
9
  MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
 
10
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
11
  summarizer = pipeline(
12
  "summarization",
 
16
  )
17
 
18
  MAX_MODEL_TOKENS = 1024
19
+ CHUNK_SIZE = 900 # safe margin
 
20
 
21
  # =========================
22
  # Utilities
 
25
  """Fix quotes, spacing, repetition, and broken punctuation."""
26
  text = text.replace("β€˜", "'").replace("’", "'")
27
  text = text.replace("β€œ", '"').replace("”", '"')
 
28
  text = re.sub(r"[.]{2,}", ".", text)
29
  text = re.sub(r"[']{2,}", "'", text)
30
  text = re.sub(r"\s+", " ", text)
 
31
  sentences = re.split(r'(?<=[.!?])\s+', text)
32
  seen = set()
33
  result = []
 
34
  for s in sentences:
35
  key = s.strip().lower()
36
  if key and key not in seen:
37
  seen.add(key)
38
  result.append(s.strip())
 
39
  return " ".join(result)
40
 
41
 
 
43
  """Token-aware chunking to avoid model overflow."""
44
  tokens = tokenizer.encode(text, add_special_tokens=False)
45
  chunks = []
 
46
  for i in range(0, len(tokens), CHUNK_SIZE):
47
  chunk_tokens = tokens[i:i + CHUNK_SIZE]
48
  chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
49
  chunks.append(chunk_text)
 
50
  return chunks
51
 
52
 
 
54
  """Summarize arbitrarily long text safely."""
55
  if not text or len(text.strip()) == 0:
56
  return "No text provided."
57
+
58
  chunks = chunk_text(text)
59
  summaries = []
60
+
61
  for chunk in chunks:
62
  summary = summarizer(
63
  chunk,
 
65
  min_length=40,
66
  do_sample=False
67
  )[0]["summary_text"]
 
68
  summaries.append(summary)
69
+
70
  merged = " ".join(summaries)
71
  return clean_text(merged)
72
 
 
87
  def process_input(text, file):
88
  if file is not None:
89
  text = read_pdf(file)
 
90
  return summarize_long_text(text)
91
 
92
 
93
+ # =========================
94
+ # Custom theme + CSS
95
+ # =========================
96
+ custom_theme = gr.themes.Default(
97
+ primary_hue="blue",
98
+ secondary_hue="gray",
99
+ neutral_hue="gray",
100
+ font=[gr.themes.GoogleFont('Inter'), 'ui-sans-serif', 'sans-serif'],
101
+ ).set(
102
+ body_background_fill="#ffffff",
103
+ body_background_fill_dark="#ffffff",
104
+ block_background_fill="#ffffff",
105
+ block_background_fill_dark="#ffffff",
106
+ button_primary_background_fill="#2563eb", # nice blue
107
+ button_primary_background_fill_hover="#1d4ed8", # darker on hover
108
+ button_primary_text_color="#ffffff",
109
+ button_primary_border_color="#2563eb",
110
+ )
111
+
112
+
113
+ custom_css = """
114
+ .gradio-container {
115
+ background-color: #ffffff !important;
116
+ }
117
+
118
+ header, .gr-top, .gr-header {
119
+ background-color: #f8f9fa !important; /* light gray navbar-like bar */
120
+ border-bottom: 1px solid #e5e7eb !important;
121
+ padding: 12px 24px !important;
122
+ }
123
+
124
+ .gr-button-primary {
125
+ border-radius: 8px !important;
126
+ font-weight: 600 !important;
127
+ }
128
+
129
+ h1 {
130
+ margin: 0 !important;
131
+ color: #1f2937 !important;
132
+ }
133
+ """
134
+
135
  # =========================
136
  # Gradio UI
137
  # =========================
138
+ with gr.Blocks(theme=custom_theme, css=custom_css) as demo:
 
139
  gr.Markdown(
140
+ "# πŸ“„ Long Text Summarizer (Free-Tier Safe)",
141
+ elem_classes=["pb-2"]
 
142
  )
143
+ gr.Markdown(
144
+ "β€’ Handles **thousands of words** \n"
145
+ "β€’ Supports **PDF upload** \n"
146
+ "β€’ Optimized for **CPU / free tier**",
147
+ elem_classes=["text-gray-600", "text-sm", "mb-6"]
148
  )
149
 
150
+ with gr.Row():
151
+ with gr.Column(scale=5):
152
+ text_input = gr.Textbox(
153
+ lines=15,
154
+ label="Paste text (optional)",
155
+ placeholder="Paste your long article / text here...",
156
+ )
157
+ with gr.Column(scale=1, min_width=240):
158
+ file_input = gr.File(
159
+ label="Upload PDF (optional)",
160
+ file_types=[".pdf"],
161
+ )
162
+
163
+ summarize_btn = gr.Button("Summarize", variant="primary", scale=0)
164
 
165
  output = gr.Textbox(
166
  lines=10,
167
+ label="Summary",
168
+ placeholder="Summary will appear here...",
169
  )
170
 
 
 
171
  summarize_btn.click(
172
  fn=process_input,
173
  inputs=[text_input, file_input],
174
  outputs=output
175
  )
176
 
177
+ # Change share=True → debug=True during development
178
  demo.launch()
179
+ # demo.launch(server_name="0.0.0.0") # ← use this on HF Spaces if needed