Azidan committed on
Commit
b91ee99
·
verified ·
1 Parent(s): ec272a1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +107 -75
app.py CHANGED
@@ -1,105 +1,137 @@
1
  import gradio as gr
2
- import torch
3
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
4
- import pdfplumber
5
 
 
 
 
6
  MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
7
 
8
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
9
- model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
10
- device = "cpu"
11
- model.to(device)
 
 
 
 
 
 
 
12
 
 
 
 
 
 
 
 
13
 
14
- # ---------- Utilities ----------
 
 
15
 
16
- def extract_text_from_file(file_path: str) -> str:
17
- if file_path.endswith(".pdf"):
18
- text = ""
19
- with pdfplumber.open(file_path) as pdf:
20
- for page in pdf.pages:
21
- page_text = page.extract_text()
22
- if page_text:
23
- text += page_text + "\n"
24
- return text
25
 
26
- elif file_path.endswith(".txt"):
27
- with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
28
- return f.read()
 
 
29
 
30
- else:
31
- return ""
32
 
33
 
34
- def chunk_text(text, max_tokens=900):
35
- tokens = tokenizer.encode(text)
 
36
  chunks = []
37
 
38
- for i in range(0, len(tokens), max_tokens):
39
- chunk_tokens = tokens[i:i + max_tokens]
40
  chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
41
  chunks.append(chunk_text)
42
 
43
  return chunks
44
 
45
 
46
- def summarize_chunk(text):
47
- inputs = tokenizer(
48
- text,
49
- return_tensors="pt",
50
- truncation=True,
51
- max_length=1024
52
- ).to(device)
53
-
54
- summary_ids = model.generate(
55
- **inputs,
56
- max_length=180,
57
- min_length=60,
58
- num_beams=4,
59
- length_penalty=2.0,
60
- early_stopping=True
61
- )
62
-
63
- return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
64
-
65
-
66
- # ---------- Main Logic ----------
67
-
68
- def summarize(text_input, file_input):
69
- if file_input:
70
- text = extract_text_from_file(file_input)
71
- else:
72
- text = text_input
73
-
74
- if not text or len(text.strip()) < 50:
75
- return "Text is too short or empty."
76
 
77
  chunks = chunk_text(text)
78
-
79
  summaries = []
80
- for chunk in chunks:
81
- summaries.append(summarize_chunk(chunk))
82
 
83
- # Optional second-pass summarization
84
- combined_summary = " ".join(summaries)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
- if len(tokenizer.encode(combined_summary)) > 900:
87
- combined_summary = summarize_chunk(combined_summary)
 
 
88
 
89
- return combined_summary
 
 
 
90
 
 
 
 
 
91
 
92
- # ---------- UI ----------
93
 
94
- demo = gr.Interface(
95
- fn=summarize,
96
- inputs=[
97
- gr.Textbox(lines=12, label="Paste Text (optional)"),
98
- gr.File(label="Upload TXT or PDF (optional)")
99
- ],
100
- outputs=gr.Textbox(lines=10, label="Summary"),
101
- title="Long Text Summarizer (Free Tier Optimized)",
102
- description="Supports large documents using chunked summarization. Runs on CPU."
103
- )
104
 
105
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
  import gradio as gr
2
+ import re
3
+ from transformers import pipeline, AutoTokenizer
4
+ from PyPDF2 import PdfReader
5
 
6
# =========================
# Model setup (CPU-safe)
# =========================
# Distilled BART summarization checkpoint — chosen here for CPU-only hosting.
MODEL_NAME = "sshleifer/distilbart-cnn-12-6"

# Shared tokenizer: also used by chunk_text() below for token-aware splitting,
# so chunk boundaries agree exactly with what the pipeline will tokenize.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
summarizer = pipeline(
    "summarization",
    model=MODEL_NAME,
    tokenizer=tokenizer,
    device=-1  # CPU only
)

# NOTE(review): MAX_MODEL_TOKENS is never read in this file's visible code —
# presumably documents the model's input limit; confirm before removing.
MAX_MODEL_TOKENS = 1024
# Chunk size used when splitting long inputs; kept below MAX_MODEL_TOKENS.
CHUNK_SIZE = 900  # safe margin
21
+
22
 
23
+ # =========================
24
+ # Utilities
25
+ # =========================
26
def clean_text(text: str) -> str:
    """Normalize quotes and spacing, then drop repeated sentences.

    Straightens curly quotes, collapses runs of dots, apostrophes and
    whitespace into a single character, then removes duplicate sentences
    (case-insensitive comparison; the first occurrence is kept).
    """
    # Straighten typographic quotes to their ASCII equivalents.
    for curly, straight in (("‘", "'"), ("’", "'"), ("“", '"'), ("”", '"')):
        text = text.replace(curly, straight)

    # Collapse repeated punctuation and whitespace.
    text = re.sub(r"[.]{2,}", ".", text)
    text = re.sub(r"[']{2,}", "'", text)
    text = re.sub(r"\s+", " ", text)

    # Deduplicate sentences while preserving original order.
    unique_sentences = []
    seen_keys = set()
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        key = sentence.strip().lower()
        if key and key not in seen_keys:
            seen_keys.add(key)
            unique_sentences.append(sentence.strip())

    return " ".join(unique_sentences)
 
46
 
47
 
48
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE) -> list:
    """Split *text* into decoded pieces of at most *chunk_size* tokens.

    Token-aware chunking keeps every piece under the model's input limit.

    Args:
        text: Raw text to split.
        chunk_size: Maximum tokens per chunk; defaults to the module-level
            CHUNK_SIZE safety margin (parameterized so callers can tune it).

    Returns:
        List of plain-text chunks (empty list for empty input).
    """
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    chunks = []

    for start in range(0, len(token_ids), chunk_size):
        # FIX: the original bound this to a local named `chunk_text`,
        # shadowing the enclosing function itself — renamed for clarity.
        piece = tokenizer.decode(
            token_ids[start:start + chunk_size],
            skip_special_tokens=True,
        )
        chunks.append(piece)

    return chunks
59
 
60
 
61
def summarize_long_text(text: str) -> str:
    """Summarize arbitrarily long text by chunking, then merging.

    Splits the input into token-safe chunks, summarizes each chunk with
    the shared `summarizer` pipeline (greedy decoding, so results are
    deterministic), joins the partial summaries, and cleans the merged
    result with clean_text().

    Args:
        text: Input document; may be None, empty, or whitespace-only.

    Returns:
        The cleaned, merged summary, or a fallback message for
        empty input.
    """
    # Idiomatic emptiness guard: `not text` covers None/"", and
    # `not text.strip()` covers whitespace-only input
    # (was: `len(text.strip()) == 0`).
    if not text or not text.strip():
        return "No text provided."

    summaries = []
    for chunk in chunk_text(text):
        summary = summarizer(
            chunk,
            max_length=150,
            min_length=40,
            do_sample=False,  # deterministic output
        )[0]["summary_text"]
        summaries.append(summary)

    return clean_text(" ".join(summaries))
81
+
82
+
83
def read_pdf(file) -> str:
    """Extract text from a PDF without ever raising.

    Concatenates the text of every page; pages with no extractable text
    contribute an empty string.  On any failure the error is folded into
    a human-readable string instead of propagating, so the caller always
    receives text.
    """
    try:
        extracted = []
        for page in PdfReader(file).pages:
            extracted.append(page.extract_text() or "")
        return " ".join(extracted)
    except Exception as e:
        return f"PDF read error: {e}"
91
+
92
+
93
+ # =========================
94
+ # Main handler
95
+ # =========================
96
def process_input(text, file):
    """Gradio click handler: an uploaded PDF takes precedence over pasted text."""
    source = text if file is None else read_pdf(file)
    return summarize_long_text(source)
101
+
102
+
103
# =========================
# Gradio UI
# =========================
with gr.Blocks() as demo:
    gr.Markdown("# 📄 Long Text Summarizer (Free-Tier Safe)")
    gr.Markdown(
        "• Handles **thousands of words**\n"
        "• Supports **PDF upload**\n"
        "• Optimized for **CPU / free tier**"
    )

    # Inputs: free-text paste and/or a PDF upload.
    pasted_text = gr.Textbox(lines=15, label="Paste text (optional)")
    uploaded_pdf = gr.File(label="Upload PDF (optional)", file_types=[".pdf"])

    # Output area and trigger button.
    summary_box = gr.Textbox(lines=10, label="Summary")
    run_button = gr.Button("Summarize")

    run_button.click(
        fn=process_input,
        inputs=[pasted_text, uploaded_pdf],
        outputs=summary_box,
    )

demo.launch()