Spaces:

Azidan
/

textSum

Running

App Files Files Community

Azidan committed 20 days ago

Commit

b91ee99

verified ·

1 Parent(s): ec272a1

Update app.py

Browse files

Files changed (1) hide show

app.py +107 -75

app.py CHANGED Viewed

@@ -1,105 +1,137 @@
 import gradio as gr
-import torch
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-import pdfplumber
 MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
-device = "cpu"
-model.to(device)
-# ---------- Utilities ----------
-def extract_text_from_file(file_path: str) -> str:
-    if file_path.endswith(".pdf"):
-        text = ""
-        with pdfplumber.open(file_path) as pdf:
-            for page in pdf.pages:
-                page_text = page.extract_text()
-                if page_text:
-                    text += page_text + "\n"
-        return text
-    elif file_path.endswith(".txt"):
-        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
-            return f.read()
-    else:
-        return ""
-def chunk_text(text, max_tokens=900):
-    tokens = tokenizer.encode(text)
     chunks = []
-    for i in range(0, len(tokens), max_tokens):
-        chunk_tokens = tokens[i:i + max_tokens]
         chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
         chunks.append(chunk_text)
     return chunks
-def summarize_chunk(text):
-    inputs = tokenizer(
-        text,
-        return_tensors="pt",
-        truncation=True,
-        max_length=1024
-    ).to(device)
-    summary_ids = model.generate(
-        **inputs,
-        max_length=180,
-        min_length=60,
-        num_beams=4,
-        length_penalty=2.0,
-        early_stopping=True
-    )
-    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-# ---------- Main Logic ----------
-def summarize(text_input, file_input):
-    if file_input:
-        text = extract_text_from_file(file_input)
-    else:
-        text = text_input
-    if not text or len(text.strip()) < 50:
-        return "Text is too short or empty."
     chunks = chunk_text(text)
     summaries = []
-    for chunk in chunks:
-        summaries.append(summarize_chunk(chunk))
-    # Optional second-pass summarization
-    combined_summary = " ".join(summaries)
-    if len(tokenizer.encode(combined_summary)) > 900:
-        combined_summary = summarize_chunk(combined_summary)
-    return combined_summary
-# ---------- UI ----------
-demo = gr.Interface(
-    fn=summarize,
-    inputs=[
-        gr.Textbox(lines=12, label="Paste Text (optional)"),
-        gr.File(label="Upload TXT or PDF (optional)")
-    ],
-    outputs=gr.Textbox(lines=10, label="Summary"),
-    title="Long Text Summarizer (Free Tier Optimized)",
-    description="Supports large documents using chunked summarization. Runs on CPU."
-)
-demo.launch(server_name="0.0.0.0", server_port=7860)

 import gradio as gr
+import re
+from transformers import pipeline, AutoTokenizer
+from PyPDF2 import PdfReader
+# =========================
+# Model setup (CPU-safe)
+# =========================
 MODEL_NAME = "sshleifer/distilbart-cnn-12-6"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+# Summarization pipeline for MODEL_NAME. It is given the same tokenizer
+# object that chunk_text uses to count and slice tokens.
+summarizer = pipeline(
+    "summarization",
+    model=MODEL_NAME,
+    tokenizer=tokenizer,
+    device=-1  # CPU only
+)
+# NOTE(review): not referenced anywhere in the visible code; presumably the
+# model's maximum input length in tokens — confirm before relying on it.
+MAX_MODEL_TOKENS = 1024
+# Number of tokens per slice taken by chunk_text.
+CHUNK_SIZE = 900  # safe margin
+# =========================
+# Utilities
+# =========================
+def clean_text(text: str) -> str:
+    """Tidy summary text for display.
+
+    Curly quotes are replaced with straight ones, runs of periods or of
+    apostrophes collapse to a single character, and every whitespace run
+    becomes one space. The text is then split after ``.``, ``!`` or ``?``
+    and any sentence already seen (compared case-insensitively) is dropped.
+    The remaining sentences are joined with single spaces.
+    """
+    for curly, plain in (("‘", "'"), ("’", "'"), ("“", '"'), ("”", '"')):
+        text = text.replace(curly, plain)
+    for pattern, replacement in (
+        (r"[.]{2,}", "."),
+        (r"[']{2,}", "'"),
+        (r"\s+", " "),
+    ):
+        text = re.sub(pattern, replacement, text)
+    # Dict keys keep insertion order, so the first occurrence of each
+    # sentence survives at its original position.
+    first_seen = {}
+    for sentence in re.split(r'(?<=[.!?])\s+', text):
+        stripped = sentence.strip()
+        if stripped:
+            first_seen.setdefault(stripped.lower(), stripped)
+    return " ".join(first_seen.values())
+def chunk_text(text: str):
+    """Split *text* into a list of strings of at most CHUNK_SIZE tokens each.
+
+    The text is encoded once without special tokens, the token list is cut
+    into consecutive CHUNK_SIZE-long slices, and each slice is decoded back
+    to a string. Text that encodes to no tokens yields an empty list.
+    """
+    tokens = tokenizer.encode(text, add_special_tokens=False)
+    return [
+        tokenizer.decode(tokens[start:start + CHUNK_SIZE], skip_special_tokens=True)
+        for start in range(0, len(tokens), CHUNK_SIZE)
+    ]
+def summarize_long_text(text: str) -> str:
+    """Summarize text of any length by summarizing it chunk by chunk.
+
+    The text is split with ``chunk_text``, each chunk is passed to the
+    ``summarizer`` pipeline, and the per-chunk summaries are joined with
+    single spaces and tidied by ``clean_text``.
+
+    Returns the cleaned summary, or the message "No text provided." when
+    *text* is None, empty, or only whitespace.
+    """
+    if not text or not text.strip():
+        return "No text provided."
+    summaries = []
+    for chunk in chunk_text(text):
+        result = summarizer(
+            chunk,
+            max_length=150,
+            min_length=40,
+            do_sample=False,
+            # Re-encoding a decoded chunk is not guaranteed to give back the
+            # same number of tokens it was cut from, so let the pipeline clip
+            # anything over the model's input limit instead of failing.
+            truncation=True,
+        )
+        summaries.append(result[0]["summary_text"])
+    return clean_text(" ".join(summaries))
+def read_pdf(file) -> str:
+    """Return the text of every page of a PDF, joined with single spaces.
+
+    *file* is handed straight to ``PdfReader``. A page with no extractable
+    text contributes an empty string. Any exception raised while reading is
+    caught and reported as the string "PDF read error: <exception>" instead
+    of being propagated.
+    """
+    try:
+        page_texts = []
+        for page in PdfReader(file).pages:
+            page_texts.append(page.extract_text() or "")
+        return " ".join(page_texts)
+    except Exception as e:
+        return f"PDF read error: {e}"
+# =========================
+# Main handler
+# =========================
+def process_input(text, file):
+    """Summarize the uploaded PDF if there is one, otherwise the pasted text.
+
+    When *file* is not None its extracted text replaces *text*. If PDF
+    extraction failed, the error message is returned unchanged. Otherwise
+    the result of ``summarize_long_text`` is returned.
+    """
+    if file is not None:
+        text = read_pdf(file)
+        # read_pdf reports failures as a "PDF read error: ..." string; show
+        # that to the user as-is rather than feeding it to the summarizer.
+        if text.startswith("PDF read error:"):
+            return text
+    return summarize_long_text(text)
+# =========================
+# Gradio UI
+# =========================
+# Page layout, top to bottom: heading, feature list, text box, PDF upload,
+# summary box, and the button that sends both inputs to process_input.
+with gr.Blocks() as demo:
+    gr.Markdown("# 📄 Long Text Summarizer (Free-Tier Safe)")
+    gr.Markdown(
+        "• Handles **thousands of words**\n"
+        "• Supports **PDF upload**\n"
+        "• Optimized for **CPU / free tier**"
+    )
+    text_input = gr.Textbox(lines=15, label="Paste text (optional)")
+    file_input = gr.File(label="Upload PDF (optional)", file_types=[".pdf"])
+    output = gr.Textbox(lines=10, label="Summary")
+    summarize_btn = gr.Button("Summarize")
+    summarize_btn.click(
+        fn=process_input,
+        inputs=[text_input, file_input],
+        outputs=output,
+    )
+demo.launch()