Spaces:

mbalvi
/

Text-Summarizer

Sleeping

App Files Files Community

mbalvi committed on Sep 25, 2025

Commit

4012cea

verified ·

1 Parent(s): 20bd76a

Create app.py

Browse files

Files changed (1) hide show

app.py +174 -0

app.py ADDED Viewed

	@@ -0,0 +1,174 @@

+"""
+Simple Hugging Face Text Summarizer (Flask)
+Endpoints:
+- GET  /           -> basic info
+- POST /summarize  -> JSON input: {"text": "...", "model": "...", "max_length": 130, "min_length": 30}
+How to run:
+1. pip install -r requirements.txt
+2. python app.py
+3. POST JSON to http://127.0.0.1:8000/summarize
+Notes:
+- Default model: "facebook/bart-large-cnn". You may change to any summarization-capable HF model.
+- For very long texts, the app chunks the text and summarizes each chunk, then summarizes the concatenated chunk-summaries.
+"""
+from flask import Flask, request, jsonify
+from transformers import pipeline, Pipeline
+from typing import List, Optional
+import threading
+import math
+import textwrap
+import os
+app = Flask(__name__)  # application object that the route decorators below register on
+# Model used when a request names none; override via the SUMMARIZER_MODEL environment variable
+DEFAULT_MODEL = os.getenv("SUMMARIZER_MODEL", "facebook/bart-large-cnn")
+# Process-wide cache of loaded pipelines, keyed by model name, to avoid reloading between requests
+_PIPELINES = {}
+_PIPELINES_LOCK = threading.Lock()  # guards _PIPELINES inside get_summarizer
+def get_summarizer(model_name: str = DEFAULT_MODEL) -> Pipeline:
+    """
+    Look up the summarization pipeline cached for model_name, building and
+    caching it on first use. The whole lookup-or-create runs under
+    _PIPELINES_LOCK, so a given model is only ever constructed once.
+    """
+    with _PIPELINES_LOCK:
+        try:
+            return _PIPELINES[model_name]
+        except KeyError:
+            # First request for this model: build it (device selection is left to the library default)
+            summarizer = pipeline("summarization", model=model_name)
+            _PIPELINES[model_name] = summarizer
+            return summarizer
+def chunk_text(text: str, max_chars: int = 1000, overlap: int = 200) -> List[str]:
+    """
+    Split text into whitespace-delimited chunks of at most max_chars characters.
+    Consecutive chunks share a run of trailing words totalling at most `overlap`
+    characters. Words longer than max_chars are hard-split into max_chars-sized
+    pieces, and a carried-over overlap is discarded whenever it would leave no
+    room for the next word, so the function always terminates.
+    Args:
+        text: Input text; returned unchanged as a single chunk if it already fits.
+        max_chars: Maximum length of each chunk. Must be positive when text has to be split.
+        overlap: Character budget for words repeated at the start of the next chunk; <= 0 disables overlap.
+    Returns:
+        The list of chunk strings (empty when an over-long text contains only whitespace).
+    Raises:
+        ValueError: If text must be split and max_chars is not positive.
+    """
+    if len(text) <= max_chars:
+        return [text]
+    if max_chars <= 0:
+        raise ValueError("max_chars must be a positive integer")
+    # Break oversize words up front so every piece is guaranteed to fit in an empty chunk.
+    pieces = []
+    for word in text.split():
+        pieces.extend(word[k:k + max_chars] for k in range(0, len(word), max_chars))
+    chunks = []
+    current = []
+    current_len = 0
+    fresh = 0  # words in `current` that were not already emitted in an earlier chunk
+    i = 0
+    while i < len(pieces):
+        w = pieces[i]
+        # +1 for the joining space when the chunk already holds a word
+        needed = len(w) + (1 if current else 0)
+        if current_len + needed <= max_chars:
+            current.append(w)
+            current_len += needed
+            fresh += 1
+            i += 1
+            continue
+        if fresh == 0:
+            # Only overlap words are buffered and they crowd out `w`. They already
+            # appear in the previous chunk, so drop them rather than loop forever.
+            current = []
+            current_len = 0
+            continue
+        chunks.append(" ".join(current))
+        # Carry over the longest run of trailing words that fits in `overlap` characters.
+        tail = []
+        tail_len = 0
+        if overlap > 0:
+            for prev in reversed(current):
+                grown = tail_len + len(prev) + (1 if tail else 0)
+                if grown > overlap:
+                    break
+                tail.append(prev)
+                tail_len = grown
+            tail.reverse()
+        current = tail
+        current_len = tail_len
+        fresh = 0
+    if current:
+        chunks.append(" ".join(current))
+    return chunks
+def summarize_text(text: str, model_name: str = DEFAULT_MODEL,
+                   max_length: int = 130, min_length: int = 30,
+                   chunk_max_chars: int = 1000, chunk_overlap: int = 200) -> str:
+    """
+    Summarize a (possibly long) text by chunking -> summarizing chunks -> summarizing combined.
+    Args:
+        text: Text to summarize.
+        model_name: Model identifier passed to get_summarizer.
+        max_length: Upper length bound passed to the pipeline for each chunk summary.
+        min_length: Lower length bound passed to the pipeline for each chunk summary.
+        chunk_max_chars: Maximum characters per chunk (forwarded to chunk_text).
+        chunk_overlap: Overlap budget between chunks (forwarded to chunk_text).
+    Returns:
+        The final summary string. A single-chunk input returns that chunk's summary
+        directly; multiple chunk summaries are joined and summarized once more.
+        Returns "" when chunking yields nothing to summarize.
+    """
+    summarizer = get_summarizer(model_name)
+    chunks = chunk_text(text, max_chars=chunk_max_chars, overlap=chunk_overlap)
+    if not chunks:
+        # Whitespace-only input longer than chunk_max_chars produces no chunks.
+        return ""
+    chunk_summaries = []
+    for chunk in chunks:
+        # The pipeline returns a list of dicts with 'summary_text'
+        try:
+            out = summarizer(chunk, max_length=max_length, min_length=min_length, truncation=True)
+            s = out[0]["summary_text"].strip()
+        except Exception:
+            # Deliberate best-effort fallback: retry without the caller's length
+            # constraints. A failure of the retry propagates to the caller.
+            out = summarizer(chunk, truncation=True)
+            s = out[0]["summary_text"].strip()
+        chunk_summaries.append(s)
+    if len(chunk_summaries) == 1:
+        return chunk_summaries[0]
+    combined = "\n".join(chunk_summaries)
+    final_max = min(max_length, 180)
+    # Keep the lower bound at or below the upper bound: the fixed 25 would
+    # otherwise exceed a small caller-supplied max_length.
+    final_min = min(25, final_max)
+    final_out = summarizer(combined, max_length=final_max, min_length=final_min, truncation=True)
+    return final_out[0]["summary_text"].strip()
+@app.route("/", methods=["GET"])
+def index():
+    """Return a JSON description of the service and the /summarize request body."""
+    summarize_fields = {
+        "text": "string (required)",
+        "model": "optional HF model id (default facebook/bart-large-cnn)",
+        "max_length": "optional int (summary max tokens, default 130)",
+        "min_length": "optional int (summary min tokens, default 30)"
+    }
+    endpoints = {"POST /summarize": {"json": summarize_fields}}
+    return jsonify({"service": "hf-text-summarizer", "endpoints": endpoints})
+@app.route("/summarize", methods=["POST"])
+def summarize_route():
+    """
+    Handle POST /summarize.
+    Expects a JSON object with a required "text" string and optional "model",
+    "max_length" and "min_length" fields. Responds with the model name, the
+    summary and the input length, or with a JSON error:
+    400 for a malformed body or invalid field, 413 for text over 500k
+    characters, 500 when summarization itself fails.
+    """
+    data = request.get_json(force=True, silent=True)
+    # Require a JSON object: an array such as ["text"] would pass the `in`
+    # check but cannot be indexed by key.
+    if not isinstance(data, dict) or "text" not in data:
+        return jsonify({"error": "JSON body required with 'text' field"}), 400
+    text = data["text"]
+    model = data.get("model", DEFAULT_MODEL)
+    if model is None:
+        # An explicit null is treated the same as omitting the field.
+        model = DEFAULT_MODEL
+    # Non-numeric lengths are a client error, not a server fault.
+    try:
+        max_length = int(data.get("max_length", 130))
+        min_length = int(data.get("min_length", 30))
+    except (TypeError, ValueError, OverflowError):
+        return jsonify({"error": "invalid min_length/max_length"}), 400
+    # Basic input validation
+    if not isinstance(text, str) or len(text.strip()) == 0:
+        return jsonify({"error": "text must be a non-empty string"}), 400
+    if not isinstance(model, str) or len(model.strip()) == 0:
+        return jsonify({"error": "model must be a non-empty string"}), 400
+    if max_length <= 0 or min_length < 0:
+        return jsonify({"error": "invalid min_length/max_length"}), 400
+    # Safety: prevent extremely huge single requests from crashing the server
+    if len(text) > 500_000:  # ~500k chars
+        return jsonify({"error": "input text too large (limit 500k chars)"}), 413
+    # NOTE(review): the client chooses the model id, and every distinct id is
+    # loaded and cached without bound; consider restricting to an allow-list.
+    try:
+        summary = summarize_text(text, model_name=model,
+                                 max_length=max_length, min_length=min_length)
+    except Exception as e:
+        # NOTE(review): "detail" echoes the raw exception message to the client.
+        # Kept for API compatibility; consider logging it server-side instead.
+        return jsonify({"error": "failed to summarize text", "detail": str(e)}), 500
+    return jsonify({
+        "model": model,
+        "summary": summary,
+        "input_length": len(text),
+    })
+if __name__ == "__main__":
+    # When executed as a script, start Flask's built-in server on port 8000, bound to 0.0.0.0, with debug mode off
+    app.run(host="0.0.0.0", port=8000, debug=False)