Spaces:

maahi2412
/

text-summarization-app

Runtime error

App Files Files Community

Praful Nayak committed on Feb 28, 2025

Commit

4a1a141

1 Parent(s): 5c9924d

Deploy Flask Summarization App

Browse files

Files changed (3) hide show

Dockerfile +23 -0
app.py +65 -0
requirements.txt +7 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,23 @@

+# Use the official Python 3.9 image (full Debian variant, not the slim one)
+FROM python:3.9
+# Install the Tesseract OCR engine: pytesseract is only a wrapper that shells
+# out to the `tesseract` binary, so image uploads fail without it
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends tesseract-ocr \
+    && rm -rf /var/lib/apt/lists/*
+# Create a non-root user and put its pip --user bin directory on PATH
+RUN useradd -m -u 1000 user
+USER user
+ENV PATH="/home/user/.local/bin:$PATH"
+# Set working directory
+WORKDIR /app
+# Copy requirements file and install dependencies
+COPY --chown=user requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+# Copy application files
+COPY --chown=user . /app
+# Expose the necessary port
+EXPOSE 7860
+# Run the Flask app using Gunicorn. app.py loads the Pegasus model at import
+# time and summarization runs on CPU, so raise the worker timeout well above
+# Gunicorn's 30 s default to keep workers from being killed mid-load/mid-request
+CMD ["gunicorn", "-w", "2", "--timeout", "300", "-b", "0.0.0.0:7860", "app:app"]

app.py ADDED Viewed

	@@ -0,0 +1,65 @@

+from flask import Flask, request, jsonify
+import os
+import pdfplumber
+import pytesseract
+from PIL import Image
+from transformers import PegasusForConditionalGeneration, PegasusTokenizer
+import torch
+app = Flask(__name__)
+# Load Pegasus Model
+tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
+model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
+# Extract text from PDF
+def extract_text_from_pdf(file_path):
+    text = ""
+    with pdfplumber.open(file_path) as pdf:
+        for page in pdf.pages:
+            text += page.extract_text() or ""
+    return text
+# Extract text from image (OCR)
+def extract_text_from_image(file_path):
+    image = Image.open(file_path)
+    text = pytesseract.image_to_string(image)
+    return text
+# Summarize text using Pegasus
+def summarize_text(text):
+    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
+    summary_ids = model.generate(inputs["input_ids"], max_length=150, min_length=30, num_beams=4)
+    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+    return summary
+@app.route('/summarize', methods=['POST'])
+def summarize_document():
+    if 'file' not in request.files:
+        return jsonify({"error": "No file uploaded"}), 400
+    file = request.files['file']
+    filename = file.filename
+    file_path = os.path.join("/tmp", filename)
+    file.save(file_path)
+    try:
+        if filename.endswith('.pdf'):
+            text = extract_text_from_pdf(file_path)
+        elif filename.endswith(('.png', '.jpeg', '.jpg')):
+            text = extract_text_from_image(file_path)
+        else:
+            return jsonify({"error": "Unsupported file format"}), 400
+    except Exception as e:
+        return jsonify({"error": str(e)}), 500
+    finally:
+        os.remove(file_path)
+    if not text.strip():
+        return jsonify({"error": "No text extracted"}), 400
+    summary = summarize_text(text)
+    return jsonify({"summary": summary})
+# Development entry point: start Flask's built-in server on all interfaces,
+# port 7860, when this file is executed directly.
+if __name__ == '__main__':
+    app.run(host='0.0.0.0', port=7860)

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+flask
+pdfplumber
+pytesseract
+Pillow
+torch
+transformers
+# PegasusTokenizer is SentencePiece-based and cannot be loaded without it
+sentencepiece
+gunicorn