Praful Nayak committed on
Commit
4a1a141
·
1 Parent(s): 5c9924d

Deploy Flask Summarization App

Browse files
Files changed (3) hide show
  1. Dockerfile +23 -0
  2. app.py +65 -0
  3. requirements.txt +7 -0
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image: the official Python 3.9 image (the full variant, not "slim")
FROM python:3.9

# pytesseract is only a wrapper around the Tesseract command-line engine, so the
# engine must be installed in the image; this has to run as root, before USER.
RUN apt-get update \
    && apt-get install -y --no-install-recommends tesseract-ocr \
    && rm -rf /var/lib/apt/lists/*

# Create a non-root user and put its pip script directory on PATH
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

# Set working directory
WORKDIR /app

# Copy the requirements file first so the dependency layer is cached
# independently of application code changes
COPY --chown=user requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy application files
COPY --chown=user . /app

# Port the Gunicorn server below binds to
EXPOSE 7860

# Run the Flask app using Gunicorn with two worker processes.
# NOTE(review): Gunicorn's default worker timeout is 30 seconds; confirm that is
# enough for model loading and summarization, or pass --timeout explicitly.
CMD ["gunicorn", "-w", "2", "-b", "0.0.0.0:7860", "app:app"]
app.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, jsonify
2
+ import os
3
+ import pdfplumber
4
+ import pytesseract
5
+ from PIL import Image
6
+ from transformers import PegasusForConditionalGeneration, PegasusTokenizer
7
+ import torch
8
+
9
app = Flask(__name__)

# Pegasus checkpoint name, shared so the tokenizer and the model always match.
PEGASUS_CHECKPOINT = "google/pegasus-xsum"

# Both objects are created once at import time and reused by every request.
tokenizer = PegasusTokenizer.from_pretrained(PEGASUS_CHECKPOINT)
model = PegasusForConditionalGeneration.from_pretrained(PEGASUS_CHECKPOINT)
14
+
15
# Extract text from PDF
def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path``.

    Pages for which pdfplumber yields no text contribute an empty string.
    Page texts are joined with a newline so that the last word of one page
    is not fused with the first word of the next.
    """
    with pdfplumber.open(file_path) as pdf:
        # The generator is fully consumed by join() before the PDF is closed.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)
22
+
23
# Extract text from image (OCR)
def extract_text_from_image(file_path):
    """Run Tesseract OCR on the image at ``file_path`` and return the text.

    The image is opened in a ``with`` block so its file handle is released
    even when OCR raises.
    """
    with Image.open(file_path) as image:
        return pytesseract.image_to_string(image)
28
+
29
# Summarize text using Pegasus
def summarize_text(text):
    """Return a Pegasus-generated summary of ``text``.

    The input is tokenized with truncation at 512 tokens, so anything beyond
    that limit is not seen by the model. Generation uses beam search with
    4 beams and a length bounded between 30 and 150.
    """
    encoded = tokenizer(
        text,
        max_length=512,
        truncation=True,
        return_tensors="pt",
    )
    generated = model.generate(
        encoded["input_ids"],
        num_beams=4,
        min_length=30,
        max_length=150,
    )
    # Only the first returned sequence is decoded.
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
35
+
36
@app.route('/summarize', methods=['POST'])
def summarize_document():
    """Summarize an uploaded PDF or image.

    Expects a multipart upload under the form field ``file``. PDFs are read
    with ``extract_text_from_pdf``; PNG/JPEG images go through OCR via
    ``extract_text_from_image``. The extracted text is passed to
    ``summarize_text``.

    Returns:
        ``{"summary": ...}`` on success, otherwise ``{"error": ...}`` with
        status 400 (missing file, unsupported format, no text extracted) or
        500 (saving or extraction raised).
    """
    import tempfile

    if 'file' not in request.files:
        return jsonify({"error": "No file uploaded"}), 400

    file = request.files['file']
    filename = file.filename or ""
    if not filename:
        # A form submitted without choosing a file arrives with an empty name.
        return jsonify({"error": "No file uploaded"}), 400

    # Pick the extractor before touching the disk; compare case-insensitively
    # so names such as "SCAN.PDF" or "photo.JPG" are accepted.
    lowered = filename.lower()
    if lowered.endswith('.pdf'):
        extractor = extract_text_from_pdf
    elif lowered.endswith(('.png', '.jpeg', '.jpg')):
        extractor = extract_text_from_image
    else:
        return jsonify({"error": "Unsupported file format"}), 400

    # The client-supplied name is never used as a path: it could contain
    # "../" or be absolute, and two uploads with the same name would collide.
    # A unique temporary file avoids both problems.
    fd, file_path = tempfile.mkstemp(suffix=os.path.splitext(lowered)[1])
    os.close(fd)

    try:
        file.save(file_path)
        text = extractor(file_path)
    except Exception as e:
        return jsonify({"error": str(e)}), 500
    finally:
        # mkstemp already created the file, so it exists even if save() failed.
        os.remove(file_path)

    if not text.strip():
        return jsonify({"error": "No text extracted"}), 400

    summary = summarize_text(text)
    return jsonify({"summary": summary})
63
+
64
# Direct-execution entry point: serve with Flask's built-in server on all
# interfaces at port 7860.
if __name__ == '__main__':
    app.run(port=7860, host='0.0.0.0')
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ flask
2
+ pdfplumber
3
+ pytesseract
4
+ Pillow
5
+ torch
6
+ transformers
7
+ gunicorn