init
Browse files- .gitignore +1 -0
- Dockerfile +27 -0
- app.py +54 -0
- requirements.txt +4 -0
.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
*.bat
|
Dockerfile
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Slim Python base image. Tesseract and Poppler are NOT part of this image;
# they are installed with apt-get below.
FROM python:3.9-slim

# Set environment variables (key=value form; the whitespace form is legacy)
ENV PYTHONUNBUFFERED=1
ENV FLASK_APP=app.py
ENV APP_HOME=/app

# Install Tesseract (plus Russian language data) and the Poppler utilities
# required by pdf2image, then drop the apt lists to keep the layer small.
RUN apt-get update && apt-get install --no-install-recommends -y \
        tesseract-ocr \
        tesseract-ocr-rus \
        poppler-utils && \
    rm -rf /var/lib/apt/lists/*

# Create the working directory and the scratch directory that app.py uses
# for uploads (./tmp relative to WORKDIR).
# NOTE(review): the directory is created as root; if the container is run as
# a non-root user the app will not be able to write here - confirm.
RUN mkdir -p /var/www/tmp
ENV HOME=/var/www
WORKDIR /var/www

# Install Python dependencies before copying the sources so this layer is
# rebuilt only when requirements.txt changes.
COPY requirements.txt /var/www/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

COPY . /var/www

EXPOSE 7860

# Run the Flask application. Exec form makes flask PID 1 so it receives
# SIGTERM directly instead of going through /bin/sh -c.
CMD ["flask", "run", "--host=0.0.0.0", "--port=7860"]
|
app.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import uuid
|
| 3 |
+
from flask import Flask, request, jsonify
|
| 4 |
+
import pytesseract
|
| 5 |
+
from pdf2image import convert_from_bytes
|
| 6 |
+
from flask_cors import CORS
|
| 7 |
+
|
| 8 |
+
# Tell Tesseract where to look for its language data files.
# NOTE(review): the path is version-specific (the "5" segment); presumably it
# matches the tesseract-ocr package installed in the image - confirm.
os.environ.update(TESSDATA_PREFIX='/usr/share/tesseract-ocr/5/tessdata')

# Flask application with CORS enabled using the flask-cors defaults.
app = Flask(__name__)
CORS(app)

# Scratch directory for uploaded PDFs; relative to the process working
# directory, not to this file.
UPLOAD_FOLDER = './tmp'
app.config.update(UPLOAD_FOLDER=UPLOAD_FOLDER)
|
| 14 |
+
|
| 15 |
+
# Endpoint for uploading PDF and extracting text
|
| 16 |
+
@app.route('/upload', methods=['POST'])
def upload_file():
    """Accept a PDF upload and return the text extracted from it as JSON.

    The request must be multipart form data with the PDF in the ``file``
    field. The upload is written to a uniquely named temporary file inside
    ``app.config['UPLOAD_FOLDER']``, handed to the extraction step, and the
    temporary file is removed again before the response is sent.

    Returns:
        A JSON response: ``{'text': <extracted text>}`` on success, or
        ``{'error': <message>}`` when the file part is missing, no file was
        selected, or the file name does not end in ``.pdf``.
    """
    # NOTE(review): error responses keep the default HTTP 200 status so that
    # existing clients are unaffected; consider returning 400 once callers
    # are confirmed to handle it.
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'})

    file = request.files['file']

    # An empty filename means the form was submitted without choosing a file.
    if file.filename == '':
        return jsonify({'error': 'No selected file'})

    # Compare case-insensitively so that e.g. "REPORT.PDF" is accepted too.
    # A missing filename (None) is treated as "not a PDF".
    original_name = file.filename or ''
    if not original_name.lower().endswith('.pdf'):
        return jsonify({'error': 'File must be a PDF'})

    # Never trust the client-supplied name for the path on disk: store the
    # upload under a random UUID-based name instead.
    upload_dir = app.config['UPLOAD_FOLDER']
    os.makedirs(upload_dir, exist_ok=True)
    temp_path = os.path.join(upload_dir, str(uuid.uuid4()) + '.pdf')

    text = ''
    try:
        file.save(temp_path)

        # TODO: plug in the call to Andrey's library here, passing temp_path,
        # and store its result in `text`.
        # The earlier in-memory approach (pdf2image.convert_from_bytes on the
        # upload followed by pytesseract.image_to_string(img, lang='rus') for
        # every page) is currently disabled.
    finally:
        # Always clean up, even if saving or extraction fails part-way.
        if os.path.exists(temp_path):
            os.remove(temp_path)

    return jsonify({'text': text})
|
| 52 |
+
|
| 53 |
+
# Local development entry point (`python app.py`). The Dockerfile starts the
# app through `flask run` instead, so this branch is not used in the container.
if __name__ == '__main__':
    # debug=True turns on Flask's debug mode (reloader and interactive
    # debugger); do not expose a server started this way to untrusted clients.
    app.run(debug=True)
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
flask
|
| 2 |
+
flask-cors
|
| 3 |
+
pytesseract
|
| 4 |
+
pdf2image
|