Spaces:

muryshev
/

cb-api

Sleeping

App Files Files Community

muryshev committed on Mar 12, 2024

Commit

eeebb29

1 Parent(s): 1554eeb

update

Browse files

Files changed (4) hide show

Dockerfile +8 -5
app.py +24 -3
lib/ocr_1.py +236 -0
requirements.txt +5 -1

Dockerfile CHANGED Viewed

@@ -8,21 +8,24 @@ ENV APP_HOME /app
 # Install Tesseract and its dependencies
 RUN apt-get update && apt-get install --no-install-recommends -y \
-    tesseract-ocr \
-    tesseract-ocr-rus poppler-utils && \
     rm -rf /var/lib/apt/lists/*
 # Create and set the working directory
 RUN mkdir /var/www
 RUN mkdir /var/www/tmp
-RUN chmod +w /var/www/tmp
 ENV HOME /var/www
 WORKDIR /var/www
 COPY . /var/www
 RUN pip install --no-cache-dir -r requirements.txt
 EXPOSE 7860
 # Run the Flask application
-CMD flask run --host=0.0.0.0 --port=7860

 # Install Tesseract (with Russian language data), Poppler utilities and the OpenCV Python bindings
 RUN apt-get update && apt-get install --no-install-recommends -y \
+    tesseract-ocr tesseract-ocr-rus poppler-utils python3-opencv && \
     rm -rf /var/lib/apt/lists/*
 # Create and set the working directory
 RUN mkdir /var/www
 RUN mkdir /var/www/tmp
+RUN chmod a+w /var/www/tmp
+RUN groupadd -r flaskuser && useradd -r -g flaskuser flaskuser
 ENV HOME /var/www
 WORKDIR /var/www
 COPY . /var/www
 RUN pip install --no-cache-dir -r requirements.txt
+USER flaskuser
 EXPOSE 7860
 # Run the Flask application
+CMD flask run --host=0.0.0.0 --port=7860

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ from flask import Flask, request, jsonify
 import pytesseract
 from pdf2image import convert_from_bytes
 from flask_cors import CORS
 os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract-ocr/5/tessdata'
@@ -13,7 +14,7 @@ UPLOAD_FOLDER = './tmp'
 app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
 # Endpoint for uploading PDF and extracting text
-@app.route('/upload', methods=['POST'])
 def upload_file():
     # Check if the post request has the file part
     if 'file' not in request.files:
@@ -41,14 +42,34 @@ def upload_file():
         #     text += pytesseract.image_to_string(img, lang='rus')
-        # присрать сюда вызов библиотеки Андрея с temp_path
         os.remove(temp_path)
-        return jsonify({'text': text})
     else:
         return jsonify({'error': 'File must be a PDF'})
 if __name__ == '__main__':
     app.run(debug=True)

 import pytesseract
 from pdf2image import convert_from_bytes
 from flask_cors import CORS
+from lib import ocr_1
 os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract-ocr/5/tessdata'
 app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
 # Endpoint for uploading PDF and extracting text
+@app.route('/recognize', methods=['POST'])
 def upload_file():
     # Check if the post request has the file part
     if 'file' not in request.files:
         #     text += pytesseract.image_to_string(img, lang='rus')
+        docs_info = ocr_1.processSingleFile(temp_path)
         os.remove(temp_path)
+        return jsonify(docs_info)
     else:
         return jsonify({'error': 'File must be a PDF'})
+# Endpoint for uploading PDF and extracting text
+@app.route('/analize', methods=['POST'])
+def analize():
+    # Get the text data from the request
+    text_data = request.json.get('text')
+    # Process the text data and generate the JSON response
+    result = []
+    # Example processing: Split the text into two groups
+    group1 = [{"название параметра группы 1": word} for word in text_data.split()[:len(text_data)//2]]
+    group2 = [{"название параметра группы 2": word} for word in text_data.split()[len(text_data)//2:]]
+    # Append the groups to the result list
+    result.append(group1)
+    result.append(group2)
+    # Return the JSON response
+    return jsonify(result)
 if __name__ == '__main__':
     app.run(debug=True)

lib/ocr_1.py ADDED Viewed

	@@ -0,0 +1,236 @@

+from PIL import Image, ImageFilter
+import cv2
+import pytesseract
+from pytesseract import Output
+from os import listdir
+from os.path import isfile, join
+import numpy as np
+import json
+import matplotlib.pyplot as plt
+from pdf2image import convert_from_path
+from matplotlib import pyplot as plt
+import re
+def processFiles(pdfs, verbose = False) :
+    """Recognize text in PDF files and group their pages into documents.
+
+    Every page of every PDF is OCR'd with Tesseract (lang='rus'). A fixed
+    region of each page is additionally examined for an address box: a page
+    on which a box is detected starts a new document. A document whose
+    first page's box text mentions one of the names in CBnames is labelled
+    'Обращение'; every other document is labelled 'Сопроводительное письмо'.
+
+    Args:
+        pdfs: list of PDF file paths (each is passed to
+            pdf2image.convert_from_path). The last four characters of each
+            path are dropped to build display names - presumably a '.pdf'
+            suffix; verify against callers.
+        verbose: when true, print diagnostics and show matplotlib figures
+            with the detected contours drawn over the cropped region.
+
+    Returns:
+        A list with one entry per input file; each entry is a list of dicts
+        with the keys 'Тип документа' and 'Текст документа'.
+    """
+    # Render every PDF into page images: one list of pages per file.
+    images_per_pdf_2d = [convert_from_path(file) for file in pdfs]
+    # Flattened list of all pages of all files, plus bookkeeping that maps
+    # each file to the indices of its pages in that flat list.
+    images_per_pdf = []
+    docfilenames = []
+    pagenames = []
+    fileindices = []
+    for i in range(len(images_per_pdf_2d)) :
+        docfilenames.append(pdfs[i][:-4])
+        pageindices = []
+        for j in range(len(images_per_pdf_2d[i])) :
+            images_per_pdf.append(images_per_pdf_2d[i][j])
+            pagenames.append(pdfs[i][:-4] + '_page_' + str(j))
+            pageindices.append(len(pagenames) - 1)
+            # print(i, j, len(pagenames) - 1, pagenames[-1])
+        fileindices.append(pageindices)
+    # Grayscale crop of a fixed pixel rectangle on every page; this is the
+    # region later checked for an address box.
+    # NOTE(review): the coordinates presumably assume the default rendering
+    # resolution of convert_from_path and one page format - TODO confirm.
+    gray_images_per_pdf_cropped = []
+    for i in range(len(images_per_pdf)) :
+        image = images_per_pdf[i]
+        crop = image.convert("L").crop((
+                                    750, 150,      # left top point
+                                    1654, 850       # right bottom point
+                                    ))
+        gray_images_per_pdf_cropped.append(crop)
+    # OCR of the cropped region (used for the CB-name check) and of the
+    # whole page (used for the returned document text).
+    texts = [pytesseract.image_to_string(image, lang='rus') for image in gray_images_per_pdf_cropped]
+    fulltexts = [pytesseract.image_to_string(image, lang='rus') for image in images_per_pdf]
+    cropped_images = gray_images_per_pdf_cropped
+    # NOTE(review): raises IndexError when no pages were produced (empty
+    # `pdfs` or PDFs without pages) - confirm whether callers can hit this.
+    init_size = cropped_images[0].size
+    # Downscale each crop by 4 (size taken from the first crop), binarize at
+    # brightness 220, then apply a median filter and a min filter.
+    thresh_imgs = [
+                image.resize(
+                    (init_size[0] //4, init_size[1] // 4)
+                    ).point(
+                        lambda x: 0 if x < 220 else 255
+                        ).filter(
+                            ImageFilter.MedianFilter(5)
+                            ).filter(
+                                ImageFilter.MinFilter(15) #15
+                                )  for i,(name,image) in enumerate(zip(pagenames, cropped_images))
+    ]
+    masks = thresh_imgs
+    masks_arr = [np.array(img) for img in masks]
+    mask_shape = masks_arr[0].shape
+    # Height/width (in pixels of the downscaled mask) of the bands that are
+    # inspected one at a time below.
+    str_size = 7
+    masks = []
+    masks_bw = []
+    for name, mask in zip(pagenames, masks_arr):
+        cleaned_mask = mask.copy()
+        # Horizontal bands: a band whose mean brightness is below 49 or
+        # above 160 is overwritten with white (255).
+        # NOTE(review): the loop variable `iter` shadows the builtin iter().
+        for iter in range(mask_shape[0] // str_size):
+            temp_mean = int(cleaned_mask[iter*str_size : iter*str_size + str_size, :].mean())
+            if (temp_mean < 49) or (temp_mean > 160):
+                cleaned_mask[iter*str_size : iter*str_size + str_size, :] = 255
+        # Vertical bands: a band whose mean brightness exceeds the threshold
+        # is overwritten with white; the last band may be narrower than
+        # str_size and is handled by the first branch.
+        vertical_threshold = 200
+        for i in range(mask_shape[1] // str_size + 1):
+            if (i*str_size + str_size) > mask_shape[1]:
+                temp_mean_vertical = int(cleaned_mask[:, i*str_size : mask_shape[1]].mean())
+                if temp_mean_vertical > vertical_threshold:
+                    cleaned_mask[:, i*str_size : mask_shape[1]] = 255
+            else:
+                temp_mean_vertical = int(cleaned_mask[:, i*str_size : i*str_size + str_size].mean())
+                if temp_mean_vertical > vertical_threshold:
+                    cleaned_mask[:, i*str_size : i*str_size + str_size] = 255
+        # Second filtering pass with larger kernels on the cleaned mask.
+        image = Image.fromarray(cleaned_mask).filter(
+                                        ImageFilter.MedianFilter(13)
+                                    ).filter(
+                                        ImageFilter.MinFilter(25) #15
+                                    )
+        masks.append(image)
+        # 1-bit version of the mask, used for the "is anything left" test.
+        masks_bw.append(image.convert('1'))
+    masks_bw_arr = [np.array(img) for img in masks_bw]
+    # check which pages have address box: if there is no address box the mask is empty
+    # (presumably the arrays are boolean, so ~ selects the dark pixels - TODO confirm)
+    addressexists = [bool((~mask_bw).sum()) for mask_bw in masks_bw_arr]
+    # this is a list of CB names that may be used in address
+    CBnames = [
+        'цб рф',
+        'центральный банк',
+        'центрального банка',
+        'банк россии',
+        'банка россии',
+    ]
+    # check which pages have address box addressed to CB
+    # (case-insensitive substring match against the OCR text of the crop)
+    toCB = []
+    for i in range(len(addressexists)) :
+        iftoCB = False
+        for j in range(len(CBnames)) :
+            if addressexists[i] and CBnames[j] in texts[i].lower() :
+                iftoCB = True
+                break
+        toCB.append(iftoCB)
+    # build 3-level list: file -> doc -> page
+    # A page with an address box closes the current document and starts a
+    # new one whose type is that page's toCB flag; pages before the first
+    # address box form a document with type False.
+    docindices = []
+    doctypes = []
+    for i in range(len(fileindices)) :
+        docs = []
+        types = []
+        pages = []
+        doctype = False
+        for j in range(len(fileindices[i])) :
+            index = fileindices[i][j]
+            ifaddress = addressexists[index]
+            iftoCB = toCB[index]
+            if ifaddress :
+                if len(pages) > 0 :
+                    docs.append(pages)
+                    types.append(doctype)
+                pages = []
+                doctype = iftoCB
+            pages.append(index)
+        # The trailing document is always appended, even when `pages` is empty.
+        docs.append(pages)
+        types.append(doctype)
+        docindices.append(docs)
+        doctypes.append(types)
+    cropped = cropped_images
+    orig_size = cropped[0].size
+    # Scale the masks back to the size of the (first) crop; they are only
+    # used by the verbose visualisation below.
+    masks = [mask.convert('L').resize((orig_size)) for mask in masks]
+    if verbose :
+        for i in range(len(masks)) :
+            img = np.array(masks[i])
+            out = np.array(cropped[i])
+            # Select the dark mask pixels (values 0..12) and outline them on
+            # a copy of the cropped page.
+            bw = cv2.inRange(img, 0, 12)
+            contours, hierarchy = cv2.findContours(bw, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
+            aaa = cv2.drawContours(out, contours, -1, (0, 255, 0), 5,  cv2.LINE_AA, hierarchy, 1)
+            print()
+            print(pagenames[i])
+            print('Address exists :', addressexists[i])
+            print('To CB :', toCB[i])
+            # if addressflags[i] :
+            #     if toCB[i] :
+            #         print('text :', texts[i])
+            plt.imshow(Image.fromarray(aaa))
+            plt.show()
+    # print recognized text with marks: file - > doc # and doc type -> page number and text
+    # (also builds the returned structure: one list of document dicts per file)
+    docs_info = []
+    for i in range(len(docindices)) :
+        docs = []
+        if verbose :
+            print('File =', docfilenames[i])
+        for j in range(len(docindices[i])) :
+            doc = {}
+            # Default label; replaced when the document is addressed to the CB.
+            doctype = 'Сопроводительное письмо'
+            if doctypes[i][j] :
+                doctype = 'Обращение'
+            doc['Тип документа'] = doctype
+            text = ''
+            if verbose :
+                print('Doc =', j, 'Type =', doctype)
+            # Concatenate the full-page OCR text of all pages of the document.
+            for k in range(len(docindices[i][j])) :
+                index = docindices[i][j][k]
+                text += fulltexts[index]
+                if verbose :
+                    print('Page =', pagenames[index])
+                    print(fulltexts[index])
+                    print('--- end of page ---')
+                    print()
+            # Drop spaces that follow a line break, then collapse runs of
+            # line breaks into a single one.
+            text = re.sub(r'\n +', r'\n', text)
+            text = re.sub(r'\n+', r'\n', text)
+            doc['Текст документа'] = text
+            docs.append(doc)
+        docs_info.append(docs)
+    # Diagnostics only: in verbose mode print the address-box text of every
+    # page addressed to the CB; nothing here affects the return value.
+    for i in range(len(docindices)) :
+        for j in range(len(docindices[i])) :
+            for k in range(len(docindices[i][j])) :
+                index = docindices[i][j][k]
+                if toCB[index] :
+                    if verbose :
+                        print('Page =', pagenames[index])
+                        print(texts[index].strip())
+                        print('------------------------')
+                        print()
+    return docs_info
+def processSingleFile(file) :
+    return processFiles([file])
+# docs_info =
+#  [
+#    {
+#      'Field name' : 'Field text',
+#      ...
+#    },
+#    ...
+#  ]
+# That is, an array of the documents contained in a file; each document is a dict 'Field name' : 'Field text' (currently 2 fields per document: 'Тип документа' and 'Текст документа'). Note: processFiles returns a list holding one such array per input file.

requirements.txt CHANGED Viewed

@@ -1,4 +1,8 @@
 flask
 flask-cors
 pytesseract
-pdf2image

 flask
 flask-cors
 pytesseract
+pdf2image
+opencv-python
+matplotlib
+numpy
+pillow