Spaces:
Sleeping
Sleeping
Pavan+2-at-244075126032
committed on
Commit
·
70c5320
1
Parent(s):
d25f036
Revert OCR changes
Browse files- app/__init__.py +7 -7
- app/api.py +8 -9
- app/routes/adhaarApi.py +60 -60
- app/routes/panApi.py +64 -64
- app/services/adhaarServices/adhaarDataExtractor.py +47 -47
- app/services/adhaarServices/ocr.py +37 -37
- app/services/panServices/panDataExtractor.py +80 -80
- app/services/panServices/panOcr.py +53 -53
- gunicorn.conf.py +1 -1
app/__init__.py
CHANGED
|
@@ -1,16 +1,16 @@
|
|
| 1 |
from flask import Flask
|
| 2 |
-
|
| 3 |
|
| 4 |
def create_app():
|
| 5 |
app = Flask(__name__)
|
| 6 |
from .api import ocr_bp
|
| 7 |
app.register_blueprint(ocr_bp)
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
|
| 16 |
return app
|
|
|
|
| 1 |
from flask import Flask
|
| 2 |
+
from ultralytics import YOLO
|
| 3 |
|
| 4 |
def create_app():
    """Create the Flask application.

    Registers the ``ocr_bp`` blueprint and attaches the two YOLO detectors to
    the app as ``app.models`` (keys ``'adhaarModel'`` and ``'panModel'``) so
    request handlers can fetch them through ``current_app.models``.

    Returns:
        The configured ``Flask`` instance.
    """
    flask_app = Flask(__name__)

    # Imported inside the factory rather than at module top
    # (presumably to avoid a circular import -- confirm).
    from .api import ocr_bp
    flask_app.register_blueprint(ocr_bp)

    with flask_app.app_context():
        # Load each model once at start-up instead of once per request.
        detectors = {}
        detectors['adhaarModel'] = YOLO('models/aadhaarYolov8.pt')
        detectors['panModel'] = YOLO('models/PanModel_v6.pt')  # Load additional models as needed
        flask_app.models = detectors

    return flask_app
|
app/api.py
CHANGED
|
@@ -1,22 +1,21 @@
|
|
| 1 |
-
# from app.services.UidaiServices.uidaiServices import generate_captcha, validate_aadhaar
|
| 2 |
from app.routes.uidaiServices import generate_captcha, validate_aadhaar
|
| 3 |
from flask import Blueprint, request, jsonify
|
| 4 |
import requests
|
| 5 |
import os
|
| 6 |
-
|
| 7 |
-
|
| 8 |
import uuid
|
| 9 |
ocr_bp = Blueprint('ocr', __name__)
|
| 10 |
session = requests.Session()
|
| 11 |
mode = os.getenv("PROJECT_MODE")
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
|
| 21 |
# Route to generate captcha
|
| 22 |
@ocr_bp.route('/generateCaptcha', methods=['POST'])
|
|
|
|
|
|
|
| 1 |
from app.routes.uidaiServices import generate_captcha, validate_aadhaar
|
| 2 |
from flask import Blueprint, request, jsonify
|
| 3 |
import requests
|
| 4 |
import os
|
| 5 |
+
from .routes.adhaarApi import ocrAdhaar
|
| 6 |
+
from .routes.panApi import ocrPan
|
| 7 |
import uuid
|
| 8 |
ocr_bp = Blueprint('ocr', __name__)
|
| 9 |
session = requests.Session()
|
| 10 |
mode = os.getenv("PROJECT_MODE")
|
| 11 |
|
| 12 |
+
@ocr_bp.route('/ocrPan', methods=['POST'])
def getResponse_Pan():
    """POST /ocrPan: delegate to ``ocrPan`` with the module-level mode and HTTP session."""
    pan_response = ocrPan(mode, session)
    return pan_response
|
| 15 |
|
| 16 |
+
@ocr_bp.route('/ocrAdhaar', methods=['POST'])
def getResponse_Adhaar():
    """POST /ocrAdhaar: delegate to ``ocrAdhaar`` with the module-level mode and HTTP session."""
    adhaar_response = ocrAdhaar(mode, session)
    return adhaar_response
|
| 19 |
|
| 20 |
# Route to generate captcha
|
| 21 |
@ocr_bp.route('/generateCaptcha', methods=['POST'])
|
app/routes/adhaarApi.py
CHANGED
|
@@ -1,70 +1,70 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
|
| 20 |
-
#
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
#
|
| 24 |
-
|
| 25 |
-
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
|
| 48 |
-
|
| 49 |
-
|
| 50 |
|
| 51 |
-
#
|
| 52 |
-
|
| 53 |
-
|
| 54 |
|
| 55 |
-
#
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
|
| 60 |
-
|
| 61 |
-
|
| 62 |
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from flask import request, jsonify, current_app
|
| 3 |
+
from PIL import Image, UnidentifiedImageError
|
| 4 |
+
from io import BytesIO
|
| 5 |
+
import base64
|
| 6 |
+
import requests
|
| 7 |
+
from ..services.adhaarServices.ocr import process_results
|
| 8 |
+
import io
|
| 9 |
+
def ocrAdhaar(mode, session):
    """Handle an Aadhaar OCR request for the current Flask request.

    The JSON body must carry the card image: a base64 string (optionally with
    a ``data:image/...;base64,`` prefix) under ``image`` when ``mode`` is
    ``"prod"``, or a downloadable URL under ``imgUrl`` when ``mode`` is
    ``"dev"``. The image is validated, passed to the ``adhaarModel`` detector
    stored on ``current_app.models`` and the detections are handed to
    ``process_results``.

    Args:
        mode: Deployment mode string, ``"prod"`` or ``"dev"``.
        session: ``requests.Session`` used to download ``imgUrl`` in dev mode.

    Returns:
        A ``(json_response, status_code)`` tuple. 400 for bad input or when
        ``process_results`` reports ``statusCode`` 400, 500 for download
        failures, unknown modes and unexpected errors, 200 otherwise.
    """
    try:
        print("API HIT ************* AADHAAROCR")
        # silent=True: malformed JSON or a wrong content type yields None
        # instead of raising, so the client gets the 400 below rather than the
        # catch-all 500 at the bottom.
        data = request.get_json(silent=True)
        if not data or not isinstance(data, dict):
            return jsonify({"error": "Invalid request payload"}), 400

        if mode == "prod":
            imgBuffer = data.get('image')
            if not imgBuffer:
                return jsonify({"error": "Image data/buffer is required"}), 400
            if not isinstance(imgBuffer, str):
                return jsonify({"error": "Image decoding failed: image must be a base64 string"}), 400

            # Strip an optional data-URI prefix such as 'data:image/png;base64,'.
            imgBuffer = re.sub(r"^data:image/.+;base64,", "", imgBuffer)
            # Adjust base64 string padding
            remainder = len(imgBuffer) % 4
            if remainder:
                imgBuffer += '=' * (4 - remainder)

            try:
                img_data = base64.b64decode(imgBuffer)
                img = Image.open(BytesIO(img_data))
                img.verify()  # Verify image format
                # verify() leaves the image object unusable, so open it again.
                img = Image.open(BytesIO(img_data))
            except ValueError as decode_err:
                # binascii.Error (bad base64) is a ValueError subclass.
                return jsonify({"error": f"Image decoding failed: {str(decode_err)}"}), 400
            except UnidentifiedImageError:
                return jsonify({"error": "Unable to identify image format."}), 400

        elif mode == "dev":
            img_url = data.get('imgUrl')
            if not img_url:
                return jsonify({"error": "Image URL is required"}), 400

            # A timeout keeps a stalled remote host from blocking the worker
            # until gunicorn kills it.
            response = session.get(img_url, timeout=30)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
            img.verify()  # Verify image format
            img = Image.open(BytesIO(response.content))  # Re-open image after verification

        else:
            return jsonify({"error": "Invalid mode configuration"}), 500

        # Check image format
        if img.format not in ['JPEG', 'JPG', 'PNG']:
            return jsonify({"error": "Invalid image format. Only JPG and PNG are supported."}), 400

        # Run detection
        model = current_app.models.get('adhaarModel')
        results = model.predict(source=img, save=False)
        extracted_data = process_results(results, img)

        if extracted_data.get('statusCode') == 400:
            return jsonify(extracted_data), 400

        return jsonify(extracted_data), 200
    except requests.RequestException as req_err:
        return jsonify({"error": f"Image download failed: {str(req_err)}"}), 500
    except UnidentifiedImageError:
        return jsonify({"error": "Unable to identify image format."}), 400
    except Exception as e:
        # Top-level boundary: log with traceback, return a generic message.
        current_app.logger.exception(f"Unexpected error: {str(e)}")
        return jsonify({"error": "An unexpected error occurred."}), 500
|
app/routes/panApi.py
CHANGED
|
@@ -1,74 +1,74 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
|
| 20 |
-
#
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
#
|
| 24 |
-
|
| 25 |
-
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
| 52 |
|
| 53 |
-
#
|
| 54 |
-
|
| 55 |
-
|
| 56 |
|
| 57 |
-
#
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
#
|
| 61 |
-
|
| 62 |
-
#
|
| 63 |
|
| 64 |
-
|
| 65 |
-
|
| 66 |
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from flask import request, jsonify, current_app
|
| 3 |
+
from PIL import Image, UnidentifiedImageError
|
| 4 |
+
from io import BytesIO
|
| 5 |
+
import base64
|
| 6 |
+
import requests
|
| 7 |
+
from ..services.panServices.panOcr import process_results
|
| 8 |
+
import io
|
| 9 |
+
def ocrPan(mode, session):
    """Handle a PAN-card OCR request for the current Flask request.

    The JSON body must carry the card image: a base64 string (optionally with
    a ``data:image/...;base64,`` prefix) under ``image`` when ``mode`` is
    ``"prod"``, or a downloadable URL under ``imgUrl`` when ``mode`` is
    ``"dev"``. The image is validated, passed to the ``panModel`` detector
    stored on ``current_app.models`` and the detections are handed to
    ``process_results``.

    Args:
        mode: Deployment mode string, ``"prod"`` or ``"dev"``.
        session: ``requests.Session`` used to download ``imgUrl`` in dev mode.

    Returns:
        A ``(json_response, status_code)`` tuple. 400 for bad input or when
        ``process_results`` reports ``statusCode`` 400, 500 for download
        failures, unknown modes and unexpected errors, 200 otherwise.
    """
    try:
        print("API HIT ************* PANOCR")
        # silent=True: malformed JSON or a wrong content type yields None
        # instead of raising, so the client gets the 400 below rather than the
        # catch-all 500 at the bottom.
        data = request.get_json(silent=True)
        if not data or not isinstance(data, dict):
            return jsonify({"error": "Invalid request payload"}), 400

        if mode == "prod":
            imgBuffer = data.get('image')
            if not imgBuffer:
                return jsonify({"error": "Image data/buffer is required"}), 400
            if not isinstance(imgBuffer, str):
                return jsonify({"error": "Image decoding failed: image must be a base64 string"}), 400

            # Strip an optional data-URI prefix such as 'data:image/png;base64,'.
            imgBuffer = re.sub(r"^data:image/.+;base64,", "", imgBuffer)
            # Adjust base64 string padding
            remainder = len(imgBuffer) % 4
            if remainder:
                imgBuffer += '=' * (4 - remainder)

            try:
                img_data = base64.b64decode(imgBuffer)
                img = Image.open(BytesIO(img_data))
                img.verify()  # Verify image format
                # verify() leaves the image object unusable, so open it again.
                img = Image.open(BytesIO(img_data))
            except ValueError as decode_err:
                # binascii.Error (bad base64) is a ValueError subclass.
                return jsonify({"error": f"Image decoding failed: {str(decode_err)}"}), 400
            except UnidentifiedImageError:
                return jsonify({"error": "Unable to identify image format."}), 400

        elif mode == "dev":
            img_url = data.get('imgUrl')
            if not img_url:
                return jsonify({"error": "Image URL is required"}), 400

            # A timeout keeps a stalled remote host from blocking the worker
            # until gunicorn kills it.
            response = session.get(img_url, timeout=30)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
            img.verify()  # Verify image format
            img = Image.open(BytesIO(response.content))  # Re-open image after verification

        else:
            return jsonify({"error": "Invalid mode configuration"}), 500

        # Check image format
        if img.format not in ['JPEG', 'JPG', 'PNG']:
            return jsonify({"error": "Invalid image format. Only JPG and PNG are supported."}), 400

        # Run detection
        model = current_app.models.get('panModel')
        results = model.predict(source=img, imgsz=680, iou=0.7, augment=True)
        extracted_data = process_results(results, img)

        if extracted_data.get('statusCode') == 400:
            return jsonify(extracted_data), 400

        return jsonify(extracted_data), 200
    except requests.RequestException as req_err:
        return jsonify({"error": f"Image download failed: {str(req_err)}"}), 500
    except UnidentifiedImageError:
        return jsonify({"error": "Unable to identify image format."}), 400
    except Exception as e:
        # Top-level boundary: log with traceback, return a generic message.
        current_app.logger.exception(f"Unexpected error: {str(e)}")
        return jsonify({"error": "An unexpected error occurred on api call."}), 500
|
app/services/adhaarServices/adhaarDataExtractor.py
CHANGED
|
@@ -1,59 +1,59 @@
|
|
| 1 |
-
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
| 5 |
|
| 6 |
-
|
| 7 |
|
| 8 |
-
|
| 9 |
-
#
|
| 10 |
-
|
| 11 |
-
|
| 12 |
|
| 13 |
-
|
| 14 |
|
| 15 |
-
#
|
| 16 |
-
|
| 17 |
|
| 18 |
-
#
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
|
| 23 |
-
#
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
|
| 29 |
-
#
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
|
| 36 |
-
#
|
| 37 |
-
|
| 38 |
-
|
| 39 |
|
| 40 |
-
#
|
| 41 |
-
|
| 42 |
-
|
| 43 |
|
| 44 |
-
#
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
|
| 59 |
-
|
|
|
|
| 1 |
+
import datetime
|
| 2 |
|
| 3 |
+
def extract_details(texts):
    """Classify OCR text lines into Aadhaar card fields.

    Each line is reduced to its value part (the text after the first ``:``
    when a colon is present) and assigned to the first field whose shape it
    matches: name (letters, spaces and dots only), year of birth (4 digits
    between 1900 and the current year, exclusive), date of birth
    (``dd/mm/yyyy`` or ``dd-mm-yyyy``), gender (``male``/``female``,
    case-insensitive) or Aadhaar number (12 digits, spaces ignored). Lines
    whose value part is blank are ignored so that a bare label such as
    ``"DOB:"`` cannot overwrite a name that was already found.

    Args:
        texts: Iterable of OCR line strings.

    Returns:
        A dict with ``statusCode`` (200 or 400), ``result`` (the field dict
        with keys ``name``, ``gender``, ``dob``, ``aadhaarNo``) and ``error``
        (empty on success, otherwise names the first missing field).
    """
    details = {'name': '', 'gender': '', 'dob': '', 'aadhaarNo': ''}

    current_year = datetime.datetime.now().year

    for text in texts:
        # Keep only the value part of "Label: value" lines.
        if ':' in text:
            text = text.split(':')[1].strip()

        cleaned_text = text.replace(':', '').strip()
        if not cleaned_text:
            # all() over an empty string is True, so a blank value would
            # otherwise be classified as a name and wipe the real one.
            continue

        # Drop leading punctuation before the gender comparison.
        cleaned_gender = cleaned_text.lstrip('.-/').strip()
        is_gender = cleaned_gender.lower() in ('male', 'female')
        digits_only = cleaned_text.replace(' ', '')

        # Name: only alphabets, spaces and dots, and not a gender word.
        if (all(char.isalpha() or char.isspace() or char == '.' for char in cleaned_text)
                and not is_gender):
            details['name'] = cleaned_text

        # Year of birth (format: yyyy).
        elif (len(cleaned_text) == 4 and
              cleaned_text.isdigit() and
              1900 < int(cleaned_text) < current_year):
            details['dob'] = cleaned_text

        # Date of birth (format: dd/mm/yyyy or dd-mm-yyyy).
        elif (len(cleaned_text) == 10 and
              cleaned_text[2] in ('/', '-') and
              cleaned_text[5] in ('/', '-') and
              cleaned_text.replace('/', '').replace('-', '').isdigit()):
            details['dob'] = cleaned_text

        # Gender (either 'Male' or 'Female').
        elif is_gender:
            details['gender'] = cleaned_gender.capitalize()

        # Aadhaar number (12 digits after removing spaces).
        elif digits_only.isdigit() and len(digits_only) == 12:
            details['aadhaarNo'] = cleaned_text

    # Report the first field (in dict order) that is still empty.
    missing_key = next((key for key, value in details.items() if value == ''), None)
    if missing_key is not None:
        return {
            'statusCode': 400,
            'result': details,
            'error': f'{missing_key} value is not found due to bad image.'
        }

    return {
        'statusCode': 200,
        'result': details,
        'error': ''
    }
|
app/services/adhaarServices/ocr.py
CHANGED
|
@@ -1,44 +1,44 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
|
| 31 |
-
|
| 32 |
-
|
| 33 |
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
|
| 38 |
-
|
| 39 |
-
#
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
|
|
|
| 1 |
+
from io import BytesIO
|
| 2 |
+
from ...utils.azureOCR import analyze_image
|
| 3 |
+
from ...utils.imageUtils import resize_if_needed, all_cropped_images_to_one_image
|
| 4 |
+
from app.services.adhaarServices.adhaarDataExtractor import extract_details
|
| 5 |
|
| 6 |
+
def process_results(results, img):
    """Crop the detected Aadhaar fields, OCR them and extract the details.

    For every detection at or above the confidence threshold (address boxes
    are skipped) the region is cropped from ``img``; only the
    highest-confidence crop per label is kept. The crops are merged into one
    image, sent to ``analyze_image`` and the recognised text lines are passed
    to ``extract_details``.

    Args:
        results: Iterable of detector results exposing ``boxes.xyxy``,
            ``boxes.cls`` and ``boxes.conf``.
        img: PIL image the detections refer to.

    Returns:
        The dict produced by ``extract_details``. When nothing usable was
        detected or the OCR returned no text block, that is its
        ``statusCode`` 400 result for an empty line list.
    """
    labels = ["aadharNo", "dob", "gender", "name", "address"]
    confidence_threshold = 0.3
    valid_formats = ["JPEG", "PNG", "BMP", "GIF", "TIFF"]
    input_image_format = img.format if img.format in valid_formats else "PNG"

    label_to_image = {}
    for result in results:
        for bbox, cls, conf in zip(result.boxes.xyxy, result.boxes.cls, result.boxes.conf):
            class_index = int(cls)
            # Skip class ids the label list does not know instead of raising IndexError.
            if not 0 <= class_index < len(labels):
                print(f"Warning: Class index {cls} is out of range. Skipping this bbox.")
                continue

            label = labels[class_index]
            print(label, conf)
            if conf < confidence_threshold or label == "address":
                continue

            x1, y1, x2, y2 = map(int, bbox.tolist())
            crop_img = resize_if_needed(img.crop((x1, y1, x2, y2)))

            # Keep only the most confident crop for each label.
            if label not in label_to_image or label_to_image[label][1] < conf:
                label_to_image[label] = (crop_img, conf)

    if not label_to_image:
        # Nothing to OCR: report every field as missing rather than merging an
        # empty image list (NOTE(review): helper behaviour on [] is not visible here).
        return extract_details([])

    # Alphabetical label order, as before.
    cropped_images = [label_to_image[label][0] for label in sorted(label_to_image)]
    final_image = all_cropped_images_to_one_image(cropped_images, separator_image_path='app/utils/seprator3.png')

    buffer = BytesIO()
    final_image.save(buffer, format=input_image_format)

    response = analyze_image(buffer.getvalue(), input_image_format)
    # The OCR text holds personal data (name, Aadhaar number): do not print it.
    blocks = response['readResult']['blocks']
    lines = blocks[0]['lines'] if blocks else []
    texts = [line['text'] for line in lines]
    return extract_details(texts)
|
app/services/panServices/panDataExtractor.py
CHANGED
|
@@ -1,95 +1,95 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
#
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
|
| 8 |
-
#
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
|
| 26 |
|
| 27 |
|
| 28 |
|
| 29 |
-
#
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
|
| 41 |
-
#
|
| 42 |
-
|
| 43 |
-
|
| 44 |
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
|
| 49 |
|
| 50 |
-
#
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
|
| 58 |
-
#
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
|
| 66 |
-
#
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
#
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
#
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
|
| 82 |
-
#
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
|
| 88 |
-
#
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
|
| 95 |
-
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
def filter_array(arr):
    """Return the items of *arr* made up only of ASCII letters, dots and whitespace.

    Any item containing another character (digits, punctuation, non-ASCII
    text) is dropped.
    """
    # The former "alphanumeric + special" and "numeric only" patterns are both
    # subsumed by this one: each of them requires at least one such character.
    disallowed = re.compile(r'[^a-zA-Z.\s]')
    return [item for item in arr if not disallowed.search(item)]


def extract_panData(data):
    """Extract PAN number, name, father's name and date of birth from OCR lines.

    Lines are stripped; blank lines, known label/noise strings and lines that
    contain both a digit and one of ``!@#$%^&*(),?":{}|<>`` are discarded.
    The first line shaped like a PAN (5 letters, 4 digits, 1 letter) and the
    first line shaped like ``dd/mm/yyyy`` or ``dd-mm-yyyy`` are taken as
    ``panNo`` and ``dob``. Of the remaining letter-only lines the first is the
    name and the second the father's name.

    Args:
        data: Iterable of OCR line strings.

    Returns:
        A dict with ``statusCode`` (200 or 400), ``error`` (empty on success,
        otherwise names the first missing field) and ``data`` holding
        ``panNo``, ``name``, ``fatherName`` and ``dob``.
    """
    # Printed labels and their common OCR misreadings; matched exactly.
    unwanted_words = {"Name", "/Name", 'Permanent', 'Account', 'Number', 'Card', 'नाम', '/Name',
                      "पिता का नाम", 'नाम / Name', "पिता का नाम/ Father's Name", '414 / Name', 'पिता का नाम / Fath',
                      "VIT VE Hra / Father's Nama", 'पिता का नाम/ Fal', 'पिता का नाम / Fathe', "पिता का नाम / Father's Na",
                      'जन्म की तारीख /।', 'जन्म का ताराख', "पिता का नाम/ Father's Nam", 'नाम /Name', "पिता का नाम / Father's Name",
                      'जन्म का वाराज़', 'Date of Birth', 'Permanent Account Number Card', "Date of Birth", "/Date of Birth",
                      "Permanent Account Number", "Father's Name", "14 /Name", "/Father's Name", 'HTH / Name', "inent Account Number", "anent Account Number C", "Permanent Account Number Car",
                      'ugr Name'}

    # Initialize result object
    result = {
        "statusCode": 200,
        "error": '',
        "data": {
            "panNo": '',
            "name": '',
            "fatherName": '',
            "dob": ''
        }
    }

    combination_pattern = re.compile(r'(?=.*[0-9])(?=.*[!@#$%^&*(),?":{}|<>])')

    # Clean the input: strip, drop blanks, labels and digit+symbol noise.
    cleaned_data = []
    for raw_item in data:
        item = raw_item.strip()
        if not item:
            # A blank line would otherwise be taken as a (father's) name.
            continue
        if item in unwanted_words or combination_pattern.search(item):
            continue
        cleaned_data.append(item)

    # Check and extract PAN number
    pan_pattern = re.compile(r'^[A-Z]{5}\s*[0-9]{4}\s*[A-Z]$')
    pan_no = next((item for item in cleaned_data if pan_pattern.match(item)), None)
    if pan_no is not None:
        result["data"]["panNo"] = pan_no
        cleaned_data.remove(pan_no)

    # Check and extract date of birth
    dob_pattern = re.compile(r'^\d{2}[-/]\d{2}[-/]\d{4}$')
    dob = next((item for item in cleaned_data if dob_pattern.match(item)), None)
    if dob is not None:
        result["data"]["dob"] = dob
        cleaned_data.remove(dob)

    # Only letter/dot/whitespace lines can be names.
    candidates = filter_array(cleaned_data)
    if len(candidates) == 2:
        # Exactly two values left: assume name followed by father's name.
        result["data"]["name"], result["data"]["fatherName"] = candidates
    else:
        # Otherwise take the first two candidates, each only if it looks like a name.
        name_pattern = re.compile(r'^[A-Za-z .]+$')
        if len(candidates) > 0 and name_pattern.match(candidates[0]):
            result["data"]["name"] = candidates[0]
        if len(candidates) > 1 and name_pattern.match(candidates[1]):
            result["data"]["fatherName"] = candidates[1]

    # Report the first field (in dict order) that is still empty.
    for key, value in result["data"].items():
        if value == '':
            result["statusCode"] = 400
            result["error"] = f"{key} value is not found due to bad image."
            break

    return result
|
app/services/panServices/panOcr.py
CHANGED
|
@@ -1,66 +1,66 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
#
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
|
| 14 |
-
|
| 15 |
|
| 16 |
-
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
#
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
|
| 35 |
-
#
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
|
| 42 |
-
#
|
| 43 |
-
|
| 44 |
|
| 45 |
-
#
|
| 46 |
-
|
| 47 |
-
|
| 48 |
|
| 49 |
-
|
| 50 |
-
|
| 51 |
|
| 52 |
-
#
|
| 53 |
-
|
| 54 |
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
|
|
|
| 1 |
+
from io import BytesIO
|
| 2 |
+
from ...utils.azureOCR import analyze_image
|
| 3 |
+
from ...utils.imageUtils import resize_if_needed, all_cropped_images_to_one_image
|
| 4 |
+
from .panDataExtractor import extract_panData
|
| 5 |
+
# from collections import defaultdict
|
| 6 |
|
| 7 |
+
def process_results(results, img):
    """Crop the detected PAN-card fields, OCR them and extract the details.

    For every detection at or above the confidence threshold the region is
    cropped from ``img``; only the highest-confidence crop per label is kept.
    The crops are merged (in label-index order) into one image, sent to
    ``analyze_image`` and the recognised text lines are passed to
    ``extract_panData``.

    Args:
        results: Iterable of detector results exposing ``boxes.xyxy``,
            ``boxes.cls`` and ``boxes.conf``.
        img: PIL image the detections refer to.

    Returns:
        The dict produced by ``extract_panData``.

    Raises:
        ValueError: If no detection passed the confidence threshold.
    """
    # Index in this tuple == detector class id.
    labels = ("pan_num", "name", "father", "dob")
    confidence_threshold = 0.3
    valid_formats = ["JPEG", "PNG", "BMP", "GIF", "TIFF"]
    input_image_format = img.format if img.format in valid_formats else "PNG"

    best_crops = {}  # label -> (image, confidence) of the best detection so far

    for result in results:
        for bbox, cls, conf in zip(result.boxes.xyxy, result.boxes.cls, result.boxes.conf):
            # Ensure the class index is within the bounds of the label list
            if int(cls) >= len(labels):
                print(f"Warning: Class index {cls} is out of range. Skipping this bbox.")
                continue

            label = labels[int(cls)]
            print(label, conf)
            if conf < confidence_threshold:
                continue

            x1, y1, x2, y2 = map(int, bbox.tolist())
            crop_img = resize_if_needed(img.crop((x1, y1, x2, y2)))
            # The crop is no longer written to temp_<label>.png: that debug dump
            # was shared by concurrent requests and left ID-card data on disk.

            # Replace old crop if new one has higher confidence
            if label not in best_crops or conf > best_crops[label][1]:
                best_crops[label] = (crop_img, conf)

    if not best_crops:
        raise ValueError("No images were cropped.")

    # Concatenate in ascending label-index order.
    cropped_images = [best_crops[label][0] for label in labels if label in best_crops]

    final_image = all_cropped_images_to_one_image(cropped_images, separator_image_path='app/utils/seprator3.png')
    buffer = BytesIO()
    final_image.save(buffer, format=input_image_format)

    response = analyze_image(buffer.getvalue(), input_image_format)
    # The OCR response holds personal data (name, PAN): do not print it.
    blocks = response['readResult']['blocks']
    lines = blocks[0]['lines'] if blocks else []
    texts = [line['text'] for line in lines]
    return extract_panData(texts)
|
gunicorn.conf.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
# gunicorn.conf.py
|
| 2 |
|
| 3 |
bind = '0.0.0.0:7860'
|
| 4 |
-
workers =
|
| 5 |
timeout = 120
|
| 6 |
loglevel = 'info'
|
| 7 |
accesslog = '-'
|
|
|
|
| 1 |
# gunicorn.conf.py
|
| 2 |
|
| 3 |
bind = '0.0.0.0:7860'
|
| 4 |
+
workers = 1
|
| 5 |
timeout = 120
|
| 6 |
loglevel = 'info'
|
| 7 |
accesslog = '-'
|