# Page header captured along with this file (not code):
#   seta / app.py
#   sairamtelagamsetti
#   Update app.py
#   223b677 verified
import base64
import io
import os
import re

import cv2
import numpy as np
import pytesseract
from flask import Flask, request, jsonify
from flask_cors import CORS
from PIL import Image
# Point pytesseract at the system-installed Tesseract binary
# (this is the path used on Hugging Face Spaces).
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# Create the Flask application and allow cross-origin requests on all routes.
app = Flask(__name__)
CORS(app)
def extract_text_from_image(image_data):
    """
    Extract text from an encoded image using OCR.

    Args:
        image_data: Raw bytes of an encoded image file (the caller is
            responsible for any base64 decoding).

    Returns:
        A ``(text, error)`` tuple. On success ``text`` is the OCR output with
        every line stripped and empty lines removed, and ``error`` is ``None``.
        If the image is missing or cannot be decoded, ``text`` is ``None`` and
        ``error`` is a message string.
    """
    # An empty buffer makes the decoder raise rather than return None, so
    # report it through the same error channel as any other undecodable image.
    if image_data is None or len(image_data) == 0:
        return None, "Error: Image could not be decoded."

    # View the raw bytes as a uint8 array, which is what cv2.imdecode consumes
    img_data = np.frombuffer(image_data, np.uint8)
    img = cv2.imdecode(img_data, cv2.IMREAD_COLOR)
    if img is None:
        return None, "Error: Image could not be decoded."

    # Convert image to grayscale for better OCR results
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Perform OCR
    extracted_text = pytesseract.image_to_string(gray_img)

    # Strip each line and drop the ones that end up empty
    stripped_lines = (line.strip() for line in extracted_text.splitlines())
    cleaned_text = "\n".join(line for line in stripped_lines if line)
    return cleaned_text, None
def process_unstructured_data(input_text):
    """
    Turn free-form OCR text into a single line of "<attribute> : <value>" pairs.

    Each attribute is looked up with a case-insensitive regular expression;
    the first capture group (stripped) becomes the value. Attributes with no
    match are reported as "Not Found". The pairs are emitted in the order
    listed below and joined with single spaces.
    """
    # (attribute label, regex whose first group captures the value)
    field_patterns = (
        ("Product Name", r"Product\s*[:;-]\s*(.*?)(?=\||Total|\n)"),
        ("Model", r"Model\s*[:;-]\s*(.*?)\s*kW"),
        ("kW / HP", r"kW\s*/\s*HP\s*:\s*([\d./]+)"),
        ("Phase", r"Phase\s*:\s*(\w+)"),
        ("Speed", r"Speed\s*:\s*(\d+\s*RPM)"),
        ("Net Quantity", r"Net\s*Quantity\s*:\s*(\S+)"),
        ("Gross Weight", r"Gross\s*Weight\s*:\s*([\d.]+\s*\w+)"),
        ("Month & Year of MFG", r"Month\s*&\s*Year\s*of\s*MFG\s*:\s*(\w+\s*\d+)"),
        ("MRP", r"MRP.*?([\d.,]+\s*\(Inclusive\s*of\s*.*?\))"),
        ("Serial No.", r"Serial\s*No\s*[:;-]\s*(.*?)\|"),
        ("Manufacturer", r"Sold\s*By\s*[:;-]\s*(.*?)(?=,|\n)"),
        ("Address", r"DELIVERY\s*ADDRESS[:;-]\s*(.*?)(?=\s*Courler|\n)"),
        ("Customer Care", r"Customer\s*Care\s*[:;-]\s*(\+?\d+)"),
        ("Email", r"Email\s*[:;-]\s*(\S+)"),
        ("Name", r"Name\s*[:;-]\s*(.*?)(?=Model|Date|$)"),
        ("Date", r"Date\s*[:;-]\s*([0-9-/]+)"),
        ("Tracking ID", r"Courler\s*AWB\s*No\s*[:;-]\s*(\S+)"),
        ("GSTIN", r"GSTIN\s*No\s*[:;-]\s*([A-Z0-9]+)"),
    )

    def describe(label, regex):
        # Render one attribute as "<label> : <value>" or "<label> : Not Found"
        found = re.search(regex, input_text, re.IGNORECASE)
        if found and found.groups():
            return f"{label} : {found.group(1).strip()}"
        return f"{label} : Not Found"

    # Combine all rendered attributes into one paragraph
    return " ".join(describe(label, regex) for label, regex in field_patterns)
@app.route('/run/predict', methods=['POST'])
def process_image():
    """
    Run OCR on a posted image and return the structured text.

    Expects a JSON object with an ``image`` key holding a base64-encoded
    image, either bare or as a data URL (``<prefix>,<base64 payload>``).

    Returns:
        A ``(response, status)`` pair:
        - 200 with ``{"structured_output": ...}`` on success.
        - 400 with ``{"error": ...}`` when the body, the ``image`` field, the
          base64 payload, or the image itself is invalid.
        - 500 with ``{"error": ...}`` on any unexpected failure.
    """
    try:
        # silent=True gives None for a missing or malformed JSON body instead
        # of raising, so the client gets a clear 400 below.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({"error": "Request body must be a JSON object."}), 400

        image_data = data.get('image')
        if not isinstance(image_data, str) or not image_data.strip():
            return jsonify({"error": "Missing 'image' field in request body."}), 400

        # Drop the data URL prefix when there is one; accept bare base64 too
        _, separator, payload = image_data.partition(',')
        if not separator:
            payload = image_data

        try:
            image_bytes = base64.b64decode(payload)
        except (ValueError, TypeError):
            # binascii.Error (bad padding / bad characters) is a ValueError
            return jsonify({"error": "Image data is not valid base64."}), 400

        # Extract text from the image
        extracted_text, error = extract_text_from_image(image_bytes)
        if error:
            return jsonify({"error": error}), 400

        # Process the extracted text into structured data
        structured_output = process_unstructured_data(extracted_text)

        # Return the processed structured data
        return jsonify({"structured_output": structured_output}), 200
    except Exception as e:
        # Top-level boundary: record the traceback, then report the failure
        app.logger.exception("Unhandled error while processing image")
        return jsonify({"error": str(e)}), 500
if __name__ == '__main__':
    # The server listens on all interfaces, so the interactive debugger must
    # not be on by default: it lets anyone who can reach it execute code.
    # Opt in explicitly for local development with FLASK_DEBUG=1.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(host='0.0.0.0', port=7860, debug=debug_enabled)