Spaces:

sairamtelagamsetti
/

seta

Sleeping

App Files Files Community

seta / app.py

sairamtelagamsetti

Update app.py

223b677 verified about 1 year ago

raw

history blame contribute delete

3.78 kB

	import base64
	import io
	import re

	import cv2
	import numpy as np
	import pytesseract
	from flask import Flask, request, jsonify
	from flask_cors import CORS
	from PIL import Image

	# Flask application object; the route handler defined below is registered on it.
	app = Flask(__name__)
	CORS(app) # Enable CORS for all routes

	# Point pytesseract at an explicit Tesseract executable path.
	pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract" # Path for Hugging Face Spaces

	def extract_text_from_image(image_data):
	    """Run OCR on an encoded image and return the cleaned-up text.

	    Args:
	        image_data: Raw encoded image bytes (the still-compressed file
	            contents, not base64 text), as consumed by ``cv2.imdecode``.

	    Returns:
	        A ``(text, error)`` tuple. On success ``text`` is the OCR output
	        with every line stripped and blank lines removed, and ``error`` is
	        ``None``. When the input is empty or cannot be decoded, ``text`` is
	        ``None`` and ``error`` is a human-readable message.
	    """
	    # np.frombuffer rejects None, and cv2.imdecode can raise on an empty
	    # buffer (depending on the OpenCV version) instead of returning None, so
	    # report missing data through the same error channel as a bad image.
	    if not image_data:
	        return None, "Error: Image could not be decoded."

	    # View the bytes as a uint8 array so OpenCV can decode them.
	    encoded = np.frombuffer(image_data, np.uint8)
	    img = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
	    if img is None:
	        return None, "Error: Image could not be decoded."

	    # OCR runs on the grayscale version of the image.
	    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
	    extracted_text = pytesseract.image_to_string(gray_img)

	    # Strip each line and drop the ones that end up empty.
	    stripped_lines = (line.strip() for line in extracted_text.splitlines())
	    cleaned_text = "\n".join(line for line in stripped_lines if line)

	    return cleaned_text, None

	# Attribute label -> compiled, case-insensitive pattern whose first group
	# captures the attribute's value. Insertion order is the output order.
	# NOTE(review): "Courler" is kept exactly as written; presumably it mirrors
	# how the OCR reads "Courier" on these labels -- confirm before changing.
	_ATTRIBUTE_PATTERNS = {
	    label: re.compile(expression, re.IGNORECASE)
	    for label, expression in (
	        ("Product Name", r"Product\s*[:;-]\s*(.*?)(?=\||Total|\n|$)"),
	        ("Model", r"Model\s*[:;-]\s*(.*?)\s*kW"),
	        ("kW / HP", r"kW\s*/\s*HP\s*:\s*([\d./]+)"),
	        ("Phase", r"Phase\s*:\s*(\w+)"),
	        ("Speed", r"Speed\s*:\s*(\d+\s*RPM)"),
	        ("Net Quantity", r"Net\s*Quantity\s*:\s*(\S+)"),
	        ("Gross Weight", r"Gross\s*Weight\s*:\s*([\d.]+\s*\w+)"),
	        ("Month & Year of MFG", r"Month\s*&\s*Year\s*of\s*MFG\s*:\s*(\w+\s*\d+)"),
	        ("MRP", r"MRP.*?([\d.,]+\s*\(Inclusive\s*of\s*.*?\))"),
	        ("Serial No.", r"Serial\s*No\s*[:;-]\s*(.*?)\|"),
	        ("Manufacturer", r"Sold\s*By\s*[:;-]\s*(.*?)(?=,|\n|$)"),
	        ("Address", r"DELIVERY\s*ADDRESS\s*[:;-]\s*(.*?)(?=\s*Courler|\n|$)"),
	        ("Customer Care", r"Customer\s*Care\s*[:;-]\s*(\+?\d+)"),
	        ("Email", r"Email\s*[:;-]\s*(\S+)"),
	        ("Name", r"Name\s*[:;-]\s*(.*?)(?=Model|Date|$)"),
	        ("Date", r"Date\s*[:;-]\s*([0-9-/]+)"),
	        ("Tracking ID", r"Courler\s*AWB\s*No\s*[:;-]\s*(\S+)"),
	        ("GSTIN", r"GSTIN\s*No\s*[:;-]\s*([A-Z0-9]+)"),
	    )
	}


	def process_unstructured_data(input_text):
	    """Pull known label attributes out of free-form text.

	    Each attribute is looked up with its own regular expression; the first
	    match in the text wins.

	    Args:
	        input_text: The text to scan (``None`` is treated as empty text).

	    Returns:
	        A single string of ``"<attribute> : <value>"`` entries joined by
	        spaces, one entry per known attribute in a fixed order. Attributes
	        that are not present are reported as ``"<attribute> : Not Found"``.
	    """
	    text = input_text or ""
	    structured_data = []

	    for attribute, pattern in _ATTRIBUTE_PATTERNS.items():
	        match = pattern.search(text)
	        # A match whose capture group did not participate yields None; treat
	        # it like a miss instead of calling .strip() on None.
	        value = match.group(1) if match else None
	        if value is None:
	            structured_data.append(f"{attribute} : Not Found")
	        else:
	            structured_data.append(f"{attribute} : {value.strip()}")

	    # Combine the entries into one space-separated paragraph.
	    return " ".join(structured_data)

	@app.route('/run/predict', methods=['POST'])
	def process_image():
	    """Handle ``POST /run/predict``: OCR an uploaded image and structure its text.

	    Expects a JSON object with an ``image`` field holding base64 image
	    data, either as a data URL (``data:...;base64,<payload>``) or as a bare
	    base64 string.

	    Returns:
	        ``{"structured_output": ...}`` with status 200 on success,
	        ``{"error": ...}`` with status 400 when the request body or the
	        image is invalid, and ``{"error": ...}`` with status 500 on any
	        unexpected failure.
	    """
	    try:
	        # silent=True returns None for a missing or malformed JSON body
	        # instead of raising, so it can be reported as a client error.
	        payload = request.get_json(silent=True)
	        if not isinstance(payload, dict):
	            return jsonify({"error": "Request body must be a JSON object."}), 400

	        image_data = payload.get('image')
	        if not isinstance(image_data, str) or not image_data.strip():
	            return jsonify({"error": "Missing 'image' field."}), 400

	        # Remove the data URL prefix when there is one; a bare base64
	        # string has no comma and is used as-is.
	        _, separator, remainder = image_data.partition(',')
	        encoded = remainder if separator else image_data
	        try:
	            image_bytes = base64.b64decode(encoded)
	        except ValueError:  # binascii.Error is a ValueError subclass
	            return jsonify({"error": "Image is not valid base64 data."}), 400

	        # Extract text from the image
	        extracted_text, error = extract_text_from_image(image_bytes)
	        if error:
	            return jsonify({"error": error}), 400

	        # Process the extracted text into structured data
	        structured_output = process_unstructured_data(extracted_text)
	        return jsonify({"structured_output": structured_output}), 200
	    except Exception as e:
	        # Last-resort boundary: record the traceback, then answer with a 500.
	        app.logger.exception("Unhandled error while processing image")
	        return jsonify({"error": str(e)}), 500

	if __name__ == '__main__':
	    import os

	    # The server listens on all interfaces, and Flask's debug mode enables
	    # the Werkzeug interactive debugger, which lets a client execute
	    # arbitrary code. Keep it off unless FLASK_DEBUG explicitly asks for it.
	    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
	    app.run(host='0.0.0.0', port=7860, debug=debug_enabled)