Spaces:
Sleeping
Sleeping
import base64
import io
import re

import cv2
import numpy as np
import pytesseract
from flask import Flask, request, jsonify
from flask_cors import CORS
from PIL import Image
# Flask application object; CORS is switched on for every route.
app = Flask(__name__)
CORS(app)

# Use the system-installed Tesseract binary (path used on Hugging Face Spaces).
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
def extract_text_from_image(image_data):
    """Run OCR on an encoded image and return the cleaned text.

    Args:
        image_data: Bytes-like object holding the encoded image file
            content (already base64-decoded by the caller).

    Returns:
        A ``(text, error)`` tuple. On success ``text`` is the OCR output
        with blank lines removed and each line stripped, and ``error`` is
        ``None``. On failure ``text`` is ``None`` and ``error`` is a
        human-readable message.
    """
    # Guard first: an empty buffer cannot be decoded, and depending on the
    # OpenCV version cv2.imdecode may raise instead of returning None.
    if image_data is None or len(image_data) == 0:
        return None, "Error: No image data provided."

    # View the raw bytes as a uint8 array so OpenCV can decode them.
    buffer = np.frombuffer(image_data, np.uint8)
    img = cv2.imdecode(buffer, cv2.IMREAD_COLOR)
    if img is None:
        return None, "Error: Image could not be decoded."

    # Grayscale input is used for better OCR results.
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    extracted_text = pytesseract.image_to_string(gray_img)

    # Drop empty lines and surrounding whitespace left by the OCR engine.
    stripped_lines = (line.strip() for line in extracted_text.splitlines())
    cleaned_text = "\n".join(line for line in stripped_lines if line)
    return cleaned_text, None
def process_unstructured_data(input_text):
    """Extract known label attributes from OCR text into one summary string.

    Every attribute is looked up with a case-insensitive regular
    expression. The first match's captured value (stripped) is reported;
    an attribute without a match is reported as ``Not Found``.

    Args:
        input_text: Text to scan. ``None`` is treated as an empty string.

    Returns:
        A single string of ``"<attribute> : <value>"`` entries joined by
        single spaces, in the fixed order of the pattern table below.
    """
    text = input_text or ""

    # Free-text values end at an explicit delimiter, at a line break, or at
    # the end of the input. The end-of-input alternative matters because the
    # OCR text may not end with a newline, so the last line has no "\n".
    patterns = {
        "Product Name": r"Product\s*[:;-]\s*(.*?)(?=\||Total|\n|$)",
        "Model": r"Model\s*[:;-]\s*(.*?)\s*kW",
        "kW / HP": r"kW\s*/\s*HP\s*:\s*([\d./]+)",
        "Phase": r"Phase\s*:\s*(\w+)",
        "Speed": r"Speed\s*:\s*(\d+\s*RPM)",
        "Net Quantity": r"Net\s*Quantity\s*:\s*(\S+)",
        "Gross Weight": r"Gross\s*Weight\s*:\s*([\d.]+\s*\w+)",
        "Month & Year of MFG": r"Month\s*&\s*Year\s*of\s*MFG\s*:\s*(\w+\s*\d+)",
        "MRP": r"MRP.*?([\d.,]+\s*\(Inclusive\s*of\s*.*?\))",
        "Serial No.": r"Serial\s*No\s*[:;-]\s*(.*?)\|",
        "Manufacturer": r"Sold\s*By\s*[:;-]\s*(.*?)(?=,|\n|$)",
        # NOTE(review): "Courler" is kept exactly as in the original
        # patterns - presumably a common OCR misread of "Courier"; confirm.
        "Address": r"DELIVERY\s*ADDRESS[:;-]\s*(.*?)(?=\s*Courler|\n|$)",
        "Customer Care": r"Customer\s*Care\s*[:;-]\s*(\+?\d+)",
        "Email": r"Email\s*[:;-]\s*(\S+)",
        # "\n" lets a name on any line terminate at its line end, not only
        # on the very last line of the text.
        "Name": r"Name\s*[:;-]\s*(.*?)(?=Model|Date|\n|$)",
        "Date": r"Date\s*[:;-]\s*([0-9-/]+)",
        "Tracking ID": r"Courler\s*AWB\s*No\s*[:;-]\s*(\S+)",
        "GSTIN": r"GSTIN\s*No\s*[:;-]\s*([A-Z0-9]+)",
    }

    structured_data = []
    for attribute, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        # Every pattern has exactly one mandatory capture group.
        value = match.group(1).strip() if match else "Not Found"
        structured_data.append(f"{attribute} : {value}")

    # Combine the entries into a single paragraph.
    return " ".join(structured_data)
def process_image():
    """Handle an OCR request: decode the posted image and return parsed data.

    Expects a JSON body with an ``image`` field holding base64 image data,
    either as a data URL (``data:...;base64,<payload>``) or as the bare
    base64 payload.

    Returns:
        A ``(response, status)`` tuple:
        * 200 with ``{"structured_output": ...}`` on success,
        * 400 with ``{"error": ...}`` when the request body or image is
          invalid,
        * 500 with ``{"error": ...}`` on an unexpected processing failure.

    NOTE(review): no ``@app.route`` decorator is visible on this function in
    this file; confirm it is registered as a POST endpoint somewhere.
    """
    # silent=True yields None instead of raising on a missing/invalid JSON
    # body, so the client gets a clear 400 rather than a generic 500.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    image_field = payload.get('image')
    if not isinstance(image_field, str) or not image_field.strip():
        return jsonify({"error": "Missing 'image' field with base64 image data."}), 400

    # Strip an optional data-URL prefix; bare base64 is accepted as-is.
    _, separator, encoded = image_field.partition(',')
    if not separator:
        encoded = image_field

    try:
        image_bytes = base64.b64decode(encoded)
    except ValueError:  # binascii.Error is a ValueError subclass
        return jsonify({"error": "Image data is not valid base64."}), 400

    try:
        # Extract text from the image
        extracted_text, error = extract_text_from_image(image_bytes)
        if error:
            return jsonify({"error": error}), 400

        # Turn the OCR text into the structured summary
        structured_output = process_unstructured_data(extracted_text)
    except Exception as e:
        # Top-level boundary: log the traceback and report the failure.
        app.logger.exception("Image processing failed")
        return jsonify({"error": str(e)}), 500

    return jsonify({"structured_output": structured_output}), 200
if __name__ == '__main__':
    # The server listens on all interfaces, so the interactive debugger must
    # stay off: with debug=True anyone who can reach the port could execute
    # arbitrary code through the Werkzeug debugger console.
    # NOTE(review): port 7860 is presumably the port the hosting platform
    # expects - confirm before changing.
    app.run(host='0.0.0.0', port=7860, debug=False)