import base64
import io
import os
import re

import cv2
import numpy as np
import pytesseract
from flask import Flask, request, jsonify
from flask_cors import CORS
from PIL import Image

# Flask application object; the route below is registered on it.
app = Flask(__name__)
CORS(app)  # Enable CORS for all routes

# Tell pytesseract which Tesseract executable to invoke instead of relying on
# its default lookup. The path is hard-coded to the system-installed binary.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"  # Path for Hugging Face Spaces

def extract_text_from_image(image_data):
    """
    Extract text from an encoded image using OCR.

    Args:
        image_data: The encoded image file contents (e.g. PNG/JPEG bytes) as a
            bytes-like object. This is the already base64-decoded payload,
            not a base64 string.

    Returns:
        A ``(text, error)`` tuple. On success ``text`` is the OCR output with
        blank lines removed and each remaining line stripped, and ``error`` is
        ``None``. If the image is missing, empty, or cannot be decoded,
        ``text`` is ``None`` and ``error`` is a human-readable message.
    """
    decode_error = "Error: Image could not be decoded."

    if image_data is None:
        return None, decode_error

    # View the raw bytes as a uint8 array, the input form cv2.imdecode expects.
    img_data = np.frombuffer(image_data, np.uint8)

    # cv2.imdecode raises on an empty buffer instead of returning None, so
    # report it through the same error channel as any other undecodable image.
    if img_data.size == 0:
        return None, decode_error

    img = cv2.imdecode(img_data, cv2.IMREAD_COLOR)
    if img is None:
        return None, decode_error

    # Convert image to grayscale for better OCR results
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Perform OCR
    extracted_text = pytesseract.image_to_string(gray_img)

    # Clean the OCR text: strip every line and drop the ones left empty.
    stripped_lines = (line.strip() for line in extracted_text.splitlines())
    cleaned_text = "\n".join(line for line in stripped_lines if line)

    return cleaned_text, None

# (label, compiled regex) pairs, in output order. Each regex has exactly one
# capture group holding the field's value. Patterns are compiled once at import
# time rather than on every call.
#
# Fields whose value runs "to the end of the line" accept both a newline and
# the end of the text as a terminator, so they are also found when they sit on
# the last line (OCR output has no trailing newline) or on any middle line.
#
# NOTE(review): "Courler" looks like a misspelling of "Courier"; it may be
# matching a known OCR misread, so it is left untouched -- confirm.
_FIELD_PATTERNS = tuple(
    (label, re.compile(pattern, re.IGNORECASE))
    for label, pattern in (
        ("Product Name", r"Product\s*[:;-]\s*(.*?)(?=\||Total|\n|$)"),
        ("Model", r"Model\s*[:;-]\s*(.*?)\s*kW"),
        ("kW / HP", r"kW\s*/\s*HP\s*:\s*([\d./]+)"),
        ("Phase", r"Phase\s*:\s*(\w+)"),
        ("Speed", r"Speed\s*:\s*(\d+\s*RPM)"),
        ("Net Quantity", r"Net\s*Quantity\s*:\s*(\S+)"),
        ("Gross Weight", r"Gross\s*Weight\s*:\s*([\d.]+\s*\w+)"),
        ("Month & Year of MFG", r"Month\s*&\s*Year\s*of\s*MFG\s*:\s*(\w+\s*\d+)"),
        ("MRP", r"MRP.*?([\d.,]+\s*\(Inclusive\s*of\s*.*?\))"),
        ("Serial No.", r"Serial\s*No\s*[:;-]\s*(.*?)\|"),
        ("Manufacturer", r"Sold\s*By\s*[:;-]\s*(.*?)(?=,|\n|$)"),
        ("Address", r"DELIVERY\s*ADDRESS[:;-]\s*(.*?)(?=\s*Courler|\n|$)"),
        ("Customer Care", r"Customer\s*Care\s*[:;-]\s*(\+?\d+)"),
        ("Email", r"Email\s*[:;-]\s*(\S+)"),
        ("Name", r"Name\s*[:;-]\s*(.*?)(?=Model|Date|\n|$)"),
        ("Date", r"Date\s*[:;-]\s*([0-9-/]+)"),
        ("Tracking ID", r"Courler\s*AWB\s*No\s*[:;-]\s*(\S+)"),
        ("GSTIN", r"GSTIN\s*No\s*[:;-]\s*([A-Z0-9]+)"),
    )
)


def process_unstructured_data(input_text):
    """
    Process unstructured text data and convert it into structured format.

    Each known field is searched for (case-insensitively) in ``input_text``;
    the first match wins.

    Args:
        input_text: The text to scan, typically OCR output. ``None`` is
            treated like an empty string.

    Returns:
        A single string of ``"<Field> : <value>"`` entries joined by spaces,
        one entry per known field in a fixed order. Fields that do not occur
        in the text are reported as ``"<Field> : Not Found"``.
    """
    text = input_text or ""

    structured_data = []
    for attribute, pattern in _FIELD_PATTERNS:
        match = pattern.search(text)
        # Every pattern has one mandatory capture group, so a match always
        # provides group(1).
        value = match.group(1).strip() if match else "Not Found"
        structured_data.append(f"{attribute} : {value}")

    # Combine structured data into a paragraph
    return " ".join(structured_data)

@app.route('/run/predict', methods=['POST'])
def process_image():
    """
    Handle POST /run/predict: OCR an uploaded image and return structured text.

    Expects a JSON object with an ``image`` field containing the image as
    base64, either as a data URL (``data:image/png;base64,....``) or as a bare
    base64 string.

    Returns:
        A ``(response, status)`` pair:
        * 200 with ``{"structured_output": ...}`` on success.
        * 400 with ``{"error": ...}`` when the request body, the ``image``
          field, the base64 payload, or the image itself is invalid.
        * 500 with ``{"error": ...}`` on any unexpected failure.
    """
    try:
        # silent=True yields None for a missing/invalid JSON body instead of
        # raising, so it can be reported as a client error below.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({"error": "Request body must be a JSON object."}), 400

        image_field = data.get('image')
        if not isinstance(image_field, str) or not image_field.strip():
            return jsonify({"error": "Missing 'image' field."}), 400

        # Remove the data URL prefix when present; otherwise treat the whole
        # value as base64.
        _, separator, payload = image_field.partition(',')
        encoded = payload if separator else image_field

        try:
            image_bytes = base64.b64decode(encoded)
        except ValueError:
            # binascii.Error (bad padding / characters) is a ValueError.
            return jsonify({"error": "Image data is not valid base64."}), 400

        # Extract text from the image
        extracted_text, error = extract_text_from_image(image_bytes)
        if error:
            return jsonify({"error": error}), 400

        # Process the extracted text into structured data
        structured_output = process_unstructured_data(extracted_text)

        # Return the processed structured data
        return jsonify({"structured_output": structured_output}), 200
    except Exception as e:
        # Top-level boundary: record the traceback server-side and report the
        # failure to the client.
        app.logger.exception("Unexpected error while processing image")
        return jsonify({"error": str(e)}), 500

if __name__ == '__main__':
    # The server listens on all interfaces, so the interactive debugger must
    # not be on by default: it lets anyone who can reach the port execute
    # code. Opt in explicitly for local development with FLASK_DEBUG=1.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(host='0.0.0.0', port=7860, debug=debug_enabled)