Spaces:

sairamtelagamsetti
/

seta

Sleeping

App Files Files Community

sairamtelagamsetti committed on Dec 16, 2024

Commit

223b677

verified ·

1 Parent(s): 738d678

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -16

app.py CHANGED Viewed

@@ -1,12 +1,81 @@
-import base64
 from flask import Flask, request, jsonify
 from PIL import Image
-import io
 app = Flask(__name__)
-#@app.route('/run/predict', methods=['POST'])
-#def process_image():
     try:
         # Get the image data from the request
         data = request.json
@@ -16,20 +85,16 @@ app = Flask(__name__)
         image_data = image_data.split(',')[1]  # Remove data URL prefix
         image_bytes = base64.b64decode(image_data)
-        # Open the image with PIL
-        image = Image.open(io.BytesIO(image_bytes))
-        # Example: Resize the image (Optional processing)
-        image = image.resize((300, 300))
-        # Convert the processed image back to base64
-        buffered = io.BytesIO()
-        image.save(buffered, format="PNG")
-        processed_image_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
-        processed_image_data = f"data:image/png;base64,{processed_image_data}"
-        # Return the processed image as a response
-        return jsonify({"processed_image": processed_image_data}), 200
     except Exception as e:
         return jsonify({"error": str(e)}), 500

+import pytesseract
+import cv2
+import re
+import io
 from flask import Flask, request, jsonify
+from flask_cors import CORS
 from PIL import Image
+import numpy as np
# WSGI application object; the route below is registered on it.
app = Flask(__name__)
CORS(app)  # Enable CORS for all routes
# Update pytesseract to use the system-installed Tesseract
# instead of relying on whatever pytesseract would pick by default.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"  # Path for Hugging Face Spaces
def extract_text_from_image(image_data):
    """
    Run OCR on an encoded image and return its text.

    Args:
        image_data: The bytes of an encoded image file. This is the decoded
            binary payload, not a base64 string.

    Returns:
        A ``(text, error)`` tuple. On success ``text`` is the OCR output with
        every line stripped and blank lines removed, and ``error`` is
        ``None``. If the input is empty or cannot be decoded as an image,
        ``text`` is ``None`` and ``error`` is a message string.
    """
    failure = (None, "Error: Image could not be decoded.")

    # An empty payload can never decode; report it the same way as an
    # undecodable image instead of handing an empty buffer to OpenCV.
    if image_data is None or len(image_data) == 0:
        return failure

    # View the encoded bytes as a uint8 array, which is what imdecode expects.
    encoded = np.frombuffer(image_data, np.uint8)
    img = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
    if img is None:
        return failure

    # Convert image to grayscale for better OCR results
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    extracted_text = pytesseract.image_to_string(gray_img)

    # Strip each line once, then drop the lines that end up empty.
    stripped_lines = (line.strip() for line in extracted_text.splitlines())
    cleaned_text = "\n".join(line for line in stripped_lines if line)
    return cleaned_text, None
def process_unstructured_data(input_text):
    """
    Convert unstructured label text into a one-line structured summary.

    Each known attribute is looked up with a case-insensitive regular
    expression; the first capture group of the first match is the value.

    Args:
        input_text: Text to scan. ``None`` is treated as empty text.

    Returns:
        A single string of ``"<attribute> : <value>"`` entries joined by
        spaces, in the order the attributes are declared below. An attribute
        whose pattern does not match is reported as ``"Not Found"``.
    """
    text = input_text or ""

    # Define patterns for key attributes.
    # "Cour[il]er" accepts the correct spelling "Courier" as well as the
    # misreading "Courler" that the original patterns were written for.
    patterns = {
        "Product Name": r"Product\s*[:;-]\s*(.*?)(?=\||Total|\n)",
        "Model": r"Model\s*[:;-]\s*(.*?)\s*kW",
        "kW / HP": r"kW\s*/\s*HP\s*:\s*([\d./]+)",
        "Phase": r"Phase\s*:\s*(\w+)",
        "Speed": r"Speed\s*:\s*(\d+\s*RPM)",
        "Net Quantity": r"Net\s*Quantity\s*:\s*(\S+)",
        "Gross Weight": r"Gross\s*Weight\s*:\s*([\d.]+\s*\w+)",
        "Month & Year of MFG": r"Month\s*&\s*Year\s*of\s*MFG\s*:\s*(\w+\s*\d+)",
        "MRP": r"MRP.*?([\d.,]+\s*\(Inclusive\s*of\s*.*?\))",
        "Serial No.": r"Serial\s*No\s*[:;-]\s*(.*?)\|",
        "Manufacturer": r"Sold\s*By\s*[:;-]\s*(.*?)(?=,|\n)",
        "Address": r"DELIVERY\s*ADDRESS[:;-]\s*(.*?)(?=\s*Cour[il]er|\n)",
        "Customer Care": r"Customer\s*Care\s*[:;-]\s*(\+?\d+)",
        "Email": r"Email\s*[:;-]\s*(\S+)",
        "Name": r"Name\s*[:;-]\s*(.*?)(?=Model|Date|$)",
        # Hyphen placed last so it is unambiguously a literal, not a range.
        "Date": r"Date\s*[:;-]\s*([0-9/-]+)",
        "Tracking ID": r"Cour[il]er\s*AWB\s*No\s*[:;-]\s*(\S+)",
        "GSTIN": r"GSTIN\s*No\s*[:;-]\s*([A-Z0-9]+)",
    }

    # Extract attributes using patterns; every pattern has exactly one
    # capture group, so a successful match always provides group(1).
    entries = []
    for attribute, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        value = match.group(1).strip() if match else "Not Found"
        entries.append(f"{attribute} : {value}")

    # Combine structured data into a paragraph
    return " ".join(entries)
@app.route('/run/predict', methods=['POST'])
def process_image():
    """
    Handle POST /run/predict: OCR an uploaded image and return structured text.

    Returns:
        A ``(response, status)`` pair:
        - ``{"structured_output": ...}`` with 200 on success,
        - ``{"error": ...}`` with 400 when the image cannot be decoded,
        - ``{"error": ...}`` with 500 when any exception is raised.
    """
    # Imported locally because this module's import block no longer brings in
    # base64, while the handler still needs it to decode the payload.
    import base64

    try:
        # Get the image data from the request
        data = request.json
        # NOTE(review): the diff view elides three unchanged lines at this
        # point (between the two hunks); they presumably bind `image_data`
        # from `data` -- confirm against the full app.py.
        image_data = image_data.split(',')[1]  # Remove data URL prefix
        image_bytes = base64.b64decode(image_data)
        # Extract text from the image
        extracted_text, error = extract_text_from_image(image_bytes)
        if error:
            return jsonify({"error": error}), 400
        # Process the extracted text into structured data
        structured_output = process_unstructured_data(extracted_text)
        # Return the processed structured data
        return jsonify({"structured_output": structured_output}), 200
    except Exception as e:
        # Top-level boundary for the route: report any failure as a 500.
        return jsonify({"error": str(e)}), 500