sairamtelagamsetti committed on
Commit
223b677
·
verified ·
1 Parent(s): 738d678

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -16
app.py CHANGED
@@ -1,12 +1,81 @@
1
- import base64
 
 
 
2
  from flask import Flask, request, jsonify
 
3
  from PIL import Image
4
- import io
5
 
6
  app = Flask(__name__)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
- #@app.route('/run/predict', methods=['POST'])
9
- #def process_image():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  try:
11
  # Get the image data from the request
12
  data = request.json
@@ -16,20 +85,16 @@ app = Flask(__name__)
16
  image_data = image_data.split(',')[1] # Remove data URL prefix
17
  image_bytes = base64.b64decode(image_data)
18
 
19
- # Open the image with PIL
20
- image = Image.open(io.BytesIO(image_bytes))
21
-
22
- # Example: Resize the image (Optional processing)
23
- image = image.resize((300, 300))
24
 
25
- # Convert the processed image back to base64
26
- buffered = io.BytesIO()
27
- image.save(buffered, format="PNG")
28
- processed_image_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
29
- processed_image_data = f"data:image/png;base64,{processed_image_data}"
30
 
31
- # Return the processed image as a response
32
- return jsonify({"processed_image": processed_image_data}), 200
33
  except Exception as e:
34
  return jsonify({"error": str(e)}), 500
35
 
 
1
+ import pytesseract
2
+ import cv2
3
+ import re
4
+ import io
5
  from flask import Flask, request, jsonify
6
+ from flask_cors import CORS
7
  from PIL import Image
8
+ import numpy as np
9
 
10
  app = Flask(__name__)
11
+ CORS(app) # Enable CORS for all routes
12
+
13
+ # Update pytesseract to use the system-installed Tesseract
14
+ pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract" # Path for Hugging Face Spaces
15
+
16
def extract_text_from_image(image_data):
    """
    Extract text from an encoded image using OCR.

    Args:
        image_data: Raw encoded image bytes (the file contents of the image).
            The caller is expected to have base64-decoded the payload
            already; this function does no base64 handling itself.

    Returns:
        A ``(text, error)`` tuple. On success ``text`` is the OCR output with
        every line stripped and blank lines removed, joined by ``"\\n"``, and
        ``error`` is ``None``. When the image cannot be decoded ``text`` is
        ``None`` and ``error`` is a human-readable message.
    """
    # A missing or zero-length payload can never be a decodable image.
    # Short-circuit so it is reported through the same error tuple as any
    # other undecodable input, rather than depending on how the decoder
    # treats an empty buffer.
    if image_data is None or len(image_data) == 0:
        return None, "Error: Image could not be decoded."

    # View the encoded bytes as a uint8 array, the form cv2.imdecode expects.
    img_data = np.frombuffer(image_data, np.uint8)
    img = cv2.imdecode(img_data, cv2.IMREAD_COLOR)
    if img is None:
        return None, "Error: Image could not be decoded."

    # Convert image to grayscale for better OCR results.
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Perform OCR.
    extracted_text = pytesseract.image_to_string(gray_img)

    # Drop blank lines and surrounding whitespace left behind by OCR.
    stripped_lines = (line.strip() for line in extracted_text.splitlines())
    cleaned_text = "\n".join(line for line in stripped_lines if line)

    return cleaned_text, None
36
+
37
def process_unstructured_data(input_text):
    """
    Process unstructured text data and convert it into structured format.

    Each known attribute is looked up in ``input_text`` with a
    case-insensitive regular expression. The first capture group of the
    first match, stripped of surrounding whitespace, becomes the attribute's
    value; attributes with no match are reported as ``Not Found``.

    Args:
        input_text: The text to scan (for example, OCR output).

    Returns:
        A single string of ``"<attribute> : <value>"`` entries, in the order
        the patterns are declared, separated by single spaces.
    """
    # Define patterns for key attributes. Every pattern has exactly one
    # capture group. Lookaheads that stop at a newline also accept end of
    # input ("$"), so a field on the very last line is still captured when
    # the text has no trailing newline.
    patterns = {
        "Product Name": r"Product\s*[:;-]\s*(.*?)(?=\||Total|\n|$)",
        "Model": r"Model\s*[:;-]\s*(.*?)\s*kW",
        "kW / HP": r"kW\s*/\s*HP\s*:\s*([\d./]+)",
        "Phase": r"Phase\s*:\s*(\w+)",
        "Speed": r"Speed\s*:\s*(\d+\s*RPM)",
        "Net Quantity": r"Net\s*Quantity\s*:\s*(\S+)",
        "Gross Weight": r"Gross\s*Weight\s*:\s*([\d.]+\s*\w+)",
        "Month & Year of MFG": r"Month\s*&\s*Year\s*of\s*MFG\s*:\s*(\w+\s*\d+)",
        "MRP": r"MRP.*?([\d.,]+\s*\(Inclusive\s*of\s*.*?\))",
        "Serial No.": r"Serial\s*No\s*[:;-]\s*(.*?)\|",
        "Manufacturer": r"Sold\s*By\s*[:;-]\s*(.*?)(?=,|\n|$)",
        "Address": r"DELIVERY\s*ADDRESS[:;-]\s*(.*?)(?=\s*Courler|\n|$)",
        "Customer Care": r"Customer\s*Care\s*[:;-]\s*(\+?\d+)",
        "Email": r"Email\s*[:;-]\s*(\S+)",
        "Name": r"Name\s*[:;-]\s*(.*?)(?=Model|Date|$)",
        "Date": r"Date\s*[:;-]\s*([0-9-/]+)",
        "Tracking ID": r"Courler\s*AWB\s*No\s*[:;-]\s*(\S+)",
        "GSTIN": r"GSTIN\s*No\s*[:;-]\s*([A-Z0-9]+)",
    }

    # Extract attributes using patterns.
    structured_data = []
    for attribute, pattern in patterns.items():
        match = re.search(pattern, input_text, re.IGNORECASE)
        value = match.group(1).strip() if match else "Not Found"
        structured_data.append(f"{attribute} : {value}")

    # Combine structured data into a paragraph.
    return " ".join(structured_data)
76
+
77
+ @app.route('/run/predict', methods=['POST'])
78
+ def process_image():
79
  try:
80
  # Get the image data from the request
81
  data = request.json
 
85
  image_data = image_data.split(',')[1] # Remove data URL prefix
86
  image_bytes = base64.b64decode(image_data)
87
 
88
+ # Extract text from the image
89
+ extracted_text, error = extract_text_from_image(image_bytes)
90
+ if error:
91
+ return jsonify({"error": error}), 400
 
92
 
93
+ # Process the extracted text into structured data
94
+ structured_output = process_unstructured_data(extracted_text)
 
 
 
95
 
96
+ # Return the processed structured data
97
+ return jsonify({"structured_output": structured_output}), 200
98
  except Exception as e:
99
  return jsonify({"error": str(e)}), 500
100