Spaces:

kavg
/

sri-doc

Sleeping

App Files Files Community

kavg committed on Apr 21, 2024

Commit

1be0846

1 Parent(s): 613ad82

implemented two ocr methods

Browse files

Files changed (4) hide show

config.py +4 -0
handwritting_detection.py +41 -0
main.py +23 -2
ocr.py +60 -1

config.py CHANGED Viewed

@@ -7,3 +7,7 @@ class Settings(BaseSettings):
     SER_MODEL: str
     TOKENIZER: str
     RE_MODEL: str

     SER_MODEL: str
     TOKENIZER: str
     RE_MODEL: str
+    ROBOFLOW_API_KEY: str
+    ROBOFLOW_URL: str
+    YOLO_MODEL_ID: str
+    TROCR_API_URL: str

handwritting_detection.py ADDED Viewed

	@@ -0,0 +1,41 @@

+from inference_sdk import InferenceHTTPClient
+from config import Settings
+from PIL import Image, ImageDraw
+def draw_rectangle(image, x, y, width, height, **kwargs):
+    # Create a draw object
+    draw = ImageDraw.Draw(image)
+    # Calculate the top-left and bottom-right corners of the rectangle
+    x1 = x - width // 2
+    y1 = y - height // 2
+    x2 = x1 + width
+    y2 = y1 + height
+    # Draw the rectangle
+    draw.rectangle(((x1, y1), (x2, y2)), fill=(255, 255, 255))
+    return image
+def crop_image(image, x, y, width, height, **kwargs):
+    # Calculate the top-left and bottom-right corners of the cropping area
+    left = x - width // 2
+    top = y - height // 2
+    right = left + width
+    bottom = top + height
+    # Crop the image
+    cropped_image = image.crop((left, top, right, bottom))
+    return cropped_image, left, top, (right-left), (bottom-top)
+def DetectHandwritting(image):
+    settings = Settings()
+    CLIENT = InferenceHTTPClient(
+        api_url=settings.ROBOFLOW_URL,
+        api_key=settings.ROBOFLOW_API_KEY
+    )
+    result = CLIENT.infer(image, model_id=settings.YOLO_MODEL_ID)
+    cpy = image.copy()
+    handwritten_parts = []
+    for prediction in result['predictions']:
+        cpy = draw_rectangle(cpy, **prediction)
+        handwritten_parts.append(crop_image(cpy, **prediction))
+    return cpy, handwritten_parts

main.py CHANGED Viewed

@@ -11,6 +11,8 @@ import json
 import io
 from models import LiLTRobertaLikeForRelationExtraction
 from base64 import b64decode
 config = {}
 @asynccontextmanager
@@ -23,6 +25,7 @@ async def lifespan(app: FastAPI):
     config['tokenizer'] = AutoTokenizer.from_pretrained(settings.TOKENIZER)
     config['ser_model'] = LiltForTokenClassification.from_pretrained(settings.SER_MODEL)
     config['re_model'] = LiLTRobertaLikeForRelationExtraction.from_pretrained(settings.RE_MODEL)
     yield
     # Clean up and release the resources
     config.clear()
@@ -69,13 +72,31 @@ def ApplyOCR(content):
     image = Image.open(io.BytesIO(content))
   except:
     raise HTTPException(status_code=400, detail="Invalid image")
   try:
     vision_client = ocr.VisionClient(config['settings'].GCV_AUTH)
-    ocr_df = vision_client.ocr(content, image)
   except:
-    raise HTTPException(status_code=400, detail="OCR process failed")
   return ocr_df, image
 def LabelTokens(ocr_df, image):
   input_ids, attention_mask, token_type_ids, bbox, token_actual_boxes, offset_mapping = config['processor'].process(ocr_df, image = image)
   token_labels = token_classification.classifyTokens(config['ser_model'], input_ids, attention_mask, bbox, offset_mapping)

 import io
 from models import LiLTRobertaLikeForRelationExtraction
 from base64 import b64decode
+from handwritting_detection import DetectHandwritting
+import pandas as pd
 config = {}
 @asynccontextmanager
     config['tokenizer'] = AutoTokenizer.from_pretrained(settings.TOKENIZER)
     config['ser_model'] = LiltForTokenClassification.from_pretrained(settings.SER_MODEL)
     config['re_model'] = LiLTRobertaLikeForRelationExtraction.from_pretrained(settings.RE_MODEL)
+    config['TROCR_API'] = settings.TROCR_API_URL
     yield
     # Clean up and release the resources
     config.clear()
     image = Image.open(io.BytesIO(content))
   except:
     raise HTTPException(status_code=400, detail="Invalid image")
   try:
+    printed_img, handwritten_imgs = DetectHandwritting(image)
+  except:
+    raise HTTPException(status_code=400, detail="Handwritten OCR failed")
+  try:
+    trocr_client = ocr.TrOCRClientClient(config['settings'].TROCR_API_URL)
+    handwritten_ocr_df = trocr_client.ocr(handwritten_imgs, image)
+  except:
+    raise HTTPException(status_code=400, detail="handwritten OCR process failed")
+  try:
+    jpeg_bytes = io.BytesIO()
+    printed_img.save(jpeg_bytes, format='JPEG')
+    jpeg_content = jpeg_bytes.getvalue()
     vision_client = ocr.VisionClient(config['settings'].GCV_AUTH)
+    printed_ocr_df = vision_client.ocr(jpeg_content, printed_img)
   except:
+    raise HTTPException(status_code=400, detail="Printed OCR process failed")
+  ocr_df = pd.concat([handwritten_ocr_df, printed_ocr_df])
   return ocr_df, image
 def LabelTokens(ocr_df, image):
   input_ids, attention_mask, token_type_ids, bbox, token_actual_boxes, offset_mapping = config['processor'].process(ocr_df, image = image)
   token_labels = token_classification.classifyTokens(config['ser_model'], input_ids, attention_mask, bbox, offset_mapping)

ocr.py CHANGED Viewed

@@ -6,6 +6,7 @@ import json
 import numpy as np
 from PIL import Image
 import io
 image_ext = ("*.jpg", "*.jpeg", "*.png")
@@ -86,4 +87,62 @@ class VisionClient:
         resp_js = self.get_response(content)
         boxObjects = self.post_process(resp_js)
         ocr_df = self.convert_to_df(boxObjects, image)
-        return ocr_df

 import numpy as np
 from PIL import Image
 import io
+import requests
 image_ext = ("*.jpg", "*.jpeg", "*.png")
         resp_js = self.get_response(content)
         boxObjects = self.post_process(resp_js)
         ocr_df = self.convert_to_df(boxObjects, image)
+        return ocr_df
+class TrOCRClient():
+    def __init__(self, api_url):
+        self.api_url = api_url
+    def convert_to_df(self, boxObjects, image):
+        ocr_df = pd.DataFrame(boxObjects)
+        # ocr_df = ocr_df.sort_values(by=['top', 'left'], ascending=True).reset_index(drop=True)
+        width, height = image.size
+        w_scale = 1000/width
+        h_scale = 1000/height
+        ocr_df = ocr_df.dropna() \
+                    .assign(left_scaled = ocr_df.left*w_scale,
+                            width_scaled = ocr_df.width*w_scale,
+                            top_scaled = ocr_df.top*h_scale,
+                            height_scaled = ocr_df.height*h_scale,
+                            right_scaled = lambda x: x.left_scaled + x.width_scaled,
+                            bottom_scaled = lambda x: x.top_scaled + x.height_scaled)
+        float_cols = ocr_df.select_dtypes('float').columns
+        ocr_df[float_cols] = ocr_df[float_cols].round(0).astype(int)
+        ocr_df = ocr_df.replace(r'^\s*$', np.nan, regex=True)
+        ocr_df = ocr_df.dropna().reset_index(drop=True)
+        return ocr_df
+    def send_request(self, handwritten_img):
+        jpeg_bytes = io.BytesIO()
+        handwritten_img.save(jpeg_bytes, format='JPEG')
+        jpeg_content = jpeg_bytes.getvalue()
+        # Send a POST request with the image file
+        response = requests.post(self.api_url, files={"file": jpeg_content})
+        # Check the response status code
+        if response.status_code == 200:
+            # Get the extracted text from the response
+            extracted_text = response.json()["text"]
+            print(extracted_text)
+        else:
+            print(f"Error: {response.text}")
+    def ocr(self, handwritten_imgs, image):
+        boxObjects = []
+        for i in len(handwritten_imgs):
+            handwritten_img = handwritten_imgs[i]
+            ocr_result = self.send_request(handwritten_img[0])
+            boxObjects.append({
+                "id": i-1,
+                "text": ocr_result,
+                "left": handwritten_img[1],
+                "width": handwritten_img[3],
+                "top": handwritten_img[2],
+                "height":handwritten_img[4]
+            })
+        ocr_df = self.convert_to_df(boxObjects, image)
+        return ocr_df