Spaces:

capitaletech
/

cv_quality

Sleeping

App Files Community

Nassiraaa committed on Aug 6, 2024

Commit

52a1b2f

verified ·

1 Parent(s): cbc2d14

Delete ocr_extractor.py

Browse files

Files changed (1) hide show

ocr_extractor.py +0 -139

ocr_extractor.py DELETED Viewed

@@ -1,139 +0,0 @@
-import sys
-import importlib
-from PIL import Image
-import boto3
-import os
-from doctr.io import DocumentFile
-from doctr.models import ocr_predictor
-import easyocr
-from shapely.geometry import Polygon
-from paddleocr import PaddleOCR
-import langid
-import json
-import PyPDF2
# Fail fast at import time when python-bidi (import name "bidi") is missing.
# NOTE(review): presumably required by one of the OCR backends - confirm which.
import importlib.util  # plain "import importlib" does not guarantee the util submodule is loaded

if importlib.util.find_spec("bidi") is None:
    # Diagnostics belong on stderr so they are not mixed into captured output.
    print("Error: python-bidi is not installed. Please install it using pip install python-bidi",
          file=sys.stderr)
    sys.exit(1)
# Initialize OCR models
def load_models(language):
    """Instantiate the three locally-run OCR engines for ``language``.

    Args:
        language: Language code handed unchanged to ``easyocr.Reader`` (as a
            one-element list) and to ``PaddleOCR`` (as ``lang``).
            NOTE(review): the two libraries may not accept the same set of
            codes - confirm for non-English inputs.

    Returns:
        A ``(doctr_model, easyocr_reader, paddleocr_reader)`` tuple.
    """
    return (
        ocr_predictor(pretrained=True),
        easyocr.Reader([language]),
        PaddleOCR(use_angle_cls=True, lang=language),
    )
# AWS Textract client, created once when the module is imported and pinned to
# the us-west-2 region.
textract_client = boto3.client(
    'textract',
    region_name='us-west-2',
)
def extract_text_aws(image_bytes):
    """Run AWS Textract text detection on raw image bytes.

    Args:
        image_bytes: Encoded image content, sent as ``Document={'Bytes': ...}``.

    Returns:
        A list of ``(text, bounding_box, confidence)`` tuples, one per block
        whose ``BlockType`` is ``'WORD'``. ``bounding_box`` is the block's
        ``Geometry.BoundingBox`` mapping. Any exception is printed and yields
        an empty list (best-effort).

    NOTE(review): Textract documents ``Confidence`` as a percentage, which may
    not match the scale used by the other engines - confirm the weights file
    accounts for that.
    """
    try:
        blocks = textract_client.detect_document_text(Document={'Bytes': image_bytes})['Blocks']
        words = []
        for block in blocks:
            if block['BlockType'] != 'WORD':
                continue
            words.append((block['Text'], block['Geometry']['BoundingBox'], block['Confidence']))
        return words
    except Exception as e:
        print(f"Error in AWS Textract: {str(e)}")
        return []
def extract_text_doctr(image_path, doctr_model):
    """Run the docTR predictor on one image and flatten its first page to words.

    Args:
        image_path: Path passed to ``DocumentFile.from_images``.
        doctr_model: Callable predictor applied to the loaded document.

    Returns:
        A list of ``(value, geometry, confidence)`` tuples for every word of
        every line of every block on ``pages[0]``. Any exception is printed
        and yields an empty list (best-effort).
    """
    try:
        first_page = doctr_model(DocumentFile.from_images(image_path)).pages[0]
        words = []
        for block in first_page.blocks:
            for line in block.lines:
                for word in line.words:
                    words.append((word.value, word.geometry, word.confidence))
        return words
    except Exception as e:
        print(f"Error in Doctr OCR: {str(e)}")
        return []
def extract_text_easyocr(image_path, easyocr_reader):
    """Run EasyOCR on one image and reorder each detection.

    Args:
        image_path: Path passed to ``easyocr_reader.readtext``.
        easyocr_reader: Object exposing ``readtext(image_path)``.

    Returns:
        A list of ``(text, box, confidence)`` tuples, built from elements 1, 0
        and 2 of each detection respectively. Any exception is printed and
        yields an empty list (best-effort).
    """
    try:
        words = []
        for detection in easyocr_reader.readtext(image_path):
            box, text, confidence = detection[0], detection[1], detection[2]
            words.append((text, box, confidence))
        return words
    except Exception as e:
        print(f"Error in EasyOCR: {str(e)}")
        return []
def extract_text_paddleocr(image_path, paddleocr_reader):
    """Run PaddleOCR on one image and flatten the first page's detections.

    Args:
        image_path: Path passed to ``paddleocr_reader.ocr``.
        paddleocr_reader: Object exposing ``ocr(image_path, cls=True)``.

    Returns:
        A list of ``(text, box, confidence)`` tuples taken from ``result[0]``,
        where each entry is read as ``(box, (text, confidence))``. Returns an
        empty list when nothing was detected. Any exception is printed and
        also yields an empty list (best-effort).
    """
    try:
        result = paddleocr_reader.ocr(image_path, cls=True)
        # PaddleOCR signals "no text found" with an empty result or a None
        # page entry; that is a normal outcome, not an error worth logging.
        if not result or result[0] is None:
            return []
        return [(line[1][0], line[0], line[1][1]) for line in result[0]]
    except Exception as e:
        print(f"Error in PaddleOCR: {str(e)}")
        return []
def bbox_to_polygon(bbox):
    """Convert an engine-specific bounding box into a shapely ``Polygon``.

    Accepted shapes, checked in this order:
      * a mapping with ``Left``/``Top``/``Width``/``Height`` keys (AWS format);
      * a sequence of four list/tuple points (EasyOCR format);
      * a sequence of two corner points, top-left then bottom-right
        (Doctr format).

    Raises:
        ValueError: If ``bbox`` matches none of the shapes above.
    """
    # AWS format
    if isinstance(bbox, dict):
        left, top = bbox['Left'], bbox['Top']
        right = left + bbox['Width']
        bottom = top + bbox['Height']
        return Polygon([(left, top), (right, top), (right, bottom), (left, bottom)])

    # EasyOCR format
    if len(bbox) == 4 and all(isinstance(p, (list, tuple)) for p in bbox):
        return Polygon(bbox)

    # Doctr format
    if len(bbox) == 2:
        first_corner, second_corner = bbox[0], bbox[1]
        x, y = first_corner[0], first_corner[1]
        w = second_corner[0] - first_corner[0]
        h = second_corner[1] - first_corner[1]
        return Polygon([(x, y), (x+w, y), (x+w, y+h), (x, y+h)])

    raise ValueError(f"Unsupported bbox format: {bbox}")
def combine_ocr_results(results, weights):
    """Merge word detections from several OCR engines into one string.

    Each detection's confidence is multiplied by its engine's weight. The
    detections are then walked in insertion order: every later, not yet
    consumed detection whose polygon intersects the current one is grouped
    with it, and only the text of the highest-scoring member of the group is
    kept (the earliest one wins ties).

    Args:
        results: Mapping of engine name to a list of
            ``(word, bbox, confidence)`` tuples; ``bbox`` must be accepted by
            ``bbox_to_polygon``.
        weights: Mapping of engine name to a numeric weight.

    Returns:
        The kept words joined with single spaces ('' when there are none).

    NOTE(review): any intersection, however small, groups two detections, so
    neighbouring words with touching boxes can suppress one another - confirm
    this is intended.
    """
    candidates = []
    for method, words in results.items():
        # Resolve the weight once per engine so a missing entry is reported
        # once instead of once per word.
        try:
            weight = weights[method]
        except (LookupError, TypeError):
            print(f"Error: no weight configured for OCR method '{method}'; skipping its results")
            continue
        for word, bbox, confidence in words:
            try:
                polygon = bbox_to_polygon(bbox)
                candidates.append((word, polygon, float(confidence) * weight))
            except Exception as e:
                # Best-effort: one malformed detection must not abort the merge.
                print(f"Error processing word '{word}' from {method}: {str(e)}")

    # Track consumed detections with flags rather than list.pop(0) and
    # list.remove(), which shift the list on every call.
    consumed = [False] * len(candidates)
    final_words = []
    for i, current in enumerate(candidates):
        if consumed[i]:
            continue
        best = current
        for j in range(i + 1, len(candidates)):
            if consumed[j]:
                continue
            other = candidates[j]
            if current[1].intersects(other[1]):
                consumed[j] = True
                # Strict comparison keeps the earliest detection on ties.
                if other[2] > best[2]:
                    best = other
        final_words.append(best[0])
    return ' '.join(final_words)
def detect_language(text):
    """Return the first element of ``langid.classify(text)``, the language label."""
    label, _unused = langid.classify(text)
    return label
def _scale_boxes_to_unit(words, width, height):
    """Rescale four-point pixel boxes to fractions of the image width/height."""
    return [
        (text, [(point[0] / width, point[1] / height) for point in box], confidence)
        for text, box, confidence in words
    ]


def process_file(file_path, weights_file):
    """Extract text from a PDF or an image file.

    PDFs (by ``.pdf`` extension, case-insensitive) are read with PyPDF2 and
    the text of each page is returned followed by a newline; no OCR is run.
    Every other file is treated as an image: it is sent to AWS Textract, the
    first ten Textract words are used to pick a language, the three local
    engines are loaded for that language, and all four result sets are merged
    with ``combine_ocr_results``.

    Args:
        file_path: Path to the PDF or image to process.
        weights_file: Path to a JSON file mapping engine names (``aws``,
            ``doctr``, ``easyocr``, ``paddleocr``) to weights. Only read for
            image input.

    Returns:
        The extracted text as a single string.

    Raises:
        OSError: If ``file_path`` or ``weights_file`` cannot be opened.
    """
    _, file_extension = os.path.splitext(file_path)
    if file_extension.lower() == '.pdf':
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Guard against pages whose extraction yields None, and join once
            # instead of growing a string page by page.
            return "".join((page.extract_text() or "") + "\n" for page in pdf_reader.pages)

    # Assume it's an image file
    with open(weights_file, 'r') as f:
        weights = json.load(f)
    with open(file_path, 'rb') as image_file:
        image_bytes = image_file.read()

    # Detect language using a sample of text from AWS Textract
    aws_results = extract_text_aws(image_bytes)
    sample_text = ' '.join(item[0] for item in aws_results[:10])
    # With no sample (Textract failed or found nothing) there is nothing to
    # classify, so fall back to English explicitly.
    detected_language = detect_language(sample_text) if sample_text.strip() else 'en'
    doctr_model, easyocr_reader, paddleocr_reader = load_models(detected_language)

    doctr_words = extract_text_doctr(file_path, doctr_model)
    easyocr_words = extract_text_easyocr(file_path, easyocr_reader)
    paddleocr_words = extract_text_paddleocr(file_path, paddleocr_reader)

    # Per their documentation, Textract and docTR report boxes as fractions of
    # the page, while EasyOCR and PaddleOCR report pixel coordinates. Bring the
    # pixel boxes into the same 0-1 space so intersection tests are meaningful.
    try:
        with Image.open(file_path) as image:
            width, height = image.size
    except (OSError, ValueError) as e:
        # Best-effort: without the image size, keep the raw boxes.
        print(f"Error reading image size, leaving pixel boxes unscaled: {str(e)}")
    else:
        easyocr_words = _scale_boxes_to_unit(easyocr_words, width, height)
        paddleocr_words = _scale_boxes_to_unit(paddleocr_words, width, height)

    results = {
        "aws": aws_results,
        "doctr": doctr_words,
        "easyocr": easyocr_words,
        "paddleocr": paddleocr_words,
    }
    return combine_ocr_results(results, weights)