pavansuresh committed on
Commit
c842ecc
·
verified ·
1 Parent(s): 0407128

Update ocr_utils.py

Browse files
Files changed (1) hide show
  1. ocr_utils.py +30 -7
ocr_utils.py CHANGED
@@ -5,11 +5,12 @@ import tempfile
5
 
6
  def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
7
  """
8
- Extract text, words, and bounding boxes from a scanned PDF using PyMuPDF and EasyOCR.
9
  Args:
10
  pdf_path (str): Path to the PDF file.
11
  Returns:
12
- list: List of dictionaries, each containing 'text' (str), 'words' (list of str), and 'bbox' (list of [x0, y0, x1, y1]) for each page.
 
13
  Returns empty list if failed.
14
  """
15
  try:
@@ -30,26 +31,48 @@ def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
30
  img_path = f"{temp_path}_page_{page_num}.png"
31
  pix.save(img_path)
32
 
 
 
 
33
  # Perform OCR using EasyOCR
34
  results = reader.readtext(img_path)
35
  text = " ".join([res[1] for res in results]) # Concatenated text for compatibility
36
  words = []
37
  bboxes = []
38
 
39
- # Split text segments into words and assign bounding boxes
40
  for res in results:
41
  segment_text = res[1]
42
  segment_bbox = [res[0][0][0], res[0][0][1], res[0][2][0], res[0][2][1]] # [x0, y0, x1, y1]
 
 
 
 
 
 
 
 
 
43
  segment_words = segment_text.split()
44
- # Assign the same bounding box to each word in the segment
45
  for word in segment_words:
46
  words.append(word)
47
- bboxes.append(segment_bbox)
48
 
49
  if text.strip():
50
- all_pages.append({"text": text, "words": words, "bbox": bboxes})
 
 
 
 
 
51
  else:
52
- all_pages.append({"text": f"Page {page_num + 1}: No text detected", "words": [], "bbox": []})
 
 
 
 
 
53
 
54
  # Clean up temporary image
55
  if os.path.exists(img_path):
 
5
 
6
  def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
7
  """
8
+ Extract text, words, and normalized bounding boxes from a scanned PDF using PyMuPDF and EasyOCR.
9
  Args:
10
  pdf_path (str): Path to the PDF file.
11
  Returns:
12
+ list: List of dictionaries, each containing 'text' (str), 'words' (list of str),
13
+ 'bbox' (list of [x0, y0, x1, y1] normalized to 0-1000), and 'image_dims' ([width, height]) for each page.
14
  Returns empty list if failed.
15
  """
16
  try:
 
31
  img_path = f"{temp_path}_page_{page_num}.png"
32
  pix.save(img_path)
33
 
34
+ # Get image dimensions
35
+ image_width, image_height = pix.width, pix.height
36
+
37
  # Perform OCR using EasyOCR
38
  results = reader.readtext(img_path)
39
  text = " ".join([res[1] for res in results]) # Concatenated text for compatibility
40
  words = []
41
  bboxes = []
42
 
43
+ # Split text segments into words and assign normalized bounding boxes
44
  for res in results:
45
  segment_text = res[1]
46
  segment_bbox = [res[0][0][0], res[0][0][1], res[0][2][0], res[0][2][1]] # [x0, y0, x1, y1]
47
+ # Normalize bounding box to 0-1000 range
48
+ normalized_bbox = [
49
+ int((segment_bbox[0] / image_width) * 1000),
50
+ int((segment_bbox[1] / image_height) * 1000),
51
+ int((segment_bbox[2] / image_width) * 1000),
52
+ int((segment_bbox[3] / image_height) * 1000)
53
+ ]
54
+ # Ensure coordinates are within 0-1000
55
+ normalized_bbox = [max(0, min(1000, coord)) for coord in normalized_bbox]
56
  segment_words = segment_text.split()
57
+ # Assign the same normalized bounding box to each word in the segment
58
  for word in segment_words:
59
  words.append(word)
60
+ bboxes.append(normalized_bbox)
61
 
62
  if text.strip():
63
+ all_pages.append({
64
+ "text": text,
65
+ "words": words,
66
+ "bbox": bboxes,
67
+ "image_dims": [image_width, image_height]
68
+ })
69
  else:
70
+ all_pages.append({
71
+ "text": f"Page {page_num + 1}: No text detected",
72
+ "words": [],
73
+ "bbox": [],
74
+ "image_dims": [image_width, image_height]
75
+ })
76
 
77
  # Clean up temporary image
78
  if os.path.exists(img_path):