kanha-upadhyay committed on
Commit
2e2af5e
·
1 Parent(s): 15ad0e8

Enhance PDFProcessorService and TextExtractor with improved logging and error handling

Browse files
src/services/_pdf_processor_service.py CHANGED
@@ -13,27 +13,22 @@ from src.utils import TextExtractor, model_manager
13
 
14
 
15
  class PDFProcessorService:
16
- """Async PDF processor for handling both digital and scanned PDFs."""
17
-
18
  def __init__(self):
19
- # Use the centralized model manager
20
  self._ensure_models_loaded()
21
 
22
  def _ensure_models_loaded(self):
23
- """Ensure models are loaded via the model manager."""
24
  if not model_manager.models_loaded:
25
- logger.info("🔄 Models not loaded, initializing model manager...")
26
- # This will trigger model loading if not already done
27
  _ = model_manager.doctr_model
 
28
 
29
  @property
30
  def doctr_model(self):
31
- """Get the loaded doctr model from model manager."""
32
  return model_manager.doctr_model
33
 
34
  @property
35
  def device(self):
36
- """Get the device being used from model manager."""
37
  return model_manager.device
38
 
39
  async def __aenter__(self):
@@ -43,60 +38,76 @@ class PDFProcessorService:
43
  pass
44
 
45
  async def is_pdf_scanned(self, pdf_path: str) -> bool:
46
- """Check if PDF is scanned (no extractable text)."""
47
 
48
  def _check_scanned():
49
- doc = fitz.open(pdf_path)
50
- for page in doc:
51
- text = page.get_text()
52
- if text.strip():
53
- return False
54
- return True
 
 
 
 
55
 
56
  return await asyncio.get_event_loop().run_in_executor(None, _check_scanned)
57
 
58
  async def save_uploaded_file(self, uploaded_file: UploadFile) -> str:
59
- file_name = uploaded_file.filename
60
- suffix = Path(file_name).suffix
61
- with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
62
- temp_path = tmp.name
63
- async with aiofiles.open(temp_path, "wb") as f:
64
- await f.write(await uploaded_file.read())
65
- return temp_path
 
 
 
 
 
 
66
 
67
  async def extract_text_from_digital_pdf(self, pdf_path: str) -> List[List[str]]:
68
- """Extract text from digital PDF using PyPDF2."""
69
 
70
  async def _extract_text():
71
- doc = fitz.open(pdf_path)
72
- extracted_data = []
73
-
74
- for page in doc:
75
- ptext = page.get_text()
76
- if ptext:
77
- data = []
78
- for line in ptext.splitlines():
79
- cleaned_line = await self._split_on_repeated_pattern(
80
- line.strip()
81
- )
82
- if cleaned_line:
83
- data.append(cleaned_line[0])
84
- extracted_data.append(data)
85
-
86
- return extracted_data
 
 
 
 
 
 
 
87
 
88
  return await asyncio.get_event_loop().run_in_executor(None, _extract_text)
89
 
90
  async def _split_on_repeated_pattern(
91
  self, line: str, min_space: int = 10
92
  ) -> List[str]:
93
- """Split line on repeated pattern."""
94
  import re
95
  from difflib import SequenceMatcher
96
 
97
  original_line = line.strip()
98
 
99
- # Find all spans of spaces >= min_space
100
  space_spans = [
101
  (m.start(), len(m.group()))
102
  for m in re.finditer(r" {%d,}" % min_space, original_line)
@@ -105,27 +116,22 @@ class PDFProcessorService:
105
  if not space_spans:
106
  return [original_line]
107
 
108
- # Count how often each gap size occurs
109
  gaps = [span[1] for span in space_spans]
110
  gap_counts = {}
111
  for g in gaps:
112
  gap_counts[g] = gap_counts.get(g, 0) + 1
113
 
114
- # Sort gaps by size × count (more dominant gaps first)
115
  sorted_gaps = sorted(
116
  gap_counts.items(), key=lambda x: x[1] * x[0], reverse=True
117
  )
118
 
119
- # No significant gaps, return original
120
  if not sorted_gaps:
121
  return [original_line]
122
 
123
  dominant_gap = sorted_gaps[0][0]
124
 
125
- # Use the dominant large gap to split
126
  chunks = re.split(rf" {{%d,}}" % dominant_gap, original_line)
127
 
128
- # Check if it's actually repeated using fuzzy match
129
  base = chunks[0].strip()
130
  repeated = False
131
  for chunk in chunks[1:]:
@@ -137,38 +143,64 @@ class PDFProcessorService:
137
  return [base] if repeated else [original_line]
138
 
139
  async def process_pdf(self, file):
140
- pdf_path = await self.save_uploaded_file(file)
141
- is_scanned = await self.is_pdf_scanned(pdf_path)
142
- text_extractor = TextExtractor(self.doctr_model)
143
- if is_scanned:
144
- logger.info(f"{pdf_path} is likely a scanned PDF.")
145
- extracted_text_list = (
146
- await text_extractor.extract_lines_with_bbox_from_scanned_pdf(pdf_path)
147
- )
148
- else:
149
- logger.info(f"{pdf_path} is not a scanned PDF. Extracting text...")
150
- extracted_text_list = await text_extractor.extract_lines_with_bbox(pdf_path)
151
- pdf_text = ""
152
- for block in extracted_text_list:
153
- for line in block:
154
- pdf_text += " " + line["line"]
155
- text_noisy = text_extractor.is_text_noisy(pdf_text)
156
- if text_noisy:
157
- logger.info("Text is noisy. Extracting text again...")
158
  extracted_text_list = (
159
  await text_extractor.extract_lines_with_bbox_from_scanned_pdf(
160
  pdf_path
161
  )
162
  )
163
- return extracted_text_list
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
  async def extract_entity(self, text: str):
166
- text = re.sub(r"[^\w\s]", " ", text)
167
- doc = model_manager.spacy_model(text)
168
- entities = {ent.text: ent.label_ for ent in doc.ents}
169
- for key, value in entities.items():
170
- if value == "ORG":
171
- return key
172
- if entities:
173
- return list(entities.keys())[0]
174
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
 
15
  class PDFProcessorService:
 
 
16
  def __init__(self):
      """Create the service and make sure the shared models are available."""
      logger.info("Initializing PDFProcessorService")
      self._ensure_models_loaded()
19
 
20
  def _ensure_models_loaded(self):
 
21
  if not model_manager.models_loaded:
22
+ logger.info("Models not loaded, initializing model manager...")
 
23
  _ = model_manager.doctr_model
24
+ logger.debug("Model manager initialization completed")
25
 
26
  @property
  def doctr_model(self):
      """Return the ``doctr_model`` exposed by the shared ``model_manager``."""
      return model_manager.doctr_model
29
 
30
  @property
  def device(self):
      """Return the ``device`` reported by the shared ``model_manager``."""
      return model_manager.device
33
 
34
  async def __aenter__(self):
 
38
  pass
39
 
40
  async def is_pdf_scanned(self, pdf_path: str) -> bool:
41
+ logger.debug(f"Checking if PDF is scanned: {pdf_path}")
42
 
43
  def _check_scanned():
44
+ try:
45
+ doc = fitz.open(pdf_path)
46
+ for page in doc:
47
+ text = page.get_text()
48
+ if text.strip():
49
+ return False
50
+ return True
51
+ except Exception as e:
52
+ logger.error(f"Error checking if PDF is scanned: {e}")
53
+ raise
54
 
55
  return await asyncio.get_event_loop().run_in_executor(None, _check_scanned)
56
 
57
  async def save_uploaded_file(self, uploaded_file: UploadFile) -> str:
58
+ logger.info(f"Saving uploaded file: {uploaded_file.filename}")
59
+ try:
60
+ file_name = uploaded_file.filename
61
+ suffix = Path(file_name).suffix
62
+ with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
63
+ temp_path = tmp.name
64
+ async with aiofiles.open(temp_path, "wb") as f:
65
+ await f.write(await uploaded_file.read())
66
+ logger.debug(f"File saved to temporary path: {temp_path}")
67
+ return temp_path
68
+ except Exception as e:
69
+ logger.error(f"Error saving uploaded file: {e}")
70
+ raise
71
 
72
  async def extract_text_from_digital_pdf(self, pdf_path: str) -> List[List[str]]:
73
+ logger.debug(f"Extracting text from digital PDF: {pdf_path}")
74
 
75
  async def _extract_text():
76
+ try:
77
+ doc = fitz.open(pdf_path)
78
+ extracted_data = []
79
+
80
+ for page in doc:
81
+ ptext = page.get_text()
82
+ if ptext:
83
+ data = []
84
+ for line in ptext.splitlines():
85
+ cleaned_line = await self._split_on_repeated_pattern(
86
+ line.strip()
87
+ )
88
+ if cleaned_line:
89
+ data.append(cleaned_line[0])
90
+ extracted_data.append(data)
91
+
92
+ logger.info(
93
+ f"Successfully extracted text from {len(extracted_data)} pages"
94
+ )
95
+ return extracted_data
96
+ except Exception as e:
97
+ logger.error(f"Error extracting text from digital PDF: {e}")
98
+ raise
99
 
100
  return await asyncio.get_event_loop().run_in_executor(None, _extract_text)
101
 
102
  async def _split_on_repeated_pattern(
103
  self, line: str, min_space: int = 10
104
  ) -> List[str]:
105
+ logger.debug(f"Processing line for repeated patterns: {line[:50]}...")
106
  import re
107
  from difflib import SequenceMatcher
108
 
109
  original_line = line.strip()
110
 
 
111
  space_spans = [
112
  (m.start(), len(m.group()))
113
  for m in re.finditer(r" {%d,}" % min_space, original_line)
 
116
  if not space_spans:
117
  return [original_line]
118
 
 
119
  gaps = [span[1] for span in space_spans]
120
  gap_counts = {}
121
  for g in gaps:
122
  gap_counts[g] = gap_counts.get(g, 0) + 1
123
 
 
124
  sorted_gaps = sorted(
125
  gap_counts.items(), key=lambda x: x[1] * x[0], reverse=True
126
  )
127
 
 
128
  if not sorted_gaps:
129
  return [original_line]
130
 
131
  dominant_gap = sorted_gaps[0][0]
132
 
 
133
  chunks = re.split(rf" {{%d,}}" % dominant_gap, original_line)
134
 
 
135
  base = chunks[0].strip()
136
  repeated = False
137
  for chunk in chunks[1:]:
 
143
  return [base] if repeated else [original_line]
144
 
145
  async def process_pdf(self, file):
146
+ logger.info(f"Processing PDF file: {file.filename}")
147
+ try:
148
+ pdf_path = await self.save_uploaded_file(file)
149
+ is_scanned = await self.is_pdf_scanned(pdf_path)
150
+ text_extractor = TextExtractor(self.doctr_model)
151
+
152
+ if is_scanned:
153
+ logger.info(f"PDF {pdf_path} is scanned, using OCR extraction")
 
 
 
 
 
 
 
 
 
 
154
  extracted_text_list = (
155
  await text_extractor.extract_lines_with_bbox_from_scanned_pdf(
156
  pdf_path
157
  )
158
  )
159
+ else:
160
+ logger.info(f"PDF {pdf_path} is digital, extracting text directly")
161
+ extracted_text_list = await text_extractor.extract_lines_with_bbox(
162
+ pdf_path
163
+ )
164
+ pdf_text = ""
165
+ for block in extracted_text_list:
166
+ for line in block:
167
+ pdf_text += " " + line["line"]
168
+ text_noisy = text_extractor.is_text_noisy(pdf_text)
169
+ if text_noisy:
170
+ logger.warning("Text is noisy, falling back to OCR extraction")
171
+ extracted_text_list = (
172
+ await text_extractor.extract_lines_with_bbox_from_scanned_pdf(
173
+ pdf_path
174
+ )
175
+ )
176
+
177
+ logger.info(
178
+ f"Successfully processed PDF with {len(extracted_text_list)} text blocks"
179
+ )
180
+ return extracted_text_list
181
+ except Exception as e:
182
+ logger.error(f"Error processing PDF: {e}")
183
+ raise
184
 
185
  async def extract_entity(self, text: str):
186
+ logger.debug(f"Extracting entities from text: {text[:100]}...")
187
+ try:
188
+ text = re.sub(r"[^\w\s]", " ", text)
189
+ doc = model_manager.spacy_model(text)
190
+ entities = {ent.text: ent.label_ for ent in doc.ents}
191
+
192
+ for key, value in entities.items():
193
+ if value == "ORG":
194
+ logger.info(f"Found organization entity: {key}")
195
+ return key
196
+
197
+ if entities:
198
+ entity = list(entities.keys())[0]
199
+ logger.info(f"Found entity: {entity}")
200
+ return entity
201
+
202
+ logger.debug("No entities found, returning original text")
203
+ return text
204
+ except Exception as e:
205
+ logger.error(f"Error extracting entities: {e}")
206
+ return text
src/utils/_text_extractor.py CHANGED
@@ -8,13 +8,13 @@ from typing import Dict, List
8
 
9
  import fitz
10
  import numpy as np
 
11
  from pdf2image import convert_from_path
12
 
13
 
14
  class TextExtractor:
15
- """Async text extractor for extracting text with bounding boxes."""
16
-
17
  def __init__(self, doctr_model):
 
18
  self.doctr_model = doctr_model
19
  self.noise_pattern = [
20
  r"\b[A-Z]{6,}\b",
@@ -22,6 +22,7 @@ class TextExtractor:
22
  r"(\d)\1{5,}",
23
  r"\b(?=[A-Za-z]*\d)(?=\d*[A-Za-z])[A-Za-z\d]{8,}\b",
24
  ]
 
25
 
26
  async def __aenter__(self):
27
  return self
@@ -30,33 +31,36 @@ class TextExtractor:
30
  pass
31
 
32
  def normalize_bbox(self, bbox, width: float, height: float) -> List[float]:
33
- """Normalize bounding box (x0, y0, x1, y1) to range [0, 1]."""
34
  x0, y0, x1, y1 = bbox
35
- return [
36
  round(x0 / width, 6),
37
  round(y0 / height, 6),
38
  round(x1 / width, 6),
39
  round(y1 / height, 6),
40
  ]
 
 
41
 
42
  def remove_consecutive_items(self, line: List[str]) -> List[str]:
43
- """Remove consecutive duplicate items from a list."""
44
  if not line:
45
  return line
46
  result = [line[0]]
47
  for item in line[1:]:
48
  if item != result[-1]:
49
  result.append(item)
 
50
  return result
51
 
52
  def remove_consecutive_words(self, word_data: List[Dict]) -> List[Dict]:
53
- """Remove consecutive duplicate words from word data."""
54
  if not word_data:
55
  return word_data
56
  result = [word_data[0]]
57
  for i in range(1, len(word_data)):
58
  if word_data[i]["word"] != result[-1]["word"]:
59
  result.append(word_data[i])
 
 
 
60
  return result
61
 
62
  def shannon_entropy(self, text: str) -> float:
@@ -69,17 +73,9 @@ class TextExtractor:
69
  )
70
 
71
  def reconstruct_line_from_bboxes(self, words, space_unit=5):
72
- """
73
- Reconstructs a line with appropriate spacing based on word bounding boxes.
74
-
75
- Parameters:
76
- - words: list of dicts with 'word' and 'bbox' (bbox = [x0, y0, x1, y1])
77
- - space_unit: how many pixels roughly correspond to one space
78
-
79
- Returns:
80
- - str: reconstructed line with spaces
81
- """
82
- # Sort words by x-coordinate (left to right)
83
  words = sorted(words, key=lambda w: w["bbox"][0])
84
 
85
  line = ""
@@ -89,88 +85,113 @@ class TextExtractor:
89
  start_x = word_info["bbox"][0]
90
 
91
  if prev_end_x is not None:
92
- # Calculate gap between previous word and current word
93
  gap = max(0, start_x - prev_end_x)
94
  num_spaces = int(round(gap / space_unit))
95
  line += " " * num_spaces
96
 
97
  line += word
98
- prev_end_x = word_info["bbox"][2] # x1 of current word
99
 
 
100
  return line
101
 
102
  def is_text_noisy(self, text: str) -> bool:
103
- """Check if text is noisy (contains special characters)."""
104
  total_chars = len(text)
105
- if total_chars < 50: # skip empty or small pages
 
106
  return True
107
 
108
  tokens = re.findall(r"\b\w+\b", text)
109
  total_words = len(tokens)
110
 
111
- # Symbol & digit density
112
  digit_count = len(re.findall(r"\d", text))
113
- symbol_count = len(
114
- re.findall(r"[^\w\s]", text)
115
- ) # anything not a word char or whitespace
116
  symbol_density = symbol_count / total_chars
117
  digit_density = digit_count / total_chars
118
 
119
- # Repeating char patterns like "22222222222" or "!!!!!!"
120
- long_repeats = len(re.findall(r"(.)\1{5,}", text)) # any char repeated 6+ times
121
-
122
- # Entropy: randomness of characters
123
  entropy = self.shannon_entropy(text)
124
 
125
- # Heuristics tuned for your sample
126
- if (
127
  entropy > 4.0
128
  and symbol_density > 0.15
129
  and digit_density > 0.15
130
  and long_repeats > 1
131
  and total_words > 30
132
- ):
133
- return True
134
- return False
 
 
 
 
 
135
 
136
  async def extract_lines_with_bbox(self, pdf_path: str, y_threshold: float = 3.0):
137
- """Extract lines with bounding boxes from digital PDF."""
138
 
139
  def _extract_lines():
140
- doc = fitz.open(pdf_path)
141
- page_lines_with_bbox = []
142
-
143
- for page in doc:
144
- words = page.get_text(
145
- "words"
146
- ) # (x0, y0, x1, y1, word, block_no, line_no, word_no)
147
- words.sort(key=lambda w: (round(w[1], 1), w[0])) # sort by y then x
148
-
149
- lines = []
150
- current_line = []
151
- current_y = None
152
- current_word_data = []
153
-
154
- for w in words:
155
- x0, y0, x1, y1, word = w[:5]
156
- if (
157
- word == "|"
158
- or not word
159
- or word == "."
160
- or word == "#"
161
- or re.sub(r"[^\w\s-]", "", word) == ""
162
- or re.sub(r"\d{19,}", "", word) == ""
163
- ):
164
- continue
165
- word = word.lower()
166
- word = word.replace("$", "")
167
- word_data = {"word": word.strip(), "bbox": (x0, y0, x1, y1)}
168
-
169
- if current_y is None or abs(y0 - current_y) < y_threshold:
170
- current_line.append((x0, y0, word))
171
- current_y = y0
172
- current_word_data.append(word_data)
173
- else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  current_line.sort()
175
  line_words = [w[2] for w in current_line]
176
  clean_line = self.remove_consecutive_items(line_words)
@@ -192,43 +213,29 @@ class TextExtractor:
192
  "words": clean_word_data,
193
  }
194
  )
195
- current_line = [(x0, y0, word)]
196
- current_y = y0
197
- current_word_data = [word_data]
198
-
199
- # Process remaining line
200
- if current_line:
201
- current_line.sort()
202
- line_words = [w[2] for w in current_line]
203
- clean_line = self.remove_consecutive_items(line_words)
204
- current_word_data = sorted(
205
- current_word_data, key=lambda w: w["bbox"][0]
206
- )
207
- clean_word_data = self.remove_consecutive_words(current_word_data)
208
-
209
- if clean_line:
210
- x_start = min([w[0] for w in current_line])
211
- y_start = min([w[1] for w in current_line])
212
- if re.sub(r"\d{13,}", "", " ".join(clean_line)) != "":
213
- lines.append(
214
- {
215
- "line": " ".join(clean_line),
216
- "bbox": [x_start, y_start],
217
- "words": clean_word_data,
218
- }
219
- )
220
 
221
- page_lines_with_bbox.append(lines)
 
222
 
223
- return page_lines_with_bbox
 
 
 
 
 
 
224
 
225
  return await asyncio.get_event_loop().run_in_executor(None, _extract_lines)
226
 
227
  def create_page_chunks(self, num_pages: int, cpu_core: int):
 
 
 
228
  final_ranges = []
229
  page_per_cpu = 2
230
  for i in range(1, num_pages + 1, page_per_cpu + 1):
231
  final_ranges.append([i, min(i + page_per_cpu, num_pages)])
 
232
  return final_ranges
233
 
234
  def process_page_parallel_async(
@@ -246,6 +253,7 @@ class TextExtractor:
246
  async def process_pages_concurrently(self, pdf_path: str, page_range: List[int]):
247
  start_page = page_range[0]
248
  end_page = page_range[1]
 
249
 
250
  tasks = []
251
  for page in range(start_page, end_page + 1):
@@ -255,117 +263,165 @@ class TextExtractor:
255
  page_results.sort(key=lambda x: x[0])
256
 
257
  chunk_outputs = [output for page_num, output in page_results]
 
258
 
259
  return page_range, chunk_outputs
260
 
261
  async def process_page_parallel(self, pdf_path: str, i: int):
262
- print(f"Processing page {i}")
263
- pages = convert_from_path(pdf_path, dpi=300, first_page=i, last_page=i)
264
- page_imgs = [page.convert("RGB") for page in pages]
265
- output = self.doctr_model([np.array(img) for img in page_imgs])
266
- return i, output
 
 
 
 
 
267
 
268
  async def extract_lines_with_bbox_from_scanned_pdf(
269
  self, pdf_path: str, y_threshold: float = 5.0, first_page: bool = False
270
  ):
271
- """Extract lines with bounding boxes from scanned PDF using OCR."""
 
 
272
 
273
  def _extract_from_scanned():
274
- result = None
275
- doc = None
276
-
277
- if first_page:
278
- number_of_pages = fitz.open(pdf_path).page_count
279
- if number_of_pages < 3:
280
- pages = convert_from_path(
281
- pdf_path, dpi=300, first_page=1, last_page=number_of_pages
282
  )
283
- else:
284
- pages = convert_from_path(
285
- pdf_path, dpi=300, first_page=1, last_page=3
286
- )
287
- first_page_img = [page.convert("RGB") for page in pages]
288
- result = self.doctr_model([np.array(img) for img in first_page_img])
289
- doc = [np.array(img) for img in first_page_img]
290
- else:
291
- pdf = fitz.open(pdf_path)
292
- num_pages = pdf.page_count
293
- page_witdh_f = pdf[0].rect.width
294
- page_height_f = pdf[0].rect.height
295
- page_chunks = self.create_page_chunks(
296
- num_pages, multiprocessing.cpu_count()
297
- )
298
- with ThreadPoolExecutor(
299
- max_workers=multiprocessing.cpu_count()
300
- ) as executor:
301
- futures = []
302
- for chunk in page_chunks:
303
- futures.append(
304
- executor.submit(
305
- self.process_page_parallel_async, pdf_path, chunk, self
306
- )
307
  )
308
- results = [f.result() for f in futures]
309
- results.sort(key=lambda x: x[0][0])
310
- result = []
311
- for r in results:
312
- result.extend(r[1])
313
- results = result
314
- page_lines_with_bbox = []
315
-
316
- for result in results:
317
- for page in result.pages:
318
- if first_page:
319
- img_width, img_height = doc[0].shape[1], doc[0].shape[0]
320
  else:
321
- img_width, img_height = page_witdh_f, page_height_f
322
- words = []
323
-
324
- for block in page.blocks:
325
- for line in block.lines:
326
- for word in line.words:
327
- x0, y0 = word.geometry[0]
328
- x1, y1 = word.geometry[1]
329
- abs_x0 = x0 * img_width
330
- abs_y0 = y0 * img_height
331
- abs_x1 = x1 * img_width
332
- abs_y1 = y1 * img_height
333
- text = word.value.strip().lower()
334
- text = re.sub(r"[#*]", " ", text)
335
- text = re.sub(f"[$]", "", text)
336
- text = text.strip()
337
-
338
- if (
339
- text == "|"
340
- or not text
341
- or text == "."
342
- or text == "#"
343
- or re.sub(r"[^\w\s-]", "", text) == ""
344
- or re.sub(r"\d{19,}", "", text) == ""
345
- ):
346
- continue
347
- words.append(
348
- {
349
- "word": text,
350
- "bbox": [abs_x0, abs_y0, abs_x1, abs_y1],
351
- }
352
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
353
 
354
- # Sort words by y then x
355
- words.sort(key=lambda w: (round(w["bbox"][1], 3), w["bbox"][0]))
356
-
357
- lines = []
358
- current_line = []
359
- current_word_data = []
360
- current_y = None
361
-
362
- for w in words:
363
- y0 = w["bbox"][1]
364
- if current_y is None or abs(y0 - current_y) < y_threshold:
365
- current_line.append((w["bbox"][0], y0, w["word"]))
366
- current_word_data.append(w)
367
- current_y = y0
368
- else:
 
369
  current_line.sort()
370
  line_words = [x[2] for x in current_line]
371
  clean_line = self.remove_consecutive_items(line_words)
@@ -387,35 +443,16 @@ class TextExtractor:
387
  "words": clean_word_data,
388
  }
389
  )
390
- current_line = [(w["bbox"][0], y0, w["word"])]
391
- current_word_data = [w]
392
- current_y = y0
393
-
394
- # Final remaining line
395
- if current_line:
396
- current_line.sort()
397
- line_words = [x[2] for x in current_line]
398
- clean_line = self.remove_consecutive_items(line_words)
399
- current_word_data = sorted(
400
- current_word_data, key=lambda w: w["bbox"][0]
401
- )
402
- clean_word_data = self.remove_consecutive_words(current_word_data)
403
-
404
- if clean_line:
405
- x_start = min(x[0] for x in current_line)
406
- y_start = min(x[1] for x in current_line)
407
- if re.sub(r"\d{13,}", "", " ".join(clean_line)) != "":
408
- lines.append(
409
- {
410
- "line": " ".join(clean_line),
411
- "bbox": [x_start, y_start],
412
- "words": clean_word_data,
413
- }
414
- )
415
 
416
- page_lines_with_bbox.append(lines)
417
 
418
- return page_lines_with_bbox
 
 
 
 
 
 
419
 
420
  return await asyncio.get_event_loop().run_in_executor(
421
  None, _extract_from_scanned
 
8
 
9
  import fitz
10
  import numpy as np
11
+ from loguru import logger
12
  from pdf2image import convert_from_path
13
 
14
 
15
  class TextExtractor:
 
 
16
  def __init__(self, doctr_model):
17
+ logger.info("Initializing TextExtractor")
18
  self.doctr_model = doctr_model
19
  self.noise_pattern = [
20
  r"\b[A-Z]{6,}\b",
 
22
  r"(\d)\1{5,}",
23
  r"\b(?=[A-Za-z]*\d)(?=\d*[A-Za-z])[A-Za-z\d]{8,}\b",
24
  ]
25
+ logger.debug(f"Initialized with {len(self.noise_pattern)} noise patterns")
26
 
27
  async def __aenter__(self):
28
  return self
 
31
  pass
32
 
33
  def normalize_bbox(self, bbox, width: float, height: float) -> List[float]:
 
34
  x0, y0, x1, y1 = bbox
35
+ normalized = [
36
  round(x0 / width, 6),
37
  round(y0 / height, 6),
38
  round(x1 / width, 6),
39
  round(y1 / height, 6),
40
  ]
41
+ logger.debug(f"Normalized bbox from {bbox} to {normalized}")
42
+ return normalized
43
 
44
  def remove_consecutive_items(self, line: List[str]) -> List[str]:
 
45
  if not line:
46
  return line
47
  result = [line[0]]
48
  for item in line[1:]:
49
  if item != result[-1]:
50
  result.append(item)
51
+ logger.debug(f"Removed consecutive items: {len(line)} -> {len(result)} items")
52
  return result
53
 
54
  def remove_consecutive_words(self, word_data: List[Dict]) -> List[Dict]:
 
55
  if not word_data:
56
  return word_data
57
  result = [word_data[0]]
58
  for i in range(1, len(word_data)):
59
  if word_data[i]["word"] != result[-1]["word"]:
60
  result.append(word_data[i])
61
+ logger.debug(
62
+ f"Removed consecutive words: {len(word_data)} -> {len(result)} words"
63
+ )
64
  return result
65
 
66
  def shannon_entropy(self, text: str) -> float:
 
73
  )
74
 
75
  def reconstruct_line_from_bboxes(self, words, space_unit=5):
76
+ logger.debug(
77
+ f"Reconstructing line from {len(words)} words with space_unit={space_unit}"
78
+ )
 
 
 
 
 
 
 
 
79
  words = sorted(words, key=lambda w: w["bbox"][0])
80
 
81
  line = ""
 
85
  start_x = word_info["bbox"][0]
86
 
87
  if prev_end_x is not None:
 
88
  gap = max(0, start_x - prev_end_x)
89
  num_spaces = int(round(gap / space_unit))
90
  line += " " * num_spaces
91
 
92
  line += word
93
+ prev_end_x = word_info["bbox"][2]
94
 
95
+ logger.debug(f"Reconstructed line: '{line[:100]}...'")
96
  return line
97
 
98
  def is_text_noisy(self, text: str) -> bool:
99
+ logger.debug(f"Checking if text is noisy: {len(text)} characters")
100
  total_chars = len(text)
101
+ if total_chars < 50:
102
+ logger.debug("Text too short, marking as noisy")
103
  return True
104
 
105
  tokens = re.findall(r"\b\w+\b", text)
106
  total_words = len(tokens)
107
 
 
108
  digit_count = len(re.findall(r"\d", text))
109
+ symbol_count = len(re.findall(r"[^\w\s]", text))
 
 
110
  symbol_density = symbol_count / total_chars
111
  digit_density = digit_count / total_chars
112
 
113
+ long_repeats = len(re.findall(r"(.)\1{5,}", text))
 
 
 
114
  entropy = self.shannon_entropy(text)
115
 
116
+ is_noisy = (
 
117
  entropy > 4.0
118
  and symbol_density > 0.15
119
  and digit_density > 0.15
120
  and long_repeats > 1
121
  and total_words > 30
122
+ )
123
+
124
+ logger.debug(
125
+ f"Noise analysis - entropy: {entropy:.2f}, symbol_density: {symbol_density:.2f}, "
126
+ f"digit_density: {digit_density:.2f}, long_repeats: {long_repeats}, "
127
+ f"total_words: {total_words}, is_noisy: {is_noisy}"
128
+ )
129
+ return is_noisy
130
 
131
  async def extract_lines_with_bbox(self, pdf_path: str, y_threshold: float = 3.0):
132
+ logger.info(f"Extracting lines with bbox from digital PDF: {pdf_path}")
133
 
134
  def _extract_lines():
135
+ try:
136
+ doc = fitz.open(pdf_path)
137
+ page_lines_with_bbox = []
138
+
139
+ for page_num, page in enumerate(doc):
140
+ logger.debug(f"Processing page {page_num + 1}")
141
+ words = page.get_text("words")
142
+ words.sort(key=lambda w: (round(w[1], 1), w[0]))
143
+
144
+ lines = []
145
+ current_line = []
146
+ current_y = None
147
+ current_word_data = []
148
+
149
+ for w in words:
150
+ x0, y0, x1, y1, word = w[:5]
151
+ if (
152
+ word == "|"
153
+ or not word
154
+ or word == "."
155
+ or word == "#"
156
+ or re.sub(r"[^\w\s-]", "", word) == ""
157
+ or re.sub(r"\d{19,}", "", word) == ""
158
+ ):
159
+ continue
160
+ word = word.lower()
161
+ word = word.replace("$", "")
162
+ word_data = {"word": word.strip(), "bbox": (x0, y0, x1, y1)}
163
+
164
+ if current_y is None or abs(y0 - current_y) < y_threshold:
165
+ current_line.append((x0, y0, word))
166
+ current_y = y0
167
+ current_word_data.append(word_data)
168
+ else:
169
+ current_line.sort()
170
+ line_words = [w[2] for w in current_line]
171
+ clean_line = self.remove_consecutive_items(line_words)
172
+ current_word_data = sorted(
173
+ current_word_data, key=lambda w: w["bbox"][0]
174
+ )
175
+ clean_word_data = self.remove_consecutive_words(
176
+ current_word_data
177
+ )
178
+
179
+ if clean_line:
180
+ x_start = min([w[0] for w in current_line])
181
+ y_start = min([w[1] for w in current_line])
182
+ if re.sub(r"\d{13,}", "", " ".join(clean_line)) != "":
183
+ lines.append(
184
+ {
185
+ "line": " ".join(clean_line),
186
+ "bbox": [x_start, y_start],
187
+ "words": clean_word_data,
188
+ }
189
+ )
190
+ current_line = [(x0, y0, word)]
191
+ current_y = y0
192
+ current_word_data = [word_data]
193
+
194
+ if current_line:
195
  current_line.sort()
196
  line_words = [w[2] for w in current_line]
197
  clean_line = self.remove_consecutive_items(line_words)
 
213
  "words": clean_word_data,
214
  }
215
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
 
217
+ logger.debug(f"Page {page_num + 1}: extracted {len(lines)} lines")
218
+ page_lines_with_bbox.append(lines)
219
 
220
+ logger.info(
221
+ f"Successfully extracted lines from {len(page_lines_with_bbox)} pages"
222
+ )
223
+ return page_lines_with_bbox
224
+ except Exception as e:
225
+ logger.error(f"Error extracting lines from digital PDF: {e}")
226
+ raise
227
 
228
  return await asyncio.get_event_loop().run_in_executor(None, _extract_lines)
229
 
230
  def create_page_chunks(self, num_pages: int, cpu_core: int):
231
+ logger.debug(
232
+ f"Creating page chunks for {num_pages} pages using {cpu_core} CPU cores"
233
+ )
234
  final_ranges = []
235
  page_per_cpu = 2
236
  for i in range(1, num_pages + 1, page_per_cpu + 1):
237
  final_ranges.append([i, min(i + page_per_cpu, num_pages)])
238
+ logger.debug(f"Created {len(final_ranges)} page chunks: {final_ranges}")
239
  return final_ranges
240
 
241
  def process_page_parallel_async(
 
253
  async def process_pages_concurrently(self, pdf_path: str, page_range: List[int]):
254
  start_page = page_range[0]
255
  end_page = page_range[1]
256
+ logger.debug(f"Processing pages {start_page}-{end_page} concurrently")
257
 
258
  tasks = []
259
  for page in range(start_page, end_page + 1):
 
263
  page_results.sort(key=lambda x: x[0])
264
 
265
  chunk_outputs = [output for page_num, output in page_results]
266
+ logger.debug(f"Completed processing pages {start_page}-{end_page}")
267
 
268
  return page_range, chunk_outputs
269
 
270
  async def process_page_parallel(self, pdf_path: str, i: int):
271
+ logger.debug(f"Processing page {i}")
272
+ try:
273
+ pages = convert_from_path(pdf_path, dpi=300, first_page=i, last_page=i)
274
+ page_imgs = [page.convert("RGB") for page in pages]
275
+ output = self.doctr_model([np.array(img) for img in page_imgs])
276
+ logger.debug(f"Successfully processed page {i}")
277
+ return i, output
278
+ except Exception as e:
279
+ logger.error(f"Error processing page {i}: {e}")
280
+ raise
281
 
282
  async def extract_lines_with_bbox_from_scanned_pdf(
283
  self, pdf_path: str, y_threshold: float = 5.0, first_page: bool = False
284
  ):
285
+ logger.info(
286
+ f"Extracting lines from scanned PDF: {pdf_path} (first_page: {first_page})"
287
+ )
288
 
289
  def _extract_from_scanned():
290
+ try:
291
+ result = None
292
+ doc = None
293
+
294
+ if first_page:
295
+ number_of_pages = fitz.open(pdf_path).page_count
296
+ logger.debug(
297
+ f"Processing first page(s) only, total pages: {number_of_pages}"
298
  )
299
+ if number_of_pages < 3:
300
+ pages = convert_from_path(
301
+ pdf_path, dpi=300, first_page=1, last_page=number_of_pages
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
  )
 
 
 
 
 
 
 
 
 
 
 
 
303
  else:
304
+ pages = convert_from_path(
305
+ pdf_path, dpi=300, first_page=1, last_page=3
306
+ )
307
+ first_page_img = [page.convert("RGB") for page in pages]
308
+ result = self.doctr_model([np.array(img) for img in first_page_img])
309
+ doc = [np.array(img) for img in first_page_img]
310
+ else:
311
+ logger.debug("Processing all pages using parallel processing")
312
+ pdf = fitz.open(pdf_path)
313
+ num_pages = pdf.page_count
314
+ page_witdh_f = pdf[0].rect.width
315
+ page_height_f = pdf[0].rect.height
316
+ page_chunks = self.create_page_chunks(
317
+ num_pages, multiprocessing.cpu_count()
318
+ )
319
+ logger.info(
320
+ f"Processing {num_pages} pages using {multiprocessing.cpu_count()} CPU cores"
321
+ )
322
+ with ThreadPoolExecutor(
323
+ max_workers=multiprocessing.cpu_count()
324
+ ) as executor:
325
+ futures = []
326
+ for chunk in page_chunks:
327
+ futures.append(
328
+ executor.submit(
329
+ self.process_page_parallel_async,
330
+ pdf_path,
331
+ chunk,
332
+ self,
 
 
333
  )
334
+ )
335
+ results = [f.result() for f in futures]
336
+ results.sort(key=lambda x: x[0][0])
337
+ result = []
338
+ for r in results:
339
+ result.extend(r[1])
340
+ results = result
341
+
342
+ page_lines_with_bbox = []
343
+
344
+ for result_idx, result in enumerate(results):
345
+ logger.debug(
346
+ f"Processing OCR result {result_idx + 1}/{len(results)}"
347
+ )
348
+ for page in result.pages:
349
+ if first_page:
350
+ img_width, img_height = doc[0].shape[1], doc[0].shape[0]
351
+ else:
352
+ img_width, img_height = page_witdh_f, page_height_f
353
+ words = []
354
+
355
+ for block in page.blocks:
356
+ for line in block.lines:
357
+ for word in line.words:
358
+ x0, y0 = word.geometry[0]
359
+ x1, y1 = word.geometry[1]
360
+ abs_x0 = x0 * img_width
361
+ abs_y0 = y0 * img_height
362
+ abs_x1 = x1 * img_width
363
+ abs_y1 = y1 * img_height
364
+ text = word.value.strip().lower()
365
+ text = re.sub(r"[#*]", " ", text)
366
+ text = re.sub(f"[$]", "", text)
367
+ text = text.strip()
368
+
369
+ if (
370
+ text == "|"
371
+ or not text
372
+ or text == "."
373
+ or text == "#"
374
+ or re.sub(r"[^\w\s-]", "", text) == ""
375
+ or re.sub(r"\d{19,}", "", text) == ""
376
+ ):
377
+ continue
378
+ words.append(
379
+ {
380
+ "word": text,
381
+ "bbox": [abs_x0, abs_y0, abs_x1, abs_y1],
382
+ }
383
+ )
384
+
385
+ words.sort(key=lambda w: (round(w["bbox"][1], 3), w["bbox"][0]))
386
+
387
+ lines = []
388
+ current_line = []
389
+ current_word_data = []
390
+ current_y = None
391
+
392
+ for w in words:
393
+ y0 = w["bbox"][1]
394
+ if current_y is None or abs(y0 - current_y) < y_threshold:
395
+ current_line.append((w["bbox"][0], y0, w["word"]))
396
+ current_word_data.append(w)
397
+ current_y = y0
398
+ else:
399
+ current_line.sort()
400
+ line_words = [x[2] for x in current_line]
401
+ clean_line = self.remove_consecutive_items(line_words)
402
+ current_word_data = sorted(
403
+ current_word_data, key=lambda w: w["bbox"][0]
404
+ )
405
+ clean_word_data = self.remove_consecutive_words(
406
+ current_word_data
407
+ )
408
 
409
+ if clean_line:
410
+ x_start = min(x[0] for x in current_line)
411
+ y_start = min(x[1] for x in current_line)
412
+ if re.sub(r"\d{13,}", "", " ".join(clean_line)) != "":
413
+ lines.append(
414
+ {
415
+ "line": " ".join(clean_line),
416
+ "bbox": [x_start, y_start],
417
+ "words": clean_word_data,
418
+ }
419
+ )
420
+ current_line = [(w["bbox"][0], y0, w["word"])]
421
+ current_word_data = [w]
422
+ current_y = y0
423
+
424
+ if current_line:
425
  current_line.sort()
426
  line_words = [x[2] for x in current_line]
427
  clean_line = self.remove_consecutive_items(line_words)
 
443
  "words": clean_word_data,
444
  }
445
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
446
 
447
+ page_lines_with_bbox.append(lines)
448
 
449
+ logger.info(
450
+ f"Successfully extracted lines from {len(page_lines_with_bbox)} scanned pages"
451
+ )
452
+ return page_lines_with_bbox
453
+ except Exception as e:
454
+ logger.error(f"Error extracting lines from scanned PDF: {e}")
455
+ raise
456
 
457
  return await asyncio.get_event_loop().run_in_executor(
458
  None, _extract_from_scanned