ocr-engine-3

Sleeping

App Files Files Community

kanha-upadhyay committed on Aug 23, 2025

Commit

2e2af5e

1 Parent(s): 15ad0e8

Enhance PDFProcessorService and TextExtractor with improved logging and error handling

Browse files

Files changed (2) hide show

src/services/_pdf_processor_service.py +106 -74
src/utils/_text_extractor.py +256 -219

src/services/_pdf_processor_service.py CHANGED Viewed

@@ -13,27 +13,22 @@ from src.utils import TextExtractor, model_manager
 class PDFProcessorService:
-    """Async PDF processor for handling both digital and scanned PDFs."""
     def __init__(self):
-        # Use the centralized model manager
         self._ensure_models_loaded()
     def _ensure_models_loaded(self):
-        """Ensure models are loaded via the model manager."""
         if not model_manager.models_loaded:
-            logger.info("🔄 Models not loaded, initializing model manager...")
-            # This will trigger model loading if not already done
             _ = model_manager.doctr_model
     @property
     def doctr_model(self):
-        """Get the loaded doctr model from model manager."""
         return model_manager.doctr_model
     @property
     def device(self):
-        """Get the device being used from model manager."""
         return model_manager.device
     async def __aenter__(self):
@@ -43,60 +38,76 @@ class PDFProcessorService:
         pass
     async def is_pdf_scanned(self, pdf_path: str) -> bool:
-        """Check if PDF is scanned (no extractable text)."""
         def _check_scanned():
-            doc = fitz.open(pdf_path)
-            for page in doc:
-                text = page.get_text()
-                if text.strip():
-                    return False
-            return True
         return await asyncio.get_event_loop().run_in_executor(None, _check_scanned)
     async def save_uploaded_file(self, uploaded_file: UploadFile) -> str:
-        file_name = uploaded_file.filename
-        suffix = Path(file_name).suffix
-        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
-            temp_path = tmp.name
-        async with aiofiles.open(temp_path, "wb") as f:
-            await f.write(await uploaded_file.read())
-        return temp_path
     async def extract_text_from_digital_pdf(self, pdf_path: str) -> List[List[str]]:
-        """Extract text from digital PDF using PyPDF2."""
         async def _extract_text():
-            doc = fitz.open(pdf_path)
-            extracted_data = []
-            for page in doc:
-                ptext = page.get_text()
-                if ptext:
-                    data = []
-                    for line in ptext.splitlines():
-                        cleaned_line = await self._split_on_repeated_pattern(
-                            line.strip()
-                        )
-                        if cleaned_line:
-                            data.append(cleaned_line[0])
-                        extracted_data.append(data)
-            return extracted_data
         return await asyncio.get_event_loop().run_in_executor(None, _extract_text)
     async def _split_on_repeated_pattern(
         self, line: str, min_space: int = 10
     ) -> List[str]:
-        """Split line on repeated pattern."""
         import re
         from difflib import SequenceMatcher
         original_line = line.strip()
-        # Find all spans of spaces >= min_space
         space_spans = [
             (m.start(), len(m.group()))
             for m in re.finditer(r" {%d,}" % min_space, original_line)
@@ -105,27 +116,22 @@ class PDFProcessorService:
         if not space_spans:
             return [original_line]
-        # Count how often each gap size occurs
         gaps = [span[1] for span in space_spans]
         gap_counts = {}
         for g in gaps:
             gap_counts[g] = gap_counts.get(g, 0) + 1
-        # Sort gaps by size × count (more dominant gaps first)
         sorted_gaps = sorted(
             gap_counts.items(), key=lambda x: x[1] * x[0], reverse=True
         )
-        # No significant gaps, return original
         if not sorted_gaps:
             return [original_line]
         dominant_gap = sorted_gaps[0][0]
-        # Use the dominant large gap to split
         chunks = re.split(rf" {{%d,}}" % dominant_gap, original_line)
-        # Check if it's actually repeated using fuzzy match
         base = chunks[0].strip()
         repeated = False
         for chunk in chunks[1:]:
@@ -137,38 +143,64 @@ class PDFProcessorService:
         return [base] if repeated else [original_line]
     async def process_pdf(self, file):
-        pdf_path = await self.save_uploaded_file(file)
-        is_scanned = await self.is_pdf_scanned(pdf_path)
-        text_extractor = TextExtractor(self.doctr_model)
-        if is_scanned:
-            logger.info(f"{pdf_path} is likely a scanned PDF.")
-            extracted_text_list = (
-                await text_extractor.extract_lines_with_bbox_from_scanned_pdf(pdf_path)
-            )
-        else:
-            logger.info(f"{pdf_path} is not a scanned PDF. Extracting text...")
-            extracted_text_list = await text_extractor.extract_lines_with_bbox(pdf_path)
-            pdf_text = ""
-            for block in extracted_text_list:
-                for line in block:
-                    pdf_text += " " + line["line"]
-            text_noisy = text_extractor.is_text_noisy(pdf_text)
-            if text_noisy:
-                logger.info("Text is noisy. Extracting text again...")
                 extracted_text_list = (
                     await text_extractor.extract_lines_with_bbox_from_scanned_pdf(
                         pdf_path
                     )
                 )
-        return extracted_text_list
     async def extract_entity(self, text: str):
-        text = re.sub(r"[^\w\s]", " ", text)
-        doc = model_manager.spacy_model(text)
-        entities = {ent.text: ent.label_ for ent in doc.ents}
-        for key, value in entities.items():
-            if value == "ORG":
-                return key
-        if entities:
-            return list(entities.keys())[0]
-        return text

 class PDFProcessorService:
     def __init__(self):
+        logger.info("Initializing PDFProcessorService")
         self._ensure_models_loaded()
     def _ensure_models_loaded(self):
         if not model_manager.models_loaded:
+            logger.info("Models not loaded, initializing model manager...")
             _ = model_manager.doctr_model
+            logger.debug("Model manager initialization completed")
     @property
     def doctr_model(self):
         return model_manager.doctr_model
     @property
     def device(self):
         return model_manager.device
     async def __aenter__(self):
         pass
     async def is_pdf_scanned(self, pdf_path: str) -> bool:
+        logger.debug(f"Checking if PDF is scanned: {pdf_path}")
         def _check_scanned():
+            try:
+                doc = fitz.open(pdf_path)
+                for page in doc:
+                    text = page.get_text()
+                    if text.strip():
+                        return False
+                return True
+            except Exception as e:
+                logger.error(f"Error checking if PDF is scanned: {e}")
+                raise
         return await asyncio.get_event_loop().run_in_executor(None, _check_scanned)
     async def save_uploaded_file(self, uploaded_file: UploadFile) -> str:
+        logger.info(f"Saving uploaded file: {uploaded_file.filename}")
+        try:
+            file_name = uploaded_file.filename
+            suffix = Path(file_name).suffix
+            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+                temp_path = tmp.name
+            async with aiofiles.open(temp_path, "wb") as f:
+                await f.write(await uploaded_file.read())
+            logger.debug(f"File saved to temporary path: {temp_path}")
+            return temp_path
+        except Exception as e:
+            logger.error(f"Error saving uploaded file: {e}")
+            raise
     async def extract_text_from_digital_pdf(self, pdf_path: str) -> List[List[str]]:
+        logger.debug(f"Extracting text from digital PDF: {pdf_path}")
         async def _extract_text():
+            try:
+                doc = fitz.open(pdf_path)
+                extracted_data = []
+                for page in doc:
+                    ptext = page.get_text()
+                    if ptext:
+                        data = []
+                        for line in ptext.splitlines():
+                            cleaned_line = await self._split_on_repeated_pattern(
+                                line.strip()
+                            )
+                            if cleaned_line:
+                                data.append(cleaned_line[0])
+                            extracted_data.append(data)
+                logger.info(
+                    f"Successfully extracted text from {len(extracted_data)} pages"
+                )
+                return extracted_data
+            except Exception as e:
+                logger.error(f"Error extracting text from digital PDF: {e}")
+                raise
         return await asyncio.get_event_loop().run_in_executor(None, _extract_text)
     async def _split_on_repeated_pattern(
         self, line: str, min_space: int = 10
     ) -> List[str]:
+        logger.debug(f"Processing line for repeated patterns: {line[:50]}...")
         import re
         from difflib import SequenceMatcher
         original_line = line.strip()
         space_spans = [
             (m.start(), len(m.group()))
             for m in re.finditer(r" {%d,}" % min_space, original_line)
         if not space_spans:
             return [original_line]
         gaps = [span[1] for span in space_spans]
         gap_counts = {}
         for g in gaps:
             gap_counts[g] = gap_counts.get(g, 0) + 1
         sorted_gaps = sorted(
             gap_counts.items(), key=lambda x: x[1] * x[0], reverse=True
         )
         if not sorted_gaps:
             return [original_line]
         dominant_gap = sorted_gaps[0][0]
         chunks = re.split(rf" {{%d,}}" % dominant_gap, original_line)
         base = chunks[0].strip()
         repeated = False
         for chunk in chunks[1:]:
         return [base] if repeated else [original_line]
     async def process_pdf(self, file):
+        logger.info(f"Processing PDF file: {file.filename}")
+        try:
+            pdf_path = await self.save_uploaded_file(file)
+            is_scanned = await self.is_pdf_scanned(pdf_path)
+            text_extractor = TextExtractor(self.doctr_model)
+            if is_scanned:
+                logger.info(f"PDF {pdf_path} is scanned, using OCR extraction")
                 extracted_text_list = (
                     await text_extractor.extract_lines_with_bbox_from_scanned_pdf(
                         pdf_path
                     )
                 )
+            else:
+                logger.info(f"PDF {pdf_path} is digital, extracting text directly")
+                extracted_text_list = await text_extractor.extract_lines_with_bbox(
+                    pdf_path
+                )
+                pdf_text = ""
+                for block in extracted_text_list:
+                    for line in block:
+                        pdf_text += " " + line["line"]
+                text_noisy = text_extractor.is_text_noisy(pdf_text)
+                if text_noisy:
+                    logger.warning("Text is noisy, falling back to OCR extraction")
+                    extracted_text_list = (
+                        await text_extractor.extract_lines_with_bbox_from_scanned_pdf(
+                            pdf_path
+                        )
+                    )
+            logger.info(
+                f"Successfully processed PDF with {len(extracted_text_list)} text blocks"
+            )
+            return extracted_text_list
+        except Exception as e:
+            logger.error(f"Error processing PDF: {e}")
+            raise
     async def extract_entity(self, text: str):
+        logger.debug(f"Extracting entities from text: {text[:100]}...")
+        try:
+            text = re.sub(r"[^\w\s]", " ", text)
+            doc = model_manager.spacy_model(text)
+            entities = {ent.text: ent.label_ for ent in doc.ents}
+            for key, value in entities.items():
+                if value == "ORG":
+                    logger.info(f"Found organization entity: {key}")
+                    return key
+            if entities:
+                entity = list(entities.keys())[0]
+                logger.info(f"Found entity: {entity}")
+                return entity
+            logger.debug("No entities found, returning original text")
+            return text
+        except Exception as e:
+            logger.error(f"Error extracting entities: {e}")
+            return text

src/utils/_text_extractor.py CHANGED Viewed

@@ -8,13 +8,13 @@ from typing import Dict, List
 import fitz
 import numpy as np
 from pdf2image import convert_from_path
 class TextExtractor:
-    """Async text extractor for extracting text with bounding boxes."""
     def __init__(self, doctr_model):
         self.doctr_model = doctr_model
         self.noise_pattern = [
             r"\b[A-Z]{6,}\b",
@@ -22,6 +22,7 @@ class TextExtractor:
             r"(\d)\1{5,}",
             r"\b(?=[A-Za-z]*\d)(?=\d*[A-Za-z])[A-Za-z\d]{8,}\b",
         ]
     async def __aenter__(self):
         return self
@@ -30,33 +31,36 @@ class TextExtractor:
         pass
     def normalize_bbox(self, bbox, width: float, height: float) -> List[float]:
-        """Normalize bounding box (x0, y0, x1, y1) to range [0, 1]."""
         x0, y0, x1, y1 = bbox
-        return [
             round(x0 / width, 6),
             round(y0 / height, 6),
             round(x1 / width, 6),
             round(y1 / height, 6),
         ]
     def remove_consecutive_items(self, line: List[str]) -> List[str]:
-        """Remove consecutive duplicate items from a list."""
         if not line:
             return line
         result = [line[0]]
         for item in line[1:]:
             if item != result[-1]:
                 result.append(item)
         return result
     def remove_consecutive_words(self, word_data: List[Dict]) -> List[Dict]:
-        """Remove consecutive duplicate words from word data."""
         if not word_data:
             return word_data
         result = [word_data[0]]
         for i in range(1, len(word_data)):
             if word_data[i]["word"] != result[-1]["word"]:
                 result.append(word_data[i])
         return result
     def shannon_entropy(self, text: str) -> float:
@@ -69,17 +73,9 @@ class TextExtractor:
         )
     def reconstruct_line_from_bboxes(self, words, space_unit=5):
-        """
-        Reconstructs a line with appropriate spacing based on word bounding boxes.
-        Parameters:
-        - words: list of dicts with 'word' and 'bbox' (bbox = [x0, y0, x1, y1])
-        - space_unit: how many pixels roughly correspond to one space
-        Returns:
-        - str: reconstructed line with spaces
-        """
-        # Sort words by x-coordinate (left to right)
         words = sorted(words, key=lambda w: w["bbox"][0])
         line = ""
@@ -89,88 +85,113 @@ class TextExtractor:
             start_x = word_info["bbox"][0]
             if prev_end_x is not None:
-                # Calculate gap between previous word and current word
                 gap = max(0, start_x - prev_end_x)
                 num_spaces = int(round(gap / space_unit))
                 line += " " * num_spaces
             line += word
-            prev_end_x = word_info["bbox"][2]  # x1 of current word
         return line
     def is_text_noisy(self, text: str) -> bool:
-        """Check if text is noisy (contains special characters)."""
         total_chars = len(text)
-        if total_chars < 50:  # skip empty or small pages
             return True
         tokens = re.findall(r"\b\w+\b", text)
         total_words = len(tokens)
-        # Symbol & digit density
         digit_count = len(re.findall(r"\d", text))
-        symbol_count = len(
-            re.findall(r"[^\w\s]", text)
-        )  # anything not a word char or whitespace
         symbol_density = symbol_count / total_chars
         digit_density = digit_count / total_chars
-        # Repeating char patterns like "22222222222" or "!!!!!!"
-        long_repeats = len(re.findall(r"(.)\1{5,}", text))  # any char repeated 6+ times
-        # Entropy: randomness of characters
         entropy = self.shannon_entropy(text)
-        # Heuristics tuned for your sample
-        if (
             entropy > 4.0
             and symbol_density > 0.15
             and digit_density > 0.15
             and long_repeats > 1
             and total_words > 30
-        ):
-            return True
-        return False
     async def extract_lines_with_bbox(self, pdf_path: str, y_threshold: float = 3.0):
-        """Extract lines with bounding boxes from digital PDF."""
         def _extract_lines():
-            doc = fitz.open(pdf_path)
-            page_lines_with_bbox = []
-            for page in doc:
-                words = page.get_text(
-                    "words"
-                )  # (x0, y0, x1, y1, word, block_no, line_no, word_no)
-                words.sort(key=lambda w: (round(w[1], 1), w[0]))  # sort by y then x
-                lines = []
-                current_line = []
-                current_y = None
-                current_word_data = []
-                for w in words:
-                    x0, y0, x1, y1, word = w[:5]
-                    if (
-                        word == "|"
-                        or not word
-                        or word == "."
-                        or word == "#"
-                        or re.sub(r"[^\w\s-]", "", word) == ""
-                        or re.sub(r"\d{19,}", "", word) == ""
-                    ):
-                        continue
-                    word = word.lower()
-                    word = word.replace("$", "")
-                    word_data = {"word": word.strip(), "bbox": (x0, y0, x1, y1)}
-                    if current_y is None or abs(y0 - current_y) < y_threshold:
-                        current_line.append((x0, y0, word))
-                        current_y = y0
-                        current_word_data.append(word_data)
-                    else:
                         current_line.sort()
                         line_words = [w[2] for w in current_line]
                         clean_line = self.remove_consecutive_items(line_words)
@@ -192,43 +213,29 @@ class TextExtractor:
                                         "words": clean_word_data,
                                     }
                                 )
-                        current_line = [(x0, y0, word)]
-                        current_y = y0
-                        current_word_data = [word_data]
-                # Process remaining line
-                if current_line:
-                    current_line.sort()
-                    line_words = [w[2] for w in current_line]
-                    clean_line = self.remove_consecutive_items(line_words)
-                    current_word_data = sorted(
-                        current_word_data, key=lambda w: w["bbox"][0]
-                    )
-                    clean_word_data = self.remove_consecutive_words(current_word_data)
-                    if clean_line:
-                        x_start = min([w[0] for w in current_line])
-                        y_start = min([w[1] for w in current_line])
-                        if re.sub(r"\d{13,}", "", " ".join(clean_line)) != "":
-                            lines.append(
-                                {
-                                    "line": " ".join(clean_line),
-                                    "bbox": [x_start, y_start],
-                                    "words": clean_word_data,
-                                }
-                            )
-                page_lines_with_bbox.append(lines)
-            return page_lines_with_bbox
         return await asyncio.get_event_loop().run_in_executor(None, _extract_lines)
     def create_page_chunks(self, num_pages: int, cpu_core: int):
         final_ranges = []
         page_per_cpu = 2
         for i in range(1, num_pages + 1, page_per_cpu + 1):
             final_ranges.append([i, min(i + page_per_cpu, num_pages)])
         return final_ranges
     def process_page_parallel_async(
@@ -246,6 +253,7 @@ class TextExtractor:
     async def process_pages_concurrently(self, pdf_path: str, page_range: List[int]):
         start_page = page_range[0]
         end_page = page_range[1]
         tasks = []
         for page in range(start_page, end_page + 1):
@@ -255,117 +263,165 @@ class TextExtractor:
         page_results.sort(key=lambda x: x[0])
         chunk_outputs = [output for page_num, output in page_results]
         return page_range, chunk_outputs
     async def process_page_parallel(self, pdf_path: str, i: int):
-        print(f"Processing page {i}")
-        pages = convert_from_path(pdf_path, dpi=300, first_page=i, last_page=i)
-        page_imgs = [page.convert("RGB") for page in pages]
-        output = self.doctr_model([np.array(img) for img in page_imgs])
-        return i, output
     async def extract_lines_with_bbox_from_scanned_pdf(
         self, pdf_path: str, y_threshold: float = 5.0, first_page: bool = False
     ):
-        """Extract lines with bounding boxes from scanned PDF using OCR."""
         def _extract_from_scanned():
-            result = None
-            doc = None
-            if first_page:
-                number_of_pages = fitz.open(pdf_path).page_count
-                if number_of_pages < 3:
-                    pages = convert_from_path(
-                        pdf_path, dpi=300, first_page=1, last_page=number_of_pages
                     )
-                else:
-                    pages = convert_from_path(
-                        pdf_path, dpi=300, first_page=1, last_page=3
-                    )
-                first_page_img = [page.convert("RGB") for page in pages]
-                result = self.doctr_model([np.array(img) for img in first_page_img])
-                doc = [np.array(img) for img in first_page_img]
-            else:
-                pdf = fitz.open(pdf_path)
-                num_pages = pdf.page_count
-                page_witdh_f = pdf[0].rect.width
-                page_height_f = pdf[0].rect.height
-                page_chunks = self.create_page_chunks(
-                    num_pages, multiprocessing.cpu_count()
-                )
-                with ThreadPoolExecutor(
-                    max_workers=multiprocessing.cpu_count()
-                ) as executor:
-                    futures = []
-                    for chunk in page_chunks:
-                        futures.append(
-                            executor.submit(
-                                self.process_page_parallel_async, pdf_path, chunk, self
-                            )
                         )
-                    results = [f.result() for f in futures]
-                results.sort(key=lambda x: x[0][0])
-                result = []
-                for r in results:
-                    result.extend(r[1])
-                results = result
-            page_lines_with_bbox = []
-            for result in results:
-                for page in result.pages:
-                    if first_page:
-                        img_width, img_height = doc[0].shape[1], doc[0].shape[0]
                     else:
-                        img_width, img_height = page_witdh_f, page_height_f
-                    words = []
-                    for block in page.blocks:
-                        for line in block.lines:
-                            for word in line.words:
-                                x0, y0 = word.geometry[0]
-                                x1, y1 = word.geometry[1]
-                                abs_x0 = x0 * img_width
-                                abs_y0 = y0 * img_height
-                                abs_x1 = x1 * img_width
-                                abs_y1 = y1 * img_height
-                                text = word.value.strip().lower()
-                                text = re.sub(r"[#*]", " ", text)
-                                text = re.sub(f"[$]", "", text)
-                                text = text.strip()
-                                if (
-                                    text == "|"
-                                    or not text
-                                    or text == "."
-                                    or text == "#"
-                                    or re.sub(r"[^\w\s-]", "", text) == ""
-                                    or re.sub(r"\d{19,}", "", text) == ""
-                                ):
-                                    continue
-                                words.append(
-                                    {
-                                        "word": text,
-                                        "bbox": [abs_x0, abs_y0, abs_x1, abs_y1],
-                                    }
                                 )
-                # Sort words by y then x
-                words.sort(key=lambda w: (round(w["bbox"][1], 3), w["bbox"][0]))
-                lines = []
-                current_line = []
-                current_word_data = []
-                current_y = None
-                for w in words:
-                    y0 = w["bbox"][1]
-                    if current_y is None or abs(y0 - current_y) < y_threshold:
-                        current_line.append((w["bbox"][0], y0, w["word"]))
-                        current_word_data.append(w)
-                        current_y = y0
-                    else:
                         current_line.sort()
                         line_words = [x[2] for x in current_line]
                         clean_line = self.remove_consecutive_items(line_words)
@@ -387,35 +443,16 @@ class TextExtractor:
                                         "words": clean_word_data,
                                     }
                                 )
-                        current_line = [(w["bbox"][0], y0, w["word"])]
-                        current_word_data = [w]
-                        current_y = y0
-                # Final remaining line
-                if current_line:
-                    current_line.sort()
-                    line_words = [x[2] for x in current_line]
-                    clean_line = self.remove_consecutive_items(line_words)
-                    current_word_data = sorted(
-                        current_word_data, key=lambda w: w["bbox"][0]
-                    )
-                    clean_word_data = self.remove_consecutive_words(current_word_data)
-                    if clean_line:
-                        x_start = min(x[0] for x in current_line)
-                        y_start = min(x[1] for x in current_line)
-                        if re.sub(r"\d{13,}", "", " ".join(clean_line)) != "":
-                            lines.append(
-                                {
-                                    "line": " ".join(clean_line),
-                                    "bbox": [x_start, y_start],
-                                    "words": clean_word_data,
-                                }
-                            )
-                page_lines_with_bbox.append(lines)
-            return page_lines_with_bbox
         return await asyncio.get_event_loop().run_in_executor(
             None, _extract_from_scanned

 import fitz
 import numpy as np
+from loguru import logger
 from pdf2image import convert_from_path
 class TextExtractor:
     def __init__(self, doctr_model):
         """Keep a reference to the OCR model and set up the noise regexes.

         Args:
             doctr_model: Callable OCR model; other methods of this class
                 invoke it with a list of page images as numpy arrays.
         """
         logger.info("Initializing TextExtractor")
         self.doctr_model = doctr_model
         patterns = [
             # A standalone run of six or more uppercase ASCII letters.
             r"\b[A-Z]{6,}\b",
             # The same digit six or more times in a row.
             r"(\d)\1{5,}",
             # An alphanumeric token of 8+ characters mixing letters and digits.
             r"\b(?=[A-Za-z]*\d)(?=\d*[A-Za-z])[A-Za-z\d]{8,}\b",
         ]
         self.noise_pattern = patterns
         logger.debug(f"Initialized with {len(self.noise_pattern)} noise patterns")
     async def __aenter__(self):
         return self
         pass
     def normalize_bbox(self, bbox, width: float, height: float) -> List[float]:
         """Express a bounding box relative to the given width and height.

         Args:
             bbox: Four values, unpacked as ``(x0, y0, x1, y1)``.
             width: Divisor applied to both x values.
             height: Divisor applied to both y values.

         Returns:
             ``[x0 / width, y0 / height, x1 / width, y1 / height]`` with every
             entry rounded to six decimal places.
         """
         left, top, right, bottom = bbox
         # Pair each coordinate with the extent it is measured against.
         coord_and_extent = (
             (left, width),
             (top, height),
             (right, width),
             (bottom, height),
         )
         normalized = [round(value / extent, 6) for value, extent in coord_and_extent]
         logger.debug(f"Normalized bbox from {bbox} to {normalized}")
         return normalized
     def remove_consecutive_items(self, line: List[str]) -> List[str]:
         """Collapse each run of equal neighbouring items into one item.

         Args:
             line: Items in reading order. A falsy value (e.g. an empty list)
                 is handed back untouched.

         Returns:
             A new list in which no two adjacent entries are equal.
         """
         if not line:
             return line
         last_kept = line[0]
         result = [last_kept]
         for candidate in line[1:]:
             if candidate != last_kept:
                 result.append(candidate)
                 last_kept = candidate
         logger.debug(f"Removed consecutive items: {len(line)} -> {len(result)} items")
         return result
     def remove_consecutive_words(self, word_data: List[Dict]) -> List[Dict]:
         """Drop entries whose ``"word"`` repeats the previously kept entry's.

         Args:
             word_data: Dicts that each carry a ``"word"`` key, in order. A
                 falsy value (e.g. an empty list) is handed back untouched.

         Returns:
             A new list holding the first entry of every run of equal words;
             the entry dicts themselves are not copied.
         """
         if not word_data:
             return word_data
         result = [word_data[0]]
         for entry in word_data[1:]:
             previous_word = result[-1]["word"]
             if entry["word"] != previous_word:
                 result.append(entry)
         logger.debug(
             f"Removed consecutive words: {len(word_data)} -> {len(result)} words"
         )
         return result
     def shannon_entropy(self, text: str) -> float:
         )
     def reconstruct_line_from_bboxes(self, words, space_unit=5):
+        logger.debug(
+            f"Reconstructing line from {len(words)} words with space_unit={space_unit}"
+        )
         words = sorted(words, key=lambda w: w["bbox"][0])
         line = ""
             start_x = word_info["bbox"][0]
             if prev_end_x is not None:
                 gap = max(0, start_x - prev_end_x)
                 num_spaces = int(round(gap / space_unit))
                 line += " " * num_spaces
             line += word
+            prev_end_x = word_info["bbox"][2]
+        logger.debug(f"Reconstructed line: '{line[:100]}...'")
         return line
     def is_text_noisy(self, text: str) -> bool:
         """Decide whether ``text`` looks like noise rather than real content.

         Text shorter than 50 characters is reported as noisy outright.
         Otherwise the text is noisy only when every one of these holds:
         entropy above 4.0, symbol density above 0.15, digit density above
         0.15, more than one long character repeat, and more than 30 word
         tokens.

         Args:
             text: The text to inspect.

         Returns:
             True when the text is judged noisy, False otherwise.
         """
         logger.debug(f"Checking if text is noisy: {len(text)} characters")
         char_total = len(text)
         if char_total < 50:
             logger.debug("Text too short, marking as noisy")
             return True

         # Word tokens are maximal runs of word characters.
         total_words = len(re.findall(r"\b\w+\b", text))
         digit_hits = re.findall(r"\d", text)
         # Symbols are anything that is neither a word character nor whitespace.
         symbol_hits = re.findall(r"[^\w\s]", text)
         symbol_density = len(symbol_hits) / char_total
         digit_density = len(digit_hits) / char_total
         # One hit per run of a character repeated six or more times.
         long_repeats = len(re.findall(r"(.)\1{5,}", text))
         entropy = self.shannon_entropy(text)

         is_noisy = (
             entropy > 4.0
             and symbol_density > 0.15
             and digit_density > 0.15
             and long_repeats > 1
             and total_words > 30
         )
         logger.debug(
             f"Noise analysis - entropy: {entropy:.2f}, symbol_density: {symbol_density:.2f}, "
             f"digit_density: {digit_density:.2f}, long_repeats: {long_repeats}, "
             f"total_words: {total_words}, is_noisy: {is_noisy}"
         )
         return is_noisy
     async def extract_lines_with_bbox(self, pdf_path: str, y_threshold: float = 3.0):
+        logger.info(f"Extracting lines with bbox from digital PDF: {pdf_path}")
         def _extract_lines():
+            try:
+                doc = fitz.open(pdf_path)
+                page_lines_with_bbox = []
+                for page_num, page in enumerate(doc):
+                    logger.debug(f"Processing page {page_num + 1}")
+                    words = page.get_text("words")
+                    words.sort(key=lambda w: (round(w[1], 1), w[0]))
+                    lines = []
+                    current_line = []
+                    current_y = None
+                    current_word_data = []
+                    for w in words:
+                        x0, y0, x1, y1, word = w[:5]
+                        if (
+                            word == "|"
+                            or not word
+                            or word == "."
+                            or word == "#"
+                            or re.sub(r"[^\w\s-]", "", word) == ""
+                            or re.sub(r"\d{19,}", "", word) == ""
+                        ):
+                            continue
+                        word = word.lower()
+                        word = word.replace("$", "")
+                        word_data = {"word": word.strip(), "bbox": (x0, y0, x1, y1)}
+                        if current_y is None or abs(y0 - current_y) < y_threshold:
+                            current_line.append((x0, y0, word))
+                            current_y = y0
+                            current_word_data.append(word_data)
+                        else:
+                            current_line.sort()
+                            line_words = [w[2] for w in current_line]
+                            clean_line = self.remove_consecutive_items(line_words)
+                            current_word_data = sorted(
+                                current_word_data, key=lambda w: w["bbox"][0]
+                            )
+                            clean_word_data = self.remove_consecutive_words(
+                                current_word_data
+                            )
+                            if clean_line:
+                                x_start = min([w[0] for w in current_line])
+                                y_start = min([w[1] for w in current_line])
+                                if re.sub(r"\d{13,}", "", " ".join(clean_line)) != "":
+                                    lines.append(
+                                        {
+                                            "line": " ".join(clean_line),
+                                            "bbox": [x_start, y_start],
+                                            "words": clean_word_data,
+                                        }
+                                    )
+                            current_line = [(x0, y0, word)]
+                            current_y = y0
+                            current_word_data = [word_data]
+                    if current_line:
                         current_line.sort()
                         line_words = [w[2] for w in current_line]
                         clean_line = self.remove_consecutive_items(line_words)
                                         "words": clean_word_data,
                                     }
                                 )
+                    logger.debug(f"Page {page_num + 1}: extracted {len(lines)} lines")
+                    page_lines_with_bbox.append(lines)
+                logger.info(
+                    f"Successfully extracted lines from {len(page_lines_with_bbox)} pages"
+                )
+                return page_lines_with_bbox
+            except Exception as e:
+                logger.error(f"Error extracting lines from digital PDF: {e}")
+                raise
         return await asyncio.get_event_loop().run_in_executor(None, _extract_lines)
     def create_page_chunks(self, num_pages: int, cpu_core: int):
         """Partition page numbers 1..num_pages into ``[start, end]`` pairs.

         Args:
             num_pages: Total number of pages to cover.
             cpu_core: Only reported in the debug log; it does not influence
                 how the pages are split.

         Returns:
             A list of ``[start, end]`` pairs. Starts advance by three and each
             end is ``min(start + 2, num_pages)``.
         """
         logger.debug(
             f"Creating page chunks for {num_pages} pages using {cpu_core} CPU cores"
         )
         page_per_cpu = 2
         stride = page_per_cpu + 1
         final_ranges = [
             [first, min(first + page_per_cpu, num_pages)]
             for first in range(1, num_pages + 1, stride)
         ]
         logger.debug(f"Created {len(final_ranges)} page chunks: {final_ranges}")
         return final_ranges
     def process_page_parallel_async(
     async def process_pages_concurrently(self, pdf_path: str, page_range: List[int]):
         start_page = page_range[0]
         end_page = page_range[1]
+        logger.debug(f"Processing pages {start_page}-{end_page} concurrently")
         tasks = []
         for page in range(start_page, end_page + 1):
         page_results.sort(key=lambda x: x[0])
         chunk_outputs = [output for page_num, output in page_results]
+        logger.debug(f"Completed processing pages {start_page}-{end_page}")
         return page_range, chunk_outputs
     async def process_page_parallel(self, pdf_path: str, i: int):
         """Render a single PDF page at 300 dpi and run the OCR model on it.

         Args:
             pdf_path: Path passed to ``convert_from_path``.
             i: Page number, used as both ``first_page`` and ``last_page``.

         Returns:
             The tuple ``(i, output)``, where ``output`` is whatever the model
             returns for the rendered page image(s).

         Raises:
             Exception: Any failure is logged and then re-raised unchanged.
         """
         logger.debug(f"Processing page {i}")
         try:
             rendered = convert_from_path(pdf_path, dpi=300, first_page=i, last_page=i)
             rgb_images = [sheet.convert("RGB") for sheet in rendered]
             image_arrays = [np.array(img) for img in rgb_images]
             output = self.doctr_model(image_arrays)
             logger.debug(f"Successfully processed page {i}")
         except Exception as e:
             logger.error(f"Error processing page {i}: {e}")
             raise
         return i, output
     async def extract_lines_with_bbox_from_scanned_pdf(
         self, pdf_path: str, y_threshold: float = 5.0, first_page: bool = False
     ):
+        logger.info(
+            f"Extracting lines from scanned PDF: {pdf_path} (first_page: {first_page})"
+        )
         def _extract_from_scanned():
+            try:
+                result = None
+                doc = None
+                if first_page:
+                    number_of_pages = fitz.open(pdf_path).page_count
+                    logger.debug(
+                        f"Processing first page(s) only, total pages: {number_of_pages}"
                     )
+                    if number_of_pages < 3:
+                        pages = convert_from_path(
+                            pdf_path, dpi=300, first_page=1, last_page=number_of_pages
                         )
                     else:
+                        pages = convert_from_path(
+                            pdf_path, dpi=300, first_page=1, last_page=3
+                        )
+                    first_page_img = [page.convert("RGB") for page in pages]
+                    result = self.doctr_model([np.array(img) for img in first_page_img])
+                    doc = [np.array(img) for img in first_page_img]
+                else:
+                    logger.debug("Processing all pages using parallel processing")
+                    pdf = fitz.open(pdf_path)
+                    num_pages = pdf.page_count
+                    page_witdh_f = pdf[0].rect.width
+                    page_height_f = pdf[0].rect.height
+                    page_chunks = self.create_page_chunks(
+                        num_pages, multiprocessing.cpu_count()
+                    )
+                    logger.info(
+                        f"Processing {num_pages} pages using {multiprocessing.cpu_count()} CPU cores"
+                    )
+                    with ThreadPoolExecutor(
+                        max_workers=multiprocessing.cpu_count()
+                    ) as executor:
+                        futures = []
+                        for chunk in page_chunks:
+                            futures.append(
+                                executor.submit(
+                                    self.process_page_parallel_async,
+                                    pdf_path,
+                                    chunk,
+                                    self,
                                 )
+                            )
+                        results = [f.result() for f in futures]
+                    results.sort(key=lambda x: x[0][0])
+                    result = []
+                    for r in results:
+                        result.extend(r[1])
+                    results = result
+                page_lines_with_bbox = []
+                for result_idx, result in enumerate(results):
+                    logger.debug(
+                        f"Processing OCR result {result_idx + 1}/{len(results)}"
+                    )
+                    for page in result.pages:
+                        if first_page:
+                            img_width, img_height = doc[0].shape[1], doc[0].shape[0]
+                        else:
+                            img_width, img_height = page_witdh_f, page_height_f
+                        words = []
+                        for block in page.blocks:
+                            for line in block.lines:
+                                for word in line.words:
+                                    x0, y0 = word.geometry[0]
+                                    x1, y1 = word.geometry[1]
+                                    abs_x0 = x0 * img_width
+                                    abs_y0 = y0 * img_height
+                                    abs_x1 = x1 * img_width
+                                    abs_y1 = y1 * img_height
+                                    text = word.value.strip().lower()
+                                    text = re.sub(r"[#*]", " ", text)
+                                    text = re.sub(f"[$]", "", text)
+                                    text = text.strip()
+                                    if (
+                                        text == "|"
+                                        or not text
+                                        or text == "."
+                                        or text == "#"
+                                        or re.sub(r"[^\w\s-]", "", text) == ""
+                                        or re.sub(r"\d{19,}", "", text) == ""
+                                    ):
+                                        continue
+                                    words.append(
+                                        {
+                                            "word": text,
+                                            "bbox": [abs_x0, abs_y0, abs_x1, abs_y1],
+                                        }
+                                    )
+                    words.sort(key=lambda w: (round(w["bbox"][1], 3), w["bbox"][0]))
+                    lines = []
+                    current_line = []
+                    current_word_data = []
+                    current_y = None
+                    for w in words:
+                        y0 = w["bbox"][1]
+                        if current_y is None or abs(y0 - current_y) < y_threshold:
+                            current_line.append((w["bbox"][0], y0, w["word"]))
+                            current_word_data.append(w)
+                            current_y = y0
+                        else:
+                            current_line.sort()
+                            line_words = [x[2] for x in current_line]
+                            clean_line = self.remove_consecutive_items(line_words)
+                            current_word_data = sorted(
+                                current_word_data, key=lambda w: w["bbox"][0]
+                            )
+                            clean_word_data = self.remove_consecutive_words(
+                                current_word_data
+                            )
+                            if clean_line:
+                                x_start = min(x[0] for x in current_line)
+                                y_start = min(x[1] for x in current_line)
+                                if re.sub(r"\d{13,}", "", " ".join(clean_line)) != "":
+                                    lines.append(
+                                        {
+                                            "line": " ".join(clean_line),
+                                            "bbox": [x_start, y_start],
+                                            "words": clean_word_data,
+                                        }
+                                    )
+                            current_line = [(w["bbox"][0], y0, w["word"])]
+                            current_word_data = [w]
+                            current_y = y0
+                    if current_line:
                         current_line.sort()
                         line_words = [x[2] for x in current_line]
                         clean_line = self.remove_consecutive_items(line_words)
                                         "words": clean_word_data,
                                     }
                                 )
+                    page_lines_with_bbox.append(lines)
+                logger.info(
+                    f"Successfully extracted lines from {len(page_lines_with_bbox)} scanned pages"
+                )
+                return page_lines_with_bbox
+            except Exception as e:
+                logger.error(f"Error extracting lines from scanned PDF: {e}")
+                raise
         return await asyncio.get_event_loop().run_in_executor(
             None, _extract_from_scanned