hkai20000 commited on
Commit
175eb27
·
verified ·
1 Parent(s): ee0bd33

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +132 -117
main.py CHANGED
@@ -116,11 +116,6 @@ NER_MODELS = {
116
  # --- GLOBAL MODEL CACHES ---
117
  ner_model_cache: Dict[str, Any] = {}
118
  ocr_model_cache: Dict[str, Any] = {}
119
- mlm_corrector_cache: Dict[str, Any] = {}
120
-
121
- # --- OCR CORRECTION MODEL ---
122
- OCR_CORRECTION_MODEL = "hkai20000/bio-clinicalbert-ocr-correction"
123
-
124
  # --- DOCLING CONVERTER CACHE ---
125
  docling_converter_cache: Dict[str, Any] = {}
126
 
@@ -312,122 +307,142 @@ def get_ner_pipeline(model_id: str):
312
  return None
313
 
314
 
315
- # --- OCR CORRECTION MODEL LOADING ---
316
- def get_mlm_corrector():
317
- """Lazy-load the fill-mask pipeline for OCR error correction."""
318
- if OCR_CORRECTION_MODEL in mlm_corrector_cache:
319
- return mlm_corrector_cache[OCR_CORRECTION_MODEL]
 
320
 
321
- try:
322
- print(f"Loading OCR correction model: {OCR_CORRECTION_MODEL}...")
323
- corrector = hf_pipeline("fill-mask", model=OCR_CORRECTION_MODEL)
324
- mlm_corrector_cache[OCR_CORRECTION_MODEL] = corrector
325
- print(f"OCR correction model loaded successfully!")
326
- return corrector
327
- except Exception as e:
328
- print(f"ERROR: Failed to load OCR correction model: {e}")
329
- return None
 
330
 
331
 
332
- def correct_ocr_text(words_with_boxes: list, cleaned_text: str, confidence_threshold: float = 0.75) -> dict:
333
- """
334
- Correct OCR errors using fill-mask MLM model.
335
 
336
- For each word with docTR confidence below the threshold:
337
- 1. Mask the word in the full text context
338
- 2. Run fill-mask to get predictions
339
- 3. Accept correction if MLM confidence > 0.5 and edit distance <= 3
340
 
341
- Returns dict with 'corrected_text' and 'corrections' list.
342
- """
343
- corrector = get_mlm_corrector()
344
- if corrector is None:
345
- return {'corrected_text': cleaned_text, 'corrections': []}
346
 
347
- corrections = []
348
- corrected_text = cleaned_text
349
-
350
- low_confidence_words = [
351
- w for w in words_with_boxes
352
- if w.get('confidence', 1.0) < confidence_threshold
353
- and len(w['word']) >= 4
354
- and w['word'].isalpha() # Only correct purely alphabetic words — skip numbers, units, punctuation
355
- ]
356
-
357
- if not low_confidence_words:
358
- return {'corrected_text': cleaned_text, 'corrections': []}
359
-
360
- for word_info in low_confidence_words:
361
- original_word = word_info['word']
362
- word_confidence = word_info.get('confidence', 0.0)
363
-
364
- pattern = re.escape(original_word)
365
- match = re.search(r'\b' + pattern + r'\b', corrected_text)
366
- if not match:
367
- match = re.search(pattern, corrected_text)
368
- if not match:
369
- continue
370
 
371
- start, end = match.start(), match.end()
372
- masked_text = corrected_text[:start] + "[MASK]" + corrected_text[end:]
 
 
 
 
373
 
374
- mask_pos = masked_text.find("[MASK]")
375
- context_chars = 200
376
- ctx_start = max(0, mask_pos - context_chars)
377
- ctx_end = min(len(masked_text), mask_pos + context_chars)
378
- context = masked_text[ctx_start:ctx_end]
 
 
 
 
 
 
 
 
 
 
 
 
379
 
380
- if "[MASK]" not in context:
381
- continue
382
 
383
- try:
384
- predictions = corrector(context, top_k=5)
385
- except Exception as e:
386
- print(f"MLM correction error for '{original_word}': {e}")
 
387
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
388
 
389
- if not predictions:
 
 
 
 
390
  continue
391
 
392
- top = predictions[0]
393
- predicted_word = top['token_str'].strip()
394
- mlm_score = top['score']
 
395
 
396
- edit_dist = _edit_distance(original_word.lower(), predicted_word.lower())
 
 
397
 
398
- if mlm_score > 0.5 and edit_dist <= 3 and predicted_word.lower() != original_word.lower():
399
- corrected_text = corrected_text[:start] + predicted_word + corrected_text[end:]
400
- corrections.append({
401
- 'original': original_word,
402
- 'corrected': predicted_word,
403
- 'confidence': round(mlm_score, 4),
404
- 'ocr_confidence': round(word_confidence, 4),
405
- 'edit_distance': edit_dist,
406
- })
407
 
408
- return {
409
- 'corrected_text': corrected_text,
410
- 'corrections': corrections,
411
- }
 
412
 
 
 
 
 
 
 
 
 
 
 
 
 
413
 
414
- def _edit_distance(s1: str, s2: str) -> int:
415
- """Compute Levenshtein edit distance between two strings."""
416
- if len(s1) < len(s2):
417
- return _edit_distance(s2, s1)
418
- if len(s2) == 0:
419
- return len(s1)
420
 
421
- prev_row = range(len(s2) + 1)
422
- for i, c1 in enumerate(s1):
423
- curr_row = [i + 1]
424
- for j, c2 in enumerate(s2):
425
- insertions = prev_row[j + 1] + 1
426
- deletions = curr_row[j] + 1
427
- substitutions = prev_row[j] + (c1 != c2)
428
- curr_row.append(min(insertions, deletions, substitutions))
429
- prev_row = curr_row
430
- return prev_row[-1]
431
 
432
  # --- IMAGE PREPROCESSING ---
433
  def deskew_image(image: np.ndarray) -> np.ndarray:
@@ -2136,9 +2151,9 @@ async def get_available_models():
2136
  for model_id, model_data in NER_MODELS.items()
2137
  },
2138
  "ocr_correction_model": {
2139
- "id": OCR_CORRECTION_MODEL,
2140
- "name": "Bio-ClinicalBERT OCR Correction",
2141
- "description": "Fine-tuned Bio_ClinicalBERT for medical OCR error correction using fill-mask MLM",
2142
  }
2143
  }
2144
 
@@ -2264,22 +2279,12 @@ async def process_image(
2264
  primary_table_data = {'is_table': False}
2265
  print("No table detected by any method, using regular OCR text")
2266
 
2267
- # OCR Text Correction (if enabled)
2268
  correction_enabled = enable_correction.lower() == "true"
2269
  correction_result = {'corrected_text': cleaned_text, 'corrections': []}
2270
 
2271
- if correction_enabled:
2272
- print(f"Running OCR text correction with threshold={correction_threshold}...")
2273
- correction_result = correct_ocr_text(words_with_boxes, cleaned_text, confidence_threshold=float(correction_threshold))
2274
- if correction_result['corrections']:
2275
- print(f"Applied {len(correction_result['corrections'])} corrections")
2276
- for c in correction_result['corrections']:
2277
- print(f" '{c['original']}' -> '{c['corrected']}' (MLM={c['confidence']:.2f})")
2278
- else:
2279
- print("No corrections needed")
2280
-
2281
- # Use corrected text for NER if correction was applied
2282
- ner_input_text = correction_result['corrected_text'] if correction_enabled else cleaned_text
2283
 
2284
  # Perform NER on text
2285
  print("Running NER...")
@@ -2298,6 +2303,16 @@ async def process_image(
2298
  # Map entities to bounding boxes
2299
  entities_with_boxes = map_entities_to_boxes(structured_entities, words_with_boxes, ner_input_text)
2300
 
 
 
 
 
 
 
 
 
 
 
2301
  # Check for drug interactions
2302
  detected_drugs = []
2303
  for entity in structured_entities:
 
116
  # --- GLOBAL MODEL CACHES ---
117
  ner_model_cache: Dict[str, Any] = {}
118
  ocr_model_cache: Dict[str, Any] = {}
 
 
 
 
 
119
  # --- DOCLING CONVERTER CACHE ---
120
  docling_converter_cache: Dict[str, Any] = {}
121
 
 
307
  return None
308
 
309
 
310
+ def _edit_distance(s1: str, s2: str) -> int:
311
+ """Compute Levenshtein edit distance between two strings."""
312
+ if len(s1) < len(s2):
313
+ return _edit_distance(s2, s1)
314
+ if len(s2) == 0:
315
+ return len(s1)
316
 
317
+ prev_row = range(len(s2) + 1)
318
+ for i, c1 in enumerate(s1):
319
+ curr_row = [i + 1]
320
+ for j, c2 in enumerate(s2):
321
+ insertions = prev_row[j + 1] + 1
322
+ deletions = curr_row[j] + 1
323
+ substitutions = prev_row[j] + (c1 != c2)
324
+ curr_row.append(min(insertions, deletions, substitutions))
325
+ prev_row = curr_row
326
+ return prev_row[-1]
327
 
328
 
329
# --- NER-INFORMED CORRECTION ---

# Lazily-filled cache mapping NER entity type -> set of known terms,
# populated by _build_entity_dicts() from data loaded at startup.
_entity_dicts: dict[str, set] = {}


def _build_entity_dicts():
    """Build per-entity-type dictionaries from already-loaded DRUG_INTERACTIONS and MEDLINEPLUS_MAP."""
    global _entity_dicts

    # Medication terms come from the drug-interaction table keys; keys may be
    # comma-separated lists of names, so split and keep terms of length >= 4.
    medication_terms: set[str] = set()
    for name in DRUG_INTERACTIONS.keys():
        for piece in str(name).split(','):
            piece = piece.strip().lower()
            if len(piece) >= 4:
                medication_terms.add(piece)

    # Lab-test terms come from MedlinePlus names plus their aliases.
    lab_terms: set[str] = set()
    for test_name, info in MEDLINEPLUS_MAP.items():
        if len(test_name) >= 4:
            lab_terms.add(test_name.lower())
        for alias in info.get('aliases', []):
            if len(alias) >= 4:
                lab_terms.add(alias.lower())

    # Several NER label schemes map onto the same two dictionaries.
    _entity_dicts = {
        'MEDICATION': medication_terms,
        'LAB_VALUE': lab_terms,
        'DIAGNOSTIC_PROCEDURE': lab_terms,
        'TREATMENT': medication_terms,
        'CHEM': medication_terms,
        'CHEMICAL': medication_terms,
    }
    print(f"Entity dicts built: {len(medication_terms)} medication terms, {len(lab_terms)} lab terms")
362
 
 
 
363
 
364
def _find_closest(word: str, dictionary: set) -> tuple:
    """Return (closest_term, edit_distance) for *word* against *dictionary*.

    Scans every candidate term, pruning those whose length differs from the
    word's by more than 3 (the length difference is a lower bound on edit
    distance, so they cannot win within the caller's acceptance threshold).
    Returns (None, 999) when nothing survives the pruning.
    """
    best_match, best_dist = None, 999
    word_lower = word.lower()
    for term in dictionary:
        # |len(a) - len(b)| <= edit_distance(a, b): cheap reject before the O(n*m) DP.
        if abs(len(term) - len(word_lower)) > 3:
            continue
        dist = _edit_distance(word_lower, term)
        if dist < best_dist:
            best_dist = dist
            best_match = term
            if best_dist == 0:
                # Exact match found; no candidate can beat distance 0
                # (sets hold at most one term equal to the word), so stop early.
                break
    return best_match, best_dist
375
+
376
+
377
+ def _match_case(original: str, replacement: str) -> str:
378
+ if original.isupper():
379
+ return replacement.upper()
380
+ if original[0].isupper():
381
+ return replacement.capitalize()
382
+ return replacement.lower()
383
+
384
+
385
def correct_with_ner_entities(
    words_with_boxes: list,
    ner_entities: list,
    text: str,
    confidence_threshold: float = 0.75,
) -> dict:
    """Second-pass OCR correction using NER entity labels as context.

    For each NER entity whose type has a term dictionary (see _entity_dicts),
    every alphabetic token of >= 4 letters whose OCR confidence is below
    *confidence_threshold* is matched against the dictionary; a close match
    (edit distance <= 2) replaces the token's first occurrence in *text*,
    with casing matched to the original token.

    Returns {'corrected_text': str, 'corrections': list-of-dicts}, where each
    correction records original/corrected words, a similarity score, the OCR
    confidence, the edit distance, the source ('ner') and the entity type.
    """
    if not _entity_dicts:
        _build_entity_dicts()

    # Lowest observed OCR confidence per word.  BUG FIX: keys are indexed
    # both raw and with non-letters stripped, because the entity tokens
    # looked up below are letter-only (re.sub strips punctuation) — a raw
    # OCR word like "Metformin," would otherwise never match "metformin",
    # default to confidence 1.0, and be silently skipped for correction.
    word_conf: dict[str, float] = {}
    for w in words_with_boxes:
        conf = w.get('confidence', 1.0)
        raw_key = w['word'].lower()
        word_conf[raw_key] = min(word_conf.get(raw_key, 1.0), conf)
        stripped_key = re.sub(r'[^a-z]', '', raw_key)
        if stripped_key and stripped_key != raw_key:
            word_conf[stripped_key] = min(word_conf.get(stripped_key, 1.0), conf)

    corrections = []
    corrected_text = text

    for entity in ner_entities:
        entity_type = entity.get('entity_group', '')
        entity_word = entity.get('word', '').strip()
        lookup_dict = _entity_dicts.get(entity_type)
        if not lookup_dict or not entity_word:
            continue  # no dictionary for this entity type, or empty span

        for token in entity_word.split():
            clean_token = re.sub(r'[^a-zA-Z]', '', token)
            if not clean_token.isalpha() or len(clean_token) < 4:
                continue  # skip numbers, units, and very short words

            ocr_conf = word_conf.get(clean_token.lower(), 1.0)
            if ocr_conf >= confidence_threshold:
                continue  # OCR was confident enough; leave the token alone

            best_match, best_dist = _find_closest(clean_token, lookup_dict)
            if best_match is None or best_dist > 2:
                continue  # nothing plausibly close in the dictionary
            if best_match.lower() == clean_token.lower():
                continue  # already correct, just low OCR confidence

            replacement = _match_case(clean_token, best_match)
            match = re.search(r'\b' + re.escape(clean_token) + r'\b',
                              corrected_text, re.IGNORECASE)
            if not match:
                continue  # token no longer present (e.g. altered by earlier fix)

            start, end = match.start(), match.end()
            corrected_text = corrected_text[:start] + replacement + corrected_text[end:]
            corrections.append({
                'original': clean_token,
                'corrected': replacement,
                # Similarity score derived from edit distance vs. word length.
                'confidence': round(1.0 - best_dist / max(len(clean_token), len(best_match)), 4),
                'ocr_confidence': round(ocr_conf, 4),
                'edit_distance': best_dist,
                'source': 'ner',
                'entity_type': entity_type,
            })
            # Mark the replacement as trusted so repeated mentions of the
            # same entity don't re-trigger a correction loop.
            word_conf[replacement.lower()] = 1.0

    return {'corrected_text': corrected_text, 'corrections': corrections}
 
 
 
 
 
445
 
 
 
 
 
 
 
 
 
 
 
446
 
447
  # --- IMAGE PREPROCESSING ---
448
  def deskew_image(image: np.ndarray) -> np.ndarray:
 
2151
  for model_id, model_data in NER_MODELS.items()
2152
  },
2153
  "ocr_correction_model": {
2154
+ "id": "ner-dictionary",
2155
+ "name": "NER-Informed Dictionary Correction",
2156
+ "description": "Edit-distance correction against medical entity dictionaries, guided by NER entity labels",
2157
  }
2158
  }
2159
 
 
2279
  primary_table_data = {'is_table': False}
2280
  print("No table detected by any method, using regular OCR text")
2281
 
2282
+ # OCR Text Correction (NER-informed dictionary pass)
2283
  correction_enabled = enable_correction.lower() == "true"
2284
  correction_result = {'corrected_text': cleaned_text, 'corrections': []}
2285
 
2286
+ # Use cleaned text for NER input (NER correction runs after NER, see below)
2287
+ ner_input_text = cleaned_text
 
 
 
 
 
 
 
 
 
 
2288
 
2289
  # Perform NER on text
2290
  print("Running NER...")
 
2303
  # Map entities to bounding boxes
2304
  entities_with_boxes = map_entities_to_boxes(structured_entities, words_with_boxes, ner_input_text)
2305
 
2306
+ # NER-informed correction (second pass: fix low-confidence tokens matching entity dicts)
2307
+ if correction_enabled:
2308
+ ner_corr = correct_with_ner_entities(
2309
+ words_with_boxes, structured_entities,
2310
+ correction_result['corrected_text'], confidence_threshold=float(correction_threshold))
2311
+ if ner_corr['corrections']:
2312
+ correction_result['corrections'].extend(ner_corr['corrections'])
2313
+ correction_result['corrected_text'] = ner_corr['corrected_text']
2314
+ print(f"NER-informed correction: {len(ner_corr['corrections'])} additional fix(es)")
2315
+
2316
  # Check for drug interactions
2317
  detected_drugs = []
2318
  for entity in structured_entities: