Fix spell checking to ignore numbers and add pharmaceutical terms

- Add comprehensive pharmaceutical terms allowlist (100+ terms)
- Include glycerol, tocophersolan, tocopherol, and other pharmaceutical compounds
- Add vitamins, minerals, amino acids, and chemical compounds
- Add pharmaceutical excipients and formulation ingredients
- Add _is_mostly_numbers() function to detect and ignore numeric tokens
- Ignore pure numbers, decimal numbers, percentages, and ordinal numbers
- Ignore tokens in which more than 70% of the characters are digits, so predominantly numeric codes are not flagged
- Numbers like '123', '45.6', '78%', '1st', '2nd' are now ignored
- Pharmaceutical terms are only flagged when actually misspelled
- Comprehensive coverage of pharmaceutical and chemical terminology

Files changed (1) hide show

pdf_comparator.py +68 -1

pdf_comparator.py CHANGED Viewed

@@ -123,7 +123,36 @@ _DOMAIN_ALLOWLIST = {
     # Common misspellings that are actually correct in context
     "colour", "colour", "favour", "favour", "honour", "honour",
     "behaviour", "behaviour", "neighbour", "neighbour", "centre", "centre",
-    "theatre", "theatre", "metre", "metre", "litre", "litre"
 }
 _DOMAIN_ALLOWLIST_LOWER = {w.lower() for w in _DOMAIN_ALLOWLIST}
@@ -172,6 +201,40 @@ def _has_digits(tok: str) -> bool:
     """Check if token contains digits"""
     return any(ch.isdigit() for ch in tok)
 def _is_likely_word(tok: str) -> bool:
     """Check if token looks like a real word (not random characters)"""
     if len(tok) < 2:
@@ -222,6 +285,10 @@ def _is_known_word(tok: str) -> bool:
     if not _is_likely_word(tok):
         return True  # Don't flag non-words as misspellings
     # Check domain allowlist, acronyms, and words with digits
     if t in _DOMAIN_ALLOWLIST_LOWER or _looks_like_acronym(tok) or _has_digits(tok):
         return True

     # Common misspellings that are actually correct in context
     "colour", "colour", "favour", "favour", "honour", "honour",
     "behaviour", "behaviour", "neighbour", "neighbour", "centre", "centre",
+    "theatre", "theatre", "metre", "metre", "litre", "litre",
+    # Pharmaceutical terms
+    "glycerol", "tocophersolan", "tocopherol", "tocopheryl", "acetate",
+    "ascorbic", "ascorbate", "retinol", "retinyl", "palmitate",
+    "stearate", "oleate", "linoleate", "arachidonate", "docosahexaenoate",
+    "eicosapentaenoate", "alpha", "beta", "gamma", "delta", "omega",
+    "hydroxy", "methyl", "ethyl", "propyl", "butyl", "pentyl", "hexyl",
+    "phosphate", "sulfate", "nitrate", "chloride", "bromide", "iodide",
+    "sodium", "potassium", "calcium", "magnesium", "zinc", "iron",
+    "copper", "manganese", "selenium", "chromium", "molybdenum",
+    "thiamine", "riboflavin", "niacin", "pantothenic", "pyridoxine",
+    "biotin", "folate", "cobalamin", "cholecalciferol", "ergocalciferol",
+    "phylloquinone", "menaquinone", "ubiquinone", "coenzyme", "carnitine",
+    "creatine", "taurine", "glutamine", "arginine", "lysine", "leucine",
+    "isoleucine", "valine", "phenylalanine", "tryptophan", "methionine",
+    "cysteine", "tyrosine", "histidine", "proline", "serine", "threonine",
+    "asparagine", "glutamic", "aspartic", "alanine", "glycine",
+    "polysorbate", "monostearate", "distearate", "tristearate",
+    "polyethylene", "polypropylene", "polyvinyl", "carbomer", "carboxymethyl",
+    "cellulose", "hydroxypropyl", "methylcellulose", "ethylcellulose",
+    "microcrystalline", "lactose", "sucrose", "dextrose", "fructose",
+    "maltose", "galactose", "mannitol", "sorbitol", "xylitol", "erythritol",
+    "stearic", "palmitic", "oleic", "linoleic", "arachidonic", "docosahexaenoic",
+    "eicosapentaenoic", "arachidonic", "linolenic", "gamma", "linolenic",
+    "conjugated", "linoleic", "acid", "ester", "amide", "anhydride",
+    "hydrochloride", "hydrobromide", "hydroiodide", "nitrate", "sulfate",
+    "phosphate", "acetate", "citrate", "tartrate", "succinate", "fumarate",
+    "malate", "lactate", "gluconate", "ascorbate", "tocopheryl", "acetate",
+    "palmitate", "stearate", "oleate", "linoleate", "arachidonate"
 }
 _DOMAIN_ALLOWLIST_LOWER = {w.lower() for w in _DOMAIN_ALLOWLIST}
     """Check if token contains digits"""
     return any(ch.isdigit() for ch in tok)
+def _is_mostly_numbers(tok: str) -> bool:
+    """Return True if ``tok`` is numeric enough that it should not be spell-checked.
+
+    A token counts as "mostly numbers" when any of the following holds:
+      * more than 70% of its characters are digits (this also covers pure
+        numbers such as "123"),
+      * it is an ordinal: one or more digits followed by "st", "nd", "rd"
+        or "th" (case-insensitive), e.g. "1st" or "22ND",
+      * it contains a "." and at least one digit, e.g. "4.5",
+      * it ends with "%" and contains at least one digit, e.g. "78%".
+
+    An empty token, or a token without any digit, returns False.
+    """
+    if not tok:
+        return False
+    digit_count = sum(ch.isdigit() for ch in tok)
+    if digit_count == 0:
+        # Every rule below requires at least one digit.
+        return False
+    # Strictly more than 70% digits; a pure number is 100% and passes here too.
+    if digit_count / len(tok) > 0.7:
+        return True
+    # Ordinals: require everything before the suffix to be digits, so ordinary
+    # tokens that merely contain a digit and end in "th"/"st" (e.g. "3month")
+    # are not mistaken for numbers.
+    if tok[-2:].lower() in ("st", "nd", "rd", "th") and tok[:-2].isdigit():
+        return True
+    # Decimal-like ("4.5") or percentage ("78%") tokens.
+    return "." in tok or tok.endswith("%")
 def _is_likely_word(tok: str) -> bool:
     """Check if token looks like a real word (not random characters)"""
     if len(tok) < 2:
     if not _is_likely_word(tok):
         return True  # Don't flag non-words as misspellings
+    # Ignore numbers and mostly numeric tokens
+    if _is_mostly_numbers(tok):
+        return True  # Don't flag numbers as misspellings
     # Check domain allowlist, acronyms, and words with digits
     if t in _DOMAIN_ALLOWLIST_LOWER or _looks_like_acronym(tok) or _has_digits(tok):
         return True