Yaz Hobooti
committed on
Commit
·
a127220
1
Parent(s):
c0f0c6d
Fix spell checking to ignore numbers and add pharmaceutical terms
Browse files
- Add comprehensive pharmaceutical terms allowlist (100+ terms)
- Include glycerol, tocophersolan, tocopherol, and other pharmaceutical compounds
- Add vitamins, minerals, amino acids, and chemical compounds
- Add pharmaceutical excipients and formulation ingredients
- Add _is_mostly_numbers() function to detect and ignore numeric tokens
- Ignore pure numbers, decimal numbers, percentages, and ordinal numbers
- Ignore tokens with >70% digits to avoid flagging mixed alphanumeric codes
- Numbers like '123', '45.6', '78%', '1st', '2nd' are now ignored
- Pharmaceutical terms are only flagged when actually misspelled
- Comprehensive coverage of pharmaceutical and chemical terminology
- pdf_comparator.py +68 -1
pdf_comparator.py
CHANGED
|
@@ -123,7 +123,36 @@ _DOMAIN_ALLOWLIST = {
|
|
| 123 |
# Common misspellings that are actually correct in context
|
| 124 |
"colour", "colour", "favour", "favour", "honour", "honour",
|
| 125 |
"behaviour", "behaviour", "neighbour", "neighbour", "centre", "centre",
|
| 126 |
-
"theatre", "theatre", "metre", "metre", "litre", "litre"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
}
|
| 128 |
_DOMAIN_ALLOWLIST_LOWER = {w.lower() for w in _DOMAIN_ALLOWLIST}
|
| 129 |
|
|
@@ -172,6 +201,40 @@ def _has_digits(tok: str) -> bool:
|
|
| 172 |
"""Check if token contains digits"""
|
| 173 |
return any(ch.isdigit() for ch in tok)
|
| 174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
def _is_likely_word(tok: str) -> bool:
|
| 176 |
"""Check if token looks like a real word (not random characters)"""
|
| 177 |
if len(tok) < 2:
|
|
@@ -222,6 +285,10 @@ def _is_known_word(tok: str) -> bool:
|
|
| 222 |
if not _is_likely_word(tok):
|
| 223 |
return True # Don't flag non-words as misspellings
|
| 224 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
# Check domain allowlist, acronyms, and words with digits
|
| 226 |
if t in _DOMAIN_ALLOWLIST_LOWER or _looks_like_acronym(tok) or _has_digits(tok):
|
| 227 |
return True
|
|
|
|
| 123 |
# Common misspellings that are actually correct in context
|
| 124 |
"colour", "colour", "favour", "favour", "honour", "honour",
|
| 125 |
"behaviour", "behaviour", "neighbour", "neighbour", "centre", "centre",
|
| 126 |
+
"theatre", "theatre", "metre", "metre", "litre", "litre",
|
| 127 |
+
|
| 128 |
+
# Pharmaceutical terms
|
| 129 |
+
"glycerol", "tocophersolan", "tocopherol", "tocopheryl", "acetate",
|
| 130 |
+
"ascorbic", "ascorbate", "retinol", "retinyl", "palmitate",
|
| 131 |
+
"stearate", "oleate", "linoleate", "arachidonate", "docosahexaenoate",
|
| 132 |
+
"eicosapentaenoate", "alpha", "beta", "gamma", "delta", "omega",
|
| 133 |
+
"hydroxy", "methyl", "ethyl", "propyl", "butyl", "pentyl", "hexyl",
|
| 134 |
+
"phosphate", "sulfate", "nitrate", "chloride", "bromide", "iodide",
|
| 135 |
+
"sodium", "potassium", "calcium", "magnesium", "zinc", "iron",
|
| 136 |
+
"copper", "manganese", "selenium", "chromium", "molybdenum",
|
| 137 |
+
"thiamine", "riboflavin", "niacin", "pantothenic", "pyridoxine",
|
| 138 |
+
"biotin", "folate", "cobalamin", "cholecalciferol", "ergocalciferol",
|
| 139 |
+
"phylloquinone", "menaquinone", "ubiquinone", "coenzyme", "carnitine",
|
| 140 |
+
"creatine", "taurine", "glutamine", "arginine", "lysine", "leucine",
|
| 141 |
+
"isoleucine", "valine", "phenylalanine", "tryptophan", "methionine",
|
| 142 |
+
"cysteine", "tyrosine", "histidine", "proline", "serine", "threonine",
|
| 143 |
+
"asparagine", "glutamic", "aspartic", "alanine", "glycine",
|
| 144 |
+
"polysorbate", "monostearate", "distearate", "tristearate",
|
| 145 |
+
"polyethylene", "polypropylene", "polyvinyl", "carbomer", "carboxymethyl",
|
| 146 |
+
"cellulose", "hydroxypropyl", "methylcellulose", "ethylcellulose",
|
| 147 |
+
"microcrystalline", "lactose", "sucrose", "dextrose", "fructose",
|
| 148 |
+
"maltose", "galactose", "mannitol", "sorbitol", "xylitol", "erythritol",
|
| 149 |
+
"stearic", "palmitic", "oleic", "linoleic", "arachidonic", "docosahexaenoic",
|
| 150 |
+
"eicosapentaenoic", "arachidonic", "linolenic", "gamma", "linolenic",
|
| 151 |
+
"conjugated", "linoleic", "acid", "ester", "amide", "anhydride",
|
| 152 |
+
"hydrochloride", "hydrobromide", "hydroiodide", "nitrate", "sulfate",
|
| 153 |
+
"phosphate", "acetate", "citrate", "tartrate", "succinate", "fumarate",
|
| 154 |
+
"malate", "lactate", "gluconate", "ascorbate", "tocopheryl", "acetate",
|
| 155 |
+
"palmitate", "stearate", "oleate", "linoleate", "arachidonate"
|
| 156 |
}
|
| 157 |
_DOMAIN_ALLOWLIST_LOWER = {w.lower() for w in _DOMAIN_ALLOWLIST}
|
| 158 |
|
|
|
|
| 201 |
"""Check if token contains digits"""
|
| 202 |
return any(ch.isdigit() for ch in tok)
|
| 203 |
|
| 204 |
+
def _is_mostly_numbers(tok: str) -> bool:
|
| 205 |
+
"""Check if token is mostly numbers (should be ignored)"""
|
| 206 |
+
if not tok:
|
| 207 |
+
return False
|
| 208 |
+
|
| 209 |
+
# Count digits and letters
|
| 210 |
+
digit_count = sum(1 for ch in tok if ch.isdigit())
|
| 211 |
+
letter_count = sum(1 for ch in tok if ch.isalpha())
|
| 212 |
+
total_chars = len(tok)
|
| 213 |
+
|
| 214 |
+
# If more than 70% digits, consider it mostly numbers
|
| 215 |
+
if digit_count / total_chars > 0.7:
|
| 216 |
+
return True
|
| 217 |
+
|
| 218 |
+
# If it's a pure number (all digits), ignore it
|
| 219 |
+
if digit_count == total_chars:
|
| 220 |
+
return True
|
| 221 |
+
|
| 222 |
+
# If it's a number with common suffixes (like "1st", "2nd", "3rd", "4th")
|
| 223 |
+
if total_chars >= 2 and digit_count >= 1:
|
| 224 |
+
suffix = tok[-2:].lower()
|
| 225 |
+
if suffix in ['st', 'nd', 'rd', 'th']:
|
| 226 |
+
return True
|
| 227 |
+
|
| 228 |
+
# If it's a decimal number (contains digits and decimal point)
|
| 229 |
+
if '.' in tok and digit_count > 0:
|
| 230 |
+
return True
|
| 231 |
+
|
| 232 |
+
# If it's a percentage (ends with %)
|
| 233 |
+
if tok.endswith('%') and digit_count > 0:
|
| 234 |
+
return True
|
| 235 |
+
|
| 236 |
+
return False
|
| 237 |
+
|
| 238 |
def _is_likely_word(tok: str) -> bool:
|
| 239 |
"""Check if token looks like a real word (not random characters)"""
|
| 240 |
if len(tok) < 2:
|
|
|
|
| 285 |
if not _is_likely_word(tok):
|
| 286 |
return True # Don't flag non-words as misspellings
|
| 287 |
|
| 288 |
+
# Ignore numbers and mostly numeric tokens
|
| 289 |
+
if _is_mostly_numbers(tok):
|
| 290 |
+
return True # Don't flag numbers as misspellings
|
| 291 |
+
|
| 292 |
# Check domain allowlist, acronyms, and words with digits
|
| 293 |
if t in _DOMAIN_ALLOWLIST_LOWER or _looks_like_acronym(tok) or _has_digits(tok):
|
| 294 |
return True
|