Yaz Hobooti committed on
Commit
a127220
·
1 Parent(s): c0f0c6d

Fix spell checking to ignore numbers and add pharmaceutical terms

Browse files

- Add comprehensive pharmaceutical terms allowlist (100+ terms)
- Include glycerol, tocophersolan, tocopherol, and other pharmaceutical compounds
- Add vitamins, minerals, amino acids, and chemical compounds
- Add pharmaceutical excipients and formulation ingredients
- Add _is_mostly_numbers() function to detect and ignore numeric tokens
- Ignore pure numbers, decimal numbers, percentages, and ordinal numbers
- Ignore tokens with >70% digits to avoid flagging mixed alphanumeric codes
- Numbers like '123', '45.6', '78%', '1st', '2nd' are now ignored
- Pharmaceutical terms are only flagged when actually misspelled
- Comprehensive coverage of pharmaceutical and chemical terminology

Files changed (1) hide show
  1. pdf_comparator.py +68 -1
pdf_comparator.py CHANGED
@@ -123,7 +123,36 @@ _DOMAIN_ALLOWLIST = {
123
  # Common misspellings that are actually correct in context
124
  "colour", "colour", "favour", "favour", "honour", "honour",
125
  "behaviour", "behaviour", "neighbour", "neighbour", "centre", "centre",
126
- "theatre", "theatre", "metre", "metre", "litre", "litre"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  }
128
  _DOMAIN_ALLOWLIST_LOWER = {w.lower() for w in _DOMAIN_ALLOWLIST}
129
 
@@ -172,6 +201,40 @@ def _has_digits(tok: str) -> bool:
172
  """Check if token contains digits"""
173
  return any(ch.isdigit() for ch in tok)
174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  def _is_likely_word(tok: str) -> bool:
176
  """Check if token looks like a real word (not random characters)"""
177
  if len(tok) < 2:
@@ -222,6 +285,10 @@ def _is_known_word(tok: str) -> bool:
222
  if not _is_likely_word(tok):
223
  return True # Don't flag non-words as misspellings
224
 
 
 
 
 
225
  # Check domain allowlist, acronyms, and words with digits
226
  if t in _DOMAIN_ALLOWLIST_LOWER or _looks_like_acronym(tok) or _has_digits(tok):
227
  return True
 
123
  # Common misspellings that are actually correct in context
124
  "colour", "colour", "favour", "favour", "honour", "honour",
125
  "behaviour", "behaviour", "neighbour", "neighbour", "centre", "centre",
126
+ "theatre", "theatre", "metre", "metre", "litre", "litre",
127
+
128
+ # Pharmaceutical terms
129
+ "glycerol", "tocophersolan", "tocopherol", "tocopheryl", "acetate",
130
+ "ascorbic", "ascorbate", "retinol", "retinyl", "palmitate",
131
+ "stearate", "oleate", "linoleate", "arachidonate", "docosahexaenoate",
132
+ "eicosapentaenoate", "alpha", "beta", "gamma", "delta", "omega",
133
+ "hydroxy", "methyl", "ethyl", "propyl", "butyl", "pentyl", "hexyl",
134
+ "phosphate", "sulfate", "nitrate", "chloride", "bromide", "iodide",
135
+ "sodium", "potassium", "calcium", "magnesium", "zinc", "iron",
136
+ "copper", "manganese", "selenium", "chromium", "molybdenum",
137
+ "thiamine", "riboflavin", "niacin", "pantothenic", "pyridoxine",
138
+ "biotin", "folate", "cobalamin", "cholecalciferol", "ergocalciferol",
139
+ "phylloquinone", "menaquinone", "ubiquinone", "coenzyme", "carnitine",
140
+ "creatine", "taurine", "glutamine", "arginine", "lysine", "leucine",
141
+ "isoleucine", "valine", "phenylalanine", "tryptophan", "methionine",
142
+ "cysteine", "tyrosine", "histidine", "proline", "serine", "threonine",
143
+ "asparagine", "glutamic", "aspartic", "alanine", "glycine",
144
+ "polysorbate", "monostearate", "distearate", "tristearate",
145
+ "polyethylene", "polypropylene", "polyvinyl", "carbomer", "carboxymethyl",
146
+ "cellulose", "hydroxypropyl", "methylcellulose", "ethylcellulose",
147
+ "microcrystalline", "lactose", "sucrose", "dextrose", "fructose",
148
+ "maltose", "galactose", "mannitol", "sorbitol", "xylitol", "erythritol",
149
+ "stearic", "palmitic", "oleic", "linoleic", "arachidonic", "docosahexaenoic",
150
+ "eicosapentaenoic", "arachidonic", "linolenic", "gamma", "linolenic",
151
+ "conjugated", "linoleic", "acid", "ester", "amide", "anhydride",
152
+ "hydrochloride", "hydrobromide", "hydroiodide", "nitrate", "sulfate",
153
+ "phosphate", "acetate", "citrate", "tartrate", "succinate", "fumarate",
154
+ "malate", "lactate", "gluconate", "ascorbate", "tocopheryl", "acetate",
155
+ "palmitate", "stearate", "oleate", "linoleate", "arachidonate"
156
  }
157
  _DOMAIN_ALLOWLIST_LOWER = {w.lower() for w in _DOMAIN_ALLOWLIST}
158
 
 
201
  """Check if token contains digits"""
202
  return any(ch.isdigit() for ch in tok)
203
 
204
+ def _is_mostly_numbers(tok: str) -> bool:
205
+ """Check if token is mostly numbers (should be ignored)"""
206
+ if not tok:
207
+ return False
208
+
209
+ # Count digits and letters
210
+ digit_count = sum(1 for ch in tok if ch.isdigit())
211
+ letter_count = sum(1 for ch in tok if ch.isalpha())
212
+ total_chars = len(tok)
213
+
214
+ # If more than 70% digits, consider it mostly numbers
215
+ if digit_count / total_chars > 0.7:
216
+ return True
217
+
218
+ # If it's a pure number (all digits), ignore it
219
+ if digit_count == total_chars:
220
+ return True
221
+
222
+ # If it's a number with common suffixes (like "1st", "2nd", "3rd", "4th")
223
+ if total_chars >= 2 and digit_count >= 1:
224
+ suffix = tok[-2:].lower()
225
+ if suffix in ['st', 'nd', 'rd', 'th']:
226
+ return True
227
+
228
+ # If it's a decimal number (contains digits and decimal point)
229
+ if '.' in tok and digit_count > 0:
230
+ return True
231
+
232
+ # If it's a percentage (ends with %)
233
+ if tok.endswith('%') and digit_count > 0:
234
+ return True
235
+
236
+ return False
237
+
238
  def _is_likely_word(tok: str) -> bool:
239
  """Check if token looks like a real word (not random characters)"""
240
  if len(tok) < 2:
 
285
  if not _is_likely_word(tok):
286
  return True # Don't flag non-words as misspellings
287
 
288
+ # Ignore numbers and mostly numeric tokens
289
+ if _is_mostly_numbers(tok):
290
+ return True # Don't flag numbers as misspellings
291
+
292
  # Check domain allowlist, acronyms, and words with digits
293
  if t in _DOMAIN_ALLOWLIST_LOWER or _looks_like_acronym(tok) or _has_digits(tok):
294
  return True