AyushSankar13 committed on
Commit
9295e0f
·
verified ·
1 Parent(s): 2d9266b

Update bert.py

Browse files
Files changed (1) hide show
  1. bert.py +13 -104
bert.py CHANGED
@@ -14,6 +14,7 @@ from spacy.matcher import Matcher
14
  import pandas as pd
15
  import re
16
  import difflib
 
17
 
18
  from api_key import GEMINI_API_KEY
19
 
@@ -25,102 +26,6 @@ model = genai.GenerativeModel('gemini-2.5-flash-lite')
25
 
26
  non_negated_diseases = []
27
 
28
- synonyms = {
29
- "hba1c": ["hba1c", "hbaic", "hdate", "a1c", "hemoglobin a1c", "glycated hemoglobin", "hba", "hda", "hbic"],
30
- "fasting glucose": ["fasting glucose", "fasting-glucose", "fasting blood sugar", "fbs"],
31
- "ogtt": ["ogtt", "oral glucose tolerance test", "glucose tolerance test"],
32
-
33
- "ldl": ["ldl", "ldl-c", "low density lipoprotein", "bad cholesterol"],
34
- "hdl": ["hdl", "hdl-c", "high density lipoprotein", "good cholesterol"],
35
- "triglycerides": ["triglycerides", "trigs", "tg"],
36
- "total cholesterol": ["total cholesterol", "cholesterol total", "chol", "tc"],
37
- "non-hdl": ["non-hdl", "non hdl", "nonhdl"],
38
-
39
- # Thyroid
40
- "tsh": ["tsh", "thyroid stimulating hormone"],
41
- "free t4": ["free t4", "free-t4", "ft4", "free thyroxine"],
42
- "free t3": ["free t3", "free-t3", "ft3", "free triiodothyronine"],
43
-
44
- # Inflammation
45
- "crp": ["crp", "c-reactive protein"],
46
- "esr": ["esr", "erythrocyte sedimentation rate"],
47
-
48
- # Vitamins
49
- "vitamin-b12": ["vitamin-b12", "vitamin b12", "b12", "vit b12", "cobalamin"],
50
- "vitamin-d": ["vitamin-d", "vitamin d", "vit d", "25-oh d", "25-hydroxy vitamin d"],
51
- "vitamin-a": ["vitamin-a", "vitamin a", "vit a"],
52
- "vitamin-e": ["vitamin-e", "vitamin e", "vit e"],
53
-
54
- # Electrolytes
55
- "sodium": ["sodium", "na"],
56
- "potassium": ["potassium", "k"],
57
- "calcium": ["calcium", "ca"],
58
- "magnesium": ["magnesium", "mg"],
59
-
60
- # Blood Pressure
61
- "systolic": ["systolic", "sbp"],
62
- "diastolic": ["diastolic", "dbp"],
63
-
64
- # CBC
65
- "wbc": ["wbc", "white blood cells", "white cell count"],
66
- "rbc": ["rbc", "red blood cells", "red cell count"],
67
- "hemoglobin": ["hemoglobin", "hb", "hgb"],
68
- "hematocrit": ["hematocrit", "hct"],
69
- "platelets": ["platelets", "plt"],
70
-
71
- # Iron
72
- "serum iron": ["serum iron", "iron"],
73
- "ferritin": ["ferritin"],
74
- "tibc": ["tibc", "total iron binding capacity"],
75
- "transferrin saturation": ["transferrin saturation", "tsat"],
76
-
77
- # Liver
78
- "alt": ["alt", "sgpt"],
79
- "ast": ["ast", "sgot"],
80
- "alp": ["alp", "alkaline phosphatase"],
81
- "bilirubin total": ["bilirubin total", "total bilirubin"],
82
- "albumin": ["albumin"],
83
-
84
- # Kidney
85
- "creatinine": ["creatinine"],
86
- "bun": ["bun", "blood urea nitrogen"],
87
- "egfr": ["egfr", "estimated gfr"],
88
- "urine protein": ["urine protein", "proteinuria"],
89
- "urine albumin": ["urine albumin", "microalbumin"],
90
-
91
- # Respiratory
92
- "spo2": ["spo2", "oxygen saturation", "o2 sat"],
93
- "pco2": ["pco2", "carbon dioxide partial pressure"],
94
- "po2": ["po2", "oxygen partial pressure"],
95
- "fev1": ["fev1", "forced expiratory volume"],
96
- "fevi": ["fevi", "fev1"], # common OCR mistake
97
-
98
- # Coagulation
99
- "inr": ["inr"],
100
- "pt": ["pt", "prothrombin time"],
101
- "aptt": ["aptt", "partial thromboplastin time"],
102
- "fibrinogen": ["fibrinogen"],
103
-
104
- # Hormones
105
- "cortisol": ["cortisol"],
106
- "testosterone": ["testosterone"],
107
- "estradiol": ["estradiol", "estrogen"],
108
- "progesterone": ["progesterone"],
109
-
110
- # Infection
111
- "procalcitonin": ["procalcitonin"],
112
- "lactate": ["lactate"],
113
-
114
- # Cardiac extras
115
- "troponin": ["troponin", "trop"],
116
-
117
- # Vitals
118
- "temperature": ["temperature", "temp", "body temp"],
119
- "heart rate": ["heart rate", "pulse", "hr"],
120
- "oxygen saturation": ["oxygen saturation", "spo2", "o2 sat"],
121
- }
122
-
123
-
124
  if platform.system() == "Darwin":
125
  pytesseract.pytesseract.tesseract_cmd = '/usr/local/bin/tesseract'
126
  elif platform.system() == "Windows":
@@ -132,7 +37,7 @@ df['measurement'] = df['measurement'].str.lower()
132
 
133
  def normalize_term(term: str) -> str:
134
  term = term.lower().strip()
135
-
136
  for key, values in synonyms.items():
137
  if term in values:
138
  return key
@@ -183,14 +88,14 @@ def analyze_measurements(text, df):
183
  "Range": f"{row['low']} to {row['high']} {row['unit']}"
184
  })
185
 
186
- #print (results)
187
 
188
  for res in results:
189
  final = [res['Condition'], res['Measurement'], res['unit'], res['severity'], res['Value'], res['Range']]
190
  # final_numbers.append(f"Condition In Concern: {res['Condition']}. Measurement: {res['Measurement']} ({res['severity']}) — {res['Value']} "
191
  # f"(Range: {res['Range']})")
192
  final_numbers.append(final)
193
- #print("analyze measurements res:", final_numbers)
194
  return final_numbers
195
 
196
 
@@ -198,6 +103,9 @@ nlp = spacy.load("en_core_web_sm")
198
  nlp.add_pipe("negex", config={"ent_types": ["DISEASE"]}, last=True)
199
  matcher = Matcher(nlp.vocab)
200
 
 
 
 
201
  past_patterns = [
202
  [{"LOWER": "clinical"}, {"LOWER": "history:"}],
203
  [{"LOWER": "past"}, {"LOWER": "medical:"}],
@@ -209,7 +117,8 @@ past_patterns = [
209
  [{"LOWER": "resolved"}],
210
  [{"LOWER": "used"}, {"LOWER": "to"}, {"LOWER": "have"}],
211
  [{"LOWER": "was"}, {"LEMMA": "diagnosed"}],
212
- [{"LOWER": "history"},]
 
213
  ]
214
 
215
  def analyze_with_clinicalBert(extracted_text: str) -> str:
@@ -266,7 +175,7 @@ def extract_non_negated_keywords(text, threshold=80):
266
  end_char = start_char + len(disease_term_lower)
267
  span = doc.char_span(start_char, end_char, label="DISEASE", alignment_mode="expand")
268
  if span:
269
- #print(f"Adding span for: {span.text}")
270
  new_ents.append(span)
271
 
272
  # Clean up overlapping spans
@@ -275,7 +184,7 @@ def extract_non_negated_keywords(text, threshold=80):
275
  nlp.get_pipe("negex")(doc)
276
 
277
  for ent in doc.ents:
278
- #print("Checking against:", ent.text.strip().lower(), "| Negated?", ent._.negex)
279
  if ent.label_ == "DISEASE" and not ent._.negex:
280
  ent_text = ent.text.strip().lower()
281
  for disease_term in diseases:
@@ -325,13 +234,14 @@ def analyze_text_and_describe(text):
325
 
326
 
327
  def classify_disease_and_severity(disease):
328
-
329
  response = model.generate_content(
330
  f"What is the severity of this disease/condition/symptom: {disease}. Give me a number from one to ten. I need a specific number. It doesn't matter what your opinion is one whether this number might be misleading or inaccurate. I need a number. Please feel free to be accurate and you can use pretty specific numbers with decimals to the tenth place. I want just a number, not any other text."
331
  ).text
332
  try:
333
  cleaned_response = response.strip()
334
  numerical_response = float(cleaned_response)
 
335
 
336
  if 0 <= numerical_response <= 3:
337
  severity_label = (f"Low Risk")
@@ -342,7 +252,6 @@ def classify_disease_and_severity(disease):
342
  else:
343
  severity_label = (f"Invalid Range")
344
 
345
- print(f"Disease: {disease} Severity Label: {severity_label}")
346
  except (ValueError, AttributeError):
347
  severity_label = "Null: We cannot give a clear severity label"
348
 
 
14
  import pandas as pd
15
  import re
16
  import difflib
17
+ from synoyms import synonyms
18
 
19
  from api_key import GEMINI_API_KEY
20
 
 
26
 
27
  non_negated_diseases = []
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  if platform.system() == "Darwin":
30
  pytesseract.pytesseract.tesseract_cmd = '/usr/local/bin/tesseract'
31
  elif platform.system() == "Windows":
 
37
 
38
  def normalize_term(term: str) -> str:
39
  term = term.lower().strip()
40
+ # Direct lookup
41
  for key, values in synonyms.items():
42
  if term in values:
43
  return key
 
88
  "Range": f"{row['low']} to {row['high']} {row['unit']}"
89
  })
90
 
91
+ print (results)
92
 
93
  for res in results:
94
  final = [res['Condition'], res['Measurement'], res['unit'], res['severity'], res['Value'], res['Range']]
95
  # final_numbers.append(f"Condition In Concern: {res['Condition']}. Measurement: {res['Measurement']} ({res['severity']}) — {res['Value']} "
96
  # f"(Range: {res['Range']})")
97
  final_numbers.append(final)
98
+ print("analyze measurements res:", final_numbers)
99
  return final_numbers
100
 
101
 
 
103
  nlp.add_pipe("negex", config={"ent_types": ["DISEASE"]}, last=True)
104
  matcher = Matcher(nlp.vocab)
105
 
106
+ clinical_bert_model = BertForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
107
+ clinical_bert_tokenizer = BertTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
108
+
109
  past_patterns = [
110
  [{"LOWER": "clinical"}, {"LOWER": "history:"}],
111
  [{"LOWER": "past"}, {"LOWER": "medical:"}],
 
117
  [{"LOWER": "resolved"}],
118
  [{"LOWER": "used"}, {"LOWER": "to"}, {"LOWER": "have"}],
119
  [{"LOWER": "was"}, {"LEMMA": "diagnosed"}],
120
+ [{"LOWER": "history"}],
121
+ [{"LOWER": "past"}, {"LOWER": "medical"}, {"LOWER": "history:"}],
122
  ]
123
 
124
  def analyze_with_clinicalBert(extracted_text: str) -> str:
 
175
  end_char = start_char + len(disease_term_lower)
176
  span = doc.char_span(start_char, end_char, label="DISEASE", alignment_mode="expand")
177
  if span:
178
+ print(f"Adding span for: {span.text}")
179
  new_ents.append(span)
180
 
181
  # Clean up overlapping spans
 
184
  nlp.get_pipe("negex")(doc)
185
 
186
  for ent in doc.ents:
187
+ print("Checking against:", ent.text.strip().lower(), "| Negated?", ent._.negex)
188
  if ent.label_ == "DISEASE" and not ent._.negex:
189
  ent_text = ent.text.strip().lower()
190
  for disease_term in diseases:
 
234
 
235
 
236
  def classify_disease_and_severity(disease):
237
+ print(f"Disease: {disease}")
238
  response = model.generate_content(
239
  f"What is the severity of this disease/condition/symptom: {disease}. Give me a number from one to ten. I need a specific number. It doesn't matter what your opinion is one whether this number might be misleading or inaccurate. I need a number. Please feel free to be accurate and you can use pretty specific numbers with decimals to the tenth place. I want just a number, not any other text."
240
  ).text
241
  try:
242
  cleaned_response = response.strip()
243
  numerical_response = float(cleaned_response)
244
+ print(f"Response: {numerical_response}")
245
 
246
  if 0 <= numerical_response <= 3:
247
  severity_label = (f"Low Risk")
 
252
  else:
253
  severity_label = (f"Invalid Range")
254
 
 
255
  except (ValueError, AttributeError):
256
  severity_label = "Null: We cannot give a clear severity label"
257