Spaces:
Sleeping
Sleeping
Update bert.py
Browse files
bert.py
CHANGED
|
@@ -14,6 +14,7 @@ from spacy.matcher import Matcher
|
|
| 14 |
import pandas as pd
|
| 15 |
import re
|
| 16 |
import difflib
|
|
|
|
| 17 |
|
| 18 |
from api_key import GEMINI_API_KEY
|
| 19 |
|
|
@@ -25,102 +26,6 @@ model = genai.GenerativeModel('gemini-2.5-flash-lite')
|
|
| 25 |
|
| 26 |
non_negated_diseases = []
|
| 27 |
|
| 28 |
-
synonyms = {
|
| 29 |
-
"hba1c": ["hba1c", "hbaic", "hdate", "a1c", "hemoglobin a1c", "glycated hemoglobin", "hba", "hda", "hbic"],
|
| 30 |
-
"fasting glucose": ["fasting glucose", "fasting-glucose", "fasting blood sugar", "fbs"],
|
| 31 |
-
"ogtt": ["ogtt", "oral glucose tolerance test", "glucose tolerance test"],
|
| 32 |
-
|
| 33 |
-
"ldl": ["ldl", "ldl-c", "low density lipoprotein", "bad cholesterol"],
|
| 34 |
-
"hdl": ["hdl", "hdl-c", "high density lipoprotein", "good cholesterol"],
|
| 35 |
-
"triglycerides": ["triglycerides", "trigs", "tg"],
|
| 36 |
-
"total cholesterol": ["total cholesterol", "cholesterol total", "chol", "tc"],
|
| 37 |
-
"non-hdl": ["non-hdl", "non hdl", "nonhdl"],
|
| 38 |
-
|
| 39 |
-
# Thyroid
|
| 40 |
-
"tsh": ["tsh", "thyroid stimulating hormone"],
|
| 41 |
-
"free t4": ["free t4", "free-t4", "ft4", "free thyroxine"],
|
| 42 |
-
"free t3": ["free t3", "free-t3", "ft3", "free triiodothyronine"],
|
| 43 |
-
|
| 44 |
-
# Inflammation
|
| 45 |
-
"crp": ["crp", "c-reactive protein"],
|
| 46 |
-
"esr": ["esr", "erythrocyte sedimentation rate"],
|
| 47 |
-
|
| 48 |
-
# Vitamins
|
| 49 |
-
"vitamin-b12": ["vitamin-b12", "vitamin b12", "b12", "vit b12", "cobalamin"],
|
| 50 |
-
"vitamin-d": ["vitamin-d", "vitamin d", "vit d", "25-oh d", "25-hydroxy vitamin d"],
|
| 51 |
-
"vitamin-a": ["vitamin-a", "vitamin a", "vit a"],
|
| 52 |
-
"vitamin-e": ["vitamin-e", "vitamin e", "vit e"],
|
| 53 |
-
|
| 54 |
-
# Electrolytes
|
| 55 |
-
"sodium": ["sodium", "na"],
|
| 56 |
-
"potassium": ["potassium", "k"],
|
| 57 |
-
"calcium": ["calcium", "ca"],
|
| 58 |
-
"magnesium": ["magnesium", "mg"],
|
| 59 |
-
|
| 60 |
-
# Blood Pressure
|
| 61 |
-
"systolic": ["systolic", "sbp"],
|
| 62 |
-
"diastolic": ["diastolic", "dbp"],
|
| 63 |
-
|
| 64 |
-
# CBC
|
| 65 |
-
"wbc": ["wbc", "white blood cells", "white cell count"],
|
| 66 |
-
"rbc": ["rbc", "red blood cells", "red cell count"],
|
| 67 |
-
"hemoglobin": ["hemoglobin", "hb", "hgb"],
|
| 68 |
-
"hematocrit": ["hematocrit", "hct"],
|
| 69 |
-
"platelets": ["platelets", "plt"],
|
| 70 |
-
|
| 71 |
-
# Iron
|
| 72 |
-
"serum iron": ["serum iron", "iron"],
|
| 73 |
-
"ferritin": ["ferritin"],
|
| 74 |
-
"tibc": ["tibc", "total iron binding capacity"],
|
| 75 |
-
"transferrin saturation": ["transferrin saturation", "tsat"],
|
| 76 |
-
|
| 77 |
-
# Liver
|
| 78 |
-
"alt": ["alt", "sgpt"],
|
| 79 |
-
"ast": ["ast", "sgot"],
|
| 80 |
-
"alp": ["alp", "alkaline phosphatase"],
|
| 81 |
-
"bilirubin total": ["bilirubin total", "total bilirubin"],
|
| 82 |
-
"albumin": ["albumin"],
|
| 83 |
-
|
| 84 |
-
# Kidney
|
| 85 |
-
"creatinine": ["creatinine"],
|
| 86 |
-
"bun": ["bun", "blood urea nitrogen"],
|
| 87 |
-
"egfr": ["egfr", "estimated gfr"],
|
| 88 |
-
"urine protein": ["urine protein", "proteinuria"],
|
| 89 |
-
"urine albumin": ["urine albumin", "microalbumin"],
|
| 90 |
-
|
| 91 |
-
# Respiratory
|
| 92 |
-
"spo2": ["spo2", "oxygen saturation", "o2 sat"],
|
| 93 |
-
"pco2": ["pco2", "carbon dioxide partial pressure"],
|
| 94 |
-
"po2": ["po2", "oxygen partial pressure"],
|
| 95 |
-
"fev1": ["fev1", "forced expiratory volume"],
|
| 96 |
-
"fevi": ["fevi", "fev1"], # common OCR mistake
|
| 97 |
-
|
| 98 |
-
# Coagulation
|
| 99 |
-
"inr": ["inr"],
|
| 100 |
-
"pt": ["pt", "prothrombin time"],
|
| 101 |
-
"aptt": ["aptt", "partial thromboplastin time"],
|
| 102 |
-
"fibrinogen": ["fibrinogen"],
|
| 103 |
-
|
| 104 |
-
# Hormones
|
| 105 |
-
"cortisol": ["cortisol"],
|
| 106 |
-
"testosterone": ["testosterone"],
|
| 107 |
-
"estradiol": ["estradiol", "estrogen"],
|
| 108 |
-
"progesterone": ["progesterone"],
|
| 109 |
-
|
| 110 |
-
# Infection
|
| 111 |
-
"procalcitonin": ["procalcitonin"],
|
| 112 |
-
"lactate": ["lactate"],
|
| 113 |
-
|
| 114 |
-
# Cardiac extras
|
| 115 |
-
"troponin": ["troponin", "trop"],
|
| 116 |
-
|
| 117 |
-
# Vitals
|
| 118 |
-
"temperature": ["temperature", "temp", "body temp"],
|
| 119 |
-
"heart rate": ["heart rate", "pulse", "hr"],
|
| 120 |
-
"oxygen saturation": ["oxygen saturation", "spo2", "o2 sat"],
|
| 121 |
-
}
|
| 122 |
-
|
| 123 |
-
|
| 124 |
if platform.system() == "Darwin":
|
| 125 |
pytesseract.pytesseract.tesseract_cmd = '/usr/local/bin/tesseract'
|
| 126 |
elif platform.system() == "Windows":
|
|
@@ -132,7 +37,7 @@ df['measurement'] = df['measurement'].str.lower()
|
|
| 132 |
|
| 133 |
def normalize_term(term: str) -> str:
|
| 134 |
term = term.lower().strip()
|
| 135 |
-
|
| 136 |
for key, values in synonyms.items():
|
| 137 |
if term in values:
|
| 138 |
return key
|
|
@@ -183,14 +88,14 @@ def analyze_measurements(text, df):
|
|
| 183 |
"Range": f"{row['low']} to {row['high']} {row['unit']}"
|
| 184 |
})
|
| 185 |
|
| 186 |
-
|
| 187 |
|
| 188 |
for res in results:
|
| 189 |
final = [res['Condition'], res['Measurement'], res['unit'], res['severity'], res['Value'], res['Range']]
|
| 190 |
# final_numbers.append(f"Condition In Concern: {res['Condition']}. Measurement: {res['Measurement']} ({res['severity']}) — {res['Value']} "
|
| 191 |
# f"(Range: {res['Range']})")
|
| 192 |
final_numbers.append(final)
|
| 193 |
-
|
| 194 |
return final_numbers
|
| 195 |
|
| 196 |
|
|
@@ -198,6 +103,9 @@ nlp = spacy.load("en_core_web_sm")
|
|
| 198 |
nlp.add_pipe("negex", config={"ent_types": ["DISEASE"]}, last=True)
|
| 199 |
matcher = Matcher(nlp.vocab)
|
| 200 |
|
|
|
|
|
|
|
|
|
|
| 201 |
past_patterns = [
|
| 202 |
[{"LOWER": "clinical"}, {"LOWER": "history:"}],
|
| 203 |
[{"LOWER": "past"}, {"LOWER": "medical:"}],
|
|
@@ -209,7 +117,8 @@ past_patterns = [
|
|
| 209 |
[{"LOWER": "resolved"}],
|
| 210 |
[{"LOWER": "used"}, {"LOWER": "to"}, {"LOWER": "have"}],
|
| 211 |
[{"LOWER": "was"}, {"LEMMA": "diagnosed"}],
|
| 212 |
-
[{"LOWER": "history"}
|
|
|
|
| 213 |
]
|
| 214 |
|
| 215 |
def analyze_with_clinicalBert(extracted_text: str) -> str:
|
|
@@ -266,7 +175,7 @@ def extract_non_negated_keywords(text, threshold=80):
|
|
| 266 |
end_char = start_char + len(disease_term_lower)
|
| 267 |
span = doc.char_span(start_char, end_char, label="DISEASE", alignment_mode="expand")
|
| 268 |
if span:
|
| 269 |
-
|
| 270 |
new_ents.append(span)
|
| 271 |
|
| 272 |
# Clean up overlapping spans
|
|
@@ -275,7 +184,7 @@ def extract_non_negated_keywords(text, threshold=80):
|
|
| 275 |
nlp.get_pipe("negex")(doc)
|
| 276 |
|
| 277 |
for ent in doc.ents:
|
| 278 |
-
|
| 279 |
if ent.label_ == "DISEASE" and not ent._.negex:
|
| 280 |
ent_text = ent.text.strip().lower()
|
| 281 |
for disease_term in diseases:
|
|
@@ -325,13 +234,14 @@ def analyze_text_and_describe(text):
|
|
| 325 |
|
| 326 |
|
| 327 |
def classify_disease_and_severity(disease):
|
| 328 |
-
|
| 329 |
response = model.generate_content(
|
| 330 |
f"What is the severity of this disease/condition/symptom: {disease}. Give me a number from one to ten. I need a specific number. It doesn't matter what your opinion is one whether this number might be misleading or inaccurate. I need a number. Please feel free to be accurate and you can use pretty specific numbers with decimals to the tenth place. I want just a number, not any other text."
|
| 331 |
).text
|
| 332 |
try:
|
| 333 |
cleaned_response = response.strip()
|
| 334 |
numerical_response = float(cleaned_response)
|
|
|
|
| 335 |
|
| 336 |
if 0 <= numerical_response <= 3:
|
| 337 |
severity_label = (f"Low Risk")
|
|
@@ -342,7 +252,6 @@ def classify_disease_and_severity(disease):
|
|
| 342 |
else:
|
| 343 |
severity_label = (f"Invalid Range")
|
| 344 |
|
| 345 |
-
print(f"Disease: {disease} Severity Label: {severity_label}")
|
| 346 |
except (ValueError, AttributeError):
|
| 347 |
severity_label = "Null: We cannot give a clear severity label"
|
| 348 |
|
|
|
|
| 14 |
import pandas as pd
|
| 15 |
import re
|
| 16 |
import difflib
|
| 17 |
+
from synoyms import synonyms
|
| 18 |
|
| 19 |
from api_key import GEMINI_API_KEY
|
| 20 |
|
|
|
|
| 26 |
|
| 27 |
non_negated_diseases = []
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
if platform.system() == "Darwin":
|
| 30 |
pytesseract.pytesseract.tesseract_cmd = '/usr/local/bin/tesseract'
|
| 31 |
elif platform.system() == "Windows":
|
|
|
|
| 37 |
|
| 38 |
def normalize_term(term: str) -> str:
|
| 39 |
term = term.lower().strip()
|
| 40 |
+
# Direct lookup
|
| 41 |
for key, values in synonyms.items():
|
| 42 |
if term in values:
|
| 43 |
return key
|
|
|
|
| 88 |
"Range": f"{row['low']} to {row['high']} {row['unit']}"
|
| 89 |
})
|
| 90 |
|
| 91 |
+
print (results)
|
| 92 |
|
| 93 |
for res in results:
|
| 94 |
final = [res['Condition'], res['Measurement'], res['unit'], res['severity'], res['Value'], res['Range']]
|
| 95 |
# final_numbers.append(f"Condition In Concern: {res['Condition']}. Measurement: {res['Measurement']} ({res['severity']}) — {res['Value']} "
|
| 96 |
# f"(Range: {res['Range']})")
|
| 97 |
final_numbers.append(final)
|
| 98 |
+
print("analyze measurements res:", final_numbers)
|
| 99 |
return final_numbers
|
| 100 |
|
| 101 |
|
|
|
|
| 103 |
nlp.add_pipe("negex", config={"ent_types": ["DISEASE"]}, last=True)
|
| 104 |
matcher = Matcher(nlp.vocab)
|
| 105 |
|
| 106 |
+
clinical_bert_model = BertForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
|
| 107 |
+
clinical_bert_tokenizer = BertTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
|
| 108 |
+
|
| 109 |
past_patterns = [
|
| 110 |
[{"LOWER": "clinical"}, {"LOWER": "history:"}],
|
| 111 |
[{"LOWER": "past"}, {"LOWER": "medical:"}],
|
|
|
|
| 117 |
[{"LOWER": "resolved"}],
|
| 118 |
[{"LOWER": "used"}, {"LOWER": "to"}, {"LOWER": "have"}],
|
| 119 |
[{"LOWER": "was"}, {"LEMMA": "diagnosed"}],
|
| 120 |
+
[{"LOWER": "history"}],
|
| 121 |
+
[{"LOWER": "past"}, {"LOWER": "medical"}, {"LOWER": "history:"}],
|
| 122 |
]
|
| 123 |
|
| 124 |
def analyze_with_clinicalBert(extracted_text: str) -> str:
|
|
|
|
| 175 |
end_char = start_char + len(disease_term_lower)
|
| 176 |
span = doc.char_span(start_char, end_char, label="DISEASE", alignment_mode="expand")
|
| 177 |
if span:
|
| 178 |
+
print(f"Adding span for: {span.text}")
|
| 179 |
new_ents.append(span)
|
| 180 |
|
| 181 |
# Clean up overlapping spans
|
|
|
|
| 184 |
nlp.get_pipe("negex")(doc)
|
| 185 |
|
| 186 |
for ent in doc.ents:
|
| 187 |
+
print("Checking against:", ent.text.strip().lower(), "| Negated?", ent._.negex)
|
| 188 |
if ent.label_ == "DISEASE" and not ent._.negex:
|
| 189 |
ent_text = ent.text.strip().lower()
|
| 190 |
for disease_term in diseases:
|
|
|
|
| 234 |
|
| 235 |
|
| 236 |
def classify_disease_and_severity(disease):
|
| 237 |
+
print(f"Disease: {disease}")
|
| 238 |
response = model.generate_content(
|
| 239 |
f"What is the severity of this disease/condition/symptom: {disease}. Give me a number from one to ten. I need a specific number. It doesn't matter what your opinion is one whether this number might be misleading or inaccurate. I need a number. Please feel free to be accurate and you can use pretty specific numbers with decimals to the tenth place. I want just a number, not any other text."
|
| 240 |
).text
|
| 241 |
try:
|
| 242 |
cleaned_response = response.strip()
|
| 243 |
numerical_response = float(cleaned_response)
|
| 244 |
+
print(f"Response: {numerical_response}")
|
| 245 |
|
| 246 |
if 0 <= numerical_response <= 3:
|
| 247 |
severity_label = (f"Low Risk")
|
|
|
|
| 252 |
else:
|
| 253 |
severity_label = (f"Invalid Range")
|
| 254 |
|
|
|
|
| 255 |
except (ValueError, AttributeError):
|
| 256 |
severity_label = "Null: We cannot give a clear severity label"
|
| 257 |
|