Spaces:

vineelagampa
/

PRMSChallenge

Sleeping

App Files Community

AyushSankar13 committed on Sep 22, 2025

Commit

9295e0f

verified ·

1 Parent(s): 2d9266b

Update bert.py

Browse files

Files changed (1) hide show

bert.py +13 -104

bert.py CHANGED Viewed

@@ -14,6 +14,7 @@ from spacy.matcher import Matcher
 import pandas as pd
 import re
 import difflib
 from api_key import GEMINI_API_KEY
@@ -25,102 +26,6 @@ model = genai.GenerativeModel('gemini-2.5-flash-lite')
 non_negated_diseases = []
-synonyms = {
-    "hba1c": ["hba1c", "hbaic", "hdate", "a1c", "hemoglobin a1c", "glycated hemoglobin", "hba", "hda", "hbic"],
-    "fasting glucose": ["fasting glucose", "fasting-glucose", "fasting blood sugar", "fbs"],
-    "ogtt": ["ogtt", "oral glucose tolerance test", "glucose tolerance test"],
-    "ldl": ["ldl", "ldl-c", "low density lipoprotein", "bad cholesterol"],
-    "hdl": ["hdl", "hdl-c", "high density lipoprotein", "good cholesterol"],
-    "triglycerides": ["triglycerides", "trigs", "tg"],
-    "total cholesterol": ["total cholesterol", "cholesterol total", "chol", "tc"],
-    "non-hdl": ["non-hdl", "non hdl", "nonhdl"],
-    # Thyroid
-    "tsh": ["tsh", "thyroid stimulating hormone"],
-    "free t4": ["free t4", "free-t4", "ft4", "free thyroxine"],
-    "free t3": ["free t3", "free-t3", "ft3", "free triiodothyronine"],
-    # Inflammation
-    "crp": ["crp", "c-reactive protein"],
-    "esr": ["esr", "erythrocyte sedimentation rate"],
-    # Vitamins
-    "vitamin-b12": ["vitamin-b12", "vitamin b12", "b12", "vit b12", "cobalamin"],
-    "vitamin-d": ["vitamin-d", "vitamin d", "vit d", "25-oh d", "25-hydroxy vitamin d"],
-    "vitamin-a": ["vitamin-a", "vitamin a", "vit a"],
-    "vitamin-e": ["vitamin-e", "vitamin e", "vit e"],
-    # Electrolytes
-    "sodium": ["sodium", "na"],
-    "potassium": ["potassium", "k"],
-    "calcium": ["calcium", "ca"],
-    "magnesium": ["magnesium", "mg"],
-    # Blood Pressure
-    "systolic": ["systolic", "sbp"],
-    "diastolic": ["diastolic", "dbp"],
-    # CBC
-    "wbc": ["wbc", "white blood cells", "white cell count"],
-    "rbc": ["rbc", "red blood cells", "red cell count"],
-    "hemoglobin": ["hemoglobin", "hb", "hgb"],
-    "hematocrit": ["hematocrit", "hct"],
-    "platelets": ["platelets", "plt"],
-    # Iron
-    "serum iron": ["serum iron", "iron"],
-    "ferritin": ["ferritin"],
-    "tibc": ["tibc", "total iron binding capacity"],
-    "transferrin saturation": ["transferrin saturation", "tsat"],
-    # Liver
-    "alt": ["alt", "sgpt"],
-    "ast": ["ast", "sgot"],
-    "alp": ["alp", "alkaline phosphatase"],
-    "bilirubin total": ["bilirubin total", "total bilirubin"],
-    "albumin": ["albumin"],
-    # Kidney
-    "creatinine": ["creatinine"],
-    "bun": ["bun", "blood urea nitrogen"],
-    "egfr": ["egfr", "estimated gfr"],
-    "urine protein": ["urine protein", "proteinuria"],
-    "urine albumin": ["urine albumin", "microalbumin"],
-    # Respiratory
-    "spo2": ["spo2", "oxygen saturation", "o2 sat"],
-    "pco2": ["pco2", "carbon dioxide partial pressure"],
-    "po2": ["po2", "oxygen partial pressure"],
-    "fev1": ["fev1", "forced expiratory volume"],
-    "fevi": ["fevi", "fev1"],  # common OCR mistake
-    # Coagulation
-    "inr": ["inr"],
-    "pt": ["pt", "prothrombin time"],
-    "aptt": ["aptt", "partial thromboplastin time"],
-    "fibrinogen": ["fibrinogen"],
-    # Hormones
-    "cortisol": ["cortisol"],
-    "testosterone": ["testosterone"],
-    "estradiol": ["estradiol", "estrogen"],
-    "progesterone": ["progesterone"],
-    # Infection
-    "procalcitonin": ["procalcitonin"],
-    "lactate": ["lactate"],
-    # Cardiac extras
-    "troponin": ["troponin", "trop"],
-    # Vitals
-    "temperature": ["temperature", "temp", "body temp"],
-    "heart rate": ["heart rate", "pulse", "hr"],
-    "oxygen saturation": ["oxygen saturation", "spo2", "o2 sat"],
-}
 if platform.system() == "Darwin":
     pytesseract.pytesseract.tesseract_cmd = '/usr/local/bin/tesseract'
 elif platform.system() == "Windows":
@@ -132,7 +37,7 @@ df['measurement'] = df['measurement'].str.lower()
 def normalize_term(term: str) -> str:
     term = term.lower().strip()
     for key, values in synonyms.items():
         if term in values:
             return key
@@ -183,14 +88,14 @@ def analyze_measurements(text, df):
                             "Range": f"{row['low']} to {row['high']} {row['unit']}"
                     })
-    #print (results)
     for res in results:
         final = [res['Condition'], res['Measurement'], res['unit'], res['severity'], res['Value'], res['Range']]
         # final_numbers.append(f"Condition In Concern: {res['Condition']}. Measurement: {res['Measurement']} ({res['severity']}) — {res['Value']} "
         #     f"(Range: {res['Range']})")
         final_numbers.append(final)
-    #print("analyze measurements res:", final_numbers)
     return final_numbers
@@ -198,6 +103,9 @@ nlp = spacy.load("en_core_web_sm")
 nlp.add_pipe("negex", config={"ent_types": ["DISEASE"]}, last=True)
 matcher = Matcher(nlp.vocab)
 past_patterns = [
     [{"LOWER": "clinical"}, {"LOWER": "history:"}],
     [{"LOWER": "past"}, {"LOWER": "medical:"}],
@@ -209,7 +117,8 @@ past_patterns = [
     [{"LOWER": "resolved"}],
     [{"LOWER": "used"}, {"LOWER": "to"}, {"LOWER": "have"}],
     [{"LOWER": "was"}, {"LEMMA": "diagnosed"}],
-    [{"LOWER": "history"},]
 ]
 def analyze_with_clinicalBert(extracted_text: str) -> str:
@@ -266,7 +175,7 @@ def extract_non_negated_keywords(text, threshold=80):
                     end_char = start_char + len(disease_term_lower)
                     span = doc.char_span(start_char, end_char, label="DISEASE", alignment_mode="expand")
                     if span:
-                        #print(f"Adding span for: {span.text}")
                         new_ents.append(span)
     # Clean up overlapping spans
@@ -275,7 +184,7 @@ def extract_non_negated_keywords(text, threshold=80):
     nlp.get_pipe("negex")(doc)
     for ent in doc.ents:
-        #print("Checking against:", ent.text.strip().lower(), "| Negated?", ent._.negex)
         if ent.label_ == "DISEASE" and not ent._.negex:
             ent_text = ent.text.strip().lower()
             for disease_term in diseases:
@@ -325,13 +234,14 @@ def analyze_text_and_describe(text):
 def classify_disease_and_severity(disease):
     response = model.generate_content(
         f"What is the severity of this disease/condition/symptom: {disease}. Give me a number from one to ten. I need a specific number. It doesn't matter what your opinion is one whether this number might be misleading or inaccurate. I need a number. Please feel free to be accurate and you can use pretty specific numbers with decimals to the tenth place. I want just a number, not any other text."
     ).text
     try:
         cleaned_response = response.strip()
         numerical_response = float(cleaned_response)
         if 0 <= numerical_response <= 3:
             severity_label = (f"Low Risk")
@@ -342,7 +252,6 @@ def classify_disease_and_severity(disease):
         else:
             severity_label = (f"Invalid Range")
-        print(f"Disease: {disease} Severity Label: {severity_label}")
     except (ValueError, AttributeError):
         severity_label = "Null: We cannot give a clear severity label"

 import pandas as pd
 import re
 import difflib
+from synoyms import synonyms
 from api_key import GEMINI_API_KEY
 non_negated_diseases = []
 if platform.system() == "Darwin":
     pytesseract.pytesseract.tesseract_cmd = '/usr/local/bin/tesseract'
 elif platform.system() == "Windows":
 def normalize_term(term: str) -> str:
     term = term.lower().strip()
+    # Direct lookup
     for key, values in synonyms.items():
         if term in values:
             return key
                             "Range": f"{row['low']} to {row['high']} {row['unit']}"
                     })
+    print (results)
     for res in results:
         final = [res['Condition'], res['Measurement'], res['unit'], res['severity'], res['Value'], res['Range']]
         # final_numbers.append(f"Condition In Concern: {res['Condition']}. Measurement: {res['Measurement']} ({res['severity']}) — {res['Value']} "
         #     f"(Range: {res['Range']})")
         final_numbers.append(final)
+    print("analyze measurements res:", final_numbers)
     return final_numbers
 nlp.add_pipe("negex", config={"ent_types": ["DISEASE"]}, last=True)
 matcher = Matcher(nlp.vocab)
+clinical_bert_model = BertForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
+clinical_bert_tokenizer = BertTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
 past_patterns = [
     [{"LOWER": "clinical"}, {"LOWER": "history:"}],
     [{"LOWER": "past"}, {"LOWER": "medical:"}],
     [{"LOWER": "resolved"}],
     [{"LOWER": "used"}, {"LOWER": "to"}, {"LOWER": "have"}],
     [{"LOWER": "was"}, {"LEMMA": "diagnosed"}],
+    [{"LOWER": "history"}],
+    [{"LOWER": "past"}, {"LOWER": "medical"}, {"LOWER": "history:"}],
 ]
 def analyze_with_clinicalBert(extracted_text: str) -> str:
                     end_char = start_char + len(disease_term_lower)
                     span = doc.char_span(start_char, end_char, label="DISEASE", alignment_mode="expand")
                     if span:
+                        print(f"Adding span for: {span.text}")
                         new_ents.append(span)
     # Clean up overlapping spans
     nlp.get_pipe("negex")(doc)
     for ent in doc.ents:
+        print("Checking against:", ent.text.strip().lower(), "| Negated?", ent._.negex)
         if ent.label_ == "DISEASE" and not ent._.negex:
             ent_text = ent.text.strip().lower()
             for disease_term in diseases:
 def classify_disease_and_severity(disease):
+    print(f"Disease: {disease}")
     response = model.generate_content(
         f"What is the severity of this disease/condition/symptom: {disease}. Give me a number from one to ten. I need a specific number. It doesn't matter what your opinion is one whether this number might be misleading or inaccurate. I need a number. Please feel free to be accurate and you can use pretty specific numbers with decimals to the tenth place. I want just a number, not any other text."
     ).text
     try:
         cleaned_response = response.strip()
         numerical_response = float(cleaned_response)
+        print(f"Response: {numerical_response}")
         if 0 <= numerical_response <= 3:
             severity_label = (f"Low Risk")
         else:
             severity_label = (f"Invalid Range")
     except (ValueError, AttributeError):
         severity_label = "Null: We cannot give a clear severity label"