File size: 1,192 Bytes
f27e986
14f41f0
f27e986
14f41f0
f27e986
 
 
14f41f0
 
 
 
f27e986
 
14f41f0
 
 
f27e986
 
 
 
14f41f0
f27e986
14f41f0
f27e986
 
 
 
 
 
 
 
 
14f41f0
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import re
import pdfplumber
import pandas as pd
import torch


def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined with newlines.

    Pages for which ``extract_text()`` returns a falsy value (``None`` or an
    empty string) are skipped.

    Args:
        pdf_path: Passed straight to ``pdfplumber.open``.

    Returns:
        str: The page texts separated by ``"\\n"``; an empty string when no
        page yields any text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Call extract_text() exactly once per page and filter on the result,
        # rather than extracting once to test and again to keep the value.
        page_texts = (page.extract_text() for page in pdf.pages)
        # join() consumes the generator while the PDF is still open.
        return "\n".join(text for text in page_texts if text)


# One "name: value [unit]" lab entry.
#   * optional bullet ("-" or "•") and leading whitespace
#   * group 1: the test name - word characters, "/", "(", ")", "%", ".", "-"
#     and *horizontal* whitespace only, so a name never spans two lines
#   * group 2: the numeric value (digits and dots; validated with float())
#   * optional unit / trailing text, confined to the same line so that a
#     value without a unit does not swallow the following line
_LAB_RESULT_RE = re.compile(
    r"[-•]?\s*"
    r"((?:[\w/()%.-]|[^\S\r\n])+?)"
    r":\s*"
    r"([\d.]+)"
    r"(?:[^\S\r\n]*\w+/?.*)?"
)


def extract_lab_tests_dict(text):
    """Parse ``name: value`` lab results out of free text.

    Args:
        text: String to scan, e.g. the text extracted from a lab report.

    Returns:
        dict: Maps each stripped test name to its value as a ``float``.
        Entries whose value is not a valid float (such as ``"1.2.3"``) and
        entries whose name is blank are skipped. When a name occurs more
        than once, the last occurrence wins.
    """
    lab_dict = {}
    for raw_name, raw_value in _LAB_RESULT_RE.findall(text):
        name = raw_name.strip()
        if not name:
            # A lone space before the colon would otherwise create a "" key.
            continue
        try:
            value = float(raw_value)
        except ValueError:
            # The value pattern is permissive ("..", "1.2.3"); drop such hits.
            continue
        lab_dict[name] = value
    return lab_dict


def prepare_lab_tensor(lab_data, feature_list, *, missing_value=-1):
    """Build a one-row float32 tensor of lab values ordered by ``feature_list``.

    Args:
        lab_data: Mapping from feature name to numeric value; looked up with
            ``.get``.
        feature_list: Iterable of feature names. Its order defines the column
            order of the result.
        missing_value: Placeholder used for features that are absent from
            ``lab_data``. Defaults to ``-1``.

    Returns:
        torch.Tensor: A ``float32`` tensor with a single row and one column
        per entry of ``feature_list``.
    """
    row = [lab_data.get(feature, missing_value) for feature in feature_list]
    # Wrap the row in a list so the result is 2-D (one row), not 1-D.
    return torch.tensor([row], dtype=torch.float32)


def load_icd_mapping(csv_path):
    """Load an ICD lookup table from a CSV file.

    Args:
        csv_path: Passed straight to ``pandas.read_csv``.

    Returns:
        dict: Maps ``int(ICD_Label)`` to a tuple
        ``(ICD Code, ICD_Label, Diagnosis)`` taken from the row. Rows that
        repeat an ``ICD_Label`` are dropped by ``drop_duplicates`` before the
        mapping is built.

    Raises:
        ValueError: If any of the ``ICD_Label``, ``ICD Code`` or ``Diagnosis``
            columns is missing from the CSV.
    """
    required_columns = {"ICD_Label", "ICD Code", "Diagnosis"}
    table = pd.read_csv(csv_path)

    missing = required_columns - set(table.columns)
    if missing:
        raise ValueError("CSV must include ICD_Label, ICD Code, Diagnosis columns.")

    unique_rows = table.drop_duplicates(subset="ICD_Label")

    mapping = {}
    for _, record in unique_rows.iterrows():
        label = record["ICD_Label"]
        mapping[int(label)] = (record["ICD Code"], label, record["Diagnosis"])
    return mapping