Spaces:

AT-VN-Research-Group
/

PUDMED-Sentence-Classification

Sleeping

File size: 5,084 Bytes

import os
import string
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns
import streamlit as st
import base64


def get_lines(txt_url):
    """
    Read a text file and return its lines.

    Args:
        txt_url: Path of the text file to read.

    Returns:
        A list with one string per line; line terminators are kept.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(txt_url, "r", encoding="utf-8") as f:
        return f.readlines()



def _abstract_to_samples(abstract_id, sentence_lines):
    """Build one feature dict per sentence of a single abstract."""
    last_index = len(sentence_lines) - 1
    samples = []
    for line_id, sentence in enumerate(sentence_lines):
        # Each sentence line is "<target>\t<text>".
        fields = sentence.split("\t")
        raw_text = fields[1]
        samples.append({
            'abstract_id': abstract_id,
            'target': fields[0],
            'text': raw_text.lower(),
            'line_id': line_id,
            'length_lines': len(raw_text.split(" ")),  # Num of words in a line
            'total_lines': last_index,  # Num of lines in an abstract, minus one
        })
    return samples


def get_information(file_dir):
    """
    Create features from a data file.

    The file is parsed as a sequence of abstracts. A line starting with
    "###" opens an abstract and carries its id, each following line is
    "<target>\\t<text>", and a whitespace-only line closes the abstract.

    Each sentence in an abstract yields a dict with:
    - abstract_id: id taken from the "###" header (None if no header was seen)
    - target: class label (text before the tab)
    - text: lower-cased text after the tab
    - line_id: zero-based position of the sentence in its abstract
    - length_lines: number of space-separated tokens in the text
    - total_lines: number of sentences in the abstract, minus one

    Args:
        file_dir: Path of the data file.

    Returns:
        A flat list of sentence dicts, in file order.

    Raises:
        IndexError: If a sentence line does not contain a tab separator.
    """
    abstract_samples = []
    abstract_id = None
    sentence_lines = []

    # Stream the file instead of loading every line up front.
    with open(file_dir, "r", encoding="utf-8") as f:
        for raw_line in f:
            if raw_line.startswith("###"):
                # Emit any abstract that was not closed by a blank line
                # before starting the next one, so it is not lost.
                abstract_samples.extend(
                    _abstract_to_samples(abstract_id, sentence_lines))
                # Works for both "###<id>" and "### <id>", with or without
                # a trailing newline.
                abstract_id = raw_line[3:].strip()
                sentence_lines = []
            elif raw_line.isspace():
                abstract_samples.extend(
                    _abstract_to_samples(abstract_id, sentence_lines))
                # Reset so repeated blank lines do not emit duplicates.
                sentence_lines = []
            else:
                sentence_lines.append(raw_line.rstrip("\r\n"))

    # The last abstract may not be followed by a blank line.
    abstract_samples.extend(_abstract_to_samples(abstract_id, sentence_lines))
    return abstract_samples


def get_information_infer(samples):
    """
    Extract per-sentence features in the inference phase.

    For every element of `samples` a dict is produced with:
    - text: the element converted with str()
    - line_id: zero-based position of the element
    - length_lines: number of pieces after splitting the element on " "
    - total_lines: len(samples) - 1

    Returns the dicts as a list, in input order.
    """
    last_index = len(samples) - 1
    return [
        {
            'text': str(sentence),
            'line_id': position,
            'length_lines': len(sentence.split(" ")),
            'total_lines': last_index,
        }
        for position, sentence in enumerate(samples)
    ]



def split_into_char(line):
    """Return the characters of `line` joined by single spaces."""
    separator = " "
    return separator.join(line)



def convert_to_one_hot(y_pred):
    """
    Convert rows of scores/probabilities into one-hot rows.

    For each row, the position of the maximum value (first one on ties,
    as given by argmax) is set to 1 and every other position to 0.

    Args:
        y_pred: 2-D array-like of shape (n_rows, n_classes). A NumPy array
            is overwritten in place and returned, as before. Any other
            input (e.g. a list of lists) is copied into a new float array
            and left unmodified.

    Returns:
        The one-hot NumPy array.

    Raises:
        ValueError: If a non-empty input is not 2-D.
    """
    if isinstance(y_pred, np.ndarray):
        one_hot = y_pred
    else:
        # Copy so the caller's sequence is never mutated.
        one_hot = np.array(y_pred, dtype=float)

    if one_hot.size == 0:
        return one_hot
    if one_hot.ndim != 2:
        raise ValueError(
            "y_pred must be 2-D (rows x classes), got %d-D" % one_hot.ndim)

    # Compute the winners before the values are overwritten.
    winners = one_hot.argmax(axis=1)
    one_hot[...] = 0
    one_hot[np.arange(one_hot.shape[0]), winners] = 1
    return one_hot



def get_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Plot the confusion matrix of model predictions as a heatmap.

    Args:
        y_true: True labels, passed to `confusion_matrix`.
        y_pred: Predicted labels, passed to `confusion_matrix`.
        classes: Tick labels used for both heatmap axes.
        normalize: If True, each row is divided by its sum so cells show
            per-true-class fractions.
        title: Figure title; a default is chosen from `normalize` when
            empty or None.
        cmap: Colormap passed to the heatmap.

    The figure is displayed with `plt.show()`; nothing is returned.
    """
    if not title:
        title = ('Normalized confusion matrix' if normalize
                 else 'Confusion matrix, without normalization')

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class that is predicted but never the true label has a zero
        # row sum; leave such rows at 0 instead of producing NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt=".2f" if normalize else "d", cmap=cmap, square=True,
                xticklabels=classes, yticklabels=classes)
    plt.title(title)
    plt.xlabel('Predicted label')
    plt.ylabel('True label')

    plt.show()



# ---------------------Streamlit App utils------------------------------

# Encode a binary file (e.g. an image) as a base64 string
def get_base64(bin_file):
    """Read `bin_file` in binary mode and return its content base64-encoded as a str."""
    with open(bin_file, 'rb') as stream:
        raw_bytes = stream.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode()


# Set background for local web
def set_background(png_file):
    """
    Use an image file as the background of the Streamlit app.

    The file is base64-encoded with `get_base64`, embedded as a PNG data
    URL in a `.stApp` CSS rule, and injected through `st.markdown` with
    HTML rendering enabled.
    """
    encoded_image = get_base64(png_file)
    # Doubled braces are literal CSS braces inside the f-string.
    css_rule = f'''
    <style>
    .stApp {{
    background-image: url("data:image/png;base64,{encoded_image}");
    background-size: cover;
    }}
    </style>
    '''
    st.markdown(css_rule, unsafe_allow_html=True)


def get_block(custom_text, back_ground):
    """
    Render a rounded, padded text card in the Streamlit app.

    Args:
        custom_text: Content placed inside the card's <h4> element. It is
            inserted into the HTML as-is, without escaping.
        back_ground: CSS declaration(s) inserted at the start of the
            card's inline style.
    """
    card_html = f"""
        <div style="
                    {back_ground}
                    border-radius: 6px;
                    min-height: 80px;
                    padding: 20px;
                    --shadow: 1px 1px 1px 1px rgb(0 0 0 / 0.25);
                    box-shadow: var(--shadow);
                    border-radius: 25px;
                    box-sizing: border-box;
                    text-align: justify;
                    ">
                    <h4 style="color:#ffffff; font-family: cursive; font-size: 16px; font-weight: 200">
                            {custom_text}</h4>
                    </div>
                    </br>"""
    # Raw HTML is required for the inline styling to take effect.
    st.markdown(card_html, unsafe_allow_html=True)