File size: 4,356 Bytes
88cf11f
 
 
 
 
434ab59
 
88cf11f
 
434ab59
88cf11f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
434ab59
88cf11f
 
 
 
 
 
 
 
 
 
 
 
 
 
434ab59
88cf11f
 
 
 
 
434ab59
88cf11f
 
 
 
 
 
 
 
 
 
434ab59
 
88cf11f
434ab59
88cf11f
 
 
 
434ab59
 
 
 
 
 
 
 
88cf11f
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import streamlit as st
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F
import matplotlib
matplotlib.use('Agg')  # Use non-interactive backend for Streamlit
import matplotlib.pyplot as plt
import torch
import io
# Model and tokenizer are loaded once at module import, so Streamlit reruns
# reuse the same objects. NOTE(review): wrapping these loads in a function
# decorated with @st.cache_resource would make the caching explicit and
# survive hot-reloads more predictably — confirm against deployment setup.
model = AutoModel.from_pretrained("aarnow/distilbert-base-uncased-1212-test")
tokenizer = AutoTokenizer.from_pretrained("aarnow/distilbert-base-uncased-1212-test")


# Sidebar: static branding and an "About" blurb describing the demo.
# (Rendered once per rerun; contains no interactive widgets.)
with st.sidebar:
    st.title('Technical Demonstration')
    st.header('powered by rascal')
    st.markdown('''
    ## About
    This is a tool that shows the classification and PII redaction capabilities of the auditory skills model.  PII redaction is powered by Microsoft's presidio tool and the text classification model is trained on a combination of synthetic and human annotated data from the HATCH (Helping Adults Talk to Children) Lab at Idaho State University.  Erber's Hierarchy is used to benchmark the text classification model.

    ''')



def main():
    """Render the demo UI.

    Flow: take free text from the user, redact PII with Presidio, then rank
    the four Erber's Hierarchy labels by cosine similarity between the
    mean-pooled DistilBERT embedding of the sentence and of each label
    string, and plot a rough 2-D projection of the embeddings.

    Side effects only (Streamlit widgets); returns None.
    """
    st.subheader("Enter Text for Evaluation")

    sentence = st.text_input('Type text to classify below')
    if sentence != "":
        # --- PII redaction --------------------------------------------------
        # NOTE(review): AnalyzerEngine/AnonymizerEngine are rebuilt on every
        # rerun; if startup cost matters, cache them with st.cache_resource.
        analyzer = AnalyzerEngine()
        results = analyzer.analyze(text=sentence,
                                   language='en')

        # Analyzer results are passed to the AnonymizerEngine for anonymization.
        anonymizer = AnonymizerEngine()
        anonymized_text = anonymizer.anonymize(text=sentence, analyzer_results=results)
        st.markdown("**Your text with PII redacted:**  " + anonymized_text.text)
        st.text(results)

        # NOTE(review): this same subheader is shown again before the
        # per-label similarity list below — one of the two is probably
        # meant to be a different title.
        st.subheader("Classification Details")

        # --- classification -------------------------------------------------
        labels = ['DETECTION', 'DISCRIMINATION', 'IDENTIFICATION', 'COMPREHENSION']

        # Encode the sentence together with the label strings in one batch,
        # then mean-pool token embeddings to get one vector per sequence.
        inputs = tokenizer.batch_encode_plus([sentence] + labels,
                                             return_tensors='pt',
                                             padding=True)
        # Inference only: disable autograd to avoid tracking gradients
        # (the original built a graph for nothing).
        with torch.no_grad():
            output = model(inputs['input_ids'],
                           attention_mask=inputs['attention_mask'])[0]
        sentence_rep = output[:1].mean(dim=1)  # shape (1, hidden)
        label_reps = output[1:].mean(dim=1)    # shape (len(labels), hidden)

        # Rank labels by cosine similarity to the sentence embedding.
        similarities = F.cosine_similarity(sentence_rep, label_reps)
        closest = similarities.argsort(descending=True)
        # .item() converts the 0-dim index tensor to a plain int before
        # indexing the list (the original relied on tensor.__index__).
        best_label = labels[closest[0].item()]
        st.markdown("The classification that best fits your entry is: " + best_label)

        # --- visualization --------------------------------------------------
        # Object-oriented Matplotlib API instead of the global pyplot state:
        # safe under Streamlit's threaded reruns and needs no plt.clf().
        # NOTE: only the first two embedding dimensions are plotted, so this
        # is a crude projection, not a faithful similarity map.
        fig, ax = plt.subplots()

        label_pts = label_reps.detach()
        xs = label_pts[:, 0].numpy()
        ys = label_pts[:, 1].numpy()
        ax.scatter(xs, ys, label='Labels')
        # Annotate each label point with its class name.
        for i, name in enumerate(labels):
            ax.text(xs[i], ys[i], str(name), fontsize=8, ha='right', va='bottom')

        sen_pts = sentence_rep.detach()
        ax.scatter(sen_pts[:, 0].numpy(), sen_pts[:, 1].numpy(),
                   label='Input Sentence', color='red', marker='x', s=100)

        # Fixed: title previously read "... Estimates (2D)" — "2D" twice.
        ax.set_title('2D Representation of Similarity Estimates')
        ax.set_xlabel('X-axis')
        ax.set_ylabel('Y-axis')
        ax.legend()

        # Render via an in-memory buffer (no filesystem writes) and close the
        # figure so repeated reruns don't leak Matplotlib figures.
        buf = io.BytesIO()
        fig.savefig(buf, format='png', bbox_inches='tight')
        plt.close(fig)
        buf.seek(0)
        st.image(buf)
        buf.close()

        st.subheader("Classification Details")
        # Full ranking: every label with its raw cosine similarity.
        for ind in closest:
            st.write(f'label: {labels[ind]} \t similarity: {similarities[ind]}')


# Script entry point: render the app when executed directly
# (Streamlit also executes the module top-to-bottom on each rerun).
if __name__ == '__main__':
    main()