File size: 2,874 Bytes
8b7740b
e916274
 
 
 
 
 
 
 
8b7740b
e916274
 
 
8b7740b
e916274
 
 
 
 
 
 
 
 
 
 
a92fd71
83afbd5
 
 
a92fd71
 
51a55dc
e916274
 
 
83afbd5
 
e916274
83afbd5
 
 
 
 
 
e916274
 
 
 
 
 
a92fd71
 
 
 
 
e916274
 
 
 
 
 
 
 
 
 
 
 
a92fd71
e916274
a92fd71
 
 
 
 
 
 
 
e916274
a92fd71
e916274
 
a92fd71
83afbd5
a92fd71
e916274
51a55dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e916274
a92fd71
83afbd5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import gradio as gr
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForSequenceClassification
)
from scipy.special import softmax

# ==============================
# LOAD MODELS ONCE (GLOBAL)
# ==============================

# General-purpose encoder used to produce per-sentence [CLS] embeddings.
# Loaded once at import time so every request reuses the same weights.
bert_model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
bert_model = AutoModel.from_pretrained(bert_model_name)
# eval() disables dropout etc. — inference only, no training here.
bert_model.eval()

# Separate classifier head for sentiment scoring (3-class Twitter model).
sentiment_model_name = "cardiffnlp/twitter-roberta-base-sentiment"
sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)
sentiment_model.eval()


# ==============================
# CORE FUNCTION (NO PANDAS)
# Input: list of lists
# Output: list of lists
# ==============================

def nlp_encode_sentence(values):
    """Encode each input sentence into numeric features.

    For every row of *values*, the first column is treated as a sentence.
    The sentence is embedded with BERT and summarized by five statistics
    of its [CLS] vector, plus a sentiment score from a RoBERTa classifier.

    Parameters
    ----------
    values : list[list]
        Rows from the input dataframe (Gradio ``type="array"``). Only the
        first column of each row is used.

    Returns
    -------
    list[list[float]]
        One row per input row:
        [embedding_mean, embedding_median, embedding_std,
         embedding_min, embedding_max, sentiment_score].
    """
    feature_rows = []

    for row in values:
        # Robustness: Gradio dataframe cells may arrive as None or as
        # non-string values (numbers pasted into the sheet). Coerce to a
        # string; an empty row yields an empty sentence rather than an
        # IndexError.
        raw_value = row[0] if row else ""
        sentence = "" if raw_value is None else str(raw_value)

        inputs = tokenizer(
            sentence,
            return_tensors="pt",
            truncation=True,
            padding=True
        )

        # Inference only — no gradients needed.
        with torch.no_grad():
            outputs = bert_model(**inputs)

        # [CLS] token embedding: position 0 of the last hidden state.
        cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()

        embedding_mean = float(np.mean(cls_embedding))
        embedding_median = float(np.median(cls_embedding))
        embedding_std = float(np.std(cls_embedding))
        embedding_min = float(np.min(cls_embedding))
        embedding_max = float(np.max(cls_embedding))

        sentiment_inputs = sentiment_tokenizer(
            sentence,
            return_tensors="pt",
            truncation=True,
            padding=True
        )

        with torch.no_grad():
            sentiment_outputs = sentiment_model(**sentiment_inputs)

        probs = softmax(sentiment_outputs.logits.numpy()[0])
        # Label order for this model is [negative, neutral, positive]
        # per its model card — score in [-1, 1] is P(pos) - P(neg).
        sentiment_score = float(probs[2] - probs[0])

        feature_rows.append([
            embedding_mean,
            embedding_median,
            embedding_std,
            embedding_min,
            embedding_max,
            sentiment_score
        ])

    return feature_rows


# ==============================
# GRADIO APP
# ==============================

# Build the UI: one input sheet of sentences, one output sheet of features,
# and a button that runs the encoder over all rows.
with gr.Blocks() as demo:
    gr.Markdown("### NLP Encoder")

    # Input: single "value" column of free-text sentences, passed to the
    # handler as a list of lists (type="array").
    sentence_table = gr.Dataframe(
        headers=["value"],
        datatype=["str"],
        type="array",
    )

    # Output: one feature row per input sentence.
    feature_table = gr.Dataframe(
        headers=[
            "embedding_mean",
            "embedding_median",
            "embedding_std",
            "embedding_min",
            "embedding_max",
            "sentiment_score",
        ],
        type="array",
    )

    run_button = gr.Button("Run")
    run_button.click(
        fn=nlp_encode_sentence,
        inputs=sentence_table,
        outputs=feature_table,
    )

if __name__ == "__main__":
    demo.launch()