import gradio as gr
import numpy as np
import torch
from transformers import (
AutoTokenizer,
AutoModel,
AutoModelForSequenceClassification
)
from scipy.special import softmax
# ==============================
# LOAD MODELS ONCE (GLOBAL)
# ==============================
# Models are loaded at import time so every request reuses the same
# weights instead of re-downloading/re-initializing per call.
# NOTE(review): from_pretrained downloads from the HuggingFace Hub on
# first run — requires network access unless the models are cached.
# BERT encoder used only for its [CLS] embedding in nlp_encode_sentence.
bert_model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
bert_model = AutoModel.from_pretrained(bert_model_name)
bert_model.eval()  # inference mode: disables dropout
# 3-class sentiment head (index 0 = negative, 2 = positive per the
# cardiffnlp model card — verify if the checkpoint is ever swapped).
sentiment_model_name = "cardiffnlp/twitter-roberta-base-sentiment"
sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)
sentiment_model.eval()  # inference mode: disables dropout
# ==============================
# CORE FUNCTION (NO PANDAS)
# Input: list of lists
# Output: list of lists
# ==============================
def nlp_encode_sentence(values):
    """Encode each input sentence into six numeric features.

    For every row, the first column is treated as the sentence. Five
    summary statistics of the BERT [CLS] embedding are computed, plus a
    sentiment score P(positive) - P(negative) from the RoBERTa
    sentiment classifier (range [-1, 1]).

    Parameters
    ----------
    values : list[list] | None
        Rows from the input dataframe (``type="array"``).

    Returns
    -------
    list[list[float]]
        One row per input row, in order:
        [embedding_mean, embedding_median, embedding_std,
         embedding_min, embedding_max, sentiment_score].
        Empty/blank rows yield all-zero features so the output stays
        aligned one-to-one with the input.
    """
    feature_rows = []
    for row in values or []:
        # Guard: gr.Dataframe can deliver empty rows or None cells;
        # emit zeros instead of crashing so row alignment is preserved.
        cell = row[0] if row else None
        if cell is None or not str(cell).strip():
            feature_rows.append([0.0] * 6)
            continue
        sentence = str(cell)

        # --- BERT [CLS] embedding summary statistics ---
        inputs = tokenizer(
            sentence,
            return_tensors="pt",
            truncation=True,
            padding=True
        )
        with torch.no_grad():
            outputs = bert_model(**inputs)
        # [CLS] token embedding: position 0 of the last hidden state.
        cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()

        # --- Sentiment score ---
        sentiment_inputs = sentiment_tokenizer(
            sentence,
            return_tensors="pt",
            truncation=True,
            padding=True
        )
        with torch.no_grad():
            sentiment_outputs = sentiment_model(**sentiment_inputs)
        probs = softmax(sentiment_outputs.logits.numpy()[0])
        # Index 2 = positive, 0 = negative for this checkpoint.
        sentiment_score = float(probs[2] - probs[0])

        feature_rows.append([
            float(np.mean(cls_embedding)),
            float(np.median(cls_embedding)),
            float(np.std(cls_embedding)),
            float(np.min(cls_embedding)),
            float(np.max(cls_embedding)),
            sentiment_score
        ])
    return feature_rows
# ==============================
# GRADIO APP
# ==============================
with gr.Blocks() as demo:
    gr.Markdown("### NLP Encoder")

    # Input grid: one sentence per row in the single "value" column.
    sentences_in = gr.Dataframe(headers=["value"], datatype=["str"], type="array")

    # Output grid: six numeric features per input row.
    features_out = gr.Dataframe(
        headers=[
            "embedding_mean",
            "embedding_median",
            "embedding_std",
            "embedding_min",
            "embedding_max",
            "sentiment_score"
        ],
        type="array"
    )

    run_button = gr.Button("Run")
    run_button.click(fn=nlp_encode_sentence, inputs=sentences_in, outputs=features_out)

if __name__ == "__main__":
    demo.launch()