File size: 6,561 Bytes
6a304fc
0e0b44f
 
 
 
 
 
 
 
 
 
 
 
2c4a02b
0e0b44f
 
 
 
 
 
29fac90
 
 
 
 
 
 
 
 
 
 
0e0b44f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd3439d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e0b44f
 
 
 
 
 
 
fd3439d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e0b44f
 
 
eb6289f
0e0b44f
 
 
eb6289f
0e0b44f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import html
from collections import OrderedDict

import altair as alt
import pandas as pd
import requests
import streamlit as st
import torch
import trafilatura
from nltk.tokenize import sent_tokenize
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Fetch the NLTK punkt resources; sent_tokenize (imported above) is used
# later in this file to split text into sentences.
import nltk
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource)

# Load model and tokenizer.
# Streamlit re-executes this script from top to bottom on every user
# interaction, so the expensive load is wrapped in st.cache_resource and done
# once per server process instead of on every rerun.
model_name = 'dejanseo/sentiment'


# show_spinner=False: the default spinner would emit a UI element before the
# st.set_page_config call further down, which Streamlit expects to come first.
@st.cache_resource(show_spinner=False)
def _load_model_and_tokenizer(name):
    """Load the sequence-classification model and its tokenizer.

    Args:
        name: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple. The same objects are returned on every
        rerun because the result is cached by Streamlit.
    """
    loaded_model = AutoModelForSequenceClassification.from_pretrained(name)
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    return loaded_model, loaded_tokenizer


model, tokenizer = _load_model_and_tokenizer(model_name)

# Browser-tab title/icon and wide page layout.
st.set_page_config(page_title="Sentiment Analysis Tool by DEJAN AI", page_icon="🔎", layout="wide")

# Brand logo that links to the DEJAN AI website.
st.logo(image="https://dejan.ai/wp-content/uploads/2024/02/dejan-300x103.png", link="https://dejan.ai/")

# Textual description for each class index (0..6); elsewhere in this file the
# position of the highest score is used as the key into this mapping.
sentiment_labels = dict(enumerate((
    "very positive",
    "positive",
    "somewhat positive",
    "neutral",
    "somewhat negative",
    "negative",
    "very negative",
)))

# CSS background colour for each sentiment label: green for positive, grey for
# neutral, red for negative, with the alpha channel growing with intensity.
background_colors = {
    label: f"rgba({rgb}, {alpha})"
    for label, rgb, alpha in (
        ("very positive", "0, 255, 0", "0.5"),
        ("positive", "0, 255, 0", "0.3"),
        ("somewhat positive", "0, 255, 0", "0.1"),
        ("neutral", "128, 128, 128", "0.1"),
        ("somewhat negative", "255, 0, 0", "0.1"),
        ("negative", "255, 0, 0", "0.3"),
        ("very negative", "255, 0, 0", "0.5"),
    )
}

# Function to get text content from a URL
def get_text_from_url(url: str) -> str:
    """Download *url* and return the text extracted from the page.

    Args:
        url: Address of the page to fetch.

    Returns:
        The extracted text, or an empty string when the download fails or
        nothing could be extracted. The result is always a ``str`` so callers
        can rely on a single return type.
    """
    downloaded = trafilatura.fetch_url(url)
    if not downloaded:
        return ""
    # extract() may yield None when it finds no usable content; normalise that
    # to "" so both failure paths return the same thing.
    return trafilatura.extract(downloaded) or ""

# Function to classify text
def classify_text(text, max_length):
    """Return the softmax class scores for *text* as a list of floats.

    Args:
        text: The text to classify.
        max_length: Truncation limit handed to the tokenizer.

    Returns:
        The scores of the first (only) item in the batch, one per model class.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**encoded).logits
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    return probabilities[0].tolist()

# Function to handle long texts
def classify_long_text(text):
    """Classify *text* chunk by chunk and average the per-chunk scores.

    Args:
        text: The text to classify.

    Returns:
        A tuple ``(aggregate_scores, chunk_scores_list, chunks)`` where
        ``aggregate_scores`` is the mean score per sentiment label,
        ``chunk_scores_list`` holds the scores of each chunk and ``chunks`` the
        chunk strings themselves. For empty *text* the result is all-zero
        aggregate scores with empty chunk lists (previously this raised
        ZeroDivisionError).
    """
    num_labels = len(sentiment_labels)
    if not text:
        # No chunks to average over; avoid dividing by zero below.
        return [0.0] * num_labels, [], []

    max_length = tokenizer.model_max_length
    # NOTE(review): the text is sliced by *characters* while max_length is the
    # tokenizer's limit (also passed on as the truncation length), so chunk
    # size does not correspond to a token count. Kept as-is to preserve the
    # chunks shown to the user.
    chunks = [text[start:start + max_length] for start in range(0, len(text), max_length)]
    chunk_scores_list = [classify_text(chunk, max_length) for chunk in chunks]

    totals = [0.0] * num_labels
    for chunk_scores in chunk_scores_list:
        totals = [running + score for running, score in zip(totals, chunk_scores)]

    # Average the scores
    aggregate_scores = [total / len(chunks) for total in totals]
    return aggregate_scores, chunk_scores_list, chunks

# Function to classify each sentence in the text
def classify_sentences(text):
    """Split *text* into sentences and label each with its top sentiment.

    Returns:
        A list of ``(sentence, sentiment_label)`` tuples in sentence order.
    """
    def top_label(sentence):
        # The label is the one whose score is highest for this sentence.
        scores = classify_text(sentence, tokenizer.model_max_length)
        return sentiment_labels[scores.index(max(scores))]

    return [(sentence, top_label(sentence)) for sentence in sent_tokenize(text)]

def _likelihood_chart(scores, sentiment_order):
    """Build the Altair bar chart of per-label likelihoods for *scores*.

    Args:
        scores: Sequence of scores indexed by the keys of ``sentiment_labels``.
        sentiment_order: Labels in the exact order they should appear on the
            x axis.

    Returns:
        An Altair bar chart with one bar per sentiment label.
    """
    scores_by_label = {label: scores[idx] for idx, label in sentiment_labels.items()}
    # Insert rows in display order; reset_index() then exposes the labels as
    # the 'index' column that the x encoding refers to.
    ordered = {label: scores_by_label[label] for label in sentiment_order}
    df = pd.DataFrame.from_dict(ordered, orient='index', columns=['Likelihood'])
    return alt.Chart(df.reset_index()).mark_bar().encode(
        x=alt.X('index', sort=sentiment_order, title='Sentiment'),
        y='Likelihood'
    ).properties(
        width=600,
        height=400
    )


def render_analysis(text):
    """Render the full sentiment analysis of *text* into the Streamlit page.

    Shows, in order: a bar chart of the averaged scores, every chunk together
    with its own chart, and finally each sentence highlighted with the
    background colour of its predicted sentiment.

    Args:
        text: The text to analyse (pasted by the user or scraped from a URL).
    """
    scores, chunk_scores_list, chunks = classify_long_text(text)

    # Ensure the exact order of labels in the graph
    sentiment_order = [
        "very positive", "positive", "somewhat positive",
        "neutral",
        "somewhat negative", "negative", "very negative"
    ]

    # Overall chart for the whole text
    st.altair_chart(_likelihood_chart(scores, sentiment_order), use_container_width=True)

    # Display each chunk and its own chart
    for number, (chunk_scores, chunk) in enumerate(zip(chunk_scores_list, chunks), start=1):
        st.write(f"Chunk {number}:")
        st.write(chunk)
        st.altair_chart(_likelihood_chart(chunk_scores, sentiment_order), use_container_width=True)

    # Sentence-level classification with background colors
    st.write("Extracted Text with Sentiment Highlights:")
    for sentence, sentiment in classify_sentences(text):
        bg_color = background_colors[sentiment]
        # The sentence is untrusted input (pasted or scraped) rendered with
        # unsafe_allow_html=True, so escape it to prevent HTML injection.
        safe_sentence = html.escape(sentence)
        st.markdown(f'<span style="background-color: {bg_color}">{safe_sentence}</span>', unsafe_allow_html=True)

# Streamlit UI
st.title("Sentiment Classification Model")

tab_paste, tab_scrape = st.tabs(["Paste Text", "Scrape URL"])

# Tab 1: analyse text the user pasted, ignoring whitespace-only input.
with tab_paste:
    pasted_text = st.text_area("Paste text for analysis:", height=300, placeholder="Paste text here…")
    stripped_text = pasted_text.strip() if pasted_text else ""
    if stripped_text:
        render_analysis(stripped_text)

# Tab 2: fetch a page by URL and analyse whatever text could be extracted.
with tab_scrape:
    st.info("If the site returns 403 (forbidden), use the 'Paste Text' tab to analyze content manually.")
    url = st.text_input("Enter URL:")
    if url:
        text = get_text_from_url(url)
        if not text:
            st.write("Could not extract text from the provided URL.")
        else:
            render_analysis(text)

# Additional information at the end: a short description of the model and a
# contact link, rendered as Markdown below the tabs.
st.markdown("""
Multi-label sentiment classification model developed by [Dejan AI](https://dejan.ai/).
The model is designed to be deployed in an automated pipeline capable of classifying text sentiment for thousands (or even millions) of text chunks or as a part of a scraping pipeline. This is a demo model which may occasionally misclassify some texts. In a typical commercial project, a larger model is deployed for the task, and in special cases, a domain-specific model is developed for the client.
### Engage Our Team
Interested in using this in an automated pipeline for bulk sentiment processing?
Please [book an appointment](https://dejan.ai/call/) to discuss your needs.
""")