import html
from collections import OrderedDict

import altair as alt
import nltk
import pandas as pd
import requests
import streamlit as st
import torch
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
# sent_tokenize needs the Punkt sentence-tokenizer data. NLTK 3.8.2+ loads it
# from the "punkt_tab" package rather than the older pickled "punkt" models,
# so fetch both to keep the app working across NLTK versions.
for _punkt_package in ('punkt', 'punkt_tab'):
    nltk.download(_punkt_package)
|
|
|
|
|
model_name = 'dejanseo/sentiment'


@st.cache_resource
def _load_model_and_tokenizer(name):
    """Load the classifier and its tokenizer once per server process.

    Streamlit re-executes this script from the top on every user
    interaction; caching the loaded objects avoids re-instantiating the
    model on each rerun.

    Args:
        name: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    loaded_model = AutoModelForSequenceClassification.from_pretrained(name)
    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
    return loaded_model, loaded_tokenizer


model, tokenizer = _load_model_and_tokenizer(model_name)
|
|
|
|
|
# Maps the classifier's output index to a human-readable sentiment label.
# The position of each label in the tuple is its class index (0..6).
sentiment_labels = dict(enumerate((
    "very positive",
    "positive",
    "somewhat positive",
    "neutral",
    "somewhat negative",
    "negative",
    "very negative",
)))
|
|
|
|
|
# CSS background colour used to highlight a sentence, keyed by sentiment
# label: green for positive, grey for neutral, red for negative, with the
# alpha channel growing with the strength of the sentiment.
background_colors = {
    label: f"rgba({rgb}, {alpha})"
    for label, rgb, alpha in (
        ("very positive", "0, 255, 0", "0.5"),
        ("positive", "0, 255, 0", "0.3"),
        ("somewhat positive", "0, 255, 0", "0.1"),
        ("neutral", "128, 128, 128", "0.1"),
        ("somewhat negative", "255, 0, 0", "0.1"),
        ("negative", "255, 0, 0", "0.3"),
        ("very negative", "255, 0, 0", "0.5"),
    )
}
|
|
|
|
|
def get_text_from_url(url, timeout=10):
    """Fetch a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive host cannot hang the app indefinitely.

    Returns:
        The paragraph texts joined by single spaces, or ``""`` when the
        request fails (malformed URL, connection error, timeout) or the
        response status is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Treat any request failure like a non-200 response so the caller's
        # existing "no text" handling covers it instead of a traceback.
        return ""

    if response.status_code != 200:
        return ""

    soup = BeautifulSoup(response.content, 'html.parser')
    return ' '.join(p.get_text() for p in soup.find_all('p'))
|
|
|
|
|
def classify_text(text, max_length):
    """Return the model's softmax scores for a single piece of text.

    Args:
        text: The text to classify.
        max_length: Token limit handed to the tokenizer; longer input is
            truncated to this length.

    Returns:
        A plain list of floats, one per model output class, taken from the
        first row of the softmax over the logits.
    """
    encoded = tokenizer(
        text,
        max_length=max_length,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**encoded).logits

    probabilities = torch.softmax(logits, dim=-1)
    return probabilities[0].tolist()
|
|
|
|
|
def classify_long_text(text):
    """Classify arbitrarily long text by scoring fixed-size chunks.

    The text is cut into consecutive slices of ``tokenizer.model_max_length``
    *characters* (the token limit is reused as a character count), each slice
    is scored with ``classify_text``, and the per-class scores are averaged
    with equal weight per chunk.

    Args:
        text: The text to classify.

    Returns:
        A tuple ``(aggregate_scores, chunk_scores_list, chunks)``:
        the mean score per class, the list of per-chunk score lists, and the
        chunk strings themselves. For empty ``text`` the aggregate is all
        zeros (one entry per label in ``sentiment_labels``) and both lists
        are empty.
    """
    max_length = tokenizer.model_max_length
    chunks = [text[i:i + max_length] for i in range(0, len(text), max_length)]

    if not chunks:
        # Nothing to score; avoid dividing by zero chunks below.
        return [0.0] * len(sentiment_labels), [], []

    chunk_scores_list = [classify_text(chunk, max_length) for chunk in chunks]

    # Column-wise mean: zip(*...) groups the scores of each class across chunks.
    aggregate_scores = [
        sum(class_scores) / len(chunks)
        for class_scores in zip(*chunk_scores_list)
    ]
    return aggregate_scores, chunk_scores_list, chunks
|
|
|
|
|
def classify_sentences(text):
    """Label every sentence of ``text`` with its most likely sentiment.

    Args:
        text: The text to split into sentences and classify.

    Returns:
        A list of ``(sentence, sentiment_label)`` tuples in the order the
        sentences are produced by ``sent_tokenize``.
    """
    labelled = []
    for sentence in sent_tokenize(text):
        probabilities = classify_text(sentence, tokenizer.model_max_length)
        # Index of the highest score; on ties the earliest index wins.
        best_idx = max(range(len(probabilities)), key=probabilities.__getitem__)
        labelled.append((sentence, sentiment_labels[best_idx]))
    return labelled
|
|
|
|
|
# Page header: title, the URL input that drives the analysis, and an intro blurb.
st.title("Sentiment Classification from URL")

url = st.text_input("Enter URL:")

st.markdown("""
Multi-label sentiment classification model developed by [Dejan Marketing](https://dejanmarketing.com/).

The model is designed to be deployed in an automated pipeline capable of classifying text sentiment for thousands (or even millions) of text chunks or as a part of a scraping pipeline.

This is a demo model which may occasionally misclassify some texts. In a typical commercial project, a larger model is deployed for the task, and in special cases, a domain-specific model is developed for the client.

# Engage Our Team
Interested in using this in an automated pipeline for bulk query processing?

Please [book an appointment](https://dejanmarketing.com/conference/) to discuss your needs.
""")
|
|
|
# Left-to-right order of the bars in every chart (most positive first).
sentiment_order = [
    "very positive", "positive", "somewhat positive",
    "neutral",
    "somewhat negative", "negative", "very negative",
]


def _likelihood_chart(scores):
    """Build the sentiment bar chart for one list of per-class scores.

    Args:
        scores: Sequence indexed by the integer keys of ``sentiment_labels``.

    Returns:
        An Altair bar chart with one bar per label, in ``sentiment_order``.
    """
    by_label = {label: scores[idx] for idx, label in sentiment_labels.items()}
    frame = pd.DataFrame({
        'index': sentiment_order,
        'Likelihood': [by_label[label] for label in sentiment_order],
    })
    return alt.Chart(frame).mark_bar().encode(
        x=alt.X('index', sort=sentiment_order, title='Sentiment'),
        y='Likelihood',
    ).properties(
        width=600,
        height=400,
    )


if url:
    text = get_text_from_url(url)
    if text:
        scores, chunk_scores_list, chunks = classify_long_text(text)

        # Overall result for the whole page.
        st.altair_chart(_likelihood_chart(scores), use_container_width=True)

        # One section per chunk: its text followed by its own chart.
        for i, (chunk_scores, chunk) in enumerate(zip(chunk_scores_list, chunks), start=1):
            st.write(f"Chunk {i}:")
            st.write(chunk)
            st.altair_chart(_likelihood_chart(chunk_scores), use_container_width=True)

        st.write("Extracted Text with Sentiment Highlights:")
        for sentence, sentiment in classify_sentences(text):
            bg_color = background_colors[sentiment]
            # The sentence comes from an arbitrary web page and is rendered
            # with unsafe_allow_html=True, so escape it to prevent the page's
            # text from injecting markup or scripts into the app.
            safe_sentence = html.escape(sentence)
            st.markdown(
                f'<span style="background-color: {bg_color}">{safe_sentence}</span>',
                unsafe_allow_html=True,
            )
    else:
        st.write("Could not extract text from the provided URL.")
|
|
|
|
|
# Footer blurb shown below the results.
st.markdown("""
Multi-label sentiment classification model developed by [Dejan Marketing](https://dejanmarketing.com/).

The model is designed to be deployed in an automated pipeline capable of classifying text sentiment for thousands (or even millions) of text chunks or as a part of a scraping pipeline. This is a demo model which may occasionally misclassify some texts. In a typical commercial project, a larger model is deployed for the task, and in special cases, a domain-specific model is developed for the client.

### Engage Our Team
Interested in using this in an automated pipeline for bulk query processing?

Please [book an appointment](https://dejanmarketing.com/conference/) to discuss your needs.
""")
|
|
|