Spaces:

dhanikitkat
/

sentiment_emotion

Sleeping

App Files Files Community

sentiment_emotion / app.py

dhanikitkat

update self model for sentiment analysis

f45217d almost 2 years ago

raw

history blame contribute delete

16.6 kB

	import streamlit as st
	import pandas as pd
	from transformers import pipeline
	import base64
	import re
	import nltk
	from nltk.corpus import stopwords
	from nltk.tokenize import word_tokenize
	import plotly.express as px
	import matplotlib.pyplot as plt
	from wordcloud import WordCloud
	import numpy as np
	from PIL import ImageFont
	import os


	# Tokenizer models and stopword lists used by word_tokenize / stopwords below.
	nltk.download('punkt')
	nltk.download('stopwords')


	def _cache_resource(func):
	    """Wrap *func* with ``st.cache_resource`` when this Streamlit provides it.

	    Streamlit re-executes the whole script on every user interaction, so an
	    uncached model load is repeated on each rerun. Older Streamlit releases
	    have no ``cache_resource``; there *func* is returned unchanged, which
	    keeps the previous (uncached) behavior.
	    """
	    decorator = getattr(st, "cache_resource", None)
	    return func if decorator is None else decorator(func)


	@_cache_resource
	def _load_pipeline(model_name):
	    """Build the text-classification pipeline for *model_name*."""
	    return pipeline("text-classification", model=model_name)


	# Load pipelines
	sentiment_pipe = _load_pipeline("dhanikitkat/indo_smsa-1.5G_sentiment_analysis")
	emotion_pipe = _load_pipeline("azizp128/prediksi-emosi-indobert")

	def load_slank_formal(file):
	    """Read the uploaded slang dictionary into a two-column DataFrame.

	    Args:
	        file: Uploaded file object exposing ``name``; must end in ``.txt``
	            and contain ``;``-separated pairs without a parsed header row.

	    Returns:
	        A DataFrame with columns ``Slank`` and ``Formal``, or ``None`` after
	        reporting an error through ``st.error`` for any other extension.
	    """
	    if not file.name.endswith('.txt'):
	        st.error("Format file tidak didukung. Harap unggah file TXT.")
	        return None

	    column_names = ['Slank', 'Formal']
	    return pd.read_csv(file, sep=';', header=None, names=column_names)

	def replace_slank_to_formal(sentence, slank_formal_df):
	    """Replace slang words in *sentence* with their formal equivalents.

	    The sentence is split into tokens (word-like runs, runs of ``.``/``,``,
	    or runs of non-ASCII characters). Each token is looked up, lowercased,
	    in the ``Slank`` column; on a hit it is replaced by the matching
	    ``Formal`` value, otherwise it is kept as written.

	    Args:
	        sentence: Text to normalize.
	        slank_formal_df: DataFrame with ``Slank`` and ``Formal`` columns.

	    Returns:
	        The tokens joined with single spaces ('' when nothing matches).
	    """
	    # Build the lookup once per call instead of scanning the DataFrame for
	    # every word. setdefault keeps the first row when a slang word repeats.
	    formal_by_slank = {}
	    for slank, formal in zip(slank_formal_df['Slank'], slank_formal_df['Formal']):
	        formal_by_slank.setdefault(slank, formal)

	    # The alternatives must be separated by a bare '|'; an escaped '\|'
	    # would only match a literal pipe character in the sentence.
	    words = re.findall(r'[\w\',./:-]+|[.,]+|[^\x00-\x7F]+', sentence)

	    replaced = []
	    for word in words:
	        key = word.lower()
	        if key in formal_by_slank:
	            replaced.append(str(formal_by_slank[key]))
	        else:
	            replaced.append(word)
	    return ' '.join(replaced)

	def preprocess_text(text, slank_formal_df):
	    """Normalize raw text before it is passed to the classifiers.

	    Args:
	        text: Raw input text.
	        slank_formal_df: DataFrame with ``Slank`` and ``Formal`` columns,
	            forwarded to ``replace_slank_to_formal``.

	    Returns:
	        The cleaned text: lowercased, stripped of URLs, mentions and ``#``,
	        with repeated characters collapsed, slang replaced, and tokens
	        re-joined with single spaces.
	    """
	    text = text.lower()
	    # Remove URLs. The alternatives need a bare '|'; '\|' is a literal pipe.
	    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
	    # Remove @mentions and the '#' sign (the hashtag word itself is kept).
	    text = re.sub(r'\@\w+|\#', '', text)
	    # Collapse a repeated symbol ("!!!" -> "!"); the excluded range leaves
	    # characters in U+1F000-U+1F9FF untouched.
	    text = re.sub(r'([^\w\s\U0001F000-\U0001F9FF])\1+', r'\1', text)
	    # Surround each emoji with spaces so it becomes its own token.
	    text = re.sub(r'([\U0001F600-\U0001F64F\U0001F900-\U0001F9FF\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F])', r' \1 ', text)
	    # Surround '.', ',', '&' and '%' with spaces.
	    text = re.sub(r'([.,])', r' \1 ', text)
	    text = re.sub(r'[&%]', lambda x: f' {x.group()} ', text)
	    # Cap any run of the same word character at two ("baaagus" -> "baagus").
	    text = re.sub(r'(\w)\1{1,}', r'\1\1', text)
	    text = re.sub(r'\s+', ' ', text).strip()
	    # Re-join reduplicated words written as "kata - kata" into "kata-kata".
	    text = re.sub(r'\b(\w+)\b\s-\s\b\1\b', r'\1-\1', text)
	    # Undo the padding added above for separators between digits
	    # ("3 . 5" -> "3.5", "1 , 000" -> "1,000").
	    text = re.sub(r'(?<=\d)\s\.\s(?=\d)', '.', text)
	    text = re.sub(r'(?<=\d)\s,\s(?=\d)', ',', text)
	    text = re.sub(r'\s+', ' ', text).strip()
	    text = replace_slank_to_formal(text, slank_formal_df)
	    tokens = word_tokenize(text)
	    return ' '.join(tokens)

	def generate_wordcloud(text, font_path, colormap, title):
	    """Render a circular word cloud for *text* and show it in Streamlit.

	    Indonesian stopwords are removed first. The image is saved as
	    ``"<title>.png"`` in the working directory, displayed with ``st.image``
	    and followed by a download link. When no words remain after stopword
	    removal, a warning is shown and nothing is rendered.

	    Args:
	        text: Space-separated words to plot.
	        font_path: Path of the font file passed to ``WordCloud``.
	        colormap: Matplotlib colormap name passed to ``WordCloud``.
	        title: Plot title; also used as the PNG file name (without suffix).
	    """
	    # Circular mask on a 1400x1400 canvas: 255 outside the circle of radius
	    # 630 centered at (700, 700), 0 inside it.
	    x, y = np.ogrid[:1400, :1400]
	    mask = (x - 700) ** 2 + (y - 700) ** 2 > 630 ** 2
	    mask = 255 * mask.astype(int)

	    # Remove Indonesian stopwords
	    indo_stopwords = set(stopwords.words('indonesian'))
	    text = ' '.join(word for word in text.split() if word.lower() not in indo_stopwords)

	    # WordCloud.generate raises ValueError when it has no words to plot.
	    if not text:
	        st.warning(f"Tidak ada kata yang dapat ditampilkan untuk {title}.")
	        return

	    wordcloud = WordCloud(
	        width=1400,
	        height=1400,
	        background_color='white',
	        font_path=font_path,
	        prefer_horizontal=1.0,
	        colormap=colormap,
	        max_words=100,
	        mask=mask
	    ).generate(text)

	    image_path = f"{title}.png"

	    # pyplot keeps every figure alive until it is closed; close it here so
	    # figures do not accumulate across Streamlit reruns.
	    figure = plt.figure(figsize=(14, 14))
	    try:
	        plt.imshow(wordcloud, interpolation='bilinear')
	        plt.axis('off')
	        plt.title(title, fontsize=20, pad=20)
	        # High DPI for a sharper saved image.
	        plt.savefig(image_path, dpi=300, bbox_inches='tight', pad_inches=0.1)
	    finally:
	        plt.close(figure)

	    # Display word cloud in Streamlit
	    st.image(image_path, use_column_width=True)

	    # Add download link for word cloud
	    st.markdown(get_image_download_link(image_path), unsafe_allow_html=True)

	def analyze_sentiment(text):
	    """Classify *text* with ``sentiment_pipe``.

	    Returns:
	        A ``(label, score)`` pair taken from the first prediction, with the
	        label lowercased.
	    """
	    predictions = sentiment_pipe(text)
	    first = predictions[0]
	    label = first['label'].lower()
	    return label, first['score']

	def analyze_emotion(text):
	    """Classify *text* with ``emotion_pipe``.

	    Returns:
	        A ``(label, score)`` pair taken from the first prediction, with the
	        label lowercased.
	    """
	    predictions = emotion_pipe(text)
	    first = predictions[0]
	    label = first['label'].lower()
	    return label, first['score']

	def get_download_link(df, filename):
	    """Build an HTML anchor that downloads *df* as ``<filename>.csv``.

	    The CSV (written without the index) is embedded in the link itself as a
	    base64 data URI.
	    """
	    csv_bytes = df.to_csv(index=False).encode()
	    payload = base64.b64encode(csv_bytes).decode()
	    return f'<a href="data:file/csv;base64,{payload}" download="{filename}.csv">Download CSV</a>'

	def get_word_freq_download_link(word_freq_df):
	    """Build an HTML anchor that downloads *word_freq_df* as a CSV file.

	    The CSV includes the DataFrame index and is embedded in the link as a
	    base64 data URI; the download is named ``word_frequency.csv``.
	    """
	    csv_bytes = word_freq_df.to_csv(index=True).encode()
	    payload = base64.b64encode(csv_bytes).decode()
	    return f'<a href="data:file/csv;base64,{payload}" download="word_frequency.csv">Download Word Frequency CSV</a>'

	def get_example_download_link(file_path, link_text):
	    """Build an HTML anchor that downloads the file at *file_path*.

	    The file is read in binary mode and embedded as a base64 data URI. The
	    download is named after the file's base name and the anchor shows
	    *link_text*.
	    """
	    with open(file_path, "rb") as source:
	        raw = source.read()
	    payload = base64.b64encode(raw).decode()
	    download_name = os.path.basename(file_path)
	    return f'<a href="data:file/txt;base64,{payload}" download="{download_name}">{link_text}</a>'

	def get_image_download_link(image_path):
	    """Build an HTML anchor that downloads the image at *image_path*.

	    The file is read in binary mode and embedded as a base64 data URI;
	    *image_path* is used both as the download name and in the link text.
	    """
	    with open(image_path, "rb") as source:
	        raw = source.read()
	    payload = base64.b64encode(raw).decode()
	    return f'<a href="data:file/png;base64,{payload}" download="{image_path}">Download {image_path}</a>'

	# Custom CSS that centers the charts and fixes the height of the Plotly
	# SVG container.
	_CHART_CSS = """
	<style>
	.chart-container {
	display: flex;
	justify-content: center;
	}
	.user-select-none.svg-container {
	height: 360px !important;
	}
	.average-score {
	text-align: center;
	}
	</style>
	"""

	# Column layout of the analysis result; the first entry is the raw text.
	_RESULT_COLUMNS = ['Content', 'Cleaned Content', 'Sentiment', 'Score Sentiment', 'Emotion', 'Score Emotion']


	def _analyze_text(text, slank_formal_df):
	    """Return (cleaned text, sentiment, sentiment score, emotion, emotion score)."""
	    cleaned_text = preprocess_text(text, slank_formal_df)
	    sentiment, score_sentiment = analyze_sentiment(cleaned_text)
	    emotion, score_emotion = analyze_emotion(cleaned_text)
	    return cleaned_text, sentiment, score_sentiment, emotion, score_emotion


	def _render_distribution(df, label_column, score_column, title, average_label, colors):
	    """Show a pie chart of *label_column* annotated with the mean of *score_column*."""
	    counts = df[label_column].value_counts()
	    fig = px.pie(
	        counts,
	        values=counts.values,
	        names=counts.index,
	        title=title,
	        width=400,
	        height=400,
	        color=counts.index,
	        color_discrete_map=colors
	    )

	    average = df[score_column].mean()
	    # Place the average score below the chart.
	    fig.add_annotation(
	        text=f"{average_label}: {average:.4f}",
	        xref="paper", yref="paper",
	        x=0.5, y=-0.2,
	        showarrow=False,
	        font=dict(size=18)
	    )

	    st.markdown('<div class="chart-container">', unsafe_allow_html=True)
	    st.plotly_chart(fig, use_container_width=True)
	    st.markdown('</div>', unsafe_allow_html=True)


	def _render_wordclouds(df):
	    """Show the overall, positive/happy and negative/angry-or-sad word clouds."""
	    font_path = os.path.join('assets', 'Poppins-Regular.ttf')
	    positive_happy = (df['Sentiment'] == 'positive') & (df['Emotion'] == 'senang')
	    negative_angry_sad = (df['Sentiment'] == 'negative') & (df['Emotion'].isin(['marah', 'sedih']))
	    selections = [
	        (df, 'hsv_r', 'Overall Word Cloud'),
	        (df[positive_happy], 'gist_rainbow_r', 'Positive Sentiment & Happy Emotion Word Cloud'),
	        (df[negative_angry_sad], 'inferno', 'Negative Sentiment & Angry or Sad Emotion Word Cloud'),
	    ]
	    for subset, colormap, title in selections:
	        cloud_text = ' '.join(subset['Cleaned Content'].dropna())
	        # A category with no matching rows has no words; a word cloud
	        # cannot be generated from empty text, so skip it with a notice.
	        if not cloud_text.strip():
	            st.info(f"Tidak ada data untuk {title}.")
	            continue
	        generate_wordcloud(cloud_text, font_path, colormap, title)


	def _render_word_frequency(df):
	    """Show the word-frequency table and its CSV download link."""
	    word_freq = pd.Series(' '.join(df['Cleaned Content'].dropna()).split()).value_counts()
	    st.write("Word Frequency:")
	    st.write(word_freq)

	    word_freq_df = word_freq.reset_index()
	    word_freq_df.columns = ['Word', 'Frequency']
	    st.markdown(get_word_freq_download_link(word_freq_df), unsafe_allow_html=True)


	def _render_analysis_report(df):
	    """Render charts, word clouds and word frequencies for an analyzed *df*."""
	    # Without a single analyzed row there is nothing to chart.
	    if df['Cleaned Content'].dropna().empty:
	        st.warning("Tidak ada teks yang dapat dianalisis.")
	        return

	    st.markdown(_CHART_CSS, unsafe_allow_html=True)

	    sentiment_colors = {
	        'positive': px.colors.qualitative.Set3[0],
	        'negative': px.colors.qualitative.Set3[3],
	        'neutral': px.colors.qualitative.Set3[1]
	    }
	    _render_distribution(
	        df, 'Sentiment', 'Score Sentiment',
	        'Sentiment Distribution', 'Average Sentiment Score', sentiment_colors,
	    )

	    emotion_colors = {
	        'marah': px.colors.qualitative.Safe[9],
	        'sedih': px.colors.qualitative.Safe[1],
	        'senang': px.colors.qualitative.Safe[0],
	        'cinta': px.colors.qualitative.Safe[2],
	        'jijik': px.colors.qualitative.Safe[6],
	        'takut': px.colors.qualitative.Safe[7],
	    }
	    _render_distribution(
	        df, 'Emotion', 'Score Emotion',
	        'Emotion Distribution', 'Average Emotion Score', emotion_colors,
	    )

	    _render_wordclouds(df)
	    _render_word_frequency(df)


	def combined_analysis(text, slank_formal_df):
	    """Analyze each non-blank line of *text* and render the report.

	    Args:
	        text: Input text; every non-blank line is analyzed separately.
	        slank_formal_df: DataFrame with ``Slank`` and ``Formal`` columns
	            used during preprocessing.

	    Returns:
	        A DataFrame with the columns ``Content``, ``Cleaned Content``,
	        ``Sentiment``, ``Score Sentiment``, ``Emotion`` and
	        ``Score Emotion`` (empty when *text* has no non-blank line).
	    """
	    rows = [
	        (line, *_analyze_text(line, slank_formal_df))
	        for line in text.split('\n')
	        if line.strip()
	    ]
	    df = pd.DataFrame(rows, columns=_RESULT_COLUMNS)
	    _render_analysis_report(df)
	    return df


	def process_file(file, slank_formal_df):
	    """Analyze the ``content`` column of an uploaded file and render the report.

	    Args:
	        file: Uploaded file object exposing ``name``; ``.xlsx`` is read with
	            ``pd.read_excel`` and ``.csv`` with ``pd.read_csv``.
	        slank_formal_df: DataFrame with ``Slank`` and ``Formal`` columns
	            used during preprocessing.

	    Returns:
	        The loaded DataFrame extended with ``Cleaned Content``,
	        ``Sentiment``, ``Score Sentiment``, ``Emotion`` and
	        ``Score Emotion``; rows whose ``content`` is not a string get
	        ``None`` in those columns. Returns ``None`` after ``st.error`` when
	        the format is unsupported or the ``content`` column is missing.
	    """
	    if file.name.endswith('.xlsx'):
	        df = pd.read_excel(file)
	    elif file.name.endswith('.csv'):
	        df = pd.read_csv(file)
	    else:
	        st.error("Format file tidak didukung. Harap unggah file CSV atau XLSX.")
	        return None

	    if 'content' not in df.columns:
	        st.error("Kolom 'content' tidak ditemukan pada file yang diunggah.")
	        return None

	    # Only string cells are analyzed; missing or non-text cells stay empty.
	    not_analyzed = (None, None, None, None, None)
	    results = [
	        _analyze_text(value, slank_formal_df) if isinstance(value, str) else not_analyzed
	        for value in df['content']
	    ]
	    for position, column in enumerate(_RESULT_COLUMNS[1:]):
	        df[column] = [result[position] for result in results]

	    _render_analysis_report(df)
	    return df

	def main():
	    """Run the Streamlit UI for sentiment and emotion analysis.

	    The user must first upload a slang dictionary; the script stops until
	    one is provided and loads correctly. Analysis then runs either on text
	    typed into the page or on an uploaded CSV/XLSX file, and the result
	    table is shown with a CSV download link.
	    """
	    st.title("Aplikasi Analisis Sentimen dan Prediksi Emosi")

	    # Add download link for example slank template
	    slank_template_path = "assets/contoh template data slank.txt"
	    st.markdown(get_example_download_link(slank_template_path, "Download Contoh Template Data Slank (TXT)"), unsafe_allow_html=True)

	    slank_file = st.file_uploader("Upload file slank dengan baris pertama Slank;Formal (TXT)", type=["txt"])
	    if slank_file is None:
	        st.warning("Harap upload file slank terlebih dahulu.")
	        st.stop()
	    df_slank_formal = load_slank_formal(slank_file)
	    if df_slank_formal is None:
	        st.stop()

	    menu = st.sidebar.selectbox("Pilih Metode", ["Analisis Langsung", "Import dari File"])

	    df = None
	    if menu == "Analisis Langsung":
	        user_input = st.text_area("Masukkan teks yang ingin dianalisis (pisahkan dengan enter):")
	        if st.button("Analisis"):
	            if user_input.strip():
	                df = combined_analysis(user_input, df_slank_formal)
	            else:
	                # Nothing to analyze; avoid running the report on no rows.
	                st.warning("Harap masukkan teks terlebih dahulu.")

	    elif menu == "Import dari File":
	        # Add download link for example content template
	        content_template_path = "assets/contoh template data content.xlsx"
	        st.markdown(get_example_download_link(content_template_path, "Download Contoh Template Data Content (XLSX)"), unsafe_allow_html=True)

	        uploaded_file = st.file_uploader("Upload file CSV atau XLSX", type=["csv", "xlsx"])
	        if uploaded_file is not None:
	            df = process_file(uploaded_file, df_slank_formal)

	    # process_file returns None when it has already reported an error.
	    if df is not None:
	        st.write("Hasil Analisis:")
	        st.write(df)
	        st.markdown(get_download_link(df, "analisis_sentimen_emosi"), unsafe_allow_html=True)


	if __name__ == '__main__':
	    main()