Spaces:

shaheer
/

nlp-dataset

Runtime error

nlp-dataset / app.py

Zaman, Shaheer Shaheer

first commit

414b5fe over 3 years ago

2.87 kB

	import streamlit as st
	import pandas as pd
	import re
	import nltk
	from PIL import Image
	import os
	import numpy as np
	import seaborn as sns
	from wordcloud import WordCloud, STOPWORDS
	from nltk.corpus import stopwords
	import datasets
	from datasets import load_dataset
	import matplotlib.pyplot as plt
	import sklearn
	from sklearn.preprocessing import LabelEncoder
	sns.set_palette('RdBu')

	# Load the 'merve/poetry' dataset and turn its 'train' split into a DataFrame.
	# NOTE(review): the dataset is requested with streaming=True; confirm that
	# DataFrame.from_dict consumes the streamed split as intended.
	dataset = load_dataset('merve/poetry', streaming=True)
	df = pd.DataFrame.from_dict(dataset['train'])

	# Directory used to locate files that sit next to this script; falls back to
	# the current working directory when __file__ is not defined.
	if '__file__' in locals():
	    d = os.path.dirname(__file__)
	else:
	    d = os.getcwd()

	# Fetch the NLTK stop-word corpus, then keep the English word list.
	nltk.download('stopwords')
	stop = stopwords.words('english')

	def standardize(text, remove_digits=True):
	    """Strip special characters from *text* and lower-case it.

	    Every character that is not an ASCII letter, a digit, or whitespace is
	    removed; what remains is lower-cased.

	    Args:
	        text: The string to clean.
	        remove_digits: Accepted for backward compatibility only; it is not
	            consulted, so digits are always kept.

	    Returns:
	        The cleaned, lower-cased string.
	    """
	    # Raw string: '\d' and '\s' are regex classes, not string escapes.
	    return re.sub(r'[^a-zA-Z\d\s]', '', text).lower()


	st.set_option('deprecation.showPyplotGlobalUse', False)
	st.write('Poetry dataset, content character cleaned from special characters and lower cased')

	# Clean the text first, then drop stop words: filtering before lower-casing
	# and punctuation stripping lets tokens such as "The" or "and," slip through.
	# The stop words are also matched in their cleaned form (e.g. "don't" ->
	# "dont"), and a set keeps each membership test cheap.
	_stop_words = set(stop) | {standardize(word) for word in stop}
	df['content'] = df['content'].apply(standardize).apply(
	    lambda poem: ' '.join(word for word in poem.split() if word not in _stop_words)
	)
	st.dataframe(df)

	def _show_current_figure():
	    """Render the active Matplotlib figure in Streamlit, then close it."""
	    # Hand the figure to st.pyplot explicitly instead of relying on its
	    # implicit global-figure mode, and close it once it has been rendered.
	    fig = plt.gcf()
	    st.pyplot(fig)
	    plt.close(fig)


	st.subheader('Visualization on dataset statistics')

	# Count plots: one bar per category of the given column.
	st.write('Number of peoms written in each type')
	sns.catplot(x='type', data=df, kind='count')
	plt.xticks(rotation=0)
	_show_current_figure()

	st.write('Number of poems for each age')
	sns.catplot(x='age', data=df, kind='count')
	plt.xticks(rotation=0)
	_show_current_figure()

	# Wider aspect and vertical tick labels for the author axis.
	st.write("Number of poems for each author")
	sns.catplot(x="author", data=df, kind="count", aspect=4)
	plt.xticks(rotation=90)
	_show_current_figure()

	st.write('Distributions of poem types according to ages and authors, seems that folks in renaissance loved the love themed poems and nature themed poems became popular later')
	le = LabelEncoder()

	# Authors are replaced by integer codes in df so they can be used as the
	# y-axis values of the plot below.
	df.author = le.fit_transform(df.author)
	sns.catplot(x='age', y='author', hue='type', data=df)
	_show_current_figure()

	def _word_counts(texts):
	    """Return how often each whitespace-separated word occurs in *texts*."""
	    # explode() yields one row per word without building the wide,
	    # NaN-padded frame that split(expand=True).unstack() creates.
	    return texts.str.split().explode().value_counts()


	# Compare ages case-insensitively so both filters select rows regardless of
	# how the age labels are capitalised in the dataset.
	_age = df.age.str.lower()
	words = _word_counts(df.content)
	renaissance = _word_counts(df.content.loc[_age == 'renaissance'])
	modern = _word_counts(df.content.loc[_age == 'modern'])
	st.subheader('Visualizing content')
	# Read the image inside a context manager so the file handle is released.
	# NOTE(review): nothing visible in this file passes mask to WordCloud.
	with Image.open(os.path.join(d, 'poet.png')) as _mask_image:
	    mask = np.array(_mask_image)

	import matplotlib.pyplot as plt
	def word_cloud(content, title):
	    """Draw a word cloud from word counts and show it in Streamlit.

	    Args:
	        content: Series indexed by word whose values are occurrence counts
	            (the visible caller passes the result of value_counts()).
	        title: Text drawn above the cloud.

	    Returns:
	        None. The figure is rendered with st.pyplot; when no word is left
	        after stop-word filtering a Streamlit warning is shown instead.
	    """
	    # Size words by their counts instead of joining the unique index values,
	    # which would present every word exactly once. STOPWORDS is applied by
	    # hand because generate_from_frequencies() does not filter stop words.
	    frequencies = {
	        word: count
	        for word, count in content.items()
	        if str(word).lower() not in STOPWORDS
	    }
	    if not frequencies:
	        # WordCloud cannot draw from an empty vocabulary.
	        st.warning(f'No words available for: {title}')
	        return

	    wc = WordCloud(background_color='white',
	                   max_words=200,
	                   contour_width=3,
	                   max_font_size=50)
	    wc.generate_from_frequencies(frequencies)

	    fig, ax = plt.subplots(figsize=(10, 10))
	    ax.set_title(title, fontsize=20)
	    ax.imshow(wc.recolor(colormap='magma', random_state=42),
	              interpolation="bilinear", alpha=0.98)
	    ax.axis('off')
	    # Pass the figure explicitly, then close it once it has been rendered.
	    st.pyplot(fig)
	    plt.close(fig)

	st.subheader('Most appearing words excluding stopwords n poems according to ages')
	# One cloud per age, each titled after the data it is drawn from.
	word_cloud(renaissance, 'word cloud Renaissance poems')
	word_cloud(modern, 'word cloud modern poems')

	st.write('Most appearing words including stopwords')
	# NOTE(review): words is built from df.content after stop words were
	# filtered out, so the caption above may not match the data - confirm.
	st.bar_chart(words.head(50))