Spaces:

bestroi
/

PliniusNatHist

Sleeping

App Files Files Community

PliniusNatHist / app.py

bestroi

Update app.py

ca16794 verified about 1 year ago

raw

history blame contribute delete

4.99 kB

	import streamlit as st
	import pandas as pd
	import plotly.express as px
	import nltk
	from pathlib import Path

	# Fetch NLTK's "punkt" tokenizer data when the module is loaded so that
	# nltk.word_tokenize() does not fail with LookupError on first use.
	# quiet=True suppresses the downloader's console output.
	# NOTE(review): force=True presumably re-downloads the data even when a copy
	# is already installed, and this statement runs every time the script is
	# executed -- confirm the forced download is still required.
	nltk.download('punkt', quiet=True, force=True)

	def count_tokens(text):
	    """Return the number of word tokens in *text*.

	    Args:
	        text: The passage to tokenize. Values that are not strings (for
	            example the NaN that pandas produces for an empty CSV cell)
	            and blank strings are treated as containing no tokens.

	    Returns:
	        The token count as an int; 0 for missing or blank input.
	    """
	    # Guard first: nltk.word_tokenize() raises on non-string input, which
	    # would abort the whole column-wise apply() over one empty cell.
	    if not isinstance(text, str) or not text.strip():
	        return 0
	    return len(nltk.word_tokenize(text))

	def extract_number(entry):
	    """Return the numeric part of a ``plin. nat.`` reference as a float.

	    The first number that follows the literal prefix ``"plin. nat."`` is
	    parsed: an integer part optionally followed by one ``.fraction`` part
	    (``"plin. nat. 4.40"`` -> ``4.4``). Anything after that token -- a third
	    reference component, trailing punctuation, other text -- is ignored
	    rather than making the whole entry unparsable.

	    Note that the value is a plain float, so ``4.40`` and ``4.4`` yield the
	    same key.

	    Args:
	        entry: The reference text. Non-string values (e.g. NaN from an
	            empty CSV cell) are accepted and yield 0.0.

	    Returns:
	        The parsed number, or 0.0 when the prefix or a number is missing.
	    """
	    # Local import keeps this block self-contained; it is a cheap
	    # sys.modules lookup after the first call.
	    import re

	    if not isinstance(entry, str):
	        return 0.0

	    prefix = "plin. nat."
	    _, found, remainder = entry.partition(prefix)
	    if not found:
	        return 0.0

	    # Take only the first "N" or "N.M" token. Joining every digit and dot
	    # in the remainder would turn "4.11.40" or "4.40." into strings that
	    # float() rejects.
	    match = re.search(r"[0-9]+(?:\.[0-9]+)?", remainder)
	    if match is None:
	        return 0.0
	    return float(match.group())

	@st.cache_data
	def load_data(csv_file):
	    """Read *csv_file* and return it with two derived columns added.

	    ``token_count`` holds ``count_tokens`` applied to each ``Context`` value
	    and ``SortKey`` holds ``extract_number`` applied to each
	    ``Book/Chapter`` value. The function is wrapped in ``st.cache_data``.
	    """
	    frame = pd.read_csv(csv_file)
	    return frame.assign(
	        token_count=frame['Context'].apply(count_tokens),
	        SortKey=frame['Book/Chapter'].apply(extract_number),
	    )

	def visualize_data(data, sort_entries, color_theme="Viridis"):
	    """Render the lemma charts, the summary line and the context list.

	    Args:
	        data: DataFrame with the columns ``Lemma``, ``Context``,
	            ``Book/Chapter``, ``token_count`` and ``SortKey``.
	        sort_entries: When true, entries are ordered by ``SortKey`` and the
	            chapter chart's x axis follows that order.
	        color_theme: Name of the Plotly continuous colour scale used for
	            the frequency bar chart. Defaults to ``"Viridis"``, the value
	            that used to be hard-coded.
	    """
	    # Without any lemma there is nothing to aggregate, and idxmax() below
	    # would raise on the empty statistics table.
	    if data.empty or data['Lemma'].isna().all():
	        st.warning("The selected dataset contains no entries to visualize.")
	        return

	    if sort_entries:
	        # A stable sort keeps the file order of entries sharing a key.
	        data = data.sort_values(by='SortKey', kind='stable')

	    lemma_stats = data.groupby('Lemma').agg(
	        Frequency=('Context', 'count'),
	        Average_Tokens=('token_count', 'mean'),
	    ).reset_index()

	    # Tabs for better organization
	    tab1, tab2, tab3 = st.tabs(["📊 Lemma Frequency", "🥧 Frequency Distribution", "📚 Chapter-wise Mentions"])

	    with tab1:
	        st.subheader("Lemma Frequency in the Dataset")
	        fig_bar = px.bar(
	            lemma_stats,
	            x='Lemma',
	            y='Frequency',
	            color='Frequency',
	            color_continuous_scale=color_theme,
	            labels={'Frequency': 'Frequency'},
	            title='Lemma Frequency',
	            hover_data=['Average_Tokens'],
	        )
	        fig_bar.update_layout(showlegend=False)
	        st.plotly_chart(fig_bar, use_container_width=True)

	    with tab2:
	        st.subheader("Lemma Frequency Distribution")
	        # Limit to top 20 for better readability
	        top_lemmas = lemma_stats.nlargest(20, 'Frequency')
	        fig_pie = px.pie(
	            top_lemmas,
	            values='Frequency',
	            names='Lemma',
	            title='Top 20 Lemmas Frequency Distribution',
	            hole=0.4,
	            color='Lemma',
	            color_discrete_sequence=px.colors.qualitative.Set3,
	        )
	        st.plotly_chart(fig_pie, use_container_width=True)

	    with tab3:
	        st.subheader("Chapter-wise Lemma Mentions")
	        chapter_stats = data.groupby(['Book/Chapter', 'Lemma']).size().reset_index(name='Count')
	        # groupby() re-sorts its keys as text, which would discard the
	        # SortKey ordering; pin the axis order explicitly when sorting
	        # was requested.
	        category_orders = None
	        if sort_entries:
	            category_orders = {
	                'Book/Chapter': data['Book/Chapter'].dropna().unique().tolist(),
	            }
	        fig_stacked = px.bar(
	            chapter_stats,
	            x='Book/Chapter',
	            y='Count',
	            color='Lemma',
	            title='Chapter-wise Lemma Mentions',
	            labels={'Book/Chapter': 'Book/Chapter', 'Count': 'Mentions'},
	            color_discrete_sequence=px.colors.qualitative.Pastel,
	            category_orders=category_orders,
	        )
	        st.plotly_chart(fig_stacked, use_container_width=True)

	    st.markdown("---")
	    most_common_lemma = lemma_stats.loc[lemma_stats['Frequency'].idxmax()]
	    st.success(f"Most Common Lemma: {most_common_lemma['Lemma']} (Frequency: {most_common_lemma['Frequency']})")

	    with st.expander("🔍 View Contexts"):
	        st.markdown("### Contextual Information")
	        for _, row in data.iterrows():
	            # "\\|" emits the same backslash-pipe text as before without
	            # relying on an invalid escape sequence.
	            st.markdown(f"Lemma: {row['Lemma']} \\| Book/Chapter: {row['Book/Chapter']}")
	            st.write(row['Context'])
	            st.markdown("---")


	def main():
	    """Build the page: sidebar controls, data loading and visualizations."""
	    st.set_page_config(
	        page_title="Lemma Frequency Visualization",
	        layout="wide",
	        initial_sidebar_state="expanded",
	    )
	    st.title("📚 Lemma Frequency Visualization")

	    # Sidebar section
	    with st.sidebar:
	        # NOTE(review): newer Streamlit releases may deprecate
	        # use_column_width -- confirm against the pinned version.
	        st.image("imgs/DiGi_Thrace logo-tall.jpg", use_column_width=True, caption="Measuring Ancient Thrace: Re-evaluating Antiquity in the Digital Age")
	        st.markdown("""
	        ### The Dataset:
	        A curated collection of information on ancient geographical locations, rivers, tribes, and cultural aspects as documented by Pliny the Elder in Naturalis Historia. It includes lemmas (base forms of words), contextual information, and references to specific books and chapters from Pliny's work.

	        The dataset is available on FigShare:
	        https://doi.org/10.6084/m9.figshare.27044578.v1

	        Project no. КП-06-Н50/3 from 30.11.2020, financed by BNSF
	        """)

	        csv_options = ["allData.csv", "places.csv", "ethnonyms.csv", "rivers.csv", "mountains.csv", "toponyms.csv"]
	        csv_file = st.selectbox("📁 Select CSV file:", csv_options, index=0)

	        sort_entries = st.checkbox("Sort Entries by Book/Chapter", value=True)

	        st.markdown("---")
	        st.markdown("### Customize Visualization")
	        color_theme = st.selectbox("Select Color Theme for Charts:", ["Viridis", "Cividis", "Plasma", "Magma", "Inferno", "Turbo"])

	    # Report a missing data file in the page instead of a raw traceback.
	    if not Path(csv_file).is_file():
	        st.error(f"Data file not found: {csv_file}")
	        return

	    data = load_data(csv_file)
	    # Pass the selected scale on so the sidebar control actually takes effect.
	    visualize_data(data, sort_entries, color_theme)

	# Run the app only when this file is executed as the main script.
	if __name__ == "__main__":
	    main()