PliniusNatHist / app.py
bestroi's picture
Update app.py
ca16794 verified
import re
from pathlib import Path

import nltk
import pandas as pd
import plotly.express as px
import streamlit as st
# Make sure the tokenizer models needed by nltk.word_tokenize are installed.
# Streamlit re-executes this script on every widget interaction, so a forced
# download here would hit the network on each rerun; instead each resource is
# fetched only when NLTK cannot already locate it.
# Both the legacy "punkt" models and the "punkt_tab" tables (loaded by recent
# NLTK releases) are requested so tokenization works on either version.
for _resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_resource}")
    except LookupError:
        nltk.download(_resource, quiet=True)
def count_tokens(text):
    """Return the number of NLTK word tokens in *text*.

    Args:
        text: The passage to tokenize. Anything that is not a ``str`` (for
            example ``None`` or the float NaN pandas uses for an empty CSV
            cell) is treated as an empty passage.

    Returns:
        The token count as an ``int``; ``0`` for non-string input.
    """
    # nltk.word_tokenize raises TypeError on non-string input, which would
    # abort the whole column-wise apply() over a CSV with a blank cell.
    if not isinstance(text, str):
        return 0
    return len(nltk.word_tokenize(text))
# Citation prefix that introduces a reference, matched case-insensitively.
_PLINY_PREFIX = re.compile(r"plin\. nat\.", re.IGNORECASE)
# First dotted reference after the prefix, e.g. "4", "4.11" or "4.11.40".
_DOTTED_REFERENCE = re.compile(r"[0-9]+(?:\.[0-9]+)*")


def extract_number(entry):
    """Turn a "plin. nat. <book>.<chapter>.<section>" citation into a sort key.

    The first dotted number after the "plin. nat." prefix is split into up to
    three integer levels, and each level is given three decimal places, so
    that ordering the keys orders the citations level by level:
    "plin. nat. 4.2" -> 4.002 sorts before "plin. nat. 4.11" -> 4.011, and
    "plin. nat. 4.11.40" -> 4.01104.

    Reading the reference as a plain decimal would rank 4.11 before 4.2,
    make 4.1 equal to 4.10, and fail on references with more than one dot.

    Args:
        entry: The citation text. Non-string values (e.g. NaN from an empty
            CSV cell) are accepted and yield ``0.0``.

    Returns:
        A ``float`` usable as an ordering key, or ``0.0`` when the prefix or
        a number after it cannot be found. Levels beyond the third are
        ignored and each level is capped at 999.
    """
    if not isinstance(entry, str):
        return 0.0

    prefix_match = _PLINY_PREFIX.search(entry)
    if prefix_match is None:
        return 0.0

    reference = _DOTTED_REFERENCE.search(entry, prefix_match.end())
    if reference is None:
        return 0.0

    levels = [min(int(part), 999) for part in reference.group().split(".")[:3]]
    levels += [0] * (3 - len(levels))
    book, chapter, section = levels

    # Combine as an integer first so the single division is the only rounding
    # step; this keeps the key strictly monotonic in (book, chapter, section).
    return (book * 1_000_000 + chapter * 1_000 + section) / 1_000_000
@st.cache_data
def load_data(csv_file):
    """Read *csv_file* and attach the derived columns the charts rely on.

    Args:
        csv_file: CSV source handed straight to ``pandas.read_csv``; it must
            provide the ``Context`` and ``Book/Chapter`` columns.

    Returns:
        The loaded DataFrame with two extra columns: ``token_count`` (result
        of ``count_tokens`` on each ``Context``) and ``SortKey`` (result of
        ``extract_number`` on each ``Book/Chapter``).
    """
    frame = pd.read_csv(csv_file)
    return frame.assign(
        token_count=frame['Context'].apply(count_tokens),
        SortKey=frame['Book/Chapter'].apply(extract_number),
    )
def visualize_data(data, sort_entries, color_theme="Viridis"):
    """Render the lemma charts, the summary line and the context listing.

    Args:
        data: DataFrame providing the ``Lemma``, ``Context``, ``Book/Chapter``,
            ``token_count`` and ``SortKey`` columns.
        sort_entries: When true, rows are ordered by ``SortKey`` before they
            are aggregated and listed.
        color_theme: Name of the Plotly continuous colour scale applied to the
            frequency bar chart. Defaults to ``"Viridis"``.
    """
    # An empty frame has no maximum frequency, so idxmax() below would raise.
    if data.empty:
        st.warning("The selected dataset contains no entries.")
        return

    if sort_entries:
        data = data.sort_values(by='SortKey')

    # Per-lemma occurrence count and mean context length in tokens.
    lemma_stats = data.groupby('Lemma').agg(
        Frequency=('Context', 'count'),
        Average_Tokens=('token_count', 'mean')
    ).reset_index()

    # Tabs for better organization
    tab1, tab2, tab3 = st.tabs(["📊 Lemma Frequency", "🥧 Frequency Distribution", "📚 Chapter-wise Mentions"])

    with tab1:
        st.subheader("Lemma Frequency in the Dataset")
        fig_bar = px.bar(
            lemma_stats,
            x='Lemma',
            y='Frequency',
            color='Frequency',
            color_continuous_scale=color_theme,
            labels={'Frequency': 'Frequency'},
            title='Lemma Frequency',
            hover_data=['Average_Tokens']
        )
        fig_bar.update_layout(showlegend=False)
        st.plotly_chart(fig_bar, use_container_width=True)

    with tab2:
        st.subheader("Lemma Frequency Distribution")
        # Limit to top 20 for better readability
        top_lemmas = lemma_stats.nlargest(20, 'Frequency')
        fig_pie = px.pie(
            top_lemmas,
            values='Frequency',
            names='Lemma',
            title='Top 20 Lemmas Frequency Distribution',
            hole=0.4,
            color='Lemma',
            color_discrete_sequence=px.colors.qualitative.Set3
        )
        st.plotly_chart(fig_pie, use_container_width=True)

    with tab3:
        st.subheader("Chapter-wise Lemma Mentions")
        # One row per (chapter, lemma) pair with its number of mentions.
        chapter_stats = data.groupby(['Book/Chapter', 'Lemma']).size().reset_index(name='Count')
        fig_stacked = px.bar(
            chapter_stats,
            x='Book/Chapter',
            y='Count',
            color='Lemma',
            title='Chapter-wise Lemma Mentions',
            labels={'Book/Chapter': 'Book/Chapter', 'Count': 'Mentions'},
            color_discrete_sequence=px.colors.qualitative.Pastel
        )
        st.plotly_chart(fig_stacked, use_container_width=True)

    st.markdown("---")
    most_common_lemma = lemma_stats.loc[lemma_stats['Frequency'].idxmax()]
    st.success(f"**Most Common Lemma:** {most_common_lemma['Lemma']} (Frequency: {most_common_lemma['Frequency']})")

    with st.expander("🔍 View Contexts"):
        st.markdown("### Contextual Information")
        for _, row in data.iterrows():
            st.markdown(f"**Lemma:** {row['Lemma']} | **Book/Chapter:** {row['Book/Chapter']}")
            st.write(row['Context'])
            st.markdown("---")


# Sidebar blurb describing the dataset; kept as one Markdown string.
_DATASET_DESCRIPTION = (
    "\n"
    "### The Dataset:\n"
    "A curated collection of information on ancient geographical locations, rivers, "
    "tribes, and cultural aspects as documented by Pliny the Elder in "
    "*Naturalis Historia*. It includes lemmas (base forms of words), contextual "
    "information, and references to specific books and chapters from Pliny's work.\n"
    "The dataset is available on **FigShare**:\n"
    "https://doi.org/10.6084/m9.figshare.27044578.v1\n"
    "**Project no. КП-06-Н50/3 from 30.11.2020, financed by BNSF**\n"
)


def main():
    """Configure the page, collect the sidebar choices and draw the charts."""
    st.set_page_config(
        page_title="Lemma Frequency Visualization",
        layout="wide",
        initial_sidebar_state="expanded",
    )
    st.title("📚 Lemma Frequency Visualization")

    # Sidebar section
    with st.sidebar:
        st.image("imgs/DiGi_Thrace logo-tall.jpg", use_column_width=True, caption="Measuring Ancient Thrace: Re-evaluating Antiquity in the Digital Age")
        st.markdown(_DATASET_DESCRIPTION)
        csv_options = ["allData.csv", "places.csv", "ethnonyms.csv", "rivers.csv", "mountains.csv", "toponyms.csv"]
        csv_file = st.selectbox("📁 Select CSV file:", csv_options, index=0)
        sort_entries = st.checkbox("Sort Entries by Book/Chapter", value=True)
        st.markdown("---")
        st.markdown("### Customize Visualization")
        color_theme = st.selectbox("Select Color Theme for Charts:", ["Viridis", "Cividis", "Plasma", "Magma", "Inferno", "Turbo"])

    data = load_data(csv_file)
    # Forward the chosen colour scale so the sidebar selector actually takes effect.
    visualize_data(data, sort_entries, color_theme=color_theme)


if __name__ == "__main__":
    main()