"""Streamlit app that visualizes lemma frequencies in a Pliny the Elder dataset.

The app reads one of several CSV files that provide at least the columns
``Lemma``, ``Context`` and ``Book/Chapter``, derives a token count and a sort
key for every row, and renders three Plotly charts plus a browsable list of
contexts.
"""

import re
from pathlib import Path

import nltk
import pandas as pd
import plotly.express as px
import streamlit as st

# Matches the citation prefix regardless of letter case ("Plin. Nat." as well
# as "plin. nat.").
_REFERENCE_PREFIX = re.compile(re.escape("plin. nat."), re.IGNORECASE)
_INTEGER_RUN = re.compile(r"\d+")
_LEADING_DECIMAL = re.compile(r"\d+(?:\.\d+)?")

_DATASET_BLURB = """
### The Dataset:
A curated collection of information on ancient geographical locations, rivers, tribes, and cultural aspects as documented by Pliny the Elder in *Naturalis Historia*. It includes lemmas (base forms of words), contextual information, and references to specific books and chapters from Pliny's work.

The dataset is available on **FigShare**: https://doi.org/10.6084/m9.figshare.27044578.v1

**Project no. КП-06-Н50/3 from 30.11.2020, financed by BNSF**
"""


@st.cache_resource(show_spinner=False)
def _ensure_tokenizer_data():
    """Download the NLTK tokenizer models once per server process.

    Streamlit re-executes the whole script on every widget interaction, so an
    unguarded ``nltk.download(..., force=True)`` at module level would hit the
    network on each rerun. Caching the call keeps the forced refresh (which
    avoids ``LookupError``) but performs it only once.

    ``punkt_tab`` is fetched alongside ``punkt`` because newer NLTK releases
    load that resource for ``word_tokenize``.
    """
    for resource in ("punkt", "punkt_tab"):
        nltk.download(resource, quiet=True, force=True)
    return True


# show_spinner=False above keeps this call from emitting any Streamlit element
# before main() runs st.set_page_config().
_ensure_tokenizer_data()


def count_tokens(text):
    """Return the number of NLTK word tokens in *text*.

    Non-string values (for example NaN from an empty CSV cell) count as zero
    tokens instead of raising inside the tokenizer.
    """
    if not isinstance(text, str):
        return 0
    return len(nltk.word_tokenize(text))


def _text_after_prefix(entry):
    """Return the part of *entry* after the first "plin. nat.", or None."""
    if not isinstance(entry, str):
        return None
    match = _REFERENCE_PREFIX.search(entry)
    if match is None:
        return None
    return entry[match.end():]


def _reference_sort_key(entry):
    """Return the integer components of a reference as a sortable tuple.

    Every run of digits after the prefix becomes one component, so a
    reference ending in "4.11.40" yields ``(4, 11, 40)``. Comparing tuples of
    ints orders "4.2" before "4.11", which a single float cannot do. Entries
    without the prefix yield ``()`` and therefore sort first.
    """
    rest = _text_after_prefix(entry)
    if rest is None:
        return ()
    return tuple(int(run) for run in _INTEGER_RUN.findall(rest))


def extract_number(entry):
    """Return the number following "plin. nat." in *entry* as a float.

    All digits and dots after the prefix are concatenated and parsed. When
    that string is not a valid float (for example "4.11.40"), the first
    decimal number after the prefix is used instead. Returns 0.0 when the
    prefix is missing, when no number follows it, or when *entry* is not a
    string.
    """
    rest = _text_after_prefix(entry)
    if rest is None:
        return 0.0

    num_str = ''.join(char for char in rest if char.isdigit() or char == '.')
    try:
        return float(num_str)
    except ValueError:
        pass

    # Fallback for strings with several dots or stray punctuation.
    match = _LEADING_DECIMAL.search(rest)
    if match is None:
        return 0.0
    try:
        return float(match.group())
    except ValueError:
        return 0.0


@st.cache_data
def load_data(csv_file):
    """Read *csv_file* and add the derived ``token_count`` and ``SortKey`` columns.

    ``token_count`` is the NLTK token count of ``Context``. ``SortKey`` is a
    tuple of the integer components of ``Book/Chapter`` so that sorting
    follows numeric book/chapter order.
    """
    data = pd.read_csv(csv_file)
    data['token_count'] = data['Context'].apply(count_tokens)
    data['SortKey'] = data['Book/Chapter'].apply(_reference_sort_key)
    return data


def visualize_data(data, sort_entries, color_theme="Viridis"):
    """Render the charts, the most-common-lemma summary and the context list.

    Args:
        data: DataFrame with ``Lemma``, ``Context``, ``Book/Chapter``,
            ``token_count`` and ``SortKey`` columns.
        sort_entries: When true, rows are ordered by ``SortKey`` first.
        color_theme: Name of the Plotly continuous colour scale used for the
            frequency bar chart.
    """
    if data.empty:
        # idxmax() below raises on an empty frame; report it instead.
        st.warning("The selected dataset contains no rows.")
        return

    if sort_entries:
        data = data.sort_values(by='SortKey')

    lemma_stats = data.groupby('Lemma').agg(
        Frequency=('Context', 'count'),
        Average_Tokens=('token_count', 'mean'),
    ).reset_index()

    # Tabs for better organization
    tab1, tab2, tab3 = st.tabs(["📊 Lemma Frequency", "🥧 Frequency Distribution", "📚 Chapter-wise Mentions"])

    with tab1:
        st.subheader("Lemma Frequency in the Dataset")
        fig_bar = px.bar(
            lemma_stats,
            x='Lemma',
            y='Frequency',
            color='Frequency',
            color_continuous_scale=color_theme,
            labels={'Frequency': 'Frequency'},
            title='Lemma Frequency',
            hover_data=['Average_Tokens'],
        )
        fig_bar.update_layout(showlegend=False)
        st.plotly_chart(fig_bar, use_container_width=True)

    with tab2:
        st.subheader("Lemma Frequency Distribution")
        # Limit to top 20 for better readability
        top_lemmas = lemma_stats.nlargest(20, 'Frequency')
        fig_pie = px.pie(
            top_lemmas,
            values='Frequency',
            names='Lemma',
            title='Top 20 Lemmas Frequency Distribution',
            hole=0.4,
            color='Lemma',
            color_discrete_sequence=px.colors.qualitative.Set3,
        )
        st.plotly_chart(fig_pie, use_container_width=True)

    with tab3:
        st.subheader("Chapter-wise Lemma Mentions")
        chapter_stats = data.groupby(['Book/Chapter', 'Lemma']).size().reset_index(name='Count')
        fig_stacked = px.bar(
            chapter_stats,
            x='Book/Chapter',
            y='Count',
            color='Lemma',
            title='Chapter-wise Lemma Mentions',
            labels={'Book/Chapter': 'Book/Chapter', 'Count': 'Mentions'},
            color_discrete_sequence=px.colors.qualitative.Pastel,
        )
        st.plotly_chart(fig_stacked, use_container_width=True)

    st.markdown("---")
    most_common_lemma = lemma_stats.loc[lemma_stats['Frequency'].idxmax()]
    st.success(f"**Most Common Lemma:** {most_common_lemma['Lemma']} (Frequency: {most_common_lemma['Frequency']})")

    with st.expander("🔍 View Contexts"):
        st.markdown("### Contextual Information")
        for _, row in data.iterrows():
            st.markdown(f"**Lemma:** {row['Lemma']} | **Book/Chapter:** {row['Book/Chapter']}")
            st.write(row['Context'])
            st.markdown("---")


def main():
    """Configure the page, collect the sidebar options and render the charts."""
    st.set_page_config(
        page_title="Lemma Frequency Visualization",
        layout="wide",
        initial_sidebar_state="expanded",
    )

    st.title("📚 Lemma Frequency Visualization")

    # Sidebar section
    with st.sidebar:
        st.image(
            "imgs/DiGi_Thrace logo-tall.jpg",
            use_column_width=True,
            caption="Measuring Ancient Thrace: Re-evaluating Antiquity in the Digital Age",
        )
        st.markdown(_DATASET_BLURB)

        csv_options = ["allData.csv", "places.csv", "ethnonyms.csv", "rivers.csv", "mountains.csv", "toponyms.csv"]
        csv_file = st.selectbox("📁 Select CSV file:", csv_options, index=0)
        sort_entries = st.checkbox("Sort Entries by Book/Chapter", value=True)

        st.markdown("---")
        st.markdown("### Customize Visualization")
        color_theme = st.selectbox(
            "Select Color Theme for Charts:",
            ["Viridis", "Cividis", "Plasma", "Magma", "Inferno", "Turbo"],
        )

    data = load_data(csv_file)
    # Forward the chosen theme so the selectbox actually affects the charts.
    visualize_data(data, sort_entries, color_theme=color_theme)


if __name__ == "__main__":
    main()