Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import plotly.express as px | |
| import nltk | |
| from pathlib import Path | |
# Make sure the tokenizer models needed by nltk.word_tokenize are installed.
# Streamlit re-executes this script on every user interaction, so download a
# resource only when it is missing instead of forcing a fresh download on
# each run. Newer NLTK releases (3.9+) load 'punkt_tab'; older ones load
# 'punkt', so both are checked.
for _resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_resource}")
    except LookupError:
        nltk.download(_resource, quiet=True)
def count_tokens(text):
    """Return the number of NLTK word tokens in *text*.

    Non-string values (for example the NaN that pandas produces for an empty
    CSV cell) count as zero tokens instead of raising inside the tokenizer.
    """
    if not isinstance(text, str):
        return 0
    return len(nltk.word_tokenize(text))
def extract_number(entry):
    """Return the number that follows 'plin. nat.' in *entry* as a float.

    Only the first numeric token after the prefix is parsed (digits with at
    most one decimal point), so trailing punctuation, a third dotted
    component or a second reference no longer make the whole value
    unparseable.

    Returns 0.0 when *entry* is not a string, does not contain the prefix,
    or has no number after it.
    """
    import re  # local import: the file's import block does not provide `re`

    if not isinstance(entry, str):
        return 0.0

    prefix = "plin. nat."
    _, found, tail = entry.partition(prefix)
    if not found:
        return 0.0

    # First run of digits with an optional fractional part, or a bare ".5".
    match = re.search(r"\d+(?:\.\d*)?|\.\d+", tail)
    if match is None:
        return 0.0
    try:
        return float(match.group())
    except ValueError:
        return 0.0
def load_data(csv_file):
    """Read *csv_file* and attach the derived columns used by the charts.

    Adds 'token_count' (count_tokens applied to 'Context') and 'SortKey'
    (extract_number applied to 'Book/Chapter'), then returns the DataFrame.
    """
    frame = pd.read_csv(csv_file)

    # (new column, source column, per-cell transform), applied in this order.
    derived_columns = (
        ('token_count', 'Context', count_tokens),
        ('SortKey', 'Book/Chapter', extract_number),
    )
    for target, source, transform in derived_columns:
        frame[target] = frame[source].apply(transform)

    return frame
def visualize_data(data, sort_entries, color_theme="Viridis"):
    """Render the lemma charts, the headline statistic and the context list.

    Args:
        data: DataFrame with 'Lemma', 'Context', 'Book/Chapter',
            'token_count' and 'SortKey' columns.
        sort_entries: When true, rows are ordered by 'SortKey' before
            rendering and the chapter chart's x-axis follows that order.
        color_theme: Name of the Plotly continuous colour scale used for the
            frequency bar chart. Defaults to 'Viridis', the value that was
            previously hard-coded.
    """
    if data.empty or data['Lemma'].isna().all():
        # Without any lemma there is nothing to plot, and idxmax() below
        # would raise on the empty statistics table.
        st.warning("No lemma entries to display for the selected dataset.")
        return

    chapter_order = None
    if sort_entries:
        # mergesort is stable, so rows sharing a SortKey keep their file order.
        data = data.sort_values(by='SortKey', kind='mergesort')
        # groupby() re-sorts its keys, so remember the SortKey order here and
        # hand it to the chapter chart explicitly.
        chapter_order = data['Book/Chapter'].dropna().drop_duplicates().tolist()

    lemma_stats = data.groupby('Lemma').agg(
        Frequency=('Context', 'count'),
        Average_Tokens=('token_count', 'mean'),
    ).reset_index()

    # Tabs for better organization.
    # NOTE(review): the leading emoji in the UI labels of this function were
    # mis-encoded in the source; only the pie glyph could be recovered exactly.
    # Confirm the other glyphs against the original file.
    tab1, tab2, tab3 = st.tabs([
        "📊 Lemma Frequency",
        "🥧 Frequency Distribution",
        "📈 Chapter-wise Mentions",
    ])

    with tab1:
        st.subheader("Lemma Frequency in the Dataset")
        fig_bar = px.bar(
            lemma_stats,
            x='Lemma',
            y='Frequency',
            color='Frequency',
            color_continuous_scale=color_theme,
            labels={'Frequency': 'Frequency'},
            title='Lemma Frequency',
            hover_data=['Average_Tokens'],
        )
        fig_bar.update_layout(showlegend=False)
        st.plotly_chart(fig_bar, use_container_width=True)

    with tab2:
        st.subheader("Lemma Frequency Distribution")
        # Limit to top 20 for better readability.
        top_lemmas = lemma_stats.nlargest(20, 'Frequency')
        fig_pie = px.pie(
            top_lemmas,
            values='Frequency',
            names='Lemma',
            title='Top 20 Lemmas Frequency Distribution',
            hole=0.4,
            color='Lemma',
            color_discrete_sequence=px.colors.qualitative.Set3,
        )
        st.plotly_chart(fig_pie, use_container_width=True)

    with tab3:
        st.subheader("Chapter-wise Lemma Mentions")
        chapter_stats = (
            data.groupby(['Book/Chapter', 'Lemma']).size().reset_index(name='Count')
        )
        fig_stacked = px.bar(
            chapter_stats,
            x='Book/Chapter',
            y='Count',
            color='Lemma',
            title='Chapter-wise Lemma Mentions',
            labels={'Book/Chapter': 'Book/Chapter', 'Count': 'Mentions'},
            color_discrete_sequence=px.colors.qualitative.Pastel,
            # Pin the x-axis to the SortKey order when sorting was requested.
            category_orders=(
                {'Book/Chapter': chapter_order} if chapter_order is not None else None
            ),
        )
        st.plotly_chart(fig_stacked, use_container_width=True)

    st.markdown("---")
    most_common_lemma = lemma_stats.loc[lemma_stats['Frequency'].idxmax()]
    st.success(
        f"**Most Common Lemma:** {most_common_lemma['Lemma']} "
        f"(Frequency: {most_common_lemma['Frequency']})"
    )

    with st.expander("📖 View Contexts"):
        st.markdown("### Contextual Information")
        for _, row in data.iterrows():
            st.markdown(f"**Lemma:** {row['Lemma']} | **Book/Chapter:** {row['Book/Chapter']}")
            st.write(row['Context'])
            st.markdown("---")


def main():
    """Configure the page, build the sidebar controls and render the charts."""
    st.set_page_config(
        page_title="Lemma Frequency Visualization",
        layout="wide",
        initial_sidebar_state="expanded",
    )
    # NOTE(review): the leading emoji in the UI labels of this function were
    # mis-encoded in the source and are best-guess restorations; confirm them
    # against the original file.
    st.title("📚 Lemma Frequency Visualization")

    # Sidebar section
    with st.sidebar:
        st.image(
            "imgs/DiGi_Thrace logo-tall.jpg",
            use_column_width=True,
            caption="Measuring Ancient Thrace: Re-evaluating Antiquity in the Digital Age",
        )
        # The text is kept flush-left: Markdown renders lines indented by four
        # or more spaces as a code block.
        # NOTE(review): the project number was mis-encoded in the source and is
        # restored as Cyrillic "КП-06-Н50/3" — confirm.
        st.markdown("""
### The Dataset:
A curated collection of information on ancient geographical locations, rivers, tribes, and cultural aspects as documented by Pliny the Elder in *Naturalis Historia*. It includes lemmas (base forms of words), contextual information, and references to specific books and chapters from Pliny's work.
The dataset is available on **FigShare**:
https://doi.org/10.6084/m9.figshare.27044578.v1
**Project no. КП-06-Н50/3 from 30.11.2020, financed by BNSF**
""")
        csv_options = [
            "allData.csv",
            "places.csv",
            "ethnonyms.csv",
            "rivers.csv",
            "mountains.csv",
            "toponyms.csv",
        ]
        csv_file = st.selectbox("📂 Select CSV file:", csv_options, index=0)
        sort_entries = st.checkbox("Sort Entries by Book/Chapter", value=True)
        st.markdown("---")
        st.markdown("### Customize Visualization")
        color_theme = st.selectbox(
            "Select Color Theme for Charts:",
            ["Viridis", "Cividis", "Plasma", "Magma", "Inferno", "Turbo"],
        )

    try:
        data = load_data(csv_file)
    except FileNotFoundError:
        # Report a missing dataset in the UI instead of crashing the app.
        st.error(f"Data file not found: {csv_file}")
        return

    # Forward the sidebar choice so the selected colour theme takes effect.
    visualize_data(data, sort_entries, color_theme=color_theme)
# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    main()