Spaces:

bestroi
/

PliniusNatHist

Sleeping

App Files Community

bestroi committed on Dec 17, 2024

Commit

bcd03e9

verified ·

1 Parent(s): d262f97

Update app.py

Browse files

Files changed (1) hide show

app.py +118 -54

app.py CHANGED Viewed

@@ -2,96 +2,160 @@ import streamlit as st
 import pandas as pd
 import plotly.express as px
 import nltk
-# Download punkt tokenizer forcefully to avoid LookupError
-nltk.download('punkt', force=True)
 def count_tokens(text):
-    tokens = nltk.word_tokenize(text)
-    return len(tokens)
 def extract_number(entry):
-    start_index = entry.find("plin. nat.") + len("plin. nat.")
     num_str = ''
     for char in entry[start_index:]:
         if char.isdigit() or char == '.':
             num_str += char
         else:
             break
-    return float(num_str) if num_str else 0.0
-def visualize_data(csv_file, sort_entries=False):
-    data = pd.read_csv(csv_file)
     if sort_entries:
         data['SortKey'] = data['Book/Chapter'].apply(extract_number)
         data = data.sort_values(by='SortKey')
     data['token_count'] = data['Context'].apply(count_tokens)
-    lemma_stats = data.groupby('Lemma').agg({'Context': 'count', 'token_count': 'mean'}).reset_index()
-    st.write("Basic Statistics:")
     st.table(lemma_stats)
     fig_bar = px.bar(
         lemma_stats,
         x='Lemma',
-        y='Context',
         color='Lemma',
-        labels={'Context': 'Frequency'},
         title='Lemma Frequency in the Dataset'
     )
-    st.plotly_chart(fig_bar)
-    lemma_stats_additional = data['Lemma'].value_counts().reset_index()
-    lemma_stats_additional.columns = ['Lemma', 'Frequency']
-    most_common_lemma_additional = lemma_stats_additional.iloc[0]['Lemma']
-    chapter_stats_additional = data.groupby(['Lemma', 'Book/Chapter']).size().unstack(fill_value=0)
     fig_pie = px.pie(
-        lemma_stats_additional,
         values='Frequency',
         names='Lemma',
-        title='Lemma Frequency Distribution'
     )
-    st.plotly_chart(fig_pie)
-    fig_additional = px.bar(
-        chapter_stats_additional,
         barmode='stack',
-        labels={'index': 'Book/Chapter'},
         title='Chapter-wise Lemma Mentions'
     )
-    st.plotly_chart(fig_additional)
-    st.write(f"Most Common Lemma: {most_common_lemma_additional}")
-    with st.expander("Click to view context"):
         for index, row in data.iterrows():
-            st.write(f"Lemma: {row['Lemma']}")
-            st.write(f"Book/Chapter: {row['Book/Chapter']}")
-            st.write(f"Context: {row['Context']}")
-            st.write('-' * 50)
 def main():
     st.title("Lemma Frequency Visualization")
-    # Sidebar section
-    st.sidebar.image("imgs/DiGi_Thrace logo-tall.jpg", use_column_width=True)
-    st.sidebar.markdown("""
-    ### The Dataset:
-    The dataset is a curated collection of information on ancient geographical locations, rivers, tribes, and cultural aspects as documented by Pliny the Elder in *Naturalis Historia*. It includes lemmas (base forms of words), contextual information, and references to specific books and chapters from Pliny's work.
-    _Measuring Ancient Thrace: Re-evaluating Antiquity in Digital Age_
-    **Project no. КП-06-Н50/3 from 30.11.2020, financed by BNSF**
-    """, unsafe_allow_html=True)
-    csv_file = st.sidebar.selectbox("Select CSV file:", ["allData.csv","places.csv","ethnonyms.csv","rivers.csv","mountains.csv","toponyms.csv"])
-    visualize_data(csv_file)
 if __name__ == "__main__":
     main()

 import pandas as pd
 import plotly.express as px
 import nltk
+from nltk.tokenize import word_tokenize
+import os
# Ensure the NLTK tokenizer data used by word_tokenize is available.
# NLTK 3.9 and later load the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so fetch both to work with whichever version is installed.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
 def count_tokens(text):
     """Return the number of tokens word_tokenize finds in text.

     Values that are not strings (for example missing cells) count as 0 tokens.
     """
     if not isinstance(text, str):
         return 0
     return len(word_tokenize(text))
 def extract_number(entry):
     """Return the number that follows "plin. nat." in entry as a float.

     The run of digits and dots directly after the marker is parsed; whitespace
     between the marker and the number is skipped, so both "plin. nat.4.40" and
     "plin. nat. 4.40" yield 4.4.

     Args:
         entry: A 'Book/Chapter' value. Anything that is not a string (for
             example the NaN pandas uses for an empty cell) is treated as
             having no number.

     Returns:
         The parsed float, or 0.0 when the marker is absent, no number follows
         it, or the digit/dot run is not a valid float (e.g. "4.11.40").
     """
     # Missing cells arrive as float NaN, which has no .find(); treat as "no number".
     if not isinstance(entry, str):
         return 0.0
     search_str = "plin. nat."
     start_index = entry.find(search_str)
     if start_index == -1:
         return 0.0
     # Tolerate a space between the marker and the number.
     remainder = entry[start_index + len(search_str):].lstrip()
     collected = []
     for char in remainder:
         if not (char.isdigit() or char == '.'):
             break
         collected.append(char)
     try:
         # float('') and malformed runs such as '4.11.40' both raise ValueError.
         return float(''.join(collected))
     except ValueError:
         return 0.0
def visualize_data(csv_file, sort_entries=False):
    """Load a lemma CSV and render summary statistics and charts with Streamlit.

    Args:
        csv_file: Path to a CSV file containing 'Lemma', 'Context' and
            'Book/Chapter' columns.
        sort_entries: If true, order the rows by the number that
            extract_number parses from 'Book/Chapter' before rendering.

    Returns:
        None. All output is written to the Streamlit page. A missing or
        unreadable file, missing columns, or a file without any lemma is
        reported on the page and ends the function early.
    """
    if not os.path.exists(csv_file):
        st.error(f"The file '{csv_file}' does not exist. Please check the file path.")
        return
    try:
        data = pd.read_csv(csv_file)
    except Exception as e:
        # Report any parse/IO failure on the page instead of crashing the app.
        st.error(f"Error reading '{csv_file}': {e}")
        return
    # Check for necessary columns
    required_columns = {'Book/Chapter', 'Context', 'Lemma'}
    if not required_columns.issubset(data.columns):
        st.error(f"The CSV file must contain the following columns: {required_columns}")
        return
    # Without at least one lemma there is nothing to chart, and idxmax() below
    # would raise on the empty statistics table.
    if data['Lemma'].dropna().empty:
        st.warning(f"The file '{csv_file}' contains no lemma entries to visualize.")
        return
    if sort_entries:
        # mergesort is stable, so rows with equal keys keep their file order.
        data = (
            data.assign(SortKey=data['Book/Chapter'].apply(extract_number))
            .sort_values(by='SortKey', kind='mergesort')
            .drop(columns='SortKey')
        )
    data['token_count'] = data['Context'].apply(count_tokens)
    # Group by 'Lemma' to get frequency and average token count
    lemma_stats = (
        data.groupby('Lemma')
        .agg({'Context': 'count', 'token_count': 'mean'})
        .reset_index()
        .rename(columns={'Context': 'Frequency', 'token_count': 'Average Token Count'})
    )
    st.subheader("Basic Statistics")
    st.table(lemma_stats)
    # Bar Chart: Lemma Frequency
    fig_bar = px.bar(
        lemma_stats,
        x='Lemma',
        y='Frequency',
        color='Lemma',
        labels={'Frequency': 'Frequency'},
        title='Lemma Frequency in the Dataset'
    )
    st.plotly_chart(fig_bar)
    # Pie Chart: Lemma Frequency Distribution
    # To avoid clutter, show the top lemmas and aggregate the rest
    top_n = 10
    top_lemmas = lemma_stats.nlargest(top_n, 'Frequency')[['Lemma', 'Frequency']]
    others = lemma_stats['Frequency'].sum() - top_lemmas['Frequency'].sum()
    pie_frames = [top_lemmas]
    if others > 0:
        # Only add the catch-all slice when something is actually left over.
        pie_frames.append(pd.DataFrame({'Lemma': ['Others'], 'Frequency': [others]}))
    # DataFrame.append was removed in pandas 2.0; concat is the supported way.
    pie_data = pd.concat(pie_frames, ignore_index=True)
    fig_pie = px.pie(
        pie_data,
        values='Frequency',
        names='Lemma',
        title=f'Lemma Frequency Distribution (Top {top_n})'
    )
    st.plotly_chart(fig_pie)
    # Chapter-wise Lemma Mentions
    chapter_stats = data.groupby(['Lemma', 'Book/Chapter']).size().reset_index(name='Count')
    chapter_pivot = chapter_stats.pivot(index='Book/Chapter', columns='Lemma', values='Count').fillna(0)
    fig_chapter = px.bar(
        chapter_pivot,
        barmode='stack',
        labels={'index': 'Book/Chapter', 'value': 'Count'},
        title='Chapter-wise Lemma Mentions'
    )
    st.plotly_chart(fig_chapter)
    # Most Common Lemma
    most_common_lemma = lemma_stats.loc[lemma_stats['Frequency'].idxmax(), 'Lemma']
    st.write(f"**Most Common Lemma:** {most_common_lemma}")
    # Expander to show detailed context
    with st.expander("View Detailed Contexts"):
        for lemma, chapter, context in zip(data['Lemma'], data['Book/Chapter'], data['Context']):
            st.markdown(f"**Lemma:** {lemma}")
            st.markdown(f"**Book/Chapter:** {chapter}")
            st.markdown(f"**Context:** {context}")
            st.markdown("---")
 def main():
     """Build the Streamlit page: title, sidebar controls, and the visualizations.

     The sidebar shows the project logo (or a warning when it is missing), a
     description of the dataset, a CSV selector and a sort checkbox. The chosen
     file and sort option are then passed to visualize_data.
     """
     st.set_page_config(page_title="Lemma Frequency Visualization", layout="wide")
     st.title("Lemma Frequency Visualization")
     # Sidebar configuration
     with st.sidebar:
         # The previous revision loaded the logo from a path with a space in its
         # name; accept either spelling so the logo still shows if the file on
         # disk was not renamed.
         image_candidates = (
             "imgs/DiGi_Thrace_logo-tall.jpg",
             "imgs/DiGi_Thrace logo-tall.jpg",
         )
         image_path = next((path for path in image_candidates if os.path.exists(path)), None)
         if image_path is not None:
             st.image(image_path, use_column_width=True)
         else:
             st.warning(f"Image '{image_candidates[0]}' not found.")
         st.markdown("""
         ### The Dataset:
         The dataset is a curated collection of information on ancient geographical locations, rivers, tribes, and cultural aspects as documented by Pliny the Elder in *Naturalis Historia*. It includes lemmas (base forms of words), contextual information, and references to specific books and chapters from Pliny's work.
         _Measuring Ancient Thrace: Re-evaluating Antiquity in the Digital Age_
         **Project no. КП-06-Н50/3 from 30.11.2020, financed by BNSF**
         """)
         # File selection
         csv_files = ["allData.csv", "places.csv", "ethnonyms.csv", "rivers.csv", "mountains.csv", "toponyms.csv"]
         csv_file = st.selectbox("Select CSV file:", csv_files)
         # Option to sort entries
         sort_entries = st.checkbox("Sort entries based on 'Book/Chapter'")
     # Visualize data based on user selection
     visualize_data(csv_file, sort_entries=sort_entries)
 # Entry point: build the page only when this module is executed as a script,
 # not when it is imported.
 if __name__ == "__main__":
     main()