import re
from pathlib import Path

import nltk
import pandas as pd
import plotly.express as px
import streamlit as st

# Download the NLTK "punkt" tokenizer data at import time so that
# nltk.word_tokenize (used by count_tokens) does not raise LookupError.
# force=True requests the download even if the data is already present;
# quiet=True suppresses the downloader's console output.
# NOTE(review): this statement runs every time the module is executed, so the
# forced download is presumably repeated on each Streamlit rerun — confirm
# whether force=True is still required.
nltk.download('punkt', quiet=True, force=True)

def count_tokens(text):
    """Return the number of NLTK word tokens in *text*.

    Args:
        text: The string to tokenize. Non-string values (for example the
            float NaN that pandas uses for empty CSV cells) and blank
            strings are treated as containing no tokens.

    Returns:
        The token count as an int; 0 for missing or blank input.
    """
    # Guard before tokenizing: the tokenizer expects text, so a missing cell
    # (NaN/None) would otherwise abort the whole column computation.
    if not isinstance(text, str) or not text.strip():
        return 0
    return len(nltk.word_tokenize(text))

def extract_number(entry):
    """Return the first number that follows "plin. nat." in *entry* as a float.

    The prefix is matched case-insensitively. Only the first numeric token
    after the prefix (digits with an optional single decimal part) is used,
    so extra reference levels or trailing text such as "4.11.40" or
    "4.11 (cf. 5)" yield 4.11 instead of corrupting the value.

    Args:
        entry: The book/chapter reference text. Non-string values (e.g. NaN
            from an empty CSV cell) are accepted.

    Returns:
        The parsed number, or 0.0 when *entry* is not a string, the prefix is
        absent, or no number follows the prefix.
    """
    # NOTE(review): the value is parsed as a plain decimal, so "4.11" sorts
    # before "4.2"; if the part after the dot is a chapter number this
    # ordering may not be the intended one — confirm against the data.
    if not isinstance(entry, str):
        return 0.0

    prefix = "plin. nat."
    prefix_match = re.search(re.escape(prefix), entry, flags=re.IGNORECASE)
    if prefix_match is None:
        return 0.0

    # Take only the first "digits[.digits]" token after the prefix.
    number_match = re.search(r"[0-9]+(?:\.[0-9]+)?", entry[prefix_match.end():])
    if number_match is None:
        return 0.0
    return float(number_match.group())

@st.cache_data
def load_data(csv_file):
    """Read *csv_file* and return it with two derived columns added.

    Args:
        csv_file: Path (or file-like object) handed to pandas.read_csv. The
            file must provide the 'Context' and 'Book/Chapter' columns.

    Returns:
        A DataFrame with the CSV contents plus:
        'token_count' — count_tokens applied to each 'Context' value;
        'SortKey' — extract_number applied to each 'Book/Chapter' value.
    """
    frame = pd.read_csv(csv_file)
    token_counts = frame['Context'].apply(count_tokens)
    sort_keys = frame['Book/Chapter'].apply(extract_number)
    return frame.assign(token_count=token_counts, SortKey=sort_keys)

def visualize_data(data, sort_entries, color_theme="Viridis"):
    """Render the lemma charts, the most-common-lemma summary and the contexts.

    Args:
        data: DataFrame with the columns 'Lemma', 'Context', 'Book/Chapter',
            'token_count' and 'SortKey' (the shape returned by load_data).
        sort_entries: When true, rows are ordered by 'SortKey'; the context
            list and the x-axis of the chapter-wise chart follow that order.
        color_theme: Name of the Plotly continuous colour scale applied to
            the lemma frequency bar chart. Defaults to "Viridis".
    """
    # Nothing can be aggregated without at least one lemma; bail out early
    # instead of failing later when looking up the most common lemma.
    if not data['Lemma'].notna().any():
        st.warning("The selected dataset contains no lemma entries to visualize.")
        return

    chapter_order = None
    if sort_entries:
        # A stable sort keeps the file order for rows sharing a sort key.
        data = data.sort_values(by='SortKey', kind='mergesort')
        # Remember the sorted chapter sequence: the groupby below re-sorts its
        # keys alphabetically, so the order must be handed to Plotly explicitly.
        chapter_order = {
            'Book/Chapter': data['Book/Chapter'].dropna().drop_duplicates().tolist()
        }

    lemma_stats = data.groupby('Lemma').agg(
        Frequency=('Context', 'count'),
        Average_Tokens=('token_count', 'mean')
    ).reset_index()

    # Tabs for better organization
    tab1, tab2, tab3 = st.tabs(["📊 Lemma Frequency", "🥧 Frequency Distribution", "📚 Chapter-wise Mentions"])

    with tab1:
        st.subheader("Lemma Frequency in the Dataset")
        fig_bar = px.bar(
            lemma_stats,
            x='Lemma',
            y='Frequency',
            color='Frequency',
            color_continuous_scale=color_theme,
            labels={'Frequency': 'Frequency'},
            title='Lemma Frequency',
            hover_data=['Average_Tokens']
        )
        fig_bar.update_layout(showlegend=False)
        st.plotly_chart(fig_bar, use_container_width=True)

    with tab2:
        st.subheader("Lemma Frequency Distribution")
        # Limit to top 20 for better readability
        top_lemmas = lemma_stats.nlargest(20, 'Frequency')
        fig_pie = px.pie(
            top_lemmas,
            values='Frequency',
            names='Lemma',
            title='Top 20 Lemmas Frequency Distribution',
            hole=0.4,
            color='Lemma',
            color_discrete_sequence=px.colors.qualitative.Set3
        )
        st.plotly_chart(fig_pie, use_container_width=True)

    with tab3:
        st.subheader("Chapter-wise Lemma Mentions")
        chapter_stats = data.groupby(['Book/Chapter', 'Lemma']).size().reset_index(name='Count')
        fig_stacked = px.bar(
            chapter_stats,
            x='Book/Chapter',
            y='Count',
            color='Lemma',
            title='Chapter-wise Lemma Mentions',
            labels={'Book/Chapter': 'Book/Chapter', 'Count': 'Mentions'},
            color_discrete_sequence=px.colors.qualitative.Pastel,
            # None keeps Plotly's default ordering when sorting is disabled.
            category_orders=chapter_order
        )
        st.plotly_chart(fig_stacked, use_container_width=True)

    st.markdown("---")
    most_common_lemma = lemma_stats.loc[lemma_stats['Frequency'].idxmax()]
    st.success(f"**Most Common Lemma:** {most_common_lemma['Lemma']} (Frequency: {most_common_lemma['Frequency']})")

    with st.expander("🔍 View Contexts"):
        st.markdown("### Contextual Information")
        # Iterate only the three displayed columns rather than building a
        # Series per row with iterrows().
        for lemma, chapter, context in zip(data['Lemma'], data['Book/Chapter'], data['Context']):
            st.markdown(f"**Lemma:** {lemma} | **Book/Chapter:** {chapter}")
            st.write(context)
            st.markdown("---")

def main():
    """Configure the page, build the sidebar controls and render the dataset.

    The sidebar selects the CSV file, whether entries are sorted by
    book/chapter, and the colour theme; all three choices are forwarded to
    load_data / visualize_data.
    """
    st.set_page_config(
        page_title="Lemma Frequency Visualization",
        layout="wide",
        initial_sidebar_state="expanded",
    )
    st.title("📚 Lemma Frequency Visualization")

    # Sidebar section
    with st.sidebar:
        st.image("imgs/DiGi_Thrace logo-tall.jpg", use_column_width=True, caption="Measuring Ancient Thrace: Re-evaluating Antiquity in the Digital Age")
        st.markdown("""
        ### The Dataset:
        A curated collection of information on ancient geographical locations, rivers, tribes, and cultural aspects as documented by Pliny the Elder in *Naturalis Historia*. It includes lemmas (base forms of words), contextual information, and references to specific books and chapters from Pliny's work.
                        
        The dataset is available on **FigShare**: 
        https://doi.org/10.6084/m9.figshare.27044578.v1
        
        **Project no. КП-06-Н50/3 from 30.11.2020, financed by BNSF**
        """)

        csv_options = ["allData.csv", "places.csv", "ethnonyms.csv", "rivers.csv", "mountains.csv", "toponyms.csv"]
        csv_file = st.selectbox("📁 Select CSV file:", csv_options, index=0)

        sort_entries = st.checkbox("Sort Entries by Book/Chapter", value=True)

        st.markdown("---")
        st.markdown("### Customize Visualization")
        color_theme = st.selectbox("Select Color Theme for Charts:", ["Viridis", "Cividis", "Plasma", "Magma", "Inferno", "Turbo"])

    data = load_data(csv_file)
    # Forward the chosen theme so the sidebar selection actually takes effect.
    visualize_data(data, sort_entries, color_theme=color_theme)

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()