File size: 3,445 Bytes
57b215d
 
 
 
 
 
60be2ab
57b215d
 
60be2ab
 
 
 
 
 
57b215d
 
 
60be2ab
 
57b215d
 
 
 
 
 
60be2ab
57b215d
 
 
 
 
 
 
 
60be2ab
57b215d
 
 
60be2ab
57b215d
 
 
60be2ab
57b215d
 
60be2ab
 
 
 
 
 
 
57b215d
 
 
 
 
 
60be2ab
57b215d
 
 
 
 
60be2ab
57b215d
 
60be2ab
57b215d
60be2ab
57b215d
60be2ab
 
57b215d
 
4a9a3c2
57b215d
4a9a3c2
57b215d
 
 
4a9a3c2
57b215d
4a9a3c2
57b215d
60be2ab
 
4a9a3c2
60be2ab
4a9a3c2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from PIL import Image
import os

# =============================================
# Base directory (works in container)
# Resolved to an absolute path so that the CSV/image lookups built from it
# do not depend on the current working directory when __file__ is relative.
# =============================================
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# =============================================
# Cache dataset to avoid reload every time
# =============================================
@st.cache_data
def load_data():
    """Read ``singapore_airlines_reviews.csv`` from BASE_DIR into a DataFrame.

    Wrapped in ``st.cache_data`` so the CSV is not re-read on every call.

    Returns:
        The parsed CSV as a pandas DataFrame.
    """
    return pd.read_csv(
        os.path.join(BASE_DIR, 'singapore_airlines_reviews.csv')
    )

# Load the dataset once at import time into a module-level DataFrame;
# run() below reads it for the preview and the plots.
df = load_data()

# =============================================
# Main EDA function
# =============================================
def _show_sentiment_images(title, file_suffix=""):
    """Show the Negative/Positive image pair for *title* in two columns.

    For each sentiment the caption is ``"<Sentiment> - <title>"`` and the file
    displayed is ``BASE_DIR/<caption><file_suffix>.png``.

    Args:
        title: Shared part of the caption and file name, e.g. ``"Wordcloud"``.
        file_suffix: Text appended to the caption to form the file stem
            (e.g. ``" 10"``); it is not shown in the caption.
    """
    for column, sentiment in zip(st.columns(2), ("Negative", "Positive")):
        caption = f"{sentiment} - {title}"
        with column:
            st.image(
                os.path.join(BASE_DIR, f"{caption}{file_suffix}.png"),
                caption=caption,
            )


def run():
    """Render the Exploratory Data Analysis page.

    Reads the module-level ``df`` (without modifying it) and draws, in order:
    a dataset preview, a count plot of the ``rating`` column, a histogram of
    per-review word counts from the ``text`` column, and the pre-rendered
    word-cloud and topic-modeling PNGs located in ``BASE_DIR``.
    """
    st.title("ACRE - Automated Customer Review Analysis")
    st.subheader("Exploratory Data Analysis (EDA)")

    st.markdown(
        """

        This section provides an exploratory data analysis (EDA) of Singapore Airlines (SQ) customer reviews.  

        We aim to understand the distribution of ratings, textual review characteristics, and topic modeling results.

        """
    )

    # Dataset preview
    st.write("### Dataset Preview")
    st.dataframe(df.head())

    # Distribution of ratings
    st.write("### Distribution of Ratings")
    rating_fig, ax = plt.subplots(figsize=(8, 5))
    sns.countplot(
        x='rating',
        data=df,
        palette='viridis',
        ax=ax,
        order=sorted(df['rating'].unique())
    )
    for bar in ax.patches:
        count = bar.get_height()
        # Bar heights come back as floats; skip anything that is not a real
        # number so the int conversion below cannot raise.
        if pd.isna(count):
            continue
        # Format as an integer so labels read "1,234" rather than "1,234.0".
        ax.annotate(f'{int(count):,}', (bar.get_x() + bar.get_width() / 2, count),
                    ha='center', va='bottom', fontsize=10, fontweight='bold')
    st.pyplot(rating_fig)
    # Figures made via plt.subplots stay registered with pyplot until closed;
    # release this one so repeated renders do not accumulate open figures.
    plt.close(rating_fig)

    # Distribution of review length
    st.write("### Distribution of Review Length")
    # Keep the word counts in their own frame instead of adding a column to
    # the shared module-level df, which would otherwise leak into the
    # "Dataset Preview" on later renders.
    review_lengths = pd.DataFrame(
        {'text_length': df['text'].apply(lambda x: len(str(x).split()))}
    )
    length_fig = px.histogram(review_lengths, x='text_length', nbins=50,
                              title='Review Length Distribution')
    st.plotly_chart(length_fig, use_container_width=True)

    # Wordclouds
    _show_sentiment_images("Wordcloud")

    # Topic Modeling Results
    st.write("## Topic Modeling Results")
    for topic_chart in (
        "Top Words Distributions",
        "Topic Activities Over Time",
        "Topics Weights",
    ):
        _show_sentiment_images(topic_chart, file_suffix=" 10")