# MukeshKapoor25's picture
# Update app.py
# d656893 verified
import streamlit as st
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import stopwords
from app_store_scraper import AppStore
from google_play_scraper import Sort, reviews as google_reviews
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
# Make sure the NLTK stop-word corpus is available. Download it only when it
# is missing, so repeated executions of this script do not call
# nltk.download() (and print its progress output) every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)

# English stop words, kept as a set for fast membership tests in clean_text().
stop_words = set(stopwords.words('english'))

# Title for the app
st.title("AppInsights")

# Country selection: display label -> country code passed to the scrapers.
# "All" maps to the sentinel "all", which the fetch code treats specially.
country_options = {
    "All": "all",
    "India": "in",
    "China": "cn",
    "Singapore": "sg",
    "Hong Kong": "hk",
    "Malaysia": "my",
    "UAE": "ae",
    "Thailand": "th",
}

# Platform selection: display label -> internal store identifier.
platform_options = {
    "iOS": "app_store",
    "Android": "google_play",
}

# The select boxes return the chosen *label* (a dictionary key), not the code.
selected_platform = st.selectbox("Select a Platform", list(platform_options))
selected_country = st.selectbox("Select a Country", list(country_options))
# Initialize reviews list; it stays empty when nothing could be fetched.
reviews = []

# Fetch reviews for the selected platform / country.
if selected_platform == "iOS":
    if selected_country == "All":
        # Query every listed country separately and concatenate the results.
        for country_code in country_options.values():
            if country_code == "all":
                # Sentinel entry for the "All" label, not a real country code.
                continue
            app = AppStore(country=country_code, app_name="Straight2Bank", app_id=1169270682)
            app.review(how_many=1000)  # Fetch reviews
            if app.reviews:  # Check if reviews are fetched
                reviews += app.reviews
    else:
        app = AppStore(country=country_options[selected_country], app_name="Straight2Bank", app_id=1169270682)
        app.review(how_many=2000)
        reviews = app.reviews if app.reviews else []
elif selected_platform == "Android":
    # selected_country holds the display label, so compare against "All"
    # (the original compared against the code "all" and never matched).
    if selected_country == "All":
        reviews, _ = google_reviews("com.sc.s2b.ng.mobile", sort=Sort.NEWEST, count=1000)
    else:
        # The values in country_options are country codes, so they belong in
        # the scraper's `country` argument rather than `lang`.
        reviews, _ = google_reviews(
            "com.sc.s2b.ng.mobile",
            country=country_options[selected_country],
            sort=Sort.NEWEST,
            count=1000,
        )
# Function to clean text
def clean_text(text):
    """Normalize a review body for vectorization.

    Lower-cases the text, removes every character that is neither
    alphanumeric nor whitespace, and drops English stop words.

    Args:
        text: The raw review text. Non-string values (for example ``None``
            or ``NaN`` for a review without a body) are treated as empty.

    Returns:
        The cleaned text as a single space-separated string.
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    kept_chars = ''.join(char for char in lowered if char.isalnum() or char.isspace())
    return ' '.join(word for word in kept_chars.split() if word not in stop_words)


# Convert reviews into a DataFrame only if there are reviews
if reviews:
    if selected_platform == "iOS":
        df = pd.DataFrame(reviews)
    else:  # Android
        df = pd.DataFrame(reviews, columns=["reviewId", "userName", "userImage", "content", "score",
                                            "thumbsUpCount", "reviewCreatedVersion", "at", "replyContent",
                                            "repliedAt", "appVersion"])
        # Convert 'at' to datetime
        df['date'] = pd.to_datetime(df['at'])

    # Use the appropriate content column for cleaning and analysis
    content_col = 'content' if selected_platform == "Android" else 'review'
    rating_col = 'score' if selected_platform == "Android" else 'rating'

    # Apply the clean function to the review content
    df['cleaned_content'] = df[content_col].apply(clean_text)

    # Define the sentiment based on ratings: 1 (positive) for >= 4, else 0
    df['sentiment'] = df[rating_col].apply(lambda x: 1 if x >= 4 else 0)

    # Split data into features and labels
    X = df['cleaned_content']
    y = df['sentiment']

    # Vectorize, split and train. scikit-learn raises ValueError when the
    # vocabulary is empty or there are too few samples to split; that must
    # not prevent the tables and chart below from rendering.
    vectorizer = TfidfVectorizer(max_features=1000)
    classifier = MultinomialNB()
    try:
        X_vectorized = vectorizer.fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(
            X_vectorized, y, test_size=0.3, random_state=42
        )
        classifier.fit(X_train, y_train)
    except ValueError as exc:
        st.warning(f"Sentiment model could not be trained: {exc}")

    # Top 5 positive and negative feedback
    top_positive = df[df['sentiment'] == 1].nlargest(5, rating_col)[[content_col, rating_col]]
    top_negative = df[df['sentiment'] == 0].nsmallest(5, rating_col)[[content_col, rating_col]]
    st.subheader("Top 5 Positive Feedback")
    st.write(top_positive)
    st.subheader("Top 5 Negative Feedback")
    st.write(top_negative)

    # Trend Analysis for the last 6 months
    if 'date' in df.columns:
        df['date'] = pd.to_datetime(df['date'])

        # Filter for the last 6 months; copy so that adding the 'month'
        # column below does not write into a view of the original frame.
        six_months_ago = pd.Timestamp.now() - pd.DateOffset(months=6)
        df = df[df['date'] >= six_months_ago].copy()

        if df.empty:
            st.write("No reviews found in the last 6 months.")
        else:
            # Create a month-year column for grouping
            df['month'] = df['date'].dt.to_period('M').astype(str)

            # Count positive and negative reviews per month
            trend_data = df.groupby(['month', 'sentiment']).size().unstack(fill_value=0).reset_index()

            # Prepare data for Bokeh; a sentiment class that never occurs has
            # no column after unstack(), so fall back to zeros for it.
            zeros = [0] * len(trend_data)
            source = ColumnDataSource(data={
                'month': trend_data['month'],
                'positive': trend_data.get(1, zeros),
                'negative': trend_data.get(0, zeros),
            })

            # Create a Bokeh plot. `height` replaces the `plot_height`
            # keyword, which newer Bokeh releases no longer accept.
            p = figure(title="Review Trend Over Time", x_axis_label='Month', y_axis_label='Number of Reviews',
                       x_range=trend_data['month'].tolist(), height=400)

            # Plotting lines for positive and negative reviews
            p.line(x='month', y='positive', source=source, line_width=2, color='green',
                   legend_label='Positive Reviews', line_dash='solid')
            p.line(x='month', y='negative', source=source, line_width=2, color='red',
                   legend_label='Negative Reviews', line_dash='solid')

            # Formatting x-axis to show months
            p.xaxis.major_label_orientation = "vertical"

            # Show Bokeh plot in Streamlit
            st.bokeh_chart(p)
    else:
        st.write("No date column found in the reviews.")
else:
    st.write("No reviews found.")