Spaces:
Sleeping
Sleeping
| import matplotlib.pyplot as plt | |
| import plotly.express as px | |
| import pandas as pd | |
| import gradio as gr | |
| from google_play_scraper import Sort, reviews, app | |
| from datetime import datetime, timedelta | |
| import io | |
| import google.generativeai as genai | |
| import re | |
| from nltk.corpus import stopwords | |
| from Sastrawi.Stemmer.StemmerFactory import StemmerFactory | |
| from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory | |
| import pickle | |
| import nltk | |
| nltk.download('stopwords') | |
# Destination paths for scraped data and the pre-trained artifacts.
destination_file_1y_ex3 = 'data/app_reviews_1y_ex3.csv'
model_file = 'model/best_model.pkl'
vectorizer_file = 'model/vectorizer.pkl'

# Global Gemini credentials, updated at runtime from the "API Settings" tab
# (see update_api_credentials).
api_key = None
model_name = "gemini-2.0-flash"  # Default model name

# Load the pre-trained sentiment classifier and its matching vectorizer.
# NOTE(review): pickle.load executes arbitrary code — only load trusted files.
with open(model_file, 'rb') as file:
    best_model = pickle.load(file)
with open(vectorizer_file, 'rb') as file:
    vectorizer = pickle.load(file)

# Cache the NLTK Indonesian stop-word list once at import time.
indonesian_stopwords = stopwords.words('indonesian')

# Sastrawi stemmer for Indonesian text.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Sastrawi stop-word remover (complements the NLTK list above).
stopword_factory = StopWordRemoverFactory()
stopword_remover = stopword_factory.create_stop_word_remover()
def preprocess_text(text):
    """Normalize an Indonesian review for the sentiment model.

    Pipeline: lowercase, strip punctuation/special characters, collapse
    whitespace, Sastrawi stemming, Sastrawi stop-word removal, then a final
    filter against the NLTK Indonesian stop-word list.

    Returns an empty string when *text* is None.
    """
    if text is None:
        return ""
    # Lowercase and drop every character outside word chars / whitespace.
    cleaned = re.sub(r'[^\w\s\d]+', '', text.lower())
    # Collapse runs of whitespace and trim the ends.
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    # Indonesian stemming, then Sastrawi's stop-word pass.
    stemmed = stemmer.stem(cleaned)
    without_stopwords = stopword_remover.remove(stemmed)
    # Second stop-word pass using the cached NLTK list.
    kept = [token for token in without_stopwords.split()
            if token not in indonesian_stopwords]
    return " ".join(kept)
def predict_sentiment(text):
    """Classify one review with the pickled model and return its label."""
    # Normalize the raw review text first.
    cleaned = preprocess_text(text)
    # Vectorize with the same fitted vectorizer the model was trained on.
    features = vectorizer.transform([cleaned])
    # predict() returns an array; the single prediction is element 0.
    return best_model.predict(features)[0]
# Sentiment labeling via the Gemini API.
def label_sentiment_with_gemini(text, api_key, model_name):
    """Label the sentiment of *text* using Gemini.

    Returns one of 'puas', 'tidak puas', 'netral'; falls back to "netral"
    on any API error (including rate limiting).
    """
    prompt = f"""Klasifikasikan sentimen ulasan berikut menjadi: '1.puas', '2.tidak puas', '3.netral'.
Perhatikan sarkasme dan sindiran, atau ekspresi negatif/positif halus, serta bahasa yang digunakan.
**Ulasan:** {text}
**Tampilkan hanya Sentimen**
"""
    try:
        genai.configure(api_key=api_key)  # Configure the Gemini API per call
        model = genai.GenerativeModel(model_name)
        response = model.generate_content(prompt)
        generated_content = response.text.strip().lower()
        # Drop all spaces so e.g. "2. tidak puas" matches "2.tidakpuas" below.
        generated_content = re.sub(' ', '', generated_content)
        if "1.puas" in generated_content:
            return "puas"
        elif "2.tidakpuas" in generated_content:
            return "tidak puas"
        else:
            return "netral"
    except Exception as e:
        # BUG FIX: the original caught genai.errors.ResourceExhaustedError,
        # but google.generativeai exposes no `errors` attribute, so merely
        # evaluating that except clause raised AttributeError and masked the
        # real error. Detect rate-limit errors by class name instead
        # (google.api_core.exceptions.ResourceExhausted).
        if type(e).__name__ in ("ResourceExhausted", "ResourceExhaustedError"):
            print("Error: Rate limit exceeded. Please try again later.")
        else:
            print(f"An unexpected error occurred: {e}")
        return "netral"  # Safe default when the API call fails
def predict_and_label(text):
    """Predict sentiment with both the local model and Gemini.

    Returns:
        (model_prediction, gemini_label) as two plain strings.

    Raises:
        gr.Error: with a user-facing message on invalid input or failure.
    """
    try:
        if not text:  # Empty review box
            raise gr.Error("Please enter a review.")
        if not api_key:  # API key not configured yet
            raise gr.Error("Please enter your correct API_KEY on API Settings.")
        # BUG FIX: the original tested `model_file` (the pickle path constant,
        # which is always truthy) instead of the configured Gemini model name.
        if not model_name:
            raise gr.Error("Please enter your correct MODEL_NAME on API Settings.")
        prediction = predict_sentiment(text)
        # Convert the numpy scalar (np.str_) to a plain Python str.
        prediction = prediction.item()
        label_gemini = label_sentiment_with_gemini(text, api_key, model_name)
        return prediction, label_gemini
    except gr.Error:
        # Let the validation errors above propagate unchanged instead of
        # being re-wrapped by the catch-all handlers below.
        raise
    except (ValueError, TypeError, AttributeError) as e:
        # Data-type / empty-input / unexpected-value problems.
        raise gr.Error(f"Error processing input: {type(e).__name__}. Please check your input.")
    except Exception as e:
        # BUG FIX: the original's `except genai.errors.ResourceExhaustedError`
        # clause raised AttributeError when evaluated (no such attribute);
        # detect rate limiting by class name inside the catch-all instead.
        if type(e).__name__ in ("ResourceExhausted", "ResourceExhaustedError"):
            raise gr.Error("Error: Rate limit exceeded for Gemini API/You forgot to update API_KEY. Please try again later.")
        raise gr.Error(f"An unexpected error occurred: {type(e).__name__}. Please try again later.")
def update_api_credentials(new_api_key, new_model_name):
    """Store Gemini credentials globally and verify them with a test call.

    Returns:
        A single status string for the Gradio "Status" textbox.
    """
    global api_key, model_name  # Update the module-level credentials
    api_key = str(new_api_key)
    model_name = str(new_model_name)
    # Probe the API so a bad key/model is reported immediately.
    try:
        genai.configure(api_key=api_key)  # Configure the Gemini API per call
        model = genai.GenerativeModel(model_name)
        response = model.generate_content("Test API Connection.Just say Yes if successfull")
        generated_content = response.text.strip().lower()
    except Exception as e:
        # BUG FIX: the original referenced genai.errors.ResourceExhaustedError,
        # which does not exist on google.generativeai and itself raised when
        # evaluated; check the exception class name instead.
        if type(e).__name__ in ("ResourceExhausted", "ResourceExhaustedError"):
            print("Error: Rate limit exceeded. Please try again later.")
            return "Error: Rate limit exceeded. Please try again later."
        print(f"An unexpected error occurred: {e}")
        return "An unexpected error occurred. Please try again later."
    # SECURITY FIX: never log the full secret key; show only its tail.
    print(f"API Key: ...{api_key[-4:]}")
    print(f"Model Name: {model_name}")
    # BUG FIX: the original returned a 2-tuple here but a single string on the
    # error paths, while the UI binds exactly one Status textbox — return one
    # consistent string.
    return f"{generated_content} - API credentials updated successfully!"
def scrape_and_show_data():
    """Scrape recent Play Store reviews for a fixed set of e-wallet apps,
    keep the 10 newest from the last year, and label each with both the
    local model and Gemini.

    Returns:
        pd.DataFrame with columns: title, date, content, score, rating,
        predict_model, predict_gemini (sorted newest first).

    Raises:
        gr.Error: on any scraping or processing failure.
    """
    try:
        # Hard-coded target app package IDs.
        app_packages = [
            'id.dana',  # Dana
            'com.shopeepay.id',  # Shopeepay
            'com.gojek.gopay',  # Gopay
            'ovo.id',  # Ovo
        ]
        language = 'id'
        country = 'id'
        app_reviews = []
        current_date = datetime.now()
        one_year_ago = current_date - timedelta(days=365)
        for ap in app_packages:
            for score in [1, 2, 3, 4, 5]:  # Fetch every star rating (1-5)
                rvs, _ = reviews(
                    ap,
                    lang=language,
                    country=country,
                    sort=Sort.NEWEST,  # Only fetch the newest reviews
                    count=10,  # Reviews scraped per (app, score) pair
                    filter_score_with=score
                )
                # Keep only reviews from the last year.
                for r in rvs:
                    # Truncate the timestamp to the day before comparing.
                    review_date = datetime.strptime(r['at'].strftime("%Y-%m-%d"), "%Y-%m-%d")
                    if review_date >= one_year_ago:
                        r['sortOrder'] = 'newest'  # Record the sort order used
                        r['appId'] = ap  # Tag the review with its app package
                        app_reviews.append(r)
        df = pd.DataFrame(app_reviews)
        # Derive a label from the star score: 4-5 puas, 3 netral, 1-2 tidak puas.
        def label_sentiment(score):
            if score >= 4:
                return 'puas'
            elif score < 3:
                return 'tidak_puas'
            else:
                return 'netral'
        df['rating'] = df['score'].apply(label_sentiment)
        # Fetch app metadata so each review row can carry the app title.
        app_infos = []
        for ap in app_packages:
            info = app(ap, lang=language, country=country)
            del info['comments']  # Drop the bulky comments payload
            app_infos.append(info)
        app_infos_df = pd.DataFrame(app_infos)
        df = pd.merge(df, app_infos_df[['appId', 'title']], on='appId', how='left')
        # Keep only the 10 most recent reviews overall.
        df = df.sort_values(by='at', ascending=False).head(10)
        # predict_and_label returns (model_label, gemini_label) per review;
        # unzip the pairs into two new columns.
        df['predict_model'], df['predict_gemini'] = zip(*df['content'].apply(predict_and_label))
        # Keep the display columns and rename 'at' to 'date'.
        df = df[['title','at', 'content', 'score', 'rating','predict_model','predict_gemini']].rename(columns={'at': 'date'})
        return df
    except Exception as e:
        raise gr.Error(f"Error scraping data: {type(e).__name__}. Please check your app package names and connection.")
def scrape_and_download_data(app_packages, language, country, sort, score, start_date, end_date, count):
    """Scrape Play Store reviews matching the user's criteria from the UI.

    Args:
        app_packages: package id string or list of ids.
        language, country: Play Store locale codes.
        sort: 'NEWEST' or anything else for MOST_RELEVANT.
        score: selected star rating(s) — a list from the CheckboxGroup,
            or a single value.
        start_date, end_date: 'YYYY-MM-DD' strings or datetimes (inclusive).
        count: number of reviews to fetch per (app, score) pair.

    Returns:
        pd.DataFrame of matching reviews (empty if nothing matched),
        sorted newest first.

    Raises:
        gr.Error: on any scraping or processing failure.
    """
    try:
        app_reviews = []
        # Accept a single package id as well as a list.
        if isinstance(app_packages, str):
            app_packages = [app_packages]
        # Convert date strings to datetime objects (if needed).
        if isinstance(start_date, str):
            start_date = datetime.strptime(start_date, "%Y-%m-%d")
        if isinstance(end_date, str):
            end_date = datetime.strptime(end_date, "%Y-%m-%d")
        # BUG FIX: the original iterated `str(score)`, so a CheckboxGroup
        # value like [1, 2] became the characters '[', '1', ',', ' ', ...
        # and invalid scores were passed to the scraper. Iterate the
        # actual selected scores instead.
        scores = score if isinstance(score, (list, tuple)) else [score]
        for ap in app_packages:
            for scr in scores:
                rvs, _ = reviews(
                    ap,
                    lang=str(language),  # Normalize to string
                    country=str(country),  # Normalize to string
                    sort=Sort.NEWEST if str(sort) == 'NEWEST' else Sort.MOST_RELEVANT,
                    count=int(count),
                    filter_score_with=int(scr),
                )
                # Keep only reviews inside the requested date range.
                for r in rvs:
                    # Truncate the timestamp to the day before comparing.
                    review_date = datetime.strptime(r['at'].strftime("%Y-%m-%d"), "%Y-%m-%d")
                    if start_date <= review_date <= end_date:
                        r['sortOrder'] = sort
                        r['appId'] = ap
                        app_reviews.append(r)
        df = pd.DataFrame(app_reviews)
        if df.empty:
            # Nothing matched — return the empty frame so the UI shows it.
            print("DataFrame is empty. No reviews found for the specified criteria.")
            return df
        # Attach app titles from the Play Store app info.
        app_infos = []
        for ap in app_packages:
            info = app(ap, lang=language, country=country)
            del info['comments']  # Drop the bulky comments payload
            app_infos.append(info)
        app_infos_df = pd.DataFrame(app_infos)
        df = pd.merge(df, app_infos_df[['appId', 'title']], on='appId', how='left')
        # Score-derived label: 4-5 puas, 3 netral, 1-2 tidak puas.
        def label_sentiment(score):
            if score >= 4:
                return 'puas'
            elif score < 3:
                return 'tidak_puas'
            else:
                return 'netral'
        df['rating'] = df['score'].apply(label_sentiment)
        # Keep the export columns and rename 'at' to 'date'.
        df = df[['title','at', 'sortOrder', 'reviewId', 'userName', 'userImage', 'content', 'score', 'thumbsUpCount', 'replyContent', 'repliedAt', 'rating']].rename(columns={'at': 'date'})
        df = df.sort_values(by='date', ascending=False)
        return df
    except Exception as e:
        raise gr.Error(f"Error scraping or processing data: {type(e).__name__}. Please check your inputs and connection.")
| # def create_charts(): | |
| # # 1. Rating Distribution Pie Chart | |
| # df = scrape_and_show_data() | |
| # rating_counts = df['rating'].value_counts() | |
| # # Create the pie chart using Matplotlib | |
| # fig_pie, ax_pie = plt.subplots() | |
| # ax_pie.pie(rating_counts, labels=rating_counts.index, autopct='%1.1f%%', startangle=90) | |
| # ax_pie.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle. | |
| # plt.title("Rating Distribution") | |
| # # Convert to gradio plot | |
| # rating_pie_chart = gr.Plot(value=fig_pie) # Using gr.Plot | |
| # # 2. Daily Reviews Line Chart | |
| # daily_reviews = df.groupby('date').size().reset_index(name='total_reviews') | |
| # # Create line chart using Plotly | |
| # fig_line = px.line(daily_reviews, x='date', y='total_reviews', title='Total Reviews per Day') | |
| # fig_line.update_traces(mode='markers+lines') | |
| # # Convert to gradio plot | |
| # daily_reviews_chart = gr.Plot(value=fig_line) # Using gr.Plot | |
| # return rating_pie_chart, daily_reviews_chart # Return both gradio plots | |
# ---- Gradio UI -------------------------------------------------------------
with gr.Blocks() as apps:
    with gr.Tabs():
        with gr.TabItem("Prediction Existing Data"):
            # Single-review sentiment prediction (local model + Gemini).
            gr.Interface(
                fn=predict_and_label,
                inputs=[
                    gr.Textbox(lines=5, label="Masukkan Ulasan"),
                ],
                outputs=[
                    gr.Textbox(label="Prediksi Model",info="Prediksi Model Sentiment"),
                    gr.Textbox(label="Prediksi Gemini",info="Prediksi Gemini Sentiment"),
                ],
                title="Prediksi Sentimen dari Ulasan Aplikasi di Google Play Store",
                description="Masukkan ulasan Anda untuk memprediksi sentimen.",
                api_name="prediksi_sentimen"
            )
            # Table of the latest scraped-and-labeled reviews.
            gr.Interface(
                fn=scrape_and_show_data,
                inputs=None,
                outputs=gr.Dataframe(label="Cleaned Reviews DataFrame",wrap=True),
                description="Displaying the Latest the Data:",
                api_name="prediksi_sentimen_latest"
            )
            # gr.Interface(
            #     fn=create_charts,
            #     inputs=None,
            #     outputs=[
            #         gr.Plot(label="Rating Distribution"),
            #         gr.Plot(label="Daily Reviews"),
            #     ],
            #     description="Displaying Charts:",
            # )
        with gr.TabItem("Download New Data"):
            with gr.Column():  # Stack the scraping criteria inputs vertically
                app_packages_input = gr.Textbox(label="App Packages (comma-separated)", value="com.gojek.gopay",info="Enter app packages separated by commas")
                language_input = gr.Textbox(label="Language", value="id", info="Enter language code")
                country_input = gr.Textbox(label="Country", value="id", info="Enter country code")
                sort_input = gr.Radio(["NEWEST", "MOST_RELEVANT"], label="Sort Order", value="NEWEST", info="Select sort order")
                scores_input = gr.CheckboxGroup([1, 2, 3, 4, 5], label="Scores", value=[1, 2, 3, 4, 5], info="Select scores")
                # Default date range: the last 365 days.
                start_date_input = gr.Textbox(label="Start Date (YYYY-MM-DD)", value=(datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d"),info="Enter start date (YYYY-MM-DD)")
                end_date_input = gr.Textbox(label="End Date (YYYY-MM-DD)", value=datetime.now().strftime("%Y-%m-%d"),info="Enter end date (YYYY-MM-DD)")
                count = gr.Textbox(label="Count", value="10",info="Enter count")
                generate_button = gr.Button("Generate Data")
                # download_button = gr.DownloadButton(label="Download Data")
            # Result table rendered below the input column.
            output_data = gr.Dataframe(label="Scraped Data", wrap=True)
            generate_button.click(
                fn=scrape_and_download_data,
                inputs=[app_packages_input, language_input, country_input, sort_input, scores_input, start_date_input, end_date_input, count],
                outputs=[output_data],
                api_name="generate_data"
            )
            # download_button.click(
            #     fn=lambda df: io.StringIO(df.to_csv(index=False)),  # Convert DataFrame to CSV in memory
            #     inputs=output_data,
            #     outputs=download_button,
            #     api_name="download_data"
            # )
        with gr.TabItem("API Settings"):  # Tab for configuring Gemini credentials
            with gr.Row():
                api_key_input = gr.Textbox(label="API Key", value="", info="Enter your API key")
                model_name_input = gr.Textbox(label="Model Name", value="gemini-2.0-flash", info="Enter the model name")
            update_button = gr.Button("Check and Update API Credentials")
            update_button.click(
                fn=update_api_credentials,
                inputs=[api_key_input, model_name_input],
                outputs=gr.Textbox(label="Status"),
                api_name="update_api_credentials"
            )
            # Where to obtain a Gemini API key.
            gr.Markdown("Get API Key on https://aistudio.google.com/app/apikey")

# NOTE(review): hard-coded admin/admin credentials — change before deploying.
apps.launch(share=False, debug=True, auth=("admin", "admin"), ssr_mode=False)