Spaces:
Sleeping
Sleeping
Upload 9 files
Browse files

- .gitattributes +1 -0
- README.md +3 -9
- deploy.py +49 -0
- model_save.py +107 -0
- model_traing.py +36 -0
- random_forest_model.joblib +3 -0
- requirements.txt +4 -0
- session.json +37 -0
- tfidf_vectorizer.joblib +3 -0
- train_best.xlsx +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
train_best.xlsx filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,12 +1,6 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
|
| 4 |
-
colorFrom: green
|
| 5 |
-
colorTo: indigo
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 5.41.
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
---
|
| 11 |
-
|
| 12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
+
title: app_sentiment_analysis
|
| 3 |
+
app_file: deploy.py
|
|
|
|
|
|
|
| 4 |
sdk: gradio
|
| 5 |
+
sdk_version: 5.41.0
|
|
|
|
|
|
|
| 6 |
---
|
|
|
|
|
|
deploy.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import gradio as gr
import joblib
import numpy as np
import pandas as pd

# --- 1. Load the saved model and vectorizer ---
# Both artifacts are produced by model_save.py and tracked via Git LFS.
print("Loading model and vectorizer...")
model = joblib.load('random_forest_model.joblib')
vectorizer = joblib.load('tfidf_vectorizer.joblib')
print("Files loaded successfully.")


# --- 2. Define the Prediction & Denormalization Function ---
def predict_rating(review_text):
    """Predict a 1-10 rating from free-text review text.

    The model was trained on ratings normalized to [0, 1]
    (see model_save.py: (rating - 1) / 9), so the raw prediction
    is mapped back to the original 1-10 scale here.
    """
    review_tfidf = vectorizer.transform([review_text])
    normalized_prediction = model.predict(review_tfidf)[0]
    # Invert the training-time normalization: [0, 1] -> [1, 10].
    final_rating = (normalized_prediction * 9) + 1
    # Guard against slight over/undershoot from the regressor.
    final_rating = np.clip(final_rating, 1, 10)
    # Cast to a plain Python float so gr.Number serializes it cleanly
    # (np.clip returns a numpy scalar, not a builtin float).
    return round(float(final_rating), 2)


# --- 3. Define the App's Title, Description, and Examples ---
title = "⭐ Company Review Rating Predictor"
description = """
### **Model Information**
This app uses a **Random Forest Regressor** model to predict a numerical rating based on the text of a company review.
### **Dataset Information**
The model was trained on the ["Sentiment Analysis on Company Reviews" dataset from Kaggle](https://www.kaggle.com/competitions/sentiment-analysis-company-reviews/code). This dataset contains reviews from employees about the companies they work for, with ratings originally on a **1-to-10 scale**.
### **Error Margin**
The model has a Mean Squared Error (MSE) of 0.0104. This means its predictions on the 1-10 scale have an average error margin of approximately **±0.9 points**.
"""
examples = [
    ["Great place to work, good people, and good work-life balance."],
    ["The job is okay, but the management is not very good."],
    ["I would not recommend this company to anyone. The pay is low and the hours are long."]
]

# --- 4. Launch the Gradio Interface ---
print("Launching Gradio interface...")
interface = gr.Interface(
    fn=predict_rating,
    inputs=gr.Textbox(lines=5, label="Enter an Employee Review", placeholder="e.g., 'Great work-life balance and supportive management...'"),
    outputs=gr.Number(label="Predicted Rating (on a 1-10 Scale)"),
    title=title,
    description=description,
    examples=examples,
    # Gradio 5 renamed `allow_flagging` to `flagging_mode`
    # (this Space pins sdk_version 5.41.0).
    flagging_mode="never"
)

# Launch the app. NOTE(review): share=True is ignored on Hugging Face
# Spaces (the Space itself is the public link); it only matters when
# running this script locally.
interface.launch(share=True)
|
model_save.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Train and save the Random Forest review-rating regressor.

Loads reviews from train_best.xlsx, normalizes the 1-10 ratings to [0, 1],
vectorizes the text with TF-IDF, trains a RandomForestRegressor, reports
MSE / R² on a held-out split, and saves both the model and the vectorizer
with joblib for use by deploy.py.
"""

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# --- 1. Load and Prepare Data ---
print("--- Loading and Preparing Data ---")

# The intended column names; the file's own header row is broken, so we
# skip it and assign these manually (prevents a KeyError downstream).
correct_column_names = ['Id', 'Review', 'Rating']

# Read headerless and skip the bad header row.
df = pd.read_excel('train_best.xlsx', header=None, skiprows=1)
df.columns = correct_column_names

# Coerce 'Rating' to numeric (unparseable values become NaN), then drop
# rows missing either the rating or the review text.
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
df.dropna(subset=['Rating', 'Review'], inplace=True)

# Normalize ratings from the 1-10 scale to [0, 1]; deploy.py inverts
# this mapping at prediction time.
df['normalized_rating'] = (df['Rating'] - 1) / 9.0

# Final, clean DataFrame for the model.
df_regression = df[['Review', 'normalized_rating']].copy()

print("✅ Data loaded and prepared successfully!")
print("\nHere's a sample of the prepared data:")
print(df_regression.head())

print("--- Pivoting to Random Forest ---")

# --- 4.1. Prepare Data and Split ---
X = df_regression['Review']
y = df_regression['normalized_rating']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training on {len(X_train)} samples, validating on {len(X_val)} samples.")

# --- 4.2. Vectorize Text Data using TF-IDF ---
print("Vectorizing text with TF-IDF...")
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english'
)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
print("Vectorization complete.")
print(f"Shape of TF-IDF matrix: {X_train_tfidf.shape}")


# --- 4.3. Train the Random Forest Model ---
print("⚙️ Training Random Forest Regressor...")
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=50,
    random_state=42,
    n_jobs=-1,   # use all available cores
    verbose=1    # print progress updates during training
)

rf_model.fit(X_train_tfidf, y_train)
print("✅ Model training finished!")


# --- 4.4. Evaluate the Model ---
print("Evaluating model performance...")
predictions = rf_model.predict(X_val_tfidf)

mse = mean_squared_error(y_val, predictions)
r2 = r2_score(y_val, predictions)

print(f"\n--- Evaluation Results ---")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R-squared (R²): {r2:.4f}")
print("--------------------------")


# --- 4.5. Save the Model and Vectorizer ---
joblib.dump(rf_model, 'random_forest_model.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

print("\nModel and TF-IDF vectorizer saved successfully.")
|
model_traing.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Data-preparation script: reads train_best.xlsx, repairs the header,
# cleans the ratings, and builds the DataFrame used for model training.

import pandas as pd

print("--- Loading and Preparing Data ---")

# The spreadsheet's own header row is unusable, so we read headerless,
# skip that row, and assign the intended column names ourselves
# (this is what prevents the KeyError later on).
COLUMN_NAMES = ['Id', 'Review', 'Rating']

df = pd.read_excel('train_best.xlsx', header=None, skiprows=1)
df.columns = COLUMN_NAMES

# Coerce ratings to numbers (invalid entries become NaN), then drop any
# row that is missing either the rating or the review text.
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
df = df.dropna(subset=['Rating', 'Review'])

# Map the 1-10 rating onto [0, 1] so the model trains more effectively.
df['normalized_rating'] = (df['Rating'] - 1) / 9.0

# Final two-column frame consumed by the training step.
df_regression = df[['Review', 'normalized_rating']].copy()

print("✅ Data loaded and prepared successfully!")
print("\nHere's a sample of the prepared data:")
print(df_regression.head())
|
random_forest_model.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c7d54298865f632ea0ffd1ff2ec83aa9bb33a7ea69b69820b577153d00799f85
|
| 3 |
+
size 66617905
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
scikit-learn
|
| 2 |
+
pandas
|
| 3 |
+
numpy
|
| 4 |
+
gradio
|
session.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"uuids": {
|
| 3 |
+
"phone_id": "299d2587-06c5-4043-a756-656483aaa5ac",
|
| 4 |
+
"uuid": "bb8c72a4-e53a-448f-8cec-ccdee30dbdbb",
|
| 5 |
+
"client_session_id": "953b3935-cd5a-4f11-a38b-a3da94ff97ef",
|
| 6 |
+
"advertising_id": "0b264ae4-8e8c-45b5-aa21-dba53a1a4fc5",
|
| 7 |
+
"android_device_id": "android-a974d4e83e6321e4",
|
| 8 |
+
"request_id": "6e9c1840-9635-45f8-9eea-a644457d6413",
|
| 9 |
+
"tray_session_id": "c67a752d-6af9-448e-bbb5-7d58086d2a0a"
|
| 10 |
+
},
|
| 11 |
+
"mid": "aJONkgABAAGlpg0hUEp6T_GySBpV",
|
| 12 |
+
"ig_u_rur": null,
|
| 13 |
+
"ig_www_claim": null,
|
| 14 |
+
"authorization_data": {
|
| 15 |
+
"ds_user_id": "62889334662",
|
| 16 |
+
"sessionid": "62889334662%3ADVIDAfxi1LRPx7%3A11%3AAYftxy5MvSHKlsymp5C7jnVjSgqBkBijqvq0TPTLCA"
|
| 17 |
+
},
|
| 18 |
+
"cookies": {},
|
| 19 |
+
"last_login": 1754500513.609895,
|
| 20 |
+
"device_settings": {
|
| 21 |
+
"app_version": "269.0.0.18.75",
|
| 22 |
+
"android_version": 26,
|
| 23 |
+
"android_release": "8.0.0",
|
| 24 |
+
"dpi": "480dpi",
|
| 25 |
+
"resolution": "1080x1920",
|
| 26 |
+
"manufacturer": "OnePlus",
|
| 27 |
+
"device": "devitron",
|
| 28 |
+
"model": "6T Dev",
|
| 29 |
+
"cpu": "qcom",
|
| 30 |
+
"version_code": "314665256"
|
| 31 |
+
},
|
| 32 |
+
"user_agent": "Instagram 269.0.0.18.75 Android (26/8.0.0; 480dpi; 1080x1920; OnePlus; 6T Dev; devitron; qcom; en_US; 314665256)",
|
| 33 |
+
"country": "US",
|
| 34 |
+
"country_code": 1,
|
| 35 |
+
"locale": "en_US",
|
| 36 |
+
"timezone_offset": -14400
|
| 37 |
+
}
|
tfidf_vectorizer.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7b44ca9f00b46a25b457fe2997694de5940750786d60fb205bef18f93d69f6cf
|
| 3 |
+
size 194272
|
train_best.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a04fe95eb23f17654916ff96911826b200cc5c9b20d0b8d463f5bf02de548f1f
|
| 3 |
+
size 9331872
|