Spaces:
Sleeping
Sleeping
Upload 9 files
Browse files

- .gitattributes +1 -0
- README.md +3 -9
- deploy.py +49 -0
- model_save.py +107 -0
- model_traing.py +36 -0
- random_forest_model.joblib +3 -0
- requirements.txt +4 -0
- session.json +37 -0
- tfidf_vectorizer.joblib +3 -0
- train_best.xlsx +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
train_best.xlsx filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,12 +1,6 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
|
| 4 |
-
colorFrom: green
|
| 5 |
-
colorTo: indigo
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 5.41.
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
---
|
| 11 |
-
|
| 12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
+
title: app_sentiment_analysis
|
| 3 |
+
app_file: deploy.py
|
|
|
|
|
|
|
| 4 |
sdk: gradio
|
| 5 |
+
sdk_version: 5.41.0
|
|
|
|
|
|
|
| 6 |
---
|
|
|
|
|
|
deploy.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import gradio as gr
import joblib
import numpy as np
import pandas as pd

# --- 1. Load the saved model and vectorizer ---
# Both artifacts are produced by model_save.py and tracked via Git LFS.
print("Loading model and vectorizer...")
model = joblib.load('random_forest_model.joblib')
vectorizer = joblib.load('tfidf_vectorizer.joblib')
print("Files loaded successfully.")


# --- 2. Define the Prediction & Denormalization Function ---
def predict_rating(review_text):
    """Predict a 1-10 rating from free-text review text.

    The model was trained on ratings normalized to [0, 1]
    (see model_save.py: (rating - 1) / 9), so the raw prediction
    is mapped back to the original 1-10 scale here.
    """
    review_tfidf = vectorizer.transform([review_text])
    normalized_prediction = model.predict(review_tfidf)[0]
    # Invert the training-time normalization: [0, 1] -> [1, 10].
    final_rating = (normalized_prediction * 9) + 1
    # Guard against slight over/undershoot from the regressor.
    final_rating = np.clip(final_rating, 1, 10)
    # Cast to a plain Python float so gr.Number serializes it cleanly
    # (np.clip returns a numpy scalar, not a builtin float).
    return round(float(final_rating), 2)


# --- 3. Define the App's Title, Description, and Examples ---
title = "⭐ Company Review Rating Predictor"
description = """
### **Model Information**
This app uses a **Random Forest Regressor** model to predict a numerical rating based on the text of a company review.
### **Dataset Information**
The model was trained on the ["Sentiment Analysis on Company Reviews" dataset from Kaggle](https://www.kaggle.com/competitions/sentiment-analysis-company-reviews/code). This dataset contains reviews from employees about the companies they work for, with ratings originally on a **1-to-10 scale**.
### **Error Margin**
The model has a Mean Squared Error (MSE) of 0.0104. This means its predictions on the 1-10 scale have an average error margin of approximately **±0.9 points**.
"""
examples = [
    ["Great place to work, good people, and good work-life balance."],
    ["The job is okay, but the management is not very good."],
    ["I would not recommend this company to anyone. The pay is low and the hours are long."]
]

# --- 4. Launch the Gradio Interface ---
print("Launching Gradio interface...")
interface = gr.Interface(
    fn=predict_rating,
    inputs=gr.Textbox(lines=5, label="Enter an Employee Review", placeholder="e.g., 'Great work-life balance and supportive management...'"),
    outputs=gr.Number(label="Predicted Rating (on a 1-10 Scale)"),
    title=title,
    description=description,
    examples=examples,
    # Gradio 5 renamed `allow_flagging` to `flagging_mode`
    # (this Space pins sdk_version 5.41.0).
    flagging_mode="never"
)

# Launch the app. NOTE(review): share=True is ignored on Hugging Face
# Spaces (the Space itself is the public link); it only matters when
# running this script locally.
interface.launch(share=True)
|
model_save.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Train and save the Random Forest review-rating regressor.

Loads reviews from train_best.xlsx, normalizes the 1-10 ratings to [0, 1],
vectorizes the text with TF-IDF, trains a RandomForestRegressor, reports
MSE / R² on a held-out split, and saves both the model and the vectorizer
with joblib for use by deploy.py.
"""

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# --- 1. Load and Prepare Data ---
print("--- Loading and Preparing Data ---")

# The intended column names; the file's own header row is broken, so we
# skip it and assign these manually (prevents a KeyError downstream).
correct_column_names = ['Id', 'Review', 'Rating']

# Read headerless and skip the bad header row.
df = pd.read_excel('train_best.xlsx', header=None, skiprows=1)
df.columns = correct_column_names

# Coerce 'Rating' to numeric (unparseable values become NaN), then drop
# rows missing either the rating or the review text.
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
df.dropna(subset=['Rating', 'Review'], inplace=True)

# Normalize ratings from the 1-10 scale to [0, 1]; deploy.py inverts
# this mapping at prediction time.
df['normalized_rating'] = (df['Rating'] - 1) / 9.0

# Final, clean DataFrame for the model.
df_regression = df[['Review', 'normalized_rating']].copy()

print("✅ Data loaded and prepared successfully!")
print("\nHere's a sample of the prepared data:")
print(df_regression.head())

print("--- Pivoting to Random Forest ---")

# --- 4.1. Prepare Data and Split ---
X = df_regression['Review']
y = df_regression['normalized_rating']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training on {len(X_train)} samples, validating on {len(X_val)} samples.")

# --- 4.2. Vectorize Text Data using TF-IDF ---
print("Vectorizing text with TF-IDF...")
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english'
)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
print("Vectorization complete.")
print(f"Shape of TF-IDF matrix: {X_train_tfidf.shape}")


# --- 4.3. Train the Random Forest Model ---
print("⚙️ Training Random Forest Regressor...")
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=50,
    random_state=42,
    n_jobs=-1,   # use all available cores
    verbose=1    # print progress updates during training
)

rf_model.fit(X_train_tfidf, y_train)
print("✅ Model training finished!")


# --- 4.4. Evaluate the Model ---
print("Evaluating model performance...")
predictions = rf_model.predict(X_val_tfidf)

mse = mean_squared_error(y_val, predictions)
r2 = r2_score(y_val, predictions)

print(f"\n--- Evaluation Results ---")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R-squared (R²): {r2:.4f}")
print("--------------------------")


# --- 4.5. Save the Model and Vectorizer ---
joblib.dump(rf_model, 'random_forest_model.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

print("\nModel and TF-IDF vectorizer saved successfully.")
|
model_traing.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Data-preparation script: reads train_best.xlsx, repairs the header,
# cleans the ratings, and builds the DataFrame used for model training.

import pandas as pd

print("--- Loading and Preparing Data ---")

# The spreadsheet's own header row is unusable, so we read headerless,
# skip that row, and assign the intended column names ourselves
# (this is what prevents the KeyError later on).
COLUMN_NAMES = ['Id', 'Review', 'Rating']

df = pd.read_excel('train_best.xlsx', header=None, skiprows=1)
df.columns = COLUMN_NAMES

# Coerce ratings to numbers (invalid entries become NaN), then drop any
# row that is missing either the rating or the review text.
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
df = df.dropna(subset=['Rating', 'Review'])

# Map the 1-10 rating onto [0, 1] so the model trains more effectively.
df['normalized_rating'] = (df['Rating'] - 1) / 9.0

# Final two-column frame consumed by the training step.
df_regression = df[['Review', 'normalized_rating']].copy()

print("✅ Data loaded and prepared successfully!")
print("\nHere's a sample of the prepared data:")
print(df_regression.head())
|
random_forest_model.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c7d54298865f632ea0ffd1ff2ec83aa9bb33a7ea69b69820b577153d00799f85
|
| 3 |
+
size 66617905
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
scikit-learn
|
| 2 |
+
pandas
|
| 3 |
+
numpy
|
| 4 |
+
gradio
|
session.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"uuids": {
|
| 3 |
+
"phone_id": "299d2587-06c5-4043-a756-656483aaa5ac",
|
| 4 |
+
"uuid": "bb8c72a4-e53a-448f-8cec-ccdee30dbdbb",
|
| 5 |
+
"client_session_id": "953b3935-cd5a-4f11-a38b-a3da94ff97ef",
|
| 6 |
+
"advertising_id": "0b264ae4-8e8c-45b5-aa21-dba53a1a4fc5",
|
| 7 |
+
"android_device_id": "android-a974d4e83e6321e4",
|
| 8 |
+
"request_id": "6e9c1840-9635-45f8-9eea-a644457d6413",
|
| 9 |
+
"tray_session_id": "c67a752d-6af9-448e-bbb5-7d58086d2a0a"
|
| 10 |
+
},
|
| 11 |
+
"mid": "aJONkgABAAGlpg0hUEp6T_GySBpV",
|
| 12 |
+
"ig_u_rur": null,
|
| 13 |
+
"ig_www_claim": null,
|
| 14 |
+
"authorization_data": {
|
| 15 |
+
"ds_user_id": "62889334662",
|
| 16 |
+
"sessionid": "62889334662%3ADVIDAfxi1LRPx7%3A11%3AAYftxy5MvSHKlsymp5C7jnVjSgqBkBijqvq0TPTLCA"
|
| 17 |
+
},
|
| 18 |
+
"cookies": {},
|
| 19 |
+
"last_login": 1754500513.609895,
|
| 20 |
+
"device_settings": {
|
| 21 |
+
"app_version": "269.0.0.18.75",
|
| 22 |
+
"android_version": 26,
|
| 23 |
+
"android_release": "8.0.0",
|
| 24 |
+
"dpi": "480dpi",
|
| 25 |
+
"resolution": "1080x1920",
|
| 26 |
+
"manufacturer": "OnePlus",
|
| 27 |
+
"device": "devitron",
|
| 28 |
+
"model": "6T Dev",
|
| 29 |
+
"cpu": "qcom",
|
| 30 |
+
"version_code": "314665256"
|
| 31 |
+
},
|
| 32 |
+
"user_agent": "Instagram 269.0.0.18.75 Android (26/8.0.0; 480dpi; 1080x1920; OnePlus; 6T Dev; devitron; qcom; en_US; 314665256)",
|
| 33 |
+
"country": "US",
|
| 34 |
+
"country_code": 1,
|
| 35 |
+
"locale": "en_US",
|
| 36 |
+
"timezone_offset": -14400
|
| 37 |
+
}
|
tfidf_vectorizer.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7b44ca9f00b46a25b457fe2997694de5940750786d60fb205bef18f93d69f6cf
|
| 3 |
+
size 194272
|
train_best.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a04fe95eb23f17654916ff96911826b200cc5c9b20d0b8d463f5bf02de548f1f
|
| 3 |
+
size 9331872
|