Agrannya committed on
Commit
ce676fe
·
verified ·
1 Parent(s): dddbad7

Upload 9 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ train_best.xlsx filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: App Sentiment Analysis
3
- emoji: 🔥
4
- colorFrom: green
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 5.41.1
8
- app_file: app.py
9
- pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: app_sentiment_analysis
3
+ app_file: deploy.py
 
 
4
  sdk: gradio
5
+ sdk_version: 5.41.0
 
 
6
  ---
 
 
deploy.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import joblib
3
+ import numpy as np
4
+ import pandas as pd
5
+
6
# --- 1. Load the saved model and vectorizer ---
# Both artifacts are read once, at import time, from paths relative to the
# current working directory. The model is loaded first, then the vectorizer.
print("Loading model and vectorizer...")
model, vectorizer = (
    joblib.load(artifact_path)
    for artifact_path in ('random_forest_model.joblib', 'tfidf_vectorizer.joblib')
)
print("Files loaded successfully.")
11
+
12
+ # --- 2. Define the Prediction & Denormalization Function ---
13
def predict_rating(review_text):
    """Predict a 1-10 rating from the text of a review.

    The text is transformed with the module-level ``vectorizer`` and scored by
    the module-level ``model``. The raw prediction is mapped back to the 1-10
    scale with ``prediction * 9 + 1``, the inverse of a ``(rating - 1) / 9``
    normalization.

    Args:
        review_text: Raw review text from the input textbox.

    Returns:
        The predicted rating as a plain ``float`` clipped to [1, 10] and
        rounded to two decimals, or ``None`` when the input is missing or
        contains only whitespace.
    """
    # A blank review carries no signal to score, and None is not valid text
    # for the vectorizer, so report "no result" instead of predicting.
    if review_text is None or not str(review_text).strip():
        return None

    review_tfidf = vectorizer.transform([review_text])
    normalized_prediction = model.predict(review_tfidf)[0]

    # Denormalize, then keep the value inside the valid rating range.
    final_rating = np.clip(normalized_prediction * 9 + 1, 1, 10)

    # Return a built-in float rather than a NumPy scalar.
    return round(float(final_rating), 2)
19
+
20
# --- 3. Define the App's Title, Description, and Examples ---
# `title`, `description` and `examples` are handed to gr.Interface below.
# The description is written in Markdown.
title = "⭐ Company Review Rating Predictor"

# The quoted MSE is measured on the normalized 0-1 target, so the error on the
# 1-10 scale is sqrt(0.0104) * 9 ≈ 0.92 points. The text states the scale
# explicitly so the two numbers do not appear to contradict each other.
description = """
### **Model Information**
This app uses a **Random Forest Regressor** model to predict a numerical rating based on the text of a company review.
### **Dataset Information**
The model was trained on the ["Sentiment Analysis on Company Reviews" dataset from Kaggle](https://www.kaggle.com/competitions/sentiment-analysis-company-reviews/code). This dataset contains reviews from employees about the companies they work for, with ratings originally on a **1-to-10 scale**.
### **Error Margin**
The model has a Mean Squared Error (MSE) of 0.0104 on the normalized 0-1 rating scale. On the original 1-10 scale this corresponds to a root-mean-square error of approximately **±0.9 points**.
"""

# Each example is a one-element list: one value per input component.
examples = [
    ["Great place to work, good people, and good work-life balance."],
    ["The job is okay, but the management is not very good."],
    ["I would not recommend this company to anyone. The pay is low and the hours are long."],
]
35
+
36
# --- 4. Launch the Gradio Interface ---
# Builds a single-textbox -> single-number UI around predict_rating.
print("Launching Gradio interface...")
interface = gr.Interface(
    fn=predict_rating,
    inputs=gr.Textbox(
        lines=5,
        label="Enter an Employee Review",
        placeholder="e.g., 'Great work-life balance and supportive management...'",
    ),
    outputs=gr.Number(label="Predicted Rating (on a 1-10 Scale)"),
    title=title,
    description=description,
    examples=examples,
    # Gradio 5 (the SDK version pinned in README.md) deprecates
    # `allow_flagging` in favour of `flagging_mode`.
    flagging_mode="never",
)

# Launch the app and create a public, shareable link.
# NOTE(review): when hosted on Hugging Face Spaces the share link is
# presumably unnecessary, since the Space itself is the public URL — confirm
# whether share=True should stay.
interface.launch(share=True)
model_save.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Train and save the review-rating model
# --------------------------------------
# End-to-end training script:
#   1. load the labelled reviews from 'train_best.xlsx' and clean them,
#   2. normalize the 1-10 ratings to a 0-1 regression target,
#   3. fit a TF-IDF vectorizer and a Random Forest regressor,
#   4. report validation metrics,
#   5. save the fitted model and vectorizer with joblib.
#
# NOTE: pandas needs the optional 'openpyxl' package to read .xlsx files;
# it is not listed in requirements.txt.

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# --- Load and Prepare Data ---
print("--- Loading and Preparing Data ---")

# Define the correct column names we want to use.
correct_column_names = ['Id', 'Review', 'Rating']

# 1. Load the Excel file, skipping the bad header row.
#    header=None tells pandas not to parse any row as a header.
df = pd.read_excel('train_best.xlsx', header=None, skiprows=1)

# 2. Assign the column names explicitly so later lookups by name cannot
#    fail with a KeyError.
df.columns = correct_column_names

# 3. Clean the data:
#    - Convert 'Rating' to a number; unparseable values become NaN.
#    - Drop any rows where 'Rating' or 'Review' is missing.
#    - Coerce 'Review' to str: Excel cells holding only digits are read as
#      numbers, and the TF-IDF vectorizer only accepts text.
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
df.dropna(subset=['Rating', 'Review'], inplace=True)
df['Review'] = df['Review'].astype(str)

# 4. Normalize the 'Rating' from a 1-10 scale to a 0-1 scale.
df['normalized_rating'] = (df['Rating'] - 1) / 9.0

# 5. Create the final, clean DataFrame for the model.
df_regression = df[['Review', 'normalized_rating']].copy()

print("✅ Data loaded and prepared successfully!")
print("\nHere's a sample of the prepared data:")
print(df_regression.head())

# --- Random Forest Model with Progress Display ---
print("--- Pivoting to Random Forest ---")

# --- 4.1. Prepare Data and Split ---
X = df_regression['Review']
y = df_regression['normalized_rating']

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training on {len(X_train)} samples, validating on {len(X_val)} samples.")

# --- 4.2. Vectorize Text Data using TF-IDF ---
print("Vectorizing text with TF-IDF...")
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english',
)

# Fit on the training split only, so the validation text does not influence
# the vocabulary or IDF weights.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
print("Vectorization complete.")
print(f"Shape of TF-IDF matrix: {X_train_tfidf.shape}")

# --- 4.3. Train the Random Forest Model ---
print("⚙️ Training Random Forest Regressor...")
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=50,
    random_state=42,
    n_jobs=-1,
    verbose=1,  # print progress updates during training
)

rf_model.fit(X_train_tfidf, y_train)
print("✅ Model training finished!")

# --- 4.4. Evaluate the Model ---
print("Evaluating model performance...")
predictions = rf_model.predict(X_val_tfidf)

# Both metrics are computed on the normalized 0-1 target.
mse = mean_squared_error(y_val, predictions)
r2 = r2_score(y_val, predictions)

# Express the error in rating points: the target was divided by 9, so the
# root-mean-square error is scaled back by the same factor.
rmse_rating_points = np.sqrt(mse) * 9

print("\n--- Evaluation Results ---")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R-squared (R²): {r2:.4f}")
print(f"RMSE on the 1-10 scale: {rmse_rating_points:.2f}")
print("--------------------------")

# --- 4.5. Save the Model and Vectorizer ---
# These file names are the ones deploy.py loads.
joblib.dump(rf_model, 'random_forest_model.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

print("\nModel and TF-IDF vectorizer saved successfully.")
model_traing.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Data preparation for the review-rating model
# --------------------------------------------
# Reads the labelled reviews from 'train_best.xlsx', replaces the sheet's own
# header with known column names, cleans the ratings, and builds the
# two-column frame (review text, normalized rating) used for modelling.

import pandas as pd

print("--- Loading and Preparing Data ---")

# Names applied, in order, to the spreadsheet's columns.
correct_column_names = ['Id', 'Review', 'Rating']

# The first row is skipped (skiprows=1) and no row is parsed as a header
# (header=None); the names above are assigned instead, which avoids a
# KeyError on later lookups by name.
df = pd.read_excel('train_best.xlsx', header=None, skiprows=1)
df.columns = correct_column_names

# Ratings that cannot be parsed as numbers become NaN, so they are removed
# together with rows that have no rating or no review.
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
df.dropna(subset=['Rating', 'Review'], inplace=True)

# Map ratings linearly from the 1-10 scale onto 0-1.
df['normalized_rating'] = df['Rating'].sub(1).div(9.0)

# Independent copy holding only the columns the model needs.
df_regression = df.loc[:, ['Review', 'normalized_rating']].copy()

print("✅ Data loaded and prepared successfully!")
print("\nHere's a sample of the prepared data:")
print(df_regression.head())
random_forest_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7d54298865f632ea0ffd1ff2ec83aa9bb33a7ea69b69820b577153d00799f85
3
+ size 66617905
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ scikit-learn
2
+ pandas
3
+ numpy
4
+ gradio
session.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "uuids": {
3
+ "phone_id": "299d2587-06c5-4043-a756-656483aaa5ac",
4
+ "uuid": "bb8c72a4-e53a-448f-8cec-ccdee30dbdbb",
5
+ "client_session_id": "953b3935-cd5a-4f11-a38b-a3da94ff97ef",
6
+ "advertising_id": "0b264ae4-8e8c-45b5-aa21-dba53a1a4fc5",
7
+ "android_device_id": "android-a974d4e83e6321e4",
8
+ "request_id": "6e9c1840-9635-45f8-9eea-a644457d6413",
9
+ "tray_session_id": "c67a752d-6af9-448e-bbb5-7d58086d2a0a"
10
+ },
11
+ "mid": "aJONkgABAAGlpg0hUEp6T_GySBpV",
12
+ "ig_u_rur": null,
13
+ "ig_www_claim": null,
14
+ "authorization_data": {
15
+ "ds_user_id": "62889334662",
16
+ "sessionid": "62889334662%3ADVIDAfxi1LRPx7%3A11%3AAYftxy5MvSHKlsymp5C7jnVjSgqBkBijqvq0TPTLCA"
17
+ },
18
+ "cookies": {},
19
+ "last_login": 1754500513.609895,
20
+ "device_settings": {
21
+ "app_version": "269.0.0.18.75",
22
+ "android_version": 26,
23
+ "android_release": "8.0.0",
24
+ "dpi": "480dpi",
25
+ "resolution": "1080x1920",
26
+ "manufacturer": "OnePlus",
27
+ "device": "devitron",
28
+ "model": "6T Dev",
29
+ "cpu": "qcom",
30
+ "version_code": "314665256"
31
+ },
32
+ "user_agent": "Instagram 269.0.0.18.75 Android (26/8.0.0; 480dpi; 1080x1920; OnePlus; 6T Dev; devitron; qcom; en_US; 314665256)",
33
+ "country": "US",
34
+ "country_code": 1,
35
+ "locale": "en_US",
36
+ "timezone_offset": -14400
37
+ }
tfidf_vectorizer.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b44ca9f00b46a25b457fe2997694de5940750786d60fb205bef18f93d69f6cf
3
+ size 194272
train_best.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a04fe95eb23f17654916ff96911826b200cc5c9b20d0b8d463f5bf02de548f1f
3
+ size 9331872