Spaces:
Sleeping
Sleeping
Upload 8 files
Browse files
- Unconfirmed 489417.crdownload +0 -0
- app.py +68 -0
- dataset.csv +13 -0
- preprocess.py +34 -0
- requirements.txt +6 -0
- sentiment_model_best.pkl +3 -0
- tfidf_vectorizer.pkl +3 -0
- train_model.py +62 -0
Unconfirmed 489417.crdownload
ADDED
|
File without changes
|
app.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
import joblib
import preprocess

# Load the trained classification pipeline from disk. When the artifact is
# missing, leave `model` as None so the UI can report the problem instead of
# crashing at import time.
model = None
try:
    model = joblib.load("sentiment_model_best.pkl")
except FileNotFoundError:
    print("Error: Model file 'sentiment_model_best.pkl' not found. Please run train_model.py first.")
else:
    print("Model loaded successfully.")
| 13 |
+
def analyze_sentiment(text):
    """Return the predicted sentiment label for *text*.

    Falls back to an explanatory string when the model artifact was never
    loaded at startup.
    """
    if model is None:
        return "Model not loaded."

    # Clean the raw input the same way the training data was cleaned, then
    # let the fitted pipeline (vectorizer + classifier) produce the label.
    cleaned = preprocess.preprocess_text(text)

    # NOTE: LinearSVC exposes decision_function rather than predict_proba,
    # so only the hard label is returned here; wrapping the classifier in
    # CalibratedClassifierCV would be needed for confidence scores.
    return model.predict([cleaned])[0]
| 30 |
+
# Page-wide CSS tweaks for a cleaner presentation.
custom_css = """
body {background-color: #f0f2f5;}
.gradio-container {max-width: 700px !important; margin-top: 50px !important;}
h1 {text-align: center; color: #333;}
"""

with gr.Blocks(css=custom_css, title="Sentiment Analyzer") as demo:
    # Header text.
    gr.Markdown("# 📊 Sentiment Analysis System")
    gr.Markdown("Enter a review or sentence below to analyze its sentiment (Positive, Negative, or Neutral).")

    # Input area: free-form text box for the review/sentence.
    with gr.Row():
        review_box = gr.Textbox(
            label="Input Text",
            placeholder="Type something here... (e.g., 'The product is amazing!')",
            lines=3,
        )

    # Trigger button.
    with gr.Row():
        submit_btn = gr.Button("Analyze Sentiment", variant="primary")

    # Output area: shows the predicted class label.
    with gr.Row():
        result_label = gr.Label(label="Predicted Sentiment")

    submit_btn.click(fn=analyze_sentiment, inputs=review_box, outputs=result_label)

    # Clickable example inputs covering each sentiment class.
    gr.Markdown("---")
    gr.Markdown("### Examples")
    gr.Examples(
        examples=[
            ["I absolutely love this! It's fantastic."],
            ["This is the worst experience I've ever had."],
            ["It's average, nothing special."],
        ],
        inputs=review_box,
    )
| 67 |
+
if __name__ == "__main__":
    # Launch the Gradio server locally; no public share link is created.
    demo.launch(share=False)
dataset.csv
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
text,label
|
| 2 |
+
"I absolutely love this product! It works perfectly.",Positive
|
| 3 |
+
"Terrible experience. The item was broken and support was rude.",Negative
|
| 4 |
+
"It's okay, does the job but nothing special.",Neutral
|
| 5 |
+
"Best purchase I've made all year. Highly recommended.",Positive
|
| 6 |
+
"Waste of money. Do not buy.",Negative
|
| 7 |
+
"Average quality, faster delivery than expected.",Neutral
|
| 8 |
+
"The design is beautiful but the functionality is lacking.",Neutral
|
| 9 |
+
"Absolutely fantastic service and great quality.",Positive
|
| 10 |
+
"Disappointed. Not as described.",Negative
|
| 11 |
+
"Just arrived. Looks good so far.",Neutral
|
| 12 |
+
"I am very happy with this.",Positive
|
| 13 |
+
"This is garbage.",Negative
|
preprocess.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the required NLTK corpora on first use; `nltk.data.find` raises
# LookupError when a resource is not present locally.
for _resource, _package in [('corpora/stopwords', 'stopwords'),
                            ('corpora/wordnet', 'wordnet')]:
    try:
        nltk.data.find(_resource)
    except LookupError:
        nltk.download(_package)

# Module-level singletons shared by preprocess_text (built once at import).
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
| 19 |
+
def preprocess_text(text):
    """Normalize raw text into a cleaned, lemmatized token string.

    Non-string input (e.g. NaN from pandas) yields an empty string so the
    caller never crashes on missing values.
    """
    if not isinstance(text, str):
        return ""

    lowered = text.lower()

    # Strip URLs first, then drop everything that is not a letter or
    # whitespace (digits and punctuation are removed).
    no_urls = re.sub(r'http\S+', '', lowered)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', no_urls)

    # Whitespace-tokenize, drop English stopwords, lemmatize the survivors.
    kept = (
        lemmatizer.lemmatize(tok)
        for tok in letters_only.split()
        if tok not in stop_words
    )
    return " ".join(kept)
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pandas
|
| 2 |
+
scikit-learn
|
| 3 |
+
nltk
|
| 4 |
+
gradio
|
| 5 |
+
joblib
|
| 6 |
+
numpy
|
sentiment_model_best.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:66cd844719f82787c69dcb627c5e28cd049391ea81a695d2ef0098345495f4b5
|
| 3 |
+
size 3181
|
tfidf_vectorizer.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
|
| 3 |
+
size 0
|
train_model.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import joblib
|
| 3 |
+
from sklearn.model_selection import train_test_split, GridSearchCV
|
| 4 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 5 |
+
from sklearn.pipeline import Pipeline
|
| 6 |
+
from sklearn.svm import LinearSVC
|
| 7 |
+
from sklearn.metrics import classification_report, accuracy_score
|
| 8 |
+
import preprocess
|
| 9 |
+
|
| 10 |
+
def train():
    """Train, tune, evaluate, and persist the sentiment classifier.

    Reads dataset.csv, cleans the text column, grid-searches a
    TF-IDF + LinearSVC pipeline, reports test metrics, and saves the best
    estimator to sentiment_model_best.pkl.
    """
    print("Loading dataset...")
    try:
        data = pd.read_csv("dataset.csv")
    except FileNotFoundError:
        print("Error: dataset.csv not found.")
        return

    print("Preprocessing data...")
    # Guard against missing text cells before cleaning each row.
    data['text'] = data['text'].fillna('')
    data['clean_text'] = data['text'].apply(preprocess.preprocess_text)

    features = data['clean_text']
    labels = data['label']

    print("Splitting data...")
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    print("Setting up pipeline and grid search...")
    # TF-IDF features feeding a linear SVM — a strong baseline for text.
    text_clf = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC(dual='auto')),
    ])

    # Hyperparameter grid: n-gram span, document-frequency cutoff, SVM C.
    search_space = {
        'tfidf__ngram_range': [(1, 1), (1, 2)],
        'tfidf__max_df': [0.9, 1.0],
        'clf__C': [0.1, 1, 10],
    }

    # Exhaustive search with 3-fold CV, parallelized across all cores.
    search = GridSearchCV(text_clf, search_space, cv=3, n_jobs=-1, verbose=1)

    print("Training model...")
    search.fit(X_train, y_train)

    print(f"Best Parameters: {search.best_params_}")
    best = search.best_estimator_

    print("Evaluating model...")
    predictions = best.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, predictions))
    print("\nClassification Report:\n", classification_report(y_test, predictions))

    print("Saving model...")
    joblib.dump(best, "sentiment_model_best.pkl")
    print("Model saved to sentiment_model_best.pkl")
| 61 |
+
if __name__ == "__main__":
    # Run training only when executed as a script, not when imported.
    train()