Spaces:
Sleeping
Sleeping
Upload 8 files
Browse files
- Unconfirmed 489417.crdownload +0 -0
- app.py +68 -0
- dataset.csv +13 -0
- preprocess.py +34 -0
- requirements.txt +6 -0
- sentiment_model_best.pkl +3 -0
- tfidf_vectorizer.pkl +3 -0
- train_model.py +62 -0
Unconfirmed 489417.crdownload
ADDED
|
File without changes
|
app.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
import joblib
import preprocess

# Load the trained classification pipeline from disk. When the artifact is
# missing, leave `model` as None so the UI can report the problem instead of
# crashing at import time.
model = None
try:
    model = joblib.load("sentiment_model_best.pkl")
except FileNotFoundError:
    print("Error: Model file 'sentiment_model_best.pkl' not found. Please run train_model.py first.")
else:
    print("Model loaded successfully.")
| 13 |
+
def analyze_sentiment(text):
    """Return the predicted sentiment label for *text*.

    Falls back to an explanatory string when the model artifact was never
    loaded at startup.
    """
    if model is None:
        return "Model not loaded."

    # Clean the raw input the same way the training data was cleaned, then
    # let the fitted pipeline (vectorizer + classifier) produce the label.
    cleaned = preprocess.preprocess_text(text)

    # NOTE: LinearSVC exposes decision_function rather than predict_proba,
    # so only the hard label is returned here; wrapping the classifier in
    # CalibratedClassifierCV would be needed for confidence scores.
    return model.predict([cleaned])[0]
| 30 |
+
# Page-wide CSS tweaks for a cleaner presentation.
custom_css = """
body {background-color: #f0f2f5;}
.gradio-container {max-width: 700px !important; margin-top: 50px !important;}
h1 {text-align: center; color: #333;}
"""

with gr.Blocks(css=custom_css, title="Sentiment Analyzer") as demo:
    # Header text.
    gr.Markdown("# 📊 Sentiment Analysis System")
    gr.Markdown("Enter a review or sentence below to analyze its sentiment (Positive, Negative, or Neutral).")

    # Input area: free-form text box for the review/sentence.
    with gr.Row():
        review_box = gr.Textbox(
            label="Input Text",
            placeholder="Type something here... (e.g., 'The product is amazing!')",
            lines=3,
        )

    # Trigger button.
    with gr.Row():
        submit_btn = gr.Button("Analyze Sentiment", variant="primary")

    # Output area: shows the predicted class label.
    with gr.Row():
        result_label = gr.Label(label="Predicted Sentiment")

    submit_btn.click(fn=analyze_sentiment, inputs=review_box, outputs=result_label)

    # Clickable example inputs covering each sentiment class.
    gr.Markdown("---")
    gr.Markdown("### Examples")
    gr.Examples(
        examples=[
            ["I absolutely love this! It's fantastic."],
            ["This is the worst experience I've ever had."],
            ["It's average, nothing special."],
        ],
        inputs=review_box,
    )
| 67 |
+
if __name__ == "__main__":
    # Launch the Gradio server locally; no public share link is created.
    demo.launch(share=False)
dataset.csv
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
text,label
|
| 2 |
+
"I absolutely love this product! It works perfectly.",Positive
|
| 3 |
+
"Terrible experience. The item was broken and support was rude.",Negative
|
| 4 |
+
"It's okay, does the job but nothing special.",Neutral
|
| 5 |
+
"Best purchase I've made all year. Highly recommended.",Positive
|
| 6 |
+
"Waste of money. Do not buy.",Negative
|
| 7 |
+
"Average quality, faster delivery than expected.",Neutral
|
| 8 |
+
"The design is beautiful but the functionality is lacking.",Neutral
|
| 9 |
+
"Absolutely fantastic service and great quality.",Positive
|
| 10 |
+
"Disappointed. Not as described.",Negative
|
| 11 |
+
"Just arrived. Looks good so far.",Neutral
|
| 12 |
+
"I am very happy with this.",Positive
|
| 13 |
+
"This is garbage.",Negative
|
preprocess.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the required NLTK corpora on first use; `nltk.data.find` raises
# LookupError when a resource is not present locally.
for _resource, _package in [('corpora/stopwords', 'stopwords'),
                            ('corpora/wordnet', 'wordnet')]:
    try:
        nltk.data.find(_resource)
    except LookupError:
        nltk.download(_package)

# Module-level singletons shared by preprocess_text (built once at import).
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
| 19 |
+
def preprocess_text(text):
    """Normalize raw text into a cleaned, lemmatized token string.

    Non-string input (e.g. NaN from pandas) yields an empty string so the
    caller never crashes on missing values.
    """
    if not isinstance(text, str):
        return ""

    lowered = text.lower()

    # Strip URLs first, then drop everything that is not a letter or
    # whitespace (digits and punctuation are removed).
    no_urls = re.sub(r'http\S+', '', lowered)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', no_urls)

    # Whitespace-tokenize, drop English stopwords, lemmatize the survivors.
    kept = (
        lemmatizer.lemmatize(tok)
        for tok in letters_only.split()
        if tok not in stop_words
    )
    return " ".join(kept)
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pandas
|
| 2 |
+
scikit-learn
|
| 3 |
+
nltk
|
| 4 |
+
gradio
|
| 5 |
+
joblib
|
| 6 |
+
numpy
|
sentiment_model_best.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:66cd844719f82787c69dcb627c5e28cd049391ea81a695d2ef0098345495f4b5
|
| 3 |
+
size 3181
|
tfidf_vectorizer.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
|
| 3 |
+
size 0
|
train_model.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import joblib
|
| 3 |
+
from sklearn.model_selection import train_test_split, GridSearchCV
|
| 4 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 5 |
+
from sklearn.pipeline import Pipeline
|
| 6 |
+
from sklearn.svm import LinearSVC
|
| 7 |
+
from sklearn.metrics import classification_report, accuracy_score
|
| 8 |
+
import preprocess
|
| 9 |
+
|
| 10 |
+
def train():
    """Train, tune, evaluate, and persist the sentiment classifier.

    Reads dataset.csv, cleans the text column, grid-searches a
    TF-IDF + LinearSVC pipeline, reports test metrics, and saves the best
    estimator to sentiment_model_best.pkl.
    """
    print("Loading dataset...")
    try:
        data = pd.read_csv("dataset.csv")
    except FileNotFoundError:
        print("Error: dataset.csv not found.")
        return

    print("Preprocessing data...")
    # Guard against missing text cells before cleaning each row.
    data['text'] = data['text'].fillna('')
    data['clean_text'] = data['text'].apply(preprocess.preprocess_text)

    features = data['clean_text']
    labels = data['label']

    print("Splitting data...")
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    print("Setting up pipeline and grid search...")
    # TF-IDF features feeding a linear SVM — a strong baseline for text.
    text_clf = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC(dual='auto')),
    ])

    # Hyperparameter grid: n-gram span, document-frequency cutoff, SVM C.
    search_space = {
        'tfidf__ngram_range': [(1, 1), (1, 2)],
        'tfidf__max_df': [0.9, 1.0],
        'clf__C': [0.1, 1, 10],
    }

    # Exhaustive search with 3-fold CV, parallelized across all cores.
    search = GridSearchCV(text_clf, search_space, cv=3, n_jobs=-1, verbose=1)

    print("Training model...")
    search.fit(X_train, y_train)

    print(f"Best Parameters: {search.best_params_}")
    best = search.best_estimator_

    print("Evaluating model...")
    predictions = best.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, predictions))
    print("\nClassification Report:\n", classification_report(y_test, predictions))

    print("Saving model...")
    joblib.dump(best, "sentiment_model_best.pkl")
    print("Model saved to sentiment_model_best.pkl")
| 61 |
+
if __name__ == "__main__":
    # Run training only when executed as a script, not when imported.
    train()