# mcp-sentiment / app.py
# Yuvaraj7's picture
# Update app.py
# 40c0846 verified
import gradio as gr
import pandas as pd
import os
import pickle
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
# ======================
# Paths
# ======================
# Pickle file the fitted pipeline is loaded from / saved to.
MODEL_PATH = "sklearn_nb_model.pkl"
# CSV dataset read for training when no saved model exists.
DATA_PATH = "test.csv"
# Row cap: larger datasets are randomly down-sampled to this many rows.
SAMPLE_SIZE = 50_000
# ======================
# Helper function for text cleaning (basic)
# ======================
def clean_text(text):
    """Normalize raw text for the bag-of-words model.

    The text is lower-cased, every character that is not an ASCII letter
    ``a-z`` or whitespace is removed, and runs of whitespace are collapsed
    to a single space with leading/trailing whitespace stripped.

    Args:
        text: Value to clean. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as missing and yield ``""``.

    Returns:
        The cleaned string (possibly empty).
    """
    # Missing values would otherwise be stringified into the literal words
    # "none" / "nan" and be treated as real tokens. `text != text` is the
    # import-free NaN test (NaN is the only float unequal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    # Drop digits, punctuation and any non-ASCII characters.
    text = re.sub(r'[^a-z\s]', '', text)
    # Collapse whitespace left behind by the removal above.
    text = re.sub(r'\s+', ' ', text).strip()
    return text
# ======================
# 1. Train or Load Model
# ======================
def _load_or_train_model(model_path, data_path, sample_size):
    """Return a fitted text-classification pipeline, training one if needed.

    If ``model_path`` exists, the pickled model stored there is loaded and
    returned. Otherwise a ``CountVectorizer`` + ``MultinomialNB`` pipeline is
    fitted on the CSV at ``data_path`` (using at most ``sample_size`` labelled
    rows), pickled to ``model_path`` and returned.

    Args:
        model_path: Path of the pickle file used as the model cache.
        data_path: Path of the training CSV; it must contain ``text`` and
            ``sentiment`` columns (matched case-insensitively, ignoring
            surrounding whitespace in the header).
        sample_size: Maximum number of rows to train on.

    Raises:
        FileNotFoundError: No cached model exists and ``data_path`` is missing.
        ValueError: The CSV lacks the required columns or has no labelled rows.
    """
    if os.path.exists(model_path):
        # SECURITY: unpickling executes arbitrary code embedded in the file.
        # Only load a model file that this application wrote itself.
        with open(model_path, "rb") as f:
            loaded_model = pickle.load(f)
        print("Loaded existing scikit-learn model.")
        return loaded_model

    # Explicit raise rather than `assert`: asserts are stripped under `python -O`.
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"{data_path} not found! Upload a CSV dataset.")

    # Load CSV instead of Parquet
    df = pd.read_csv(data_path, encoding='ISO-8859-1')
    print(f"df columns after loading: {df.columns.tolist()}")

    # Normalize column names
    df.columns = [c.lower().strip() for c in df.columns]
    print(f"df columns after normalization: {df.columns.tolist()}")

    # Check required columns
    if 'text' not in df.columns or 'sentiment' not in df.columns:
        raise ValueError("CSV file must contain 'text' and 'sentiment' columns after column normalization")

    # Rows without a label carry no training signal; filling them with ""
    # would teach the classifier a spurious empty-string class.
    df = df.dropna(subset=['sentiment'])
    if df.empty:
        raise ValueError(f"{data_path} contains no rows with a 'sentiment' label")

    # Sample to reduce memory usage (after dropping unlabelled rows so the
    # cap applies to usable samples).
    if len(df) > sample_size:
        df = df.sample(n=sample_size, random_state=42)

    # Build features/labels as fresh Series instead of assigning back into
    # the (possibly sampled) frame.
    texts = df['text'].fillna("").astype(str).apply(clean_text)
    labels = df['sentiment'].astype(str)

    # Train a simple pipeline: CountVectorizer + MultinomialNB
    pipeline = make_pipeline(CountVectorizer(min_df=2), MultinomialNB(alpha=1.0))
    pipeline.fit(texts, labels)

    # Save the trained model
    with open(model_path, "wb") as f:
        pickle.dump(pipeline, f)
    print(f"Model trained on {len(texts)} samples and saved!")
    return pipeline


# Module-level model shared by the prediction function below.
model = _load_or_train_model(MODEL_PATH, DATA_PATH, SAMPLE_SIZE)
# ======================
# 2. Sentiment Analysis Function
# ======================
def _fallback_result(prediction, message, **extra):
    """Build the fixed 50/50 payload returned when no real prediction exists."""
    result = {
        "prediction": prediction,
        "probabilities": {"pos": 0.5, "neg": 0.5},
    }
    result.update(extra)
    result["message"] = message
    return result


def sentiment_analysis(text: str) -> dict:
    """Predict the sentiment of ``text`` with the module-level ``model``.

    Args:
        text: Raw input. Falsy values are treated as empty input; other
            non-string values are converted with ``str``.

    Returns:
        A dict of plain Python types that always contains ``"prediction"``
        and ``"probabilities"``. A successful analysis adds
        ``"cleaned_input"``; fallbacks add ``"message"``, and a caught
        exception additionally adds ``"error_details"``.
    """
    too_short_msg = "Input too short or empty for meaningful analysis. Defaulting to neutral."
    if not text:
        return _fallback_result("neutral", too_short_msg)
    # Coerce before calling `.strip()` so a non-string argument cannot raise
    # AttributeError outside the error handling below.
    if not isinstance(text, str):
        text = str(text)
    if len(text.strip()) < 2:
        return _fallback_result("neutral", too_short_msg)

    cleaned_text = clean_text(text)
    if not cleaned_text:
        return _fallback_result(
            "neutral", "Input became empty after cleaning. Defaulting to neutral."
        )

    # Broad catch is deliberate: this is the UI boundary, and any model
    # failure is reported in the payload rather than raised.
    try:
        pred_array = model.predict([cleaned_text])
        prob_array = model.predict_proba([cleaned_text])
        classes = model.classes_
        # Convert NumPy scalar labels to built-in str so the returned dict
        # holds only plain, JSON-friendly types.
        pred = str(pred_array[0])
        prob_dict = {
            str(cls): round(float(prob), 4)
            for cls, prob in zip(classes, prob_array[0])
        }
        # Rounding can zero out every entry if all probabilities are tiny.
        if sum(prob_dict.values()) == 0:
            prob_dict = {"pos": 0.01, "neg": 0.01}
            pred = "neutral"
        return {
            "prediction": pred,
            "probabilities": prob_dict,
            "cleaned_input": cleaned_text
        }
    except Exception as e:
        return _fallback_result(
            "error",
            "An error occurred during analysis.",
            error_details=str(e),
        )
# ======================
# 3. Gradio Interface
# ======================
# Single-function UI: a three-line text box feeds sentiment_analysis and the
# returned dict is rendered as JSON.
demo = gr.Interface(
    title="Naive Bayes Sentiment Analysis (scikit-learn)",
    description=(
        "Sentiment analysis using a memory-efficient Naive Bayes classifier. <br> "
        "Enter text to get its predicted sentiment and probability distribution."
    ),
    fn=sentiment_analysis,
    inputs=gr.Textbox(lines=3, placeholder="Enter text to analyze..."),
    outputs=gr.JSON(label="Analysis Results"),
)
# ======================
# 4. Launch
# ======================
if __name__ == "__main__":
    # Start the server only when run as a script; 0.0.0.0 listens on all
    # network interfaces instead of just localhost.
    demo.launch(server_name="0.0.0.0")