# Hugging Face Space: Naive Bayes sentiment analysis demo (Gradio + scikit-learn).
| import gradio as gr | |
| import pandas as pd | |
| import os | |
| import pickle | |
| import re | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| from sklearn.naive_bayes import MultinomialNB | |
| from sklearn.pipeline import make_pipeline | |
| from sklearn.model_selection import train_test_split | |
# ======================
# Configuration
# ======================

# Where the pickled scikit-learn pipeline is cached between runs.
MODEL_PATH = "sklearn_nb_model.pkl"

# CSV dataset used to train the model when no cached pickle exists.
DATA_PATH = "test.csv"

# Cap on training rows, to keep memory usage modest.
SAMPLE_SIZE = 50000
# ======================
# Helper function for text cleaning (basic)
# ======================
def clean_text(text):
    """Lowercase *text* and keep only ASCII letters separated by single spaces.

    Non-string inputs are stringified first. Every character that is not a
    lowercase letter or whitespace is dropped, runs of whitespace collapse
    to one space, and leading/trailing whitespace is stripped.
    """
    raw = text if isinstance(text, str) else str(text)
    lowered = raw.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    return re.sub(r'\s+', ' ', letters_only).strip()
# ======================
# 1. Train or Load Model
# ======================
if os.path.exists(MODEL_PATH):
    # NOTE(security): pickle.load executes arbitrary code embedded in the
    # file — only load model pickles this app produced itself.
    with open(MODEL_PATH, "rb") as f:
        model = pickle.load(f)
    print("Loaded existing scikit-learn model.")
else:
    # Fix: raise an explicit exception instead of `assert`, which is
    # silently stripped when Python runs with -O.
    if not os.path.exists(DATA_PATH):
        raise FileNotFoundError(f"{DATA_PATH} not found! Upload a CSV dataset.")

    # Load CSV instead of Parquet
    df = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')

    # Sample to reduce memory usage
    if len(df) > SAMPLE_SIZE:
        df = df.sample(n=SAMPLE_SIZE, random_state=42)

    # Normalize column names so the required-column check below is
    # case- and whitespace-insensitive.
    df.columns = [c.lower().strip() for c in df.columns]

    # Check required columns
    if 'text' not in df.columns or 'sentiment' not in df.columns:
        raise ValueError("CSV file must contain 'text' and 'sentiment' columns after column normalization")

    # Clean only the 'text' column; missing values become empty strings.
    df['text'] = df['text'].fillna("").astype(str).apply(clean_text)
    # Labels stay plain strings — MultinomialNB accepts string class labels.
    df['sentiment'] = df['sentiment'].fillna("").astype(str)

    X = df['text']
    y = df['sentiment']

    # Train a simple pipeline: CountVectorizer + MultinomialNB.
    # min_df=2 drops tokens seen in fewer than 2 documents;
    # alpha=1.0 is standard Laplace smoothing.
    model = make_pipeline(CountVectorizer(min_df=2), MultinomialNB(alpha=1.0))
    model.fit(X, y)

    # Save the trained model so subsequent runs skip training.
    with open(MODEL_PATH, "wb") as f:
        pickle.dump(model, f)
    print(f"Model trained on {len(df)} samples and saved!")
# ======================
# 2. Sentiment Analysis Function
# ======================
def sentiment_analysis(text: str) -> dict:
    """Classify *text* with the module-level Naive Bayes `model`.

    Returns a dict containing:
      - "prediction": the predicted class label, or "neutral" for
        unusable input, or "error" if the model call fails;
      - "probabilities": mapping of class label -> probability rounded
        to 4 decimals (a fixed {"pos", "neg"} placeholder on the
        fallback paths);
      - one of "cleaned_input" (success), "message" (fallbacks), or
        "error_details" + "message" (failure).
    """
    # Guard: empty or too-short input cannot be classified meaningfully.
    if not text or len(text.strip()) < 2:
        return {
            "prediction": "neutral",
            "probabilities": {"pos": 0.5, "neg": 0.5},
            "message": "Input too short or empty for meaningful analysis. Defaulting to neutral."
        }

    cleaned_text = clean_text(text)
    # Guard: cleaning can erase everything (e.g. digits/punctuation only).
    # This check cannot raise, so it lives outside the try block.
    if not cleaned_text:
        return {
            "prediction": "neutral",
            "probabilities": {"pos": 0.5, "neg": 0.5},
            "message": "Input became empty after cleaning. Defaulting to neutral."
        }

    try:
        pred = model.predict([cleaned_text])[0]
        probs = model.predict_proba([cleaned_text])[0]
        # Pair each class label with its (rounded) probability.
        prob_dict = {
            cls: round(float(p), 4) for cls, p in zip(model.classes_, probs)
        }
        # Fix: removed the dead `sum(prob_dict.values()) == 0` branch —
        # predict_proba rows always sum to 1 so it could never trigger,
        # and its fallback used "pos"/"neg" keys that need not match
        # model.classes_.
        return {
            "prediction": pred,
            "probabilities": prob_dict,
            "cleaned_input": cleaned_text
        }
    except Exception as e:
        # UI boundary: report the failure to the caller instead of crashing.
        return {
            "prediction": "error",
            "probabilities": {"pos": 0.5, "neg": 0.5},
            "error_details": str(e),
            "message": "An error occurred during analysis."
        }
# ======================
# 3. Gradio Interface
# ======================

# User-facing blurb shown above the input box (supports inline HTML).
APP_DESCRIPTION = (
    "Sentiment analysis using a memory-efficient Naive Bayes classifier. <br> "
    "Enter text to get its predicted sentiment and probability distribution."
)

demo = gr.Interface(
    title="Naive Bayes Sentiment Analysis (scikit-learn)",
    description=APP_DESCRIPTION,
    fn=sentiment_analysis,
    inputs=gr.Textbox(placeholder="Enter text to analyze...", lines=3),
    outputs=gr.JSON(label="Analysis Results"),
)
# ======================
# 4. Launch
# ======================
if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable from outside the
    # container (required for hosted Spaces / Docker deployments).
    demo.launch(server_name="0.0.0.0")