Alamgirapi's picture
Update app.py
0a50c6f verified
raw
history blame
20.6 kB
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import os
import pickle
import re
import string
from collections import Counter
# Set page config
st.set_page_config(page_title="Text Classification App", page_icon="๐Ÿ“Š", layout="wide")
# Custom CSS for better styling
st.markdown("""
<style>
.main-header {
font-size: 2.5rem;
color: #1f77b4;
text-align: center;
margin-bottom: 2rem;
}
.section-header {
font-size: 1.8rem;
color: #ff7f0e;
border-bottom: 2px solid #ff7f0e;
padding-bottom: 0.5rem;
}
</style>
""", unsafe_allow_html=True)
# Utility functions
def clean_text(text):
    """Normalize raw text: lowercase, keep only ASCII letters and spaces,
    collapse runs of whitespace, and trim. NaN/None becomes "".
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.strip()
def save_artifacts(obj, folder_name, file_name):
    """Pickle *obj* to folder_name/file_name, creating the folder if needed.

    Returns True on success; on failure shows a Streamlit error and
    returns False.
    """
    destination = os.path.join(folder_name, file_name)
    try:
        os.makedirs(folder_name, exist_ok=True)
        with open(destination, 'wb') as handle:
            pickle.dump(obj, handle)
    except Exception as e:
        st.error(f"Error saving {file_name}: {str(e)}")
        return False
    return True
def load_artifacts(folder_name, file_name):
    """Unpickle and return folder_name/file_name.

    Returns None (and shows a Streamlit error) when the file is missing
    or cannot be read.
    """
    path = os.path.join(folder_name, file_name)
    try:
        with open(path, 'rb') as handle:
            return pickle.load(handle)
    except FileNotFoundError:
        st.error(f"File {file_name} not found in {folder_name} folder")
    except Exception as e:
        st.error(f"Error loading {file_name}: {str(e)}")
    return None
def analyze_data(df, text_col, target_col):
    """Summarize a labeled text dataframe into a plain dict.

    Side effect: writes/overwrites a 'text_length' column on *df*.
    Note: 'shape'/'columns'/'missing_values' are captured BEFORE the
    'text_length' column is added, matching the original ordering.
    """
    summary = {
        'shape': df.shape,
        'columns': df.columns.tolist(),
        'missing_values': df.isnull().sum().to_dict(),
    }
    # Text analysis (adds the helper column used by the stats below)
    df['text_length'] = df[text_col].astype(str).apply(len)
    summary['avg_text_length'] = df['text_length'].mean()
    summary['text_length_stats'] = df['text_length'].describe().to_dict()
    # Target analysis
    summary['class_distribution'] = df[target_col].value_counts().to_dict()
    summary['num_classes'] = df[target_col].nunique()
    return summary
def create_visualizations(df, text_col, target_col):
    """Build a 2x2 matplotlib figure summarizing the dataset.

    Panels: class-distribution bar chart, text-length histogram,
    text-length-by-class box plot, and — for numeric targets only —
    a correlation heatmap. Assumes *df* already carries a
    'text_length' column (added by analyze_data / the main script).

    Returns the matplotlib Figure.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    # Class distribution
    class_counts = df[target_col].value_counts()
    axes[0, 0].bar(class_counts.index, class_counts.values)
    axes[0, 0].set_title('Class Distribution')
    axes[0, 0].set_xlabel('Classes')
    axes[0, 0].set_ylabel('Count')
    plt.setp(axes[0, 0].get_xticklabels(), rotation=45, ha='right')
    # Text length distribution
    axes[0, 1].hist(df['text_length'], bins=30, alpha=0.7)
    axes[0, 1].set_title('Text Length Distribution')
    axes[0, 1].set_xlabel('Text Length')
    axes[0, 1].set_ylabel('Frequency')
    # Box plot of text length by class
    df.boxplot(column='text_length', by=target_col, ax=axes[1, 0])
    axes[1, 0].set_title('Text Length by Class')
    axes[1, 0].set_xlabel('Class')
    axes[1, 0].set_ylabel('Text Length')
    # Correlation heatmap only makes sense for a numeric target.
    # Bug fix: the old condition also admitted categorical targets with
    # fewer than 10 unique values, which made .corr() fail on strings.
    if pd.api.types.is_numeric_dtype(df[target_col]):
        correlation = df[['text_length', target_col]].corr()
        sns.heatmap(correlation, annot=True, ax=axes[1, 1], cmap='coolwarm')
        axes[1, 1].set_title('Correlation Matrix')
    else:
        axes[1, 1].text(0.5, 0.5, 'Correlation not applicable\nfor categorical target',
                        ha='center', va='center', transform=axes[1, 1].transAxes)
        axes[1, 1].set_title('Correlation Analysis')
    plt.tight_layout()
    return fig
def train_model(model_name, X_train, X_test, y_train, y_test):
    """Fit the requested classifier, evaluate it, and persist it.

    Returns (model, accuracy, report_dict) on success, or
    (None, None, None) when *model_name* is not recognized. The fitted
    model is pickled under models/ via save_artifacts.
    """
    candidates = {
        "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
        "Linear SVC": LinearSVC(random_state=42, max_iter=1000),
        "SVC": SVC(random_state=42, probability=True),
        "Multinomial Naive Bayes": MultinomialNB(),
        "Gaussian Naive Bayes": GaussianNB()
    }
    clf = candidates.get(model_name)
    if clf is None:
        return None, None, None
    # GaussianNB cannot consume sparse matrices, so densify for it only.
    needs_dense = model_name == "Gaussian Naive Bayes"
    train_features = X_train.toarray() if needs_dense else X_train
    test_features = X_test.toarray() if needs_dense else X_test
    # Fit and evaluate
    clf.fit(train_features, y_train)
    predictions = clf.predict(test_features)
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions, output_dict=True)
    # Persist for the Predictions section
    os.makedirs("models", exist_ok=True)
    model_filename = f"{model_name.lower().replace(' ', '_')}_model.pkl"
    save_artifacts(clf, "models", model_filename)
    return clf, accuracy, report
def predict_text(model_name, text, vectorizer_type="tfidf"):
    """Classify *text* with a previously trained, pickled model.

    Loads the model from models/, plus the matching vectorizer and the
    label encoder from artifacts/. Returns (predicted_label, probabilities)
    where probabilities is None if the model cannot provide them; returns
    (None, None) on any failure (errors are surfaced via Streamlit).
    """
    try:
        # Load model
        model_filename = f"{model_name.lower().replace(' ', '_')}_model.pkl"
        model = load_artifacts("models", model_filename)
        if model is None:
            return None, None
        # Load vectorizer (must match the one used at training time)
        vectorizer_filename = f"{vectorizer_type}_vectorizer.pkl"
        vectorizer = load_artifacts("artifacts", vectorizer_filename)
        if vectorizer is None:
            return None, None
        # Load label encoder
        encoder = load_artifacts("artifacts", "label_encoder.pkl")
        if encoder is None:
            return None, None
        # Clean and vectorize text
        clean_text_input = clean_text(text)
        text_vector = vectorizer.transform([clean_text_input])
        # GaussianNB was trained on dense arrays, so predict on dense too.
        if "gaussian" in model_name.lower():
            text_vector = text_vector.toarray()
        # Make prediction
        prediction = model.predict(text_vector)
        prediction_proba = None
        # Get prediction probabilities if available.
        # Bug fix: the original branched on "gaussian" here with two
        # identical branches; text_vector is already densified above,
        # so a single call suffices.
        if hasattr(model, 'predict_proba'):
            try:
                prediction_proba = model.predict_proba(text_vector)[0]
            except Exception as e:
                st.warning(f"Could not get prediction probabilities: {str(e)}")
        # Decode prediction back to the original label
        predicted_label = encoder.inverse_transform(prediction)[0]
        return predicted_label, prediction_proba
    except Exception as e:
        st.error(f"Error during prediction: {str(e)}")
        return None, None
# Main App
st.markdown('<h1 class="main-header">๐Ÿ“Š No Code Text Classification App</h1>', unsafe_allow_html=True)
st.markdown("### Analyze your text data and train machine learning models without coding!")
# Initialize session state
# vectorizer_type remembers which vectorizer the last training run fitted
# ("tfidf" or "count") so the Predictions section loads the matching pickle.
if 'vectorizer_type' not in st.session_state:
    st.session_state.vectorizer_type = "tfidf"
# trained_models accumulates the names of models trained this session.
if 'trained_models' not in st.session_state:
    st.session_state.trained_models = []
# Sidebar
st.sidebar.markdown("## ๐Ÿ“ Upload Your Dataset")
# File upload with better error handling
try:
    uploaded_file = st.sidebar.file_uploader(
        "Choose a CSV file",
        type="csv",
        help="Upload your training dataset (CSV format)"
    )
    # Encoding selection (also reused by the batch-prediction uploader)
    encoding = st.sidebar.selectbox(
        "Select file encoding",
        ["utf-8", "latin1", "iso-8859-1", "cp1252"],
        help="Try different encodings if you get reading errors"
    )
except Exception as e:
    st.sidebar.error(f"File upload error: {str(e)}")
    uploaded_file = None
# Navigation
section = st.sidebar.radio(
    "Choose Section",
    ["๐Ÿ“Š Data Analysis", "๐Ÿค– Train Model", "๐Ÿ”ฎ Predictions"],
    help="Navigate through different sections of the app"
)
# Main content based on section
if uploaded_file is not None:
    try:
        # Load data with selected encoding
        df = pd.read_csv(uploaded_file, encoding=encoding)
        st.sidebar.success(f"โœ… Data loaded successfully! Shape: {df.shape}")
        # Column selection
        columns = df.columns.tolist()
        text_column = st.sidebar.selectbox("๐Ÿ“ Select text column:", columns)
        target_column = st.sidebar.selectbox("๐ŸŽฏ Select target column:", columns)
        # Data preprocessing: derived columns consumed by the sections below
        df['clean_text'] = df[text_column].apply(clean_text)
        df['text_length'] = df[text_column].astype(str).apply(len)
        # Process target column: fit the encoder on every rerun and persist it
        # so predict_text can decode labels later.
        label_encoder = LabelEncoder()
        df['encoded_target'] = label_encoder.fit_transform(df[target_column])
        save_artifacts(label_encoder, "artifacts", "label_encoder.pkl")
    except Exception as e:
        st.error(f"โŒ Error loading data: {str(e)}")
        st.info("๐Ÿ’ก Try selecting a different encoding from the sidebar.")
        # Sentinel checked by the section guards below
        df = None
# Section: Data Analysis
if section == "๐Ÿ“Š Data Analysis":
    # Short-circuit: 'df' only exists when a file was uploaded, so the
    # uploaded_file check must come first.
    if uploaded_file is not None and df is not None:
        st.markdown('<h2 class="section-header">Data Analysis</h2>', unsafe_allow_html=True)
        # Data overview metrics
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("๐Ÿ“‹ Total Records", df.shape[0])
        with col2:
            st.metric("๐Ÿ“Š Features", df.shape[1])
        with col3:
            st.metric("๐Ÿท๏ธ Classes", df[target_column].nunique())
        # Data preview (first 10 rows)
        st.subheader("๐Ÿ“– Data Preview")
        st.dataframe(df[[text_column, target_column, 'text_length']].head(10))
        # Analysis results (also refreshes df['text_length'] as a side effect)
        analysis = analyze_data(df, text_column, target_column)
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("๐Ÿ“ˆ Text Statistics")
            st.write(f"**Average text length:** {analysis['avg_text_length']:.2f}")
            st.write("**Text length distribution:**")
            st.write(pd.DataFrame([analysis['text_length_stats']]).T)
        with col2:
            st.subheader("๐Ÿท๏ธ Class Distribution")
            class_dist = pd.DataFrame(list(analysis['class_distribution'].items()),
                                      columns=['Class', 'Count'])
            st.dataframe(class_dist)
        # Visualizations (wrapped so a plotting failure doesn't kill the page)
        st.subheader("๐Ÿ“Š Visualizations")
        try:
            fig = create_visualizations(df, text_column, target_column)
            st.pyplot(fig)
        except Exception as e:
            st.error(f"Error creating visualizations: {str(e)}")
    else:
        st.warning("๐Ÿ“ Please upload a dataset to analyze.")
# Section: Train Model
elif section == "๐Ÿค– Train Model":
    if uploaded_file is not None and df is not None:
        st.markdown('<h2 class="section-header">Model Training</h2>', unsafe_allow_html=True)
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("๐Ÿค– Select Model")
            # Names must match the keys in train_model's model dict.
            model_name = st.selectbox(
                "Choose algorithm:",
                ["Logistic Regression", "Decision Tree", "Random Forest",
                 "Linear SVC", "SVC", "Multinomial Naive Bayes", "Gaussian Naive Bayes"]
            )
        with col2:
            st.subheader("๐Ÿ”ค Select Vectorizer")
            vectorizer_choice = st.selectbox(
                "Choose text vectorizer:",
                ["TF-IDF Vectorizer", "Count Vectorizer"]
            )
        # Vectorizer parameters
        max_features = st.slider("Max features", 1000, 50000, 10000)
        test_size = st.slider("Test size", 0.1, 0.5, 0.2)
        if st.button("๐Ÿš€ Start Training", type="primary"):
            with st.spinner("๐Ÿ”„ Training model..."):
                try:
                    # Initialize vectorizer and record its kind in session
                    # state so predict_text loads the matching pickle.
                    if vectorizer_choice == "TF-IDF Vectorizer":
                        vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
                        st.session_state.vectorizer_type = "tfidf"
                    else:
                        vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
                        st.session_state.vectorizer_type = "count"
                    # Vectorize text
                    X = vectorizer.fit_transform(df['clean_text'])
                    y = df['encoded_target']
                    # Split data (stratified so class ratios survive the split)
                    X_train, X_test, y_train, y_test = train_test_split(
                        X, y, test_size=test_size, random_state=42, stratify=y
                    )
                    # Save vectorizer so predictions reuse the same vocabulary
                    vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
                    save_artifacts(vectorizer, "artifacts", vectorizer_filename)
                    # Train model
                    model, accuracy, report = train_model(model_name, X_train, X_test, y_train, y_test)
                    if model is not None:
                        st.success(f"โœ… Model trained successfully!")
                        st.session_state.trained_models.append(model_name)
                        # Display results
                        col1, col2 = st.columns(2)
                        with col1:
                            st.metric("๐ŸŽฏ Accuracy", f"{accuracy:.4f}")
                        with col2:
                            # classification_report adds 'accuracy',
                            # 'macro avg' and 'weighted avg' entries,
                            # hence the -3.
                            st.metric("๐Ÿท๏ธ Classes", len(report) - 3)  # Exclude avg metrics
                        # Detailed metrics
                        st.subheader("๐Ÿ“Š Detailed Metrics")
                        metrics_df = pd.DataFrame(report).transpose()
                        st.dataframe(metrics_df.round(4))
                except Exception as e:
                    st.error(f"โŒ Training failed: {str(e)}")
    else:
        st.warning("๐Ÿ“ Please upload a dataset to train a model.")
# Section: Predictions
elif section == "๐Ÿ”ฎ Predictions":
    st.markdown('<h2 class="section-header">Make Predictions</h2>', unsafe_allow_html=True)
    # Check for trained models persisted to disk by earlier training runs
    if os.path.exists("models") and os.listdir("models"):
        # Reconstruct display names from pickle filenames, e.g.
        # "logistic_regression_model.pkl" -> "Logistic Regression".
        available_models = [f.replace('_model.pkl', '').replace('_', ' ').title()
                            for f in os.listdir("models") if f.endswith('.pkl')]
        if available_models:
            # Single prediction
            st.subheader("๐Ÿ”ฎ Single Text Prediction")
            col1, col2 = st.columns([3, 1])
            with col1:
                text_input = st.text_area(
                    "Enter text to classify:",
                    height=100,
                    placeholder="Type or paste your text here..."
                )
            with col2:
                selected_model = st.selectbox("Select model:", available_models)
            if st.button("๐Ÿ” Predict", type="primary"):
                if text_input.strip():
                    with st.spinner("๐Ÿ”„ Making prediction..."):
                        # vectorizer_type falls back to "tfidf" if no model
                        # was trained in this session.
                        predicted_label, prediction_proba = predict_text(
                            selected_model, text_input, st.session_state.get('vectorizer_type', 'tfidf')
                        )
                        if predicted_label is not None:
                            st.success("โœ… Prediction completed!")
                            # Results
                            st.markdown("### ๐Ÿ“‹ Results")
                            st.info(f"**Predicted Class:** {predicted_label}")
                            # Probabilities (only models with predict_proba)
                            if prediction_proba is not None:
                                encoder = load_artifacts("artifacts", "label_encoder.pkl")
                                if encoder is not None:
                                    classes = encoder.classes_
                                    prob_df = pd.DataFrame({
                                        'Class': classes,
                                        'Probability': prediction_proba
                                    }).sort_values('Probability', ascending=False)
                                    st.markdown("### ๐Ÿ“Š Class Probabilities")
                                    st.bar_chart(prob_df.set_index('Class'))
                else:
                    st.warning("โš ๏ธ Please enter some text to classify.")
            # Batch predictions
            st.markdown("---")
            st.subheader("๐Ÿ“ฆ Batch Predictions")
            batch_file = st.file_uploader("Upload CSV for batch prediction", type=['csv'])
            if batch_file is not None:
                try:
                    # NOTE(review): 'encoding' comes from the sidebar widget;
                    # if the sidebar upload block raised, it may be undefined
                    # here — confirm and consider a fallback.
                    batch_df = pd.read_csv(batch_file, encoding=encoding)
                    st.write("๐Ÿ“– Preview:")
                    st.dataframe(batch_df.head())
                    batch_text_col = st.selectbox("Select text column:", batch_df.columns.tolist())
                    batch_model = st.selectbox("Select model for batch:", available_models, key="batch_model")
                    if st.button("๐Ÿš€ Run Batch Predictions"):
                        with st.spinner("๐Ÿ”„ Processing batch predictions..."):
                            predictions = []
                            progress_bar = st.progress(0)
                            # Each row is cleaned, vectorized, and classified
                            # individually; failures become the "Error" label.
                            for i, text in enumerate(batch_df[batch_text_col]):
                                pred, _ = predict_text(
                                    batch_model, str(text),
                                    st.session_state.get('vectorizer_type', 'tfidf')
                                )
                                predictions.append(pred if pred is not None else "Error")
                                progress_bar.progress((i + 1) / len(batch_df))
                            batch_df['Predicted_Class'] = predictions
                            st.success("โœ… Batch predictions completed!")
                            st.dataframe(batch_df[[batch_text_col, 'Predicted_Class']])
                            # Download option
                            csv = batch_df.to_csv(index=False)
                            st.download_button(
                                "๐Ÿ“ฅ Download Results",
                                csv,
                                "batch_predictions.csv",
                                "text/csv"
                            )
                except Exception as e:
                    st.error(f"โŒ Batch prediction error: {str(e)}")
        else:
            st.warning("โš ๏ธ No trained models found.")
    else:
        st.warning("โš ๏ธ No models available. Please train a model first.")
# Footer
st.markdown("---")
st.markdown("*Built with Streamlit โ€ข Text Classification Made Easy*")