Spaces:

Alamgirapi
/

NoCodeTextClassifier

Sleeping

App Files Files Community

NoCodeTextClassifier / app.py

Alamgirapi

Update app.py

1ce2ad9 verified 6 months ago

raw

history blame

24.5 kB

	import streamlit as st
	import pandas as pd
	import matplotlib.pyplot as plt
	import numpy as np
	from NoCodeTextClassifier.EDA import Informations, Visualizations
	from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
	from NoCodeTextClassifier.preprocessing import process, TextCleaner, Vectorization
	from NoCodeTextClassifier.models import Models
	import os
	import pickle
	import hashlib
	import hmac
	from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

	# Authentication Configuration
	# Login table: username -> password, compared verbatim by check_password().
	# NOTE(review): these are plain-text demo credentials kept in source control.
	USERS = dict(
	    admin="admin123",
	    user1="password123",
	    demo="demo123",
	)

	def check_password():
	    """Show the login form and report whether this session is authenticated.

	    Returns:
	        bool: True when ``st.session_state["password_correct"]`` is already
	        truthy. False while the login form is displayed. After a successful
	        login the script is rerun, so the next pass returns True.
	    """

	    def password_entered():
	        """Check the submitted credentials against USERS and record the result."""
	        username = st.session_state.get("username", "")
	        password = st.session_state.get("password", "")
	        expected = USERS.get(username)

	        # hmac.compare_digest() raises TypeError for non-ASCII str operands,
	        # so both sides are compared as UTF-8 bytes instead.
	        if expected is not None and hmac.compare_digest(
	            expected.encode("utf-8"), password.encode("utf-8")
	        ):
	            st.session_state["password_correct"] = True
	            st.session_state["authenticated_user"] = username
	            if "password" in st.session_state:
	                del st.session_state["password"]  # Don't store passwords
	        else:
	            st.session_state["password_correct"] = False

	    # Return True if password is validated
	    if st.session_state.get("password_correct", False):
	        return True

	    # Show login form
	    st.markdown("## 🔐 Login Required")
	    st.markdown("Please enter your credentials to access the Text Classification App")

	    _left, center, _right = st.columns([1, 2, 1])
	    with center:
	        st.text_input("Username", key="username", placeholder="Enter username")
	        st.text_input("Password", type="password", key="password", placeholder="Enter password")

	        if st.button("Login", use_container_width=True):
	            password_entered()
	            if st.session_state.get("password_correct", False):
	                # Rerun immediately; otherwise this pass would still return
	                # False and the app would only appear after another click.
	                st.rerun()

	        # Show demo credentials
	        with st.expander("Demo Credentials"):
	            st.info("""
	Demo Account:
	- Username: `demo`
	- Password: `demo123`

	Admin Account:
	- Username: `admin`
	- Password: `admin123`
	""")

	    # Only complain after an actual failed attempt: the key is absent on the
	    # first page load, so a missing value must not count as a failure.
	    if st.session_state.get("password_correct") is False:
	        st.error("😞 Username or password incorrect")

	    return False

	# Utility functions
	def save_artifacts(obj, folder_name, file_name):
	    """Pickle ``obj`` into ``folder_name/file_name``, creating the folder if missing.

	    Returns:
	        bool: True when the file was written. False when any exception was
	        raised, in which case the error is shown through ``st.error``.
	    """
	    try:
	        os.makedirs(folder_name, exist_ok=True)
	        destination = os.path.join(folder_name, file_name)
	        with open(destination, 'wb') as handle:
	            pickle.dump(obj, handle)
	    except Exception as e:
	        st.error(f"Error saving {file_name}: {str(e)}")
	        return False
	    else:
	        return True

	def load_artifacts(folder_name, file_name):
	    """Unpickle and return the object stored at ``folder_name/file_name``.

	    Returns:
	        The unpickled object, or None when the file is missing (reported via
	        ``st.warning``) or any other exception occurs (reported via ``st.error``).
	    """
	    loaded = None
	    try:
	        artifact_path = os.path.join(folder_name, file_name)
	        with open(artifact_path, 'rb') as handle:
	            loaded = pickle.load(handle)
	    except FileNotFoundError:
	        st.warning(f"File {file_name} not found in {folder_name} folder")
	    except Exception as e:
	        st.error(f"Error loading {file_name}: {str(e)}")
	    return loaded

	def load_model(model_name):
	    """Unpickle and return the model stored as ``models/<model_name>``.

	    Returns:
	        The unpickled model, or None when the file does not exist or cannot be
	        loaded; both failures are reported through ``st.error``.
	    """
	    model = None
	    try:
	        model_path = os.path.join('models', model_name)
	        with open(model_path, 'rb') as handle:
	            model = pickle.load(handle)
	    except FileNotFoundError:
	        st.error(f"Model {model_name} not found. Please train a model first.")
	    except Exception as e:
	        st.error(f"Error loading model: {str(e)}")
	    return model

	def safe_file_upload(uploaded_file, encoding='utf-8'):
	    """Read an uploaded CSV into a DataFrame, falling back across encodings.

	    Args:
	        uploaded_file: Seekable file-like object (or None) passed to
	            ``pd.read_csv``.
	        encoding: Encoding tried first; 'latin1', 'cp1252' and 'iso-8859-1'
	            are then tried in that order.

	    Returns:
	        The DataFrame from the first encoding that reads successfully, or
	        None when ``uploaded_file`` is None or every attempt fails.
	    """
	    import codecs  # local import: only needed to normalise encoding aliases

	    if uploaded_file is None:
	        return None

	    # 'latin1' and 'iso-8859-1' are aliases of one codec, and the caller's
	    # choice may repeat a fallback. Try each distinct codec once so the same
	    # read (and the same error message) is not repeated.
	    encodings_to_try = []
	    seen_codecs = set()
	    for name in (encoding, 'latin1', 'cp1252', 'iso-8859-1'):
	        try:
	            codec_key = codecs.lookup(name).name
	        except (LookupError, TypeError):
	            codec_key = name  # unknown name: let read_csv report the problem
	        if codec_key not in seen_codecs:
	            seen_codecs.add(codec_key)
	            encodings_to_try.append(name)

	    for enc in encodings_to_try:
	        try:
	            # Reset file pointer: a failed attempt may have consumed the stream
	            uploaded_file.seek(0)
	            df = pd.read_csv(uploaded_file, encoding=enc)
	            st.success(f"File loaded successfully with {enc} encoding")
	            return df
	        except UnicodeDecodeError:
	            # Wrong encoding guess: move on to the next candidate quietly
	            continue
	        except Exception as e:
	            st.error(f"Error reading file with {enc}: {str(e)}")
	            continue

	    st.error("Could not read file with any common encoding. Please check your file format.")
	    return None

	def predict_text(model_name, text, vectorizer_type="tfidf"):
	    """Classify one text with a saved model, vectorizer and label encoder.

	    Args:
	        model_name: File name passed to ``load_model``.
	        text: Raw text; it is cleaned with ``TextCleaner.clean_text`` first.
	        vectorizer_type: Prefix of the vectorizer artifact, loaded from
	            ``artifacts/<vectorizer_type>_vectorizer.pkl``.

	    Returns:
	        tuple: ``(predicted_label, prediction_proba)``. ``prediction_proba`` is
	        None when the model has no ``predict_proba`` or that call fails. Both
	        items are None when an artifact is missing or any step raises (the
	        error is shown through ``st.error``).
	    """
	    try:
	        # Load model
	        model = load_model(model_name)
	        if model is None:
	            return None, None

	        # Load vectorizer
	        vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
	        vectorizer = load_artifacts("artifacts", vectorizer_file)
	        if vectorizer is None:
	            return None, None

	        # Load label encoder
	        encoder = load_artifacts("artifacts", "encoder.pkl")
	        if encoder is None:
	            return None, None

	        # Clean and vectorize text
	        text_cleaner = TextCleaner()
	        clean_text = text_cleaner.clean_text(text)

	        # Transform text using the saved (already fitted) vectorizer
	        text_vector = vectorizer.transform([clean_text])

	        # Make prediction
	        prediction = model.predict(text_vector)
	        prediction_proba = None

	        # Probabilities are best-effort: some estimators expose predict_proba
	        # but still raise when it is called.
	        if hasattr(model, 'predict_proba'):
	            try:
	                prediction_proba = model.predict_proba(text_vector)[0]
	            except Exception:
	                # Narrowed from a bare ``except`` so BaseException-derived
	                # signals (KeyboardInterrupt, SystemExit, ...) still propagate.
	                prediction_proba = None

	        # Decode prediction back to the original label
	        predicted_label = encoder.inverse_transform(prediction)[0]

	        return predicted_label, prediction_proba

	    except Exception as e:
	        st.error(f"Error during prediction: {str(e)}")
	        return None, None

	# Main App Logic
	def main_app():
	    """Render the authenticated app: header, sidebar uploads and the chosen section.

	    The sidebar collects the training CSV (plus an optional test CSV), the
	    text and target columns, and the section to show. When distinct columns
	    are chosen the training frame gains ``clean_text``, ``text_length`` and
	    an encoded ``target`` column, and the fitted label encoder is saved to
	    ``artifacts/encoder.pkl``. Section rendering is delegated to the private
	    ``_render_*`` helpers defined below.
	    """
	    # Header with user info
	    title_col, user_col = st.columns([3, 1])
	    with title_col:
	        st.title('🤖 No Code Text Classification App')
	        st.write('Understand the behavior of your text data and train a model to classify the text data')
	    with user_col:
	        st.markdown(f"👤 User: {st.session_state.get('authenticated_user', 'Unknown')}")
	        if st.button("Logout", type="secondary"):
	            # Clear the whole session so the login form is shown again
	            for key in list(st.session_state.keys()):
	                del st.session_state[key]
	            st.rerun()

	    # Sidebar
	    section = st.sidebar.radio("Choose Section", ["📊 Data Analysis", "🚀 Train Model", "🔮 Predictions"])

	    # Upload Data with improved error handling
	    st.sidebar.subheader("📁 Upload Your Dataset")

	    # File encoding selection
	    encoding_choice = st.sidebar.selectbox(
	        "File Encoding",
	        ["utf-8", "latin1", "cp1252", "iso-8859-1"],
	        help="If file upload fails, try different encodings"
	    )

	    train_data = st.sidebar.file_uploader(
	        "Upload training data",
	        type=["csv"],
	        help="Upload a CSV file with your training data"
	    )

	    test_data = st.sidebar.file_uploader(
	        "Upload test data (optional)",
	        type=["csv"],
	        help="Optional: Upload separate test data"
	    )

	    # Remember which vectorizer was last used so predictions load the same one
	    if 'vectorizer_type' not in st.session_state:
	        st.session_state.vectorizer_type = "tfidf"

	    train_df = None
	    test_df = None  # loaded for the user's feedback only; not used further in this function
	    info = None
	    text_data = None
	    target = None

	    if train_data is not None:
	        with st.spinner("Loading training data..."):
	            train_df = safe_file_upload(train_data, encoding_choice)

	        if train_df is not None:
	            try:
	                if test_data is not None:
	                    test_df = safe_file_upload(test_data, encoding_choice)

	                st.sidebar.success(f"✅ Training data loaded: {train_df.shape[0]} rows, {train_df.shape[1]} columns")
	                st.write("📋 Training Data Preview:")
	                st.dataframe(train_df.head(3), use_container_width=True)

	                columns = train_df.columns.tolist()
	                text_data = st.sidebar.selectbox("📝 Choose the text column:", columns)
	                target = st.sidebar.selectbox("🎯 Choose the target column:", columns)

	                # Process data
	                if text_data and target and text_data != target:
	                    with st.spinner("Processing data..."):
	                        info = Informations(train_df, text_data, target)
	                        train_df['clean_text'] = info.clean_text()
	                        train_df['text_length'] = info.text_length()

	                        # Encode labels here so the fitted encoder can be saved
	                        from sklearn.preprocessing import LabelEncoder
	                        label_encoder = LabelEncoder()
	                        train_df['target'] = label_encoder.fit_transform(train_df[target])

	                        # Save label encoder for later use
	                        if save_artifacts(label_encoder, "artifacts", "encoder.pkl"):
	                            st.sidebar.success("✅ Data processed successfully")
	                else:
	                    st.sidebar.warning("Please select different columns for text and target")

	            except Exception as e:
	                st.error(f"❌ Error processing data: {str(e)}")
	                train_df = None
	                info = None

	    if section == "📊 Data Analysis":
	        st.header("📊 Data Analysis & Insights")
	        if train_df is not None and info is not None:
	            _render_data_analysis(train_df, info, text_data, target)
	        else:
	            st.info("👆 Please upload training data in the sidebar to get insights")

	    elif section == "🚀 Train Model":
	        st.header("🚀 Train Classification Model")
	        if train_df is None:
	            st.info("👆 Please upload training data in the sidebar to train a model")
	        elif info is None:
	            # Columns were never processed, so 'clean_text' and 'target' do not
	            # exist yet; training would fail with a KeyError.
	            st.warning("⚠️ Please select different text and target columns in the sidebar before training")
	        else:
	            _render_model_training(train_df)

	    elif section == "🔮 Predictions":
	        st.header("🔮 Text Classification Predictions")
	        # Check if models exist
	        if os.path.exists("models") and os.listdir("models"):
	            single_tab, batch_tab = st.tabs(["🎯 Single Prediction", "📊 Batch Predictions"])
	            with single_tab:
	                _render_single_prediction()
	            with batch_tab:
	                _render_batch_prediction()
	        else:
	            st.info("⚠️ No trained models found. Please go to 'Train Model' section to train a model first.")


	def _render_data_analysis(train_df, info, text_data, target):
	    """Draw the Data Analysis tabs: basic stats, text-length analysis and charts."""
	    try:
	        # Create tabs for better organization
	        stats_tab, text_tab, charts_tab = st.tabs(["📈 Basic Stats", "📝 Text Analysis", "📊 Visualizations"])

	        with stats_tab:
	            shape_col, balance_col, missing_col = st.columns(3)

	            with shape_col:
	                st.metric("📊 Data Shape", f"{info.shape()[0]} x {info.shape()[1]}")

	            with balance_col:
	                imbalance_info = info.class_imbalanced()
	                st.metric("⚖️ Class Balance", "Balanced" if not imbalance_info else "Imbalanced")

	            with missing_col:
	                missing_info = info.missing_values()
	                # Only a dict result is summed; any other return type is shown as 0
	                total_missing = sum(missing_info.values()) if isinstance(missing_info, dict) else 0
	                st.metric("❌ Missing Values", str(total_missing))

	            st.subheader("📋 Processed Data Preview")
	            st.dataframe(train_df[['clean_text', 'text_length', 'target']].head(), use_container_width=True)

	        with text_tab:
	            st.subheader("📏 Text Length Analysis")
	            text_analysis = info.analysis_text_length('text_length')

	            # Display stats in a nice format
	            stats_col1, stats_col2 = st.columns(2)
	            with stats_col1:
	                st.json(text_analysis)

	            with stats_col2:
	                correlation = train_df[['text_length', 'target']].corr().iloc[0, 1]
	                st.metric("🔗 Text Length-Target Correlation", f"{correlation:.4f}")

	        with charts_tab:
	            st.subheader("📊 Data Visualizations")
	            vis = Visualizations(train_df, text_data, target)

	            class_col, length_col = st.columns(2)
	            with class_col:
	                st.write("Class Distribution")
	                vis.class_distribution()

	            with length_col:
	                st.write("Text Length Distribution")
	                vis.text_length_distribution()

	    except Exception as e:
	        st.error(f"❌ Error in data analysis: {str(e)}")


	def _render_model_training(train_df):
	    """Draw the Train Model section: choose model and vectorizer, then train on demand."""
	    # Radio label -> name of the ``Models`` method that trains that algorithm.
	    # The radio options are derived from this table so the two cannot drift apart.
	    trainer_methods = {
	        "Logistic Regression": "LogisticRegression",
	        "Decision Tree": "DecisionTree",
	        "Random Forest": "RandomForestClassifier",
	        "Linear SVC": "LinearSVC",
	        "SVC": "SVC",
	        "Multinomial Naive Bayes": "MultinomialNB",
	        "Gaussian Naive Bayes": "GaussianNB",
	    }

	    try:
	        # Create two columns for model selection
	        model_col, vectorizer_col = st.columns(2)

	        with model_col:
	            st.subheader("🤖 Choose Model")
	            model = st.radio("Select Algorithm:", list(trainer_methods))

	        with vectorizer_col:
	            st.subheader("🔤 Choose Vectorizer")
	            vectorizer_choice = st.radio("Select Vectorizer:", ["Tfidf Vectorizer", "Count Vectorizer"])

	        # Initialize vectorizer
	        if vectorizer_choice == "Tfidf Vectorizer":
	            vectorizer = TfidfVectorizer(max_features=10000)
	            st.session_state.vectorizer_type = "tfidf"
	        else:
	            vectorizer = CountVectorizer(max_features=10000)
	            st.session_state.vectorizer_type = "count"

	        st.subheader("📋 Training Data Preview")
	        st.dataframe(train_df[['clean_text', 'target']].head(3), use_container_width=True)

	        # Vectorize text data (runs on every script rerun, not only on "Start Training")
	        with st.spinner("Preparing data..."):
	            X = vectorizer.fit_transform(train_df['clean_text'])
	            y = train_df['target']

	        # Split data
	        X_train, X_test, y_train, y_test = process.split_data(X, y)
	        st.success(f"✅ Data prepared - Train: {X_train.shape}, Test: {X_test.shape}")

	        # Save vectorizer for later use
	        vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
	        save_artifacts(vectorizer, "artifacts", vectorizer_filename)

	        if st.button("🚀 Start Training", type="primary", use_container_width=True):
	            progress_bar = st.progress(0)
	            status_text = st.empty()

	            with st.spinner(f"Training {model} model..."):
	                status_text.text("Initializing model...")
	                progress_bar.progress(20)

	                models = Models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

	                status_text.text("Training in progress...")
	                progress_bar.progress(50)

	                # Train selected model
	                method_name = trainer_methods.get(model)
	                if method_name is not None:
	                    getattr(models, method_name)()

	                progress_bar.progress(100)
	                status_text.text("Training completed!")

	            st.success("🎉 Model training completed successfully!")
	            st.balloons()
	            st.info("💡 You can now use the 'Predictions' section to classify new text.")

	    except Exception as e:
	        st.error(f"❌ Error in model training: {str(e)}")
	        st.exception(e)


	def _render_single_prediction():
	    """Draw the Single Prediction tab: classify one piece of typed text."""
	    st.subheader("🎯 Classify Single Text")

	    # Text input for prediction
	    text_input = st.text_area("Enter the text to classify:", height=100, placeholder="Type or paste your text here...")

	    # Model selection
	    available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
	    if not available_models:
	        st.warning("⚠️ No trained models found. Please train a model first.")
	        return

	    selected_model = st.selectbox("🤖 Choose the trained model:", available_models)

	    # Prediction button
	    if not st.button("🔮 Predict", key="single_predict", type="primary"):
	        return

	    if not text_input.strip():
	        st.warning("⚠️ Please enter some text to classify")
	        return

	    with st.spinner("Making prediction..."):
	        predicted_label, prediction_proba = predict_text(
	            selected_model,
	            text_input,
	            st.session_state.get('vectorizer_type', 'tfidf')
	        )

	    if predicted_label is None:
	        # predict_text() has already reported the failure
	        return

	    st.success("🎉 Prediction completed!")

	    # Display results
	    st.markdown("### 📋 Prediction Results")

	    # Create result container
	    result_container = st.container()
	    with result_container:
	        st.markdown(f"📝 Input Text: {text_input}")
	        st.markdown(f"🏷️ Predicted Class: `{predicted_label}`")

	        # Display probabilities if available
	        if prediction_proba is not None:
	            st.markdown("📊 Class Probabilities:")

	            # Load encoder to get class names
	            encoder = load_artifacts("artifacts", "encoder.pkl")
	            if encoder is not None:
	                classes = encoder.classes_
	                prob_df = pd.DataFrame({
	                    'Class': classes,
	                    'Probability': prediction_proba
	                }).sort_values('Probability', ascending=False)

	                st.bar_chart(prob_df.set_index('Class'))
	                st.dataframe(prob_df, use_container_width=True)


	def _render_batch_prediction():
	    """Draw the Batch Predictions tab: classify every row of an uploaded CSV."""
	    st.subheader("📊 Batch Classification")

	    uploaded_file = st.file_uploader(
	        "Upload a CSV file with text to classify",
	        type=['csv'],
	        help="Upload a CSV file containing text data for batch classification"
	    )
	    if uploaded_file is None:
	        return

	    try:
	        batch_df = safe_file_upload(uploaded_file)
	        if batch_df is None:
	            return

	        st.write("📋 Uploaded data preview:")
	        st.dataframe(batch_df.head(), use_container_width=True)

	        # Select text column
	        text_column = st.selectbox("📝 Select the text column:", batch_df.columns.tolist())

	        available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
	        if not available_models:
	            # 'models' may hold only non-.pkl files; without this guard the
	            # selectbox yields None and every row would be reported as "Error".
	            st.warning("⚠️ No trained models found. Please train a model first.")
	            return
	        batch_model = st.selectbox("🤖 Choose model for batch prediction:", available_models, key="batch_model")

	        if st.button("🚀 Run Batch Predictions", key="batch_predict", type="primary"):
	            progress_bar = st.progress(0)
	            status_text = st.empty()

	            with st.spinner("Processing batch predictions..."):
	                predictions = []
	                total_texts = len(batch_df)

	                for i, text in enumerate(batch_df[text_column]):
	                    status_text.text(f"Processing {i+1}/{total_texts} texts...")
	                    progress_bar.progress((i+1)/total_texts)

	                    pred, _ = predict_text(
	                        batch_model,
	                        str(text),
	                        st.session_state.get('vectorizer_type', 'tfidf')
	                    )
	                    predictions.append(pred if pred is not None else "Error")

	                batch_df['Predicted_Class'] = predictions

	            st.success("🎉 Batch predictions completed!")
	            st.write("📊 Results:")
	            st.dataframe(batch_df[[text_column, 'Predicted_Class']], use_container_width=True)

	            # Download results
	            csv = batch_df.to_csv(index=False)
	            st.download_button(
	                label="📥 Download predictions as CSV",
	                data=csv,
	                file_name="batch_predictions.csv",
	                mime="text/csv",
	                type="primary"
	            )
	    except Exception as e:
	        st.error(f"❌ Error in batch prediction: {str(e)}")

	# Main execution
	def main():
	    """Configure the page, inject the custom CSS, then show the app once logged in."""
	    # Page config
	    page_settings = {
	        "page_title": "Text Classification App",
	        "page_icon": "🤖",
	        "layout": "wide",
	        "initial_sidebar_state": "expanded",
	    }
	    st.set_page_config(**page_settings)

	    # Custom CSS for better styling
	    custom_css = """
	<style>
	.main {
	padding-top: 1rem;
	}
	.stAlert {
	margin-top: 1rem;
	}
	.metric-container {
	background-color: #f0f2f6;
	padding: 1rem;
	border-radius: 0.5rem;
	margin: 0.5rem 0;
	}
	</style>
	"""
	    st.markdown(custom_css, unsafe_allow_html=True)

	    # Check authentication: the app body is only rendered for a logged-in session
	    if not check_password():
	        return
	    main_app()

	# Script entry point: start the app only when this file is executed directly,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()