Alamgirapi's picture
Update app.py
5ba4816 verified
raw
history blame
21.8 kB
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import os
import pickle
import tempfile
import re
import string
from collections import Counter
# Text Cleaning Class (replacing the custom module)
class TextCleaner:
    """Normalizes raw text: lowercase, letters-only, single-spaced."""

    def clean_text(self, text):
        """Return a cleaned copy of *text*; missing values (NaN/None) become ''.

        Steps: lowercase -> drop every character outside [a-zA-Z] and
        whitespace -> collapse runs of whitespace into single spaces.
        """
        if pd.isna(text):
            return ""
        lowered = str(text).lower()
        letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
        # split()/join collapses tabs, newlines and repeated spaces
        return ' '.join(letters_only.split())
# Information Analysis Class (replacing the custom module)
class TextInformations:
    """Exploratory statistics helper for a text-classification DataFrame."""

    def __init__(self, df, text_col, target_col):
        """Hold the frame plus the names of its text and label columns."""
        self.df = df
        self.text_col = text_col
        self.target_col = target_col

    def shape(self):
        """(rows, columns) of the underlying DataFrame."""
        return self.df.shape

    def missing_values(self):
        """Per-column count of null cells, as a plain dict."""
        return self.df.isnull().sum().to_dict()

    def class_imbalanced(self):
        """Frequency of each target label, as a plain dict."""
        return self.df[self.target_col].value_counts().to_dict()

    def clean_text(self):
        """Series of cleaned text produced by TextCleaner.clean_text."""
        return self.df[self.text_col].apply(TextCleaner().clean_text)

    def text_length(self):
        """Character length of the raw (uncleaned) text column."""
        return self.df[self.text_col].str.len()
# Utility functions
def save_to_session(obj, key):
    """Stash *obj* under *key* in Streamlit session state (no file I/O)."""
    st.session_state[key] = obj
def load_from_session(key):
    """Fetch the object stored under *key*, or None if nothing was saved."""
    return st.session_state.get(key)
def train_model(model_name, X_train, X_test, y_train, y_test):
    """Instantiate the classifier selected in the UI, fit it, and score it.

    Parameters:
        model_name: one of the display names offered by the model selectbox.
        X_train/X_test: feature matrices (scipy sparse from the vectorizers).
        y_train/y_test: encoded label arrays.

    Returns:
        (fitted_model, test_accuracy).

    Raises:
        ValueError: for an unrecognized *model_name* (the original code left
        `model` unbound and crashed with UnboundLocalError instead).
    """
    factories = {
        "Logistic Regression": lambda: LogisticRegression(random_state=42, max_iter=1000),
        "Decision Tree": lambda: DecisionTreeClassifier(random_state=42),
        "Random Forest": lambda: RandomForestClassifier(random_state=42, n_estimators=100),
        "Linear SVC": lambda: LinearSVC(random_state=42, max_iter=1000),
        "SVC": lambda: SVC(random_state=42, probability=True),
        "Multinomial Naive Bayes": MultinomialNB,
        "Gaussian Naive Bayes": GaussianNB,
    }
    if model_name not in factories:
        raise ValueError(f"Unknown model: {model_name!r}")
    model = factories[model_name]()
    # GaussianNB rejects sparse input, so densify for that model only.
    # NOTE(review): this can be memory-heavy for large corpora — confirm
    # acceptable for the expected dataset sizes.
    if model_name == "Gaussian Naive Bayes" and hasattr(X_train, "toarray"):
        X_train = X_train.toarray()
        X_test = X_test.toarray()
    # Train model
    model.fit(X_train, y_train)
    # Score on the held-out split
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return model, accuracy
def predict_text(text, model, vectorizer, encoder):
    """Classify one raw text string with a previously trained pipeline.

    Parameters:
        text: raw input string (cleaned here with TextCleaner).
        model: fitted classifier.
        vectorizer: FITTED vectorizer — only transform() is called, never fit.
        encoder: fitted LabelEncoder used to decode the class index.

    Returns:
        (predicted_label, probabilities) — probabilities is None when the
        model has no usable predict_proba. On any failure an error is shown
        in the Streamlit UI and (None, None) is returned.
    """
    try:
        # Apply the same cleaning used at training time
        text_cleaner = TextCleaner()
        clean_text = text_cleaner.clean_text(text)
        # Transform text using the vectorizer
        text_vector = vectorizer.transform([clean_text])
        # Make prediction
        prediction = model.predict(text_vector)
        prediction_proba = None
        # Get prediction probabilities if available
        if hasattr(model, 'predict_proba'):
            try:
                prediction_proba = model.predict_proba(text_vector)[0]
            except Exception:
                # BUGFIX: was a bare `except:` which also swallowed
                # SystemExit/KeyboardInterrupt. Some models expose
                # predict_proba but raise at call time; fall back to None.
                prediction_proba = None
        # Decode prediction back to the original label string
        predicted_label = encoder.inverse_transform(prediction)[0]
        return predicted_label, prediction_proba
    except Exception as e:
        st.error(f"Error during prediction: {str(e)}")
        return None, None
# Streamlit App Configuration
# NOTE(review): the emoji literals below are mojibake (UTF-8 rendered under a
# different codepage). They are runtime strings, so they are kept byte-identical
# here; re-saving the file as UTF-8 with the intended emoji should be verified
# against the section comparisons further down (the radio values must match).
st.set_page_config(
    page_title="Text Classification App",
    page_icon="๐Ÿ“",
    layout="wide"
)
st.title('๐Ÿ“ No Code Text Classification App')
st.markdown('Analyze your text data and train machine learning models for text classification')
# Initialize session state flags once per session; Streamlit reruns the whole
# script on every interaction, so guard against clobbering existing values.
if 'model_trained' not in st.session_state:
    st.session_state.model_trained = False
if 'training_data_processed' not in st.session_state:
    st.session_state.training_data_processed = False
# Sidebar navigation — `section` drives the big if/elif dispatch below.
st.sidebar.title("Navigation")
section = st.sidebar.radio(
    "Choose Section",
    ["๐Ÿ“Š Data Analysis", "๐Ÿค– Train Model", "๐Ÿ”ฎ Predictions"],
    index=0
)
# Upload Data Section
st.sidebar.markdown("---")
st.sidebar.subheader("๐Ÿ“ Upload Your Dataset")
# File uploader with better error handling.
# NOTE(review): if this try fails, `train_data`/`test_data` are never bound
# and the `if train_data is not None:` check below raises NameError — confirm
# whether the uploaders can actually raise here, or pre-initialize to None.
try:
    train_data = st.sidebar.file_uploader(
        "Upload training data (CSV)",
        type=["csv"],
        help="Upload a CSV file with text and labels for training"
    )
    test_data = st.sidebar.file_uploader(
        "Upload test data (CSV, optional)",
        type=["csv"],
        help="Optional: Upload a separate test dataset"
    )
except Exception as e:
    st.sidebar.error(f"File upload error: {str(e)}")
    st.sidebar.info("Try refreshing the page or using a different browser")
# Process uploaded data: parse the CSV(s), let the user pick the text and
# target columns, and cache everything in session state for the sections below.
if train_data is not None:
    try:
        # Add encoding options to handle different CSV formats
        encoding_option = st.sidebar.selectbox(
            "CSV Encoding",
            ["utf-8", "latin-1", "cp1252", "iso-8859-1"],
            help="Try different encodings if you get errors"
        )
        # Re-parsed on every rerun; both files share the chosen encoding.
        train_df = pd.read_csv(train_data, encoding=encoding_option)
        if test_data is not None:
            test_df = pd.read_csv(test_data, encoding=encoding_option)
        else:
            test_df = None
        st.sidebar.success(f"โœ… Training data loaded: {train_df.shape[0]} rows, {train_df.shape[1]} columns")
        # Column selection — note nothing prevents picking the same column
        # for both text and target.
        columns = train_df.columns.tolist()
        text_data = st.sidebar.selectbox("๐Ÿ“ Choose the text column:", columns)
        target = st.sidebar.selectbox("๐ŸŽฏ Choose the target column:", columns)
        # Store processed data in session state
        st.session_state.train_df = train_df
        st.session_state.test_df = test_df
        st.session_state.text_col = text_data
        st.session_state.target_col = target
        st.session_state.training_data_processed = True
    except Exception as e:
        st.sidebar.error(f"โŒ Error loading data: {str(e)}")
        st.sidebar.info("Please check your CSV file format and encoding")
# Data Analysis Section: summary metrics, class distribution, and text-length
# plots for the uploaded training data.
if section == "๐Ÿ“Š Data Analysis":
    st.header("๐Ÿ“Š Data Analysis")
    if st.session_state.get('training_data_processed', False):
        try:
            train_df = st.session_state.train_df
            text_col = st.session_state.text_col
            target_col = st.session_state.target_col
            # Create info object
            info = TextInformations(train_df, text_col, target_col)
            # Data preprocessing — NOTE: this mutates the DataFrame held in
            # st.session_state.train_df in place (train_df is a reference),
            # adding 'clean_text' and 'text_length' columns.
            train_df['clean_text'] = info.clean_text()
            train_df['text_length'] = info.text_length()
            # Display basic information
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Dataset Shape", f"{info.shape()[0]} ร— {info.shape()[1]}")
            with col2:
                missing_vals = sum(info.missing_values().values())
                st.metric("Missing Values", missing_vals)
            with col3:
                unique_classes = len(info.class_imbalanced())
                st.metric("Unique Classes", unique_classes)
            # Data preview
            st.subheader("๐Ÿ“‹ Data Preview")
            st.dataframe(train_df[[text_col, target_col, 'clean_text', 'text_length']].head(10))
            # Class distribution
            st.subheader("๐Ÿ“Š Class Distribution")
            class_counts = info.class_imbalanced()
            col1, col2 = st.columns(2)
            with col1:
                fig, ax = plt.subplots(figsize=(8, 6))
                classes = list(class_counts.keys())
                counts = list(class_counts.values())
                # NOTE(review): only 5 colors supplied — confirm behavior when
                # the target has more than 5 classes.
                ax.bar(classes, counts, color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8'])
                ax.set_title('Class Distribution')
                ax.set_xlabel('Classes')
                ax.set_ylabel('Count')
                plt.xticks(rotation=45)
                st.pyplot(fig)
            with col2:
                st.write("**Class Distribution:**")
                for class_name, count in class_counts.items():
                    percentage = (count / len(train_df)) * 100
                    st.write(f"- {class_name}: {count} ({percentage:.1f}%)")
            # Text length analysis (length of the RAW text, not clean_text)
            st.subheader("๐Ÿ“ Text Length Analysis")
            col1, col2 = st.columns(2)
            with col1:
                fig, ax = plt.subplots(figsize=(8, 6))
                ax.hist(train_df['text_length'], bins=50, alpha=0.7, color='#4ECDC4')
                ax.set_title('Text Length Distribution')
                ax.set_xlabel('Text Length (characters)')
                ax.set_ylabel('Frequency')
                st.pyplot(fig)
            with col2:
                st.write("**Text Length Statistics:**")
                length_stats = train_df['text_length'].describe()
                for stat, value in length_stats.items():
                    st.write(f"- {stat.title()}: {value:.1f}")
            # Cache the enriched frame so Train Model can skip reprocessing
            st.session_state.processed_train_df = train_df
        except Exception as e:
            st.error(f"โŒ Error in data analysis: {str(e)}")
    else:
        st.info("๐Ÿ”„ Please upload training data to perform analysis")
# Train Model Section
elif section == "๐Ÿค– Train Model":
st.header("๐Ÿค– Train Model")
if st.session_state.get('training_data_processed', False):
try:
if 'processed_train_df' in st.session_state:
train_df = st.session_state.processed_train_df
else:
# Process data if not already processed
train_df = st.session_state.train_df
text_col = st.session_state.text_col
target_col = st.session_state.target_col
info = TextInformations(train_df, text_col, target_col)
train_df['clean_text'] = info.clean_text()
train_df['text_length'] = info.text_length()
# Model and vectorizer selection
col1, col2 = st.columns(2)
with col1:
st.subheader("๐ŸŽฏ Model Selection")
model_name = st.selectbox("Choose the Model", [
"Logistic Regression", "Decision Tree",
"Random Forest", "Linear SVC", "SVC",
"Multinomial Naive Bayes", "Gaussian Naive Bayes"
])
with col2:
st.subheader("๐Ÿ“Š Vectorizer Selection")
vectorizer_choice = st.selectbox("Choose Vectorizer", ["TF-IDF", "Count"])
# Training parameters
st.subheader("โš™๏ธ Training Parameters")
col1, col2 = st.columns(2)
with col1:
max_features = st.slider("Max Features", 1000, 20000, 10000, 1000)
test_size = st.slider("Test Size", 0.1, 0.5, 0.2, 0.05)
with col2:
random_state = st.number_input("Random State", 0, 100, 42)
# Training button
if st.button("๐Ÿš€ Start Training", type="primary"):
with st.spinner("Training model... Please wait"):
try:
# Prepare data
X_text = train_df['clean_text'].fillna('')
y = train_df[st.session_state.target_col]
# Label encoding
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
# Vectorization
if vectorizer_choice == "TF-IDF":
vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
else:
vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
X_vectorized = vectorizer.fit_transform(X_text)
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
X_vectorized, y_encoded,
test_size=test_size,
random_state=random_state,
stratify=y_encoded
)
# Train model
model, accuracy = train_model(model_name, X_train, X_test, y_train, y_test)
# Save to session state
save_to_session(model, 'trained_model')
save_to_session(vectorizer, 'vectorizer')
save_to_session(label_encoder, 'label_encoder')
save_to_session(model_name, 'model_name')
save_to_session(vectorizer_choice, 'vectorizer_type')
st.session_state.model_trained = True
# Display results
st.success(f"โœ… Model training completed!")
col1, col2 = st.columns(2)
with col1:
st.metric("Model Accuracy", f"{accuracy:.4f}")
with col2:
st.metric("Training Samples", len(X_train))
st.info("๐ŸŽ‰ You can now use the 'Predictions' section to classify new text!")
except Exception as e:
st.error(f"โŒ Error during training: {str(e)}")
except Exception as e:
st.error(f"โŒ Error in model training setup: {str(e)}")
else:
st.info("๐Ÿ”„ Please upload and analyze training data first")
# Predictions Section: single-text prediction plus batch prediction over an
# uploaded CSV, using the artifacts saved by the Train Model section.
elif section == "๐Ÿ”ฎ Predictions":
    st.header("๐Ÿ”ฎ Make Predictions")
    if st.session_state.get('model_trained', False):
        # Single text prediction
        st.subheader("๐Ÿ“ Single Text Prediction")
        text_input = st.text_area(
            "Enter text to classify:",
            height=120,
            placeholder="Type or paste your text here..."
        )
        col1, col2 = st.columns([1, 3])
        with col1:
            if st.button("๐Ÿ”ฎ Predict", type="primary"):
                if text_input.strip():
                    try:
                        # Artifacts saved by the Train Model section
                        model = load_from_session('trained_model')
                        vectorizer = load_from_session('vectorizer')
                        encoder = load_from_session('label_encoder')
                        predicted_label, prediction_proba = predict_text(
                            text_input, model, vectorizer, encoder
                        )
                        if predicted_label is not None:
                            st.success("โœ… Prediction completed!")
                            # Display results
                            st.markdown("### ๐Ÿ“Š Results")
                            st.markdown(f"**Predicted Class:** `{predicted_label}`")
                            # Display probabilities if available
                            if prediction_proba is not None:
                                st.markdown("**Class Probabilities:**")
                                classes = encoder.classes_
                                prob_data = pd.DataFrame({
                                    'Class': classes,
                                    'Probability': prediction_proba
                                }).sort_values('Probability', ascending=False)
                                # Show as bar chart
                                st.bar_chart(prob_data.set_index('Class'))
                                # Show as table
                                st.dataframe(prob_data, use_container_width=True)
                    except Exception as e:
                        st.error(f"โŒ Prediction error: {str(e)}")
                else:
                    st.warning("โš ๏ธ Please enter some text to classify")
        # Batch predictions
        st.markdown("---")
        st.subheader("๐Ÿ“ Batch Predictions")
        uploaded_batch = st.file_uploader(
            "Upload CSV file for batch predictions",
            type=['csv'],
            help="Upload a CSV file with text data to classify multiple texts at once"
        )
        if uploaded_batch is not None:
            try:
                # Load batch data (separate widget key so it does not clash
                # with the sidebar encoding selector)
                encoding_option = st.selectbox(
                    "Batch CSV Encoding",
                    ["utf-8", "latin-1", "cp1252", "iso-8859-1"],
                    key="batch_encoding"
                )
                batch_df = pd.read_csv(uploaded_batch, encoding=encoding_option)
                st.write("๐Ÿ“‹ **Batch Data Preview:**")
                st.dataframe(batch_df.head())
                # Select text column
                text_column = st.selectbox(
                    "Select the text column:",
                    batch_df.columns.tolist()
                )
                if st.button("๐Ÿš€ Run Batch Predictions", type="primary"):
                    with st.spinner("Processing batch predictions..."):
                        try:
                            model = load_from_session('trained_model')
                            vectorizer = load_from_session('vectorizer')
                            encoder = load_from_session('label_encoder')
                            predictions = []
                            confidences = []
                            progress_bar = st.progress(0)
                            total_rows = len(batch_df)
                            # Row-by-row prediction (one transform per row —
                            # simple, but O(rows) vectorizer calls)
                            for idx, text in enumerate(batch_df[text_column]):
                                pred, pred_proba = predict_text(
                                    str(text), model, vectorizer, encoder
                                )
                                predictions.append(pred if pred is not None else "Error")
                                # Get confidence (max probability); 0.0 when
                                # the model exposes no probabilities
                                if pred_proba is not None:
                                    confidences.append(max(pred_proba))
                                else:
                                    confidences.append(0.0)
                                progress_bar.progress((idx + 1) / total_rows)
                            batch_df['Predicted_Class'] = predictions
                            batch_df['Confidence'] = confidences
                            st.success("โœ… Batch predictions completed!")
                            # Show results
                            st.write("๐Ÿ“Š **Prediction Results:**")
                            st.dataframe(batch_df[[text_column, 'Predicted_Class', 'Confidence']])
                            # Download results
                            csv = batch_df.to_csv(index=False)
                            st.download_button(
                                label="๐Ÿ“ฅ Download Results as CSV",
                                data=csv,
                                file_name="batch_predictions.csv",
                                mime="text/csv"
                            )
                        except Exception as e:
                            st.error(f"โŒ Batch prediction error: {str(e)}")
            except Exception as e:
                st.error(f"โŒ Error loading batch file: {str(e)}")
    else:
        st.info("๐Ÿ”„ Please train a model first before making predictions")
        # Show model info if available
        if st.session_state.get('training_data_processed', False):
            st.write("๐Ÿ’ก **Tip:** Go to the 'Train Model' section to train a model first!")
# Footer
st.markdown("---")
st.markdown(
"""
<div style='text-align: center; color: #666; padding: 20px;'>
<p>๐Ÿ“ No Code Text Classification App</p>
<p>Built with Streamlit โ€ข Upload CSV โ†’ Analyze โ†’ Train โ†’ Predict</p>
</div>
""",
unsafe_allow_html=True
)