# --- Repository viewer header (not part of the program) ---
# Faham - CREATE: initialized repo (4b35e49)
# raw | history | blame - 44.4 kB
import html
import io
import os
import tempfile

import numpy as np
import pandas as pd
import streamlit as st
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from torchvision import transforms, models
# Page configuration
_PAGE_SETTINGS = {
    "page_title": "Sentiment Analysis Testing Ground",
    "page_icon": "🧠",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)

# Stylesheet injected once per run; defines the header, card, result-box and
# upload-section classes referenced by the HTML snippets further down.
_CUSTOM_CSS = """
<style>
.main-header {
font-size: 2.5rem;
font-weight: bold;
color: #1f77b4;
text-align: center;
margin-bottom: 2rem;
}
.model-card {
background-color: #f0f2f6;
padding: 1.5rem;
border-radius: 10px;
margin: 1rem 0;
border-left: 4px solid #1f77b4;
}
.result-box {
background-color: #e8f4fd;
padding: 1rem;
border-radius: 8px;
border: 1px solid #1f77b4;
margin: 1rem 0;
}
.upload-section {
background-color: #f8f9fa;
padding: 1.5rem;
border-radius: 10px;
border: 2px dashed #dee2e6;
text-align: center;
margin: 1rem 0;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
# Global variables for models
@st.cache_resource
def load_vision_model():
    """Load the fine-tuned ResNet-50 vision sentiment model.

    The number of output classes is read from the ``fc.weight`` tensor of the
    checkpoint at ``models/resnet50_model.pth`` (defaulting to 3 when that key
    is absent) so the classifier head is built to match the saved weights.

    Returns:
        tuple: ``(model, device, num_classes)`` on success, or
        ``(None, None, None)`` on any failure (missing file or load error).
        The failure value is always a 3-tuple because callers unpack three
        names from the result.
    """
    model_path = "models/resnet50_model.pth"
    try:
        if not os.path.exists(model_path):
            st.error(f"❌ Vision model file not found at: {model_path}")
            # A bare ``None`` here would make ``model, device, n = ...`` raise.
            return None, None, None

        # Load the weights first so the head size can be inferred from them.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        checkpoint = torch.load(model_path, map_location=device)

        if "fc.weight" in checkpoint:
            num_classes = checkpoint["fc.weight"].shape[0]
            st.info(f"πŸ“Š Model checkpoint has {num_classes} output classes")
        else:
            num_classes = 3  # Default assumption
            st.warning(
                "⚠️ Could not determine number of classes from checkpoint, assuming 3"
            )

        # Build the architecture without ImageNet weights, then swap in a
        # classifier head of the right width before loading the state dict.
        model = models.resnet50(weights=None)
        model.fc = nn.Linear(model.fc.in_features, num_classes)
        model.load_state_dict(checkpoint)
        model.to(device)
        model.eval()

        st.success(f"βœ… Vision model loaded successfully with {num_classes} classes!")
        return model, device, num_classes
    except Exception as e:
        st.error(f"❌ Error loading vision model: {str(e)}")
        return None, None, None
@st.cache_data
def get_vision_transforms():
    """Build the preprocessing pipeline applied to images before inference.

    The steps are Resize(224), CenterCrop(224), ToTensor and normalization
    with the ImageNet channel statistics; per the original author's notes
    these are meant to mirror the transforms used at training time.
    """
    imagenet_mean = [0.485, 0.456, 0.406]
    imagenet_std = [0.229, 0.224, 0.225]
    pipeline_steps = [
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
    ]
    return transforms.Compose(pipeline_steps)
def _center_crop_gray_rgb(image, size=224):
    """Center-crop a PIL image to a square, resize it, and return it as
    grayscale replicated onto three RGB channels."""
    rgb = image.convert("RGB")
    width, height = rgb.size
    side = min(width, height)
    left = (width - side) // 2
    top = (height - side) // 2
    square = rgb.crop((left, top, left + side, top + side))
    resized = square.resize((size, size), Image.Resampling.LANCZOS)
    # L -> RGB copies the single luminance channel into all three channels.
    return resized.convert("L").convert("RGB")


def detect_and_preprocess_face(image, crop_tightness=0.05):
    """Crop an image to its largest detected face and return a 224x224
    grayscale image stored as 3-channel RGB.

    Args:
        image: Input image. A PIL Image of any mode, or a numpy array that is
            passed to OpenCV as-is (so it is treated as BGR).
        crop_tightness: Padding added around the detected face box, as a
            fraction of the box width/height (0.0 = no padding, 0.3 = 30%).

    Returns:
        A 224x224 RGB PIL Image, or ``None`` when a fallback is needed but the
        input is not a PIL Image. Fallbacks (OpenCV missing, no face found,
        or any processing error) use a plain center crop instead of a face
        crop and always produce the same 224x224 output size.
    """
    is_pil = isinstance(image, Image.Image)

    try:
        import cv2
    except ImportError:
        st.error(
            "❌ OpenCV not installed. Please install it with: pip install opencv-python"
        )
        st.info("Falling back to basic preprocessing...")
        return _center_crop_gray_rgb(image) if is_pil else None

    try:
        if is_pil:
            # Normalise the mode first: RGBA / L / P inputs would otherwise
            # make the colour conversions below raise.
            img_array = cv2.cvtColor(
                np.array(image.convert("RGB")), cv2.COLOR_RGB2BGR
            )
        else:
            img_array = image

        face_cascade = cv2.CascadeClassifier(
            cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
        )
        # The Haar cascade operates on a single-channel image.
        gray = cv2.cvtColor(img_array, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(
            gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
        )

        # ``faces`` may be an array, so test its length rather than its truth.
        if len(faces) == 0:
            st.warning("⚠️ No face detected in the image. Using center crop instead.")
            return _center_crop_gray_rgb(image) if is_pil else None

        # Use the largest detection, assuming it is the main subject.
        x, y, w, h = max(faces, key=lambda rect: rect[2] * rect[3])
        padding_x = int(w * crop_tightness)
        padding_y = int(h * crop_tightness)

        # Clamp the padded box to the image bounds.
        x1 = max(0, x - padding_x)
        y1 = max(0, y - padding_y)
        x2 = min(img_array.shape[1], x + w + padding_x)
        y2 = min(img_array.shape[0], y + h + padding_y)

        # Crop straight from the grayscale image already computed above.
        face_gray = gray[y1:y2, x1:x2]
        face_resized = cv2.resize(face_gray, (224, 224), interpolation=cv2.INTER_AREA)
        face_rgb_3channel = cv2.cvtColor(face_resized, cv2.COLOR_GRAY2RGB)
        return Image.fromarray(face_rgb_3channel)
    except Exception as e:
        st.error(f"❌ Error in face detection: {str(e)}")
        st.info("Falling back to basic preprocessing...")
        return _center_crop_gray_rgb(image) if is_pil else None
def get_sentiment_mapping(num_classes):
    """Return a ``{class_index: label}`` dict for a model with ``num_classes`` outputs.

    Known layouts are 3 classes (sentiment polarity), 4 classes (a common
    emotion set) and 7 classes (the FER2013 emotion order). Any other count
    yields generic ``Class_<i>`` labels.
    """
    known_layouts = {
        3: ("Negative", "Neutral", "Positive"),
        4: ("Angry", "Sad", "Happy", "Neutral"),
        7: ("Angry", "Disgust", "Fear", "Happy", "Sad", "Surprise", "Neutral"),
    }
    labels = known_layouts.get(num_classes)
    if labels is None:
        # Unknown class count: fall back to placeholder names.
        labels = tuple(f"Class_{index}" for index in range(num_classes))
    return dict(enumerate(labels))
# Model prediction functions (text, audio, vision, and fused)
def predict_text_sentiment(text):
    """Classify the sentiment of ``text`` using TextBlob polarity.

    Args:
        text: The text to analyse. ``None``, empty, or whitespace-only input
            is rejected without touching TextBlob.

    Returns:
        tuple: ``(label, confidence)``. ``label`` is "Positive", "Negative" or
        "Neutral" on success; otherwise a short status string
        ("No text provided", "TextBlob not available", "Error occurred")
        paired with a confidence of 0.0. Confidence is a heuristic derived
        from the polarity magnitude, rounded to two decimals.
    """
    if not text or not text.strip():
        return "No text provided", 0.0

    try:
        from textblob import TextBlob

        # Polarity ranges from -1 (very negative) to 1 (very positive).
        polarity = TextBlob(text).sentiment.polarity
        strength = abs(polarity)

        if strength > 0.1:
            sentiment = "Positive" if polarity > 0 else "Negative"
            # Scale into [0.6, 0.9], capped at 0.95.
            confidence = min(0.95, 0.6 + strength * 0.3)
        else:
            # Near-zero polarity: the closer to zero, the more confident.
            sentiment = "Neutral"
            confidence = 0.7 - strength * 0.2

        return sentiment, round(confidence, 2)
    except ImportError:
        st.error(
            "❌ TextBlob not installed. Please install it with: pip install textblob"
        )
        return "TextBlob not available", 0.0
    except Exception as e:
        st.error(f"❌ Error in text sentiment analysis: {str(e)}")
        return "Error occurred", 0.0
@st.cache_resource
def load_audio_model():
    """Load the fine-tuned Wav2Vec2 audio sentiment model and its feature extractor.

    The class count is taken from ``classifier.weight`` in the checkpoint at
    ``models/wav2vec2_model.pth`` (3 is assumed when the key is missing).

    Returns:
        tuple: ``(model, device, num_classes, feature_extractor)`` on success,
        or four ``None`` values when the file is missing or loading fails.
    """
    failure = (None, None, None, None)
    weights_path = "models/wav2vec2_model.pth"
    base_checkpoint = "facebook/wav2vec2-base"
    try:
        if not os.path.exists(weights_path):
            st.error(f"❌ Audio model file not found at: {weights_path}")
            return failure

        # Read the weights up front so the head size can be inferred.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        state_dict = torch.load(weights_path, map_location=device)

        if "classifier.weight" not in state_dict:
            num_classes = 3  # Default assumption
            st.warning(
                "⚠️ Could not determine number of classes from checkpoint, assuming 3"
            )
        else:
            num_classes = state_dict["classifier.weight"].shape[0]
            st.info(f"πŸ“Š Audio model checkpoint has {num_classes} output classes")

        # Start from the base architecture, then overwrite with trained weights.
        from transformers import AutoModelForAudioClassification

        model = AutoModelForAudioClassification.from_pretrained(
            base_checkpoint, num_labels=num_classes
        )
        model.load_state_dict(state_dict)
        model.to(device)
        model.eval()

        from transformers import AutoFeatureExtractor

        feature_extractor = AutoFeatureExtractor.from_pretrained(base_checkpoint)

        st.success(f"βœ… Audio model loaded successfully with {num_classes} classes!")
        return model, device, num_classes, feature_extractor
    except Exception as e:
        st.error(f"❌ Error loading audio model: {str(e)}")
        return failure
def predict_audio_sentiment(audio_bytes):
    """Classify the sentiment of an audio clip with the fine-tuned Wav2Vec2 model.

    The bytes are written to a temporary file, decoded with librosa,
    resampled to 16 kHz when necessary, and padded/truncated to 5 seconds by
    the feature extractor before inference.

    Args:
        audio_bytes: Raw bytes of an audio file, or ``None``.

    Returns:
        tuple: ``(label, confidence)``. On failure the label is a status
        string ("No audio provided", "Model not loaded",
        "Library not available", "Error occurred") and confidence is 0.0.
    """
    if audio_bytes is None:
        return "No audio provided", 0.0

    try:
        model, device, num_classes, feature_extractor = load_audio_model()
        if model is None:
            return "Model not loaded", 0.0

        import librosa

        target_rate = 16000
        max_samples = int(5.0 * target_rate)  # 5 seconds at 16 kHz

        # librosa is given a path, so spill the bytes to a temporary file.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
            handle.write(audio_bytes)
            clip_path = handle.name

        try:
            waveform, native_rate = librosa.load(clip_path, sr=None)
            if native_rate != target_rate:
                waveform = librosa.resample(
                    y=waveform, orig_sr=native_rate, target_sr=target_rate
                )

            encoded = feature_extractor(
                waveform,
                sampling_rate=target_rate,
                max_length=max_samples,
                truncation=True,
                padding="max_length",
                return_tensors="pt",
            )
            batch = encoded.input_values.to(device)

            with torch.no_grad():
                logits = model(batch).logits
                class_probs = torch.softmax(logits, dim=1)
                top_prob, top_index = class_probs.max(dim=1)

            if num_classes == 3:
                labels = {0: "Negative", 1: "Neutral", 2: "Positive"}
            else:
                # Unknown label set: fall back to generic names.
                labels = {i: f"Class_{i}" for i in range(num_classes)}

            return labels[top_index.item()], top_prob.item()
        finally:
            # The temp file was created with delete=False; remove it ourselves.
            os.unlink(clip_path)
    except ImportError as e:
        st.error(f"❌ Required library not installed: {str(e)}")
        st.info("Please install: pip install librosa transformers")
        return "Library not available", 0.0
    except Exception as e:
        st.error(f"❌ Error in audio sentiment prediction: {str(e)}")
        return "Error occurred", 0.0
def predict_vision_sentiment(image, crop_tightness=0.05):
    """Classify the sentiment of an image with the fine-tuned ResNet-50 model.

    The image is face-cropped and converted by ``detect_and_preprocess_face``,
    shown in the UI, run through the tensor transforms, and classified.

    Args:
        image: Input image (PIL Image or numpy array), or ``None``.
        crop_tightness: Accepted for interface compatibility.
            NOTE(review): this value is not forwarded; preprocessing is always
            called with 0.0 padding, matching the "0% padding" text shown in
            the UI. Confirm whether the parameter should be honoured.

    Returns:
        tuple: ``(label, confidence)``. On failure the label is a status
        string ("No image provided", "Model not loaded",
        "Image preprocessing failed", "Error occurred") and confidence is 0.0.
    """
    if image is None:
        return "No image provided", 0.0

    try:
        model, device, num_classes = load_vision_model()
        if model is None:
            return "Model not loaded", 0.0

        st.info(
            "πŸ” Detecting face and preprocessing image to match training data format..."
        )
        preprocessed_image = detect_and_preprocess_face(image, crop_tightness=0.0)
        if preprocessed_image is None:
            return "Image preprocessing failed", 0.0

        # Report the real size instead of a hard-coded one, so the caption is
        # correct whichever preprocessing path produced the image.
        width, height = preprocessed_image.size
        st.image(
            preprocessed_image,
            caption=f"Preprocessed Image ({width}x{height} Grayscale β†’ 3-channel RGB)",
            width=200,
        )

        transform = get_vision_transforms()
        # Add a batch dimension and move to the model's device.
        image_tensor = transform(preprocessed_image).unsqueeze(0).to(device)

        with torch.no_grad():
            outputs = model(image_tensor)
            # Debug: print output shape
            st.info(f"πŸ” Model output shape: {outputs.shape}")
            probabilities = F.softmax(outputs, dim=1)
            confidence, predicted = torch.max(probabilities, 1)

        sentiment_map = get_sentiment_mapping(num_classes)
        return sentiment_map[predicted.item()], confidence.item()
    except Exception as e:
        # Only report what actually happened: ``num_classes`` may not be bound
        # here, and the failure is not necessarily a shape mismatch.
        st.error(f"Error in vision sentiment prediction: {str(e)}")
        return "Error occurred", 0.0
def predict_fused_sentiment(text=None, audio_bytes=None, image=None):
    """Combine the text, audio and vision predictions by majority vote.

    This is still a simple placeholder ensemble (TODO: replace with a real
    fusion strategy). Each supplied modality is run through its predictor;
    the label with the most votes wins (ties go to the modality evaluated
    first: text, then audio, then vision) and the confidences are averaged.

    Predictions that failed are left out of the vote and the average, as long
    as at least one modality succeeded. A failure is recognised by its 0.0
    confidence, which the predictors pair with their status strings.

    Args:
        text: Optional text to analyse.
        audio_bytes: Optional raw audio file bytes.
        image: Optional image (PIL Image or numpy array).

    Returns:
        tuple: ``(label, confidence)``, or ``("No inputs provided", 0.0)``
        when no modality was supplied.
    """
    results = []
    if text:
        results.append(predict_text_sentiment(text))
    if audio_bytes:
        results.append(predict_audio_sentiment(audio_bytes))
    # Compare with None: truth-testing a numpy array raises ValueError.
    if image is not None:
        results.append(predict_vision_sentiment(image))

    if not results:
        return "No inputs provided", 0.0

    # Drop failed predictions; if everything failed, keep them all so the
    # caller still sees a status label.
    usable = [result for result in results if result[1] > 0.0] or results

    sentiment_counts = {}
    total_confidence = 0
    for sentiment, confidence in usable:
        sentiment_counts[sentiment] = sentiment_counts.get(sentiment, 0) + 1
        total_confidence += confidence

    # max() returns the first key with the highest count, so ties resolve in
    # evaluation order.
    final_sentiment = max(sentiment_counts, key=sentiment_counts.get)
    return final_sentiment, total_confidence / len(usable)
# Sidebar navigation
sidebar = st.sidebar
sidebar.title("🧠 Sentiment Analysis")
sidebar.markdown("---")

# Page selector; the chosen label drives the if/elif page dispatch below.
_PAGE_LABELS = [
    "🏠 Home",
    "πŸ“ Text Sentiment",
    "🎡 Audio Sentiment",
    "πŸ–ΌοΈ Vision Sentiment",
    "πŸ”— Fused Model",
]
page = sidebar.selectbox("Choose a page:", _PAGE_LABELS)
# Home Page
if page == "🏠 Home":
    # Static landing page: a heading, a welcome card, one card per modality
    # laid out in three columns, a fused-model card and a closing note.
    st.markdown(
        '<h1 class="main-header">Sentiment Analysis Testing Ground</h1>',
        unsafe_allow_html=True,
    )

    welcome_card = """
<div class="model-card">
<h2>Welcome to your Multi-Modal Sentiment Analysis Testing Platform!</h2>
<p>This application provides a comprehensive testing environment for your three independent sentiment analysis models:</p>
</div>
"""
    st.markdown(welcome_card, unsafe_allow_html=True)

    text_card = """
<div class="model-card">
<h3>πŸ“ Text Sentiment Model</h3>
<p>βœ… <strong>READY TO USE</strong> - Analyze sentiment from text input using TextBlob</p>
<ul>
<li>Process any text input</li>
<li>Get sentiment classification (Positive/Negative/Neutral)</li>
<li>View confidence scores</li>
<li>Real-time NLP analysis</li>
</ul>
</div>
"""
    audio_card = """
<div class="model-card">
<h3>🎡 Audio Sentiment Model</h3>
<p>βœ… <strong>READY TO USE</strong> - Analyze sentiment from audio files using fine-tuned Wav2Vec2</p>
<ul>
<li>Upload audio files (.wav, .mp3, .m4a, .flac)</li>
<li>πŸŽ™οΈ Record audio directly with microphone (max 5s)</li>
<li>πŸ”„ Automatic preprocessing: 16kHz sampling, 5s max duration (CREMA-D + RAVDESS format)</li>
<li>Listen to uploaded/recorded audio</li>
<li>Get sentiment predictions</li>
<li>Real-time audio analysis</li>
</ul>
</div>
"""
    vision_card = """
<div class="model-card">
<h3>πŸ–ΌοΈ Vision Sentiment Model</h3>
<p>Analyze sentiment from images using fine-tuned ResNet-50</p>
<ul>
<li>Upload image files (.png, .jpg, .jpeg, .bmp, .tiff)</li>
<li>πŸ”„ Automatic face detection & preprocessing</li>
<li>🎯 Fixed 0% padding for tightest face crop</li>
<li>πŸ“ Convert to 224x224 grayscale β†’ 3-channel RGB (FER2013 format)</li>
<li>🎯 Transforms: Resize(224) β†’ CenterCrop(224) β†’ ImageNet Normalization</li>
<li>Preview original & preprocessed images</li>
<li>Get sentiment predictions</li>
</ul>
</div>
"""
    # One column per modality card, left to right.
    for column, card_html in zip(st.columns(3), (text_card, audio_card, vision_card)):
        with column:
            st.markdown(card_html, unsafe_allow_html=True)

    fused_card = """
<div class="model-card">
<h3>πŸ”— Fused Model</h3>
<p>Combine predictions from all three models for enhanced accuracy</p>
<ul>
<li>Multi-modal input processing</li>
<li>Ensemble prediction strategies</li>
<li>Comprehensive sentiment analysis</li>
</ul>
</div>
"""
    st.markdown(fused_card, unsafe_allow_html=True)

    st.markdown("---")
    closing_note = """
<div style="text-align: center; color: #666;">
<p><strong>Note:</strong> This application now has <strong>ALL THREE MODELS</strong> fully integrated and ready to use! πŸŽ‰</p>
<p><strong>TextBlob</strong> (Text) + <strong>Wav2Vec2</strong> (Audio) + <strong>ResNet-50</strong> (Vision)</p>
</div>
"""
    st.markdown(closing_note, unsafe_allow_html=True)
# Text Sentiment Page
elif page == "πŸ“ Text Sentiment":
st.title("πŸ“ Text Sentiment Analysis")
st.markdown("Analyze the sentiment of your text using our TextBlob-based model.")
# Text input
text_input = st.text_area(
"Enter your text here:",
height=150,
placeholder="Type or paste your text here to analyze its sentiment...",
)
# Analyze button
if st.button("πŸ” Analyze Sentiment", type="primary", use_container_width=True):
if text_input and text_input.strip():
with st.spinner("Analyzing text sentiment..."):
sentiment, confidence = predict_text_sentiment(text_input)
# Display results
st.markdown("### Results")
# Display results in columns
col1, col2 = st.columns(2)
with col1:
st.metric("Sentiment", sentiment)
with col2:
st.metric("Confidence", f"{confidence:.2f}")
# Color-coded sentiment display
sentiment_colors = {
"Positive": "🟒",
"Negative": "πŸ”΄",
"Neutral": "🟑",
}
st.markdown(
f"""
<div class="result-box">
<h4>{sentiment_colors.get(sentiment, "❓")} Sentiment: {sentiment}</h4>
<p><strong>Confidence:</strong> {confidence:.2f}</p>
<p><strong>Input Text:</strong> "{text_input[:100]}{'...' if len(text_input) > 100 else ''}"</p>
<p><strong>Model:</strong> TextBlob (Natural Language Processing)</p>
</div>
""",
unsafe_allow_html=True,
)
else:
st.error("Please enter some text to analyze.")
# Audio Sentiment Page
elif page == "🎡 Audio Sentiment":
st.title("🎡 Audio Sentiment Analysis")
st.markdown(
"Analyze the sentiment of your audio files using our fine-tuned Wav2Vec2 model."
)
# Preprocessing information
st.info(
"ℹ️ **Audio Preprocessing**: Audio will be automatically processed to match CREMA-D + RAVDESS training format: "
"16kHz sampling rate, max 5 seconds, with automatic resampling and feature extraction."
)
# Model status
model, device, num_classes, feature_extractor = load_audio_model()
if model is None:
st.error("❌ Audio model could not be loaded. Please check the model file.")
st.info("Expected model file: `models/wav2vec2_model.pth`")
else:
st.success(
f"βœ… Audio model loaded successfully on {device} with {num_classes} classes!"
)
# Input method selection
st.subheader("🎀 Choose Input Method")
input_method = st.radio(
"Select how you want to provide audio:",
["πŸ“ Upload Audio File", "πŸŽ™οΈ Record Audio"],
horizontal=True,
)
if input_method == "πŸ“ Upload Audio File":
# File uploader
uploaded_audio = st.file_uploader(
"Choose an audio file",
type=["wav", "mp3", "m4a", "flac"],
help="Supported formats: WAV, MP3, M4A, FLAC",
)
audio_source = "uploaded_file"
audio_name = uploaded_audio.name if uploaded_audio else None
else: # Audio recording
st.markdown(
"""
<div class="model-card">
<h3>πŸŽ™οΈ Audio Recording</h3>
<p>Record audio directly with your microphone (max 5 seconds).</p>
<p><strong>Note:</strong> Make sure your microphone is accessible and you have permission to use it.</p>
</div>
""",
unsafe_allow_html=True,
)
# Audio recorder
recorded_audio = st.audio_input(
label="Click to start recording",
help="Click the microphone button to start/stop recording. Maximum recording time is 5 seconds.",
)
if recorded_audio is not None:
# Display recorded audio
st.audio(recorded_audio, format="audio/wav")
st.success("βœ… Audio recorded successfully!")
# Convert recorded audio to bytes for processing
uploaded_audio = recorded_audio
audio_source = "recorded"
audio_name = "Recorded Audio"
else:
uploaded_audio = None
audio_source = None
audio_name = None
if uploaded_audio is not None:
# Display audio player
if audio_source == "recorded":
st.audio(uploaded_audio, format="audio/wav")
st.info(f"πŸŽ™οΈ {audio_name} | Source: Microphone Recording")
else:
st.audio(
uploaded_audio, format=f'audio/{uploaded_audio.name.split(".")[-1]}'
)
# File info for uploaded files
file_size = len(uploaded_audio.getvalue()) / 1024 # KB
st.info(f"πŸ“ File: {uploaded_audio.name} | Size: {file_size:.1f} KB")
# Analyze button
if st.button(
"πŸ” Analyze Audio Sentiment", type="primary", use_container_width=True
):
if model is None:
st.error("❌ Model not loaded. Cannot analyze audio.")
else:
with st.spinner("Analyzing audio sentiment..."):
audio_bytes = uploaded_audio.getvalue()
sentiment, confidence = predict_audio_sentiment(audio_bytes)
# Display results
st.markdown("### Results")
col1, col2 = st.columns(2)
with col1:
st.metric("Sentiment", sentiment)
with col2:
st.metric("Confidence", f"{confidence:.2f}")
# Color-coded sentiment display
sentiment_colors = {"Positive": "🟒", "Negative": "πŸ”΄", "Neutral": "🟑"}
st.markdown(
f"""
<div class="result-box">
<h4>{sentiment_colors.get(sentiment, "❓")} Sentiment: {sentiment}</h4>
<p><strong>Confidence:</strong> {confidence:.2f}</p>
<p><strong>Audio Source:</strong> {audio_name}</p>
<p><strong>Model:</strong> Wav2Vec2 (Fine-tuned on RAVDESS + CREMA-D)</p>
</div>
""",
unsafe_allow_html=True,
)
else:
if input_method == "πŸ“ Upload Audio File":
st.info("πŸ‘† Please upload an audio file to begin analysis.")
else:
st.info("πŸŽ™οΈ Click the microphone button above to record audio for analysis.")
# Vision Sentiment Page
elif page == "πŸ–ΌοΈ Vision Sentiment":
st.title("πŸ–ΌοΈ Vision Sentiment Analysis")
st.markdown(
"Analyze the sentiment of your images using our fine-tuned ResNet-50 model."
)
st.info(
"ℹ️ **Note**: Images will be automatically preprocessed to match FER2013 format: face detection, grayscale conversion, and 224x224 resize (converted to 3-channel RGB)."
)
# Face cropping is set to 0% (no padding) for tightest crop
st.info(
"🎯 **Face Cropping**: Set to 0% padding for tightest crop on facial features"
)
# Model status
model, device, num_classes = load_vision_model()
if model is None:
st.error("❌ Vision model could not be loaded. Please check the model file.")
st.info("Expected model file: `models/resnet50_model.pth`")
else:
st.success(
f"βœ… Vision model loaded successfully on {device} with {num_classes} classes!"
)
# Input method selection
st.subheader("πŸ“Έ Choose Input Method")
input_method = st.radio(
"Select how you want to provide an image:",
["πŸ“ Upload Image File", "πŸ“· Take Photo with Camera"],
horizontal=True,
)
if input_method == "πŸ“ Upload Image File":
# File uploader
uploaded_image = st.file_uploader(
"Choose an image file",
type=["png", "jpg", "jpeg", "bmp", "tiff"],
help="Supported formats: PNG, JPG, JPEG, BMP, TIFF",
)
if uploaded_image is not None:
# Display image
image = Image.open(uploaded_image)
st.image(
image,
caption=f"Uploaded Image: {uploaded_image.name}",
use_container_width=True,
)
# File info
file_size = len(uploaded_image.getvalue()) / 1024 # KB
st.info(
f"πŸ“ File: {uploaded_image.name} | Size: {file_size:.1f} KB | Dimensions: {image.size[0]}x{image.size[1]}"
)
# Analyze button
if st.button(
"πŸ” Analyze Image Sentiment", type="primary", use_container_width=True
):
if model is None:
st.error("❌ Model not loaded. Cannot analyze image.")
else:
with st.spinner("Analyzing image sentiment..."):
sentiment, confidence = predict_vision_sentiment(image)
# Display results
st.markdown("### Results")
col1, col2 = st.columns(2)
with col1:
st.metric("Sentiment", sentiment)
with col2:
st.metric("Confidence", f"{confidence:.2f}")
# Color-coded sentiment display
sentiment_colors = {
"Positive": "🟒",
"Negative": "πŸ”΄",
"Neutral": "🟑",
}
st.markdown(
f"""
<div class="result-box">
<h4>{sentiment_colors.get(sentiment, "❓")} Sentiment: {sentiment}</h4>
<p><strong>Confidence:</strong> {confidence:.2f}</p>
<p><strong>Image File:</strong> {uploaded_image.name}</p>
<p><strong>Model:</strong> ResNet-50 (Fine-tuned on FER2013)</p>
</div>
""",
unsafe_allow_html=True,
)
else: # Camera capture
st.markdown(
"""
<div class="model-card">
<h3>πŸ“· Camera Capture</h3>
<p>Take a photo directly with your camera to analyze its sentiment.</p>
<p><strong>Note:</strong> Make sure your camera is accessible and you have permission to use it.</p>
</div>
""",
unsafe_allow_html=True,
)
# Camera input
camera_photo = st.camera_input(
"Take a photo",
help="Click the camera button to take a photo, or use the upload button to select an existing photo",
)
if camera_photo is not None:
# Display captured image
image = Image.open(camera_photo)
st.image(
image,
caption="Captured Photo",
use_container_width=True,
)
# Image info
st.info(
f"πŸ“· Captured Photo | Dimensions: {image.size[0]}x{image.size[1]} | Format: {image.format}"
)
# Analyze button
if st.button(
"πŸ” Analyze Photo Sentiment", type="primary", use_container_width=True
):
if model is None:
st.error("❌ Model not loaded. Cannot analyze image.")
else:
with st.spinner("Analyzing photo sentiment..."):
sentiment, confidence = predict_vision_sentiment(image)
# Display results
st.markdown("### Results")
col1, col2 = st.columns(2)
with col1:
st.metric("Sentiment", sentiment)
with col2:
st.metric("Confidence", f"{confidence:.2f}")
# Color-coded sentiment display
sentiment_colors = {
"Positive": "🟒",
"Negative": "πŸ”΄",
"Neutral": "🟑",
}
st.markdown(
f"""
<div class="result-box">
<h4>{sentiment_colors.get(sentiment, "❓")} Sentiment: {sentiment}</h4>
<p><strong>Confidence:</strong> {confidence:.2f}</p>
<p><strong>Image Source:</strong> Camera Capture</p>
<p><strong>Model:</strong> ResNet-50 (Fine-tuned on FER2013)</p>
</div>
""",
unsafe_allow_html=True,
)
# Show info if no image is provided
if input_method == "πŸ“ Upload Image File" and "uploaded_image" not in locals():
st.info("πŸ‘† Please upload an image file to begin analysis.")
elif input_method == "πŸ“· Take Photo with Camera" and "camera_photo" not in locals():
st.info("πŸ“· Click the camera button above to take a photo for analysis.")
# Fused Model Page
elif page == "πŸ”— Fused Model":
st.title("πŸ”— Fused Model Analysis")
st.markdown(
"Combine predictions from all three models for enhanced sentiment analysis."
)
st.markdown(
"""
<div class="model-card">
<h3>Multi-Modal Sentiment Analysis</h3>
<p>This page allows you to input text, audio, and/or image data to get a comprehensive sentiment analysis
using all three models combined.</p>
</div>
""",
unsafe_allow_html=True,
)
# Input sections
col1, col2 = st.columns(2)
with col1:
st.subheader("πŸ“ Text Input")
text_input = st.text_area(
"Enter text (optional):",
height=100,
placeholder="Type or paste your text here...",
)
st.subheader("🎡 Audio Input")
# Audio preprocessing information for fused model
st.info(
"ℹ️ **Audio Preprocessing**: Audio will be automatically processed to match CREMA-D + RAVDESS training format: "
"16kHz sampling rate, max 5 seconds, with automatic resampling and feature extraction."
)
# Audio input method for fused model
audio_input_method = st.radio(
"Audio input method:",
["πŸ“ Upload File", "πŸŽ™οΈ Record Audio"],
key="fused_audio_method",
horizontal=True,
)
if audio_input_method == "πŸ“ Upload File":
uploaded_audio = st.file_uploader(
"Upload audio file (optional):",
type=["wav", "mp3", "m4a", "flac"],
key="fused_audio",
)
audio_source = "uploaded_file"
audio_name = uploaded_audio.name if uploaded_audio else None
else:
# Audio recorder for fused model
recorded_audio = st.audio_input(
label="Record audio (optional):",
key="fused_audio_recorder",
help="Click to record audio for sentiment analysis",
)
if recorded_audio is not None:
st.audio(recorded_audio, format="audio/wav")
st.success("βœ… Audio recorded successfully!")
uploaded_audio = recorded_audio
audio_source = "recorded"
audio_name = "Recorded Audio"
else:
uploaded_audio = None
audio_source = None
audio_name = None
with col2:
st.subheader("πŸ–ΌοΈ Image Input")
# Face cropping is set to 0% (no padding) for tightest crop
st.info(
"🎯 **Face Cropping**: Set to 0% padding for tightest crop on facial features"
)
# Image input method for fused model
image_input_method = st.radio(
"Image input method:",
["πŸ“ Upload File", "πŸ“· Take Photo"],
key="fused_image_method",
horizontal=True,
)
if image_input_method == "πŸ“ Upload File":
uploaded_image = st.file_uploader(
"Upload image file (optional):",
type=["png", "jpg", "jpeg", "bmp", "tiff"],
key="fused_image",
)
if uploaded_image:
image = Image.open(uploaded_image)
st.image(image, caption="Uploaded Image", use_container_width=True)
else:
# Camera capture for fused model
camera_photo = st.camera_input(
"Take a photo (optional):",
key="fused_camera",
help="Click to take a photo for sentiment analysis",
)
if camera_photo:
image = Image.open(camera_photo)
st.image(image, caption="Captured Photo", use_container_width=True)
# Set uploaded_image to camera_photo for processing
uploaded_image = camera_photo
if uploaded_audio:
st.audio(
uploaded_audio, format=f'audio/{uploaded_audio.name.split(".")[-1]}'
)
# Analyze button
if st.button("πŸ” Run Fused Analysis", type="primary", use_container_width=True):
if text_input or uploaded_audio or uploaded_image:
with st.spinner("Running fused sentiment analysis..."):
# Prepare inputs
audio_bytes = uploaded_audio.getvalue() if uploaded_audio else None
image = Image.open(uploaded_image) if uploaded_image else None
# Get fused prediction
sentiment, confidence = predict_fused_sentiment(
text=text_input if text_input else None,
audio_bytes=audio_bytes,
image=image,
)
# Display results
st.markdown("### Fused Model Results")
col1, col2 = st.columns(2)
with col1:
st.metric("Final Sentiment", sentiment)
with col2:
st.metric("Overall Confidence", f"{confidence:.2f}")
# Show individual model results
st.markdown("### Individual Model Results")
results_data = []
if text_input:
text_sentiment, text_conf = predict_text_sentiment(text_input)
results_data.append(
{
"Model": "Text (TextBlob) βœ…",
"Input": f"Text: {text_input[:50]}...",
"Sentiment": text_sentiment,
"Confidence": f"{text_conf:.2f}",
}
)
if uploaded_audio:
audio_sentiment, audio_conf = predict_audio_sentiment(audio_bytes)
results_data.append(
{
"Model": "Audio (Wav2Vec2) βœ…",
"Input": f"Audio: {audio_name}",
"Sentiment": audio_sentiment,
"Confidence": f"{audio_conf:.2f}",
}
)
if uploaded_image:
# Face cropping is set to 0% (no padding) for tightest crop
vision_sentiment, vision_conf = predict_vision_sentiment(
image, crop_tightness=0.0
)
results_data.append(
{
"Model": "Vision (ResNet-50)",
"Input": f"Image: {uploaded_image.name}",
"Sentiment": vision_sentiment,
"Confidence": f"{vision_conf:.2f}",
}
)
if results_data:
df = pd.DataFrame(results_data)
st.dataframe(df, use_container_width=True)
# Final result display
sentiment_colors = {"Positive": "🟒", "Negative": "πŸ”΄", "Neutral": "🟑"}
st.markdown(
f"""
<div class="result-box">
<h4>{sentiment_colors.get(sentiment, "❓")} Final Fused Sentiment: {sentiment}</h4>
<p><strong>Overall Confidence:</strong> {confidence:.2f}</p>
<p><strong>Models Used:</strong> {len(results_data)}</p>
</div>
""",
unsafe_allow_html=True,
)
else:
st.warning(
"⚠️ Please provide at least one input (text, audio, or image) for fused analysis."
)
# Footer
# Horizontal rule followed by a centered attribution line.
_FOOTER_HTML = """
<div style="text-align: center; color: #666; padding: 1rem;">
<p>Built with ❀️ | by <a href="https://github.com/iamfaham">iamfaham</a></p>
</div>
"""
st.markdown("---")
st.markdown(_FOOTER_HTML, unsafe_allow_html=True)