Spaces:

jmprathab
/

dlgenai-emotion-classification

Sleeping

App Files Files Community

jmprathab committed 12 days ago

Commit

49e3fdb

verified ·

1 Parent(s): e27a584

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +224 -36

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,228 @@
-import altair as alt
 import numpy as np
 import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

+from transformers import AutoTokenizer, AutoModel
+import kagglehub
 import numpy as np
+import os
 import pandas as pd
 import streamlit as st
+import torch
+import torch.nn as nn
+# KaggleHub handle passed to kagglehub.model_download(); the downloaded
+# directory is searched for the .pth checkpoint.
+MODEL_HANDLE = "prathabmurugan/dlgenai-emotion-classification/pyTorch/1a"
+# Emotion names, index-aligned with THRESHOLDS and with the prediction columns.
+EMOTION_LABELS = [
+    'anger',
+    'fear',
+    'joy',
+    'sadness',
+    'surprise',
+]
+# Per-label cutoffs: a label counts as detected when its sigmoid score is
+# strictly greater than the entry at the same index.
+THRESHOLDS = np.array([0.85, 0.43, 0.21, 0.7, 0.36])
+# Token length every input is truncated/padded to by the tokenizer call.
+MAX_LEN = 100
+# Run on CUDA when torch reports it as available, otherwise on the CPU.
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+class RobertaClassifier(nn.Module):
+    """Pretrained encoder followed by dropout and a linear classification head.
+
+    Args:
+        model_name: Identifier handed to ``AutoModel.from_pretrained``.
+        num_labels: Number of outputs produced by the linear head.
+        dropout: Dropout probability applied to the pooled encoder output.
+    """
+    def __init__(self, model_name: str, num_labels: int, dropout: float = 0.3):
+        super().__init__()
+        # The attribute names below become state-dict key prefixes, so they
+        # have to stay in sync with whatever checkpoint is loaded into this model.
+        self.roberta = AutoModel.from_pretrained(model_name)
+        self.dropout = nn.Dropout(dropout)
+        self.classifier = nn.Linear(
+            self.roberta.config.hidden_size, num_labels)
+    def forward(self, input_ids, attention_mask):
+        """Return the classification head's raw logits for a token batch."""
+        encoded = self.roberta(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+        )
+        return self.classifier(self.dropout(encoded.pooler_output))
+def standardize_space(text):
+    """Collapse every run of whitespace in *text* into a single space.
+
+    Leading and trailing whitespace is removed. Non-string values are
+    converted with ``str()`` first, except missing values (``None`` or a
+    float NaN, such as an empty CSV cell loaded by pandas), which yield an
+    empty string instead of the literal words "None" / "nan".
+    """
+    # NaN is the only float that compares unequal to itself.
+    if text is None or (isinstance(text, float) and text != text):
+        return ""
+    return " ".join(str(text).split())
+@st.cache_resource
+def load_resources():
+    """Download the checkpoint and build the model and tokenizer.
+
+    Wrapped in ``st.cache_resource``. Progress and failures are shown through
+    a temporary placeholder; when a step fails, the error is displayed and the
+    script run is halted with ``st.stop()``.
+
+    Returns:
+        A ``(model, tokenizer)`` tuple. The model has been moved to ``DEVICE``
+        and switched to eval mode.
+    """
+    status_container = st.empty()
+    # 1. Download Model Weights
+    status_container.info(
+        f"Downloading model weights from KaggleHub [{MODEL_HANDLE}]")
+    try:
+        model_dir = kagglehub.model_download(MODEL_HANDLE)
+        model_path = os.path.join(model_dir, "roberta_best_model.pth")
+        if not os.path.exists(model_path):
+            # os.listdir returns entries in arbitrary order; sort so the
+            # fallback picks the same checkpoint on every run.
+            candidates = sorted(
+                f for f in os.listdir(model_dir) if f.endswith('.pth'))
+            if not candidates:
+                raise FileNotFoundError(
+                    f"Could not find .pth file in [{model_dir}]")
+            model_path = os.path.join(model_dir, candidates[0])
+    except Exception as e:
+        status_container.error(f"Failed to download model [{e}]")
+        st.stop()
+    # 2. Initialize Architecture
+    status_container.info("Initializing RoBERTa architecture")
+    try:
+        tokenizer = AutoTokenizer.from_pretrained("roberta-base")
+        # Size the head from the label list so the two cannot drift apart.
+        model = RobertaClassifier(
+            "roberta-base", num_labels=len(EMOTION_LABELS))
+    except Exception as e:
+        status_container.error(f"Error initializing model [{e}]")
+        st.stop()
+    # 3. Load Weights
+    try:
+        # NOTE(review): torch.load unpickles the downloaded file. Since only a
+        # state dict is consumed here, consider passing weights_only=True once
+        # the deployed torch version is confirmed to support it.
+        state_dict = torch.load(model_path, map_location=DEVICE)
+        model.load_state_dict(state_dict)
+        model.to(DEVICE)
+        model.eval()
+    except Exception as e:
+        status_container.error(f"Error loading state dict [{e}]")
+        st.stop()
+    status_container.empty()  # Clear the status messages
+    return model, tokenizer
+def predict(texts, model, tokenizer):
+    """Score a batch of texts and apply the per-emotion thresholds.
+
+    Args:
+        texts: Iterable of inputs; each is whitespace-normalized first.
+        model: Callable taking ``(input_ids, attention_mask)`` and returning logits.
+        tokenizer: Tokenizer called on the list of cleaned texts.
+
+    Returns:
+        ``(preds, probs)`` where ``probs`` is the sigmoid of the logits as a
+        NumPy array and ``preds`` is ``probs > THRESHOLDS`` cast to int.
+    """
+    cleaned = list(map(standardize_space, texts))
+    # Every row is truncated/padded to exactly MAX_LEN tokens.
+    batch = tokenizer(
+        cleaned,
+        truncation=True,
+        max_length=MAX_LEN,
+        padding='max_length',
+        return_tensors='pt',
+    )
+    ids_on_device = batch['input_ids'].to(DEVICE)
+    mask_on_device = batch['attention_mask'].to(DEVICE)
+    # No gradients are needed for inference.
+    with torch.no_grad():
+        scores = torch.sigmoid(model(ids_on_device, mask_on_device))
+        probs = scores.cpu().numpy()
+    # Strict comparison against the index-aligned cutoffs.
+    preds = (probs > THRESHOLDS).astype(int)
+    return preds, probs
+# Streamlit UI: page configuration and heading
+st.set_page_config(page_title="Emotion Classifier", layout="centered")
+st.title("Emotion Classification")
+intro_text = "This app pulls a custom fine-tuned **RoBERTa** model from Kaggle to classify text into 5 emotions."
+st.markdown(intro_text)
+# Model and tokenizer come from the cached loader
+model, tokenizer = load_resources()
+# One tab per input mode
+tab_labels = ["Single Text Inference", "Batch CSV Inference"]
+tab1, tab2 = st.tabs(tab_labels)
+# Single-text mode: classify one sentence and show labels next to scores.
+with tab1:
+    st.header("Test a single sentence")
+    user_input = st.text_area("Enter text here:", "Hello World!")
+    analyze_clicked = st.button("Analyze Text", type="primary")
+    if analyze_clicked and not user_input.strip():
+        # Whitespace-only input is rejected before touching the model.
+        st.warning("Please enter some text.")
+    elif analyze_clicked:
+        with st.spinner("Analyzing..."):
+            preds, probs = predict([user_input], model, tokenizer)
+        # predict() returns one row per input; only one text was sent.
+        row_preds, row_probs = preds[0], probs[0]
+        st.subheader("Results:")
+        labels_col, scores_col = st.columns(2)
+        with labels_col:
+            st.write("**Detected Emotions:**")
+            detected = [
+                label.capitalize()
+                for label, hit in zip(EMOTION_LABELS, row_preds)
+                if hit
+            ]
+            if not detected:
+                st.markdown(
+                    "*No specific emotion detected above thresholds.*")
+            for emotion_name in detected:
+                st.markdown(f"### ✅ {emotion_name}")
+        with scores_col:
+            st.write("**Confidence Scores:**")
+            scores_df = pd.DataFrame({
+                "Emotion": EMOTION_LABELS,
+                "Score": row_probs,
+                "Threshold": THRESHOLDS,
+                "Detected": row_preds.astype(bool),
+            })
+            # Percent-format the scores and shade them green by magnitude.
+            styled_scores = scores_df.style.format(
+                {"Score": "{:.2%}", "Threshold": "{:.2f}"})
+            styled_scores = styled_scores.background_gradient(
+                subset=["Score"], cmap="Greens")
+            st.dataframe(
+                styled_scores,
+                hide_index=True,
+                use_container_width=True,
+            )
+# Batch mode: run every row of an uploaded CSV through the model and offer
+# the 0/1 predictions as a downloadable CSV.
+with tab2:
+    st.header("Batch Process (CSV)")
+    st.markdown("Upload a CSV file with a `text` and `id` column.")
+    uploaded_file = st.file_uploader("Upload CSV", type=["csv"])
+    input_df = None
+    if uploaded_file is not None:
+        # Only the parse step is reported as a CSV error; inference failures
+        # get their own message further down.
+        try:
+            input_df = pd.read_csv(uploaded_file)
+        except Exception as e:
+            st.error(f"Error reading CSV: {e}")
+    if input_df is not None:
+        if 'text' not in input_df.columns:
+            st.error("CSV must have a 'text' column.")
+        elif len(input_df) == 0:
+            # A header-only file would otherwise reach np.vstack([]) and fail.
+            st.warning("The uploaded CSV has no rows to process.")
+        else:
+            st.info(
+                f"Loaded [{len(input_df)}] rows. Click below to start.")
+            # NOTE(review): the download button is rendered inside this button's
+            # branch, so it presumably vanishes on the rerun triggered by
+            # clicking it; confirm against Streamlit's button behavior docs.
+            if st.button("Generate Predictions"):
+                progress_bar = st.progress(0)
+                status_text = st.empty()
+                # Process in batches
+                batch_size = 16
+                all_preds = []
+                # Empty cells load as NaN; score them as empty text rather
+                # than as the literal word "nan".
+                texts = input_df['text'].fillna("").astype(str).tolist()
+                try:
+                    for i in range(0, len(texts), batch_size):
+                        batch_texts = texts[i:i + batch_size]
+                        batch_preds, _ = predict(batch_texts, model, tokenizer)
+                        all_preds.append(batch_preds)
+                        # Update progress (the last batch may be short, so clamp)
+                        progress = min((i + batch_size) / len(texts), 1.0)
+                        progress_bar.progress(progress)
+                        status_text.text(
+                            f"Processed {i + len(batch_texts)}/{len(texts)} rows")
+                    # Aggregate results
+                    predictions_np = np.vstack(all_preds)
+                except Exception as e:
+                    st.error(f"Error generating predictions: {e}")
+                else:
+                    submission_df = pd.DataFrame(
+                        predictions_np, columns=EMOTION_LABELS, dtype=int)
+                    # Combine with original IDs
+                    if 'id' in input_df.columns:
+                        final_df = pd.concat(
+                            [input_df[['id']], submission_df], axis=1)
+                    else:
+                        final_df = submission_df
+                    st.success("Processing complete!")
+                    st.dataframe(final_df.head(), use_container_width=True)
+                    # Download button
+                    csv = final_df.to_csv(index=False).encode('utf-8')
+                    st.download_button(
+                        label="Download Predictions CSV",
+                        data=csv,
+                        file_name="submission.csv",
+                        mime="text/csv"
+                    )