Spaces:

anshu9749
/

classification

Sleeping

App Files Files Community

anshu9749 committed on May 14, 2025

Commit

d016772

verified ·

1 Parent(s): c3779ec

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +58 -29

src/streamlit_app.py CHANGED Viewed

@@ -2,39 +2,68 @@ import altair as alt
 import numpy as np
 import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

 import numpy as np
 import pandas as pd
 import streamlit as st
+import streamlit as st
+import pandas as pd
+import torch
+import torch.nn.functional as F
+from transformers import BertTokenizer, BertForSequenceClassification
+@st.cache_resource(show_spinner=False)
+def load_model():
+    # Load your fine-tuned model and tokenizer
+    tokenizer = BertTokenizer.from_pretrained("CustomModel")
+    model = BertForSequenceClassification.from_pretrained("CustomModel")
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+    return tokenizer, model, device
+tokenizer, model, device = load_model()
+st.title("Batch Toxic Comment Classifier")
+st.write("Upload a CSV file containing text comments and get toxicity scores for each row.")
+# CSV upload
+uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
+if uploaded_file is not None:
+    df = pd.read_csv(uploaded_file)
+    # Let user select which column contains text
+    text_cols = df.select_dtypes(include=["object"]).columns.tolist()
+    if not text_cols:
+        st.error("No text columns found in the uploaded file.")
+    else:
+        col = st.selectbox("Select text column to classify", text_cols)
+        if st.button("Classify CSV"):
+            texts = df[col].astype(str).tolist()
+            results = []
+            # Batch inference
+            for text in texts:
+                inputs = tokenizer(
+                    text,
+                    padding=True,
+                    truncation=True,
+                    return_tensors="pt"
+                ).to(device)
+                outputs = model(**inputs)
+                probs = F.softmax(outputs.logits, dim=-1).detach().cpu().numpy()[0]
+                id2label = model.config.id2label if hasattr(model.config, "id2label") else {0: "non-toxic", 1: "toxic"}
+                # record per-row scores
+                row_res = {id2label[i]: float(probs[i]) for i in range(len(probs))}
+                results.append(row_res)
+            # Combine with original
+            score_df = pd.DataFrame(results)
+            combined = pd.concat([df.reset_index(drop=True), score_df], axis=1)
+            st.subheader("Classification Results")
+            st.dataframe(combined)
+            # Optional: download results
+            csv = combined.to_csv(index=False).encode('utf-8')
+            st.download_button(
+                label="Download results as CSV",
+                data=csv,
+                file_name="classified_results.csv",
+                mime="text/csv"
+            )