Spaces:

ChocoLord
/

paper_classifier

Sleeping

App Files Files Community

ChocoLord committed on Apr 7

Commit

ae436d1

1 Parent(s): 6a8432c

Add Streamlit app

Browse files

Files changed (4) hide show

Dockerfile +16 -0
README.md +1 -0
app.py +100 -0
requirements.txt +8 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,16 @@

+FROM python:3.11-slim
+# Hugging Face Docker Spaces run the container as UID 1000. Create that user
+# and give it a real HOME so the model download cache and Streamlit's config
+# directory land somewhere writable instead of failing with PermissionError.
+RUN useradd --create-home --uid 1000 user
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1 \
+    HOME=/home/user
+WORKDIR /app
+# Install dependencies before copying the sources so this layer is reused
+# when only application code changes.
+COPY requirements.txt .
+RUN pip install --upgrade pip && pip install -r requirements.txt
+COPY --chown=user:user . .
+# Drop root only after the system-wide package install has finished.
+USER user
+EXPOSE 7860
+CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ emoji: 🏢
 colorFrom: red
 colorTo: green
 sdk: docker
 pinned: false
 short_description: Classifies arxiv paper
 ---

 colorFrom: red
 colorTo: green
 sdk: docker
+app_port: 7860
 pinned: false
 short_description: Classifies arxiv paper
 ---

app.py ADDED Viewed

	@@ -0,0 +1,100 @@

+import os
+import json
+import numpy as np
+import pandas as pd
+import streamlit as st
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import plotly.express as px
+MODEL_REPO = os.getenv("MODEL_REPO", "ChocoLord/paper-classifier-model")
+MAX_LENGTH = int(os.getenv("MAX_LENGTH", "512"))
+TOP_P = float(os.getenv("TOP_P", "0.95"))
+st.set_page_config(page_title="Paper classifier", layout="wide")
+st.title("Paper classifier")
+@st.cache_resource
+def load_artifacts():
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
+    model = AutoModelForSequenceClassification.from_pretrained(MODEL_REPO)
+    model.eval()
+    id2label = model.config.id2label
+    if id2label is None or len(id2label) == 0:
+        raise ValueError("Model config must contain id2label.")
+    id2label = {int(k): v for k, v in id2label.items()} if not isinstance(list(id2label.keys())[0], int) else id2label
+    return tokenizer, model, id2label
+# Module-level handles used by predict(); load_artifacts is wrapped in
+# st.cache_resource, so this is expected to reuse the cached objects.
+tokenizer, model, id2label = load_artifacts()
+def predict(title: str, summary: str):
+    title = title or ""
+    summary = summary or ""
+    text = f"{title}\n{summary}".strip()
+    inputs = tokenizer(
+        text,
+        truncation=True,
+        padding="max_length",
+        max_length=MAX_LENGTH,
+        return_tensors="pt",
+    )
+    with torch.no_grad():
+        logits = model(**inputs).logits
+        probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
+    labels = [id2label[i] for i in range(len(probs))]
+    df = pd.DataFrame({
+        "class_name": labels,
+        "predicted_proba": probs,
+    }).sort_values("predicted_proba", ascending=False).reset_index(drop=True)
+    df["cumsum"] = df["predicted_proba"].cumsum()
+    cutoff_idx = int(np.searchsorted(df["cumsum"].values, TOP_P, side="left"))
+    selected_df = df.iloc[:cutoff_idx + 1].copy()
+    return df, selected_df
+title = st.text_input("Title")
+summary = st.text_area("Summary", height=250)
+n_value = st.number_input("Max classes to display in text output", min_value=1, max_value=100, value=20, step=1)
+if st.button("Classify", type="primary"):
+    if not title.strip() and not summary.strip():
+        st.warning("Enter title and/or summary.")
+    else:
+        df, selected_df = predict(title, summary)
+        st.subheader("Selected classes")
+        st.write(
+            f"Top classes whose cumulative predicted probability reaches at least {TOP_P:.2f}. "
+            f"Selected {len(selected_df)} classes with total probability {selected_df['predicted_proba'].sum():.4f}."
+        )
+        text_df = selected_df.head(int(n_value)).copy()
+        lines = [
+            f"{i+1}. {row.class_name} — {row.predicted_proba:.4f}"
+            for i, row in text_df.iterrows()
+        ]
+        st.text("\n".join(lines))
+        st.subheader("Probability bar chart")
+        fig = px.bar(
+            df,
+            x="class_name",
+            y="predicted_proba",
+            hover_data=["cumsum"],
+        )
+        fig.update_layout(
+            xaxis_title="Class",
+            yaxis_title="Predicted probability",
+            xaxis_tickangle=-45,
+        )
+        st.plotly_chart(fig, use_container_width=True)
+        with st.expander("Full sorted predictions"):
+            st.dataframe(df, use_container_width=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+streamlit
+torch
+transformers
+pandas
+numpy
+plotly
+sentencepiece
+safetensors