Spaces:

xapavlov
/

paper-classifier

Sleeping

App Files Files Community

Andrei Pavlov committed on Apr 8

Commit

e0b0f3b

1 Parent(s): 340f25a

Paper classifier app and model

Browse files

Files changed (9) hide show

src/config.py +43 -0
src/model/final/config.json +85 -0
src/model/final/label_mapping.json +86 -0
src/model/final/model.safetensors +3 -0
src/model/final/tokenizer.json +0 -0
src/model/final/tokenizer_config.json +14 -0
src/model/final/training_args.bin +3 -0
src/model_utils.py +75 -0
src/streamlit_app.py +95 -38

src/config.py ADDED Viewed

	@@ -0,0 +1,43 @@

+from pathlib import Path
+import re
+# Directory containing this file; the paths below are all built from it.
+ROOT = Path(__file__).parent
+DATA_DIR = ROOT / "data"
+MODEL_DIR = ROOT / "model"
+RAW_DATA_PATH = ROOT / "arxivData.json"
+# Presumably training hyperparameters; the training script is not visible here, so verify usage.
+SEED = 42
+BATCH_SIZE = 16
+NUM_EPOCHS = 10
+VAL_RATIO = 0.1
+TEST_RATIO = 0.1
+# NOTE(review): 1e-3 looks high for fine-tuning a BERT-style model — confirm this is the value actually used.
+LEARNING_RATE = 1e-3
+# Maximum sequence length in tokens (imported by model_utils for tokenization).
+MAX_LENGTH = 512
+def _load_taxonomy(path):
+    """Parse a taxonomy file into a ``{tag: display name}`` dict.
+
+    A useful line looks like ``cs.AI (Artificial Intelligence)``: a tag made
+    of word characters, dots and dashes, then whitespace, then the name in
+    parentheses. Blank lines and lines that do not match are skipped.
+
+    A missing file yields an empty dict instead of raising. The taxonomy is
+    loaded at import time, so raising here would make the whole module
+    unimportable; with an empty dict ``get_tag_name`` simply returns the raw
+    tag.
+    """
+    pattern = re.compile(r"^([\w.-]+)\s+\((.+)\)$")
+    tag_names = {}
+    try:
+        # The context manager closes the handle; the explicit encoding keeps
+        # decoding independent of the platform default.
+        with open(path, encoding="utf-8") as taxonomy_file:
+            for line in taxonomy_file:
+                match = pattern.match(line.strip())
+                if match:
+                    tag_names[match.group(1)] = match.group(2)
+    except FileNotFoundError:
+        return {}
+    return tag_names
+TAG_NAMES = _load_taxonomy(ROOT / "taxonomy.txt")
+def get_tag_name(tag):
+    """Return the display name for ``tag``, or ``tag`` itself when unknown.
+
+    The full tag is looked up first, then the part before the first dot
+    (``"cs.AI"`` -> ``"cs"``).
+    """
+    # For a tag without a dot the prefix is the tag itself.
+    prefix = tag.partition(".")[0]
+    for candidate in (tag, prefix):
+        if candidate in TAG_NAMES:
+            return TAG_NAMES[candidate]
+    return tag

src/model/final/config.json ADDED Viewed

	@@ -0,0 +1,85 @@

+{
+  "add_cross_attention": false,
+  "architectures": [
+    "BertForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": null,
+  "classifier_dropout": null,
+  "dtype": "float32",
+  "eos_token_id": null,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "cmp-lg",
+    "1": "cs.AI",
+    "2": "cs.CE",
+    "3": "cs.CL",
+    "4": "cs.CR",
+    "5": "cs.CV",
+    "6": "cs.CY",
+    "7": "cs.DB",
+    "8": "cs.DC",
+    "9": "cs.DS",
+    "10": "cs.GT",
+    "11": "cs.HC",
+    "12": "cs.IR",
+    "13": "cs.IT",
+    "14": "cs.LG",
+    "15": "cs.LO",
+    "16": "cs.MM",
+    "17": "cs.NE",
+    "18": "cs.RO",
+    "19": "cs.SD",
+    "20": "cs.SE",
+    "21": "cs.SI",
+    "22": "math.OC",
+    "23": "q-bio.NC",
+    "24": "stat.ME",
+    "25": "stat.ML"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "is_decoder": false,
+  "label2id": {
+    "cmp-lg": 0,
+    "cs.AI": 1,
+    "cs.CE": 2,
+    "cs.CL": 3,
+    "cs.CR": 4,
+    "cs.CV": 5,
+    "cs.CY": 6,
+    "cs.DB": 7,
+    "cs.DC": 8,
+    "cs.DS": 9,
+    "cs.GT": 10,
+    "cs.HC": 11,
+    "cs.IR": 12,
+    "cs.IT": 13,
+    "cs.LG": 14,
+    "cs.LO": 15,
+    "cs.MM": 16,
+    "cs.NE": 17,
+    "cs.RO": 18,
+    "cs.SD": 19,
+    "cs.SE": 20,
+    "cs.SI": 21,
+    "math.OC": 22,
+    "q-bio.NC": 23,
+    "stat.ME": 24,
+    "stat.ML": 25
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "problem_type": "single_label_classification",
+  "tie_word_embeddings": true,
+  "transformers_version": "5.5.0",
+  "type_vocab_size": 2,
+  "use_cache": false,
+  "vocab_size": 31090
+}

src/model/final/label_mapping.json ADDED Viewed

	@@ -0,0 +1,86 @@

+{
+  "label2id": {
+    "cmp-lg": "0",
+    "cs.AI": "1",
+    "cs.CE": "2",
+    "cs.CL": "3",
+    "cs.CR": "4",
+    "cs.CV": "5",
+    "cs.CY": "6",
+    "cs.DB": "7",
+    "cs.DC": "8",
+    "cs.DS": "9",
+    "cs.GT": "10",
+    "cs.HC": "11",
+    "cs.IR": "12",
+    "cs.IT": "13",
+    "cs.LG": "14",
+    "cs.LO": "15",
+    "cs.MM": "16",
+    "cs.NE": "17",
+    "cs.RO": "18",
+    "cs.SD": "19",
+    "cs.SE": "20",
+    "cs.SI": "21",
+    "math.OC": "22",
+    "q-bio.NC": "23",
+    "stat.ME": "24",
+    "stat.ML": "25"
+  },
+  "id2label": {
+    "0": "cmp-lg",
+    "1": "cs.AI",
+    "2": "cs.CE",
+    "3": "cs.CL",
+    "4": "cs.CR",
+    "5": "cs.CV",
+    "6": "cs.CY",
+    "7": "cs.DB",
+    "8": "cs.DC",
+    "9": "cs.DS",
+    "10": "cs.GT",
+    "11": "cs.HC",
+    "12": "cs.IR",
+    "13": "cs.IT",
+    "14": "cs.LG",
+    "15": "cs.LO",
+    "16": "cs.MM",
+    "17": "cs.NE",
+    "18": "cs.RO",
+    "19": "cs.SD",
+    "20": "cs.SE",
+    "21": "cs.SI",
+    "22": "math.OC",
+    "23": "q-bio.NC",
+    "24": "stat.ME",
+    "25": "stat.ML"
+  },
+  "label_names": {
+    "cmp-lg": "Computational Linguistics",
+    "cs.AI": "Artificial Intelligence",
+    "cs.CE": "Computational Engineering, Finance, and Science",
+    "cs.CL": "Computation and Language",
+    "cs.CR": "Cryptography and Security",
+    "cs.CV": "Computer Vision and Pattern Recognition",
+    "cs.CY": "Computers and Society",
+    "cs.DB": "Databases",
+    "cs.DC": "Distributed, Parallel, and Cluster Computing",
+    "cs.DS": "Data Structures and Algorithms",
+    "cs.GT": "Computer Science and Game Theory",
+    "cs.HC": "Human-Computer Interaction",
+    "cs.IR": "Information Retrieval",
+    "cs.IT": "Information Theory",
+    "cs.LG": "Machine Learning",
+    "cs.LO": "Logic in Computer Science",
+    "cs.MM": "Multimedia",
+    "cs.NE": "Neural and Evolutionary Computing",
+    "cs.RO": "Robotics",
+    "cs.SD": "Sound",
+    "cs.SE": "Software Engineering",
+    "cs.SI": "Social and Information Networks",
+    "math.OC": "Optimization and Control",
+    "q-bio.NC": "Neurons and Cognition",
+    "stat.ME": "Methodology",
+    "stat.ML": "Machine Learning"
+  }
+}

src/model/final/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a6e8d238bf5418b8d3b730f2ad95291c32d41b9628d9313b667f711d5cdddb90
+size 439777344

src/model/final/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

src/model/final/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "backend": "tokenizers",
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "is_local": false,
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

src/model/final/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cc0ddfa117157db3ff50032a9a59efc659d26c4602a636deec4a8cf00b781bab
+size 5329

src/model_utils.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import json
+import re
+from pathlib import Path
+import numpy as np
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from config import MAX_LENGTH, MODEL_DIR, get_tag_name
+def clean_text(text):
+    """Trim ``text`` and collapse every run of whitespace into one space."""
+    trimmed = text.strip()
+    collapsed = re.sub(r"\s+", " ", trimmed)
+    return collapsed
+def format_input(title, abstract=None):
+    """Build the model input string from a title and an optional abstract.
+
+    The abstract section is added only when ``abstract`` contains something
+    other than whitespace; otherwise only the title section is returned.
+    """
+    cleaned_title = clean_text(title)
+    if not abstract or not abstract.strip():
+        return f"[TITLE] {cleaned_title}"
+    return f"[TITLE] {cleaned_title} [SEP] [ABSTRACT] {clean_text(abstract)}"
+class PaperClassifier:
+    """Load a fine-tuned sequence-classification model and predict paper tags."""
+    def __init__(self, model_path=None):
+        """Load tokenizer, model and label mapping from ``model_path``.
+
+        ``model_path`` defaults to ``MODEL_DIR / "final"``. The directory must
+        also contain ``label_mapping.json`` with an ``id2label`` dict keyed by
+        stringified class index and, optionally, ``label_names``.
+        """
+        if model_path is None:
+            model_path = str(MODEL_DIR / "final")
+        # Prefer CUDA, then Apple MPS, then CPU.
+        self.device = torch.device(
+            "cuda" if torch.cuda.is_available()
+            else "mps" if torch.backends.mps.is_available()
+            else "cpu"
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
+        self.model.to(self.device)
+        self.model.eval()
+        # Explicit encoding keeps JSON decoding independent of the platform default.
+        with open(Path(model_path) / "label_mapping.json", encoding="utf-8") as f:
+            mapping = json.load(f)
+        self.id2label = mapping["id2label"]
+        self.label_names = mapping.get("label_names", {})
+    @torch.no_grad()
+    def predict(self, title, abstract=None, threshold=0.95):
+        """Return the most probable tags whose probabilities sum to ``threshold``.
+
+        Tags are visited in descending probability order and collected until
+        the cumulative probability reaches ``threshold``, so at least one
+        result is always returned. Each result is a dict with keys ``tag``,
+        ``name`` and ``probability``.
+        """
+        text = format_input(title, abstract)
+        # A single text needs no padding: padding to MAX_LENGTH would make
+        # every request pay for a full-length forward pass.
+        inputs = self.tokenizer(
+            text,
+            truncation=True,
+            max_length=MAX_LENGTH,
+            return_tensors="pt",
+        ).to(self.device)
+        logits = self.model(**inputs).logits[0].cpu().numpy()
+        # Softmax, shifted by the max logit for numerical stability.
+        probs = np.exp(logits - logits.max())
+        probs /= probs.sum()
+        results = []
+        cumulative = 0.0
+        for idx in np.argsort(probs)[::-1]:
+            # JSON object keys are strings, hence the str() lookup.
+            tag = self.id2label[str(idx)]
+            prob = float(probs[idx])
+            results.append({
+                "tag": tag,
+                "name": self.label_names.get(tag, get_tag_name(tag)),
+                "probability": prob,
+            })
+            cumulative += prob
+            if cumulative >= threshold:
+                break
+        return results

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,97 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

 import streamlit as st
+from model_utils import PaperClassifier
+# Page-level configuration for the app.
+st.set_page_config(page_title="Paper Classifier", layout="centered")
+# CSS for the result cards and probability bars emitted by show_results.
+st.markdown("""
+<style>
+    .result-box {
+        background: #4a5568; padding: 1rem; border-radius: 8px; color: white; margin-bottom: 0.5rem;
+    }
+    .prob-bar {
+        background: rgba(255,255,255,0.2); border-radius: 6px; height: 22px; margin-top: 4px; overflow: hidden;
+    }
+    .prob-fill {
+        background: #68d391; height: 100%; border-radius: 6px;
+        padding-left: 8px; font-size: 0.85rem; font-weight: 600;
+        color: #1a202c; display: flex; align-items: center;
+    }
+</style>
+""", unsafe_allow_html=True)
+@st.cache_resource(show_spinner="Loading model...")
+def load_model():
+    """Build the classifier; the ``st.cache_resource`` decorator caches it."""
+    classifier = PaperClassifier()
+    return classifier
+# Sample papers offered as one-click inputs; the last one has an empty abstract.
+EXAMPLES = [
+    {"title": "Attention Is All You Need",
+     "abstract": "We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely."},
+    {"title": "A Survey on 3D Gaussian Splatting",
+     "abstract": "3D Gaussian splatting (GS) has emerged as a transformative technique in radiance fields. Unlike mainstream implicit neural models, 3D GS uses millions of learnable 3D Gaussians for an explicit scene representation."},
+    {"title": "Interior Point Differential Dynamic Programming",
+     "abstract": ""},
+]
+# Create the session-state entries behind the two input widgets if they do not exist yet.
+if "input_title" not in st.session_state:
+    st.session_state.input_title = ""
+if "input_abstract" not in st.session_state:
+    st.session_state.input_abstract = ""
+def set_example(idx):
+    """Copy example ``idx`` from ``EXAMPLES`` into the title/abstract state."""
+    example = EXAMPLES[idx]
+    st.session_state.input_title = example["title"]
+    st.session_state.input_abstract = example["abstract"]
+def show_results(results):
+    """Render predictions as a heading plus one probability bar per tag.
+
+    ``results`` is a list of dicts with keys ``tag``, ``name`` and
+    ``probability`` (a fraction between 0 and 1).
+    """
+    # Local import: only this function needs it.
+    from html import escape
+    # Avoid "Predicted 1 categories" when a single tag is returned.
+    noun = "category" if len(results) == 1 else "categories"
+    st.markdown(f"### Predicted {len(results)} {noun}")
+    for r in results:
+        pct = r["probability"] * 100
+        # Label text goes into raw HTML (unsafe_allow_html), so escape it.
+        # The bar is at least 3% wide so the percentage label stays visible.
+        st.markdown(f"""
+        <div class="result-box">
+            <b>{escape(r['tag'])}</b> - {escape(r['name'])}
+            <div class="prob-bar">
+                <div class="prob-fill" style="width:{max(pct,3)}%">{pct:.1f}%</div>
+            </div>
+        </div>""", unsafe_allow_html=True)
+def main():
+    """Render the page: title, inputs, example buttons and the Classify action."""
+    st.title("Paper Classifier")
+    st.write("Classify papers using fine-tuned SciBERT in one click!")
+    # Nothing else on the page is usable without a model, so report and stop.
+    try:
+        clf = load_model()
+    except Exception as err:
+        st.error(f"Could not load model: {err}")
+        return
+    # The widget keys match the session-state entries that set_example writes.
+    title = st.text_input("**Title:**", key="input_title", placeholder="Paste paper title here")
+    abstract = st.text_area("**Abstract**", key="input_abstract", placeholder="You can leave it empty", height=150)
+    st.write("**Use our examples:**")
+    cols = st.columns(len(EXAMPLES))
+    for i, (col, ex) in enumerate(zip(cols, EXAMPLES)):
+        with col:
+            # Button labels are the title, cut to 20 characters plus "..." when longer.
+            label = ex["title"][:20] + "..." if len(ex["title"]) > 20 else ex["title"]
+            st.button(label, key=f"ex_{i}", on_click=set_example, args=(i,), use_container_width=True)
+    if st.button("Classify", use_container_width=True):
+        # A title is required; the abstract may be empty.
+        if not title or not title.strip():
+            st.warning("Enter a title first.")
+            return
+        with st.spinner("Classifying..."):
+            try:
+                results = clf.predict(title=title, abstract=abstract)
+            except Exception as err:
+                st.error(f"Error: {err}")
+                return
+        show_results(results)
+if __name__ == "__main__":
+    main()