Spaces:

gvbazhenov
/

paper-categorization

Build error

App Files Files Community

gvbazhenov committed on Apr 5, 2025

Commit

7120cc1

1 Parent(s): 38581dc

deploy

Browse files

Files changed (6) hide show

app.py +69 -0
categories.csv +122 -0
checkpoint/config.json +271 -0
checkpoint/model.safetensors +3 -0
checkpoint/training_args.bin +3 -0
requirements.txt +4 -0

app.py ADDED Viewed

	@@ -0,0 +1,69 @@

+import numpy as np
+import pandas as pd
+import torch
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSequenceClassification
+)
+import streamlit as st
+DEPLOYMENT_PATH = '.'
+@st.cache_resource
+def setup():
+    model_name = 'distilbert-base-cased'
+    model = AutoModelForSequenceClassification.from_pretrained(f'{DEPLOYMENT_PATH}/checkpoint')
+    model.eval()
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    idx2category = pd.read_csv(f'{DEPLOYMENT_PATH}/categories.csv').values.squeeze()
+    return model, tokenizer, idx2category
+@torch.no_grad()
+def get_probas(title, abstract=None):
+    inputs = tokenizer(
+        title,
+        abstract,
+        padding=True,
+        truncation=True,
+        return_tensors='pt'
+    )
+    outputs = model(**inputs)
+    logits = outputs.logits
+    probas = (
+        torch.sigmoid(logits)
+        .detach().numpy().reshape(-1)
+    )
+    return probas
+# Load the resources once at import time and expose them as module-level
+# globals; the helper functions in this script read them directly.
+model, tokenizer, idx2category = setup()
+# Number of category names; bounds the index loop in
+# get_categories_by_threshold.
+num_categories = len(idx2category)
+def get_categories_by_threshold(probas, threshold=0.3):
+    categories = [
+        idx2category[idx] for idx in range(num_categories)
+        if probas[idx] > threshold
+    ]
+    return categories
+def get_top_categories(probas, num_predictions=5):
+    categories = [
+        idx2category[idx] for idx in np.argsort(probas)[::-1][:num_predictions]
+    ]
+    return categories
+st.title('ArXiv Papers Categorization')
+title_input = st.text_input('Enter the title of paper:')
+abstract_input = st.text_area('Enter the abstract (optional):')
+IS_READY = len(title_input) > 0
+if IS_READY and st.button('Categorize'):
+    probas = get_probas(title_input, abstract_input)
+    categories_predicted = get_categories_by_threshold(probas)
+    if len(categories_predicted) == 0:
+        categories_predicted = get_top_categories(probas)
+    st.write('Relevant arXiv categories:')
+    for category in categories_predicted:
+        st.markdown(f'- `{category}`')

categories.csv ADDED Viewed

	@@ -0,0 +1,122 @@

+category
+cond-mat.dis-nn
+cs.AI
+cs.AR
+cs.CC
+cs.CE
+cs.CG
+cs.CL
+cs.CR
+cs.CV
+cs.CY
+cs.DB
+cs.DC
+cs.DL
+cs.DM
+cs.DS
+cs.ET
+cs.FL
+cs.GL
+cs.GR
+cs.GT
+cs.HC
+cs.IR
+cs.IT
+cs.LG
+cs.LO
+cs.MA
+cs.MM
+cs.MS
+cs.NA
+cs.NE
+cs.NI
+cs.OH
+cs.OS
+cs.PF
+cs.PL
+cs.RO
+cs.SC
+cs.SD
+cs.SE
+cs.SI
+cs.SY
+econ.EM
+eess.AS
+eess.IV
+eess.SP
+math.AC
+math.AG
+math.AP
+math.AT
+math.CA
+math.CO
+math.CT
+math.CV
+math.DG
+math.DS
+math.FA
+math.GM
+math.GN
+math.GR
+math.GT
+math.HO
+math.IT
+math.LO
+math.MG
+math.MP
+math.NA
+math.NT
+math.OA
+math.OC
+math.PR
+math.QA
+math.RA
+math.RT
+math.SP
+math.ST
+nlin.AO
+nlin.CD
+nlin.CG
+nlin.PS
+physics.ao-ph
+physics.app-ph
+physics.bio-ph
+physics.chem-ph
+physics.class-ph
+physics.comp-ph
+physics.data-an
+physics.flu-dyn
+physics.gen-ph
+physics.geo-ph
+physics.hist-ph
+physics.ins-det
+physics.med-ph
+physics.optics
+physics.pop-ph
+physics.soc-ph
+physics.space-ph
+q-bio
+q-bio.BM
+q-bio.CB
+q-bio.GN
+q-bio.MN
+q-bio.NC
+q-bio.OT
+q-bio.PE
+q-bio.QM
+q-bio.SC
+q-bio.TO
+q-fin.CP
+q-fin.EC
+q-fin.GN
+q-fin.PM
+q-fin.PR
+q-fin.RM
+q-fin.ST
+q-fin.TR
+stat.AP
+stat.CO
+stat.ME
+stat.ML
+stat.OT
+stat.TH

checkpoint/config.json ADDED Viewed

	@@ -0,0 +1,271 @@

+{
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForSequenceClassification"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2",
+    "3": "LABEL_3",
+    "4": "LABEL_4",
+    "5": "LABEL_5",
+    "6": "LABEL_6",
+    "7": "LABEL_7",
+    "8": "LABEL_8",
+    "9": "LABEL_9",
+    "10": "LABEL_10",
+    "11": "LABEL_11",
+    "12": "LABEL_12",
+    "13": "LABEL_13",
+    "14": "LABEL_14",
+    "15": "LABEL_15",
+    "16": "LABEL_16",
+    "17": "LABEL_17",
+    "18": "LABEL_18",
+    "19": "LABEL_19",
+    "20": "LABEL_20",
+    "21": "LABEL_21",
+    "22": "LABEL_22",
+    "23": "LABEL_23",
+    "24": "LABEL_24",
+    "25": "LABEL_25",
+    "26": "LABEL_26",
+    "27": "LABEL_27",
+    "28": "LABEL_28",
+    "29": "LABEL_29",
+    "30": "LABEL_30",
+    "31": "LABEL_31",
+    "32": "LABEL_32",
+    "33": "LABEL_33",
+    "34": "LABEL_34",
+    "35": "LABEL_35",
+    "36": "LABEL_36",
+    "37": "LABEL_37",
+    "38": "LABEL_38",
+    "39": "LABEL_39",
+    "40": "LABEL_40",
+    "41": "LABEL_41",
+    "42": "LABEL_42",
+    "43": "LABEL_43",
+    "44": "LABEL_44",
+    "45": "LABEL_45",
+    "46": "LABEL_46",
+    "47": "LABEL_47",
+    "48": "LABEL_48",
+    "49": "LABEL_49",
+    "50": "LABEL_50",
+    "51": "LABEL_51",
+    "52": "LABEL_52",
+    "53": "LABEL_53",
+    "54": "LABEL_54",
+    "55": "LABEL_55",
+    "56": "LABEL_56",
+    "57": "LABEL_57",
+    "58": "LABEL_58",
+    "59": "LABEL_59",
+    "60": "LABEL_60",
+    "61": "LABEL_61",
+    "62": "LABEL_62",
+    "63": "LABEL_63",
+    "64": "LABEL_64",
+    "65": "LABEL_65",
+    "66": "LABEL_66",
+    "67": "LABEL_67",
+    "68": "LABEL_68",
+    "69": "LABEL_69",
+    "70": "LABEL_70",
+    "71": "LABEL_71",
+    "72": "LABEL_72",
+    "73": "LABEL_73",
+    "74": "LABEL_74",
+    "75": "LABEL_75",
+    "76": "LABEL_76",
+    "77": "LABEL_77",
+    "78": "LABEL_78",
+    "79": "LABEL_79",
+    "80": "LABEL_80",
+    "81": "LABEL_81",
+    "82": "LABEL_82",
+    "83": "LABEL_83",
+    "84": "LABEL_84",
+    "85": "LABEL_85",
+    "86": "LABEL_86",
+    "87": "LABEL_87",
+    "88": "LABEL_88",
+    "89": "LABEL_89",
+    "90": "LABEL_90",
+    "91": "LABEL_91",
+    "92": "LABEL_92",
+    "93": "LABEL_93",
+    "94": "LABEL_94",
+    "95": "LABEL_95",
+    "96": "LABEL_96",
+    "97": "LABEL_97",
+    "98": "LABEL_98",
+    "99": "LABEL_99",
+    "100": "LABEL_100",
+    "101": "LABEL_101",
+    "102": "LABEL_102",
+    "103": "LABEL_103",
+    "104": "LABEL_104",
+    "105": "LABEL_105",
+    "106": "LABEL_106",
+    "107": "LABEL_107",
+    "108": "LABEL_108",
+    "109": "LABEL_109",
+    "110": "LABEL_110",
+    "111": "LABEL_111",
+    "112": "LABEL_112",
+    "113": "LABEL_113",
+    "114": "LABEL_114",
+    "115": "LABEL_115",
+    "116": "LABEL_116",
+    "117": "LABEL_117",
+    "118": "LABEL_118",
+    "119": "LABEL_119",
+    "120": "LABEL_120"
+  },
+  "initializer_range": 0.02,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_10": 10,
+    "LABEL_100": 100,
+    "LABEL_101": 101,
+    "LABEL_102": 102,
+    "LABEL_103": 103,
+    "LABEL_104": 104,
+    "LABEL_105": 105,
+    "LABEL_106": 106,
+    "LABEL_107": 107,
+    "LABEL_108": 108,
+    "LABEL_109": 109,
+    "LABEL_11": 11,
+    "LABEL_110": 110,
+    "LABEL_111": 111,
+    "LABEL_112": 112,
+    "LABEL_113": 113,
+    "LABEL_114": 114,
+    "LABEL_115": 115,
+    "LABEL_116": 116,
+    "LABEL_117": 117,
+    "LABEL_118": 118,
+    "LABEL_119": 119,
+    "LABEL_12": 12,
+    "LABEL_120": 120,
+    "LABEL_13": 13,
+    "LABEL_14": 14,
+    "LABEL_15": 15,
+    "LABEL_16": 16,
+    "LABEL_17": 17,
+    "LABEL_18": 18,
+    "LABEL_19": 19,
+    "LABEL_2": 2,
+    "LABEL_20": 20,
+    "LABEL_21": 21,
+    "LABEL_22": 22,
+    "LABEL_23": 23,
+    "LABEL_24": 24,
+    "LABEL_25": 25,
+    "LABEL_26": 26,
+    "LABEL_27": 27,
+    "LABEL_28": 28,
+    "LABEL_29": 29,
+    "LABEL_3": 3,
+    "LABEL_30": 30,
+    "LABEL_31": 31,
+    "LABEL_32": 32,
+    "LABEL_33": 33,
+    "LABEL_34": 34,
+    "LABEL_35": 35,
+    "LABEL_36": 36,
+    "LABEL_37": 37,
+    "LABEL_38": 38,
+    "LABEL_39": 39,
+    "LABEL_4": 4,
+    "LABEL_40": 40,
+    "LABEL_41": 41,
+    "LABEL_42": 42,
+    "LABEL_43": 43,
+    "LABEL_44": 44,
+    "LABEL_45": 45,
+    "LABEL_46": 46,
+    "LABEL_47": 47,
+    "LABEL_48": 48,
+    "LABEL_49": 49,
+    "LABEL_5": 5,
+    "LABEL_50": 50,
+    "LABEL_51": 51,
+    "LABEL_52": 52,
+    "LABEL_53": 53,
+    "LABEL_54": 54,
+    "LABEL_55": 55,
+    "LABEL_56": 56,
+    "LABEL_57": 57,
+    "LABEL_58": 58,
+    "LABEL_59": 59,
+    "LABEL_6": 6,
+    "LABEL_60": 60,
+    "LABEL_61": 61,
+    "LABEL_62": 62,
+    "LABEL_63": 63,
+    "LABEL_64": 64,
+    "LABEL_65": 65,
+    "LABEL_66": 66,
+    "LABEL_67": 67,
+    "LABEL_68": 68,
+    "LABEL_69": 69,
+    "LABEL_7": 7,
+    "LABEL_70": 70,
+    "LABEL_71": 71,
+    "LABEL_72": 72,
+    "LABEL_73": 73,
+    "LABEL_74": 74,
+    "LABEL_75": 75,
+    "LABEL_76": 76,
+    "LABEL_77": 77,
+    "LABEL_78": 78,
+    "LABEL_79": 79,
+    "LABEL_8": 8,
+    "LABEL_80": 80,
+    "LABEL_81": 81,
+    "LABEL_82": 82,
+    "LABEL_83": 83,
+    "LABEL_84": 84,
+    "LABEL_85": 85,
+    "LABEL_86": 86,
+    "LABEL_87": 87,
+    "LABEL_88": 88,
+    "LABEL_89": 89,
+    "LABEL_9": 9,
+    "LABEL_90": 90,
+    "LABEL_91": 91,
+    "LABEL_92": 92,
+    "LABEL_93": 93,
+    "LABEL_94": 94,
+    "LABEL_95": 95,
+    "LABEL_96": 96,
+    "LABEL_97": 97,
+    "LABEL_98": 98,
+    "LABEL_99": 99
+  },
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "output_past": true,
+  "pad_token_id": 0,
+  "problem_type": "multi_label_classification",
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.50.3",
+  "vocab_size": 28996
+}

checkpoint/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4f5408495c388d56779d586a1cea49b5118730777367cc117906705f0854d4e8
+size 263510740

checkpoint/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4750d64b2f40be3f5815be468cb1e00555d1c85d6f15b2fb3ae1bdee389db690
+size 5304

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+numpy
+pandas
+torch
+transformers