Spaces:

AleksBlacky
/

Science_topic_classifier

Runtime error

App Files Files Community

AleksBlacky committed on Oct 24, 2022

Commit

8cf1f84

1 Parent(s): de73359

added second model

Browse files

Files changed (7) hide show

__pycache__/model.cpython-39.pyc +0 -0
app.py +15 -10
model.py +52 -1
models/maintopic_clf/config.json +72 -0
models/maintopic_clf/decode_dict_maintopic.pkl +0 -0
models/maintopic_clf/pytorch_model.bin +3 -0
models/scibert/{decode_dict.pkl → decode_dict_topic.pkl} +0 -0

__pycache__/model.cpython-39.pyc CHANGED Viewed

Binary files a/__pycache__/model.cpython-39.pyc and b/__pycache__/model.cpython-39.pyc differ

app.py CHANGED Viewed

@@ -1,13 +1,12 @@
 import streamlit as st
 from pandas import DataFrame
 import seaborn as sns
-from model import ArxivClassifierModel
 st.markdown("# Hello, friend!")
 st.markdown(" This magic application going to help you with understanding of science paper topic! Cool? Yeah! ")
-# st.write("Loading model")
-model = ArxivClassifierModel()
 with st.form(key="my_form"):
     st.markdown("### 🎈 Do you want a little magic?  ")
@@ -63,24 +62,24 @@ abstract = doc_abstract
 # except ValueError:
 #     st.error("Word parsing into tokens went wrong! Is input valid? If yes, pls contact author alekseystepin13@gmail.com")
-predicts = model.make_predict(title + abstract)
 st.markdown("## 🎈 Yor article probably about:  ")
 st.header("")
 df = (
-    DataFrame(predicts.items(), columns=["Topic", "Prob"])
         .sort_values(by="Prob", ascending=False)
         .reset_index(drop=True)
 )
 df.index += 1
 df2 = (
-    DataFrame(predicts.items(), columns=["Topic", "Prob"])
         .sort_values(by="Prob", ascending=False)
         .reset_index(drop=True)
 )
-# df2.index += 1
 # Add styling
 cmGreen = sns.light_palette("green", as_cmap=True)
@@ -91,6 +90,12 @@ df = df.style.background_gradient(
         "Prob",
     ],
 )
 c1, c2, c3 = st.columns([1, 3, 1])
@@ -99,10 +104,10 @@ format_dictionary = {
 }
 df = df.format(format_dictionary)
-df2 = df.format(format_dictionary)
 with c2:
     st.markdown("#### We suppose your research about:  ")
-    st.table(df)
-    st.markdown("##### More detailed, it's about topic:  ")
     st.table(df2)

 import streamlit as st
 from pandas import DataFrame
 import seaborn as sns
+from model import ArxivClassifierModel, ArxivClassifierModelsPipeline
 st.markdown("# Hello, friend!")
 st.markdown(" This magic application going to help you with understanding of science paper topic! Cool? Yeah! ")
+model = ArxivClassifierModelsPipeline()
 with st.form(key="my_form"):
     st.markdown("### 🎈 Do you want a little magic?  ")
 # except ValueError:
 #     st.error("Word parsing into tokens went wrong! Is input valid? If yes, pls contact author alekseystepin13@gmail.com")
+preds_topic, preds_maintopic = model.make_predict(title + abstract)
 st.markdown("## 🎈 Yor article probably about:  ")
 st.header("")
 df = (
+    DataFrame(preds_topic.items(), columns=["Topic", "Prob"])
         .sort_values(by="Prob", ascending=False)
         .reset_index(drop=True)
 )
 df.index += 1
 df2 = (
+    DataFrame(preds_maintopic.items(), columns=["Topic", "Prob"])
         .sort_values(by="Prob", ascending=False)
         .reset_index(drop=True)
 )
+df2.index += 1
 # Add styling
 cmGreen = sns.light_palette("green", as_cmap=True)
         "Prob",
     ],
 )
+df2 = df2.style.background_gradient(
+    cmap=cmGreen,
+    subset=[
+        "Prob",
+    ],
+)
 c1, c2, c3 = st.columns([1, 3, 1])
 }
 df = df.format(format_dictionary)
+df2 = df2.format(format_dictionary)
 with c2:
     st.markdown("#### We suppose your research about:  ")
     st.table(df2)
+    st.markdown("##### More detailed, it's about topic:  ")
+    st.table(df)

model.py CHANGED Viewed

@@ -29,4 +29,55 @@ class ArxivClassifierModel():
     @st.cache(suppress_st_warning=True)
     def __load_model(self):
         st.write("Loading big model")
-        return AutoModelForSequenceClassification.from_pretrained("models/scibert/")

     @st.cache(suppress_st_warning=True)
     def __load_model(self):
         st.write("Loading big model")
+        return AutoModelForSequenceClassification.from_pretrained("models/scibert/")
class ArxivClassifierModelsPipeline():
    """Run two sequence classifiers over the same text and decode their outputs.

    One classifier is loaded from ``models/scibert/`` (the "topic" model) and
    the other from ``models/maintopic_clf/`` (the "main topic" model). Each
    has its own tokenizer and its own pickled index -> label dictionary.
    """

    # Fallback cap on the tokenized input length, used only when a model does
    # not expose ``config.max_position_embeddings``.
    _DEFAULT_MAX_TOKENS = 512

    def __init__(self):
        """Load both models, their tokenizers, and their label dictionaries."""
        self.model_topic_clf = self.__load_topic_clf()
        self.model_maintopic_clf = self.__load_maintopic_clf()

        topic_clf_default_model = "allenai/scibert_scivocab_uncased"
        self.topic_tokenizer = AutoTokenizer.from_pretrained(topic_clf_default_model)

        maintopic_clf_default_model = "Wi/arxiv-topics-distilbert-base-cased"
        self.maintopic_tokenizer = AutoTokenizer.from_pretrained(maintopic_clf_default_model)

        # NOTE(security): pickle.load executes arbitrary code embedded in the
        # file. These are repository-local assets; never point these paths at
        # user-supplied or downloaded data.
        with open('models/scibert/decode_dict_topic.pkl', 'rb') as f:
            self.decode_dict_topic = pickle.load(f)

        with open('models/maintopic_clf/decode_dict_maintopic.pkl', 'rb') as f:
            self.decode_dict_maintopic = pickle.load(f)

    def make_predict(self, text, threshold=0.1):
        """Classify ``text`` with both models.

        Args:
            text: The text to classify (the app passes title + abstract).
            threshold: Only classes whose softmax probability is strictly
                greater than this value are returned. Defaults to 0.1.

        Returns:
            A ``(topic_probs, maintopic_probs)`` tuple of dicts mapping the
            decoded label to its probability.
        """
        topic_probs = self._classify(
            self.topic_tokenizer,
            self.model_topic_clf,
            self.decode_dict_topic,
            text,
            threshold,
        )
        maintopic_probs = self._classify(
            self.maintopic_tokenizer,
            self.model_maintopic_clf,
            self.decode_dict_maintopic,
            text,
            threshold,
        )
        return topic_probs, maintopic_probs

    @classmethod
    def _classify(cls, tokenizer, model, decode_dict, text, threshold):
        """Tokenize ``text``, run ``model``, and decode classes above ``threshold``."""
        # Imported locally so this block does not depend on a module-level
        # torch import that is outside the visible part of the file.
        import torch

        # Truncate to the model's position-embedding limit: a longer input
        # (e.g. a very long abstract) would otherwise be expected to fail in
        # the forward pass.
        max_tokens = getattr(
            getattr(model, "config", None),
            "max_position_embeddings",
            cls._DEFAULT_MAX_TOKENS,
        )
        tokens = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=max_tokens,
        )

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outs = model(tokens.input_ids)

        # Single input sequence, so take row 0 of the batch dimension.
        probs = outs["logits"].softmax(dim=-1).tolist()[0]
        return {decode_dict[i]: p for i, p in enumerate(probs) if p > threshold}

    @st.cache(suppress_st_warning=True)
    def __load_topic_clf(self):
        """Load the topic classifier from ``models/scibert/``."""
        st.write("Loading model")
        return AutoModelForSequenceClassification.from_pretrained("models/scibert/")

    @st.cache(suppress_st_warning=True)
    def __load_maintopic_clf(self):
        """Load the main-topic classifier from ``models/maintopic_clf/``."""
        st.write("Loading second model")
        return AutoModelForSequenceClassification.from_pretrained("models/maintopic_clf/")

models/maintopic_clf/config.json ADDED Viewed

	@@ -0,0 +1,72 @@

+{
+  "_name_or_path": "Wi/arxiv-topics-distilbert-base-cased",
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForSequenceClassification"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "id2label": {
+    "0": "Astrophysics",
+    "1": "Condensed Matter",
+    "2": "Computer Science",
+    "3": "Economics",
+    "4": "Electrical Engineering and Systems Science",
+    "5": "General Relativity and Quantum Cosmology",
+    "6": "High Energy Physics - Experiment",
+    "7": "High Energy Physics - Lattice",
+    "8": "High Energy Physics - Phenomenology",
+    "9": "High Energy Physics - Theory",
+    "10": "Mathematics",
+    "11": "Mathematical Physics",
+    "12": "Nonlinear Sciences",
+    "13": "Nuclear Experiment",
+    "14": "Nuclear Theory",
+    "15": "Physics",
+    "16": "Quantitative Biology",
+    "17": "Quantitative Finance",
+    "18": "Quantum Physics",
+    "19": "Statistics",
+    "20": "Other"
+  },
+  "initializer_range": 0.02,
+  "label2id": {
+    "Astrophysics": 0,
+    "Computer Science": 2,
+    "Condensed Matter": 1,
+    "Economics": 3,
+    "Electrical Engineering and Systems Science": 4,
+    "General Relativity and Quantum Cosmology": 5,
+    "High Energy Physics - Experiment": 6,
+    "High Energy Physics - Lattice": 7,
+    "High Energy Physics - Phenomenology": 8,
+    "High Energy Physics - Theory": 9,
+    "Mathematical Physics": 11,
+    "Mathematics": 10,
+    "Nonlinear Sciences": 12,
+    "Nuclear Experiment": 13,
+    "Nuclear Theory": 14,
+    "Other": 20,
+    "Physics": 15,
+    "Quantitative Biology": 16,
+    "Quantitative Finance": 17,
+    "Quantum Physics": 18,
+    "Statistics": 19
+  },
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "output_past": true,
+  "pad_token_id": 0,
+  "problem_type": "single_label_classification",
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.23.1",
+  "vocab_size": 28996
+}

models/maintopic_clf/decode_dict_maintopic.pkl ADDED Viewed

Binary file (230 Bytes). View file

models/maintopic_clf/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:af3e1c904bab3e773dfabebc016952ab4aac12dd9e30db35272eb908b461eba9
+size 263224881

models/scibert/{decode_dict.pkl → decode_dict_topic.pkl} RENAMED Viewed

File without changes