NhatHuy1110 committed on
Commit
a1bbbd5
·
verified ·
1 Parent(s): d745845

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -0
app.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import streamlit as st
3
+ import joblib, json, numpy as np
4
+ from pathlib import Path
5
+ from sentence_transformers import SentenceTransformer
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
+
8
# Directory containing the artifacts produced by the training pipeline.
ART = Path("artifacts")

# Human-readable class names, index-aligned with the classifier's output columns.
# Use a context manager so the file handle is closed deterministically
# (the original json.load(open(...)) leaked it).
with open(ART / "label_names.json", encoding="utf-8") as _f:
    LABELS = json.load(_f)

# Name of the sentence-transformers encoder that was used at training time.
EMB_MODEL_NAME = (ART / "emb_model_name.txt").read_text().strip()
11
+
12
@st.cache_resource(show_spinner=False)
def load_models():
    """Load and cache every inference artifact for the app session.

    Returns:
        tuple: ``(emb, clf, nn, tfidf, train_meta, class_keywords)`` where
            emb            -- SentenceTransformer encoder named by EMB_MODEL_NAME,
            clf            -- classifier loaded from ``lgbm_model.pkl``,
            nn             -- nearest-neighbour index loaded from ``nn_index.pkl``,
            tfidf          -- explainer loaded from ``tfidf_explainer.pkl``,
            train_meta     -- dict loaded from ``train_meta.json``,
            class_keywords -- dict loaded from ``class_keywords.json``.
    """
    emb = SentenceTransformer(EMB_MODEL_NAME)
    clf = joblib.load(ART / "lgbm_model.pkl")
    nn = joblib.load(ART / "nn_index.pkl")
    tfidf = joblib.load(ART / "tfidf_explainer.pkl")
    # Context managers close the JSON file handles deterministically;
    # the original json.load(open(...)) calls leaked them.
    with open(ART / "train_meta.json", encoding="utf-8") as f:
        train_meta = json.load(f)
    with open(ART / "class_keywords.json", encoding="utf-8") as f:
        class_keywords = json.load(f)
    return emb, clf, nn, tfidf, train_meta, class_keywords
21
+
22
def encode_one(emb_model, text: str) -> np.ndarray:
    """Embed a single abstract with the E5-style ``passage:`` prefix.

    Args:
        emb_model: encoder exposing ``encode(texts, show_progress_bar=...,
            normalize_embeddings=...)``.
        text: raw abstract text; surrounding whitespace is stripped.

    Returns:
        np.ndarray: float32 array of the (normalised) embedding, shape (1, dim).
    """
    cleaned = text.strip()
    vectors = emb_model.encode(
        [f"passage: {cleaned}"],
        show_progress_bar=False,
        normalize_embeddings=True,
    )
    return np.asarray(vectors, dtype=np.float32)
27
+
28
# --- Page chrome -----------------------------------------------------------
st.set_page_config(page_title="ArXiv Abstract Classifier", page_icon="🧠", layout="wide")
st.title("🧠 ArXiv Abstract Classifier")
st.caption("Embeddings (E5) + LightGBM • Probabilities • Similar Papers • Class Keywords")

# --- Sidebar controls ------------------------------------------------------
with st.sidebar:
    st.markdown("### Settings")
    # Number of nearest training papers to display (1..10, default 3);
    # read by the classification section further down the script.
    topk = st.slider("Top similar papers", 1, 10, 3)
    # Toggles the per-class TF-IDF keyword panel shown after classification.
    show_keywords = st.checkbox("Show class keywords", value=True)
    st.divider()
    st.markdown("Model")
    st.code(f"Encoder: {EMB_MODEL_NAME}\nClassifier: LightGBM", language="yaml")
39
+
40
# Load (cached) inference artifacts once per session.
emb_model, clf, nn, tfidf, train_meta, class_keywords = load_models()

# Pre-filled example abstract so the demo works out of the box.
default_text = """We propose a novel neural architecture for efficient transformer inference,
reducing memory footprint while maintaining accuracy on common NLP tasks.
Experiments on translation and summarization demonstrate competitive results."""
text = st.text_area("Paste paper abstract here:", default_text, height=220)

col1, col2 = st.columns([1, 1])
with col1:
    run = st.button("🔍 Classify")
with col2:
    clear = st.button("🧹 Clear")

if clear:
    # Fix: st.experimental_rerun() is deprecated and removed in recent
    # Streamlit releases. st.rerun() (available since 1.27) is the supported
    # replacement, and this file already requires a newer Streamlit anyway
    # via st.container(border=True) below.
    st.rerun()
54
+
55
if run:
    # Guard: require non-empty input before doing any model work.
    if not text.strip():
        st.warning("Please enter an abstract.")
        st.stop()

    # Embed the abstract and classify it; probs is one row of class scores.
    v = encode_one(emb_model, text)
    probs = clf.predict_proba(v)[0]
    pred_idx = int(np.argmax(probs))
    pred_label = LABELS[pred_idx]

    st.success(f"**Predicted field:** `{pred_label}`")
    st.write("### Class probabilities")
    # Map label name -> probability for the chart.
    prob_dict = {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
    st.bar_chart(prob_dict)

    st.write("### 🔗 Most similar training papers")
    # Query for at least 3 neighbours regardless of the slider value;
    # only the first `topk` results are rendered below.
    dists, idxs = nn.kneighbors(v, n_neighbors=max(topk, 3), return_distance=True)
    idxs = idxs[0].tolist()
    dists = dists[0].tolist()

    # Parallel lists describing the training corpus, index-aligned with
    # the neighbour indices returned by the NN index.
    titles = train_meta["train_titles"]
    abstracts = train_meta["train_abstracts"]
    labels = train_meta["train_labels"]

    for rank, (i, d) in enumerate(zip(idxs[:topk], dists[:topk]), start=1):
        # NOTE(review): assumes the NN index was fit with cosine distance so
        # that similarity = 1 - distance — confirm against the training code.
        cos = 1 - d
        with st.container(border=True):
            st.markdown(f"**#{rank}. {titles[i]}**")
            st.caption(f"_Label:_ `{labels[i]}` • _Cosine similarity:_ **{cos:.3f}**")
            # Truncate long abstracts to keep each result card compact.
            st.write(abstracts[i][:600] + ("..." if len(abstracts[i]) > 600 else ""))

    if show_keywords:
        st.write("### 🏷️ Class keywords (TF-IDF centroids)")
        # One column per class, each listing up to 15 keywords for its label.
        cols = st.columns(len(LABELS))
        for j, lb in enumerate(LABELS):
            with cols[j]:
                st.markdown(f"**{lb}**")
                st.write(", ".join(class_keywords.get(lb, [])[:15]))