pvyas96 committed
Commit f49bf31 · verified · 1 Parent(s): 4f74997

Create app.py

Files changed (1)
  1. app.py +150 -0
app.py ADDED
@@ -0,0 +1,150 @@
import streamlit as st
import numpy as np
from scipy.sparse import csr_matrix
import joblib
from sklearn.metrics import f1_score
from sklearn.decomposition import TruncatedSVD
import tempfile

st.set_page_config(page_title="RLF Model Tester", layout="wide")
st.title("Random Label Forest (RLF) - Test Dataset Evaluation")

# ----------------------- Parsing Function ----------------------- #
def parse_rcv1_sparse_file(filename):
    """Parse a sparse multi-label file and return (X, Y) with one row per sample."""
    with open(filename, 'r') as f:
        header = f.readline()
        num_samples, num_features, num_labels = map(int, header.strip().split())

        feat_data, feat_rows, feat_cols = [], [], []
        label_data, label_rows, label_cols = [], [], []

        for row_idx, line in enumerate(f):
            line = line.strip()
            if not line:
                continue
            label_part, feature_part = line.split(' ', 1)
            labels = list(map(int, label_part.split(',')))
            features = feature_part.strip().split()

            for feat in features:
                fid, fval = feat.split(':')
                feat_rows.append(int(fid))
                feat_cols.append(row_idx)
                feat_data.append(float(fval))

            for lid in labels:
                label_rows.append(lid)
                label_cols.append(row_idx)
                label_data.append(1.0)

    # Assemble as (features x samples) / (labels x samples), then transpose.
    X = csr_matrix((feat_data, (feat_rows, feat_cols)), shape=(num_features, num_samples), dtype=np.float32)
    Y = csr_matrix((label_data, (label_rows, label_cols)), shape=(num_labels, num_samples), dtype=np.uint8)
    return X.transpose(), Y.transpose()

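# Illustrative input (an assumed example, matching the format the parser above
# expects): the header gives "<num_samples> <num_features> <num_labels>", then
# each sample line is "<comma-separated label ids> <feat_id>:<value> ...":
#
#   3 5 10
#   2,7 0:0.85 3:1.20
#   4 1:0.33 2:0.50
#   1,4,9 0:0.10 4:2.00
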
# ----------------------- Prediction Functions ----------------------- #
def predict_scores_one_tree(x, tree):
    """Route one instance through a tree's two k-means levels and score the leaf's labels."""
    kmeans1 = tree['level1_kmeans']
    level2_dict = tree['level2_kmeans']
    tree_model = tree['model_tree']

    l1_cluster = kmeans1.predict(x.reshape(1, -1))[0]

    if l1_cluster not in level2_dict:
        return {}

    kmeans2 = level2_dict[l1_cluster]
    l2_cluster = kmeans2.predict(x.reshape(1, -1))[0]

    node_key = (l1_cluster, l2_cluster)
    if node_key not in tree_model:
        return {}

    classifiers = tree_model[node_key]['classifiers']
    label_ids = tree_model[node_key]['label_ids']

    preds = {}
    for clf, lid in zip(classifiers, label_ids):
        score = clf.decision_function(x.reshape(1, -1))[0]
        preds[lid] = score
    return preds

def predict_ensemble(X_test_reduced, ensemble):
    """Average each label's scores over all trees that predicted it."""
    all_preds = []
    for i in range(X_test_reduced.shape[0]):
        instance_scores = {}
        x = X_test_reduced[i]
        for tree in ensemble:
            preds = predict_scores_one_tree(x, tree)
            for lid, score in preds.items():
                instance_scores.setdefault(lid, []).append(score)
        avg_scores = {lid: np.mean(scores) for lid, scores in instance_scores.items()}
        all_preds.append(avg_scores)
    return all_preds

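# Expected layout of one ensemble entry, implied by the lookups above (the
# value types are assumptions about how the model was trained and pickled):
#
#   tree = {
#       'level1_kmeans': <fitted KMeans>,
#       'level2_kmeans': {l1_cluster_id: <fitted KMeans>, ...},
#       'model_tree': {(l1, l2): {'classifiers': [clf, ...],
#                                 'label_ids': [label_id, ...]}, ...},
#   }
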
def get_topk(pred_scores, k):
    """Return the k highest-scoring label ids for each instance."""
    top_labels = []
    for scores in pred_scores:
        if len(scores) == 0:
            top_labels.append([])
            continue
        sorted_labels = sorted(scores.items(), key=lambda item: -item[1])
        top_labels.append([lid for lid, _ in sorted_labels[:k]])
    return top_labels

def precision_at_k(preds_topk, Y_true, k):
    """Average over instances of (true labels among the top-k predictions) / k."""
    hits = 0
    total = len(preds_topk)
    for i, pred_labels in enumerate(preds_topk):
        true_labels = set(Y_true[i].nonzero()[1])
        hits += len(set(pred_labels) & true_labels)
    return hits / (total * k)

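# Worked example: with k=3, if an instance's top-3 labels are {2, 5, 9} and
# its true labels are {5, 9, 14}, it contributes 2 hits (2/3 for that
# instance alone); the function averages hits / k over the whole test set.
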
def evaluate_rforest(ensemble, X_test, Y_test, dim=400):
    # NOTE: this fits TruncatedSVD on the test set alone; strictly, the SVD
    # fitted at training time should be reused so the reduced space matches
    # the one the trees were built in.
    svd = TruncatedSVD(n_components=dim, random_state=42)
    X_test_reduced = svd.fit_transform(X_test)

    pred_scores = predict_ensemble(X_test_reduced, ensemble)

    preds_at_1 = get_topk(pred_scores, k=1)
    preds_at_3 = get_topk(pred_scores, k=3)

    p1 = precision_at_k(preds_at_1, Y_test, k=1)
    p3 = precision_at_k(preds_at_3, Y_test, k=3)

    # Binarize the top-5 predictions to compute a macro-averaged F1 over labels.
    y_true = Y_test.toarray()
    y_pred_bin = np.zeros_like(y_true)
    for i, pred in enumerate(get_topk(pred_scores, k=5)):
        y_pred_bin[i, pred] = 1

    macro_f1 = f1_score(y_true, y_pred_bin, average='macro', zero_division=0)

    return p1, p3, macro_f1

# ----------------------- Load Pretrained Model ----------------------- #
@st.cache_resource
def load_model():
    return joblib.load("random_label_forest_model.pkl")

ensemble_model = load_model()

# ----------------------- User Interface ----------------------- #
uploaded_file = st.file_uploader("Upload Test Dataset (.txt in RCV1 format)", type=["txt"])

if uploaded_file:
    with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp:
        tmp.write(uploaded_file.getvalue())
        test_file_path = tmp.name

    if st.button("Test Model"):
        st.info("Parsing dataset and evaluating. Please wait...")
        X_test, Y_test = parse_rcv1_sparse_file(test_file_path)
        p1, p3, macro_f1 = evaluate_rforest(ensemble_model, X_test, Y_test)

        st.success("Evaluation Completed!")
        st.metric("Precision@1", f"{p1:.4f}")
        st.metric("Precision@3", f"{p3:.4f}")
        st.metric("Macro F1 Score", f"{macro_f1:.4f}")
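To try the app locally (assuming the pickled ensemble random_label_forest_model.pkl sits next to the script), run "streamlit run app.py", upload a test file in the format sketched above, and click "Test Model" to get Precision@1, Precision@3, and macro-F1.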