akage99 committed on
Commit
ffe5715
·
verified ·
1 Parent(s): 9f906cd

Update final code after corruption checking

Browse files
Files changed (1) hide show
  1. app.py +289 -157
app.py CHANGED
@@ -1,177 +1,309 @@
1
  import gradio as gr
2
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
3
  import torch
 
 
 
 
4
 
5
- # Cuma Load RoBERTa (Tanpa BGE, Tanpa JSON Playbook)
6
- ROBERTA_PATH = "akage99/roberta-corporate-backend"
 
 
 
 
 
7
 
8
- print("⏳ Cek Kesehatan Tokenizer & Model...")
 
 
 
 
9
  try:
10
  tokenizer = AutoTokenizer.from_pretrained(ROBERTA_PATH)
11
  model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_PATH)
12
- print("✅ ALHAMDULILLAH! Tokenizer & Model SEHAT (Tidak Corrupt).")
13
- status_model = "✅ Model & Tokenizer Aman!"
14
  except Exception as e:
15
- print(f"❌ WADUH! File Model Rusak: {e}")
16
- status_model = f" Error: {e}"
17
-
18
- def cek_status(text):
19
- return {
20
- "status_file_model": status_model,
21
- "input_kamu": text
22
- }
23
-
24
- with gr.Interface(fn=cek_status, inputs="text", outputs="json") as demo:
25
- demo.launch()
26
-
27
- # import gradio as gr
28
- # import torch
29
- # import json
30
- # import pandas as pd
31
- # from transformers import AutoTokenizer, AutoModelForSequenceClassification
32
- # from sentence_transformers import SentenceTransformer, util
33
-
34
- # # --- KONFIGURASI ---
35
- # ROBERTA_PATH = "akage99/roberta-corporate-backend"
36
- # PLAYBOOK_PATH = "competency_keywords.json" # <--- PASTIKAN NAMA FILE DI TAB 'FILES' SAMA PERSIS INI
37
- # BGE_MODEL_NAME = "BAAI/bge-m3"
38
- # ALIGNMENT_THRESHOLD = 0.68
39
- # MIN_WORD_COUNT = 500
40
-
41
- # # Variabel Global untuk menampung status Error
42
- # LOADING_ERROR = None
43
-
44
- # # --- LOAD MODEL ---
45
- # print("⏳ Loading Models...")
46
- # try:
47
- # tokenizer = AutoTokenizer.from_pretrained(ROBERTA_PATH)
48
- # model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_PATH)
49
- # model.eval()
50
- # print("✅ RoBERTa Loaded!")
51
- # except Exception as e:
52
- # print(f"❌ Error RoBERTa: {e}")
53
-
54
- # # Load BGE & Playbook
55
- # playbook_emb = None
56
- # df_playbook = pd.DataFrame()
57
-
58
- # try:
59
- # bge_model = SentenceTransformer(BGE_MODEL_NAME)
60
 
61
- # # Coba buka file JSON
62
- # with open(PLAYBOOK_PATH, "r") as f:
63
- # playbook_data = json.load(f)
64
 
65
- # playbook_rows = []
66
- # for cat, comps in playbook_data.items():
67
- # for comp, data in comps.items():
68
- # comp_type = data.get('type', '-')
69
- # text = f"{data.get('description','')} {', '.join(data.get('keywords',[]))}"
70
- # playbook_rows.append({
71
- # "category": cat,
72
- # "competency": comp,
73
- # "type": comp_type,
74
- # "text": text
75
- # })
76
 
77
- # df_playbook = pd.DataFrame(playbook_rows)
78
- # playbook_emb = bge_model.encode(df_playbook['text'].tolist(), convert_to_tensor=True)
79
- # print("✅ System Ready!")
80
-
81
- # except Exception as e:
82
- # # TANGKAP ERRORNYA DISINI
83
- # LOADING_ERROR = str(e) # Simpan pesan error ke variabel
84
- # print(f"❌ Error BGE/Playbook: {e}")
85
-
86
- # # --- LOGIC UTAMA ---
87
- # def process_article(title, content):
88
- # # 0. CEK STATUS LOADING DULU
89
- # # Kalau loading gagal, langsung lapor ke user di JSON Output
90
- # if LOADING_ERROR is not None:
91
- # return {
92
- # "is_content": False,
93
- # "error": "SYSTEM ERROR: Gagal memuat database Playbook.",
94
- # "detail_error": LOADING_ERROR, # <--- INI AKAN MUNCUL DI LAYAR
95
- # "tips": f"Cek apakah file '{PLAYBOOK_PATH}' sudah ada di tab Files?"
96
- # }
97
 
98
- # # Kalau Playbook None tapi gak ada error (aneh), lapor juga
99
- # if playbook_emb is None:
100
- # return {
101
- # "is_content": False,
102
- # "error": "SYSTEM ERROR: Playbook Embedding kosong (None).",
103
- # "detail_error": "Unknown Error during initialization."
104
- # }
105
-
106
- # full_text = f"{title}\n\n{content}"
 
 
 
 
 
 
 
 
 
 
107
 
108
- # # 1. CEK JUMLAH KATA
109
- # word_count = len(full_text.split())
110
- # if word_count < MIN_WORD_COUNT:
111
- # return {
112
- # "is_content": False,
113
- # "message": f"REJECTED: Konten terlalu pendek ({word_count} kata). Minimal {MIN_WORD_COUNT} kata.",
114
- # "scores": {"roberta": "0.0000", "bge": "0.0000"}
115
- # }
116
-
117
- # # 2. RoBERTa Classification
118
- # try:
119
- # inputs = tokenizer(full_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
120
- # with torch.no_grad():
121
- # outputs = model(**inputs)
122
- # probs = torch.softmax(outputs.logits, dim=-1)[0]
123
 
124
- # rob_score = float(probs[1])
125
- # is_content = rob_score >= 0.5
126
 
127
- # response = {
128
- # "is_content": is_content,
129
- # "scores": {
130
- # "roberta": f"{rob_score:.4f}",
131
- # "bge": "0.0000"
132
- # }
133
- # }
134
-
135
- # if not is_content:
136
- # response["message"] = "REJECTED: Bukan konten artikel."
137
- # return response
138
-
139
- # # 3. Hitung BGE
140
- # art_vec = bge_model.encode(full_text, convert_to_tensor=True)
141
- # cos_sim = util.cos_sim(art_vec, playbook_emb)
142
- # top_val, top_idx = torch.max(cos_sim, dim=1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
- # bge_score = float(top_val)
145
- # idx = int(top_idx)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
- # best_cat = df_playbook.iloc[idx]['category']
148
- # best_comp = df_playbook.iloc[idx]['competency']
149
- # best_type = df_playbook.iloc[idx]['type']
150
 
151
- # response["scores"]["bge"] = f"{bge_score:.4f}"
152
-
153
- # comp_data = {
154
- # "category": best_cat,
155
- # "competency": best_comp,
156
- # "type": best_type,
157
- # "prediction_status": "AI Prediction" if bge_score >= ALIGNMENT_THRESHOLD else "AI Recommendation"
158
- # }
159
-
160
- # if bge_score >= ALIGNMENT_THRESHOLD:
161
- # response["predict_competencies"] = comp_data
162
- # else:
163
- # response["recommendation_competencies"] = comp_data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
- # except Exception as e:
166
- # return {"is_content": False, "error": str(e)}
167
 
168
- # return response
169
-
170
- # # --- GRADIO ---
171
- # with gr.Interface(
172
- # fn=process_article,
173
- # inputs=[gr.Textbox(label="Title"), gr.Textbox(label="Content")],
174
- # outputs=gr.JSON(label="JSON Output"),
175
- # title="Article Classifier API",
176
- # ) as demo:
177
- # demo.launch()
 
1
  import gradio as gr
 
2
  import torch
3
+ import json
4
+ import pandas as pd
5
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
6
+ from sentence_transformers import SentenceTransformer, util
7
 
8
# --- CONFIGURATION ---
ROBERTA_PATH = "akage99/roberta-corporate-backend"
# This name must match the playbook file uploaded next to app.py exactly.
PLAYBOOK_PATH = "competency_keywords.json"
BGE_MODEL_NAME = "BAAI/bge-m3"
ALIGNMENT_THRESHOLD = 0.68
MIN_WORD_COUNT = 500

# Module-level record of start-up failures. "error" stays None while
# everything loaded; process_article() reads it to report problems to the
# caller instead of crashing on a missing model.
LOADING_STATUS = {"error": None, "message": "System Normal"}


def _record_loading_error(label, detail):
    """Store a start-up failure without discarding one recorded earlier.

    Args:
        label: Short name of the failing component.
        detail: Human-readable description of the failure.
    """
    if LOADING_STATUS["error"]:
        # A previous loader already failed: keep both so neither is hidden.
        LOADING_STATUS["error"] = f"{LOADING_STATUS['error']}; {label}"
        LOADING_STATUS["message"] = f"{LOADING_STATUS['message']} | {detail}"
    else:
        LOADING_STATUS["error"] = label
        LOADING_STATUS["message"] = detail


# --- 1. LOAD ROBERTA ---
print("⏳ Loading RoBERTa...")
try:
    tokenizer = AutoTokenizer.from_pretrained(ROBERTA_PATH)
    model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_PATH)
    model.eval()  # inference only
    print("✅ RoBERTa Loaded!")
except Exception as e:
    # Broad catch is deliberate: any load failure must be surfaced through
    # LOADING_STATUS rather than killing the app at import time.
    print(f"❌ Error RoBERTa: {e}")
    _record_loading_error("RoBERTa Error", str(e))

# --- 2. LOAD BGE & PLAYBOOK ---
# Defaults used when loading fails; process_article() checks playbook_emb.
playbook_emb = None
df_playbook = pd.DataFrame()

print("⏳ Loading BGE & Playbook...")
try:
    # Load the BGE embedding model.
    bge_model = SentenceTransformer(BGE_MODEL_NAME)

    # Load the playbook JSON; explicit encoding avoids depending on the
    # platform's default locale.
    with open(PLAYBOOK_PATH, "r", encoding="utf-8") as f:
        playbook_data = json.load(f)

    # Flatten {category: {competency: {...}}} into one row per competency.
    playbook_rows = []
    for cat, comps in playbook_data.items():
        for comp, data in comps.items():
            comp_type = data.get('type', '-')
            text = f"{data.get('description','')} {', '.join(data.get('keywords',[]))}"
            playbook_rows.append({
                "category": cat,
                "competency": comp,
                "type": comp_type,
                "text": text
            })

    if not playbook_rows:
        # Without this, an empty playbook fails later with an opaque
        # KeyError on the missing 'text' column.
        raise ValueError("playbook contains no competencies")

    df_playbook = pd.DataFrame(playbook_rows)
    # Encoding the whole playbook is usually the heaviest start-up step.
    playbook_emb = bge_model.encode(df_playbook['text'].tolist(), convert_to_tensor=True)
    print("✅ System Ready & Playbook Loaded!")

except Exception as e:
    # Capture the error so it is shown to the user, not only in the logs.
    print(f"❌ Error BGE/Playbook: {e}")
    _record_loading_error(
        "Playbook/BGE Error",
        f"Gagal memuat {PLAYBOOK_PATH}. Detail: {str(e)}",
    )
66
+
67
+ # --- LOGIC UTAMA ---
68
def process_article(title, content):
    """Classify an article and match it to the closest playbook competency.

    Steps, in order:
      1. Report any start-up failure recorded in ``LOADING_STATUS``.
      2. Reject text shorter than ``MIN_WORD_COUNT`` whitespace-separated words.
      3. Score the text with the RoBERTa classifier; reject when the score
         is below 0.5.
      4. Compare the text embedding with the playbook embeddings and attach
         the best match as a prediction or a recommendation, depending on
         ``ALIGNMENT_THRESHOLD``.

    Args:
        title: Article title; ``None`` is treated as an empty string.
        content: Article body; ``None`` is treated as an empty string.

    Returns:
        A JSON-serialisable dict. It always contains ``is_content``; the
        other keys depend on which step produced the result.
    """
    # Check start-up status first: if loading failed, tell the caller now.
    if LOADING_STATUS["error"]:
        return {
            "is_content": False,
            "SYSTEM_ERROR": LOADING_STATUS["error"],
            "DETAIL": LOADING_STATUS["message"],
            "TIPS": "Cek nama file JSON di tab Files atau cek Logs untuk detail."
        }

    # A missing field must not be formatted as the literal word "None",
    # which would count toward the length check and reach the classifier.
    title = "" if title is None else str(title)
    content = "" if content is None else str(content)
    full_text = f"{title}\n\n{content}"

    # 1. Word-count gate.
    word_count = len(full_text.split())
    if word_count < MIN_WORD_COUNT:
        return {
            "is_content": False,
            "message": f"REJECTED: Konten terlalu pendek ({word_count} kata). Minimal {MIN_WORD_COUNT} kata.",
            "scores": {"roberta": "0.0000", "bge": "0.0000"}
        }

    # 2. RoBERTa classification.
    try:
        inputs = tokenizer(full_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.softmax(outputs.logits, dim=-1)[0]

        # NOTE(review): index 1 is used as the "is article content" class;
        # this assumes the classifier's label order - confirm against the model.
        rob_score = float(probs[1])
        is_content = rob_score >= 0.5

        response = {
            "is_content": is_content,
            "scores": {
                "roberta": f"{rob_score:.4f}",
                "bge": "0.0000"
            }
        }

        if not is_content:
            response["message"] = "REJECTED: Bukan konten artikel."
            return response

        # 3. BGE similarity against the playbook (only if it was loaded).
        if playbook_emb is not None:
            art_vec = bge_model.encode(full_text, convert_to_tensor=True)
            cos_sim = util.cos_sim(art_vec, playbook_emb)
            top_val, top_idx = torch.max(cos_sim, dim=1)

            bge_score = float(top_val)
            best_row = df_playbook.iloc[int(top_idx)]

            response["scores"]["bge"] = f"{bge_score:.4f}"

            # Evaluate the threshold once so the status label and the
            # response key can never disagree.
            is_prediction = bge_score >= ALIGNMENT_THRESHOLD
            comp_data = {
                "category": best_row['category'],
                "competency": best_row['competency'],
                "type": best_row['type'],
                "prediction_status": "AI Prediction" if is_prediction else "AI Recommendation"
            }

            if is_prediction:
                response["predict_competencies"] = comp_data
            else:
                response["recommendation_competencies"] = comp_data
        else:
            # Playbook embeddings missing although no load error was recorded.
            response["SYSTEM_WARNING"] = "Playbook Embedding Kosong. Cek Log."

    except Exception as e:
        # API boundary: return the failure as data instead of raising.
        return {"is_content": False, "error": str(e)}

    return response
147
+
148
+ # --- GRADIO ---
149
# Build the interface first and launch it afterwards, instead of calling
# launch() from inside the interface's own `with` context.
# The file previously continued here with a second, verbatim copy of the whole
# script fused onto the launch line (`demo.launch()import gradio as gr`), which
# is a SyntaxError; that duplicate has been removed.
demo = gr.Interface(
    fn=process_article,
    inputs=[gr.Textbox(label="Title"), gr.Textbox(label="Content")],
    outputs=gr.JSON(label="JSON Output"),
    title="Article Classifier API",
)

if __name__ == "__main__":
    # Start the server only when this file is run as the entry script.
    demo.launch()