akage99 committed on
Commit
ae36f5b
·
verified ·
1 Parent(s): 4ae7a17

create app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -0
app.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import json
4
+ import re
5
+ import pandas as pd
6
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
7
+ from sentence_transformers import SentenceTransformer, util
8
+
9
# --- KEY CONFIGURATION ---
# Model repository (the "warehouse") where the 1.11 GB model file was stored.
ROBERTA_PATH = "akage99/roberta-corporate-backend"

# Playbook JSON file name and BGE embedding model identifier.
PLAYBOOK_PATH = "competency_keywords.json"
BGE_MODEL_NAME = "BAAI/bge-m3"
16
+
17
# --- 1. LOAD MODELS ---
# Pre-bind both names so that a failed load leaves them defined as None
# rather than undefined; otherwise the first request would die with a
# NameError far away from the real cause.
tokenizer = None
model = None

print("⏳ Sedang menghubungkan ke Gudang Model...")
try:
    # Load the tokenizer and classifier from the model repository.
    tokenizer = AutoTokenizer.from_pretrained(ROBERTA_PATH)
    model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_PATH)
    model.eval()
    print("✅ RoBERTa Berhasil Diload!")
except Exception as e:
    # Startup is deliberately best-effort: report the failure and keep going.
    # Reset both names so a half-finished load (tokenizer OK, model failed)
    # is never mistaken for a usable pair.
    tokenizer = None
    model = None
    print(f"❌ Error Load RoBERTa: {e}")
27
+
28
# Load the BGE embedding model (downloaded automatically from the internet).
print("⏳ Loading BGE...")
bge_model = SentenceTransformer(BGE_MODEL_NAME)
31
+
32
# Load Playbook
def load_playbook_rows(path):
    """Read the playbook JSON at *path* and flatten it into row dicts.

    The file is expected to map ``category -> competency -> details``, where
    ``details`` may carry a ``description`` string and a ``keywords`` list.
    Missing fields fall back to an empty string / empty list.

    Args:
        path: Location of the playbook JSON file.

    Returns:
        A list of dicts with the keys ``category``, ``competency`` and
        ``text`` (description followed by the comma-joined keywords).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        playbook_data = json.load(f)

    rows = []
    for cat, comps in playbook_data.items():
        for comp, data in comps.items():
            description = data.get('description', '')
            keywords = ', '.join(data.get('keywords', []))
            rows.append({
                "category": cat,
                "competency": comp,
                "text": f"{description} {keywords}",
            })
    return rows


print("⏳ Loading Playbook...")
# Defaults used when the playbook cannot be loaded: no embeddings and an
# empty table.
playbook_emb = None
df_playbook = pd.DataFrame()

try:
    df_playbook = pd.DataFrame(load_playbook_rows(PLAYBOOK_PATH))
    playbook_emb = bge_model.encode(df_playbook['text'].tolist(), convert_to_tensor=True)
    print("✅ Playbook Siap!")
except Exception as e:
    # Best-effort: the app still starts without a playbook; only warn.
    print(f"⚠️ Warning: {e}. Pastikan file json sudah diupload.")
52
+
53
# --- 2. PROCESSING LOGIC ---
def process_article(title, content):
    """Validate an article with three checks: regex, RoBERTa, BGE similarity.

    Args:
        title: Article title text.
        content: Article body text.

    Returns:
        A dict. Early exits carry ``"Status"`` and ``"Reason"`` (junk input,
        too-short input, or classifier not loaded). A full evaluation carries
        ``"Status"``, ``"RoBERTa Score"``, ``"BGE Score"``, ``"Category"``
        and ``"Competency"``.
    """
    full_text = f"{title}\n\n{content}"

    # A. Junk check (regex): reject text made only of digits, symbols and
    # whitespace, then reject anything shorter than 50 characters.
    if re.match(r'^[\d\W\s]+$', full_text):
        return {"Status": "REJECTED", "Reason": "Isi cuma angka/simbol"}
    if len(full_text) < 50:
        return {"Status": "REJECTED", "Reason": "Terlalu pendek (<50 huruf)"}

    # B. Writing-style check (RoBERTa).
    # The startup loader is best-effort, so the classifier names may be
    # undefined or None here; answer with an explicit error instead of
    # crashing the request.
    try:
        active_tokenizer, active_model = tokenizer, model
    except NameError:
        active_tokenizer = active_model = None
    if active_tokenizer is None or active_model is None:
        return {"Status": "ERROR", "Reason": "Model RoBERTa gagal dimuat"}

    inputs = active_tokenizer(
        full_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    with torch.no_grad():
        outputs = active_model(**inputs)
    probs = torch.softmax(outputs.logits, dim=-1)[0]
    rob_score = float(probs[1])  # class index 1 = "Align" per the original note

    # C. Topic check (BGE): best cosine match against the playbook entries.
    bge_score = 0.0
    pred_cat, pred_comp = "-", "-"

    if playbook_emb is not None:
        art_vec = bge_model.encode(full_text, convert_to_tensor=True)
        cos_sim = util.cos_sim(art_vec, playbook_emb)
        top_val, top_idx = torch.max(cos_sim, dim=1)
        bge_score = float(top_val)
        idx = int(top_idx)
        pred_cat = df_playbook.iloc[idx]['category']
        pred_comp = df_playbook.iloc[idx]['competency']

    # D. Final decision: both scores must clear their thresholds.
    status = "✅ VALID ALIGN" if (rob_score >= 0.5 and bge_score >= 0.75) else "❌ REJECTED"

    return {
        "Status": status,
        "RoBERTa Score": f"{rob_score:.4f}",
        "BGE Score": f"{bge_score:.4f}",
        "Category": pred_cat,
        "Competency": pred_comp,
    }
93
+
94
# --- 3. WEB UI ---
# Build the interface first and launch it afterwards, rather than calling
# launch() from inside a `with` context on the interface object.
demo = gr.Interface(
    fn=process_article,
    inputs=[gr.Textbox(label="Judul"), gr.Textbox(label="Isi Artikel", lines=6)],
    outputs=gr.JSON(label="Hasil Analisis"),
    title="Corporate Article Validator",
    description="Validasi Artikel: Regex -> RoBERTa -> BGE Similarity",
    allow_flagging="never",
)
demo.launch()