Spaces:
Running
Running
update menggunakan file pkl
Browse files
app.py
CHANGED
|
@@ -15,16 +15,14 @@ from huggingface_hub import login, InferenceClient
|
|
| 15 |
hf_token = os.getenv("HF_TOKEN")
|
| 16 |
login(token=hf_token)
|
| 17 |
|
| 18 |
-
# PATH FILE
|
| 19 |
ROBERTA_PATH = "akage99/roberta-corporate-backend"
|
| 20 |
BGE_MODEL_NAME = "BAAI/bge-m3"
|
| 21 |
-
COMPETENCY_PATH = "competency_keywords.json"
|
| 22 |
-
FORM_PATH = "content_forms.json"
|
| 23 |
|
| 24 |
-
# --- A. GATEKEEPER RULES (
|
| 25 |
GATE_RULES = {
|
| 26 |
-
"min_words": 500, #
|
| 27 |
-
"max_digit_ratio": 0.3, # Maksimal 30% angka
|
| 28 |
"math_latex_triggers": [
|
| 29 |
r"\\documentclass", r"\\begin\{", r"\\frac", r"\\sum", r"\\int",
|
| 30 |
r"\$\$.*?\$\$", r"x\^2", r"a\^2", r"b\^2",
|
|
@@ -33,25 +31,23 @@ GATE_RULES = {
|
|
| 33 |
]
|
| 34 |
}
|
| 35 |
|
| 36 |
-
# --- B. QUALITY AUDIT RULES (
|
| 37 |
-
# Mengacu pada file: parameter_content.xlsx
|
| 38 |
QUALITY_RULES = {
|
| 39 |
"standard": {
|
| 40 |
"min_paragraphs": 3,
|
| 41 |
"max_sentence_length": 60
|
| 42 |
},
|
| 43 |
"penalties": {
|
| 44 |
-
"bad_structure": 30,
|
| 45 |
-
"risk_word": 50,
|
| 46 |
-
"bad_tone": 20,
|
| 47 |
"short_content": 20
|
| 48 |
},
|
| 49 |
-
# Keywords bahaya dari Excel (Data Sensitif, Etika, Hoax)
|
| 50 |
"risk_keywords": [
|
| 51 |
-
"confidential", "rahasia", "internal use only", "top secret",
|
| 52 |
-
"bodoh", "goblok", "brengsek", "tolol", "idiot",
|
| 53 |
-
"password:", "api_key", "access token",
|
| 54 |
-
"suap", "gratifikasi", "korupsi"
|
| 55 |
]
|
| 56 |
}
|
| 57 |
|
|
@@ -62,90 +58,46 @@ LLM_MODELS = [
|
|
| 62 |
"google/gemma-1.1-7b-it"
|
| 63 |
]
|
| 64 |
|
| 65 |
-
# ================= 2. SETUP ENGINE (
|
| 66 |
-
print("⏳
|
| 67 |
|
| 68 |
-
# 1. Load Tokenizer & RoBERTa
|
|
|
|
| 69 |
tokenizer = AutoTokenizer.from_pretrained(ROBERTA_PATH)
|
| 70 |
roberta_model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_PATH)
|
| 71 |
|
| 72 |
-
# 2. Load Embedding Model
|
|
|
|
| 73 |
embeddings = HuggingFaceEmbeddings(model_name=BGE_MODEL_NAME)
|
| 74 |
|
| 75 |
-
# 3. Setup Chunking
|
| 76 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
| 77 |
|
| 78 |
-
# 4.
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
#
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
text_content = f"{comp_name}. {desc} Keywords: {keywords_str}"
|
| 94 |
-
|
| 95 |
-
# Metadata (Disimpan untuk ditampilkan nanti)
|
| 96 |
-
meta = {
|
| 97 |
-
"source": "competency",
|
| 98 |
-
"category": category,
|
| 99 |
-
"competency": comp_name,
|
| 100 |
-
"type": details.get('type', '-'),
|
| 101 |
-
"keywords": keywords_str # Simpan buat display kalau perlu
|
| 102 |
-
}
|
| 103 |
-
docs.append(Document(page_content=text_content, metadata=meta))
|
| 104 |
-
except FileNotFoundError:
|
| 105 |
-
print(f"⚠️ Error: {COMPETENCY_PATH} tidak ditemukan.")
|
| 106 |
-
|
| 107 |
-
# B. Load Content Forms (content_forms.json)
|
| 108 |
-
try:
|
| 109 |
-
with open(FORM_PATH, 'r') as f:
|
| 110 |
-
data = json.load(f)
|
| 111 |
-
# Struktur JSON: Group -> Type Name -> {description, examples}
|
| 112 |
-
for group, types in data.items():
|
| 113 |
-
for type_name, details in types.items():
|
| 114 |
-
desc = details.get('description', '')
|
| 115 |
-
examples = details.get('examples', '')
|
| 116 |
-
# Text = "Tipe + Deskripsi + Contoh"
|
| 117 |
-
text_content = f"{type_name}. {desc} Contoh: {examples}"
|
| 118 |
-
|
| 119 |
-
meta = {
|
| 120 |
-
"source": "form",
|
| 121 |
-
"group": group,
|
| 122 |
-
"content_type": type_name
|
| 123 |
-
}
|
| 124 |
-
docs.append(Document(page_content=text_content, metadata=meta))
|
| 125 |
-
except FileNotFoundError:
|
| 126 |
-
print(f"⚠️ Error: {FORM_PATH} tidak ditemukan.")
|
| 127 |
-
|
| 128 |
-
# C. Build FAISS Index (Simpan di RAM)
|
| 129 |
-
if docs:
|
| 130 |
-
print(f"✅ Embedding {len(docs)} documents to RAM...")
|
| 131 |
-
return FAISS.from_documents(docs, embeddings)
|
| 132 |
-
else:
|
| 133 |
-
return None
|
| 134 |
-
|
| 135 |
-
# INISIALISASI DATABASE (Jalan 1x saat start)
|
| 136 |
-
vectorstore = build_vector_store()
|
| 137 |
print("✅ System Ready!")
|
| 138 |
|
| 139 |
-
# ================= 3. LOGIC MODULES =================
|
| 140 |
|
| 141 |
-
# --- MODUL 1: GATEKEEPER
|
| 142 |
def run_gatekeeper(text):
|
| 143 |
-
# A. Cek Word Count
|
| 144 |
words = text.strip().split()
|
| 145 |
if len(words) < GATE_RULES['min_words']:
|
| 146 |
return False, f"REJECTED: Terlalu pendek ({len(words)} kata). Minimal {GATE_RULES['min_words']} kata."
|
| 147 |
|
| 148 |
-
# B. Cek Angka (Numeric Spam)
|
| 149 |
clean_text = re.sub(r'[\s.,\-\+]', '', text)
|
| 150 |
if clean_text.isdigit():
|
| 151 |
return False, "REJECTED: Input hanya berisi angka (Spam Data)."
|
|
@@ -154,31 +106,27 @@ def run_gatekeeper(text):
|
|
| 154 |
if digit_ratio > GATE_RULES['max_digit_ratio']:
|
| 155 |
return False, "REJECTED: Terlalu banyak angka. Terdeteksi sebagai Laporan Keuangan/Data Mentah."
|
| 156 |
|
| 157 |
-
# C. Cek LaTeX / Matematika
|
| 158 |
for pattern in GATE_RULES['math_latex_triggers']:
|
| 159 |
if re.search(pattern, text, re.IGNORECASE):
|
| 160 |
return False, f"REJECTED: Terdeteksi format Dokumen Teknis/Matematika/LaTeX ('{pattern}')."
|
| 161 |
|
| 162 |
return True, "PASS"
|
| 163 |
|
| 164 |
-
# --- MODUL 2: SYSTEM A (MANUAL AUDIT
|
| 165 |
def run_manual_audit(text, rules):
|
| 166 |
base_score = 100
|
| 167 |
flags = []
|
| 168 |
|
| 169 |
-
# 1. Cek Struktur (Excel: "Struktur sangat buruk")
|
| 170 |
if text.count("\n\n") + 1 < rules['standard']['min_paragraphs']:
|
| 171 |
base_score -= rules['penalties']['bad_structure']
|
| 172 |
flags.append("⚠️ Struktur buruk (Kurang paragraf/Wall of text)")
|
| 173 |
|
| 174 |
-
# 2. Cek Risiko & Etika (Excel: "Masalah etika / Risiko tinggi")
|
| 175 |
text_lower = text.lower()
|
| 176 |
found_risks = [w for w in rules['risk_keywords'] if w in text_lower]
|
| 177 |
if found_risks:
|
| 178 |
base_score -= rules['penalties']['risk_word']
|
| 179 |
flags.append(f"🚨 Terdeteksi kata berisiko/sensitif: {', '.join(found_risks)}")
|
| 180 |
|
| 181 |
-
# 3. Cek Tone/Bahasa (Excel: "Bahasa perlu perbaikan")
|
| 182 |
try:
|
| 183 |
inputs = tokenizer(text[:512], return_tensors="pt", truncation=True, padding=True, max_length=512)
|
| 184 |
with torch.no_grad():
|
|
@@ -188,11 +136,9 @@ def run_manual_audit(text, rules):
|
|
| 188 |
except:
|
| 189 |
rob_score = 0.5
|
| 190 |
|
| 191 |
-
# 4. Kalkulasi Final Score (0.0000 - 1.0000)
|
| 192 |
rule_decimal = max(0, base_score) / 100.0
|
| 193 |
final_score = (rule_decimal * 0.7) + (rob_score * 0.3)
|
| 194 |
|
| 195 |
-
# Mapping ke Kategori Excel
|
| 196 |
if final_score >= 0.85: verdict = "AI-Curated (Good Quality)"
|
| 197 |
elif final_score >= 0.60: verdict = "Needs Revision (Minor Issue)"
|
| 198 |
else: verdict = "Needs Attention (Low Clarity/Risk)"
|
|
@@ -230,7 +176,7 @@ def run_llm_judge(text, content_type):
|
|
| 230 |
except: continue
|
| 231 |
return None
|
| 232 |
|
| 233 |
-
# ================= 4. MAIN PROCESSOR =================
|
| 234 |
def process_article(title, content):
|
| 235 |
full_text = f"{title}\n\n{content}"
|
| 236 |
|
|
@@ -239,7 +185,7 @@ def process_article(title, content):
|
|
| 239 |
if not is_valid:
|
| 240 |
return {"is_content": False, "rejection_reason": msg}
|
| 241 |
|
| 242 |
-
# 2. CHUNKING & VECTOR SEARCH
|
| 243 |
chunks = text_splitter.split_text(full_text)
|
| 244 |
|
| 245 |
competency_candidates = []
|
|
@@ -247,46 +193,49 @@ def process_article(title, content):
|
|
| 247 |
|
| 248 |
if vectorstore:
|
| 249 |
for chunk in chunks:
|
| 250 |
-
# Cari di FAISS (
|
|
|
|
| 251 |
res_comp = vectorstore.similarity_search_with_score(chunk, k=3, filter={'source': 'competency'})
|
| 252 |
-
res_form = vectorstore.similarity_search_with_score(chunk, k=1, filter={'source': 'form'})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
|
| 254 |
for doc, score in res_comp:
|
| 255 |
competency_candidates.append({"meta": doc.metadata, "score": score})
|
| 256 |
-
for doc, score in
|
| 257 |
form_candidates.append({"meta": doc.metadata, "score": score})
|
| 258 |
|
| 259 |
-
# 3. AGGREGATION
|
| 260 |
unique_comp = {}
|
| 261 |
for item in competency_candidates:
|
| 262 |
-
name = item['meta']
|
| 263 |
if name not in unique_comp:
|
| 264 |
unique_comp[name] = item['meta']
|
| 265 |
unique_comp[name]['best_score'] = item['score']
|
| 266 |
else:
|
| 267 |
-
# Ambil score terendah (L2 distance terbaik)
|
| 268 |
if item['score'] < unique_comp[name]['best_score']:
|
| 269 |
unique_comp[name]['best_score'] = item['score']
|
| 270 |
|
| 271 |
-
# Sort & Ambil Top 5
|
| 272 |
top_5_competencies = sorted(unique_comp.values(), key=lambda x: x['best_score'])[:5]
|
| 273 |
|
| 274 |
final_competencies = []
|
| 275 |
for comp in top_5_competencies:
|
| 276 |
-
# Konversi L2 Distance ke Similarity (Simulasi: 1 / (1+dist))
|
| 277 |
sim_score = 1 / (1 + comp['best_score'])
|
| 278 |
final_competencies.append({
|
| 279 |
-
"category": comp
|
| 280 |
-
"competency": comp
|
| 281 |
-
"type": comp
|
| 282 |
"similarity_score": f"{sim_score:.4f}"
|
| 283 |
})
|
| 284 |
|
| 285 |
-
# Content Type Dominan
|
| 286 |
predicted_form = "General"
|
| 287 |
if form_candidates:
|
| 288 |
best_form = min(form_candidates, key=lambda x: x['score'])
|
| 289 |
-
predicted_form = best_form['meta']['
|
| 290 |
|
| 291 |
# 4. QUALITY AUDIT
|
| 292 |
manual_res = run_manual_audit(full_text, QUALITY_RULES)
|
|
@@ -321,10 +270,10 @@ def process_article(title, content):
|
|
| 321 |
# ================= 5. UI =================
|
| 322 |
iface = gr.Interface(
|
| 323 |
fn=process_article,
|
| 324 |
-
inputs=[gr.Textbox(label="Judul"), gr.Textbox(lines=10, label="Isi Artikel")],
|
| 325 |
-
outputs=gr.JSON(label="Hasil
|
| 326 |
-
title="DigiFeed V6:
|
| 327 |
-
description="
|
| 328 |
)
|
| 329 |
|
| 330 |
iface.launch()
|
|
|
|
| 15 |
hf_token = os.getenv("HF_TOKEN")
|
| 16 |
login(token=hf_token)
|
| 17 |
|
| 18 |
+
# PATH FILE MODEL
|
| 19 |
ROBERTA_PATH = "akage99/roberta-corporate-backend"
|
| 20 |
BGE_MODEL_NAME = "BAAI/bge-m3"
|
|
|
|
|
|
|
| 21 |
|
| 22 |
+
# --- A. GATEKEEPER RULES (TIDAK DIUBAH) ---
|
| 23 |
GATE_RULES = {
|
| 24 |
+
"min_words": 500, # Wajib 500 kata
|
| 25 |
+
"max_digit_ratio": 0.3, # Maksimal 30% angka
|
| 26 |
"math_latex_triggers": [
|
| 27 |
r"\\documentclass", r"\\begin\{", r"\\frac", r"\\sum", r"\\int",
|
| 28 |
r"\$\$.*?\$\$", r"x\^2", r"a\^2", r"b\^2",
|
|
|
|
| 31 |
]
|
| 32 |
}
|
| 33 |
|
| 34 |
+
# --- B. QUALITY AUDIT RULES (TIDAK DIUBAH) ---
|
|
|
|
| 35 |
QUALITY_RULES = {
|
| 36 |
"standard": {
|
| 37 |
"min_paragraphs": 3,
|
| 38 |
"max_sentence_length": 60
|
| 39 |
},
|
| 40 |
"penalties": {
|
| 41 |
+
"bad_structure": 30,
|
| 42 |
+
"risk_word": 50,
|
| 43 |
+
"bad_tone": 20,
|
| 44 |
"short_content": 20
|
| 45 |
},
|
|
|
|
| 46 |
"risk_keywords": [
|
| 47 |
+
"confidential", "rahasia", "internal use only", "top secret",
|
| 48 |
+
"bodoh", "goblok", "brengsek", "tolol", "idiot",
|
| 49 |
+
"password:", "api_key", "access token",
|
| 50 |
+
"suap", "gratifikasi", "korupsi"
|
| 51 |
]
|
| 52 |
}
|
| 53 |
|
|
|
|
| 58 |
"google/gemma-1.1-7b-it"
|
| 59 |
]
|
| 60 |
|
| 61 |
+
# ================= 2. SETUP ENGINE (UPDATED) =================
|
| 62 |
+
print("⏳ Starting System...")
|
| 63 |
|
| 64 |
+
# 1. Load Tokenizer & RoBERTa (Untuk Audit Tone)
|
| 65 |
+
print(" Loading RoBERTa Model...")
|
| 66 |
tokenizer = AutoTokenizer.from_pretrained(ROBERTA_PATH)
|
| 67 |
roberta_model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_PATH)
|
| 68 |
|
| 69 |
+
# 2. Load Embedding Model (Untuk memproses input User)
|
| 70 |
+
print(" Loading BGE-M3 Model...")
|
| 71 |
embeddings = HuggingFaceEmbeddings(model_name=BGE_MODEL_NAME)
|
| 72 |
|
| 73 |
+
# 3. Setup Chunking (Untuk memecah artikel User)
|
| 74 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
| 75 |
|
| 76 |
+
# 4. LOAD DATABASE VEKTOR DARI FILE (PENGGANTI JSON BUILDER)
|
| 77 |
+
print(" Loading Vector Database from Disk (index.pkl & index.faiss)...")
|
| 78 |
+
try:
|
| 79 |
+
# "." artinya mencari file di folder root (tempat file ini berada)
|
| 80 |
+
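# allow_dangerous_deserialization=True wajib aktif karena index.pkl dimuat dengan Pickle; ini hanya aman jika file indeks dibuat sendiri atau berasal dari sumber tepercaya, sebab Pickle dapat mengeksekusi kode arbitrer saat dimuat.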
|
| 81 |
+
vectorstore = FAISS.load_local(
|
| 82 |
+
folder_path=".",
|
| 83 |
+
embeddings=embeddings,
|
| 84 |
+
allow_dangerous_deserialization=True
|
| 85 |
+
)
|
| 86 |
+
print("✅ SUCCESS: Database Vektor Berhasil Dimuat!")
|
| 87 |
+
except Exception as e:
|
| 88 |
+
print(f"❌ CRITICAL ERROR: Gagal memuat database vektor. Pastikan file index.pkl dan index.faiss sudah diupload.\nDetail: {e}")
|
| 89 |
+
vectorstore = None
|
| 90 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
print("✅ System Ready!")
|
| 92 |
|
| 93 |
+
# ================= 3. LOGIC MODULES (TIDAK DIUBAH) =================
|
| 94 |
|
| 95 |
+
# --- MODUL 1: GATEKEEPER ---
|
| 96 |
def run_gatekeeper(text):
|
|
|
|
| 97 |
words = text.strip().split()
|
| 98 |
if len(words) < GATE_RULES['min_words']:
|
| 99 |
return False, f"REJECTED: Terlalu pendek ({len(words)} kata). Minimal {GATE_RULES['min_words']} kata."
|
| 100 |
|
|
|
|
| 101 |
clean_text = re.sub(r'[\s.,\-\+]', '', text)
|
| 102 |
if clean_text.isdigit():
|
| 103 |
return False, "REJECTED: Input hanya berisi angka (Spam Data)."
|
|
|
|
| 106 |
if digit_ratio > GATE_RULES['max_digit_ratio']:
|
| 107 |
return False, "REJECTED: Terlalu banyak angka. Terdeteksi sebagai Laporan Keuangan/Data Mentah."
|
| 108 |
|
|
|
|
| 109 |
for pattern in GATE_RULES['math_latex_triggers']:
|
| 110 |
if re.search(pattern, text, re.IGNORECASE):
|
| 111 |
return False, f"REJECTED: Terdeteksi format Dokumen Teknis/Matematika/LaTeX ('{pattern}')."
|
| 112 |
|
| 113 |
return True, "PASS"
|
| 114 |
|
| 115 |
+
# --- MODUL 2: SYSTEM A (MANUAL AUDIT) ---
|
| 116 |
def run_manual_audit(text, rules):
|
| 117 |
base_score = 100
|
| 118 |
flags = []
|
| 119 |
|
|
|
|
| 120 |
if text.count("\n\n") + 1 < rules['standard']['min_paragraphs']:
|
| 121 |
base_score -= rules['penalties']['bad_structure']
|
| 122 |
flags.append("⚠️ Struktur buruk (Kurang paragraf/Wall of text)")
|
| 123 |
|
|
|
|
| 124 |
text_lower = text.lower()
|
| 125 |
found_risks = [w for w in rules['risk_keywords'] if w in text_lower]
|
| 126 |
if found_risks:
|
| 127 |
base_score -= rules['penalties']['risk_word']
|
| 128 |
flags.append(f"🚨 Terdeteksi kata berisiko/sensitif: {', '.join(found_risks)}")
|
| 129 |
|
|
|
|
| 130 |
try:
|
| 131 |
inputs = tokenizer(text[:512], return_tensors="pt", truncation=True, padding=True, max_length=512)
|
| 132 |
with torch.no_grad():
|
|
|
|
| 136 |
except:
|
| 137 |
rob_score = 0.5
|
| 138 |
|
|
|
|
| 139 |
rule_decimal = max(0, base_score) / 100.0
|
| 140 |
final_score = (rule_decimal * 0.7) + (rob_score * 0.3)
|
| 141 |
|
|
|
|
| 142 |
if final_score >= 0.85: verdict = "AI-Curated (Good Quality)"
|
| 143 |
elif final_score >= 0.60: verdict = "Needs Revision (Minor Issue)"
|
| 144 |
else: verdict = "Needs Attention (Low Clarity/Risk)"
|
|
|
|
| 176 |
except: continue
|
| 177 |
return None
|
| 178 |
|
| 179 |
+
# ================= 4. MAIN PROCESSOR (TIDAK DIUBAH) =================
|
| 180 |
def process_article(title, content):
|
| 181 |
full_text = f"{title}\n\n{content}"
|
| 182 |
|
|
|
|
| 185 |
if not is_valid:
|
| 186 |
return {"is_content": False, "rejection_reason": msg}
|
| 187 |
|
| 188 |
+
# 2. CHUNKING & VECTOR SEARCH
|
| 189 |
chunks = text_splitter.split_text(full_text)
|
| 190 |
|
| 191 |
competency_candidates = []
|
|
|
|
| 193 |
|
| 194 |
if vectorstore:
|
| 195 |
for chunk in chunks:
|
| 196 |
+
# Cari di FAISS (Load dari File)
|
| 197 |
+
# Filter berdasarkan metadata 'source' yang kita buat di Colab
|
| 198 |
res_comp = vectorstore.similarity_search_with_score(chunk, k=3, filter={'source': 'competency'})
|
| 199 |
+
res_form = vectorstore.similarity_search_with_score(chunk, k=1, filter={'source': 'form'}) # Ganti 'form' jadi 'content_form' sesuai script colab
|
| 200 |
+
|
| 201 |
+
# Koreksi: pada script Colab, metadata 'source' untuk content form bernilai "content_form", bukan "form".
|
| 202 |
+
# Filter pada pencarian di bawah ini disesuaikan agar cocok dengan nilai metadata tersebut:
|
| 203 |
+
|
| 204 |
+
# Ulangi pencarian dengan filter yang benar ('content_form'); hasil pencarian inilah yang dimasukkan ke form_candidates.
|
| 205 |
+
res_form_corrected = vectorstore.similarity_search_with_score(chunk, k=1, filter={'source': 'content_form'})
|
| 206 |
|
| 207 |
for doc, score in res_comp:
|
| 208 |
competency_candidates.append({"meta": doc.metadata, "score": score})
|
| 209 |
+
for doc, score in res_form_corrected:
|
| 210 |
form_candidates.append({"meta": doc.metadata, "score": score})
|
| 211 |
|
| 212 |
+
# 3. AGGREGATION
|
| 213 |
unique_comp = {}
|
| 214 |
for item in competency_candidates:
|
| 215 |
+
name = item['meta'].get('competency', item['meta'].get('name', 'Unknown')) # Handle variasi nama key
|
| 216 |
if name not in unique_comp:
|
| 217 |
unique_comp[name] = item['meta']
|
| 218 |
unique_comp[name]['best_score'] = item['score']
|
| 219 |
else:
|
|
|
|
| 220 |
if item['score'] < unique_comp[name]['best_score']:
|
| 221 |
unique_comp[name]['best_score'] = item['score']
|
| 222 |
|
|
|
|
| 223 |
top_5_competencies = sorted(unique_comp.values(), key=lambda x: x['best_score'])[:5]
|
| 224 |
|
| 225 |
final_competencies = []
|
| 226 |
for comp in top_5_competencies:
|
|
|
|
| 227 |
sim_score = 1 / (1 + comp['best_score'])
|
| 228 |
final_competencies.append({
|
| 229 |
+
"category": comp.get('group', comp.get('category', '-')), # Sesuaikan key metadata Colab
|
| 230 |
+
"competency": comp.get('name', comp.get('competency', '-')),
|
| 231 |
+
"type": comp.get('code', comp.get('type', '-')),
|
| 232 |
"similarity_score": f"{sim_score:.4f}"
|
| 233 |
})
|
| 234 |
|
|
|
|
| 235 |
predicted_form = "General"
|
| 236 |
if form_candidates:
|
| 237 |
best_form = min(form_candidates, key=lambda x: x['score'])
|
| 238 |
+
predicted_form = best_form['meta'].get('name', best_form['meta'].get('content_type', 'General'))
|
| 239 |
|
| 240 |
# 4. QUALITY AUDIT
|
| 241 |
manual_res = run_manual_audit(full_text, QUALITY_RULES)
|
|
|
|
| 270 |
# ================= 5. UI =================
|
| 271 |
iface = gr.Interface(
|
| 272 |
fn=process_article,
|
| 273 |
+
inputs=[gr.Textbox(label="Judul"), gr.Textbox(lines=10, label="Isi Artikel (Min 500 Kata)")],
|
| 274 |
+
outputs=gr.JSON(label="Hasil Analisis"),
|
| 275 |
+
title="DigiFeed V6.1: Pre-computed Index",
|
| 276 |
+
description="Sistem klasifikasi artikel menggunakan FAISS Index statis (Pre-loaded) untuk performa lebih cepat dan stabil."
|
| 277 |
)
|
| 278 |
|
| 279 |
iface.launch()
|