akage99 committed on
Commit
d1e36c0
·
verified ·
1 Parent(s): 3c2780d

update menggunakan file pkl

Browse files
Files changed (1) hide show
  1. app.py +59 -110
app.py CHANGED
@@ -15,16 +15,14 @@ from huggingface_hub import login, InferenceClient
15
  hf_token = os.getenv("HF_TOKEN")
16
  login(token=hf_token)
17
 
18
- # PATH FILE (Wajib ada di satu folder)
19
  ROBERTA_PATH = "akage99/roberta-corporate-backend"
20
  BGE_MODEL_NAME = "BAAI/bge-m3"
21
- COMPETENCY_PATH = "competency_keywords.json"
22
- FORM_PATH = "content_forms.json"
23
 
24
- # --- A. GATEKEEPER RULES (Filter Awal / Hard Reject) ---
25
  GATE_RULES = {
26
- "min_words": 500, # Sesuai request: Wajib 500 kata
27
- "max_digit_ratio": 0.3, # Maksimal 30% angka dalam teks
28
  "math_latex_triggers": [
29
  r"\\documentclass", r"\\begin\{", r"\\frac", r"\\sum", r"\\int",
30
  r"\$\$.*?\$\$", r"x\^2", r"a\^2", r"b\^2",
@@ -33,25 +31,23 @@ GATE_RULES = {
33
  ]
34
  }
35
 
36
- # --- B. QUALITY AUDIT RULES (Diterjemahkan dari Excel Parameter) ---
37
- # Mengacu pada file: parameter_content.xlsx
38
  QUALITY_RULES = {
39
  "standard": {
40
  "min_paragraphs": 3,
41
  "max_sentence_length": 60
42
  },
43
  "penalties": {
44
- "bad_structure": 30, # "Struktur sangat buruk" -> -30
45
- "risk_word": 50, # "Risiko tinggi / Masalah etika" -> -50
46
- "bad_tone": 20, # "Bahasa perlu perbaikan" -> -20
47
  "short_content": 20
48
  },
49
- # Keywords bahaya dari Excel (Data Sensitif, Etika, Hoax)
50
  "risk_keywords": [
51
- "confidential", "rahasia", "internal use only", "top secret", # Data Sensitif
52
- "bodoh", "goblok", "brengsek", "tolol", "idiot", # Masalah Etika
53
- "password:", "api_key", "access token", # Security
54
- "suap", "gratifikasi", "korupsi" # Pelanggaran
55
  ]
56
  }
57
 
@@ -62,90 +58,46 @@ LLM_MODELS = [
62
  "google/gemma-1.1-7b-it"
63
  ]
64
 
65
- # ================= 2. SETUP ENGINE (INIT) =================
66
- print("⏳ Loading Models & Building Local Database...")
67
 
68
- # 1. Load Tokenizer & RoBERTa
 
69
  tokenizer = AutoTokenizer.from_pretrained(ROBERTA_PATH)
70
  roberta_model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_PATH)
71
 
72
- # 2. Load Embedding Model
 
73
  embeddings = HuggingFaceEmbeddings(model_name=BGE_MODEL_NAME)
74
 
75
- # 3. Setup Chunking
76
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
77
 
78
- # 4. FUNGSI BUILD VECTOR STORE (JSON -> RAM Variable)
79
- def build_vector_store():
80
- docs = []
81
-
82
- # A. Load Kompetensi (competency_keywords.json)
83
- try:
84
- with open(COMPETENCY_PATH, 'r') as f:
85
- data = json.load(f)
86
- # Struktur JSON: Category -> Competency -> {type, keywords, description}
87
- for category, competencies in data.items():
88
- for comp_name, details in competencies.items():
89
- # Gabung Data untuk Embedding (Biar kaya konteks)
90
- # Text = "Nama Kompetensi + Deskripsi + Keyword"
91
- keywords_str = ", ".join(details.get('keywords', []))
92
- desc = details.get('description', '')
93
- text_content = f"{comp_name}. {desc} Keywords: {keywords_str}"
94
-
95
- # Metadata (Disimpan untuk ditampilkan nanti)
96
- meta = {
97
- "source": "competency",
98
- "category": category,
99
- "competency": comp_name,
100
- "type": details.get('type', '-'),
101
- "keywords": keywords_str # Simpan buat display kalau perlu
102
- }
103
- docs.append(Document(page_content=text_content, metadata=meta))
104
- except FileNotFoundError:
105
- print(f"⚠️ Error: {COMPETENCY_PATH} tidak ditemukan.")
106
-
107
- # B. Load Content Forms (content_forms.json)
108
- try:
109
- with open(FORM_PATH, 'r') as f:
110
- data = json.load(f)
111
- # Struktur JSON: Group -> Type Name -> {description, examples}
112
- for group, types in data.items():
113
- for type_name, details in types.items():
114
- desc = details.get('description', '')
115
- examples = details.get('examples', '')
116
- # Text = "Tipe + Deskripsi + Contoh"
117
- text_content = f"{type_name}. {desc} Contoh: {examples}"
118
-
119
- meta = {
120
- "source": "form",
121
- "group": group,
122
- "content_type": type_name
123
- }
124
- docs.append(Document(page_content=text_content, metadata=meta))
125
- except FileNotFoundError:
126
- print(f"⚠️ Error: {FORM_PATH} tidak ditemukan.")
127
-
128
- # C. Build FAISS Index (Simpan di RAM)
129
- if docs:
130
- print(f"✅ Embedding {len(docs)} documents to RAM...")
131
- return FAISS.from_documents(docs, embeddings)
132
- else:
133
- return None
134
-
135
- # INISIALISASI DATABASE (Jalan 1x saat start)
136
- vectorstore = build_vector_store()
137
  print("✅ System Ready!")
138
 
139
- # ================= 3. LOGIC MODULES =================
140
 
141
- # --- MODUL 1: GATEKEEPER (Specific Errors) ---
142
  def run_gatekeeper(text):
143
- # A. Cek Word Count
144
  words = text.strip().split()
145
  if len(words) < GATE_RULES['min_words']:
146
  return False, f"REJECTED: Terlalu pendek ({len(words)} kata). Minimal {GATE_RULES['min_words']} kata."
147
 
148
- # B. Cek Angka (Numeric Spam)
149
  clean_text = re.sub(r'[\s.,\-\+]', '', text)
150
  if clean_text.isdigit():
151
  return False, "REJECTED: Input hanya berisi angka (Spam Data)."
@@ -154,31 +106,27 @@ def run_gatekeeper(text):
154
  if digit_ratio > GATE_RULES['max_digit_ratio']:
155
  return False, "REJECTED: Terlalu banyak angka. Terdeteksi sebagai Laporan Keuangan/Data Mentah."
156
 
157
- # C. Cek LaTeX / Matematika
158
  for pattern in GATE_RULES['math_latex_triggers']:
159
  if re.search(pattern, text, re.IGNORECASE):
160
  return False, f"REJECTED: Terdeteksi format Dokumen Teknis/Matematika/LaTeX ('{pattern}')."
161
 
162
  return True, "PASS"
163
 
164
- # --- MODUL 2: SYSTEM A (MANUAL AUDIT - EXCEL RULES) ---
165
  def run_manual_audit(text, rules):
166
  base_score = 100
167
  flags = []
168
 
169
- # 1. Cek Struktur (Excel: "Struktur sangat buruk")
170
  if text.count("\n\n") + 1 < rules['standard']['min_paragraphs']:
171
  base_score -= rules['penalties']['bad_structure']
172
  flags.append("⚠️ Struktur buruk (Kurang paragraf/Wall of text)")
173
 
174
- # 2. Cek Risiko & Etika (Excel: "Masalah etika / Risiko tinggi")
175
  text_lower = text.lower()
176
  found_risks = [w for w in rules['risk_keywords'] if w in text_lower]
177
  if found_risks:
178
  base_score -= rules['penalties']['risk_word']
179
  flags.append(f"🚨 Terdeteksi kata berisiko/sensitif: {', '.join(found_risks)}")
180
 
181
- # 3. Cek Tone/Bahasa (Excel: "Bahasa perlu perbaikan")
182
  try:
183
  inputs = tokenizer(text[:512], return_tensors="pt", truncation=True, padding=True, max_length=512)
184
  with torch.no_grad():
@@ -188,11 +136,9 @@ def run_manual_audit(text, rules):
188
  except:
189
  rob_score = 0.5
190
 
191
- # 4. Kalkulasi Final Score (0.0000 - 1.0000)
192
  rule_decimal = max(0, base_score) / 100.0
193
  final_score = (rule_decimal * 0.7) + (rob_score * 0.3)
194
 
195
- # Mapping ke Kategori Excel
196
  if final_score >= 0.85: verdict = "AI-Curated (Good Quality)"
197
  elif final_score >= 0.60: verdict = "Needs Revision (Minor Issue)"
198
  else: verdict = "Needs Attention (Low Clarity/Risk)"
@@ -230,7 +176,7 @@ def run_llm_judge(text, content_type):
230
  except: continue
231
  return None
232
 
233
- # ================= 4. MAIN PROCESSOR =================
234
  def process_article(title, content):
235
  full_text = f"{title}\n\n{content}"
236
 
@@ -239,7 +185,7 @@ def process_article(title, content):
239
  if not is_valid:
240
  return {"is_content": False, "rejection_reason": msg}
241
 
242
- # 2. CHUNKING & VECTOR SEARCH (FAISS)
243
  chunks = text_splitter.split_text(full_text)
244
 
245
  competency_candidates = []
@@ -247,46 +193,49 @@ def process_article(title, content):
247
 
248
  if vectorstore:
249
  for chunk in chunks:
250
- # Cari di FAISS (In-Memory)
 
251
  res_comp = vectorstore.similarity_search_with_score(chunk, k=3, filter={'source': 'competency'})
252
- res_form = vectorstore.similarity_search_with_score(chunk, k=1, filter={'source': 'form'})
 
 
 
 
 
 
253
 
254
  for doc, score in res_comp:
255
  competency_candidates.append({"meta": doc.metadata, "score": score})
256
- for doc, score in res_form:
257
  form_candidates.append({"meta": doc.metadata, "score": score})
258
 
259
- # 3. AGGREGATION (Top 5 Competencies)
260
  unique_comp = {}
261
  for item in competency_candidates:
262
- name = item['meta']['competency']
263
  if name not in unique_comp:
264
  unique_comp[name] = item['meta']
265
  unique_comp[name]['best_score'] = item['score']
266
  else:
267
- # Ambil score terendah (L2 distance terbaik)
268
  if item['score'] < unique_comp[name]['best_score']:
269
  unique_comp[name]['best_score'] = item['score']
270
 
271
- # Sort & Ambil Top 5
272
  top_5_competencies = sorted(unique_comp.values(), key=lambda x: x['best_score'])[:5]
273
 
274
  final_competencies = []
275
  for comp in top_5_competencies:
276
- # Konversi L2 Distance ke Similarity (Simulasi: 1 / (1+dist))
277
  sim_score = 1 / (1 + comp['best_score'])
278
  final_competencies.append({
279
- "category": comp['category'],
280
- "competency": comp['competency'],
281
- "type": comp['type'],
282
  "similarity_score": f"{sim_score:.4f}"
283
  })
284
 
285
- # Content Type Dominan
286
  predicted_form = "General"
287
  if form_candidates:
288
  best_form = min(form_candidates, key=lambda x: x['score'])
289
- predicted_form = best_form['meta']['content_type']
290
 
291
  # 4. QUALITY AUDIT
292
  manual_res = run_manual_audit(full_text, QUALITY_RULES)
@@ -321,10 +270,10 @@ def process_article(title, content):
321
  # ================= 5. UI =================
322
  iface = gr.Interface(
323
  fn=process_article,
324
- inputs=[gr.Textbox(label="Judul"), gr.Textbox(lines=10, label="Isi Artikel")],
325
- outputs=gr.JSON(label="Hasil V6 (Local Warrior)"),
326
- title="DigiFeed V6: Local FAISS + Hybrid Audit",
327
- description="Klasifikasi Artikel Tanpa Cloud DB. Menggunakan FAISS (RAM), Gatekeeper 500 Kata, dan Parameter Excel."
328
  )
329
 
330
  iface.launch()
 
15
# --- Hugging Face auth + model paths ---
# Token comes from the Space secret HF_TOKEN; may be None in local dev.
hf_token = os.getenv("HF_TOKEN")
# Only attempt login when the secret is actually configured:
# login(token=None) raises at startup instead of degrading gracefully.
if hf_token:
    login(token=hf_token)

# PATH FILE MODEL
ROBERTA_PATH = "akage99/roberta-corporate-backend"  # fine-tuned classifier repo
BGE_MODEL_NAME = "BAAI/bge-m3"  # embedding model used for FAISS queries
 
 
21
 
22
+ # --- A. GATEKEEPER RULES (TIDAK DIUBAH) ---
23
  GATE_RULES = {
24
+ "min_words": 500, # Wajib 500 kata
25
+ "max_digit_ratio": 0.3, # Maksimal 30% angka
26
  "math_latex_triggers": [
27
  r"\\documentclass", r"\\begin\{", r"\\frac", r"\\sum", r"\\int",
28
  r"\$\$.*?\$\$", r"x\^2", r"a\^2", r"b\^2",
 
31
  ]
32
  }
33
 
34
+ # --- B. QUALITY AUDIT RULES (TIDAK DIUBAH) ---
 
35
  QUALITY_RULES = {
36
  "standard": {
37
  "min_paragraphs": 3,
38
  "max_sentence_length": 60
39
  },
40
  "penalties": {
41
+ "bad_structure": 30,
42
+ "risk_word": 50,
43
+ "bad_tone": 20,
44
  "short_content": 20
45
  },
 
46
  "risk_keywords": [
47
+ "confidential", "rahasia", "internal use only", "top secret",
48
+ "bodoh", "goblok", "brengsek", "tolol", "idiot",
49
+ "password:", "api_key", "access token",
50
+ "suap", "gratifikasi", "korupsi"
51
  ]
52
  }
53
 
 
58
  "google/gemma-1.1-7b-it"
59
  ]
60
 
61
# ================= 2. SETUP ENGINE (UPDATED) =================
print("⏳ Starting System...")

# 1. Load tokenizer & RoBERTa (used by run_manual_audit for the tone score)
print(" Loading RoBERTa Model...")
tokenizer = AutoTokenizer.from_pretrained(ROBERTA_PATH)
roberta_model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_PATH)

# 2. Load embedding model (encodes user input for FAISS similarity search)
print(" Loading BGE-M3 Model...")
embeddings = HuggingFaceEmbeddings(model_name=BGE_MODEL_NAME)

# 3. Chunking setup (splits the user's article before vector search)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
75
 
76
# 4. LOAD THE VECTOR DATABASE FROM DISK (replaces the old JSON builder)
print(" Loading Vector Database from Disk (index.pkl & index.faiss)...")
try:
    # folder_path="." looks for index.faiss / index.pkl in the app root.
    # SECURITY NOTE: allow_dangerous_deserialization=True unpickles index.pkl;
    # pickle can execute arbitrary code, so only ship index files you built
    # yourself — never load an index from an untrusted source.
    vectorstore = FAISS.load_local(
        folder_path=".",
        embeddings=embeddings,
        allow_dangerous_deserialization=True,
    )
    print("✅ SUCCESS: Database Vektor Berhasil Dimuat!")
except Exception as e:
    # Degrade gracefully: downstream code checks `if vectorstore:` and skips
    # the similarity search when the index failed to load.
    print(f"❌ CRITICAL ERROR: Gagal memuat database vektor. Pastikan file index.pkl dan index.faiss sudah diupload.\nDetail: {e}")
    vectorstore = None
90
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  print("✅ System Ready!")
92
 
93
+ # ================= 3. LOGIC MODULES (TIDAK DIUBAH) =================
94
 
95
+ # --- MODUL 1: GATEKEEPER ---
96
  def run_gatekeeper(text):
 
97
  words = text.strip().split()
98
  if len(words) < GATE_RULES['min_words']:
99
  return False, f"REJECTED: Terlalu pendek ({len(words)} kata). Minimal {GATE_RULES['min_words']} kata."
100
 
 
101
  clean_text = re.sub(r'[\s.,\-\+]', '', text)
102
  if clean_text.isdigit():
103
  return False, "REJECTED: Input hanya berisi angka (Spam Data)."
 
106
  if digit_ratio > GATE_RULES['max_digit_ratio']:
107
  return False, "REJECTED: Terlalu banyak angka. Terdeteksi sebagai Laporan Keuangan/Data Mentah."
108
 
 
109
  for pattern in GATE_RULES['math_latex_triggers']:
110
  if re.search(pattern, text, re.IGNORECASE):
111
  return False, f"REJECTED: Terdeteksi format Dokumen Teknis/Matematika/LaTeX ('{pattern}')."
112
 
113
  return True, "PASS"
114
 
115
+ # --- MODUL 2: SYSTEM A (MANUAL AUDIT) ---
116
  def run_manual_audit(text, rules):
117
  base_score = 100
118
  flags = []
119
 
 
120
  if text.count("\n\n") + 1 < rules['standard']['min_paragraphs']:
121
  base_score -= rules['penalties']['bad_structure']
122
  flags.append("⚠️ Struktur buruk (Kurang paragraf/Wall of text)")
123
 
 
124
  text_lower = text.lower()
125
  found_risks = [w for w in rules['risk_keywords'] if w in text_lower]
126
  if found_risks:
127
  base_score -= rules['penalties']['risk_word']
128
  flags.append(f"🚨 Terdeteksi kata berisiko/sensitif: {', '.join(found_risks)}")
129
 
 
130
  try:
131
  inputs = tokenizer(text[:512], return_tensors="pt", truncation=True, padding=True, max_length=512)
132
  with torch.no_grad():
 
136
  except:
137
  rob_score = 0.5
138
 
 
139
  rule_decimal = max(0, base_score) / 100.0
140
  final_score = (rule_decimal * 0.7) + (rob_score * 0.3)
141
 
 
142
  if final_score >= 0.85: verdict = "AI-Curated (Good Quality)"
143
  elif final_score >= 0.60: verdict = "Needs Revision (Minor Issue)"
144
  else: verdict = "Needs Attention (Low Clarity/Risk)"
 
176
  except: continue
177
  return None
178
 
179
+ # ================= 4. MAIN PROCESSOR (TIDAK DIUBAH) =================
180
  def process_article(title, content):
181
  full_text = f"{title}\n\n{content}"
182
 
 
185
  if not is_valid:
186
  return {"is_content": False, "rejection_reason": msg}
187
 
188
+ # 2. CHUNKING & VECTOR SEARCH
189
  chunks = text_splitter.split_text(full_text)
190
 
191
  competency_candidates = []
 
193
 
194
  if vectorstore:
195
  for chunk in chunks:
196
+ # Cari di FAISS (Load dari File)
197
+ # Filter berdasarkan metadata 'source' yang kita buat di Colab
198
  res_comp = vectorstore.similarity_search_with_score(chunk, k=3, filter={'source': 'competency'})
199
+ res_form = vectorstore.similarity_search_with_score(chunk, k=1, filter={'source': 'form'}) # Ganti 'form' jadi 'content_form' sesuai script colab
200
+
201
+ # Koreksi Sedikit: Di Colab tadi metadata source-nya "content_form", bukan "form"
202
+ # Kita sesuaikan filternya di bawah ini agar match:
203
+
204
+ # ... Ulangi search dengan filter yang benar ...
205
+ res_form_corrected = vectorstore.similarity_search_with_score(chunk, k=1, filter={'source': 'content_form'})
206
 
207
  for doc, score in res_comp:
208
  competency_candidates.append({"meta": doc.metadata, "score": score})
209
+ for doc, score in res_form_corrected:
210
  form_candidates.append({"meta": doc.metadata, "score": score})
211
 
212
+ # 3. AGGREGATION
213
  unique_comp = {}
214
  for item in competency_candidates:
215
+ name = item['meta'].get('competency', item['meta'].get('name', 'Unknown')) # Handle variasi nama key
216
  if name not in unique_comp:
217
  unique_comp[name] = item['meta']
218
  unique_comp[name]['best_score'] = item['score']
219
  else:
 
220
  if item['score'] < unique_comp[name]['best_score']:
221
  unique_comp[name]['best_score'] = item['score']
222
 
 
223
  top_5_competencies = sorted(unique_comp.values(), key=lambda x: x['best_score'])[:5]
224
 
225
  final_competencies = []
226
  for comp in top_5_competencies:
 
227
  sim_score = 1 / (1 + comp['best_score'])
228
  final_competencies.append({
229
+ "category": comp.get('group', comp.get('category', '-')), # Sesuaikan key metadata Colab
230
+ "competency": comp.get('name', comp.get('competency', '-')),
231
+ "type": comp.get('code', comp.get('type', '-')),
232
  "similarity_score": f"{sim_score:.4f}"
233
  })
234
 
 
235
  predicted_form = "General"
236
  if form_candidates:
237
  best_form = min(form_candidates, key=lambda x: x['score'])
238
+ predicted_form = best_form['meta'].get('name', best_form['meta'].get('content_type', 'General'))
239
 
240
  # 4. QUALITY AUDIT
241
  manual_res = run_manual_audit(full_text, QUALITY_RULES)
 
270
# ================= 5. UI =================
# Gradio front-end: two text inputs (title, body) -> JSON analysis result.
iface = gr.Interface(
    fn=process_article,
    inputs=[
        gr.Textbox(label="Judul"),
        gr.Textbox(lines=10, label="Isi Artikel (Min 500 Kata)"),
    ],
    outputs=gr.JSON(label="Hasil Analisis"),
    title="DigiFeed V6.1: Pre-computed Index",
    description="Sistem klasifikasi artikel menggunakan FAISS Index statis (Pre-loaded) untuk performa lebih cepat dan stabil.",
)

iface.launch()