akage99 committed on
Commit
d1e36c0
·
verified ·
1 Parent(s): 3c2780d

update menggunakan file pkl

Browse files
Files changed (1) hide show
  1. app.py +59 -110
app.py CHANGED
@@ -15,16 +15,14 @@ from huggingface_hub import login, InferenceClient
15
  hf_token = os.getenv("HF_TOKEN")
16
  login(token=hf_token)
17
 
18
- # PATH FILE (Wajib ada di satu folder)
19
  ROBERTA_PATH = "akage99/roberta-corporate-backend"
20
  BGE_MODEL_NAME = "BAAI/bge-m3"
21
- COMPETENCY_PATH = "competency_keywords.json"
22
- FORM_PATH = "content_forms.json"
23
 
24
- # --- A. GATEKEEPER RULES (Filter Awal / Hard Reject) ---
25
  GATE_RULES = {
26
- "min_words": 500, # Sesuai request: Wajib 500 kata
27
- "max_digit_ratio": 0.3, # Maksimal 30% angka dalam teks
28
  "math_latex_triggers": [
29
  r"\\documentclass", r"\\begin\{", r"\\frac", r"\\sum", r"\\int",
30
  r"\$\$.*?\$\$", r"x\^2", r"a\^2", r"b\^2",
@@ -33,25 +31,23 @@ GATE_RULES = {
33
  ]
34
  }
35
 
36
- # --- B. QUALITY AUDIT RULES (Diterjemahkan dari Excel Parameter) ---
37
- # Mengacu pada file: parameter_content.xlsx
38
  QUALITY_RULES = {
39
  "standard": {
40
  "min_paragraphs": 3,
41
  "max_sentence_length": 60
42
  },
43
  "penalties": {
44
- "bad_structure": 30, # "Struktur sangat buruk" -> -30
45
- "risk_word": 50, # "Risiko tinggi / Masalah etika" -> -50
46
- "bad_tone": 20, # "Bahasa perlu perbaikan" -> -20
47
  "short_content": 20
48
  },
49
- # Keywords bahaya dari Excel (Data Sensitif, Etika, Hoax)
50
  "risk_keywords": [
51
- "confidential", "rahasia", "internal use only", "top secret", # Data Sensitif
52
- "bodoh", "goblok", "brengsek", "tolol", "idiot", # Masalah Etika
53
- "password:", "api_key", "access token", # Security
54
- "suap", "gratifikasi", "korupsi" # Pelanggaran
55
  ]
56
  }
57
 
@@ -62,90 +58,46 @@ LLM_MODELS = [
62
  "google/gemma-1.1-7b-it"
63
  ]
64
 
65
- # ================= 2. SETUP ENGINE (INIT) =================
66
- print("⏳ Loading Models & Building Local Database...")
67
 
68
- # 1. Load Tokenizer & RoBERTa
 
69
  tokenizer = AutoTokenizer.from_pretrained(ROBERTA_PATH)
70
  roberta_model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_PATH)
71
 
72
- # 2. Load Embedding Model
 
73
  embeddings = HuggingFaceEmbeddings(model_name=BGE_MODEL_NAME)
74
 
75
- # 3. Setup Chunking
76
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
77
 
78
- # 4. FUNGSI BUILD VECTOR STORE (JSON -> RAM Variable)
79
- def build_vector_store():
80
- docs = []
81
-
82
- # A. Load Kompetensi (competency_keywords.json)
83
- try:
84
- with open(COMPETENCY_PATH, 'r') as f:
85
- data = json.load(f)
86
- # Struktur JSON: Category -> Competency -> {type, keywords, description}
87
- for category, competencies in data.items():
88
- for comp_name, details in competencies.items():
89
- # Gabung Data untuk Embedding (Biar kaya konteks)
90
- # Text = "Nama Kompetensi + Deskripsi + Keyword"
91
- keywords_str = ", ".join(details.get('keywords', []))
92
- desc = details.get('description', '')
93
- text_content = f"{comp_name}. {desc} Keywords: {keywords_str}"
94
-
95
- # Metadata (Disimpan untuk ditampilkan nanti)
96
- meta = {
97
- "source": "competency",
98
- "category": category,
99
- "competency": comp_name,
100
- "type": details.get('type', '-'),
101
- "keywords": keywords_str # Simpan buat display kalau perlu
102
- }
103
- docs.append(Document(page_content=text_content, metadata=meta))
104
- except FileNotFoundError:
105
- print(f"⚠️ Error: {COMPETENCY_PATH} tidak ditemukan.")
106
-
107
- # B. Load Content Forms (content_forms.json)
108
- try:
109
- with open(FORM_PATH, 'r') as f:
110
- data = json.load(f)
111
- # Struktur JSON: Group -> Type Name -> {description, examples}
112
- for group, types in data.items():
113
- for type_name, details in types.items():
114
- desc = details.get('description', '')
115
- examples = details.get('examples', '')
116
- # Text = "Tipe + Deskripsi + Contoh"
117
- text_content = f"{type_name}. {desc} Contoh: {examples}"
118
-
119
- meta = {
120
- "source": "form",
121
- "group": group,
122
- "content_type": type_name
123
- }
124
- docs.append(Document(page_content=text_content, metadata=meta))
125
- except FileNotFoundError:
126
- print(f"⚠️ Error: {FORM_PATH} tidak ditemukan.")
127
-
128
- # C. Build FAISS Index (Simpan di RAM)
129
- if docs:
130
- print(f"✅ Embedding {len(docs)} documents to RAM...")
131
- return FAISS.from_documents(docs, embeddings)
132
- else:
133
- return None
134
-
135
- # INISIALISASI DATABASE (Jalan 1x saat start)
136
- vectorstore = build_vector_store()
137
  print("✅ System Ready!")
138
 
139
- # ================= 3. LOGIC MODULES =================
140
 
141
- # --- MODUL 1: GATEKEEPER (Specific Errors) ---
142
  def run_gatekeeper(text):
143
- # A. Cek Word Count
144
  words = text.strip().split()
145
  if len(words) < GATE_RULES['min_words']:
146
  return False, f"REJECTED: Terlalu pendek ({len(words)} kata). Minimal {GATE_RULES['min_words']} kata."
147
 
148
- # B. Cek Angka (Numeric Spam)
149
  clean_text = re.sub(r'[\s.,\-\+]', '', text)
150
  if clean_text.isdigit():
151
  return False, "REJECTED: Input hanya berisi angka (Spam Data)."
@@ -154,31 +106,27 @@ def run_gatekeeper(text):
154
  if digit_ratio > GATE_RULES['max_digit_ratio']:
155
  return False, "REJECTED: Terlalu banyak angka. Terdeteksi sebagai Laporan Keuangan/Data Mentah."
156
 
157
- # C. Cek LaTeX / Matematika
158
  for pattern in GATE_RULES['math_latex_triggers']:
159
  if re.search(pattern, text, re.IGNORECASE):
160
  return False, f"REJECTED: Terdeteksi format Dokumen Teknis/Matematika/LaTeX ('{pattern}')."
161
 
162
  return True, "PASS"
163
 
164
- # --- MODUL 2: SYSTEM A (MANUAL AUDIT - EXCEL RULES) ---
165
  def run_manual_audit(text, rules):
166
  base_score = 100
167
  flags = []
168
 
169
- # 1. Cek Struktur (Excel: "Struktur sangat buruk")
170
  if text.count("\n\n") + 1 < rules['standard']['min_paragraphs']:
171
  base_score -= rules['penalties']['bad_structure']
172
  flags.append("⚠️ Struktur buruk (Kurang paragraf/Wall of text)")
173
 
174
- # 2. Cek Risiko & Etika (Excel: "Masalah etika / Risiko tinggi")
175
  text_lower = text.lower()
176
  found_risks = [w for w in rules['risk_keywords'] if w in text_lower]
177
  if found_risks:
178
  base_score -= rules['penalties']['risk_word']
179
  flags.append(f"🚨 Terdeteksi kata berisiko/sensitif: {', '.join(found_risks)}")
180
 
181
- # 3. Cek Tone/Bahasa (Excel: "Bahasa perlu perbaikan")
182
  try:
183
  inputs = tokenizer(text[:512], return_tensors="pt", truncation=True, padding=True, max_length=512)
184
  with torch.no_grad():
@@ -188,11 +136,9 @@ def run_manual_audit(text, rules):
188
  except:
189
  rob_score = 0.5
190
 
191
- # 4. Kalkulasi Final Score (0.0000 - 1.0000)
192
  rule_decimal = max(0, base_score) / 100.0
193
  final_score = (rule_decimal * 0.7) + (rob_score * 0.3)
194
 
195
- # Mapping ke Kategori Excel
196
  if final_score >= 0.85: verdict = "AI-Curated (Good Quality)"
197
  elif final_score >= 0.60: verdict = "Needs Revision (Minor Issue)"
198
  else: verdict = "Needs Attention (Low Clarity/Risk)"
@@ -230,7 +176,7 @@ def run_llm_judge(text, content_type):
230
  except: continue
231
  return None
232
 
233
- # ================= 4. MAIN PROCESSOR =================
234
  def process_article(title, content):
235
  full_text = f"{title}\n\n{content}"
236
 
@@ -239,7 +185,7 @@ def process_article(title, content):
239
  if not is_valid:
240
  return {"is_content": False, "rejection_reason": msg}
241
 
242
- # 2. CHUNKING & VECTOR SEARCH (FAISS)
243
  chunks = text_splitter.split_text(full_text)
244
 
245
  competency_candidates = []
@@ -247,46 +193,49 @@ def process_article(title, content):
247
 
248
  if vectorstore:
249
  for chunk in chunks:
250
- # Cari di FAISS (In-Memory)
 
251
  res_comp = vectorstore.similarity_search_with_score(chunk, k=3, filter={'source': 'competency'})
252
- res_form = vectorstore.similarity_search_with_score(chunk, k=1, filter={'source': 'form'})
 
 
 
 
 
 
253
 
254
  for doc, score in res_comp:
255
  competency_candidates.append({"meta": doc.metadata, "score": score})
256
- for doc, score in res_form:
257
  form_candidates.append({"meta": doc.metadata, "score": score})
258
 
259
- # 3. AGGREGATION (Top 5 Competencies)
260
  unique_comp = {}
261
  for item in competency_candidates:
262
- name = item['meta']['competency']
263
  if name not in unique_comp:
264
  unique_comp[name] = item['meta']
265
  unique_comp[name]['best_score'] = item['score']
266
  else:
267
- # Ambil score terendah (L2 distance terbaik)
268
  if item['score'] < unique_comp[name]['best_score']:
269
  unique_comp[name]['best_score'] = item['score']
270
 
271
- # Sort & Ambil Top 5
272
  top_5_competencies = sorted(unique_comp.values(), key=lambda x: x['best_score'])[:5]
273
 
274
  final_competencies = []
275
  for comp in top_5_competencies:
276
- # Konversi L2 Distance ke Similarity (Simulasi: 1 / (1+dist))
277
  sim_score = 1 / (1 + comp['best_score'])
278
  final_competencies.append({
279
- "category": comp['category'],
280
- "competency": comp['competency'],
281
- "type": comp['type'],
282
  "similarity_score": f"{sim_score:.4f}"
283
  })
284
 
285
- # Content Type Dominan
286
  predicted_form = "General"
287
  if form_candidates:
288
  best_form = min(form_candidates, key=lambda x: x['score'])
289
- predicted_form = best_form['meta']['content_type']
290
 
291
  # 4. QUALITY AUDIT
292
  manual_res = run_manual_audit(full_text, QUALITY_RULES)
@@ -321,10 +270,10 @@ def process_article(title, content):
321
  # ================= 5. UI =================
322
  iface = gr.Interface(
323
  fn=process_article,
324
- inputs=[gr.Textbox(label="Judul"), gr.Textbox(lines=10, label="Isi Artikel")],
325
- outputs=gr.JSON(label="Hasil V6 (Local Warrior)"),
326
- title="DigiFeed V6: Local FAISS + Hybrid Audit",
327
- description="Klasifikasi Artikel Tanpa Cloud DB. Menggunakan FAISS (RAM), Gatekeeper 500 Kata, dan Parameter Excel."
328
  )
329
 
330
  iface.launch()
 
15
# --- Hugging Face auth + model paths ---
# Token comes from the Space secret HF_TOKEN; may be None in local dev.
hf_token = os.getenv("HF_TOKEN")
# Only attempt login when the secret is actually configured:
# login(token=None) raises at startup instead of degrading gracefully.
if hf_token:
    login(token=hf_token)

# PATH FILE MODEL
ROBERTA_PATH = "akage99/roberta-corporate-backend"  # fine-tuned classifier repo
BGE_MODEL_NAME = "BAAI/bge-m3"  # embedding model used for FAISS queries
 
 
21
 
22
+ # --- A. GATEKEEPER RULES (TIDAK DIUBAH) ---
23
  GATE_RULES = {
24
+ "min_words": 500, # Wajib 500 kata
25
+ "max_digit_ratio": 0.3, # Maksimal 30% angka
26
  "math_latex_triggers": [
27
  r"\\documentclass", r"\\begin\{", r"\\frac", r"\\sum", r"\\int",
28
  r"\$\$.*?\$\$", r"x\^2", r"a\^2", r"b\^2",
 
31
  ]
32
  }
33
 
34
+ # --- B. QUALITY AUDIT RULES (TIDAK DIUBAH) ---
 
35
  QUALITY_RULES = {
36
  "standard": {
37
  "min_paragraphs": 3,
38
  "max_sentence_length": 60
39
  },
40
  "penalties": {
41
+ "bad_structure": 30,
42
+ "risk_word": 50,
43
+ "bad_tone": 20,
44
  "short_content": 20
45
  },
 
46
  "risk_keywords": [
47
+ "confidential", "rahasia", "internal use only", "top secret",
48
+ "bodoh", "goblok", "brengsek", "tolol", "idiot",
49
+ "password:", "api_key", "access token",
50
+ "suap", "gratifikasi", "korupsi"
51
  ]
52
  }
53
 
 
58
  "google/gemma-1.1-7b-it"
59
  ]
60
 
61
# ================= 2. SETUP ENGINE (UPDATED) =================
print("⏳ Starting System...")

# 1. Load tokenizer & RoBERTa (used by run_manual_audit for the tone score)
print(" Loading RoBERTa Model...")
tokenizer = AutoTokenizer.from_pretrained(ROBERTA_PATH)
roberta_model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_PATH)

# 2. Load embedding model (encodes user input for FAISS similarity search)
print(" Loading BGE-M3 Model...")
embeddings = HuggingFaceEmbeddings(model_name=BGE_MODEL_NAME)

# 3. Chunking setup (splits the user's article before vector search)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
75
 
76
# 4. LOAD THE VECTOR DATABASE FROM DISK (replaces the old JSON builder)
print(" Loading Vector Database from Disk (index.pkl & index.faiss)...")
try:
    # folder_path="." looks for index.faiss / index.pkl in the app root.
    # SECURITY NOTE: allow_dangerous_deserialization=True unpickles index.pkl;
    # pickle can execute arbitrary code, so only ship index files you built
    # yourself — never load an index from an untrusted source.
    vectorstore = FAISS.load_local(
        folder_path=".",
        embeddings=embeddings,
        allow_dangerous_deserialization=True,
    )
    print("✅ SUCCESS: Database Vektor Berhasil Dimuat!")
except Exception as e:
    # Degrade gracefully: downstream code checks `if vectorstore:` and skips
    # the similarity search when the index failed to load.
    print(f"❌ CRITICAL ERROR: Gagal memuat database vektor. Pastikan file index.pkl dan index.faiss sudah diupload.\nDetail: {e}")
    vectorstore = None
90
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  print("✅ System Ready!")
92
 
93
+ # ================= 3. LOGIC MODULES (TIDAK DIUBAH) =================
94
 
95
+ # --- MODUL 1: GATEKEEPER ---
96
  def run_gatekeeper(text):
 
97
  words = text.strip().split()
98
  if len(words) < GATE_RULES['min_words']:
99
  return False, f"REJECTED: Terlalu pendek ({len(words)} kata). Minimal {GATE_RULES['min_words']} kata."
100
 
 
101
  clean_text = re.sub(r'[\s.,\-\+]', '', text)
102
  if clean_text.isdigit():
103
  return False, "REJECTED: Input hanya berisi angka (Spam Data)."
 
106
  if digit_ratio > GATE_RULES['max_digit_ratio']:
107
  return False, "REJECTED: Terlalu banyak angka. Terdeteksi sebagai Laporan Keuangan/Data Mentah."
108
 
 
109
  for pattern in GATE_RULES['math_latex_triggers']:
110
  if re.search(pattern, text, re.IGNORECASE):
111
  return False, f"REJECTED: Terdeteksi format Dokumen Teknis/Matematika/LaTeX ('{pattern}')."
112
 
113
  return True, "PASS"
114
 
115
+ # --- MODUL 2: SYSTEM A (MANUAL AUDIT) ---
116
  def run_manual_audit(text, rules):
117
  base_score = 100
118
  flags = []
119
 
 
120
  if text.count("\n\n") + 1 < rules['standard']['min_paragraphs']:
121
  base_score -= rules['penalties']['bad_structure']
122
  flags.append("⚠️ Struktur buruk (Kurang paragraf/Wall of text)")
123
 
 
124
  text_lower = text.lower()
125
  found_risks = [w for w in rules['risk_keywords'] if w in text_lower]
126
  if found_risks:
127
  base_score -= rules['penalties']['risk_word']
128
  flags.append(f"🚨 Terdeteksi kata berisiko/sensitif: {', '.join(found_risks)}")
129
 
 
130
  try:
131
  inputs = tokenizer(text[:512], return_tensors="pt", truncation=True, padding=True, max_length=512)
132
  with torch.no_grad():
 
136
  except:
137
  rob_score = 0.5
138
 
 
139
  rule_decimal = max(0, base_score) / 100.0
140
  final_score = (rule_decimal * 0.7) + (rob_score * 0.3)
141
 
 
142
  if final_score >= 0.85: verdict = "AI-Curated (Good Quality)"
143
  elif final_score >= 0.60: verdict = "Needs Revision (Minor Issue)"
144
  else: verdict = "Needs Attention (Low Clarity/Risk)"
 
176
  except: continue
177
  return None
178
 
179
+ # ================= 4. MAIN PROCESSOR (TIDAK DIUBAH) =================
180
  def process_article(title, content):
181
  full_text = f"{title}\n\n{content}"
182
 
 
185
  if not is_valid:
186
  return {"is_content": False, "rejection_reason": msg}
187
 
188
+ # 2. CHUNKING & VECTOR SEARCH
189
  chunks = text_splitter.split_text(full_text)
190
 
191
  competency_candidates = []
 
193
 
194
  if vectorstore:
195
  for chunk in chunks:
196
+ # Cari di FAISS (Load dari File)
197
+ # Filter berdasarkan metadata 'source' yang kita buat di Colab
198
  res_comp = vectorstore.similarity_search_with_score(chunk, k=3, filter={'source': 'competency'})
199
+ res_form = vectorstore.similarity_search_with_score(chunk, k=1, filter={'source': 'form'}) # Ganti 'form' jadi 'content_form' sesuai script colab
200
+
201
+ # Koreksi Sedikit: Di Colab tadi metadata source-nya "content_form", bukan "form"
202
+ # Kita sesuaikan filternya di bawah ini agar match:
203
+
204
+ # ... Ulangi search dengan filter yang benar ...
205
+ res_form_corrected = vectorstore.similarity_search_with_score(chunk, k=1, filter={'source': 'content_form'})
206
 
207
  for doc, score in res_comp:
208
  competency_candidates.append({"meta": doc.metadata, "score": score})
209
+ for doc, score in res_form_corrected:
210
  form_candidates.append({"meta": doc.metadata, "score": score})
211
 
212
+ # 3. AGGREGATION
213
  unique_comp = {}
214
  for item in competency_candidates:
215
+ name = item['meta'].get('competency', item['meta'].get('name', 'Unknown')) # Handle variasi nama key
216
  if name not in unique_comp:
217
  unique_comp[name] = item['meta']
218
  unique_comp[name]['best_score'] = item['score']
219
  else:
 
220
  if item['score'] < unique_comp[name]['best_score']:
221
  unique_comp[name]['best_score'] = item['score']
222
 
 
223
  top_5_competencies = sorted(unique_comp.values(), key=lambda x: x['best_score'])[:5]
224
 
225
  final_competencies = []
226
  for comp in top_5_competencies:
 
227
  sim_score = 1 / (1 + comp['best_score'])
228
  final_competencies.append({
229
+ "category": comp.get('group', comp.get('category', '-')), # Sesuaikan key metadata Colab
230
+ "competency": comp.get('name', comp.get('competency', '-')),
231
+ "type": comp.get('code', comp.get('type', '-')),
232
  "similarity_score": f"{sim_score:.4f}"
233
  })
234
 
 
235
  predicted_form = "General"
236
  if form_candidates:
237
  best_form = min(form_candidates, key=lambda x: x['score'])
238
+ predicted_form = best_form['meta'].get('name', best_form['meta'].get('content_type', 'General'))
239
 
240
  # 4. QUALITY AUDIT
241
  manual_res = run_manual_audit(full_text, QUALITY_RULES)
 
270
# ================= 5. UI =================
# Gradio front-end: two text inputs (title, body) -> JSON analysis result.
iface = gr.Interface(
    fn=process_article,
    inputs=[
        gr.Textbox(label="Judul"),
        gr.Textbox(lines=10, label="Isi Artikel (Min 500 Kata)"),
    ],
    outputs=gr.JSON(label="Hasil Analisis"),
    title="DigiFeed V6.1: Pre-computed Index",
    description="Sistem klasifikasi artikel menggunakan FAISS Index statis (Pre-loaded) untuk performa lebih cepat dan stabil.",
)

iface.launch()