"""DigiFeed article analysis service.

Pipeline for a submitted article (title + body):

1. Gatekeeper: reject inputs that are too short, mostly numeric, or look like
   technical/LaTeX documents.
2. Vector search: chunk the text and look up the nearest competency and
   content-form entries in a pre-built FAISS index loaded from disk.
3. Quality audit: a rule-based score blended with a RoBERTa classifier
   ("System A") plus an optional LLM judge ("System B").

The result is returned as a JSON-serialisable dict and served through a
Gradio interface.
"""

import os
import torch
import json
import re
import gradio as gr
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from huggingface_hub import login, InferenceClient

# ================= 1. CONFIGURATION & RULES =================
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    # login(token=None) falls back to an interactive prompt, which cannot
    # work on a headless server; continue anonymously instead.
    print("⚠️ HF_TOKEN is not set; skipping Hugging Face login.")

# MODEL PATHS
ROBERTA_PATH = "akage99/roberta-corporate-backend"
BGE_MODEL_NAME = "BAAI/bge-m3"

# --- A. GATEKEEPER RULES (unchanged) ---
GATE_RULES = {
    "min_words": 500,        # at least 500 words required
    "max_digit_ratio": 0.3,  # at most 30% of characters may be digits
    "math_latex_triggers": [
        r"\\documentclass", r"\\begin\{", r"\\frac", r"\\sum", r"\\int",
        r"\$\$.*?\$\$", r"x\^2", r"a\^2", r"b\^2",
        r"sin\(", r"cos\(", r"tan\(",
        r"H_2O", r"CO_2", r"fig\.", r"eq\.", r"et al\."
    ]
}

# --- B. QUALITY AUDIT RULES (unchanged) ---
QUALITY_RULES = {
    "standard": {
        "min_paragraphs": 3,
        "max_sentence_length": 60
    },
    "penalties": {
        "bad_structure": 30,
        "risk_word": 50,
        "bad_tone": 20,
        "short_content": 20
    },
    "risk_keywords": [
        "confidential", "rahasia", "internal use only", "top secret",
        "bodoh", "goblok", "brengsek", "tolol", "idiot",
        "password:", "api_key", "access token",
        "suap", "gratifikasi", "korupsi"
    ]
}

# --- C. LLM MODELS (fallback strategy: tried in order) ---
LLM_MODELS = [
    "HuggingFaceH4/zephyr-7b-beta",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "google/gemma-1.1-7b-it"
]

# ================= 2. ENGINE SETUP (UPDATED) =================
print("⏳ Starting System...")

# 1. Load tokenizer & RoBERTa (used for the tone audit)
print(" Loading RoBERTa Model...")
tokenizer = AutoTokenizer.from_pretrained(ROBERTA_PATH)
roberta_model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_PATH)

# 2. Load embedding model (used to embed the user's input)
print(" Loading BGE-M3 Model...")
embeddings = HuggingFaceEmbeddings(model_name=BGE_MODEL_NAME)

# 3. Chunking setup (splits the user's article)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# 4. Load the vector database from disk (replaces the old JSON builder)
print(" Loading Vector Database from Disk (index.pkl & index.faiss)...")
try:
    # "." means the index files are looked up in the current folder.
    # SECURITY: allow_dangerous_deserialization=True is required because the
    # index metadata is stored with pickle. Unpickling can execute arbitrary
    # code, so index.pkl must only ever come from a trusted source.
    vectorstore = FAISS.load_local(
        folder_path=".",
        embeddings=embeddings,
        allow_dangerous_deserialization=True
    )
    print("✅ SUCCESS: Database Vektor Berhasil Dimuat!")
except Exception as e:
    print(f"❌ CRITICAL ERROR: Gagal memuat database vektor. Pastikan file index.pkl dan index.faiss sudah diupload.\nDetail: {e}")
    # The app keeps running without vector search; process_article checks this.
    vectorstore = None

print("✅ System Ready!")


# ================= 3. LOGIC MODULES =================

# --- MODULE 1: GATEKEEPER ---
def run_gatekeeper(text):
    """Decide whether *text* is eligible for analysis.

    Checks, in order: minimum word count, digits-only input, digit ratio,
    and the math/LaTeX trigger patterns in ``GATE_RULES``.

    Returns:
        A ``(is_valid, message)`` tuple. ``message`` is ``"PASS"`` on success
        or a ``"REJECTED: ..."`` explanation otherwise.
    """
    words = text.strip().split()
    if len(words) < GATE_RULES['min_words']:
        return False, f"REJECTED: Terlalu pendek ({len(words)} kata). Minimal {GATE_RULES['min_words']} kata."

    # Strip whitespace and common numeric punctuation before the digits-only test.
    clean_text = re.sub(r'[\s.,\-\+]', '', text)
    if clean_text.isdigit():
        return False, "REJECTED: Input hanya berisi angka (Spam Data)."

    # max(..., 1) avoids a ZeroDivisionError on empty text, which is only
    # reachable if min_words is configured as 0.
    digit_ratio = sum(c.isdigit() for c in text) / max(len(text), 1)
    if digit_ratio > GATE_RULES['max_digit_ratio']:
        return False, "REJECTED: Terlalu banyak angka. Terdeteksi sebagai Laporan Keuangan/Data Mentah."

    for pattern in GATE_RULES['math_latex_triggers']:
        if re.search(pattern, text, re.IGNORECASE):
            return False, f"REJECTED: Terdeteksi format Dokumen Teknis/Matematika/LaTeX ('{pattern}')."

    return True, "PASS"


# --- MODULE 2: SYSTEM A (MANUAL AUDIT) ---
def run_manual_audit(text, rules):
    """Score *text* with rule-based penalties blended with the RoBERTa model.

    The rule score starts at 100 and loses points for too few paragraphs and
    for any risk keyword found. The final score is
    ``0.7 * rule_score + 0.3 * roberta_probability``.

    Args:
        text: Full article text.
        rules: Rule dict shaped like ``QUALITY_RULES``.

    Returns:
        A dict with ``verdict`` (str), ``score_decimal`` (float in [0, 1])
        and ``flags`` (list of human-readable findings).
    """
    base_score = 100
    flags = []

    # Paragraphs are counted by blank-line separators.
    if text.count("\n\n") + 1 < rules['standard']['min_paragraphs']:
        base_score -= rules['penalties']['bad_structure']
        flags.append("⚠️ Struktur buruk (Kurang paragraf/Wall of text)")

    text_lower = text.lower()
    found_risks = [w for w in rules['risk_keywords'] if w in text_lower]
    if found_risks:
        base_score -= rules['penalties']['risk_word']
        flags.append(f"🚨 Terdeteksi kata berisiko/sensitif: {', '.join(found_risks)}")

    try:
        # NOTE(review): text[:512] slices *characters*, so only the opening of
        # the article reaches the model; max_length=512 already caps tokens.
        # Left as-is because changing it would shift scores - confirm intent.
        inputs = tokenizer(text[:512], return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = roberta_model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)[0]
        # Probability of class index 1 is used as the model's quality score.
        rob_score = float(probs[1])
    except Exception as exc:
        # Best effort: fall back to a neutral score, but do not hide the cause.
        print(f"⚠️ RoBERTa scoring failed, using neutral score 0.5: {exc}")
        rob_score = 0.5

    rule_decimal = max(0, base_score) / 100.0
    final_score = (rule_decimal * 0.7) + (rob_score * 0.3)

    if final_score >= 0.85:
        verdict = "AI-Curated (Good Quality)"
    elif final_score >= 0.60:
        verdict = "Needs Revision (Minor Issue)"
    else:
        verdict = "Needs Attention (Low Clarity/Risk)"

    return {
        "verdict": verdict,
        "score_decimal": final_score,
        "flags": flags
    }


# --- MODULE 3: SYSTEM B (LLM JUDGE) ---
def run_llm_judge(text, content_type):
    """Ask a hosted LLM to grade the article, trying ``LLM_MODELS`` in order.

    Args:
        text: Full article text (only the first 2000 characters are sent).
        content_type: Predicted content form, included in the prompt.

    Returns:
        The parsed JSON dict from the first model that answers with valid
        JSON, with an added ``model`` key naming that model; ``None`` if
        every model fails.
    """
    prompt = f"""
    Role: Senior Editor. Assess this article based on DigiFeed Parameters.
    Type: "{content_type}".

    STANDARDS:
    1. GOOD (>80): Clear idea, logical flow, grammar 90%, professional tone.
    2. REVISION (50-80): Unfocused, messy paragraphs, typos.
    3. BAD (<50): No flow, misleading, sensitive data, too technical.

    Text: "{text[:2000]}..."

    Output JSON:
    {{ "category": "GOOD/REVISION/BAD", "score": (0-100), "reason": "summary", "advice": "tip" }}
    """

    for model_id in LLM_MODELS:
        try:
            client = InferenceClient(model=model_id, token=hf_token, timeout=15)
            response = client.text_generation(prompt, max_new_tokens=250, temperature=0.4)
            json_str = response.strip()

            # Extract the JSON payload from a fenced block, or else from the
            # outermost pair of braces.
            if "```json" in json_str:
                json_str = json_str.split("```json")[1].split("```")[0]
            elif "{" in json_str:
                json_str = "{" + json_str.split("{", 1)[1].rsplit("}", 1)[0] + "}"

            res = json.loads(json_str)
            if not isinstance(res, dict):
                raise ValueError(f"expected a JSON object, got {type(res).__name__}")
            res['model'] = model_id
            return res
        except Exception as exc:
            # Any failure (network, timeout, bad JSON) moves on to the next model.
            print(f"⚠️ LLM judge failed with {model_id}: {exc}")
            continue

    return None


# ================= 4. MAIN PROCESSOR =================
def process_article(title, content):
    """Run the full analysis pipeline on one article.

    Args:
        title: Article title.
        content: Article body.

    Returns:
        ``{"is_content": False, "rejection_reason": ...}`` when the gatekeeper
        rejects the input; otherwise a dict with ``CONTENT_ANALYSIS``
        (predicted type and up to five competencies) and ``QUALITY_REPORT``
        (System A always, System B only when an LLM answered).
    """
    full_text = f"{title}\n\n{content}"

    # 1. GATEKEEPER CHECK
    is_valid, msg = run_gatekeeper(full_text)
    if not is_valid:
        return {"is_content": False, "rejection_reason": msg}

    # 2. CHUNKING & VECTOR SEARCH
    chunks = text_splitter.split_text(full_text)
    competency_candidates = []
    form_candidates = []

    if vectorstore:
        for chunk in chunks:
            # Search the FAISS index loaded from file, filtered on the
            # 'source' metadata written when the index was built.
            res_comp = vectorstore.similarity_search_with_score(chunk, k=3, filter={'source': 'competency'})
            # The index stores content forms under source 'content_form'
            # (not 'form'), so that is the only form filter queried.
            res_form = vectorstore.similarity_search_with_score(chunk, k=1, filter={'source': 'content_form'})

            for doc, score in res_comp:
                competency_candidates.append({"meta": doc.metadata, "score": score})
            for doc, score in res_form:
                form_candidates.append({"meta": doc.metadata, "score": score})

    # 3. AGGREGATION
    # Keep the best (lowest) score seen for each competency across all chunks.
    unique_comp = {}
    for item in competency_candidates:
        # Handle key-name variations in the metadata.
        name = item['meta'].get('competency', item['meta'].get('name', 'Unknown'))
        if name not in unique_comp:
            # Copy so 'best_score' is not written into the document's own
            # metadata dict, which would persist across requests.
            unique_comp[name] = dict(item['meta'])
            unique_comp[name]['best_score'] = item['score']
        elif item['score'] < unique_comp[name]['best_score']:
            unique_comp[name]['best_score'] = item['score']

    top_5_competencies = sorted(unique_comp.values(), key=lambda x: x['best_score'])[:5]

    final_competencies = []
    for comp in top_5_competencies:
        # Map the raw score (lower is better) to a (0, 1] similarity.
        sim_score = 1 / (1 + comp['best_score'])
        final_competencies.append({
            "category": comp.get('group', comp.get('category', '-')),
            "competency": comp.get('name', comp.get('competency', '-')),
            "type": comp.get('code', comp.get('type', '-')),
            "similarity_score": f"{sim_score:.4f}"
        })

    predicted_form = "General"
    if form_candidates:
        best_form = min(form_candidates, key=lambda x: x['score'])
        predicted_form = best_form['meta'].get('name', best_form['meta'].get('content_type', 'General'))

    # 4. QUALITY AUDIT
    manual_res = run_manual_audit(full_text, QUALITY_RULES)
    llm_res = run_llm_judge(full_text, predicted_form)

    # 5. CONSTRUCT OUTPUT
    result = {
        "is_content": True,
        "CONTENT_ANALYSIS": {
            "predicted_type": predicted_form,
            "top_5_competencies": final_competencies
        },
        "QUALITY_REPORT": {
            "SYSTEM_A_MANUAL": {
                "verdict": manual_res['verdict'],
                "score": f"{manual_res['score_decimal']:.4f}",
                "flags": manual_res['flags']
            }
        }
    }

    if llm_res:
        result["QUALITY_REPORT"]["SYSTEM_B_LLM"] = {
            "verdict": llm_res.get('category'),
            "score": llm_res.get('score'),
            "advice": llm_res.get('advice'),
            "served_by": llm_res.get('model')
        }

    return result


# ================= 5. UI =================
iface = gr.Interface(
    fn=process_article,
    inputs=[gr.Textbox(label="Judul"), gr.Textbox(lines=10, label="Isi Artikel (Min 500 Kata)")],
    outputs=gr.JSON(label="Hasil Analisis"),
    title="DigiFeed V6.1: Pre-computed Index",
    description="Sistem klasifikasi artikel menggunakan FAISS Index statis (Pre-loaded) untuk performa lebih cepat dan stabil."
)

iface.launch()