import os
import torch
import json
import re
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from huggingface_hub import login, InferenceClient

# ================= 1. AUTH & CONFIG =================
hf_token = os.getenv("HF_TOKEN")
pc_api_key = os.getenv("PINECONE_API_KEY")

# Fail fast with a clear message: assigning None into os.environ raises an
# opaque TypeError, and login(token=None) fails much later in the pipeline.
if not hf_token or not pc_api_key:
    raise RuntimeError("HF_TOKEN and PINECONE_API_KEY environment variables must both be set.")

os.environ['PINECONE_API_KEY'] = pc_api_key
login(token=hf_token)

PINECONE_INDEX_NAME = "article-classifier"
ROBERTA_PATH = "akage99/roberta-corporate-backend"
BGE_MODEL_NAME = "BAAI/bge-m3"

# --- GATEKEEPER CONFIG (hard filter, checked before any scoring) ---
GATE_RULES = {
    "min_words_limit": 500,        # articles below this word count are rejected outright
    "max_digit_ratio": 0.3,        # more than 30% digits => raw data / financial dump
    # NOTE(review): these patterns are searched as plain regex substrings
    # anywhere in the text (e.g. r"fig\." also hits "config.", r"sin\(" hits
    # "basin("); add word boundaries (\b) if false rejections become a problem.
    "math_latex_triggers": [
        r"\\documentclass", r"\\begin\{", r"\\frac", r"\\sum", r"\\int",
        r"\$\$.*?\$\$",                                # LaTeX display math $$...$$
        r"x\^2", r"a\^2", r"b\^2", r"x_i", r"y_i",     # math variables
        r"sin\(", r"cos\(", r"tan\(", r"log\(",        # math functions
        r"H_2O", r"CO_2",                              # chemical formulas
        r"fig\.", r"eq\.", r"et al\.",                 # stiff academic-journal phrasing
        r"theorem", r"lemma", r"proof"                 # math-textbook wording
    ]
}

# --- SCORING CONFIG (quality audit) ---
QUALITY_RULES = {
    "standard": {
        "min_words": 500,          # kept equal to the gatekeeper limit for consistency
        "min_paragraphs": 3,
        "max_sentence_length": 60,
    },
    "penalties": {
        "short_content": 20,
        "bad_structure": 30,
        "long_sentence": 5,
        "risk_word": 50,
        "bad_tone": 20
    },
    "risk_keywords": [
        "confidential", "internal use only",
        "bodoh", "goblok", "brengsek", "tolol", "bajingan", "bangsat",
        "fucking", "what the hell", "what the fuck", "bastard",
        "password:", "api_key",
        "suap", "gratifikasi", "nigga"
    ]
}

# Fallback chain for the LLM judge: tried in order until one responds.
LLM_MODELS = [
    "HuggingFaceH4/zephyr-7b-beta",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "google/gemma-1.1-7b-it"
]

# Minimum vector-similarity score for a chunk match to count as relevant.
MIN_RELEVANCE_SCORE = 0.35
# ================= 2. SETUP ENGINE =================
print("Loading Models...")

tokenizer = AutoTokenizer.from_pretrained(ROBERTA_PATH)
roberta_model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_PATH)
embeddings = HuggingFaceEmbeddings(model_name=BGE_MODEL_NAME)
vectorstore = PineconeVectorStore.from_existing_index(
    index_name=PINECONE_INDEX_NAME,
    embedding=embeddings,
    text_key="text"
)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

print("System Ready!")


# ================= 3. GATEKEEPER (HARD FILTER) =================
def run_gatekeeper_check(text):
    """Hard pre-filter applied before any scoring.

    Rejects input that (1) has fewer than 500 words, (2) consists of digits
    only, (3) is digit-heavy (raw data dumps), or (4) contains LaTeX /
    scientific-math markers.

    Returns:
        (bool, str): (True, "PASS") when the text is acceptable, otherwise
        (False, "REJECTED: ...") with a human-readable reason.
    """
    # Word-count gate: whitespace-delimited tokens.
    word_count = len(text.split())
    if word_count < GATE_RULES['min_words_limit']:
        return False, f"REJECTED: Jumlah kata {word_count}. Minimal wajib {GATE_RULES['min_words_limit']} kata."

    # Digits-only gate: drop whitespace and common numeric punctuation,
    # then check whether nothing but digits remains.
    condensed = re.sub(r'[\s.,\-\+]', '', text)
    if condensed.isdigit():
        return False, "REJECTED: Input hanya berisi angka/nomor (Spam)."

    # Digit-ratio gate: too many digits suggests a financial report / raw data.
    total_chars = len(text)
    if total_chars > 0:
        digit_ratio = sum(ch.isdigit() for ch in text) / total_chars
        if digit_ratio > GATE_RULES['max_digit_ratio']:
            return False, "REJECTED: Terlalu banyak angka"

    # Scientific-content gate: first matching LaTeX/math trigger rejects.
    hit = next(
        (p for p in GATE_RULES['math_latex_triggers'] if re.search(p, text, re.IGNORECASE)),
        None
    )
    if hit is not None:
        return False, f"REJECTED: Terdeteksi rumus/simbol Matematika Ilmiah ('{hit}'). Tidak sesuai format artikel populer."

    return True, "PASS"
# ================= 4. SCORING FUNCTIONS =================
def run_smart_quality_audit(text, rules):
    """Hybrid quality audit: rule-based penalties blended with a RoBERTa score.

    Args:
        text: Full article text (title + body).
        rules: A QUALITY_RULES-shaped dict.

    Returns:
        dict with 'verdict', 'formatted_score' (4-decimal string in [0, 1]),
        and 'violations' (list of human-readable flags).

    NOTE(review): only 'bad_structure' and 'risk_word' penalties are applied;
    the 'short_content', 'long_sentence' and 'bad_tone' penalties declared in
    QUALITY_RULES are currently dead config (short content is already rejected
    by the gatekeeper). The previously computed word count was unused and has
    been removed.
    """
    base_score = 100
    flags = []

    # Structure check: paragraphs are assumed separated by blank lines.
    paragraph_count = text.count("\n\n") + 1
    if paragraph_count < rules['standard']['min_paragraphs']:
        base_score -= rules['penalties']['bad_structure']
        flags.append("Struktur buruk (Kurang paragraf)")

    # Risk-keyword check (case-insensitive substring match).
    text_lower = text.lower()
    found_risks = [w for w in rules['risk_keywords'] if w in text_lower]
    if found_risks:
        base_score -= rules['penalties']['risk_word']
        flags.append(f"Kata berisiko: {', '.join(found_risks)}")

    # RoBERTa probability for class index 1 (presumably the "good" label —
    # TODO confirm against the model's label mapping). Narrowed from a bare
    # `except:` so KeyboardInterrupt/SystemExit are no longer swallowed;
    # any model failure falls back to a neutral 0.5.
    try:
        inputs = tokenizer(text[:512], return_tensors="pt", truncation=True,
                           padding=True, max_length=512)
        with torch.no_grad():
            outputs = roberta_model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)[0]
        rob_score = float(probs[1])
    except Exception:
        rob_score = 0.5

    # Weighted blend: 70% rule score, 30% model score.
    rule_score_decimal = max(0, base_score) / 100.0
    final_decimal_score = (rule_score_decimal * 0.7) + (rob_score * 0.3)

    if final_decimal_score >= 0.85:
        verdict = "AI-Curated (Good Quality)"
    elif final_decimal_score >= 0.60:
        verdict = "Needs Revision (Minor Issue)"
    else:
        verdict = "Needs Attention (Low Clarity/Risk)"

    return {
        "verdict": verdict,
        "formatted_score": f"{final_decimal_score:.4f}",
        "violations": flags
    }


def evaluate_llm_judge(text, predicted_type):
    """Ask a hosted LLM to grade the article; try each fallback model in order.

    Returns the parsed JSON verdict dict (with a 'model' key added) or None
    when every model in LLM_MODELS fails or returns unparseable output.
    """
    prompt = f"""
Role: Editor. Assess type: "{predicted_type}".
Rules: GOOD (>80), REVISION (50-80), BAD (<50).
Text: "{text[:2000]}..."
Output JSON:
{{ "category": "GOOD/REVISION/BAD", "score": (0-100), "reason": "summary", "advice": "tip" }}
"""
    for model_id in LLM_MODELS:
        try:
            client = InferenceClient(model=model_id, token=hf_token, timeout=15)
            response = client.text_generation(prompt, max_new_tokens=250, temperature=0.4)
            json_str = response.strip()
            # Strip a markdown ```json fence, or grab the outermost {...} span.
            if "```json" in json_str:
                json_str = json_str.split("```json")[1].split("```")[0]
            elif "{" in json_str:
                json_str = "{" + json_str.split("{", 1)[1].rsplit("}", 1)[0] + "}"
            res = json.loads(json_str)
            res["model"] = model_id
            return res
        except Exception:
            # Narrowed from bare `except:`; any API/parse failure moves on
            # to the next fallback model.
            continue
    return None


# ================= 5. MAIN PIPELINE =================
def process_article_final(title, content):
    """Full detection pipeline: gatekeeper -> vector classification -> audits.

    Returns a JSON-serializable dict: either a rejection payload
    ({"is_content": False, "rejection_reason": ...}) or the full analysis
    with CONTENT_ANALYSIS and QUALITY_REPORT sections.
    """
    full_text = f"{title}\n\n{content}"

    # --- 1. Gatekeeper (hard filter) ---
    is_valid, message = run_gatekeeper_check(full_text)
    if not is_valid:
        return {
            "is_content": False,
            "rejection_reason": message
        }

    # --- 2. Vector classification per chunk ---
    chunks = text_splitter.split_text(full_text)
    candidate_competencies = []
    candidate_forms = []
    for chunk in chunks:
        res_comp = vectorstore.similarity_search_with_score(query=chunk, k=3, filter={'source': 'competency'})
        res_form = vectorstore.similarity_search_with_score(query=chunk, k=1, filter={'source': 'form'})
        for doc, score in res_comp:
            if score >= MIN_RELEVANCE_SCORE:
                candidate_competencies.append({"data": doc.metadata, "score": score})
        for doc, score in res_form:
            if score >= MIN_RELEVANCE_SCORE:
                candidate_forms.append({"data": doc.metadata, "score": score})

    # Deduplicate competencies by name, keeping the best score seen.
    unique_comp = {}
    for item in candidate_competencies:
        name = item['data'].get('competency')
        if not name:
            continue
        if name not in unique_comp:
            unique_comp[name] = {
                "max_score": 0,
                "category": item['data'].get('category', '-'),
                "type_code": item['data'].get('type', '-')
            }
        unique_comp[name]["max_score"] = max(unique_comp[name]["max_score"], item['score'])

    # BUGFIX: sort on the numeric score, not its formatted-string form —
    # lexicographic ordering of "%.4f" strings breaks as soon as scores
    # differ in integer width (e.g. "10.0000" < "9.0000" as strings).
    top_items = sorted(unique_comp.items(), key=lambda kv: kv[1]['max_score'], reverse=True)[:5]
    final_competencies = [
        {
            "category": v['category'],
            "competency": k,
            "type": v['type_code'],
            "similarity_score": f"{v['max_score']:.4f}"
        }
        for k, v in top_items
    ]

    # Best-scoring "form" match decides the predicted content type.
    predicted_form = "General"
    if candidate_forms:
        best = max(candidate_forms, key=lambda x: x['score'])
        predicted_form = best['data'].get('content_type', 'General')

    # --- 3. Quality audits (rule-based + optional LLM judge) ---
    manual_audit = run_smart_quality_audit(full_text, QUALITY_RULES)
    llm_audit = evaluate_llm_judge(full_text, predicted_form)

    result = {
        "is_content": True,
        "CONTENT_ANALYSIS": {
            "predicted_content_type": predicted_form,
            "matched_competencies": final_competencies
        },
        "QUALITY_REPORT": {
            "SYSTEM_A_SMART_AUDIT": {
                "verdict": manual_audit['verdict'],
                "confidence_score": manual_audit['formatted_score'],
                "violations_found": manual_audit['violations']
            }
        }
    }
    # The LLM judge is best-effort: omit its section when all models failed.
    if llm_audit is not None:
        result["QUALITY_REPORT"]["SYSTEM_B_LLM_JUDGE"] = {
            "verdict": llm_audit.get('category'),
            "score": llm_audit.get('score'),
            "advice": llm_audit.get('advice'),
            "served_by": llm_audit.get('model')
        }
    return result


# ================= 6. WEB UI =================
iface = gr.Interface(
    fn=process_article_final,
    inputs=[gr.Textbox(label="Judul"), gr.Textbox(lines=8, label="Isi")],
    outputs=gr.JSON(label="Hasil Deteksi Artikel (V21)"),
    title="DigiFeed V21: Article Detection with Vector Database",
    description="Filtering Digifeed Article"
)
iface.launch()