# DigiFeed V21 — final Article Detection with Vector Database (commit 2ad20be)
import os
import torch
import json
import re
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from huggingface_hub import login, InferenceClient
# ================= 1. AUTH & CONFIG =================
# Read credentials from the environment. Fail fast with a clear message when
# the Pinecone key is missing — the original `os.environ[...] = None` would
# otherwise die with an opaque "TypeError: str expected, not NoneType".
hf_token = os.getenv("HF_TOKEN")
pc_api_key = os.getenv("PINECONE_API_KEY")
if pc_api_key is None:
    raise RuntimeError("PINECONE_API_KEY environment variable is not set.")
os.environ['PINECONE_API_KEY'] = pc_api_key
login(token=hf_token)

# Pinecone index holding the competency/form reference vectors.
PINECONE_INDEX_NAME = "article-classifier"
# Fine-tuned RoBERTa sequence classifier used by the quality audit.
ROBERTA_PATH = "akage99/roberta-corporate-backend"
# Embedding model for vector-store queries.
BGE_MODEL_NAME = "BAAI/bge-m3"
# --- GATEKEEPER CONFIG (MAIN GUARD GATE) ---
# Hard filter: violating any of these rules rejects the article outright,
# before any model or vector-store work happens.
GATE_RULES = {
    "min_words_limit": 500,   # minimum word count an article must reach
    "max_digit_ratio": 0.3,   # reject when more than 30% of characters are digits
    "math_latex_triggers": [
        r"\\documentclass", r"\\begin\{", r"\\frac", r"\\sum", r"\\int",
        r"\$\$.*?\$\$",  # LaTeX display math $$...$$
        r"x\^2", r"a\^2", r"b\^2", r"x_i", r"y_i",  # mathematical variables
        r"sin\(", r"cos\(", r"tan\(", r"log\(",  # mathematical functions
        r"H_2O", r"CO_2",  # chemical formulas
        r"fig\.", r"eq\.", r"et al\.",  # stiff academic-journal phrasing
        r"theorem", r"lemma", r"proof"  # math-textbook vocabulary
    ]
}

# --- SCORING CONFIG (AUDIT) ---
# Soft rules: violations subtract penalty points from a 100-point base score.
QUALITY_RULES = {
    "standard": {
        "min_words": 500,  # kept equal to the gatekeeper limit for consistency
        "min_paragraphs": 3,
        "max_sentence_length": 60,
    },
    "penalties": {
        "short_content": 20,
        "bad_structure": 30,
        "long_sentence": 5,
        "risk_word": 50,
        "bad_tone": 20
    },
    # Case-insensitive substrings: leaks, profanity (Indonesian + English),
    # credentials and bribery-related terms.
    "risk_keywords": [
        "confidential", "internal use only",
        "bodoh", "goblok", "brengsek", "tolol", "bajingan", "bangsat", "fucking", "what the hell", "what the fuck", "bastard",
        "password:", "api_key", "suap", "gratifikasi", "nigga"
    ]
}

# Hosted LLM judges, tried in order until one returns parseable JSON.
LLM_MODELS = [
    "HuggingFaceH4/zephyr-7b-beta",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "google/gemma-1.1-7b-it"
]

# Minimum vector-store similarity score for a match to be kept.
MIN_RELEVANCE_SCORE = 0.35
# ================= 2. SETUP ENGINE =================
# Load the classifier, embeddings and vector store once at startup; these are
# module-level singletons reused by every request.
print("Loading Models...")
tokenizer = AutoTokenizer.from_pretrained(ROBERTA_PATH)
roberta_model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_PATH)
embeddings = HuggingFaceEmbeddings(model_name=BGE_MODEL_NAME)
# Connect to the pre-populated Pinecone index; documents store their raw text
# under the "text" metadata key.
vectorstore = PineconeVectorStore.from_existing_index(
    index_name=PINECONE_INDEX_NAME, embedding=embeddings, text_key="text"
)
# Overlapping chunks so matches are not lost at chunk boundaries.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
print("System Ready!")
# ================= 3. GATEKEEPER (HARD FILTER) =================
def run_gatekeeper_check(text):
    """Apply the hard rejection rules from GATE_RULES.

    An article passes only when it:
      1. has at least the minimum word count,
      2. is not purely numeric and not digit-dominated, and
      3. contains no scientific/LaTeX math markers.

    Returns:
        (passed, message): passed is a bool; message is "PASS" or a
        human-readable rejection reason (in Indonesian, shown to the user).
    """
    # Rule 1: minimum word count (whitespace-separated tokens).
    word_count = len(text.strip().split())
    if word_count < GATE_RULES['min_words_limit']:
        return False, f"REJECTED: Jumlah kata {word_count}. Minimal wajib {GATE_RULES['min_words_limit']} kata."

    # Rule 2: spam guard — nothing but digits once whitespace and
    # number punctuation are stripped.
    stripped = re.sub(r'[\s.,\-\+]', '', text)
    if stripped.isdigit():
        return False, "REJECTED: Input hanya berisi angka/nomor (Spam)."

    # Rule 3: digit-heavy content (raw financial data / tables).
    total_chars = len(text)
    if total_chars > 0:
        digit_ratio = sum(ch.isdigit() for ch in text) / total_chars
        if digit_ratio > GATE_RULES['max_digit_ratio']:
            return False, "REJECTED: Terlalu banyak angka"

    # Rule 4: scientific/LaTeX markers mean this is not a popular article.
    for pattern in GATE_RULES['math_latex_triggers']:
        if re.search(pattern, text, re.IGNORECASE):
            return False, f"REJECTED: Terdeteksi rumus/simbol Matematika Ilmiah ('{pattern}'). Tidak sesuai format artikel populer."

    return True, "PASS"
# ================= 4. SCORING FUNCTIONS =================
def run_smart_quality_audit(text, rules):
    """Rule-based + RoBERTa quality audit.

    Blends a rule score (word count, structure, risk keywords; weight 0.7)
    with the RoBERTa positive-class probability (weight 0.3).

    Args:
        text: Full article text (title + body).
        rules: Config dict shaped like QUALITY_RULES, with "standard",
            "penalties" and "risk_keywords" keys.

    Returns:
        dict with "verdict" (str), "formatted_score" (4-decimal string in
        [0, 1]) and "violations" (list of human-readable flags).
    """
    base_score = 100
    flags = []

    # Word-count check. FIX: word_count was computed but never used, leaving
    # the configured "short_content" penalty dead — now it is applied.
    # (In the pipeline, texts < 500 words are already rejected by the
    # gatekeeper, so existing pipeline results are unchanged.)
    word_count = len(text.split())
    if word_count < rules['standard']['min_words']:
        base_score -= rules['penalties']['short_content']
        flags.append("Konten terlalu pendek (Kurang kata)")

    # Structure check: paragraphs are separated by blank lines.
    paragraph_count = text.count("\n\n") + 1
    if paragraph_count < rules['standard']['min_paragraphs']:
        base_score -= rules['penalties']['bad_structure']
        flags.append("Struktur buruk (Kurang paragraf)")

    # Risk keywords: case-insensitive substring match.
    text_lower = text.lower()
    found_risks = [w for w in rules['risk_keywords'] if w in text_lower]
    if found_risks:
        base_score -= rules['penalties']['risk_word']
        flags.append(f"Kata berisiko: {', '.join(found_risks)}")

    # RoBERTa probability of the positive class (index 1); fall back to a
    # neutral 0.5 if the model call fails for any reason.
    try:
        inputs = tokenizer(text[:512], return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = roberta_model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)[0]
        rob_score = float(probs[1])
    except Exception:  # FIX: was a bare except (also caught KeyboardInterrupt/SystemExit)
        rob_score = 0.5

    # Weighted blend: 70% rules, 30% model.
    rule_score_decimal = max(0, base_score) / 100.0
    final_decimal_score = (rule_score_decimal * 0.7) + (rob_score * 0.3)

    if final_decimal_score >= 0.85:
        verdict = "AI-Curated (Good Quality)"
    elif final_decimal_score >= 0.60:
        verdict = "Needs Revision (Minor Issue)"
    else:
        verdict = "Needs Attention (Low Clarity/Risk)"

    return {
        "verdict": verdict,
        "formatted_score": f"{final_decimal_score:.4f}",
        "violations": flags
    }
def evaluate_llm_judge(text, predicted_type):
    """Grade the article with a hosted LLM, with model fallback.

    Tries each model in LLM_MODELS in order and returns the first
    successfully parsed JSON verdict (dict, with a "model" key added
    identifying which model answered), or None when every model fails
    or returns unparseable output.
    """
    prompt = f"""
Role: Editor. Assess type: "{predicted_type}".
Rules: GOOD (>80), REVISION (50-80), BAD (<50).
Text: "{text[:2000]}..."
Output JSON: {{ "category": "GOOD/REVISION/BAD", "score": (0-100), "reason": "summary", "advice": "tip" }}
"""
    for model_id in LLM_MODELS:
        try:
            client = InferenceClient(model=model_id, token=hf_token, timeout=15)
            response = client.text_generation(prompt, max_new_tokens=250, temperature=0.4)
            json_str = response.strip()
            # Strip a markdown ```json fence, or grab the outermost {...} span.
            if "```json" in json_str:
                json_str = json_str.split("```json")[1].split("```")[0]
            elif "{" in json_str:
                json_str = "{" + json_str.split("{", 1)[1].rsplit("}", 1)[0] + "}"
            res = json.loads(json_str)
            res["model"] = model_id
            return res
        except Exception:  # FIX: was a bare except; fall through to the next model
            continue
    return None
# ================= 5. MAIN PIPELINE =================
def process_article_final(title, content):
    """Full detection pipeline: gatekeeper -> retrieval -> quality audits.

    Args:
        title: Article title.
        content: Article body.

    Returns:
        JSON-serializable dict. On gatekeeper rejection it contains only
        is_content=False and "rejection_reason"; otherwise content analysis
        plus the rule-based audit and (when available) the LLM judge report.
    """
    full_text = f"{title}\n\n{content}"

    # --- 1. Gatekeeper: hard filter, rejects before any model work ---
    is_valid, message = run_gatekeeper_check(full_text)
    if not is_valid:
        return {
            "is_content": False,
            "rejection_reason": message
        }

    # --- 2. Retrieval-based classification over overlapping chunks ---
    chunks = text_splitter.split_text(full_text)
    candidate_competencies = []
    candidate_forms = []
    for chunk in chunks:
        res_comp = vectorstore.similarity_search_with_score(query=chunk, k=3, filter={'source': 'competency'})
        res_form = vectorstore.similarity_search_with_score(query=chunk, k=1, filter={'source': 'form'})
        for doc, score in res_comp:
            if score >= MIN_RELEVANCE_SCORE:
                candidate_competencies.append({"data": doc.metadata, "score": score})
        for doc, score in res_form:
            if score >= MIN_RELEVANCE_SCORE:
                candidate_forms.append({"data": doc.metadata, "score": score})

    # Deduplicate competencies, keeping the best score seen per name.
    unique_comp = {}
    for item in candidate_competencies:
        name = item['data'].get('competency')
        if not name:
            continue
        if name not in unique_comp:
            unique_comp[name] = {
                "max_score": 0,
                "category": item['data'].get('category', '-'),
                "type_code": item['data'].get('type', '-')
            }
        unique_comp[name]["max_score"] = max(unique_comp[name]["max_score"], item['score'])

    # FIX: previously sorted on the *formatted string* score, which is only
    # correct by accident for non-negative scores below 10; sort numerically
    # first, then format for display.
    top_comp = sorted(unique_comp.items(), key=lambda kv: kv[1]['max_score'], reverse=True)[:5]
    final_competencies = [
        {
            "category": v['category'],
            "competency": k,
            "type": v['type_code'],
            "similarity_score": f"{v['max_score']:.4f}"
        }
        for k, v in top_comp
    ]

    # Best-scoring "form" match decides the predicted content type.
    predicted_form = "General"
    if candidate_forms:
        best = max(candidate_forms, key=lambda x: x['score'])
        predicted_form = best['data'].get('content_type', 'General')

    # --- 3. Quality audits: rule-based (System A) + LLM judge (System B) ---
    manual_audit = run_smart_quality_audit(full_text, QUALITY_RULES)
    llm_audit = evaluate_llm_judge(full_text, predicted_form)

    result = {
        "is_content": True,
        "CONTENT_ANALYSIS": {
            "predicted_content_type": predicted_form,
            "matched_competencies": final_competencies
        },
        "QUALITY_REPORT": {
            "SYSTEM_A_SMART_AUDIT": {
                "verdict": manual_audit['verdict'],
                "confidence_score": manual_audit['formatted_score'],
                "violations_found": manual_audit['violations']
            }
        }
    }
    # The LLM judge is best-effort: omit the section when all models failed.
    if llm_audit is not None:
        result["QUALITY_REPORT"]["SYSTEM_B_LLM_JUDGE"] = {
            "verdict": llm_audit.get('category'),
            "score": llm_audit.get('score'),
            "advice": llm_audit.get('advice'),
            "served_by": llm_audit.get('model')
        }
    return result
# ================= 6. WEB UI =================
# Minimal Gradio front-end: title + body textboxes in, raw JSON verdict out.
iface = gr.Interface(
    fn=process_article_final,
    inputs=[gr.Textbox(label="Judul"), gr.Textbox(lines=8, label="Isi")],
    outputs=gr.JSON(label="Hasil Deteksi Artikel (V21)"),
    title="DigiFeed V21: Article Detection with Vector Database",
    description="Filtering Digifeed Article"
)
iface.launch()