Spaces:
Sleeping
Sleeping
| import os | |
| import torch | |
| import json | |
| import re | |
| import gradio as gr | |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_pinecone import PineconeVectorStore | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from huggingface_hub import login, InferenceClient | |
# ================= 1. AUTH & CONFIG =================
# Credentials come from the environment. Fail fast with a clear message when
# the Pinecone key is missing, instead of the opaque TypeError raised by
# assigning None into os.environ.
hf_token = os.getenv("HF_TOKEN")
pc_api_key = os.getenv("PINECONE_API_KEY")
if pc_api_key is None:
    raise RuntimeError("PINECONE_API_KEY environment variable is not set.")
os.environ['PINECONE_API_KEY'] = pc_api_key
login(token=hf_token)

PINECONE_INDEX_NAME = "article-classifier"
ROBERTA_PATH = "akage99/roberta-corporate-backend"
BGE_MODEL_NAME = "BAAI/bge-m3"
# --- GATEKEEPER CONFIG (MAIN GATE) ---
# Regex triggers that mark scientific / LaTeX / textbook content; any match
# causes an outright rejection (not popular-article format).
_MATH_LATEX_TRIGGERS = [
    r"\\documentclass", r"\\begin\{", r"\\frac", r"\\sum", r"\\int",
    r"\$\$.*?\$\$",                              # display-math blocks $$...$$
    r"x\^2", r"a\^2", r"b\^2", r"x_i", r"y_i",   # algebraic variables
    r"sin\(", r"cos\(", r"tan\(", r"log\(",      # math functions
    r"H_2O", r"CO_2",                            # chemical formulas
    r"fig\.", r"eq\.", r"et al\.",               # stiff journal phrasing
    r"theorem", r"lemma", r"proof",              # math-textbook vocabulary
]

# Hard-filter thresholds consumed by run_gatekeeper_check().
GATE_RULES = {
    "min_words_limit": 500,   # reject anything shorter than this many words
    "max_digit_ratio": 0.3,   # reject when more than 30% of characters are digits
    "math_latex_triggers": _MATH_LATEX_TRIGGERS,
}
# --- SCORING CONFIG (AUDIT) ---
# Thresholds and point deductions used by run_smart_quality_audit().
QUALITY_RULES = {
    # Minimum structural standards an article must meet.
    "standard": {
        "min_words": 500,   # kept equal to the gatekeeper limit for consistency
        "min_paragraphs": 3,
        "max_sentence_length": 60,
    },
    # Points subtracted from the 100-point base score per violation.
    "penalties": {
        "short_content": 20,
        "bad_structure": 30,
        "long_sentence": 5,
        "risk_word": 50,
        "bad_tone": 20,
    },
    # Case-insensitive substrings that flag risky or inappropriate content.
    "risk_keywords": [
        "confidential", "internal use only",
        "bodoh", "goblok", "brengsek", "tolol", "bajingan", "bangsat", "fucking", "what the hell", "what the fuck", "bastard",
        "password:", "api_key", "suap", "gratifikasi", "nigga"
    ],
}
# Remote judge models, tried in order until one returns parseable JSON.
LLM_MODELS = [
    "HuggingFaceH4/zephyr-7b-beta",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "google/gemma-1.1-7b-it",
]

# Vector-search hits scoring below this threshold are discarded.
MIN_RELEVANCE_SCORE = 0.35
# ================= 2. SETUP ENGINE =================
print("Loading Models...")

# Fine-tuned RoBERTa classifier used as one signal inside the quality audit.
tokenizer = AutoTokenizer.from_pretrained(ROBERTA_PATH)
roberta_model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_PATH)

# BGE-M3 embeddings backing the pre-existing Pinecone index.
embeddings = HuggingFaceEmbeddings(model_name=BGE_MODEL_NAME)
vectorstore = PineconeVectorStore.from_existing_index(
    index_name=PINECONE_INDEX_NAME,
    embedding=embeddings,
    text_key="text",
)

# Splitter used to chunk articles before similarity search.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
print("System Ready!")
| # ================= 3. FUNGSI GATEKEEPER (FILTER KERAS) ================= | |
# ================= 3. GATEKEEPER FUNCTION (HARD FILTER) =================
def run_gatekeeper_check(text, rules=None):
    """Hard pre-filter that rejects text before any expensive processing.

    Rejection criteria, in order:
      1. Fewer words than ``rules['min_words_limit']``.
      2. Content that is nothing but digits (spam).
      3. A digit ratio above ``rules['max_digit_ratio']`` (raw data dumps).
      4. Any scientific/LaTeX trigger pattern (not popular-article format).

    Args:
        text:  Full article text (title + body).
        rules: Optional rule dict with the keys above. Defaults to the
               module-level GATE_RULES, so existing callers are unaffected.

    Returns:
        (True, "PASS") when acceptable, otherwise (False, "REJECTED: ...")
        with a human-readable reason.
    """
    if rules is None:
        rules = GATE_RULES

    # 1. Word count: whitespace-separated tokens.
    word_count = len(text.strip().split())
    if word_count < rules['min_words_limit']:
        return False, f"REJECTED: Jumlah kata {word_count}. Minimal wajib {rules['min_words_limit']} kata."

    # 2. Digits-only spam: strip whitespace and common separators first.
    clean_text = re.sub(r'[\s.,\-\+]', '', text)
    if clean_text.isdigit():
        return False, "REJECTED: Input hanya berisi angka/nomor (Spam)."

    # 3. Digit ratio (financial reports / raw numeric data).
    digit_count = sum(c.isdigit() for c in text)
    total_chars = len(text)
    if total_chars > 0 and (digit_count / total_chars) > rules['max_digit_ratio']:
        return False, "REJECTED: Terlalu banyak angka"

    # 4. Scientific / LaTeX content check.
    # NOTE(review): substring patterns like r"fig\." also match inside words
    # such as "config." -- possible false positives; confirm intent upstream.
    for pattern in rules['math_latex_triggers']:
        if re.search(pattern, text, re.IGNORECASE):
            return False, f"REJECTED: Terdeteksi rumus/simbol Matematika Ilmiah ('{pattern}'). Tidak sesuai format artikel populer."

    return True, "PASS"
| # ================= 4. FUNGSI PENILAIAN ================= | |
# ================= 4. SCORING FUNCTION =================
def run_smart_quality_audit(text, rules):
    """Rule-based quality audit blended with a RoBERTa confidence signal.

    Scoring: start from 100 points, subtract penalties per violation, then
    blend the normalized rule score (70%) with the RoBERTa positive-class
    probability (30%).

    Args:
        text:  Full article text.
        rules: Dict shaped like QUALITY_RULES ('standard', 'penalties',
               'risk_keywords').

    Returns:
        Dict with 'verdict', 'formatted_score' (4-decimal string) and
        'violations' (list of human-readable flags).
    """
    base_score = 100
    flags = []

    # Word count check. Bug fix: the count was previously computed but the
    # configured 'short_content' penalty was never applied.
    word_count = len(text.split())
    if word_count < rules['standard']['min_words']:
        base_score -= rules['penalties']['short_content']
        flags.append("Konten terlalu pendek (Kurang kata)")

    # Structure check: paragraphs are assumed separated by blank lines.
    paragraph_count = text.count("\n\n") + 1
    if paragraph_count < rules['standard']['min_paragraphs']:
        base_score -= rules['penalties']['bad_structure']
        flags.append("Struktur buruk (Kurang paragraf)")

    # Risk keyword check (case-insensitive substring match).
    text_lower = text.lower()
    found_risks = [w for w in rules['risk_keywords'] if w in text_lower]
    if found_risks:
        base_score -= rules['penalties']['risk_word']
        flags.append(f"Kata berisiko: {', '.join(found_risks)}")

    # RoBERTa signal: probability of class index 1 (presumably the
    # "good quality" class -- TODO confirm against the fine-tuned model).
    # Falls back to a neutral 0.5 if the model call fails for any reason.
    try:
        inputs = tokenizer(text[:512], return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = roberta_model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)[0]
        rob_score = float(probs[1])
    except Exception:  # narrowed from a bare except; model step is best-effort
        rob_score = 0.5

    # Blend: 70% rule score, 30% model score; rule score floored at 0.
    rule_score_decimal = max(0, base_score) / 100.0
    final_decimal_score = (rule_score_decimal * 0.7) + (rob_score * 0.3)

    if final_decimal_score >= 0.85:
        verdict = "AI-Curated (Good Quality)"
    elif final_decimal_score >= 0.60:
        verdict = "Needs Revision (Minor Issue)"
    else:
        verdict = "Needs Attention (Low Clarity/Risk)"

    return {
        "verdict": verdict,
        "formatted_score": f"{final_decimal_score:.4f}",
        "violations": flags,
    }
def evaluate_llm_judge(text, predicted_type, models=None):
    """Ask a remote LLM to grade the article, with model fallback.

    Tries each model id in ``models`` (default: LLM_MODELS) in order; the
    first one that returns parseable JSON wins. Network or parse failures
    simply move on to the next model.

    Args:
        text:           Article text; only the first 2000 chars are sent.
        predicted_type: Content type from the vector-search step, given to
                        the judge as context.
        models:         Optional list of model ids. Defaults to LLM_MODELS,
                        so existing callers are unaffected.

    Returns:
        The parsed judge dict with an added 'model' key, or None when every
        model failed.
    """
    if models is None:
        models = LLM_MODELS

    prompt = f"""
Role: Editor. Assess type: "{predicted_type}".
Rules: GOOD (>80), REVISION (50-80), BAD (<50).
Text: "{text[:2000]}..."
Output JSON: {{ "category": "GOOD/REVISION/BAD", "score": (0-100), "reason": "summary", "advice": "tip" }}
"""
    for model_id in models:
        try:
            client = InferenceClient(model=model_id, token=hf_token, timeout=15)
            response = client.text_generation(prompt, max_new_tokens=250, temperature=0.4)
            json_str = response.strip()
            # Extract JSON from a markdown fence, or from the outermost braces.
            if "```json" in json_str:
                json_str = json_str.split("```json")[1].split("```")[0]
            elif "{" in json_str:
                json_str = "{" + json_str.split("{", 1)[1].rsplit("}", 1)[0] + "}"
            res = json.loads(json_str)
            res["model"] = model_id
            return res
        except Exception:  # narrowed from a bare except; try the next model
            continue
    return None
| # ================= 5. LOGIC UTAMA ================= | |
# ================= 5. MAIN LOGIC =================
def process_article_final(title, content):
    """End-to-end pipeline: gatekeeper -> classification -> quality audits.

    Args:
        title:   Article title.
        content: Article body.

    Returns:
        On gatekeeper rejection: {'is_content': False, 'rejection_reason': ...}.
        Otherwise: {'is_content': True, 'CONTENT_ANALYSIS': ...,
        'QUALITY_REPORT': ...} with an optional SYSTEM_B_LLM_JUDGE section
        when the remote judge responded.
    """
    full_text = f"{title}\n\n{content}"

    # --- 1. GATEKEEPER CHECK (hard filter) ---
    is_valid, message = run_gatekeeper_check(full_text)
    if not is_valid:
        return {
            "is_content": False,
            "rejection_reason": message,
        }

    # --- 2. CLASSIFICATION via vector search over chunks ---
    chunks = text_splitter.split_text(full_text)
    candidate_competencies = []
    candidate_forms = []
    for chunk in chunks:
        res_comp = vectorstore.similarity_search_with_score(query=chunk, k=3, filter={'source': 'competency'})
        res_form = vectorstore.similarity_search_with_score(query=chunk, k=1, filter={'source': 'form'})
        for doc, score in res_comp:
            if score >= MIN_RELEVANCE_SCORE:
                candidate_competencies.append({"data": doc.metadata, "score": score})
        for doc, score in res_form:
            if score >= MIN_RELEVANCE_SCORE:
                candidate_forms.append({"data": doc.metadata, "score": score})

    # Deduplicate competencies, keeping each one's best score.
    unique_comp = {}
    for item in candidate_competencies:
        name = item['data'].get('competency')
        if not name:
            continue
        if name not in unique_comp:
            unique_comp[name] = {
                "max_score": 0,
                "category": item['data'].get('category', '-'),
                "type_code": item['data'].get('type', '-'),
            }
        unique_comp[name]["max_score"] = max(unique_comp[name]["max_score"], item['score'])

    # Top-5 by score. Bug fix: sort on the numeric score rather than the
    # formatted string (string sort misorders negative or multi-digit values).
    ranked = sorted(unique_comp.items(), key=lambda kv: kv[1]['max_score'], reverse=True)[:5]
    final_competencies = [
        {
            "category": v['category'],
            "competency": k,
            "type": v['type_code'],
            "similarity_score": f"{v['max_score']:.4f}",
        }
        for k, v in ranked
    ]

    # Best-matching content form, falling back to "General".
    predicted_form = "General"
    if candidate_forms:
        best = max(candidate_forms, key=lambda x: x['score'])
        predicted_form = best['data'].get('content_type', 'General')

    # --- 3. QUALITY AUDITS (rule-based + optional remote LLM judge) ---
    manual_audit = run_smart_quality_audit(full_text, QUALITY_RULES)
    llm_audit = evaluate_llm_judge(full_text, predicted_form)

    result = {
        "is_content": True,
        "CONTENT_ANALYSIS": {
            "predicted_content_type": predicted_form,
            "matched_competencies": final_competencies,
        },
        "QUALITY_REPORT": {
            "SYSTEM_A_SMART_AUDIT": {
                "verdict": manual_audit['verdict'],
                "confidence_score": manual_audit['formatted_score'],
                "violations_found": manual_audit['violations'],
            },
        },
    }
    # The LLM judge section is optional: it is omitted when every model failed.
    if llm_audit is not None:
        result["QUALITY_REPORT"]["SYSTEM_B_LLM_JUDGE"] = {
            "verdict": llm_audit.get('category'),
            "score": llm_audit.get('score'),
            "advice": llm_audit.get('advice'),
            "served_by": llm_audit.get('model'),
        }
    return result
# ================= 6. WEB UI =================
# Gradio front-end: two text inputs (title + body), JSON result panel.
iface = gr.Interface(
    fn=process_article_final,
    inputs=[
        gr.Textbox(label="Judul"),
        gr.Textbox(lines=8, label="Isi"),
    ],
    outputs=gr.JSON(label="Hasil Deteksi Artikel (V21)"),
    title="DigiFeed V21: Article Detection with Vector Database",
    description="Filtering Digifeed Article",
)
iface.launch()