import os
import torch
import json
import re
import gradio as gr
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from huggingface_hub import login, InferenceClient

# ================= 1. CONFIGURATION & RULES =================
# Hugging Face access token, read from the HF_TOKEN environment variable.
# It is reused later when building InferenceClient instances.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    # login(token=None) falls back to an interactive prompt, which a headless
    # deployment cannot answer, so skip it and continue unauthenticated.
    print("⚠️ HF_TOKEN is not set: skipping Hugging Face login (authenticated requests may be rejected).")

# Model identifiers passed to the Hugging Face loaders below.
ROBERTA_PATH = "akage99/roberta-corporate-backend"
BGE_MODEL_NAME = "BAAI/bge-m3"

# --- A. GATEKEEPER RULES ---
GATE_RULES = {
    "min_words": 500,        # minimum number of words required
    "max_digit_ratio": 0.3,  # digits may make up at most 30% of all characters
    # Regular expressions that mark a text as technical/math/LaTeX. They are
    # searched case-insensitively by run_gatekeeper. The abbreviation
    # patterns ("fig.", "eq.", "et al.") are anchored with \b so that ordinary
    # words ending a sentence ("config.", "freq.") do not trigger a rejection.
    "math_latex_triggers": [
        r"\\documentclass", r"\\begin\{", r"\\frac", r"\\sum", r"\\int",
        r"\$\$.*?\$\$", r"x\^2", r"a\^2", r"b\^2",
        r"sin\(", r"cos\(", r"tan\(", r"H_2O", r"CO_2",
        r"\bfig\.", r"\beq\.", r"\bet al\.",
    ],
}

# --- B. QUALITY AUDIT RULES ---
QUALITY_RULES = {
    "standard": {
        "min_paragraphs": 3,
        # NOTE(review): not referenced by the audit code visible in this file.
        "max_sentence_length": 60,
    },
    # Points subtracted from a base score of 100 by run_manual_audit.
    # NOTE(review): "bad_tone" and "short_content" are not referenced by the
    # audit code visible in this file.
    "penalties": {
        "bad_structure": 30,
        "risk_word": 50,
        "bad_tone": 20,
        "short_content": 20,
    },
    # Lower-case substrings; any occurrence in the lower-cased text is a risk hit.
    "risk_keywords": [
        "confidential", "rahasia", "internal use only", "top secret",
        "bodoh", "goblok", "brengsek", "tolol", "idiot",
        "password:", "api_key", "access token",
        "suap", "gratifikasi", "korupsi",
    ],
}

# --- C. LLM MODELS (fallback strategy: tried in order until one succeeds) ---
LLM_MODELS = [
    "HuggingFaceH4/zephyr-7b-beta",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "google/gemma-1.1-7b-it",
]

# ================= 2. ENGINE SETUP =================
print("⏳ Starting System...")

# Step 1: tokenizer and RoBERTa sequence classifier (used for the tone audit).
print("   Loading RoBERTa Model...")
tokenizer = AutoTokenizer.from_pretrained(ROBERTA_PATH)
roberta_model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_PATH)

# Step 2: embedding model (used to embed the user's input).
print("   Loading BGE-M3 Model...")
embeddings = HuggingFaceEmbeddings(model_name=BGE_MODEL_NAME)

# Step 3: splitter that breaks the user's article into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

# Step 4: load the pre-built vector database from disk.
print("   Loading Vector Database from Disk (index.pkl & index.faiss)...")
try:
    # "." is the current working directory, where index.pkl and index.faiss
    # are expected. allow_dangerous_deserialization must be enabled because
    # the index is stored with pickle; only load index files you trust.
    vectorstore = FAISS.load_local(
        ".",
        embeddings,
        allow_dangerous_deserialization=True,
    )
    print("✅ SUCCESS: Database Vektor Berhasil Dimuat!")
except Exception as load_error:
    # Best effort: keep the app running without retrieval when loading fails.
    print(
        "❌ CRITICAL ERROR: Gagal memuat database vektor. "
        "Pastikan file index.pkl dan index.faiss sudah diupload."
        f"\nDetail: {load_error}"
    )
    vectorstore = None

# Printed in both cases, including when the vector store failed to load.
print("✅ System Ready!")

# ================= 3. LOGIC MODULES (TIDAK DIUBAH) =================

# --- MODULE 1: GATEKEEPER ---
def run_gatekeeper(text, rules=None):
    """Decide whether *text* is eligible for analysis at all.

    The checks run in order and the first failure wins:
      1. word count below ``rules['min_words']``;
      2. the text consists only of digits once whitespace and the
         characters ``. , - +`` are removed;
      3. share of digit characters above ``rules['max_digit_ratio']``;
      4. any regex in ``rules['math_latex_triggers']`` matches
         (case-insensitive).

    Args:
        text: The text to check (string).
        rules: Optional rule dict with the three keys above. Defaults to the
            module-level ``GATE_RULES`` when omitted or ``None``.

    Returns:
        ``(True, "PASS")`` when accepted, otherwise ``(False, reason)`` where
        ``reason`` is a user-facing rejection message.
    """
    if rules is None:
        rules = GATE_RULES

    # str.split() with no arguments already ignores leading/trailing whitespace.
    word_count = len(text.split())
    if word_count < rules['min_words']:
        return False, f"REJECTED: Terlalu pendek ({word_count} kata). Minimal {rules['min_words']} kata."

    # Drop whitespace and number punctuation; what remains being all digits
    # means the input is raw numbers only.
    stripped = re.sub(r'[\s.,\-\+]', '', text)
    if stripped.isdigit():
        return False, "REJECTED: Input hanya berisi angka (Spam Data)."

    # Guard the division: an empty text is reachable when min_words is 0.
    digit_ratio = sum(ch.isdigit() for ch in text) / len(text) if text else 0.0
    if digit_ratio > rules['max_digit_ratio']:
        return False, "REJECTED: Terlalu banyak angka. Terdeteksi sebagai Laporan Keuangan/Data Mentah."

    for pattern in rules['math_latex_triggers']:
        if re.search(pattern, text, re.IGNORECASE):
            # The message reports the regex that matched, not the matched text.
            return False, f"REJECTED: Terdeteksi format Dokumen Teknis/Matematika/LaTeX ('{pattern}')."

    return True, "PASS"

# --- MODULE 2: SYSTEM A (MANUAL AUDIT) ---
def run_manual_audit(text, rules):
    """Score *text* with rule-based checks blended with a RoBERTa tone score.

    The rule score starts at 100 and loses ``penalties['bad_structure']`` when
    the text has fewer than ``standard['min_paragraphs']`` paragraphs, and
    ``penalties['risk_word']`` when any ``risk_keywords`` entry occurs in the
    lower-cased text. The final score is ``0.7 * rule_score + 0.3 * tone``,
    both on a 0..1 scale.

    Args:
        text: The article text to audit.
        rules: Dict with ``standard``, ``penalties`` and ``risk_keywords``
            (same layout as ``QUALITY_RULES``).

    Returns:
        Dict with ``verdict`` (str), ``score_decimal`` (float) and ``flags``
        (list of user-facing warning strings).
    """
    base_score = 100
    flags = []

    # Paragraphs are blocks separated by blank lines. Splitting on a regex
    # accepts "\r\n" line endings and blank lines that contain spaces, and
    # dropping empty fragments keeps repeated blank lines from inflating the
    # count.
    paragraphs = [block for block in re.split(r'\n\s*\n', text.strip()) if block.strip()]
    if len(paragraphs) < rules['standard']['min_paragraphs']:
        base_score -= rules['penalties']['bad_structure']
        flags.append("⚠️ Struktur buruk (Kurang paragraf/Wall of text)")

    text_lower = text.lower()
    found_risks = [word for word in rules['risk_keywords'] if word in text_lower]
    if found_risks:
        base_score -= rules['penalties']['risk_word']
        flags.append(f"🚨 Terdeteksi kata berisiko/sensitif: {', '.join(found_risks)}")

    try:
        # Let the tokenizer truncate to 512 *tokens*. Slicing the string to
        # 512 characters first would hand the model only a fraction of the
        # window it can actually read.
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            logits = roberta_model(**inputs).logits
        probs = torch.softmax(logits, dim=-1)[0]
        # NOTE(review): assumes class index 1 is the "acceptable tone" label;
        # confirm against the model's label mapping.
        rob_score = float(probs[1])
    except Exception as exc:
        # Best effort: fall back to a neutral tone score, but leave a trace
        # instead of failing silently.
        print(f"WARNING: RoBERTa tone scoring failed, using neutral score 0.5: {exc}")
        rob_score = 0.5

    rule_decimal = max(0, base_score) / 100.0
    final_score = (rule_decimal * 0.7) + (rob_score * 0.3)

    if final_score >= 0.85:
        verdict = "AI-Curated (Good Quality)"
    elif final_score >= 0.60:
        verdict = "Needs Revision (Minor Issue)"
    else:
        verdict = "Needs Attention (Low Clarity/Risk)"

    return {
        "verdict": verdict,
        "score_decimal": final_score,
        "flags": flags
    }

# --- MODULE 3: SYSTEM B (LLM JUDGE) ---
def _parse_judge_response(raw):
    """Extract the JSON object from a raw LLM completion.

    Raises ValueError when no JSON object can be found or decoded
    (json.JSONDecodeError is a ValueError subclass).
    """
    payload = raw.strip()
    # Prefer the contents of a ```json fenced block when the model emits one.
    if "```json" in payload:
        payload = payload.split("```json", 1)[1].split("```", 1)[0]
    start = payload.find("{")
    end = payload.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no JSON object found in LLM response")
    parsed = json.loads(payload[start:end + 1])
    if not isinstance(parsed, dict):
        raise ValueError("LLM response is not a JSON object")
    return parsed


def run_llm_judge(text, content_type, models=None):
    """Ask a hosted LLM to grade the article and return its parsed verdict.

    Models are tried in order; the first one that returns a parseable JSON
    object wins. A model that errors out or returns unparseable output is
    skipped.

    Args:
        text: Article text; only the first 2000 characters are sent.
        content_type: Predicted content type, inserted into the prompt.
        models: Optional iterable of model ids to try. Defaults to the
            module-level ``LLM_MODELS`` when omitted or ``None``.

    Returns:
        The decoded JSON object (dict) with an added ``'model'`` key naming
        the model that served it, or ``None`` when every model failed.
    """
    candidate_models = LLM_MODELS if models is None else models
    prompt = f"""
    Role: Senior Editor. Assess this article based on DigiFeed Parameters.
    Type: "{content_type}".
    
    STANDARDS:
    1. GOOD (>80): Clear idea, logical flow, grammar 90%, professional tone.
    2. REVISION (50-80): Unfocused, messy paragraphs, typos.
    3. BAD (<50): No flow, misleading, sensitive data, too technical.

    Text: "{text[:2000]}..."
    Output JSON: {{ "category": "GOOD/REVISION/BAD", "score": (0-100), "reason": "summary", "advice": "tip" }}
    """
    for model_id in candidate_models:
        try:
            client = InferenceClient(model=model_id, token=hf_token, timeout=15)
            response = client.text_generation(prompt, max_new_tokens=250, temperature=0.4)
            res = _parse_judge_response(response)
        except Exception as exc:
            # Deliberate best effort: log the failure and fall through to the
            # next model. KeyboardInterrupt/SystemExit are no longer swallowed.
            print(f"WARNING: LLM judge via {model_id} failed: {exc}")
            continue
        res['model'] = model_id
        return res
    return None

# ================= 4. MAIN PROCESSOR =================
def _search_candidates(chunks):
    """Query the vector store per chunk for competency and content-form matches.

    Returns ``(competency_candidates, form_candidates)``, two lists of
    ``{"meta": <document metadata>, "score": <search score>}``. Both are empty
    when no vector store is loaded.
    """
    competency_candidates = []
    form_candidates = []
    if not vectorstore:
        return competency_candidates, form_candidates

    for chunk in chunks:
        # The 'source' metadata values must match the ones written when the
        # index was built: 'competency' and 'content_form'.
        res_comp = vectorstore.similarity_search_with_score(chunk, k=3, filter={'source': 'competency'})
        res_form = vectorstore.similarity_search_with_score(chunk, k=1, filter={'source': 'content_form'})
        competency_candidates.extend({"meta": doc.metadata, "score": score} for doc, score in res_comp)
        form_candidates.extend({"meta": doc.metadata, "score": score} for doc, score in res_form)
    return competency_candidates, form_candidates


def _rank_competencies(candidates, limit=5):
    """Deduplicate candidates by competency name and format the best *limit*.

    For every name the lowest score is kept together with the metadata of the
    first hit seen. The stored metadata dicts are only read, never modified.
    """
    best_by_name = {}
    for item in candidates:
        meta = item['meta']
        name = meta.get('competency', meta.get('name', 'Unknown'))
        entry = best_by_name.get(name)
        if entry is None:
            best_by_name[name] = {"meta": meta, "score": item['score']}
        elif item['score'] < entry['score']:
            entry['score'] = item['score']

    # Lower score means a closer match, so sort ascending.
    ranked = sorted(best_by_name.values(), key=lambda entry: entry['score'])[:limit]

    formatted = []
    for entry in ranked:
        meta = entry['meta']
        # NOTE(review): assumes scores are non-negative distances; a score of
        # exactly -1 would divide by zero here.
        sim_score = 1 / (1 + entry['score'])
        formatted.append({
            "category": meta.get('group', meta.get('category', '-')),
            # NOTE(review): the display prefers 'name' over 'competency' while
            # deduplication above prefers 'competency'; confirm the metadata
            # never carries both keys with different values.
            "competency": meta.get('name', meta.get('competency', '-')),
            "type": meta.get('code', meta.get('type', '-')),
            "similarity_score": f"{sim_score:.4f}"
        })
    return formatted


def process_article(title, content):
    """Run the full pipeline for one article and return a JSON-ready dict.

    Steps: gatekeeper check, chunking and vector search, competency
    aggregation, content-type prediction, then the manual and LLM quality
    audits.

    Args:
        title: Article title.
        content: Article body.

    Returns:
        ``{"is_content": False, "rejection_reason": ...}`` when the gatekeeper
        rejects the text. Otherwise a dict with ``is_content`` True, a
        ``CONTENT_ANALYSIS`` section (``predicted_type``,
        ``top_5_competencies``) and a ``QUALITY_REPORT`` section
        (``SYSTEM_A_MANUAL`` always, ``SYSTEM_B_LLM`` only when an LLM
        answered).
    """
    full_text = f"{title}\n\n{content}"

    # 1. GATEKEEPER CHECK
    is_valid, msg = run_gatekeeper(full_text)
    if not is_valid:
        return {"is_content": False, "rejection_reason": msg}

    # 2. CHUNKING & VECTOR SEARCH
    chunks = text_splitter.split_text(full_text)
    competency_candidates, form_candidates = _search_candidates(chunks)

    # 3. AGGREGATION
    final_competencies = _rank_competencies(competency_candidates, limit=5)

    predicted_form = "General"
    if form_candidates:
        best_form = min(form_candidates, key=lambda item: item['score'])
        predicted_form = best_form['meta'].get('name', best_form['meta'].get('content_type', 'General'))

    # 4. QUALITY AUDIT
    manual_res = run_manual_audit(full_text, QUALITY_RULES)
    llm_res = run_llm_judge(full_text, predicted_form)

    # 5. CONSTRUCT OUTPUT
    result = {
        "is_content": True,
        "CONTENT_ANALYSIS": {
            "predicted_type": predicted_form,
            "top_5_competencies": final_competencies
        },
        "QUALITY_REPORT": {
            "SYSTEM_A_MANUAL": {
                "verdict": manual_res['verdict'],
                "score": f"{manual_res['score_decimal']:.4f}",
                "flags": manual_res['flags']
            }
        }
    }

    # The LLM section is optional: it is omitted when every model failed.
    if llm_res:
        result["QUALITY_REPORT"]["SYSTEM_B_LLM"] = {
            "verdict": llm_res.get('category'),
            "score": llm_res.get('score'),
            "advice": llm_res.get('advice'),
            "served_by": llm_res.get('model')
        }

    return result

# ================= 5. UI =================
# Gradio front end: the two text boxes (title, article body) are passed to
# process_article, and the dict it returns is rendered as JSON.
iface = gr.Interface(
    fn=process_article,
    inputs=[
        gr.Textbox(label="Judul"),
        gr.Textbox(lines=10, label="Isi Artikel (Min 500 Kata)"),
    ],
    outputs=gr.JSON(label="Hasil Analisis"),
    title="DigiFeed V6.1: Pre-computed Index",
    description=(
        "Sistem klasifikasi artikel menggunakan FAISS Index statis (Pre-loaded) "
        "untuk performa lebih cepat dan stabil."
    ),
)

# Starts the web server as soon as this module is executed.
iface.launch()