Spaces:

akage99
/

article-model-digifeed

Sleeping

App Files Files Community

akage99 committed on Dec 23, 2025

Commit

ffe5715

verified ·

1 Parent(s): 9f906cd

update final code after corruption checking

Browse files

Files changed (1) hide show

app.py +289 -157

app.py CHANGED Viewed

@@ -1,177 +1,309 @@
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
 import torch
-# Health check only: load just RoBERTa (no BGE model, no JSON playbook).
-ROBERTA_PATH = "akage99/roberta-corporate-backend"
-print("⏳ Cek Kesehatan Tokenizer & Model...")
 try:
     tokenizer = AutoTokenizer.from_pretrained(ROBERTA_PATH)
     model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_PATH)
-    print("✅ ALHAMDULILLAH! Tokenizer & Model SEHAT (Tidak Corrupt).")
-    status_model = "✅ Model & Tokenizer Aman!"
 except Exception as e:
-    # Loading failed: log it and keep the message in status_model, which
-    # cek_status returns to the caller.
-    print(f"❌ WADUH! File Model Rusak: {e}")
-    status_model = f"❌ Error: {e}"
-def cek_status(text):
-    """Return the recorded model-loading status together with the submitted text."""
-    report = {"status_file_model": status_model}
-    report["input_kamu"] = text
-    return report
-# Wrap cek_status in a text-in / JSON-out Gradio interface and start serving it.
-with gr.Interface(fn=cek_status, inputs="text", outputs="json") as demo:
-    demo.launch()
-# import gradio as gr
-# import torch
-# import json
-# import pandas as pd
-# from transformers import AutoTokenizer, AutoModelForSequenceClassification
-# from sentence_transformers import SentenceTransformer, util
-# # --- KONFIGURASI ---
-# ROBERTA_PATH = "akage99/roberta-corporate-backend"
-# PLAYBOOK_PATH = "competency_keywords.json"  # <--- PASTIKAN NAMA FILE DI TAB 'FILES' SAMA PERSIS INI
-# BGE_MODEL_NAME = "BAAI/bge-m3"
-# ALIGNMENT_THRESHOLD = 0.68
-# MIN_WORD_COUNT = 500
-# # Variabel Global untuk menampung status Error
-# LOADING_ERROR = None
-# # --- LOAD MODEL ---
-# print("⏳ Loading Models...")
-# try:
-#     tokenizer = AutoTokenizer.from_pretrained(ROBERTA_PATH)
-#     model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_PATH)
-#     model.eval()
-#     print("✅ RoBERTa Loaded!")
-# except Exception as e:
-#     print(f"❌ Error RoBERTa: {e}")
-# # Load BGE & Playbook
-# playbook_emb = None
-# df_playbook = pd.DataFrame()
-# try:
-#     bge_model = SentenceTransformer(BGE_MODEL_NAME)
-#     # Coba buka file JSON
-#     with open(PLAYBOOK_PATH, "r") as f:
-#         playbook_data = json.load(f)
-#     playbook_rows = []
-#     for cat, comps in playbook_data.items():
-#         for comp, data in comps.items():
-#             comp_type = data.get('type', '-')
-#             text = f"{data.get('description','')} {', '.join(data.get('keywords',[]))}"
-#             playbook_rows.append({
-#                 "category": cat,
-#                 "competency": comp,
-#                 "type": comp_type,
-#                 "text": text
-#             })
-#     df_playbook = pd.DataFrame(playbook_rows)
-#     playbook_emb = bge_model.encode(df_playbook['text'].tolist(), convert_to_tensor=True)
-#     print("✅ System Ready!")
-# except Exception as e:
-#     # TANGKAP ERRORNYA DISINI
-#     LOADING_ERROR = str(e)  # Simpan pesan error ke variabel
-#     print(f"❌ Error BGE/Playbook: {e}")
-# # --- LOGIC UTAMA ---
-# def process_article(title, content):
-#     # 0. CEK STATUS LOADING DULU
-#     # Kalau loading gagal, langsung lapor ke user di JSON Output
-#     if LOADING_ERROR is not None:
-#         return {
-#             "is_content": False,
-#             "error": "SYSTEM ERROR: Gagal memuat database Playbook.",
-#             "detail_error": LOADING_ERROR,  # <--- INI AKAN MUNCUL DI LAYAR
-#             "tips": f"Cek apakah file '{PLAYBOOK_PATH}' sudah ada di tab Files?"
-#         }
-#     # Kalau Playbook None tapi gak ada error (aneh), lapor juga
-#     if playbook_emb is None:
-#         return {
-#             "is_content": False,
-#             "error": "SYSTEM ERROR: Playbook Embedding kosong (None).",
-#             "detail_error": "Unknown Error during initialization."
-#         }
-#     full_text = f"{title}\n\n{content}"
-#     # 1. CEK JUMLAH KATA
-#     word_count = len(full_text.split())
-#     if word_count < MIN_WORD_COUNT:
-#         return {
-#             "is_content": False,
-#             "message": f"REJECTED: Konten terlalu pendek ({word_count} kata). Minimal {MIN_WORD_COUNT} kata.",
-#             "scores": {"roberta": "0.0000", "bge": "0.0000"}
-#         }
-#     # 2. RoBERTa Classification
-#     try:
-#         inputs = tokenizer(full_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
-#         with torch.no_grad():
-#             outputs = model(**inputs)
-#             probs = torch.softmax(outputs.logits, dim=-1)[0]
-#         rob_score = float(probs[1])
-#         is_content = rob_score >= 0.5
-#         response = {
-#             "is_content": is_content,
-#             "scores": {
-#                 "roberta": f"{rob_score:.4f}",
-#                 "bge": "0.0000"
-#             }
-#         }
-#         if not is_content:
-#             response["message"] = "REJECTED: Bukan konten artikel."
-#             return response
-#         # 3. Hitung BGE
-#         art_vec = bge_model.encode(full_text, convert_to_tensor=True)
-#         cos_sim = util.cos_sim(art_vec, playbook_emb)
-#         top_val, top_idx = torch.max(cos_sim, dim=1)
-#         bge_score = float(top_val)
-#         idx = int(top_idx)
-#         best_cat = df_playbook.iloc[idx]['category']
-#         best_comp = df_playbook.iloc[idx]['competency']
-#         best_type = df_playbook.iloc[idx]['type']
-#         response["scores"]["bge"] = f"{bge_score:.4f}"
-#         comp_data = {
-#             "category": best_cat,
-#             "competency": best_comp,
-#             "type": best_type,
-#             "prediction_status": "AI Prediction" if bge_score >= ALIGNMENT_THRESHOLD else "AI Recommendation"
-#         }
-#         if bge_score >= ALIGNMENT_THRESHOLD:
-#             response["predict_competencies"] = comp_data
-#         else:
-#             response["recommendation_competencies"] = comp_data
-#     except Exception as e:
-#         return {"is_content": False, "error": str(e)}
-#     return response
-# # --- GRADIO ---
-# with gr.Interface(
-#     fn=process_article,
-#     inputs=[gr.Textbox(label="Title"), gr.Textbox(label="Content")],
-#     outputs=gr.JSON(label="JSON Output"),
-#     title="Article Classifier API",
-# ) as demo:
-#     demo.launch()

 import gradio as gr
 import torch
+import json
+import pandas as pd
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from sentence_transformers import SentenceTransformer, util
+# --- CONFIGURATION ---
+# Identifier passed to `from_pretrained` for the RoBERTa tokenizer and classifier.
+ROBERTA_PATH = "akage99/roberta-corporate-backend"
+# Make sure this file name exactly matches the one in the 'Files' tab.
+PLAYBOOK_PATH = "competency_keywords.json"
+# Name passed to SentenceTransformer for the embedding model.
+BGE_MODEL_NAME = "BAAI/bge-m3"
+# Cosine-similarity cut-off: at or above it a match is reported as
+# "AI Prediction", below it as "AI Recommendation".
+ALIGNMENT_THRESHOLD = 0.68
+# Articles with fewer whitespace-separated words than this are rejected.
+MIN_WORD_COUNT = 500
+# Global holder for any error raised while loading models/playbook at start-up.
+LOADING_STATUS = {"error": None, "message": "System Normal"}
+# --- 1. LOAD ROBERTA ---
+print("⏳ Loading RoBERTa...")
 try:
     tokenizer = AutoTokenizer.from_pretrained(ROBERTA_PATH)
     model = AutoModelForSequenceClassification.from_pretrained(ROBERTA_PATH)
+    # Put the classifier in evaluation mode; it is only used for inference.
+    model.eval()
+    print("✅ RoBERTa Loaded!")
 except Exception as e:
+    # Do not crash at start-up: log the failure and record it in
+    # LOADING_STATUS, which process_article checks before doing any work.
+    print(f"❌ Error RoBERTa: {e}")
+    LOADING_STATUS["error"] = "RoBERTa Error"
+    LOADING_STATUS["message"] = str(e)
+# --- 2. LOAD BGE & PLAYBOOK ---
+# Defaults that remain in place if loading fails: no embeddings and an
+# empty playbook table.
+playbook_emb = None
+df_playbook = pd.DataFrame()
+print("⏳ Loading BGE & Playbook...")
+try:
+    # Load the BGE sentence-embedding model.
+    bge_model = SentenceTransformer(BGE_MODEL_NAME)
+    # Load the playbook JSON. The encoding is explicit so that non-ASCII
+    # text decodes identically regardless of the host's default locale.
+    with open(PLAYBOOK_PATH, "r", encoding="utf-8") as f:
+        playbook_data = json.load(f)
+    # Flatten {category: {competency: {...}}} into one row per competency.
+    playbook_rows = []
+    for cat, comps in playbook_data.items():
+        for comp, data in comps.items():
+            comp_type = data.get('type', '-')
+            # Text to embed: the description followed by the comma-joined keywords.
+            text = f"{data.get('description','')} {', '.join(data.get('keywords',[]))}"
+            playbook_rows.append({
+                "category": cat,
+                "competency": comp,
+                "type": comp_type,
+                "text": text
+            })
+    df_playbook = pd.DataFrame(playbook_rows)
+    # Encode the whole playbook once at start-up (usually the heavy step).
+    playbook_emb = bge_model.encode(df_playbook['text'].tolist(), convert_to_tensor=True)
+    print("✅ System Ready & Playbook Loaded!")
+except Exception as e:
+    # Catch the error so it can be shown to the user: log it and record it
+    # in LOADING_STATUS, which process_article checks on every request.
+    print(f"❌ Error BGE/Playbook: {e}")
+    LOADING_STATUS["error"] = "Playbook/BGE Error"
+    LOADING_STATUS["message"] = f"Gagal memuat {PLAYBOOK_PATH}. Detail: {str(e)}"
+# --- MAIN LOGIC ---
+def process_article(title, content):
+    """Classify an article and match it to the closest playbook competency.
+
+    Args:
+        title: Article title text.
+        content: Article body text.
+
+    Returns:
+        A dict that always contains "is_content". Depending on the outcome
+        it also carries "scores", "message", one of "predict_competencies" /
+        "recommendation_competencies", "SYSTEM_WARNING", the start-up error
+        fields ("SYSTEM_ERROR", "DETAIL", "TIPS"), or "error" when an
+        exception is raised during inference.
+    """
+    # Check the loading status first: if start-up failed, report it now.
+    startup_error = LOADING_STATUS["error"]
+    if startup_error:
+        return {
+            "is_content": False,
+            "SYSTEM_ERROR": startup_error,
+            "DETAIL": LOADING_STATUS["message"],
+            "TIPS": "Cek nama file JSON di tab Files atau cek Logs untuk detail."
+        }
+    article = f"{title}\n\n{content}"
+    # Step 1: reject articles below the minimum word count.
+    word_count = len(article.split())
+    if word_count < MIN_WORD_COUNT:
+        return {
+            "is_content": False,
+            "message": f"REJECTED: Konten terlalu pendek ({word_count} kata). Minimal {MIN_WORD_COUNT} kata.",
+            "scores": {"roberta": "0.0000", "bge": "0.0000"}
+        }
+    try:
+        # Step 2: RoBERTa classification; index 1 of the softmax is used as
+        # the "is content" probability.
+        encoded = tokenizer(article, return_tensors="pt", truncation=True, padding=True, max_length=512)
+        with torch.no_grad():
+            logits = model(**encoded).logits
+            class_probs = torch.softmax(logits, dim=-1)[0]
+        rob_score = float(class_probs[1])
+        accepted = rob_score >= 0.5
+        response = {
+            "is_content": accepted,
+            "scores": {"roberta": f"{rob_score:.4f}", "bge": "0.0000"},
+        }
+        if not accepted:
+            response["message"] = "REJECTED: Bukan konten artikel."
+            return response
+        # Step 3: BGE similarity, only possible when the playbook embeddings exist.
+        if playbook_emb is None:
+            response["SYSTEM_WARNING"] = "Playbook Embedding Kosong. Cek Log."
+            return response
+        article_vec = bge_model.encode(article, convert_to_tensor=True)
+        similarities = util.cos_sim(article_vec, playbook_emb)
+        best_sim, best_pos = torch.max(similarities, dim=1)
+        bge_score = float(best_sim)
+        best_row = df_playbook.iloc[int(best_pos)]
+        best_cat = best_row['category']
+        best_comp = best_row['competency']
+        best_type = best_row['type']
+        response["scores"]["bge"] = f"{bge_score:.4f}"
+        # At or above the threshold the match is a prediction, otherwise
+        # only a recommendation; the result key changes accordingly.
+        aligned = bge_score >= ALIGNMENT_THRESHOLD
+        result_key = "predict_competencies" if aligned else "recommendation_competencies"
+        response[result_key] = {
+            "category": best_cat,
+            "competency": best_comp,
+            "type": best_type,
+            "prediction_status": "AI Prediction" if aligned else "AI Recommendation"
+        }
+    except Exception as e:
+        return {"is_content": False, "error": str(e)}
+    return response
+# --- GRADIO ---
+# Build the interface once: two text inputs (title, content) are passed to
+# process_article and its dict result is rendered as JSON.
+demo = gr.Interface(
+    fn=process_article,
+    inputs=[gr.Textbox(label="Title"), gr.Textbox(label="Content")],
+    outputs=gr.JSON(label="JSON Output"),
+    title="Article Classifier API",
+)
+# Start serving. This must stay the last statement of the script.
+demo.launch()