kikikara committed on
Commit
4223edb
·
verified ·
1 Parent(s): 6ae9375

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +196 -117
app.py CHANGED
@@ -5,22 +5,27 @@ import torch
5
  import numpy as np
6
  import html
7
  from transformers import AutoTokenizer, AutoModel, logging as hf_logging
 
 
 
 
 
 
8
 
 
9
  # Hugging Face Transformers λ‘œκΉ… 레벨 μ„€μ •
10
  hf_logging.set_verbosity_error()
11
 
12
- # ────────── μ„€μ • ──────────
13
  MODEL_NAME = "bert-base-uncased"
14
  DEVICE = "cpu"
15
- SAVE_DIR = "μ €μž₯μ €μž₯1" # 이 폴더가 app.py와 같은 μœ„μΉ˜μ— μžˆμ–΄μ•Ό ν•©λ‹ˆλ‹€.
16
  LAYER_ID = 4
17
  SEED = 0
18
  CLF_NAME = "linear"
19
 
20
- # ────────── μ „μ—­ λͺ¨λΈ λ‘œλ“œ (Gradio μ•± μ‹œμž‘ μ‹œ ν•œ 번 μ‹€ν–‰) ──────────
21
- # Streamlit의 @st.cache_resource λŒ€μ‹ , μ•± μ‹œμž‘ μ‹œ λ‘œλ“œλ˜λ„λ‘ μ „μ—­ λ³€μˆ˜λ‘œ 관리
22
- TOKENIZER_GLOBAL = None
23
- MODEL_GLOBAL = None
24
  W_GLOBAL, MU_GLOBAL, W_P_GLOBAL, B_P_GLOBAL = None, None, None, None
25
  CLASS_NAMES_GLOBAL = None
26
  MODELS_LOADED_SUCCESSFULLY = False
@@ -32,7 +37,7 @@ try:
32
  clf_file_path = os.path.join(SAVE_DIR, f"{CLF_NAME}_layer{LAYER_ID}_projlda_seed{SEED}.pkl")
33
 
34
  if not os.path.isdir(SAVE_DIR):
35
- raise FileNotFoundError(f"였λ₯˜: λͺ¨λΈ μ €μž₯ 디렉토리 '{SAVE_DIR}'λ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€. 'μ €μž₯μ €μž₯1' 폴더λ₯Ό ν™•μΈν•˜μ„Έμš”.")
36
  if not os.path.exists(lda_file_path):
37
  raise FileNotFoundError(f"였λ₯˜: LDA λͺ¨λΈ 파일 '{lda_file_path}'λ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€.")
38
  if not os.path.exists(clf_file_path):
@@ -41,8 +46,7 @@ try:
41
  lda = joblib.load(lda_file_path)
42
  clf = joblib.load(clf_file_path)
43
 
44
- if hasattr(clf, "base_estimator"):
45
- clf = clf.base_estimator
46
 
47
  W_GLOBAL = torch.tensor(lda.scalings_, dtype=torch.float32, device=DEVICE)
48
  MU_GLOBAL = torch.tensor(lda.xbar_, dtype=torch.float32, device=DEVICE)
@@ -50,184 +54,259 @@ try:
50
  B_P_GLOBAL = torch.tensor(clf.intercept_, dtype=torch.float32, device=DEVICE)
51
 
52
  TOKENIZER_GLOBAL = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
53
- MODEL_GLOBAL = AutoModel.from_pretrained(
54
- MODEL_NAME, output_hidden_states=True
55
  ).to(DEVICE).eval()
56
 
57
- if hasattr(lda, 'classes_'):
58
- CLASS_NAMES_GLOBAL = lda.classes_
59
- elif hasattr(clf, 'classes_'):
60
- CLASS_NAMES_GLOBAL = clf.classes_
61
 
62
  MODELS_LOADED_SUCCESSFULLY = True
63
  print("Gradio App: λͺ¨λ“  λͺ¨λΈ 및 데이터 λ‘œλ“œ 성곡!")
64
 
65
  except Exception as e:
 
66
  MODEL_LOADING_ERROR_MESSAGE = f"λͺ¨λΈ λ‘œλ”© 쀑 μ‹¬κ°ν•œ 였λ₯˜ λ°œμƒ: {str(e)}\n'μ €μž₯μ €μž₯1' 폴더와 λ‚΄μš©λ¬Όμ„ ν™•μΈν•΄μ£Όμ„Έμš”."
67
  print(MODEL_LOADING_ERROR_MESSAGE)
68
- # 이 였λ₯˜λŠ” Gradio UIλ₯Ό 톡해 μ‚¬μš©μžμ—κ²Œ 전달될 수 μžˆλ„λ‘ μ²˜λ¦¬ν•  수 μžˆμŠ΅λ‹ˆλ‹€.
69
 
70
- # ────────── 핡심 뢄석 ν•¨μˆ˜ (Gradio μΈν„°νŽ˜μ΄μŠ€κ°€ 호좜) ──────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  def analyze_sentence_for_gradio(sentence_text, top_k_value):
 
 
 
 
 
 
 
 
 
72
  if not MODELS_LOADED_SUCCESSFULLY:
73
- # λͺ¨λΈ λ‘œλ”© μ‹€νŒ¨ μ‹œ Gradio 좜λ ₯ ν˜•μ‹μ— 맞좰 였λ₯˜ λ©”μ‹œμ§€ λ°˜ν™˜
74
  error_html = f"<p style='color:red;'>μ΄ˆκΈ°ν™” 였λ₯˜: {html.escape(MODEL_LOADING_ERROR_MESSAGE)}</p>"
75
- # Gradio InterfaceλŠ” μ •μ˜λœ λͺ¨λ“  좜λ ₯에 λŒ€ν•΄ 값을 λ°›μ•„μ•Ό ν•©λ‹ˆλ‹€.
76
- return error_html, "λͺ¨λΈ λ‘œλ”© μ‹€νŒ¨", "N/A", [] # HTML, μ˜ˆμΈ‘κ²°κ³Όν…μŠ€νŠΈ, 상세결과(Label), TopK(DataFrame)
 
77
 
78
  try:
79
- # μ „μ—­μ—μ„œ λ‘œλ“œλœ λͺ¨λΈ μ‚¬μš©
80
- tokenizer = TOKENIZER_GLOBAL
81
- model = MODEL_GLOBAL
82
  W, mu, w_p, b_p = W_GLOBAL, MU_GLOBAL, W_P_GLOBAL, B_P_GLOBAL
83
  class_names = CLASS_NAMES_GLOBAL
84
 
85
- # 1) 토큰화
86
  enc = tokenizer(sentence_text, return_tensors="pt", truncation=True, max_length=510, padding=True)
87
- input_ids = enc["input_ids"].to(DEVICE)
88
- attn_mask = enc["attention_mask"].to(DEVICE)
89
 
90
  if input_ids.shape[1] == 0:
91
- return "<p style='color:orange;'>μž…λ ₯ 였λ₯˜: μœ νš¨ν•œ 토큰이 μ—†μŠ΅λ‹ˆλ‹€.</p>", "μž…λ ₯ 였λ₯˜", "N/A", []
92
-
93
- # 2) μž„λ² λ”© 및 κ·Έλž˜λ””μ–ΈνŠΈ μ„€μ •
94
- input_embeds = model.embeddings.word_embeddings(input_ids).clone().detach()
95
- input_embeds.requires_grad_(True)
96
 
97
- # 3) Forward pass
98
- outputs = model(inputs_embeds=input_embeds, attention_mask=attn_mask, output_hidden_states=True)
 
 
 
 
99
  cls_vec = outputs.hidden_states[LAYER_ID][:, 0, :]
100
-
101
- # 4) LDA 투영 및 λΆ„λ₯˜
102
  z_projected = (cls_vec - mu) @ W
103
  logit_output = z_projected @ w_p.T + b_p
104
  probs = torch.softmax(logit_output, dim=1)
105
- pred_idx = torch.argmax(probs, dim=1).item()
106
- pred_prob_val = probs[0, pred_idx].item()
107
 
108
- # 5) Gradient 계산
109
- if input_embeds.grad is not None:
110
- input_embeds.grad.zero_()
111
  logit_output[0, pred_idx].backward()
112
- if input_embeds.grad is None:
113
- return "<p style='color:red;'>뢄석 였λ₯˜: κ·Έλž˜λ””μ–ΈνŠΈ 계산 μ‹€νŒ¨.</p>", "뢄석 였λ₯˜", "N/A", []
114
- grads = input_embeds.grad.clone().detach()
115
-
116
- # 6) μ€‘μš”λ„ 점수 계산
117
- scores = (grads * input_embeds.detach()).norm(dim=2).squeeze(0)
 
118
  scores_np = scores.cpu().numpy()
119
  valid_scores = scores_np[np.isfinite(scores_np)]
120
- if len(valid_scores) > 0 and valid_scores.max() > 0:
121
- scores_np = scores_np / (valid_scores.max() + 1e-9)
122
- else:
123
- scores_np = np.zeros_like(scores_np)
124
 
125
- # 7) HTML 생성
126
- tokens = tokenizer.convert_ids_to_tokens(input_ids[0], skip_special_tokens=False)
127
- html_tokens_list = []
128
- cls_token_id, sep_token_id, pad_token_id = tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id
129
 
130
- for i, tok_str in enumerate(tokens):
131
- if input_ids[0, i] == pad_token_id: continue
 
 
132
  clean_tok_str = tok_str.replace("##", "") if "##" not in tok_str else tok_str[2:]
133
- if input_ids[0, i] == cls_token_id or input_ids[0, i] == sep_token_id:
 
 
 
 
134
  html_tokens_list.append(f"<span style='font-weight:bold;'>{html.escape(clean_tok_str)}</span>")
 
135
  else:
136
- score_val = scores_np[i] if i < len(scores_np) else 0
137
- color = f"rgba(255, 0, 0, {max(0, min(1, score_val)):.2f})"
138
  html_tokens_list.append(f"<span style='background-color:{color}; padding: 1px 2px; margin: 1px; border-radius: 3px; display:inline-block;'>{html.escape(clean_tok_str)}</span>")
139
-
 
140
  html_output_str = " ".join(html_tokens_list).replace(" ##", "")
141
 
142
- # Top-K 토큰 (DataFrame용 리슀트의 리슀트)
143
- top_tokens_for_df = []
144
- valid_indices = [idx for idx, token_id in enumerate(input_ids[0].tolist())
145
- if token_id not in [cls_token_id, sep_token_id, pad_token_id] and idx < len(scores_np)]
146
- sorted_valid_indices = sorted(valid_indices, key=lambda idx: -scores_np[idx])
147
  for token_idx in sorted_valid_indices[:top_k_value]:
148
- top_tokens_for_df.append([tokens[token_idx], f"{scores_np[token_idx]:.3f}"])
 
 
 
 
 
149
 
150
- # 예츑 클래슀 λ ˆμ΄λΈ”
151
  predicted_class_label_str = str(pred_idx)
152
  if class_names is not None and 0 <= pred_idx < len(class_names):
153
  predicted_class_label_str = str(class_names[pred_idx])
154
 
155
  prediction_summary_text = f"클래슀: {predicted_class_label_str}\nν™•λ₯ : {pred_prob_val:.3f}"
156
  prediction_details_for_label = {"예츑 클래슀": predicted_class_label_str, "ν™•λ₯ ": f"{pred_prob_val:.3f}"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
-
159
- return html_output_str, prediction_summary_text, prediction_details_for_label, top_tokens_for_df
 
 
160
 
161
  except Exception as e:
162
  import traceback
163
  tb_str = traceback.format_exc()
164
  error_html = f"<p style='color:red;'>뢄석 쀑 였λ₯˜ λ°œμƒ: {html.escape(str(e))}</p><pre>{html.escape(tb_str)}</pre>"
165
  print(f"Analyze_sentence_for_gradio error: {e}\n{tb_str}")
166
- return error_html, "뢄석 μ‹€νŒ¨", {"였λ₯˜": str(e)}, []
 
 
167
 
168
 
169
- # ────────── Gradio μΈν„°νŽ˜μ΄μŠ€ μ •μ˜ ──────────
170
- # μž…λ ₯ μ»΄ν¬λ„ŒνŠΈ
171
- input_sentence = gr.Textbox(lines=3, label="뢄석할 μ˜μ–΄ λ¬Έμž₯", placeholder="여기에 μ˜μ–΄ λ¬Έμž₯을 μž…λ ₯ν•˜μ„Έμš”...")
172
- input_top_k = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="ν‘œμ‹œν•  Top-K μ€‘μš” 토큰 수")
 
 
 
173
 
174
- # 좜λ ₯ μ»΄ν¬λ„ŒνŠΈ
175
- output_html_visualization = gr.HTML(label="토큰 μ€‘μš”λ„ μ‹œκ°ν™”")
176
- output_prediction_summary = gr.Textbox(label="예츑 μš”μ•½", lines=2) # κ°„λ‹¨ν•œ ν…μŠ€νŠΈ μš”μ•½μš©
177
- output_prediction_details = gr.Label(label="예츑 상세") # Label은 λ”•μ…”λ„ˆλ¦¬λ₯Ό 잘 λ³΄μ—¬μ€Œ
178
- output_top_tokens_df = gr.DataFrame(headers=["Token", "Score"], label="Top-K μ€‘μš” 토큰", row_count=(1,"dynamic"), col_count=(2,"fixed"))
179
 
 
 
 
180
 
181
- # Gradio Blocksλ₯Ό μ‚¬μš©ν•˜μ—¬ λ ˆμ΄μ•„μ›ƒ ꡬ성 (선택 사항, Interface보닀 μœ μ—°ν•¨)
182
- with gr.Blocks(title="λ¬Έμž₯ 토큰 μ€‘μš”λ„ 뢄석기 (Gradio)", theme=gr.themes.Soft()) as demo:
183
- gr.Markdown("# πŸ“ λ¬Έμž₯ 토큰 μ€‘μš”λ„ 뢄석기 (Gradio)")
184
- gr.Markdown("BERT와 LDAλ₯Ό ν™œμš©ν•˜μ—¬ λ¬Έμž₯ λ‚΄ 각 ν† ν°μ˜ μ€‘μš”λ„λ₯Ό μ‹œκ°ν™”ν•©λ‹ˆλ‹€.")
185
-
186
- with gr.Row():
 
 
187
  with gr.Column(scale=2):
188
- input_sentence.render()
189
- input_top_k.render()
190
- submit_button = gr.Button("뢄석 μ‹€ν–‰ν•˜κΈ° πŸš€", variant="primary")
191
- with gr.Column(scale=3):
192
- output_prediction_summary.render()
193
- output_prediction_details.render()
194
-
195
- output_html_visualization.render()
196
- output_top_tokens_df.render()
197
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  gr.Markdown("---")
199
- gr.Markdown("<p style='text-align: center; color: grey;'>BERT 기반 λ¬Έμž₯ 뢄석 데λͺ¨ (Gradio)</p>")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
- # λ²„νŠΌ 클릭 μ‹œ ν•¨μˆ˜ μ—°κ²°
202
  submit_button.click(
203
  fn=analyze_sentence_for_gradio,
204
  inputs=[input_sentence, input_top_k],
205
- outputs=[output_html_visualization, output_prediction_summary, output_prediction_details, output_top_tokens_df]
206
- )
207
-
208
- # 예제 μΆ”κ°€
209
- gr.Examples(
210
- examples=[
211
- ["This is a great movie and I really enjoyed it!", 5],
212
- ["The weather is quite gloomy today.", 3],
213
- ["I am not sure if this is the right way to do it, but let's try.", 4]
214
  ],
215
- inputs=[input_sentence, input_top_k],
216
- outputs=[output_html_visualization, output_prediction_summary, output_prediction_details, output_top_tokens_df], # 예제 μ‹€ν–‰ μ‹œμ—λ„ λͺ¨λ“  좜λ ₯ μ»΄ν¬λ„ŒνŠΈ ν•„μš”
217
- fn=analyze_sentence_for_gradio, # 예제 μ‹€ν–‰ μ‹œμ—λ„ 동일 ν•¨μˆ˜ μ‚¬μš©
218
- cache_examples=False # λͺ¨λΈμ΄ μžˆλŠ” 경우 True둜 ν•˜λ©΄ 예제 λ‘œλ”©μ΄ 빨라질 수 μžˆμœΌλ‚˜, 디버깅 μ€‘μ—λŠ” False ꢌμž₯
219
  )
220
 
221
- # Gradio μ•± μ‹€ν–‰ (Hugging Face Spacesμ—μ„œλŠ” 이 뢀뢄이 μžλ™μœΌλ‘œ 처리됨)
222
- # λ‘œμ»¬μ—μ„œ ν…ŒμŠ€νŠΈ μ‹œ: demo.launch()
223
  if __name__ == "__main__":
224
  if not MODELS_LOADED_SUCCESSFULLY:
225
  print("*"*80)
226
- print("κ²½κ³ : λͺ¨λΈ λ‘œλ”©μ— μ‹€νŒ¨ν•˜μ—¬ Gradio 앱이 μ •μƒμ μœΌλ‘œ μž‘λ™ν•˜μ§€ μ•Šμ„ 수 μžˆμŠ΅λ‹ˆλ‹€.")
227
- print(f"였λ₯˜ λ‚΄μš©: {MODEL_LOADING_ERROR_MESSAGE}")
228
- print("Gradio UIλŠ” ν‘œμ‹œλ˜μ§€λ§Œ, '뢄석 μ‹€ν–‰ν•˜κΈ°' λ²„νŠΌμ„ λˆŒλ €μ„ λ•Œ 였λ₯˜κ°€ λ°œμƒν•©λ‹ˆλ‹€.")
229
- print("`μ €μž₯μ €μž₯1` 폴더 및 λ‚΄λΆ€ νŒŒμΌλ“€μ΄ `app.py`와 λ™μΌν•œ 디렉토리에 μžˆλŠ”μ§€ ν™•μΈν•˜μ„Έμš”.")
230
  print("*"*80)
231
- # Hugging Face SpacesλŠ” app.pyλ₯Ό μ‹€ν–‰ν•˜κ³  demo.launch()λ₯Ό μ°Ύκ±°λ‚˜
232
- # demoλΌλŠ” μ΄λ¦„μ˜ launchable Blocks/Interface 객체λ₯Ό μ°ΎμŠ΅λ‹ˆλ‹€.
233
  demo.launch()
 
5
  import numpy as np
6
  import html
7
  from transformers import AutoTokenizer, AutoModel, logging as hf_logging
8
+ import pandas as pd
9
+ import matplotlib
10
+ matplotlib.use('Agg') # Matplotlib λ°±μ—”λ“œ μ„€μ •
11
+ import matplotlib.pyplot as plt
12
+ # from mpl_toolkits.mplot3d import Axes3D # 3D ν”Œλ‘―μ— ν•„μš”
13
+ from sklearn.decomposition import PCA
14
 
15
+ # --- κΈ°μ‘΄ μ„€μ • 및 μ „μ—­ λͺ¨λΈ λ‘œλ“œ λΆ€λΆ„ ---
16
  # Hugging Face Transformers λ‘œκΉ… 레벨 μ„€μ •
17
  hf_logging.set_verbosity_error()
18
 
19
+ # μ„€μ •
20
  MODEL_NAME = "bert-base-uncased"
21
  DEVICE = "cpu"
22
+ SAVE_DIR = "μ €μž₯μ €μž₯1"
23
  LAYER_ID = 4
24
  SEED = 0
25
  CLF_NAME = "linear"
26
 
27
+ # μ „μ—­ λͺ¨λΈ λ‘œλ“œ
28
+ TOKENIZER_GLOBAL, MODEL_GLOBAL = None, None
 
 
29
  W_GLOBAL, MU_GLOBAL, W_P_GLOBAL, B_P_GLOBAL = None, None, None, None
30
  CLASS_NAMES_GLOBAL = None
31
  MODELS_LOADED_SUCCESSFULLY = False
 
37
  clf_file_path = os.path.join(SAVE_DIR, f"{CLF_NAME}_layer{LAYER_ID}_projlda_seed{SEED}.pkl")
38
 
39
  if not os.path.isdir(SAVE_DIR):
40
+ raise FileNotFoundError(f"였λ₯˜: λͺ¨λΈ μ €μž₯ 디렉토리 '{SAVE_DIR}'λ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€.")
41
  if not os.path.exists(lda_file_path):
42
  raise FileNotFoundError(f"였λ₯˜: LDA λͺ¨λΈ 파일 '{lda_file_path}'λ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€.")
43
  if not os.path.exists(clf_file_path):
 
46
  lda = joblib.load(lda_file_path)
47
  clf = joblib.load(clf_file_path)
48
 
49
+ if hasattr(clf, "base_estimator"): clf = clf.base_estimator
 
50
 
51
  W_GLOBAL = torch.tensor(lda.scalings_, dtype=torch.float32, device=DEVICE)
52
  MU_GLOBAL = torch.tensor(lda.xbar_, dtype=torch.float32, device=DEVICE)
 
54
  B_P_GLOBAL = torch.tensor(clf.intercept_, dtype=torch.float32, device=DEVICE)
55
 
56
  TOKENIZER_GLOBAL = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
57
+ MODEL_GLOBAL = AutoModel.from_pretrained( # output_attentions 제거 λ˜λŠ” False
58
+ MODEL_NAME, output_hidden_states=True, output_attentions=False
59
  ).to(DEVICE).eval()
60
 
61
+ if hasattr(lda, 'classes_'): CLASS_NAMES_GLOBAL = lda.classes_
62
+ elif hasattr(clf, 'classes_'): CLASS_NAMES_GLOBAL = clf.classes_
 
 
63
 
64
  MODELS_LOADED_SUCCESSFULLY = True
65
  print("Gradio App: λͺ¨λ“  λͺ¨λΈ 및 데이터 λ‘œλ“œ 성곡!")
66
 
67
  except Exception as e:
68
+ MODELS_LOADED_SUCCESSFULLY = False
69
  MODEL_LOADING_ERROR_MESSAGE = f"λͺ¨λΈ λ‘œλ”© 쀑 μ‹¬κ°ν•œ 였λ₯˜ λ°œμƒ: {str(e)}\n'μ €μž₯μ €μž₯1' 폴더와 λ‚΄μš©λ¬Όμ„ ν™•μΈν•΄μ£Όμ„Έμš”."
70
  print(MODEL_LOADING_ERROR_MESSAGE)
 
71
 
72
# Helper: 3D PCA visualization of token embeddings.
def plot_token_pca_3d(token_embeddings_3d, tokens, scores, title="Token Embeddings 3D PCA (Colored by Importance)"):
    """Render a 3D scatter plot of PCA-reduced token embeddings.

    Args:
        token_embeddings_3d: array of shape (n_tokens, 3) with PCA coordinates.
        tokens: list of n_tokens token strings (parallel to the rows above).
        scores: array-like of n_tokens importance scores used for coloring.
        title: plot title.

    Returns:
        The matplotlib Figure containing the 3D scatter (caller owns it).
    """
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')  # create a 3D axis

    # Annotate only the highest-scoring tokens (at most 15) to avoid clutter.
    num_annotations = min(len(tokens), 15)
    indices_to_annotate = np.argsort(scores)[-num_annotations:]

    # NOTE(review): colormap is "coolwarm_r", which maps HIGH scores to blue;
    # the original comment claimed high = deep red. Confirm intended palette.
    scatter = ax.scatter(token_embeddings_3d[:, 0], token_embeddings_3d[:, 1], token_embeddings_3d[:, 2],
                         c=scores, cmap="coolwarm_r", s=50, alpha=0.8, depthshade=True)

    # Iterate the selected indices directly instead of scanning every token
    # and testing membership (the original was O(n * m)).
    for i in indices_to_annotate:
        ax.text(token_embeddings_3d[i, 0], token_embeddings_3d[i, 1], token_embeddings_3d[i, 2],
                f' {tokens[i]}', size=8, zorder=1, color='k')

    ax.set_title(title, fontsize=14)
    ax.set_xlabel("PCA Component 1", fontsize=10)
    ax.set_ylabel("PCA Component 2", fontsize=10)
    ax.set_zlabel("PCA Component 3", fontsize=10)

    cbar = plt.colorbar(scatter, label="Importance Score", shrink=0.7)
    cbar.ax.tick_params(labelsize=8)
    ax.tick_params(axis='both', which='major', labelsize=8)

    plt.tight_layout()
    return fig
101
+
102
# ────────── Core analysis function (no attention map; 3D PCA; returns 7 values) ──────────
def analyze_sentence_for_gradio(sentence_text, top_k_value):
    """Analyze one sentence with the globally loaded BERT + LDA pipeline.

    Computes a gradient-x-input importance score per token and returns the
    7 values the Gradio outputs expect, in order:
    (html_visualization, highlighted_text_data, summary_text,
     details_for_label, top_k_rows, barplot_dataframe, pca_figure).

    Error paths return the same 7-tuple with placeholder values so every
    output component always receives data.
    """

    def create_empty_plot(message="N/A"):
        # Placeholder figure for error paths so gr.Plot always gets a Figure.
        fig = plt.figure(figsize=(2, 2))
        ax = fig.add_subplot(111)
        ax.text(0.5, 0.5, message, ha='center', va='center', fontsize=10)
        ax.axis('off')
        plt.close(fig)  # drop pyplot's reference; Gradio copies the Figure
        return fig

    if not MODELS_LOADED_SUCCESSFULLY:
        error_html = f"<p style='color:red;'>초기화 오류: {html.escape(MODEL_LOADING_ERROR_MESSAGE)}</p>"
        empty_df = pd.DataFrame(columns=['token', 'score'])
        empty_fig_placeholder = create_empty_plot()
        return error_html, [], "모델 로딩 실패", "N/A", [], empty_df, empty_fig_placeholder  # 7 values

    try:
        tokenizer, model = TOKENIZER_GLOBAL, MODEL_GLOBAL
        W, mu, w_p, b_p = W_GLOBAL, MU_GLOBAL, W_P_GLOBAL, B_P_GLOBAL
        class_names = CLASS_NAMES_GLOBAL

        enc = tokenizer(sentence_text, return_tensors="pt", truncation=True, max_length=510, padding=True)
        input_ids, attn_mask = enc["input_ids"].to(DEVICE), enc["attention_mask"].to(DEVICE)

        if input_ids.shape[1] == 0:
            empty_df = pd.DataFrame(columns=['token', 'score'])
            empty_fig_placeholder = create_empty_plot()
            return "<p style='color:orange;'>입력 오류: 유효한 토큰이 없습니다.</p>", [], "입력 오류", "N/A", [], empty_df, empty_fig_placeholder

        # Word embeddings: one detached copy for the score product, one
        # requires-grad copy that is fed through the model.
        input_embeds_detached = model.embeddings.word_embeddings(input_ids).clone().detach()
        input_embeds_for_grad = input_embeds_detached.clone().requires_grad_(True)

        outputs = model(inputs_embeds=input_embeds_for_grad, attention_mask=attn_mask,
                        output_hidden_states=True, output_attentions=False)  # attentions no longer needed

        cls_vec = outputs.hidden_states[LAYER_ID][:, 0, :]

        # LDA projection followed by the linear classifier head.
        z_projected = (cls_vec - mu) @ W
        logit_output = z_projected @ w_p.T + b_p
        probs = torch.softmax(logit_output, dim=1)
        # BUGFIX: the original computed torch.argmax(probs, dim=1) twice in a
        # single tuple assignment; compute it once.
        pred_idx = torch.argmax(probs, dim=1).item()
        pred_prob_val = probs[0, pred_idx].item()

        if input_embeds_for_grad.grad is not None:
            input_embeds_for_grad.grad.zero_()
        logit_output[0, pred_idx].backward()
        if input_embeds_for_grad.grad is None:
            empty_df = pd.DataFrame(columns=['token', 'score'])
            empty_fig_placeholder = create_empty_plot()
            return "<p style='color:red;'>분석 오류: 그래디언트 계산 실패.</p>", [], "분석 오류", "N/A", [], empty_df, empty_fig_placeholder

        grads = input_embeds_for_grad.grad.clone().detach()
        # Gradient x Input importance: L2 norm over the hidden dimension.
        scores = (grads * input_embeds_detached).norm(dim=2).squeeze(0)
        scores_np = scores.cpu().numpy()
        valid_scores = scores_np[np.isfinite(scores_np)]
        # Normalize to [0, 1] by the max finite score; all-zero/NaN falls back to zeros.
        scores_np = scores_np / (valid_scores.max() + 1e-9) if len(valid_scores) > 0 and valid_scores.max() > 0 else np.zeros_like(scores_np)

        tokens_raw = tokenizer.convert_ids_to_tokens(input_ids[0], skip_special_tokens=False)
        # NOTE(review): slicing by len(actual_tokens) assumes padding only at
        # the end of the sequence — true for a single right-padded sentence.
        actual_tokens = [tok for i, tok in enumerate(tokens_raw) if input_ids[0, i] != tokenizer.pad_token_id]
        actual_scores_np = scores_np[:len(actual_tokens)]
        actual_input_embeds = input_embeds_detached[0, :len(actual_tokens), :].cpu().numpy()

        html_tokens_list, highlighted_text_data = [], []
        cls_token_id, sep_token_id = tokenizer.cls_token_id, tokenizer.sep_token_id

        for i, tok_str in enumerate(actual_tokens):
            clean_tok_str = tok_str.replace("##", "") if "##" not in tok_str else tok_str[2:]
            current_score = actual_scores_np[i]
            current_score_clipped = max(0, min(1, current_score))
            current_token_id = input_ids[0, i].item()

            if current_token_id == cls_token_id or current_token_id == sep_token_id:
                # Special tokens: bold text, no highlight and no score.
                html_tokens_list.append(f"<span style='font-weight:bold;'>{html.escape(clean_tok_str)}</span>")
                highlighted_text_data.append((clean_tok_str + " ", None))
            else:
                color = f"rgba(255, 0, 0, {current_score_clipped:.2f})"
                html_tokens_list.append(f"<span style='background-color:{color}; padding: 1px 2px; margin: 1px; border-radius: 3px; display:inline-block;'>{html.escape(clean_tok_str)}</span>")
                highlighted_text_data.append((clean_tok_str + " ", round(current_score_clipped, 3)))

        html_output_str = " ".join(html_tokens_list).replace(" ##", "")

        # Top-K tokens for the table and the bar plot (special tokens excluded).
        top_tokens_for_df, top_tokens_for_barplot_list = [], []
        valid_indices = [idx for idx, token_id in enumerate(input_ids[0, :len(actual_tokens)].tolist())
                         if token_id not in [cls_token_id, sep_token_id]]
        sorted_valid_indices = sorted(valid_indices, key=lambda idx: -actual_scores_np[idx])
        for token_idx in sorted_valid_indices[:top_k_value]:
            token_str = actual_tokens[token_idx]
            score_val_str = f"{actual_scores_np[token_idx]:.3f}"
            top_tokens_for_df.append([token_str, score_val_str])
            top_tokens_for_barplot_list.append({"token": token_str, "score": actual_scores_np[token_idx]})

        barplot_df = pd.DataFrame(top_tokens_for_barplot_list) if top_tokens_for_barplot_list else pd.DataFrame(columns=['token', 'score'])

        # Human-readable class label (falls back to the raw index).
        predicted_class_label_str = str(pred_idx)
        if class_names is not None and 0 <= pred_idx < len(class_names):
            predicted_class_label_str = str(class_names[pred_idx])

        prediction_summary_text = f"클래스: {predicted_class_label_str}\n확률: {pred_prob_val:.3f}"
        prediction_details_for_label = {"예측 클래스": predicted_class_label_str, "확률": f"{pred_prob_val:.3f}"}

        # 3D PCA of non-special token input embeddings.
        non_special_token_indices = [idx for idx, token_id in enumerate(input_ids[0, :len(actual_tokens)].tolist())
                                     if token_id not in [cls_token_id, sep_token_id]]

        # PCA requires n_samples >= n_components (3 components here).
        if len(non_special_token_indices) >= 3:
            pca_tokens = [actual_tokens[i] for i in non_special_token_indices]
            pca_embeddings = actual_input_embeds[non_special_token_indices, :]
            pca_scores = actual_scores_np[non_special_token_indices]

            pca = PCA(n_components=3, random_state=SEED)
            token_embeddings_3d = pca.fit_transform(pca_embeddings)
            pca_fig = plot_token_pca_3d(token_embeddings_3d, pca_tokens, pca_scores)
        else:
            pca_fig = create_empty_plot("PCA Plot N/A\n(Not enough non-special tokens for 3D)")

        return (html_output_str, highlighted_text_data,
                prediction_summary_text, prediction_details_for_label,
                top_tokens_for_df, barplot_df,
                pca_fig)  # 7 values; the attention map output was replaced by pca_fig

    except Exception as e:
        import traceback
        tb_str = traceback.format_exc()
        error_html = f"<p style='color:red;'>분석 중 오류 발생: {html.escape(str(e))}</p><pre>{html.escape(tb_str)}</pre>"
        print(f"Analyze_sentence_for_gradio error: {e}\n{tb_str}")
        empty_df = pd.DataFrame(columns=['token', 'score'])
        empty_fig_placeholder = create_empty_plot("Error during plot generation")
        return error_html, [], "분석 실패", {"오류": str(e)}, [], empty_df, empty_fig_placeholder
230
 
231
 
232
# ────────── Gradio interface definition (attention-map tab removed) ──────────
theme = gr.themes.Glass(primary_hue="blue", secondary_hue="cyan", neutral_hue="sky").set(
    body_background_fill="linear-gradient(to right, #c9d6ff, #e2e2e2)",  # gradient background
    block_background_fill="rgba(255,255,255,0.8)",  # translucent block background
    block_border_width="1px",
    block_shadow="*shadow_drop_lg"
)

with gr.Blocks(title="AI 문장 분석기 XAI 🚀", theme=theme, css=".gradio-container {max-width: 98% !important;}") as demo:
    gr.Markdown("# 🚀 AI 문장 분석기 XAI: 모델 해석 탐험")
    gr.Markdown("BERT 모델 예측의 근거를 다양한 시각화 기법으로 탐색합니다. 토큰의 중요도와 임베딩 공간에서의 분포를 확인해보세요.")

    with gr.Row(equal_height=False):
        # Left column: input controls.
        with gr.Column(scale=1, min_width=300):
            with gr.Group():
                gr.Markdown("### ✏️ 문장 입력 & 설정")
                input_sentence = gr.Textbox(lines=5, label="분석할 영어 문장", placeholder="여기에 분석하고 싶은 영어 문장을 입력하세요...")
                input_top_k = gr.Slider(minimum=1, maximum=10, value=5, step=1, label="Top-K 토큰 수")
                submit_button = gr.Button("분석 시작 💫", variant="primary", scale=1)

        # Right column: prediction results.
        with gr.Column(scale=2):
            with gr.Accordion("🎯 예측 결과", open=True):
                output_prediction_summary = gr.Textbox(label="간단 요약", lines=2, interactive=False)
                output_prediction_details = gr.Label(label="상세 정보")  # Label renders a dict nicely
            with gr.Accordion("⭐ Top-K 중요 토큰 (표)", open=True):
                output_top_tokens_df = gr.DataFrame(headers=["Token", "Score"], label="중요도 높은 토큰",
                                                    row_count=(1, "dynamic"), col_count=(2, "fixed"), interactive=False, wrap=True)

    with gr.Tabs() as tabs:
        with gr.TabItem("🎨 HTML 하이라이트", id=0):
            output_html_visualization = gr.HTML(label="토큰별 중요도 (Gradient x Input)")
        with gr.TabItem("🖍️ 텍스트 하이라이트", id=1):
            output_highlighted_text = gr.HighlightedText(
                label="중요도 기반 텍스트 하이라이트 (점수: 0~1)",
                show_legend=True,
                combine_adjacent=False
            )
        with gr.TabItem("📊 Top-K 막대 그래프", id=2):
            # BUGFIX: removed the garbled keyword "color_legend_ έναντι=None",
            # which contained a space inside the identifier (a SyntaxError).
            output_top_tokens_barplot = gr.BarPlot(label="Top-K 토큰 중요도", x="token", y="score", tooltip=['token', 'score'], min_width=300)
        with gr.TabItem("🌐 토큰 임베딩 3D PCA", id=3):  # PCA tab instead of the attention map
            output_pca_plot = gr.Plot(label="토큰 임베딩 3D PCA (중요도 색상)")

    gr.Markdown("---")
    gr.Examples(
        examples=[
            ["This movie is an absolute masterpiece, captivating from start to finish.", 5],
            ["Despite some flaws, the film offers a compelling narrative.", 3],
            ["I was thoroughly disappointed with the lackluster performance and predictable plot.", 4]
        ],
        inputs=[input_sentence, input_top_k],
        outputs=[  # 7 outputs, matching the analysis function's return tuple
            output_html_visualization, output_highlighted_text,
            output_prediction_summary, output_prediction_details,
            output_top_tokens_df, output_top_tokens_barplot,
            output_pca_plot
        ],
        fn=analyze_sentence_for_gradio,
        cache_examples=False
    )
    # BUGFIX: dropped unsafe_allow_html=True — that is a Streamlit argument;
    # gr.Markdown does not accept it and would raise TypeError at startup.
    gr.Markdown("<p style='text-align: center; color: #666;'>Explainable AI Demo with Gradio & Transformers</p>")

    submit_button.click(
        fn=analyze_sentence_for_gradio,
        inputs=[input_sentence, input_top_k],
        outputs=[  # 7 outputs, matching the analysis function's return tuple
            output_html_visualization, output_highlighted_text,
            output_prediction_summary, output_prediction_details,
            output_top_tokens_df, output_top_tokens_barplot,
            output_pca_plot
        ],
        api_name="explain_sentence_xai"
    )

if __name__ == "__main__":
    if not MODELS_LOADED_SUCCESSFULLY:
        # Launch anyway so the UI can surface the loading error to the user.
        print("*" * 80)
        print(f"경고: 모델 로딩 실패! {MODEL_LOADING_ERROR_MESSAGE}")
        print("Gradio UI는 표시되지만 분석 기능이 제대로 작동하지 않습니다.")
        print("*" * 80)
    demo.launch()