Update app.py
app.py
CHANGED
@@ -6,16 +6,7 @@ import pytesseract
 import pandas as pd
 import plotly.express as px
 
-# ✅
-OFFENSIVE_CATEGORIES = {
-    "Insult": ["蠢货", "白痴", "废物"],
-    "Abuse": ["去死", "打死", "宰了你"],
-    "Discrimination": ["女司机", "娘娘腔", "黑鬼"],
-    "HateSpeech": ["灭族", "屠杀", "灭绝"],
-    "Vulgarity": ["艹", "sb", "尼玛"]
-}
-
-# ✅ Model initialization (structure unchanged)
+# ✅ Step 1: Emoji translation model
 emoji_model_id = "JenniferHJF/qwen1.5-emoji-finetuned"
 emoji_tokenizer = AutoTokenizer.from_pretrained(emoji_model_id, trust_remote_code=True)
 emoji_model = AutoModelForCausalLM.from_pretrained(
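Note: the generate/decode round trip that consumes this model sits in unchanged lines between the hunks, so it is not shown in the diff. A minimal sketch of that translate step, assuming stock transformers generation APIs (the max_new_tokens budget is a guess, not taken from the commit):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

emoji_model_id = "JenniferHJF/qwen1.5-emoji-finetuned"
emoji_tokenizer = AutoTokenizer.from_pretrained(emoji_model_id, trust_remote_code=True)
emoji_model = AutoModelForCausalLM.from_pretrained(emoji_model_id, trust_remote_code=True)
emoji_model.eval()

def translate_emoji(text: str) -> str:
    # Same prompt format the app uses: the fine-tuned model rewrites emojis as plain text
    prompt = f"输入:{text}\n输出:"
    inputs = emoji_tokenizer(prompt, return_tensors="pt").to(emoji_model.device)
    with torch.no_grad():
        output_ids = emoji_model.generate(**inputs, max_new_tokens=64)  # budget assumed
    decoded = emoji_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Keep only the part after the "输出:" marker, as the app does
    return decoded.split("输出:")[-1].strip() if "输出:" in decoded else decoded.strip()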
@@ -25,34 +16,71 @@ emoji_model = AutoModelForCausalLM.from_pretrained(
 ).to("cuda" if torch.cuda.is_available() else "cpu")
 emoji_model.eval()
 
+# ✅ Step 2: Classification model configuration
 model_options = {
     "Toxic-BERT": "unitary/toxic-bert",
     "Roberta Offensive": "cardiffnlp/twitter-roberta-base-offensive",
     "BERT Emotion": "bhadresh-savani/bert-base-go-emotion"
 }
 
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# Radar-chart category mapping system
+category_system = {
+    "Insult": ["侮辱", "贬低", "人身攻击"],
+    "Abuse": ["威胁", "暴力", "骚扰"],
+    "Discrimination": ["种族", "性别", "宗教"],
+    "Hate Speech": ["仇恨", "极端言论"],
+    "Vulgarity": ["脏话", "低俗", "性暗示"]
+}
+
+# Model-to-category mapping
+model_category_map = {
+    "Toxic-BERT": {
+        "toxic": ["Vulgarity"],
+        "severe_toxic": ["Abuse"],
+        "obscene": ["Vulgarity"],
+        "threat": ["Abuse", "Hate Speech"],
+        "insult": ["Insult"],
+        "identity_hate": ["Discrimination", "Hate Speech"]
+    },
+    "Roberta Offensive": {
+        "offensive": ["Insult", "Abuse"]
+    },
+    "BERT Emotion": {
+        "anger": ["Abuse"],
+        "disgust": ["Vulgarity"]
+    }
+}
+
+# ✅ Page configuration
+st.set_page_config(page_title="Emoji Offensive Text Detector", page_icon="🚨", layout="wide")
+
+# ✅ Sidebar configuration
+with st.sidebar:
+    st.header("🧠 Configuration")
+    selected_model = st.selectbox("Choose classification model", list(model_options.keys()))
+    selected_model_id = model_options[selected_model]
 
-
-
+    # Dynamically adjust classifier parameters
+    classifier_config = {
+        "device": 0 if torch.cuda.is_available() else -1,
+        "top_k": None if selected_model == "Toxic-BERT" else 1
+    }
+    if selected_model == "Toxic-BERT":
+        classifier_config["function_to_apply"] = "sigmoid"
+
+    classifier = pipeline(
+        "text-classification",
+        model=selected_model_id,
+        **classifier_config
+    )
 
-#
+# Initialize history
+if "history" not in st.session_state:
+    st.session_state.history = []
+
+# ✅ Core classification function
 def classify_emoji_text(text: str):
+    # Emoji translation
     prompt = f"输入:{text}\n输出:"
     input_ids = emoji_tokenizer(prompt, return_tensors="pt").to(emoji_model.device)
     with torch.no_grad():
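Note: Toxic-BERT is a multi-label model, so the hunk above requests every label with top_k=None plus sigmoid activation, while the other models keep only the single best label with top_k=1. A hedged sketch of the difference, assuming the stock transformers text-classification pipeline (the exact nesting of the returned list varies across transformers versions, which matters for the main_result = classifier(...)[0] indexing later in the diff):

from transformers import pipeline

# All six Toxic-BERT heads, each sigmoid-activated independently
multi = pipeline("text-classification", model="unitary/toxic-bert",
                 top_k=None, function_to_apply="sigmoid")

# Only the single best label
single = pipeline("text-classification", model="unitary/toxic-bert", top_k=1)

print(multi("you are an idiot"))   # e.g. [{'label': 'toxic', ...}, {'label': 'insult', ...}, ...]
print(single("you are an idiot"))  # e.g. [{'label': 'toxic', 'score': 0.9...}]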
@@ -60,67 +88,49 @@ def classify_emoji_text(text: str):
     decoded = emoji_tokenizer.decode(output_ids[0], skip_special_tokens=True)
     translated_text = decoded.split("输出:")[-1].strip() if "输出:" in decoded else decoded.strip()
 
-
-
-    score = result["score"]
-    reasoning = f"The sentence was flagged as '{label}' due to potentially offensive phrases."
+    # Overall classification
+    main_result = classifier(translated_text)[0]
 
-    #
-
+    # Element-level analysis
+    elements = translated_text.split()
+    element_analysis = []
+    radar_scores = {category: 0.0 for category in category_system}
 
+    for elem in elements:
+        try:
+            results = classifier(elem)
+            for res in results:
+                for model_label in model_category_map.get(selected_model, {}):
+                    if res["label"] == model_label:
+                        score = res["score"]
+                        for category in model_category_map[selected_model][model_label]:
+                            if score > radar_scores[category]:
+                                radar_scores[category] = score
+                            element_analysis.append({
+                                "Element": elem,
+                                "Original": text.split()[elements.index(elem)] if len(text.split()) > elements.index(elem) else "",
+                                "Category": category,
+                                "Score": score
+                            })
+        except Exception as e:
+            continue
+
+    # Record history
     st.session_state.history.append({
         "text": text,
         "translated": translated_text,
-        "label": label,
-        "score": score,
-        "reason": reasoning,
-        "
-    })
-    return translated_text, label, score, reasoning, category_scores
-
-# ✅ Visualization helper function
-def generate_radar_chart(scores_dict: dict):
-    radar_df = pd.DataFrame({
-        "Category": list(scores_dict.keys()),
-        "Score": list(scores_dict.values())
+        "label": main_result["label"],
+        "score": main_result["score"],
+        "elements": element_analysis,
+        "radar": radar_scores
     })
 
-    fig = px.line_polar(
-        radar_df,
-        r='Score',
-        theta='Category',
-        line_close=True,
-        color_discrete_sequence=['#FF6B6B'],
-        title="🛡️ Multi-Dimensional Offensive Analysis"
-    )
-    fig.update_layout(
-        polar=dict(
-            radialaxis=dict(
-                visible=True,
-                range=[0, 1],
-                tickvals=[0, 0.3, 0.7, 1],
-                ticktext=["Safe", "Caution", "Risk", "Danger"]
-            )),
-        showlegend=False
-    )
-    return fig
-
-# ✅ Page configuration (structure unchanged)
-st.set_page_config(page_title="Emoji Offensive Text Detector", page_icon="🚨", layout="wide")
-
-with st.sidebar:
-    st.header("🧠 Configuration")
-    selected_model = st.selectbox("Choose classification model", list(model_options.keys()))
-    selected_model_id = model_options[selected_model]
-    classifier = pipeline("text-classification", model=selected_model_id, device=0 if torch.cuda.is_available() else -1)
-
-if "history" not in st.session_state:
-    st.session_state.history = []
+    return translated_text, main_result["label"], main_result["score"], radar_scores
 
-#
+# ✅ Main UI
 st.title("🚨 Emoji Offensive Text Detector & Analysis Dashboard")
 
-#
+# Text input module
 st.subheader("1. 输入与分类")
 default_text = "你是🐷"
 text = st.text_area("Enter sentence with emojis:", value=default_text, height=150)
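Note: the element loop in the hunk above folds raw model labels into the five radar categories by keeping a per-category maximum. The same aggregation as a self-contained sketch (function name and sample scores are mine, not the commit's):

def aggregate_radar(raw_scores: dict, label_map: dict, categories: list) -> dict:
    # Each model label feeds every category it maps to; keep the max score per category
    radar = {cat: 0.0 for cat in categories}
    for label, score in raw_scores.items():
        for cat in label_map.get(label, []):
            radar[cat] = max(radar[cat], score)
    return radar

toxic_bert_map = {
    "toxic": ["Vulgarity"], "severe_toxic": ["Abuse"], "obscene": ["Vulgarity"],
    "threat": ["Abuse", "Hate Speech"], "insult": ["Insult"],
    "identity_hate": ["Discrimination", "Hate Speech"],
}
categories = ["Insult", "Abuse", "Discrimination", "Hate Speech", "Vulgarity"]
print(aggregate_radar({"toxic": 0.91, "insult": 0.74}, toxic_bert_map, categories))
# {'Insult': 0.74, 'Abuse': 0.0, 'Discrimination': 0.0, 'Hate Speech': 0.0, 'Vulgarity': 0.91}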
@@ -128,92 +138,99 @@ text = st.text_area("Enter sentence with emojis:", value=default_text, height=150)
 if st.button("🚦 Analyze Text"):
     with st.spinner("🔍 Processing..."):
         try:
-            translated, label, score, reason, category_scores = classify_emoji_text(text)
-
+            translated, label, score, radar = classify_emoji_text(text)
+
             st.markdown("**Translated sentence:**")
             st.code(translated, language="text")
-
-            st.
-
-
+
+            col1, col2 = st.columns(2)
+            with col1:
+                st.metric("Prediction", f"{label} 🔴" if score > 0.5 else f"{label} 🟢")
+            with col2:
+                st.metric("Confidence", f"{score:.2%}")
+
+            st.markdown("**Model Explanation:**")
+            st.info(f"文本被识别为「{label}」,建议检查以下内容:")
+            for cat, score in radar.items():
+                if score > 0.5:
+                    st.markdown(f"- ❗ **{cat}** 风险 ({score:.2%})")
+        except Exception as e:
+            st.error(f"❌ Error: {e}")
+
+# Image analysis module
 st.markdown("---")
 st.subheader("2. 图片 OCR & 分类")
 uploaded_file = st.file_uploader("Upload an image (JPG/PNG)", type=["jpg","jpeg","png"])
 if uploaded_file:
     image = Image.open(uploaded_file)
     st.image(image, caption="Uploaded Screenshot", use_column_width=True)
+
     with st.spinner("🧠 Extracting text via OCR..."):
         ocr_text = pytesseract.image_to_string(image, lang="chi_sim+eng").strip()
+
     if ocr_text:
         st.markdown("**Extracted Text:**")
         st.code(ocr_text)
-
-
-
-
-
-
-        st.info(reason)
+
+        try:
+            translated, label, score, radar = classify_emoji_text(ocr_text)
+            st.markdown(f"**Prediction:** {label} ({score:.2%})")
+        except Exception as e:
+            st.error(f"OCR分析错误: {e}")
     else:
-        st.info("⚠️
+        st.info("⚠️ 未检测到文字内容")
 
-#
+# Analytics dashboard
 st.markdown("---")
-st.subheader("3.
+st.subheader("3. 风险分析仪表盘")
 if st.session_state.history:
-
-
-    st.markdown("### 🧾 Offensive Terms & Suggestions")
-    for item in st.session_state.history:
-        st.markdown(f"- 🔹 **Input:** {item['text']}")
-        st.markdown(f"  - ✨ **Translated:** {item['translated']}")
-        st.markdown(f"  - ❗ **Label:** {item['label']} with **{item['score']:.2%}** confidence")
-        st.markdown(f"  - 🔧 **Suggestion:** {item['reason']}")
-
+    latest = st.session_state.history[-1]
+
     # Radar chart
+    st.markdown("### ⚠️ 风险雷达图")
     radar_df = pd.DataFrame({
-        "Category": ["
-        "Score": [
+        "Category": latest["radar"].keys(),
+        "Score": latest["radar"].values()
     })
-
-
-
-
-
-
-
-
-
-
-
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    max_display = 5
-    # st.toggle requires Streamlit 1.22+; fall back to st.checkbox on older versions
-    show_more = st.toggle("Show more words", value=False)
-
-    display_df = word_df if show_more else word_df.head(max_display)
-    # Hide borders and render as an HTML table
-    st.markdown(
-        display_df.to_html(index=False, border=0),
-        unsafe_allow_html=True
-    )
+    fig = px.line_polar(
+        radar_df,
+        r="Score",
+        theta="Category",
+        line_close=True,
+        range_r=[0,1],
+        template="plotly_dark"
+    )
+    fig.update_traces(fill="toself", line_color="red")
+    st.plotly_chart(fig, use_container_width=True)
+
+    # Element contribution analysis
+    st.markdown("### 🧩 风险元素分解表")
+    if latest["elements"]:
+        element_df = pd.DataFrame(latest["elements"])
+        element_df = element_df.sort_values(by=["Score", "Category"], ascending=False)
+
+        # Grouped display
+        for category in category_system:
+            cat_df = element_df[element_df["Category"] == category]
+            if not cat_df.empty:
+                with st.expander(f"{category} 风险元素 ({len(cat_df)}项)"):
+                    st.dataframe(
+                        cat_df[["Element", "Original", "Score"]]
+                        .style.highlight_between(subset="Score", color="#ffcccc"),
+                        use_container_width=True,
+                        hide_index=True
+                    )
     else:
-        st.info("
+        st.info("✅ 未检测到高风险元素")
+
+    # Analysis history
+    st.markdown("### 📜 分析历史")
+    history_df = pd.DataFrame(st.session_state.history)
+    st.dataframe(
+        history_df[["text", "label", "score"]]
+        .style.applymap(lambda x: "color: red" if x == "OFFENSIVE" else ""),
+        use_container_width=True,
+        hide_index=True
+    )
 else:
-    st.info("
+    st.info("🕑 等待首次分析结果...")
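Note: the section-3 radar figure from the final hunk, reproduced as a standalone sketch with dummy scores so it runs outside Streamlit. Wrapping .keys()/.values() in list() is the safer form of the DataFrame construction used in the commit; px.line_polar, range_r, and fill="toself" are standard plotly.express/plotly API:

import pandas as pd
import plotly.express as px

# Dummy radar scores standing in for latest["radar"]
scores = {"Insult": 0.7, "Abuse": 0.2, "Discrimination": 0.1,
          "Hate Speech": 0.3, "Vulgarity": 0.8}
radar_df = pd.DataFrame({"Category": list(scores.keys()),
                         "Score": list(scores.values())})
fig = px.line_polar(radar_df, r="Score", theta="Category",
                    line_close=True, range_r=[0, 1], template="plotly_dark")
fig.update_traces(fill="toself", line_color="red")
fig.show()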
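One caveat on the history table: Styler.applymap still works but is the pre-pandas-2.1 name of Styler.map, and the highlight compares against "OFFENSIVE" even though the bundled models emit lowercase labels such as "toxic" or "offensive", so the red styling may never fire. A case-insensitive sketch (the label set here is an assumption, not from the commit):

import pandas as pd

history = [
    {"text": "你是🐷", "label": "toxic", "score": 0.93},
    {"text": "hello", "label": "neutral", "score": 0.88},
]
history_df = pd.DataFrame(history)

# Highlight risky labels regardless of casing; use .applymap instead of .map on pandas < 2.1
styled = history_df[["text", "label", "score"]].style.map(
    lambda v: "color: red" if str(v).lower() in {"offensive", "toxic"} else ""
)
print(styled.to_html())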