Spaces:
Sleeping
Sleeping
| import os | |
| import io | |
| import re | |
| import fitz # PyMuPDF | |
| from docx import Document | |
| from dotenv import load_dotenv | |
| from langdetect import detect | |
| from openai import OpenAI | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
# Populate the process environment via python-dotenv before any setting is read.
load_dotenv()

# Shared OpenAI client used by the analysis helpers; the API key is injected
# through the OPENAI_API_KEY environment variable rather than hard-coded.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
# System prompts keyed by language code. Every prompt lists the same four
# criteria (hate speech, discrimination, misinformation, stereotypes), asks for
# a 0-5 rating per criterion plus a total out of 100, and closes with an
# instruction to answer in that prompt's own language.
PROMPTS = {
    # English
    "en": (
        "You are a media ethics analyst. Please evaluate the following text according to the criteria:\n"
        "1. Does it contain hate speech?\n"
        "2. Does it show racial or cultural discrimination?\n"
        "3. Does it spread misinformation?\n"
        "4. Does it contain stereotypes or misleading impressions?\n"
        "Rate each item from 0 to 5 and explain briefly. Then provide an overall score out of 100 and a summary.\n"
        "⚠️ Please write the entire response in English."
    ),
    # Chinese
    "zh": (
        "你是一位媒体伦理分析专家。请根据以下标准分析文本:\n"
        "1. 是否包含仇恨言论?\n"
        "2. 是否存在种族或文化歧视?\n"
        "3. 是否传播虚假信息?\n"
        "4. 是否有刻板印象或引导性误导?\n"
        "请每项打分(0~5)并说明理由。最后提供总评分(满分100)和简要结论。\n"
        "⚠️ 请用中文完整作答。"
    ),
    # Spanish
    "es": (
        "Eres un analista de ética mediática. Evalúa el texto con base en:\n"
        "1. ¿Contiene discurso de odio?\n"
        "2. ¿Muestra discriminación racial o cultural?\n"
        "3. ¿Difunde información falsa?\n"
        "4. ¿Incluye estereotipos o impresiones engañosas?\n"
        "Califica del 0 al 5, explica brevemente cada punto y da un puntaje total de 100.\n"
        "⚠️ Por favor, responde en español."
    ),
    # French
    "fr": (
        "Vous êtes un analyste en éthique des médias. Veuillez évaluer ce texte :\n"
        "1. Contient-il un discours haineux ?\n"
        "2. Présente-t-il une discrimination raciale ou culturelle ?\n"
        "3. Diffuse-t-il de fausses informations ?\n"
        "4. Contient-il des stéréotypes ou impressions trompeuses ?\n"
        "Attribuez une note de 0 à 5 pour chaque point, puis une note sur 100 et un résumé.\n"
        "⚠️ Veuillez répondre en français."
    ),
    # Russian
    "ru": (
        "Вы — аналитик в области медиаэтики. Проанализируйте текст:\n"
        "1. Есть ли язык вражды?\n"
        "2. Есть ли расовая или культурная дискриминация?\n"
        "3. Содержит ли дезинформацию?\n"
        "4. Есть ли стереотипы или вводящие в заблуждение образы?\n"
        "Оцените каждый пункт по шкале от 0 до 5. Итоговая оценка — из 100.\n"
        "⚠️ Пожалуйста, дайте полный ответ на русском языке."
    ),
    # Arabic
    "ar": (
        "أنت محلل أخلاقيات إعلام. قيّم النص وفقًا لما يلي:\n"
        "1. هل يحتوي على خطاب كراهية؟\n"
        "2. هل يوجد تمييز عنصري أو ثقافي؟\n"
        "3. هل ينشر معلومات مضللة؟\n"
        "4. هل يتضمن صورًا نمطية أو انطباعات مضللة؟\n"
        "قيّم من 0 إلى 5 لكل نقطة، وقدم التقييم النهائي من 100.\n"
        "⚠️ يرجى الرد باللغة العربية بالكامل."
    ),
}
def extract_text(path: str) -> tuple[str, str]:
    """Read a document's text and detect the language it is written in.

    Supported inputs are plain text (``.txt``, decoded as UTF-8), PDF
    (``.pdf``, read page by page with PyMuPDF) and Word documents
    (``.docx``, read paragraph by paragraph with python-docx). The
    extension is matched case-insensitively, so ``REPORT.PDF`` is accepted.

    Args:
        path: Path of the file to read.

    Returns:
        A ``(text, lang)`` tuple, where ``lang`` is the value returned by
        ``detect`` for the extracted text.

    Raises:
        ValueError: If the file extension is not one of the supported types.
    """
    lowered = path.lower()
    if lowered.endswith(".txt"):
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
    elif lowered.endswith(".pdf"):
        # Use the document as a context manager so the underlying file handle
        # is released even if text extraction fails part-way through.
        with fitz.open(path) as doc:
            text = "\n".join(page.get_text() for page in doc)
    elif lowered.endswith(".docx"):
        doc = Document(path)
        text = "\n".join(para.text for para in doc.paragraphs)
    else:
        raise ValueError("❌ Unsupported text file type")
    # NOTE(review): any error raised by detect() (e.g. for text with nothing
    # to analyse, such as an image-only PDF) propagates to the caller.
    lang = detect(text)
    return text, lang
def analyze_text(text: str, lang: str = None) -> str:
    """Ask the chat model for a media-ethics evaluation of ``text``.

    The system prompt is chosen from ``PROMPTS`` by language code, falling
    back to the English prompt when no entry matches.

    Args:
        text: The content to evaluate; sent as the user message.
        lang: Language code used to pick the prompt. When ``None`` the
            language is detected from ``text``.

    Returns:
        The text of the model's first completion choice, or an empty string
        if the model returned no text content.
    """
    if lang is None:
        lang = detect(text)
    # langdetect reports regional codes such as "zh-cn" / "zh-tw", while
    # PROMPTS is keyed by bare codes ("zh"). Try the exact code first, then
    # its base language, before falling back to English.
    base_lang = lang.replace("_", "-").split("-", 1)[0].lower()
    prompt = PROMPTS.get(lang) or PROMPTS.get(base_lang, PROMPTS["en"])
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": text},
        ],
    )
    # message.content is optional in the SDK; keep the declared str contract
    # so downstream parsing never receives None.
    return response.choices[0].message.content or ""
# Matches "<number>/5" (optionally with spaces around the slash). The
# look-arounds reject fragments of larger tokens: the number must not be the
# tail of a longer number, and the "5" must not start a longer denominator
# ("25/50") or a date ("3/5/2024").
_SCORE_PATTERN = re.compile(r"(?<![\d.])(\d+(?:\.\d+)?)\s*/\s*5(?![\d./])")


def extract_scores(result_text: str) -> list[int]:
    """Extract the four per-criterion ratings from a model response.

    Scans ``result_text`` for "N/5" ratings in order of appearance and
    returns the first four. Several ratings on the same line are all
    picked up. Fractional ratings are rounded half-up and values above 5
    are clamped to 5.

    Args:
        result_text: The raw evaluation text produced by the model.

    Returns:
        A list of exactly four integers in the range 0-5. Missing ratings
        are filled with 0 at the end of the list.
    """
    scores = []
    for match in _SCORE_PATTERN.finditer(result_text):
        value = float(match.group(1))
        # Round half-up and clamp so the chart's 0-5 axis is never exceeded.
        scores.append(min(5, int(value + 0.5)))
        if len(scores) == 4:
            break
    # Pad so callers can always rely on four entries.
    scores.extend([0] * (4 - len(scores)))
    return scores
def draw_chart(scores):
    """Render the per-criterion scores as a bar chart and return it as PNG.

    Args:
        scores: The ratings to plot, one per criterion, in the order
            "Hate Speech", "Discrimination", "Misinformation",
            "Stereotyping" (presumably the four values produced by
            ``extract_scores`` - verify against caller).

    Returns:
        An ``io.BytesIO`` positioned at offset 0 that holds the PNG image.
    """
    criteria = ["Hate Speech", "Discrimination", "Misinformation", "Stereotyping"]
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        colors = sns.color_palette("Set2")
        sns.barplot(x=criteria, y=scores, palette=colors, ax=ax)
        ax.set_ylim(0, 5)
        ax.set_ylabel("Score (0–5)")
        ax.set_title("Content Harmfulness Evaluation")
        # Work on the explicit figure/axes rather than pyplot's implicit
        # "current" figure, so other figures cannot be affected.
        ax.tick_params(axis="x", labelrotation=20)
        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format="png")
    finally:
        # pyplot keeps every figure alive until it is closed; release it so
        # repeated calls do not accumulate figures in memory.
        plt.close(fig)
    buf.seek(0)
    return buf