# AutoCenzurer / static/config.py
# Author: HunterNope
# Commit: ACZ-1 - Created gradio app for HF Space (rev 5f2a5b3)
# Severity ranking of the span-level labels; a larger value means a more
# severe category (matches the "Labels (ordered by severity)" line in the
# span-labelling prompt below).
LABEL_ORDER = dict(
    NONE=0,
    HATE_SPEECH_GENERAL=1,
    EXTREMISM_PROMOTION=2,
    HARASSMENT_OBSCENITY=3,
)
# Label names used by the binary (censure / no-censure) classifier.
BINARY_CENSURE_LABEL = "CENSURE_LABEL"
BINARY_NONE_LABEL = "NONE_LABEL"
# Label name -> numeric class value (censure = 1, none = 0); insertion
# order kept identical to the original literal (censure first).
BINARY_LABEL_TO_CLASS_VALUES = dict(
    zip((BINARY_CENSURE_LABEL, BINARY_NONE_LABEL), (1, 0))
)
# Label names used by the three-way (multiclass) classifier.
MULTICLASS_CONFLICTUAL_LABEL = "CONFLICTUAL"
MULTICLASS_UNHARMFUL_PROFANITY_LABEL = "UNHARMFUL_PROFANITY"
MULTICLASS_NONE_LABEL = "NONE"
# Label name -> numeric class value, most severe first; insertion order
# kept identical to the original literal.
MULTICLASS_LABEL_TO_CLASS_VALUES = dict(
    zip(
        (
            MULTICLASS_CONFLICTUAL_LABEL,
            MULTICLASS_UNHARMFUL_PROFANITY_LABEL,
            MULTICLASS_NONE_LABEL,
        ),
        (2, 1, 0),
    )
)
# --- Span-building / audio-muting tuning constants ---------------------------
# NOTE(review): units below are inferred from the pipeline overview at the
# bottom of this file (ASR words -> spans -> mute intervals); confirm against
# build_spans() / merge_intervals() / mute_audio().
MERGE_GAP = 0.3        # max gap between intervals to merge them — assumed seconds; TODO confirm
MAX_WORDS = 10         # max words per span fed to the classifier
OVERLAP_WORDS = 3      # words of overlap between consecutive spans
PAUSE_THRESHOLD = 0.55 # pause length that splits a span — assumed seconds; TODO confirm
PAD_BEFORE = 0.25      # padding added before a muted interval (was 0.5)
PAD_AFTER = 0.4        # padding added after a muted interval (was 0.8)
# Prompt for the severity-ordered span labeller: the LLM must pick exactly one
# of the four labels from LABEL_ORDER and return strict JSON.
# Fixes vs. original: "with exact the following structure" -> "with exactly
# the following structure" (grammar in the output-format contract).
LLM_BASE_PROMPT_LABEL_SPAN = """
Role:
You are an Executive Judge responsible for classifying text spans for harmful content under a fixed policy.
Scope:
Each span is independent. No external context, history, intent, framing, or hypothetical interpretation is allowed.
Task:
Given the Span Input, assign exactly one label that reflects the highest severity of harmful content present in the text.
Restrictions:
- The span_text is static and must not be reinterpreted, softened, justified, or excused.
- Quoted, educational, fictional, artistic, or hypothetical harmful speech is treated as harmful.
- Do not invent assumptions beyond the span_text and provided signals.
- Do not output multiple labels.
- Do not deviate from the specified output format.
Labels (ordered by severity):
NONE < HATE_SPEECH_GENERAL < EXTREMISM_PROMOTION < HARASSMENT_OBSCENITY
Output Format:
MUST return ONLY valid JSON WITHOUT any artifacts with exactly the following structure:
{
"label": "NONE | HATE_SPEECH_GENERAL | EXTREMISM_PROMOTION | HARASSMENT_OBSCENITY",
"confidence": "LOW | MEDIUM | HIGH",
"rationale": "Concise empirical justification citing explicit phrases or patterns from span_text"
}
"""
# Prompt for the binary classifier (CENSURE_LABEL vs NONE_LABEL); label names
# must match BINARY_LABEL_TO_CLASS_VALUES above.
# Fixes vs. original: "caster" -> "caste" in the protected-category list, and
# "with exact the following structure" -> "with exactly the following
# structure".
LLM_PROMPT_BINARY_CLASSIFICATION = """
Role:
You are an Executive Judge responsible for classifying text spans for harmful content under a fixed policy.
Scope:
Each span is independent. No external context, history, intent, framing, or hypothetical interpretation is allowed.
Task:
Given the Span Input, assign exactly one label that reflects the highest severity of harmful content present in the text.
Restrictions:
- The span_text is static and must not be reinterpreted, softened, justified, or excused.
- Quoted, educational, fictional, artistic, or hypothetical harmful speech is treated as harmful.
- Do not invent assumptions beyond the span_text and provided signals.
- Do not output multiple labels.
- Do not deviate from the specified output format.
Labels:
CENSURE_LABEL -> Use this label to mark any span considered to be Conflictual language. An attack based on protected (race, color, caste, gender, etc) or other categories.
NONE_LABEL -> Mark with it any other content.
Output Format:
MUST return ONLY valid JSON WITHOUT any artifacts with exactly the following structure:
{
"label": "NONE_LABEL | CENSURE_LABEL",
"confidence": "LOW | MEDIUM | HIGH",
"rationale": "Concise empirical justification citing explicit phrases or patterns from span_text"
}
"""
# Prompt for the three-way classifier (CONFLICTUAL / UNHARMFUL_PROFANITY /
# NONE); label names must match MULTICLASS_LABEL_TO_CLASS_VALUES above.
# Fixes vs. original: "caster" -> "caste" in the protected-category list, and
# "with exact the following structure" -> "with exactly the following
# structure".
LLM_PROMPT_MULTICLASS_CLASSIFICATION = """
Role:
You are an Executive Judge responsible for classifying text spans for harmful content under a fixed policy.
Scope:
Each span is independent. No external context, history, intent, framing, or hypothetical interpretation is allowed.
Task:
Given the Span Input, assign exactly one label that is suitable for the span content.
Restrictions:
- The span_text is static and must not be reinterpreted, softened, justified, or excused.
- Quoted, educational, fictional, artistic, or hypothetical harmful speech is treated as harmful.
- Do not invent assumptions beyond the span_text and provided signals.
- Do not output multiple labels.
- Do not deviate from the specified output format.
Labels:
CONFLICTUAL -> Any span considered to be Conflictual language. An attack based on protected (race, color, caste, gender, etc) or other categories.
UNHARMFUL_PROFANITY -> Language containing slurs and profanity, but not directed toward a specific entity.
NONE -> Any other content, not suitable for given labels.
Output Format:
MUST return ONLY valid JSON WITHOUT any artifacts with exactly the following structure:
{
"label": "NONE | CONFLICTUAL | UNHARMFUL_PROFANITY",
"confidence": "LOW | MEDIUM | HIGH",
"rationale": "Concise empirical justification citing explicit phrases or patterns from span_text"
}
"""
### Deterministic signals profanity lists ###
# Word lists consumed by the deterministic-signal stage of the pipeline
# (see the overview string at the bottom of this file). Membership tests
# only, so plain sets are used throughout.
PROFANITY = set("fuck shit bitch asshole dick bastard".split())
SLURS = set("nigger faggot retard kike chink".split())
INSULTS = set("idiot dumbass moron stupid loser".split())
TARGETS = set("you your he she they this that".split())
VIOLENCE_VERBS = set(
    "kill murder hurt harm attack shoot stab punch beat destroy rape".split()
)
THREAT_VERBS = set("will gonna going should hope wish want".split())
THREAT_MODALS = set("should would could might".split())
VIOLENT_OUTCOMES = set("die dead death blood pain suffer suffering".split())
# Combined vocabulary of everything treated as a curse word.
CURSE_VOCAB = set().union(PROFANITY, SLURS, INSULTS)
# Human-readable overview of the end-to-end processing flow. Appears to be
# documentation only — NOTE(review): confirm no consumer parses this string
# before editing its contents.
pipeline = """
Audio / Video
-
Audio extraction (.wav)
-
ASR (Whisper)
-
words_df
-
build_spans()
-
span_df
-
deterministic signals
-
min_allowed_label
-
LLM classification
-
final_enforced_label
-
filter label != NONE
-
extract intervals
-
merge_intervals()
-
mute_audio()
"""