Spaces:

DCL-IBL
/

IfGPT-DataQualityComponents

Running

App Files Files Community

IfGPT-DataQualityComponents / java /bg /bas /dcl /LLMs /BiasEntry.java

dcl-ibl-bas

Upload 22 files

18573e4 verified 5 days ago

raw

history blame contribute delete

5.97 kB

	package bg.bas.dcl.LLMs;

	import java.util.Collections;
	import java.util.HashSet;
	import java.util.LinkedHashSet;
	import java.util.Locale;
	import java.util.Set;

	/**
	 * A single entry of the bias lexicon: one lemma together with its bias
	 * annotations, sentiment scores and known surface forms.
	 *
	 * <p>Instances are immutable: every field is final, string fields are
	 * null-safe and trimmed, and the set of forms is a private unmodifiable copy.
	 *
	 * <p>TSV column order (0-based, tab-separated):
	 * <pre>
	 *  0 word           — canonical lemma
	 *  1 POS            — part of speech (N, A, V, …)
	 *  2 signal         — "true" / "false" : marks identity-group signals
	 *  3 biasType       — gender | race_ethnicity | religion | disability | appearance | "" (general)
	 *  4 biasValue      — positive | negative | neutral | ""
	 *  5 derogatory     — "true" / "false"
	 *  6 colloquial     — "true" / "false"
	 *  7 forms          — "true" / "false" (unused flag; inflected forms are in col 10)
	 *  8 positivity     — double in [0,1]
	 *  9 negativity     — double in [0,1]
	 * 10 inflectedForms — pipe-separated list of surface forms, or empty
	 * </pre>
	 */
	public class BiasEntry {

	    // -----------------------------------------------------------------------
	    // Fields
	    // -----------------------------------------------------------------------

	    private final String word;
	    private final String pos;
	    private final boolean signal;
	    private final String biasType;  // "" means general / not type-specific
	    private final String biasValue; // "" means unscored
	    private final boolean derogatory;
	    private final boolean colloquial;
	    private final double positivity;
	    private final double negativity;

	    /**
	     * Known surface forms of the entry, exactly as supplied to the constructor.
	     * This class does not lowercase the forms or add the lemma itself; the
	     * loader is expected to pass them already normalized.
	     */
	    private final Set<String> forms;

	    // -----------------------------------------------------------------------
	    // Constructor — called by BiasLexicon during TSV loading
	    // -----------------------------------------------------------------------

	    /**
	     * Creates an immutable lexicon entry.
	     *
	     * @param word       canonical lemma; {@code null} becomes {@code ""}, otherwise trimmed
	     * @param pos        part-of-speech tag; {@code null} becomes {@code ""}, otherwise trimmed
	     * @param signal     whether the word is an identity-group signal
	     * @param biasType   bias category; {@code null} becomes {@code ""}, otherwise trimmed
	     * @param biasValue  evaluative polarity; {@code null} becomes {@code ""}, otherwise trimmed
	     * @param derogatory whether the word is marked derogatory
	     * @param colloquial whether the word is marked colloquial
	     * @param positivity positivity score (stored as given, not range-checked)
	     * @param negativity negativity score (stored as given, not range-checked)
	     * @param forms      surface forms; {@code null} is treated as an empty set
	     */
	    public BiasEntry(String word, String pos,
	                     boolean signal, String biasType, String biasValue,
	                     boolean derogatory, boolean colloquial,
	                     double positivity, double negativity,
	                     Set<String> forms) {
	        this.word = word == null ? "" : word.trim();
	        this.pos = pos == null ? "" : pos.trim();
	        this.signal = signal;
	        this.biasType = biasType == null ? "" : biasType.trim();
	        this.biasValue = biasValue == null ? "" : biasValue.trim();
	        this.derogatory = derogatory;
	        this.colloquial = colloquial;
	        this.positivity = positivity;
	        this.negativity = negativity;
	        // Defensive copy: without it, later changes to the caller's set would
	        // show through the unmodifiable view and break immutability.
	        // LinkedHashSet keeps the caller's iteration order.
	        Set<String> copy = forms == null
	                ? new LinkedHashSet<>()
	                : new LinkedHashSet<>(forms);
	        this.forms = Collections.unmodifiableSet(copy);
	    }

	    // -----------------------------------------------------------------------
	    // Accessors
	    // -----------------------------------------------------------------------

	    /** Canonical lemma as it appears in the dictionary. */
	    public String getWord() { return word; }

	    /** Part-of-speech tag (N, A, V, …). */
	    public String getPos() { return pos; }

	    /**
	     * True if this entry marks an identity-group signal word —
	     * i.e. a term that identifies a person by a protected attribute
	     * (e.g. "жена", "мюсюлманин").
	     */
	    public boolean isSignal() { return signal; }

	    /**
	     * Bias category, or empty string if applicable to all categories.
	     * Values: "gender", "race_ethnicity", "religion", "disability", "appearance".
	     */
	    public String getBiasType() { return biasType; }

	    /**
	     * Evaluative polarity of the word in a bias context.
	     * Values: "positive", "negative", "neutral", or "" (unscored).
	     */
	    public String getBiasValue() { return biasValue; }

	    /** True if the word is explicitly marked as derogatory / pejorative. */
	    public boolean isDerogatory() { return derogatory; }

	    /** True if the word is marked as colloquial / informal. */
	    public boolean isColloquial() { return colloquial; }

	    /**
	     * Positivity score in [0, 1] derived from BulNet synset sentiment.
	     * Higher = more positive connotation.
	     */
	    public double getPositivity() { return positivity; }

	    /**
	     * Negativity score in [0, 1] derived from BulNet synset sentiment.
	     * Higher = more negative connotation.
	     */
	    public double getNegativity() { return negativity; }

	    /**
	     * Unmodifiable set of the surface forms given at construction time.
	     * Mutating calls on the returned set throw
	     * {@link UnsupportedOperationException}. Case normalization is the
	     * loader's responsibility; this class stores the forms unchanged.
	     */
	    public Set<String> getForms() { return forms; }

	    // -----------------------------------------------------------------------
	    // Convenience predicates
	    // -----------------------------------------------------------------------

	    /**
	     * True if this entry has a scored, non-neutral polarity — i.e. biasValue
	     * is neither empty (unscored) nor "neutral".
	     */
	    public boolean isEvaluative() {
	        return !biasValue.isEmpty() && !biasValue.equals("neutral");
	    }

	    /** True if biasType is non-empty (i.e. assigned to a specific category). */
	    public boolean isTyped() {
	        return !biasType.isEmpty();
	    }

	    /**
	     * True if this entry can act as an evaluative modifier in a bias pair —
	     * i.e. it has a non-neutral polarity, is derogatory or colloquial, or has
	     * a positivity or negativity score strictly above 0.5.
	     */
	    public boolean isEvaluativeModifier() {
	        return isEvaluative() || derogatory || colloquial
	                || positivity > 0.5 || negativity > 0.5;
	    }

	    // -----------------------------------------------------------------------
	    // Object overrides
	    // -----------------------------------------------------------------------

	    /**
	     * Compact diagnostic representation. Scores are formatted with
	     * {@link Locale#ROOT} so the decimal separator is always '.', which keeps
	     * the comma-separated score pair unambiguous in comma-decimal locales.
	     */
	    @Override
	    public String toString() {
	        return String.format(Locale.ROOT,
	                "BiasEntry{word='%s', signal=%b, type='%s', value='%s', "
	                        + "pos+neg=[%.2f,%.2f], derog=%b, coll=%b, forms=%d}",
	                word, signal, biasType, biasValue,
	                positivity, negativity, derogatory, colloquial, forms.size());
	    }
	}