File size: 5,973 Bytes
18573e4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 | package bg.bas.dcl.LLMs;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Set;
/**
* BiasEntry
*
* TSV column order (0-based, tab-separated):
* 0 word — canonical lemma
* 1 POS — part of speech (N, A, V, …)
* 2 signal — "true" / "false" : marks identity-group signals
* 3 biasType — gender | race_ethnicity | religion | disability | appearance | "" (general)
* 4 biasValue — positive | negative | neutral | ""
* 5 derogatory — "true" / "false"
* 6 colloquial — "true" / "false"
* 7 forms — "true" / "false" (unused flag; inflected forms are in col 10)
* 8 positivity — double in [0,1]
* 9 negativity — double in [0,1]
* 10 inflectedForms — pipe-separated list of surface forms, or empty
*/
public class BiasEntry {

    // -----------------------------------------------------------------------
    // Constants
    // -----------------------------------------------------------------------

    /** The biasValue that denotes an explicitly neutral polarity. */
    private static final String NEUTRAL = "neutral";

    /** A positivity/negativity score strictly above this marks an evaluative modifier. */
    private static final double STRONG_SENTIMENT_THRESHOLD = 0.5;

    // -----------------------------------------------------------------------
    // Fields
    // -----------------------------------------------------------------------

    private final String word;
    private final String pos;
    private final boolean signal;
    private final String biasType;   // "" means general / not type-specific
    private final String biasValue;  // "" means unscored
    private final boolean derogatory;
    private final boolean colloquial;
    private final double positivity;
    private final double negativity;

    /**
     * All known surface forms (lemma + inflected), intended to be lowercased
     * for fast lookup. The constructor does not normalise case itself; the
     * caller is expected to supply lowercase forms.
     */
    private final Set<String> forms;

    // -----------------------------------------------------------------------
    // Constructor — called by BiasLexicon during TSV loading
    // -----------------------------------------------------------------------

    /**
     * Creates an entry from already-parsed column values.
     *
     * <p>String arguments are trimmed; a {@code null} string is stored as
     * {@code ""}. The {@code forms} set is copied, so later changes to the
     * caller's set do not affect this entry; a {@code null} set is stored as
     * an empty set.
     *
     * @param word       canonical lemma, may be {@code null}
     * @param pos        part-of-speech tag, may be {@code null}
     * @param signal     whether the word is an identity-group signal
     * @param biasType   bias category, or {@code null}/empty for general
     * @param biasValue  polarity label, or {@code null}/empty for unscored
     * @param derogatory whether the word is marked derogatory
     * @param colloquial whether the word is marked colloquial
     * @param positivity positivity score
     * @param negativity negativity score
     * @param forms      surface forms of the word, may be {@code null}
     */
    public BiasEntry(String word, String pos,
                     boolean signal, String biasType, String biasValue,
                     boolean derogatory, boolean colloquial,
                     double positivity, double negativity,
                     Set<String> forms) {
        this.word = word == null ? "" : word.trim();
        this.pos = pos == null ? "" : pos.trim();
        this.signal = signal;
        this.biasType = biasType == null ? "" : biasType.trim();
        this.biasValue = biasValue == null ? "" : biasValue.trim();
        this.derogatory = derogatory;
        this.colloquial = colloquial;
        this.positivity = positivity;
        this.negativity = negativity;
        // Defensive copy: wrapping the caller's set directly would let the
        // caller keep mutating it behind the unmodifiable view.
        // LinkedHashSet keeps the iteration order of the supplied set.
        this.forms = forms == null
                ? Collections.<String>emptySet()
                : Collections.unmodifiableSet(new LinkedHashSet<>(forms));
    }

    // -----------------------------------------------------------------------
    // Accessors
    // -----------------------------------------------------------------------

    /** Canonical lemma as it appears in the dictionary. */
    public String getWord() { return word; }

    /** Part-of-speech tag (N, A, V, …). */
    public String getPos() { return pos; }

    /**
     * True if this entry marks an identity-group signal word —
     * i.e. a term that identifies a person by a protected attribute
     * (e.g. "жена", "мюсюлманин").
     */
    public boolean isSignal() { return signal; }

    /**
     * Bias category, or empty string if applicable to all categories.
     * Values: "gender", "race_ethnicity", "religion", "disability", "appearance".
     */
    public String getBiasType() { return biasType; }

    /**
     * Evaluative polarity of the word in a bias context.
     * Values: "positive", "negative", "neutral", or "" (unscored).
     */
    public String getBiasValue() { return biasValue; }

    /** True if the word is explicitly marked as derogatory / pejorative. */
    public boolean isDerogatory() { return derogatory; }

    /** True if the word is marked as colloquial / informal. */
    public boolean isColloquial() { return colloquial; }

    /**
     * Positivity score in [0, 1] derived from BulNet synset sentiment.
     * Higher = more positive connotation.
     */
    public double getPositivity() { return positivity; }

    /**
     * Negativity score in [0, 1] derived from BulNet synset sentiment.
     * Higher = more negative connotation.
     */
    public double getNegativity() { return negativity; }

    /**
     * Unmodifiable set of all surface forms (lemma + inflected variants).
     * Never {@code null}; attempts to modify it throw
     * {@link UnsupportedOperationException}.
     */
    public Set<String> getForms() { return forms; }

    // -----------------------------------------------------------------------
    // Convenience predicates
    // -----------------------------------------------------------------------

    /**
     * True if this entry carries a non-neutral polarity, i.e. its biasValue
     * is neither empty (unscored) nor "neutral".
     */
    public boolean isEvaluative() {
        return !biasValue.isEmpty() && !NEUTRAL.equals(biasValue);
    }

    /** True if biasType is non-empty (i.e. assigned to a specific category). */
    public boolean isTyped() {
        return !biasType.isEmpty();
    }

    /**
     * True if this entry can act as an evaluative modifier in a bias pair —
     * i.e. it has a non-neutral polarity, it is derogatory or colloquial,
     * or its positivity or negativity score is strictly greater than 0.5.
     */
    public boolean isEvaluativeModifier() {
        return isEvaluative() || derogatory || colloquial
                || positivity > STRONG_SENTIMENT_THRESHOLD
                || negativity > STRONG_SENTIMENT_THRESHOLD;
    }

    // -----------------------------------------------------------------------
    // Object overrides
    // -----------------------------------------------------------------------

    /**
     * Compact debug representation; prints the number of surface forms
     * rather than the forms themselves.
     */
    @Override
    public String toString() {
        return String.format("BiasEntry{word='%s', signal=%b, type='%s', value='%s', "
                        + "pos+neg=[%.2f,%.2f], derog=%b, coll=%b, forms=%d}",
                word, signal, biasType, biasValue,
                positivity, negativity, derogatory, colloquial, forms.size());
    }
}
|