Spaces:

DCL-IBL
/

IfGPT-DataQualityComponents

Running

File size: 5,973 Bytes

18573e4

package bg.bas.dcl.LLMs;

import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Locale;
import java.util.Set;

/**
 * BiasEntry — one entry of the bias lexicon, corresponding to a single TSV row.
 * 
 * TSV column order (0-based, tab-separated):
 *   0  word          — canonical lemma
 *   1  POS           — part of speech  (N, A, V, …)
 *   2  signal        — "true" / "false" : marks identity-group signals
 *   3  biasType      — gender | race_ethnicity | religion | disability | appearance | "" (general)
 *   4  biasValue     — positive | negative | neutral | ""
 *   5  derogatory    — "true" / "false"
 *   6  colloquial    — "true" / "false"
 *   7  forms         — "true" / "false" (unused flag; inflected forms are in col 10)
 *   8  positivity    — double in [0,1]
 *   9  negativity    — double in [0,1]
 *  10  inflectedForms — pipe-separated list of surface forms, or empty
 */
public class BiasEntry {

    // -----------------------------------------------------------------------
    // Constants
    // -----------------------------------------------------------------------

    /**
     * A positivity or negativity score strictly above this value is enough,
     * on its own, for the entry to count as an evaluative modifier.
     */
    private static final double STRONG_SENTIMENT_THRESHOLD = 0.5;

    // -----------------------------------------------------------------------
    // Fields
    // -----------------------------------------------------------------------

    private final String      word;
    private final String      pos;
    private final boolean     signal;
    private final String      biasType;      // "" means general / not type-specific
    private final String      biasValue;     // "" means unscored
    private final boolean     derogatory;
    private final boolean     colloquial;
    private final double      positivity;
    private final double      negativity;

    /**
     * All known surface forms (lemma + inflected). The caller is expected to
     * supply them already lowercased; this class stores them as given.
     */
    private final Set<String> forms;

    // -----------------------------------------------------------------------
    // Constructor — called by BiasLexicon during TSV loading
    // -----------------------------------------------------------------------

    /**
     * Creates an entry from already-parsed TSV column values.
     *
     * <p>String arguments are trimmed; a {@code null} string becomes {@code ""}.
     * The numeric scores are stored as given (no range check is applied).
     *
     * @param word        canonical lemma; {@code null} is treated as {@code ""}
     * @param pos         part-of-speech tag; {@code null} is treated as {@code ""}
     * @param signal      whether the word is an identity-group signal
     * @param biasType    bias category, or {@code ""}/{@code null} for general
     * @param biasValue   polarity label, or {@code ""}/{@code null} for unscored
     * @param derogatory  whether the word is marked derogatory
     * @param colloquial  whether the word is marked colloquial
     * @param positivity  positivity score
     * @param negativity  negativity score
     * @param forms       surface forms; {@code null} is treated as an empty set.
     *                    The set is copied, so later changes to the argument
     *                    do not affect this entry.
     */
    public BiasEntry(String word, String pos,
                     boolean signal, String biasType, String biasValue,
                     boolean derogatory, boolean colloquial,
                     double positivity, double negativity,
                     Set<String> forms) {
        this.word        = word == null   ? "" : word.trim();
        this.pos         = pos  == null   ? "" : pos.trim();
        this.signal      = signal;
        this.biasType    = biasType   == null ? "" : biasType.trim();
        this.biasValue   = biasValue  == null ? "" : biasValue.trim();
        this.derogatory  = derogatory;
        this.colloquial  = colloquial;
        this.positivity  = positivity;
        this.negativity  = negativity;
        // Collections.unmodifiableSet is only a read-only VIEW of its argument,
        // so wrapping the caller's set directly would let the caller keep
        // mutating this entry. Snapshot first; LinkedHashSet keeps whatever
        // iteration order the source set had.
        this.forms       = forms == null
                ? Collections.<String>emptySet()
                : Collections.unmodifiableSet(new LinkedHashSet<>(forms));
    }

    // -----------------------------------------------------------------------
    // Accessors
    // -----------------------------------------------------------------------

    /** Canonical lemma as it appears in the dictionary (trimmed, never null). */
    public String getWord()        { return word; }

    /** Part-of-speech tag (N, A, V, …), trimmed, never null. */
    public String getPos()         { return pos; }

    /**
     * True if this entry marks an identity-group signal word —
     * i.e. a term that identifies a person by a protected attribute
     * (e.g. "жена", "мюсюлманин").
     */
    public boolean isSignal()      { return signal; }

    /**
     * Bias category, or empty string if applicable to all categories.
     * Values: "gender", "race_ethnicity", "religion", "disability", "appearance".
     */
    public String getBiasType()    { return biasType; }

    /**
     * Evaluative polarity of the word in a bias context.
     * Values: "positive", "negative", "neutral", or "" (unscored).
     */
    public String getBiasValue()   { return biasValue; }

    /** True if the word is explicitly marked as derogatory / pejorative. */
    public boolean isDerogatory()  { return derogatory; }

    /** True if the word is marked as colloquial / informal. */
    public boolean isColloquial()  { return colloquial; }

    /**
     * Positivity score in [0, 1] derived from BulNet synset sentiment.
     * Higher = more positive connotation.
     */
    public double getPositivity()  { return positivity; }

    /**
     * Negativity score in [0, 1] derived from BulNet synset sentiment.
     * Higher = more negative connotation.
     */
    public double getNegativity()  { return negativity; }

    /**
     * Unmodifiable snapshot of all surface forms (lemma + inflected variants)
     * as supplied at construction time. Never null; may be empty.
     */
    public Set<String> getForms()  { return forms; }

    // -----------------------------------------------------------------------
    // Convenience predicates
    // -----------------------------------------------------------------------

    /**
     * True if this entry carries an evaluative polarity, i.e. its biasValue
     * is non-empty and is not {@code "neutral"}.
     */
    public boolean isEvaluative() {
        return !biasValue.isEmpty() && !biasValue.equals("neutral");
    }

    /** True if biasType is non-empty (i.e. assigned to a specific category). */
    public boolean isTyped() {
        return !biasType.isEmpty();
    }

    /**
     * True if this entry can act as an evaluative modifier in a bias pair —
     * i.e. at least one of the following holds: it has a non-neutral polarity
     * (see {@link #isEvaluative()}), it is derogatory, it is colloquial, or
     * its positivity or negativity score is strictly greater than 0.5.
     */
    public boolean isEvaluativeModifier() {
        return isEvaluative() || derogatory || colloquial
                || positivity > STRONG_SENTIMENT_THRESHOLD
                || negativity > STRONG_SENTIMENT_THRESHOLD;
    }

    // -----------------------------------------------------------------------
    // Object overrides
    // -----------------------------------------------------------------------

    /**
     * Compact single-line summary for logs and debugging. The scores are
     * formatted with {@link Locale#ROOT} so the decimal separator is always
     * '.', independent of the JVM default locale (a comma separator would be
     * indistinguishable from the comma between the two scores).
     */
    @Override
    public String toString() {
        return String.format(Locale.ROOT,
                "BiasEntry{word='%s', signal=%b, type='%s', value='%s', "
                + "pos+neg=[%.2f,%.2f], derog=%b, coll=%b, forms=%d}",
                word, signal, biasType, biasValue,
                positivity, negativity, derogatory, colloquial, forms.size());
    }
}