Spaces:

DCL-IBL
/

IfGPT-DataQualityComponents

Running

App Files Files Community

IfGPT-DataQualityComponents / java /bg /bas /dcl /LLMs /BiasLexicon.java

dcl-ibl-bas

Upload 22 files

18573e4 verified 5 days ago

raw

history blame contribute delete

9.63 kB

	package bg.bas.dcl.LLMs;

	import java.io.BufferedReader;
	import java.io.FileInputStream;
	import java.io.InputStreamReader;
	import java.nio.charset.StandardCharsets;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Collection;
	import java.util.Collections;
	import java.util.HashMap;
	import java.util.HashSet;
	import java.util.List;
	import java.util.Locale;
	import java.util.Map;
	import java.util.Set;

	/**
	* BiasLexicon
	*
	* Loads the Bulgarian bias dictionary (bulgarian_bias_dictionary_v4.tsv) and
	* provides fast O(1) form-level lookup for use by the bias detector.
	*
	* -----------------------------------------------------------------------
	* TSV FORMAT (tab-separated, first row is header):
	*
	* Col 0 word canonical lemma
	* Col 1 POS N \| A \| V \| …
	* Col 2 signal true \| false
	* Col 3 biasType gender \| race_ethnicity \| religion \| disability \| appearance \| ""
	* Col 4 biasValue positive \| negative \| neutral \| ""
	* Col 5 derogatory true \| false
	* Col 6 colloquial true \| false
	* Col 7 forms (boolean flag — ignored; inflected forms in col 10)
	* Col 8 positivity double [0,1]
	* Col 9 negativity double [0,1]
	* Col 10 inflectedForms pipe-separated surface forms, or empty
	*
	*
	*/
	public class BiasLexicon {

	// -----------------------------------------------------------------------
	// Indexes
	// -----------------------------------------------------------------------

	/**
	* Primary form index: lowercased surface form → BiasEntry.
	* A single form can only map to one entry (first one wins if there are
	* duplicates — extremely rare in the dictionary).
	*/
	private final Map<String, BiasEntry> formIndex = new HashMap<>();

	/**
	* Canonical word index: lowercased lemma → BiasEntry.
	* Useful when you already have the base form.
	*/
	private final Map<String, BiasEntry> wordIndex = new HashMap<>();

	/** All entries in load order. */
	private final List<BiasEntry> entries = new ArrayList<>();

	// -----------------------------------------------------------------------
	// Loading statistics
	// -----------------------------------------------------------------------

	private int loadedEntries = 0;
	private int skippedLines = 0;
	private int formConflicts = 0;

	// -----------------------------------------------------------------------
	// Constructor
	// -----------------------------------------------------------------------

	/**
	* Loads the bias dictionary from a TSV file.
	*
	* @param tsvPath absolute path to the TSV file
	* @throws RuntimeException if the file cannot be read
	*/
	public BiasLexicon(String tsvPath) {
	load(tsvPath);
	System.out.printf("[BiasLexicon] Loaded %d entries, %d form keys, "
	+ "%d skipped lines, %d form conflicts.%n",
	loadedEntries, formIndex.size(), skippedLines, formConflicts);
	}

	// -----------------------------------------------------------------------
	// Lookup API
	// -----------------------------------------------------------------------

	/**
	* Looks up a surface token (case-insensitive) and returns the
	* matching {@link BiasEntry}, or {@code null} if not found.
	*
	* @param token any surface form (inflected or base)
	*/
	public BiasEntry lookup(String token) {
	if (token == null \|\| token.isBlank()) return null;
	return formIndex.get(token.toLowerCase().trim());
	}

	/**
	* Returns true if the token (any form) is present in the lexicon.
	*
	* @param token surface form to check
	*/
	public boolean contains(String token) {
	return lookup(token) != null;
	}

	/**
	* Looks up a canonical lemma directly.
	*
	* @param lemma the base/dictionary form
	*/
	public BiasEntry lookupLemma(String lemma) {
	if (lemma == null \|\| lemma.isBlank()) return null;
	return wordIndex.get(lemma.toLowerCase().trim());
	}

	// -----------------------------------------------------------------------
	// Filtered views
	// -----------------------------------------------------------------------

	/**
	* Returns all entries whose {@code biasType} matches the given category
	* (case-insensitive), plus all general entries (empty biasType).
	*
	* @param biasType e.g. "gender", "disability"
	*/
	public List<BiasEntry> getByType(String biasType) {
	List<BiasEntry> result = new ArrayList<>();
	String target = biasType == null ? "" : biasType.toLowerCase().trim();
	for (BiasEntry e : entries)
	if (e.getBiasType().equalsIgnoreCase(target) \|\| e.getBiasType().isEmpty())
	result.add(e);
	return result;
	}

	/**
	* Returns all entries that are marked as signals (signal=true) for
	* the given bias category, or all signal entries if biasType is null/empty.
	*/
	public List<BiasEntry> getSignals(String biasType) {
	List<BiasEntry> result = new ArrayList<>();
	for (BiasEntry e : entries) {
	if (!e.isSignal()) continue;
	if (biasType == null \|\| biasType.isBlank()
	\|\| e.getBiasType().isEmpty()
	\|\| e.getBiasType().equalsIgnoreCase(biasType))
	result.add(e);
	}
	return result;
	}

	/** Returns an unmodifiable view of all loaded entries. */
	public Collection<BiasEntry> getAll() {
	return Collections.unmodifiableList(entries);
	}

	/** Number of loaded dictionary entries. */
	public int size() { return entries.size(); }

	// -----------------------------------------------------------------------
	// Internal loading
	// -----------------------------------------------------------------------

	private void load(String tsvPath) {
	try (BufferedReader br = new BufferedReader(
	new InputStreamReader(new FileInputStream(tsvPath),
	StandardCharsets.UTF_8))) {

	String headerLine = br.readLine(); // skip header
	if (headerLine == null) {
	System.err.println("[BiasLexicon] Empty file: " + tsvPath);
	return;
	}

	String line;
	int lineNum = 1; // already read header as line 1

	while ((line = br.readLine()) != null) {
	lineNum++;
	if (line.isBlank()) { skippedLines++; continue; }

	String[] cols = line.split("\t", -1);

	// Minimum viable: need at least 10 columns
	if (cols.length < 10) {
	System.err.printf("[BiasLexicon] Line %d: only %d columns, skipping.%n",
	lineNum, cols.length);
	skippedLines++;
	continue;
	}

	try {
	String word = cols[0].trim();
	String pos = cols[1].trim();
	boolean signal = "true".equalsIgnoreCase(cols[2].trim());
	String biasType = cols[3].trim();
	String biasValue = cols[4].trim();
	boolean derog = "true".equalsIgnoreCase(cols[5].trim());
	boolean coll = "true".equalsIgnoreCase(cols[6].trim());
	// cols[7] is a boolean forms-flag (ignored)
	double positivity = parseDouble(cols[8], lineNum);
	double negativity = parseDouble(cols[9], lineNum);

	// Inflected forms: pipe-separated in col 10 (if present)
	Set<String> formsSet = new HashSet<>();
	formsSet.add(word.toLowerCase()); // always include the lemma

	if (cols.length > 10 && !cols[10].isBlank()) {
	for (String f : cols[10].split("\\\|")) {
	String fc = f.trim().toLowerCase();
	if (!fc.isEmpty()) formsSet.add(fc);
	}
	}

	BiasEntry entry = new BiasEntry(word, pos, signal,
	biasType, biasValue, derog, coll,
	positivity, negativity, formsSet);

	entries.add(entry);
	wordIndex.put(word.toLowerCase(), entry);

	for (String form : formsSet) {
	if (formIndex.containsKey(form)) {
	formConflicts++;
	// Keep first entry — do not overwrite
	} else {
	formIndex.put(form, entry);
	}
	}

	loadedEntries++;

	} catch (Exception e) {
	System.err.printf("[BiasLexicon] Line %d: parse error — %s%n",
	lineNum, e.getMessage());
	skippedLines++;
	}
	}

	} catch (Exception e) {
	throw new RuntimeException("Failed to load bias lexicon from: " + tsvPath, e);
	}
	}

	private double parseDouble(String s, int lineNum) {
	try {
	return Double.parseDouble(s.trim());
	} catch (NumberFormatException e) {
	System.err.printf("[BiasLexicon] Line %d: cannot parse double '%s', using 0.0%n",
	lineNum, s);
	return 0.0;
	}
	}
	}