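// Hosting-page header captured with this file (not part of the Java source):
//   Spaces: DCL-IBL / IfGPT-DataQualityComponents (status: Running)
//   File size: 4,725 Bytes
//   18573e4 (presumably the revision id; confirm)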

package bg.bas.dcl.LLMs;

import java.util.List;

/**
 * Command-line demonstration of the bias-detection components.
 *
 * <p>The demo runs these steps in order:
 * <ol>
 *   <li>creates a {@code BulgarianSentenceSplitter};</li>
 *   <li>loads a {@code BiasLexicon} from a TSV dictionary file;</li>
 *   <li>builds a {@code BiasAnalyser} from the two;</li>
 *   <li>analyses a short in-memory Bulgarian text and prints a report for
 *       every sentence;</li>
 *   <li>splits a second text into sentences with the splitter alone.</li>
 * </ol>
 *
 * <p>Usage: {@code java bg.bas.dcl.LLMs.BiasDetectorDemo [dictionaryTsvPath]}.
 * When no argument is supplied, the original built-in dictionary location
 * is used.
 *
 * <p>MAVEN DEPENDENCIES (add to pom.xml):
 * <pre>{@code
 *   <!-- OpenNLP toolkit -->
 *   <dependency>
 *     <groupId>org.apache.opennlp</groupId>
 *     <artifactId>opennlp-tools</artifactId>
 *     <version>2.4.0</version>
 *   </dependency>
 *
 *   <!-- Bulgarian sentence-detection model (UD 2.14, Apache 2.0) -->
 *   <dependency>
 *     <groupId>org.apache.opennlp</groupId>
 *     <artifactId>opennlp-models-sentdetect-bg</artifactId>
 *     <version>1.2</version>
 *   </dependency>
 * }</pre>
 */
public class BiasDetectorDemo {

    /** Dictionary location used when no path is passed on the command line. */
    private static final String DEFAULT_DICT_PATH =
            "/home/ivelina/WORK-DCL/WIKIPEDIA-BIAS/"
          + "bulgarian_bias_dictionary_v4.tsv";

    /**
     * Runs the demo.
     *
     * @param args optional; {@code args[0]}, when present and non-empty, is
     *             the path of the bias dictionary TSV file. With no arguments
     *             the behaviour is the same as before this parameter existed.
     */
    public static void main(String[] args) {

        // ------------------------------------------------------------------
        // 1. Load the Bulgarian sentence splitter.
        //    Per the original author's note, the no-arg constructor loads the
        //    model bundled in the Maven JAR. An explicit model file can be
        //    supplied instead:
        //      new BulgarianSentenceSplitter("/path/to/bg-sent.bin");
        // ------------------------------------------------------------------
        BulgarianSentenceSplitter splitter = new BulgarianSentenceSplitter();

        // ------------------------------------------------------------------
        // 2. Load the bias lexicon (path overridable from the command line so
        //    the demo is not tied to one developer's home directory).
        // ------------------------------------------------------------------
        String dictPath = resolveDictionaryPath(args);
        BiasLexicon lexicon = new BiasLexicon(dictPath);
        System.out.printf("Lexicon loaded: %d entries%n%n", lexicon.size());

        // ------------------------------------------------------------------
        // 3. Build the analyser
        // ------------------------------------------------------------------
        BiasAnalyser analyser = new BiasAnalyser(lexicon, splitter);

        // ------------------------------------------------------------------
        // 4a. Analyse a block of text in memory
        // ------------------------------------------------------------------
        String sampleText =
            "Слепите хора трудно могат да се справят сами в живота. " +
            "Времето днес е слънчево и приятно.";
        printSentenceScores(analyser, sampleText);

        // ------------------------------------------------------------------
        // 4b. Analyse a corpus directory (disabled in this demo).
        //     Per the original note, analyseDirectory writes a TSV results
        //     file and keeps only biased sentences (zero-coverage sentences
        //     are filtered out). Uncomment the three lines below to run it:
        //
        // String corpusDir = "/home/ivelina/WORK-DCL/WIKIPEDIA-BIAS/WIKI/";
        // String resultTsv = "/home/ivelina/WORK-DCL/WIKIPEDIA-BIAS/bias_results.tsv";
        // analyser.analyseDirectory(corpusDir, resultTsv);
        // ------------------------------------------------------------------

        // ------------------------------------------------------------------
        // 4c. Sentence splitting only, using the splitter standalone
        // ------------------------------------------------------------------
        String text = "Това е първото изречение. Второто е по-дълго и сложно! " +
                      "А третото задава въпрос?";
        printSplitSentences(splitter, text);
    }

    /** Returns {@code args[0]} when it is present and non-empty, else the default path. */
    private static String resolveDictionaryPath(String[] args) {
        boolean hasOverride = args != null
                && args.length > 0
                && args[0] != null
                && !args[0].isEmpty();
        return hasOverride ? args[0] : DEFAULT_DICT_PATH;
    }

    /** Analyses {@code text} and prints a multi-line report for each returned sentence score. */
    private static void printSentenceScores(BiasAnalyser analyser, String text) {
        System.out.println("=== Sentence-level bias scores ===");
        // NOTE(review): a TSV header is printed here although the per-sentence
        // report below is free-form text, not TSV rows. Confirm this is intended.
        System.out.println(SentenceBiasScore.tsvHeader());
        System.out.println();

        List<SentenceBiasScore> scores = analyser.analyseText(text);

        for (SentenceBiasScore score : scores) {
            System.out.println("Sentence : " + score.getSentence());
            System.out.printf ("Words    : %d%n", score.getTotalWords());
            System.out.printf ("Biased   : %b%n", score.isBiased());

            // cov is read in parallel with BIAS_TYPES: cov[i] is reported
            // under the label types[i]. Only non-zero entries are shown.
            // Values are scaled by 100 for display (presumably fractions in
            // [0, 1]; confirm against SentenceBiasScore).
            double[] cov = score.coverageArray();
            String[] types = SentenceBiasScore.BIAS_TYPES;
            for (int i = 0; i < types.length; i++) {
                if (cov[i] > 0) {
                    System.out.printf("  %-18s %.2f%% pair coverage%n",
                            types[i] + ":", cov[i] * 100);
                }
            }
            System.out.printf ("Total    : %.2f%% overall coverage%n", score.totalCoverage() * 100);
            System.out.println("Lemmas   : " + score.getMatchedLemmas());
            System.out.println();
        }
    }

    /** Splits {@code text} with {@code splitter} and prints the sentences, numbered from 1. */
    private static void printSplitSentences(BulgarianSentenceSplitter splitter, String text) {
        String[] sentences = splitter.split(text);
        System.out.println("=== Sentence splitting demo ===");
        for (int i = 0; i < sentences.length; i++) {
            System.out.printf("  [%d] %s%n", i + 1, sentences[i]);
        }
    }
}