Spaces:

DCL-IBL
/

IfGPT-DataQualityComponents

Running

App Files Files Community

dcl-ibl-bas committed on 3 days ago

Commit

18573e4

verified ·

1 Parent(s): 4ae0bcb

Upload 22 files

Browse files

Files changed (22) hide show

java/bg/bas/dcl/LLMs/BiasAnalyser.java +344 -0
java/bg/bas/dcl/LLMs/BiasDetectorDemo.java +111 -0
java/bg/bas/dcl/LLMs/BiasEntry.java +151 -0
java/bg/bas/dcl/LLMs/BiasLexicon.java +258 -0
java/bg/bas/dcl/LLMs/BulgarianSentenceSplitter.java +163 -0
java/bg/bas/dcl/LLMs/DeduplicationProcessor.java +571 -0
java/bg/bas/dcl/LLMs/FileCleanProcessor.java +453 -0
java/bg/bas/dcl/LLMs/IfGPTDataset/.BulNCProcessor.java.kate-swp +0 -0
java/bg/bas/dcl/LLMs/IfGPTDataset/.CurlicatProcessor.java.kate-swp +0 -0
java/bg/bas/dcl/LLMs/IfGPTDataset/BaseSourceProcessor.java +180 -0
java/bg/bas/dcl/LLMs/IfGPTDataset/BulNCProcessor.java +188 -0
java/bg/bas/dcl/LLMs/IfGPTDataset/BulNCWikiProcessor.java +154 -0
java/bg/bas/dcl/LLMs/IfGPTDataset/CurlicatProcessor.java +160 -0
java/bg/bas/dcl/LLMs/IfGPTDataset/DocumentMetadata.java +376 -0
java/bg/bas/dcl/LLMs/IfGPTDataset/IfGPTDatasetProcessor.java +160 -0
java/bg/bas/dcl/LLMs/IfGPTDataset/IfGPTPipeline.java +490 -0
java/bg/bas/dcl/LLMs/IfGPTDataset/MarcellProcessor.java +130 -0
java/bg/bas/dcl/LLMs/IfGPTDataset/SourceProcessor.java +10 -0
java/bg/bas/dcl/LLMs/PIIDetector.java +447 -0
java/bg/bas/dcl/LLMs/SentenceBiasScore.java +150 -0
resources/bulgarian_bias_dictionary_v4.tsv +0 -0
resources/metadata_schema.json +267 -0

java/bg/bas/dcl/LLMs/BiasAnalyser.java ADDED Viewed

	@@ -0,0 +1,344 @@

+package bg.bas.dcl.LLMs;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Scanner;
+import java.util.Set;
+import java.util.regex.Pattern;
+import bg.bas.dcl.general.FileHandler;
+/**
+ * BiasAnalyser
+ *
+ * Detects linguistic bias in Bulgarian text using the Bulgarian Bias Dictionary
+ * (v4 TSV format).  Works at sentence level: for each sentence it returns a
+ * {@link SentenceBiasScore} whose primary metric is the pair-coverage percentage —
+ * the fraction of word tokens in the sentence that participate in at least one
+ * signal–evaluator pair for each bias category.
+ *
+ * -----------------------------------------------------------------------
+ * ALGORITHM (per sentence)
+ *
+ *   1. TOKENISE — split on whitespace, strip non-letter characters per token.
+ *   2. MATCH    — look each token up in the {@link BiasLexicon} (form index,
+ *                 case-insensitive).  Multi-word entries are tried first via a
+ *                 forward-scan for bigrams and trigrams.
+ *   3. PAIR     — for every signal token, search within ±PAIR_WINDOW tokens for
+ *                 an evaluator token of the same bias type (or a general one).
+ *                 Each unique (signal position, evaluator position) is a pair.
+ *   4. SCORE    — pairCoverage[type] = distinctPairTokens[type] / totalWords
+ *                 where distinctPairTokens = set of positions involved in
+ *                 at least one confirmed pair for that type.
+ *
+ */
+public class BiasAnalyser {
+    // -----------------------------------------------------------------------
+    // Constants
+    // -----------------------------------------------------------------------
+    /**
+     * Maximum token distance between a signal and an evaluator for them to
+     * be counted as a pair.  10 matches the window used in the original
+     * BiasDetector.
+     */
+    public static final int PAIR_WINDOW = 10;
+    /**
+     * Sentences with fewer word tokens than this are not analysed; they
+     * receive an all-zero score.
+     */
+    public static final int MIN_WORDS = 6;
+    /**
+     * Intended upper bound on sentence length (very long sentences may
+     * inflate scores).  NOTE(review): nothing in this class reads this
+     * constant — no warning is printed and long sentences are scored like
+     * any other sentence.
+     */
+    public static final int MAX_WORDS = 200;
+    /** Matches every character that is neither a Unicode letter nor whitespace. */
+    private static final Pattern NON_WORD_CHARS = Pattern.compile("[^\\p{L}\\s]");
+    // -----------------------------------------------------------------------
+    // Dependencies
+    // -----------------------------------------------------------------------
+    private final BiasLexicon              lexicon;
+    private final BulgarianSentenceSplitter splitter;
+    // -----------------------------------------------------------------------
+    // Constructor
+    // -----------------------------------------------------------------------
+    /**
+     * @param lexicon  the loaded bias dictionary
+     * @param splitter an initialised Bulgarian sentence splitter
+     * @throws IllegalArgumentException if either argument is {@code null}
+     */
+    public BiasAnalyser(BiasLexicon lexicon, BulgarianSentenceSplitter splitter) {
+        if (lexicon  == null) throw new IllegalArgumentException("lexicon must not be null");
+        if (splitter == null) throw new IllegalArgumentException("splitter must not be null");
+        this.lexicon  = lexicon;
+        this.splitter = splitter;
+    }
+    // -----------------------------------------------------------------------
+    // Public API
+    // -----------------------------------------------------------------------
+    /**
+     * Splits {@code text} into sentences and returns a bias score for each.
+     *
+     * @param text the text to analyse; may be {@code null} or blank
+     * @return one score per sentence produced by the splitter, in order;
+     *         an empty list when {@code text} is {@code null} or blank
+     */
+    public List<SentenceBiasScore> analyseText(String text) {
+        List<SentenceBiasScore> results = new ArrayList<>();
+        if (text == null || text.isBlank()) return results;
+        for (String sentence : splitter.split(text)) {
+            results.add(analyseSentence(sentence));
+        }
+        return results;
+    }
+    /**
+     * Analyses a single pre-split sentence.
+     *
+     * The sentence is lowercased, split on whitespace and stripped of
+     * non-letter characters.  Tokens are matched against the lexicon
+     * (trigrams first, then bigrams, then single tokens), signals are paired
+     * with evaluators within {@link #PAIR_WINDOW} tokens, and the per-type
+     * pair coverage is computed as paired positions / total word tokens.
+     *
+     * @param sentence the sentence to score
+     * @return the score; all counters are zero when the sentence has fewer
+     *         than {@link #MIN_WORDS} word tokens
+     * @throws IllegalArgumentException if {@code sentence} is {@code null}
+     */
+    public SentenceBiasScore analyseSentence(String sentence) {
+        if (sentence == null) throw new IllegalArgumentException("sentence must not be null");
+        // --- Tokenise --------------------------------------------------
+        String[] rawTokens = sentence.toLowerCase().split("\\s+");
+        // Clean every raw token exactly once.  Tokens made only of digits or
+        // punctuation become "" and are never counted as words.
+        String[] cleaned = new String[rawTokens.length];
+        for (int r = 0; r < rawTokens.length; r++) {
+            cleaned[r] = clean(rawTokens[r]);
+        }
+        List<String>    cleanTokens = new ArrayList<>();   // word-only tokens
+        List<BiasEntry> matched     = new ArrayList<>();   // parallel match (null=no match)
+        int i = 0;
+        while (i < cleaned.length) {
+            if (cleaned[i].isEmpty()) {
+                i++;
+                continue;
+            }
+            // Multi-word entries are tried first.  Every component must be
+            // non-empty: the lexicon trims its key, so an n-gram padded with
+            // an empty token would otherwise match a shorter entry and add a
+            // phantom word position.
+            BiasEntry multi = null;
+            int span = 0;
+            if (i + 2 < cleaned.length
+                    && !cleaned[i + 1].isEmpty() && !cleaned[i + 2].isEmpty()) {
+                multi = lexicon.lookup(cleaned[i] + " " + cleaned[i + 1] + " " + cleaned[i + 2]);
+                if (multi != null) span = 3;
+            }
+            if (multi == null && i + 1 < cleaned.length && !cleaned[i + 1].isEmpty()) {
+                multi = lexicon.lookup(cleaned[i] + " " + cleaned[i + 1]);
+                if (multi != null) span = 2;
+            }
+            if (multi != null) {
+                // One position per word, all pointing to the same entry
+                for (int k = 0; k < span; k++) {
+                    cleanTokens.add(cleaned[i + k]);
+                    matched.add(multi);
+                }
+                i += span;
+            } else {
+                cleanTokens.add(cleaned[i]);
+                matched.add(lexicon.lookup(cleaned[i]));
+                i++;
+            }
+        }
+        int totalWords = cleanTokens.size();
+        String[] biasTypes = SentenceBiasScore.BIAS_TYPES;
+        Map<String, Integer> signalCount    = new HashMap<>();
+        Map<String, Integer> evaluatorCount = new HashMap<>();
+        Map<String, Double>  pairCoverage   = new HashMap<>();
+        for (String type : biasTypes) {
+            signalCount.put(type, 0);
+            evaluatorCount.put(type, 0);
+            pairCoverage.put(type, 0.0);
+        }
+        List<String> matchedLemmas = new ArrayList<>();
+        if (totalWords < MIN_WORDS) {
+            // Return zero-score result for very short sentences
+            return new SentenceBiasScore(sentence, totalWords,
+                    pairCoverage, signalCount, evaluatorCount,
+                    matchedLemmas, 0, 0, 0, false);
+        }
+        // --- Collect matched positions ---------------------------------
+        int totalBiasWords  = 0;
+        int totalDerogatory = 0;
+        int totalColloquial = 0;
+        Set<String> seenLemmas = new HashSet<>();
+        // signalPos[type] = token indices that are signals for that type
+        Map<String, List<Integer>> signalPos = new HashMap<>();
+        // evalPos[type]   = token indices that are evaluators for that type
+        Map<String, List<Integer>> evalPos   = new HashMap<>();
+        for (String type : biasTypes) {
+            signalPos.put(type, new ArrayList<>());
+            evalPos.put(type,   new ArrayList<>());
+        }
+        for (int ti = 0; ti < totalWords; ti++) {
+            BiasEntry entry = matched.get(ti);
+            if (entry == null) continue;
+            // Each lemma is listed once even if it occurs several times
+            String lemma = entry.getWord();
+            if (seenLemmas.add(lemma)) {
+                matchedLemmas.add(lemma);
+            }
+            if (entry.isEvaluative()) totalBiasWords++;
+            if (entry.isDerogatory()) totalDerogatory++;
+            if (entry.isColloquial()) totalColloquial++;
+            // Typed entries apply to their own type; general entries to all
+            List<String> applicableTypes = entry.isTyped()
+                    ? List.of(entry.getBiasType())
+                    : Arrays.asList(biasTypes);
+            for (String type : applicableTypes) {
+                List<Integer> typeSignals = signalPos.get(type);
+                // A dictionary type that is not listed in BIAS_TYPES has no
+                // bucket; skip it instead of failing the whole sentence.
+                if (typeSignals == null) continue;
+                if (entry.isSignal()) {
+                    typeSignals.add(ti);
+                }
+                if (entry.isEvaluativeModifier()) {
+                    evalPos.get(type).add(ti);
+                }
+            }
+        }
+        // --- Pair detection & score computation -----------------------
+        Map<String, Set<Integer>> pairTokens = new HashMap<>();
+        for (String type : biasTypes) pairTokens.put(type, new HashSet<>());
+        for (String type : biasTypes) {
+            List<Integer> signals    = signalPos.get(type);
+            List<Integer> evaluators = evalPos.get(type);
+            Set<Integer>  paired     = pairTokens.get(type);
+            for (int sIdx : signals) {
+                // Self-pair: the signal is itself evaluative
+                BiasEntry sEntry = matched.get(sIdx);
+                if (sEntry != null && sEntry.isEvaluativeModifier()) {
+                    paired.add(sIdx);
+                }
+                // Pair with every distinct evaluator inside the window
+                for (int eIdx : evaluators) {
+                    if (eIdx != sIdx && Math.abs(sIdx - eIdx) <= PAIR_WINDOW) {
+                        paired.add(sIdx);
+                        paired.add(eIdx);
+                    }
+                }
+            }
+            // Only evaluators that ended up in a pair are counted
+            int evalCount = 0;
+            for (int eIdx : evaluators) {
+                if (paired.contains(eIdx)) evalCount++;
+            }
+            signalCount.put(type,    signals.size());
+            evaluatorCount.put(type, evalCount);
+            double coverage = totalWords > 0
+                    ? (double) paired.size() / totalWords
+                    : 0.0;
+            pairCoverage.put(type, coverage);
+        }
+        // --- Multi-type flag ------------------------------------------
+        int typesWithPairs = 0;
+        for (String type : biasTypes) {
+            if (!pairTokens.get(type).isEmpty()) typesWithPairs++;
+        }
+        boolean multiType = typesWithPairs >= 2;
+        return new SentenceBiasScore(
+                sentence, totalWords,
+                pairCoverage, signalCount, evaluatorCount,
+                matchedLemmas, totalBiasWords, totalDerogatory, totalColloquial,
+                multiType);
+    }
+    /**
+     * Analyses every {@code .txt} file that {@code FileHandler.getFileListing}
+     * returns for {@code corpusDir} and writes the results as TSV.
+     *
+     * The output file is overwritten.  It starts with
+     * {@link SentenceBiasScore#tsvHeader()} and contains one row per sentence
+     * for which {@code isBiased()} is true, prefixed with the file name.
+     * Lines of each input file are joined with single spaces before sentence
+     * splitting.  Failures are reported on standard error and are not
+     * propagated to the caller.
+     *
+     * @param corpusDir  directory containing the UTF-8 text files
+     * @param resultPath path of the TSV file to write
+     */
+    public void analyseDirectory(String corpusDir, String resultPath) {
+        try {
+            FileHandler fh = new FileHandler();
+            try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
+                    new FileOutputStream(resultPath, false), StandardCharsets.UTF_8))) {
+                bw.write(SentenceBiasScore.tsvHeader());
+                bw.newLine();
+                int filesProcessed = 0;
+                int sentencesWritten = 0;
+                for (File f : fh.getFileListing(new File(corpusDir))) {
+                    if (!f.isFile() || !f.getName().endsWith(".txt")) continue;
+                    System.out.println("[BiasAnalyser] Processing: " + f.getName());
+                    StringBuilder text = new StringBuilder();
+                    try (Scanner sc = new Scanner(f, StandardCharsets.UTF_8)) {
+                        while (sc.hasNextLine()) {
+                            text.append(sc.nextLine()).append(' ');
+                        }
+                    }
+                    for (SentenceBiasScore score : analyseText(text.toString())) {
+                        if (score.isBiased()) {
+                            bw.write(f.getName() + "\t" + score.toTsv());
+                            bw.newLine();
+                            sentencesWritten++;
+                        }
+                    }
+                    filesProcessed++;
+                }
+                System.out.printf("[BiasAnalyser] Done. Files: %d  Biased sentences written: %d%n",
+                        filesProcessed, sentencesWritten);
+            }
+        } catch (Exception e) {
+            // Best-effort batch run: report which run failed, then the trace
+            System.err.println("[BiasAnalyser] Failed while analysing " + corpusDir
+                    + " -> " + resultPath + ": " + e);
+            e.printStackTrace();
+        }
+    }
+    // -----------------------------------------------------------------------
+    // Helper
+    // -----------------------------------------------------------------------
+    /**
+     * Removes every character that is neither a letter nor whitespace and
+     * trims the result.
+     */
+    private String clean(String token) {
+        return NON_WORD_CHARS.matcher(token).replaceAll("").trim();
+    }
+}

java/bg/bas/dcl/LLMs/BiasDetectorDemo.java ADDED Viewed

	@@ -0,0 +1,111 @@

+package bg.bas.dcl.LLMs;
+import java.util.List;
+/**
+ * BiasDetectorDemo
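+ *
+ * Demonstrates how to wire a {@link BulgarianSentenceSplitter} and a {@link BiasLexicon}
+ * into a {@link BiasAnalyser}, score a sample text sentence by sentence, and use the
+ * sentence splitter on its own.
+ *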
+ * -----------------------------------------------------------------------
+ * MAVEN DEPENDENCIES (add to pom.xml):
+ *
+ *   <!-- OpenNLP toolkit -->
+ *   <dependency>
+ *     <groupId>org.apache.opennlp</groupId>
+ *     <artifactId>opennlp-tools</artifactId>
+ *     <version>2.4.0</version>
+ *   </dependency>
+ *
+ *   <!-- Bulgarian sentence-detection model (UD 2.14, Apache 2.0) -->
+ *   <dependency>
+ *     <groupId>org.apache.opennlp</groupId>
+ *     <artifactId>opennlp-models-sentdetect-bg</artifactId>
+ *     <version>1.2</version>
+ *   </dependency>
+ */
+public class BiasDetectorDemo {
+    /** Dictionary location used when no path is given on the command line. */
+    private static final String DEFAULT_DICT_PATH =
+            "/home/ivelina/WORK-DCL/WIKIPEDIA-BIAS/"
+            + "bulgarian_bias_dictionary_v4.tsv";
+    /**
+     * Runs the demo.
+     *
+     * @param args optional arguments:
+     *             {@code args[0]} — path to the bias dictionary TSV
+     *             (defaults to the original hard-coded location);
+     *             {@code args[1]} and {@code args[2]} — corpus directory and
+     *             result TSV path; the corpus run is executed only when both
+     *             are supplied
+     */
+    public static void main(String[] args) {
+        // ------------------------------------------------------------------
+        // 1. Load the Bulgarian sentence splitter
+        //    (loads bundled model from the Maven JAR automatically)
+        // ------------------------------------------------------------------
+        BulgarianSentenceSplitter splitter = new BulgarianSentenceSplitter();
+        // Alternatively, supply an explicit model file path:
+        // BulgarianSentenceSplitter splitter =
+        //     new BulgarianSentenceSplitter("/path/to/bg-sent.bin");
+        // ------------------------------------------------------------------
+        // 2. Load the bias lexicon
+        // ------------------------------------------------------------------
+        String dictPath = args.length > 0 ? args[0] : DEFAULT_DICT_PATH;
+        BiasLexicon lexicon = new BiasLexicon(dictPath);
+        System.out.printf("Lexicon loaded: %d entries%n%n", lexicon.size());
+        // ------------------------------------------------------------------
+        // 3. Build the analyser
+        // ------------------------------------------------------------------
+        BiasAnalyser analyser = new BiasAnalyser(lexicon, splitter);
+        // ------------------------------------------------------------------
+        // 4a. Analyse a block of text in memory
+        // ------------------------------------------------------------------
+        String sampleText =
+            "Слепите хора трудно могат да се справят сами в живота. " +
+            "Времето днес е слънчево и приятно.";
+        System.out.println("=== Sentence-level bias scores ===");
+        System.out.println(SentenceBiasScore.tsvHeader());
+        System.out.println();
+        String[] types = SentenceBiasScore.BIAS_TYPES;
+        List<SentenceBiasScore> scores = analyser.analyseText(sampleText);
+        for (SentenceBiasScore score : scores) {
+            System.out.println("Sentence : " + score.getSentence());
+            System.out.printf ("Words    : %d%n", score.getTotalWords());
+            System.out.printf ("Biased   : %b%n", score.isBiased());
+            // Only categories with non-zero coverage are listed
+            double[] cov = score.coverageArray();
+            for (int i = 0; i < types.length; i++) {
+                if (cov[i] > 0) {
+                    System.out.printf("  %-18s %.2f%% pair coverage%n",
+                            types[i] + ":", cov[i] * 100);
+                }
+            }
+            System.out.printf ("Total    : %.2f%% overall coverage%n", score.totalCoverage() * 100);
+            System.out.println("Lemmas   : " + score.getMatchedLemmas());
+            System.out.println();
+        }
+        // ------------------------------------------------------------------
+        // 4b. Analyse a corpus directory — writes a TSV results file
+        //     (only biased sentences are written by analyseDirectory).
+        //     Runs only when both paths are given on the command line, so
+        //     the default invocation stays a pure in-memory demo.
+        // ------------------------------------------------------------------
+        if (args.length >= 3) {
+            analyser.analyseDirectory(args[1], args[2]);
+        }
+        // ------------------------------------------------------------------
+        // 4c. Sentence splitting only — using the splitter standalone
+        // ------------------------------------------------------------------
+        String text = "Това е първото изречение. Второто е по-дълго и сложно! " +
+                      "А третото задава въпрос?";
+        String[] sentences = splitter.split(text);
+        System.out.println("=== Sentence splitting demo ===");
+        for (int i = 0; i < sentences.length; i++) {
+            System.out.printf("  [%d] %s%n", i + 1, sentences[i]);
+        }
+    }
+}

java/bg/bas/dcl/LLMs/BiasEntry.java ADDED Viewed

	@@ -0,0 +1,151 @@

+package bg.bas.dcl.LLMs;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+/**
+ * BiasEntry
+ *
+ * TSV column order (0-based, tab-separated):
+ *   0  word          — canonical lemma
+ *   1  POS           — part of speech  (N, A, V, …)
+ *   2  signal        — "true" / "false" : marks identity-group signals
+ *   3  biasType      — gender | race_ethnicity | religion | disability | appearance | "" (general)
+ *   4  biasValue     — positive | negative | neutral | ""
+ *   5  derogatory    — "true" / "false"
+ *   6  colloquial    — "true" / "false"
+ *   7  forms         — "true" / "false" (unused flag; inflected forms are in col 10)
+ *   8  positivity    — double in [0,1]
+ *   9  negativity    — double in [0,1]
+ *  10  inflectedForms — pipe-separated list of surface forms, or empty
+ */
+public class BiasEntry {
+    // -----------------------------------------------------------------------
+    // Fields
+    // -----------------------------------------------------------------------
+    private final String      word;
+    private final String      pos;
+    private final boolean     signal;
+    private final String      biasType;      // "" means general / not type-specific
+    private final String      biasValue;     // "" means unscored
+    private final boolean     derogatory;
+    private final boolean     colloquial;
+    private final double      positivity;
+    private final double      negativity;
+    /** All known surface forms, exactly as supplied to the constructor. */
+    private final Set<String> forms;
+    // -----------------------------------------------------------------------
+    // Constructor — called by BiasLexicon during TSV loading
+    // -----------------------------------------------------------------------
+    /**
+     * Creates an immutable dictionary entry.
+     *
+     * {@code null} strings are stored as {@code ""}; non-null strings are
+     * trimmed.  The {@code forms} set is copied, so later changes to the
+     * caller's set do not affect this entry; {@code null} is treated as an
+     * empty set.
+     *
+     * @param word       canonical lemma
+     * @param pos        part-of-speech tag
+     * @param signal     whether the word is an identity-group signal
+     * @param biasType   bias category, or empty/null for a general entry
+     * @param biasValue  polarity label, or empty/null when unscored
+     * @param derogatory whether the word is marked derogatory
+     * @param colloquial whether the word is marked colloquial
+     * @param positivity positivity score
+     * @param negativity negativity score
+     * @param forms      surface forms of the word; may be {@code null}
+     */
+    public BiasEntry(String word, String pos,
+                     boolean signal, String biasType, String biasValue,
+                     boolean derogatory, boolean colloquial,
+                     double positivity, double negativity,
+                     Set<String> forms) {
+        this.word        = word == null   ? "" : word.trim();
+        this.pos         = pos  == null   ? "" : pos.trim();
+        this.signal      = signal;
+        this.biasType    = biasType   == null ? "" : biasType.trim();
+        this.biasValue   = biasValue  == null ? "" : biasValue.trim();
+        this.derogatory  = derogatory;
+        this.colloquial  = colloquial;
+        this.positivity  = positivity;
+        this.negativity  = negativity;
+        // Defensive copy: unmodifiableSet alone is only a read-only view of
+        // the caller's set and would still reflect the caller's later edits.
+        this.forms       = Collections.unmodifiableSet(
+                           forms == null ? new HashSet<>() : new HashSet<>(forms));
+    }
+    // -----------------------------------------------------------------------
+    // Accessors
+    // -----------------------------------------------------------------------
+    /** Canonical lemma as it appears in the dictionary. */
+    public String getWord()        { return word; }
+    /** Part-of-speech tag (N, A, V, …). */
+    public String getPos()         { return pos; }
+    /**
+     * True if this entry marks an identity-group signal word —
+     * i.e. a term that identifies a person by a protected attribute
+     * (e.g. "жена", "мюсюлманин").
+     */
+    public boolean isSignal()      { return signal; }
+    /**
+     * Bias category, or empty string if applicable to all categories.
+     * Values: "gender", "race_ethnicity", "religion", "disability", "appearance".
+     */
+    public String getBiasType()    { return biasType; }
+    /**
+     * Evaluative polarity of the word in a bias context.
+     * Values: "positive", "negative", "neutral", or "" (unscored).
+     */
+    public String getBiasValue()   { return biasValue; }
+    /** True if the word is explicitly marked as derogatory / pejorative. */
+    public boolean isDerogatory()  { return derogatory; }
+    /** True if the word is marked as colloquial / informal. */
+    public boolean isColloquial()  { return colloquial; }
+    /**
+     * Positivity score in [0, 1] derived from BulNet synset sentiment.
+     * Higher = more positive connotation.
+     */
+    public double getPositivity()  { return positivity; }
+    /**
+     * Negativity score in [0, 1] derived from BulNet synset sentiment.
+     * Higher = more negative connotation.
+     */
+    public double getNegativity()  { return negativity; }
+    /**
+     * Unmodifiable set of the surface forms this entry was created with.
+     * The set is a private copy and never changes after construction.
+     */
+    public Set<String> getForms()  { return forms; }
+    // -----------------------------------------------------------------------
+    // Convenience predicates
+    // -----------------------------------------------------------------------
+    /**
+     * True if this entry carries a polarity, i.e. its biasValue is
+     * non-empty and is not "neutral".
+     */
+    public boolean isEvaluative() {
+        return !biasValue.isEmpty() && !biasValue.equals("neutral");
+    }
+    /** True if biasType is non-empty (i.e. assigned to a specific category). */
+    public boolean isTyped() {
+        return !biasType.isEmpty();
+    }
+    /**
+     * True if this entry can act as an evaluative modifier in a bias pair —
+     * i.e. it is evaluative, derogatory or colloquial, or its positivity or
+     * negativity score is above 0.5.
+     */
+    public boolean isEvaluativeModifier() {
+        return isEvaluative() || derogatory || colloquial
+                || positivity > 0.5 || negativity > 0.5;
+    }
+    // -----------------------------------------------------------------------
+    // Object overrides
+    // -----------------------------------------------------------------------
+    @Override
+    public String toString() {
+        return String.format("BiasEntry{word='%s', signal=%b, type='%s', value='%s', "
+                + "pos+neg=[%.2f,%.2f], derog=%b, coll=%b, forms=%d}",
+                word, signal, biasType, biasValue,
+                positivity, negativity, derogatory, colloquial, forms.size());
+    }
+}

java/bg/bas/dcl/LLMs/BiasLexicon.java ADDED Viewed

	@@ -0,0 +1,258 @@

+package bg.bas.dcl.LLMs;
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+/**
+ * BiasLexicon
+ *
+ * Loads the Bulgarian bias dictionary (bulgarian_bias_dictionary_v4.tsv) and
+ * provides fast O(1) form-level lookup for use by the bias detector.
+ *
+ * -----------------------------------------------------------------------
+ * TSV FORMAT (tab-separated, first row is header):
+ *
+ *   Col  0  word           canonical lemma
+ *   Col  1  POS            N | A | V | …
+ *   Col  2  signal         true | false
+ *   Col  3  biasType       gender | race_ethnicity | religion | disability | appearance | ""
+ *   Col  4  biasValue      positive | negative | neutral | ""
+ *   Col  5  derogatory     true | false
+ *   Col  6  colloquial     true | false
+ *   Col  7  forms          (boolean flag — ignored; inflected forms in col 10)
+ *   Col  8  positivity     double [0,1]
+ *   Col  9  negativity     double [0,1]
+ *   Col 10  inflectedForms pipe-separated surface forms, or empty
+ *
+ *
+ */
+public class BiasLexicon {
+    // -----------------------------------------------------------------------
+    // Indexes
+    // -----------------------------------------------------------------------
+    /**
+     * Primary form index: lowercased surface form → BiasEntry.
+     * A single form can only map to one entry (first one wins if there are
+     * duplicates — extremely rare in the dictionary).
+     */
+    private final Map<String, BiasEntry> formIndex = new HashMap<>();
+    /**
+     * Canonical word index: lowercased lemma → BiasEntry.
+     * Useful when you already have the base form.
+     */
+    private final Map<String, BiasEntry> wordIndex = new HashMap<>();
+    /** All entries in load order. */
+    private final List<BiasEntry> entries = new ArrayList<>();
+    // -----------------------------------------------------------------------
+    // Loading statistics
+    // -----------------------------------------------------------------------
+    private int loadedEntries  = 0;
+    private int skippedLines   = 0;
+    private int formConflicts  = 0;
+    // -----------------------------------------------------------------------
+    // Constructor
+    // -----------------------------------------------------------------------
+    /**
+     * Loads the bias dictionary from a TSV file.
+     *
+     * @param tsvPath absolute path to the TSV file
+     * @throws RuntimeException if the file cannot be read
+     */
+    public BiasLexicon(String tsvPath) {
+        load(tsvPath);
+        System.out.printf("[BiasLexicon] Loaded %d entries, %d form keys, "
+                + "%d skipped lines, %d form conflicts.%n",
+                loadedEntries, formIndex.size(), skippedLines, formConflicts);
+    }
+    // -----------------------------------------------------------------------
+    // Lookup API
+    // -----------------------------------------------------------------------
+    /**
+     * Looks up a surface token (case-insensitive) and returns the
+     * matching {@link BiasEntry}, or {@code null} if not found.
+     *
+     * @param token any surface form (inflected or base)
+     */
+    public BiasEntry lookup(String token) {
+        if (token == null || token.isBlank()) return null;
+        return formIndex.get(token.toLowerCase().trim());
+    }
+    /**
+     * Returns true if the token (any form) is present in the lexicon.
+     *
+     * @param token surface form to check
+     */
+    public boolean contains(String token) {
+        return lookup(token) != null;
+    }
+    /**
+     * Looks up a canonical lemma directly.
+     *
+     * @param lemma the base/dictionary form
+     */
+    public BiasEntry lookupLemma(String lemma) {
+        if (lemma == null || lemma.isBlank()) return null;
+        return wordIndex.get(lemma.toLowerCase().trim());
+    }
+    // -----------------------------------------------------------------------
+    // Filtered views
+    // -----------------------------------------------------------------------
+    /**
+     * Returns all entries whose {@code biasType} matches the given category
+     * (case-insensitive), plus all general entries (empty biasType).
+     *
+     * @param biasType e.g. "gender", "disability"
+     */
+    public List<BiasEntry> getByType(String biasType) {
+        List<BiasEntry> result = new ArrayList<>();
+        String target = biasType == null ? "" : biasType.toLowerCase().trim();
+        for (BiasEntry e : entries)
+            if (e.getBiasType().equalsIgnoreCase(target) || e.getBiasType().isEmpty())
+                result.add(e);
+        return result;
+    }
+    /**
+     * Returns all entries that are marked as signals (signal=true) for
+     * the given bias category, or all signal entries if biasType is null/empty.
+     */
+    public List<BiasEntry> getSignals(String biasType) {
+        List<BiasEntry> result = new ArrayList<>();
+        for (BiasEntry e : entries) {
+            if (!e.isSignal()) continue;
+            if (biasType == null || biasType.isBlank()
+                    || e.getBiasType().isEmpty()
+                    || e.getBiasType().equalsIgnoreCase(biasType))
+                result.add(e);
+        }
+        return result;
+    }
+    /** Returns an unmodifiable view of all loaded entries. */
+    public Collection<BiasEntry> getAll() {
+        return Collections.unmodifiableList(entries);
+    }
+    /** Number of loaded dictionary entries. */
+    public int size() { return entries.size(); }
+    // -----------------------------------------------------------------------
+    // Internal loading
+    // -----------------------------------------------------------------------
+    /**
+     * Reads the tab-separated bias dictionary at {@code tsvPath} and fills
+     * {@code entries}, {@code wordIndex} and {@code formIndex}.
+     *
+     * The first line is treated as a header and skipped. Every following
+     * non-blank line needs at least 10 tab-separated columns:
+     * 0 lemma, 1 POS, 2 signal flag, 3 bias type, 4 bias value,
+     * 5 and 6 boolean flags (passed to BiasEntry as derog / coll),
+     * 7 forms flag (ignored), 8 positivity, 9 negativity, and optionally
+     * 10 pipe-separated inflected forms.
+     * Lines that are blank, too short, have an empty lemma, or fail to parse
+     * are counted in {@code skippedLines} and otherwise ignored.
+     *
+     * @param tsvPath path of the UTF-8 encoded TSV file
+     * @throws RuntimeException if the file cannot be opened or read
+     */
+    private void load(String tsvPath) {
+        try (BufferedReader br = new BufferedReader(
+                new InputStreamReader(new FileInputStream(tsvPath),
+                        StandardCharsets.UTF_8))) {
+            String headerLine = br.readLine(); // skip header
+            if (headerLine == null) {
+                System.err.println("[BiasLexicon] Empty file: " + tsvPath);
+                return;
+            }
+            String line;
+            int lineNum = 1; // already read header as line 1
+            while ((line = br.readLine()) != null) {
+                lineNum++;
+                if (line.isBlank()) { skippedLines++; continue; }
+                // Limit -1 keeps trailing empty columns so the count is reliable.
+                String[] cols = line.split("\t", -1);
+                // Minimum viable: need at least 10 columns
+                if (cols.length < 10) {
+                    System.err.printf("[BiasLexicon] Line %d: only %d columns, skipping.%n",
+                            lineNum, cols.length);
+                    skippedLines++;
+                    continue;
+                }
+                String word = cols[0].trim();
+                // A row without a lemma would be indexed under the empty string
+                // and could then match empty tokens, so it is rejected here.
+                if (word.isEmpty()) {
+                    System.err.printf("[BiasLexicon] Line %d: empty lemma, skipping.%n",
+                            lineNum);
+                    skippedLines++;
+                    continue;
+                }
+                try {
+                    String pos        = cols[1].trim();
+                    boolean signal    = "true".equalsIgnoreCase(cols[2].trim());
+                    String biasType   = cols[3].trim();
+                    String biasValue  = cols[4].trim();
+                    boolean derog     = "true".equalsIgnoreCase(cols[5].trim());
+                    boolean coll      = "true".equalsIgnoreCase(cols[6].trim());
+                    // cols[7] is a boolean forms-flag (ignored)
+                    double positivity = parseDouble(cols[8], lineNum);
+                    double negativity = parseDouble(cols[9], lineNum);
+                    // Inflected forms: pipe-separated in col 10 (if present)
+                    Set<String> formsSet = new HashSet<>();
+                    formsSet.add(word.toLowerCase()); // always include the lemma
+                    if (cols.length > 10 && !cols[10].isBlank()) {
+                        for (String f : cols[10].split("\\|")) {
+                            String fc = f.trim().toLowerCase();
+                            if (!fc.isEmpty()) formsSet.add(fc);
+                        }
+                    }
+                    BiasEntry entry = new BiasEntry(word, pos, signal,
+                            biasType, biasValue, derog, coll,
+                            positivity, negativity, formsSet);
+                    entries.add(entry);
+                    // NOTE(review): a repeated lemma overwrites the earlier entry
+                    // here, while formIndex below keeps the first one — confirm
+                    // that this asymmetry is intended.
+                    wordIndex.put(word.toLowerCase(), entry);
+                    for (String form : formsSet) {
+                        // Keep first entry — do not overwrite
+                        if (formIndex.putIfAbsent(form, entry) != null) {
+                            formConflicts++;
+                        }
+                    }
+                    loadedEntries++;
+                } catch (Exception e) {
+                    System.err.printf("[BiasLexicon] Line %d: parse error — %s%n",
+                            lineNum, e.getMessage());
+                    skippedLines++;
+                }
+            }
+        } catch (Exception e) {
+            throw new RuntimeException("Failed to load bias lexicon from: " + tsvPath, e);
+        }
+    }
+    /**
+     * Parses a score column, falling back to 0.0 when the value is unusable.
+     *
+     * A value is unusable when it is {@code null}, is not a valid double
+     * literal, or parses to NaN or an infinity. In each of these cases a
+     * warning with the line number is printed to standard error.
+     *
+     * @param s       raw column text; surrounding whitespace is ignored
+     * @param lineNum 1-based line number, used only in the warning
+     * @return the parsed finite value, or 0.0 if the text is unusable
+     */
+    private double parseDouble(String s, int lineNum) {
+        if (s != null) {
+            try {
+                double value = Double.parseDouble(s.trim());
+                // Double.parseDouble accepts "NaN" and "Infinity"; such values
+                // are rejected here rather than passed on as scores.
+                if (Double.isFinite(value)) {
+                    return value;
+                }
+            } catch (NumberFormatException ignored) {
+                // Not a number: reported by the shared warning below.
+            }
+        }
+        System.err.printf("[BiasLexicon] Line %d: cannot parse double '%s', using 0.0%n",
+                lineNum, s);
+        return 0.0;
+    }
+}

java/bg/bas/dcl/LLMs/BulgarianSentenceSplitter.java ADDED Viewed

	@@ -0,0 +1,163 @@

+package bg.bas.dcl.LLMs;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+/**
+ * BulgarianSentenceSplitter
+ *
+ * Wraps the Apache OpenNLP sentence detection model for Bulgarian, providing
+ * a clean, reusable API for all other pipeline components.
+ *
+ * -----------------------------------------------------------------------
+ * MAVEN DEPENDENCIES (add to pom.xml):
+ *
+ *   <!-- OpenNLP toolkit -->
+ *   <dependency>
+ *     <groupId>org.apache.opennlp</groupId>
+ *     <artifactId>opennlp-tools</artifactId>
+ *     <version>2.4.0</version>
+ *   </dependency>
+ *
+ *   <!-- Bulgarian sentence-detection model (UD-based, Apache 2.0) -->
+ *   <dependency>
+ *     <groupId>org.apache.opennlp</groupId>
+ *     <artifactId>opennlp-models-sentdetect-bg</artifactId>
+ *     <version>1.2</version>
+ *   </dependency>
+ *
+ * The model JAR bundles the binary model at:
+ *   opennlp/models/sentdetect/bg-ud-ewt-sentence-detector.bin
+ * You can also supply an external model file via the constructor that takes a model path.
+ *
+ * -------------------------------------------------
+ */
+public class BulgarianSentenceSplitter {
+    // -----------------------------------------------------------------------
+    // Constants
+    // -----------------------------------------------------------------------
+    /**
+     * Classpath location of the bundled Bulgarian sentence-detection model.
+     * Matches the path inside the opennlp-models-sentdetect-bg JAR.
+     */
+    private static final String BUNDLED_MODEL_PATH =
+            "opennlp/models/sentdetect/bg-ud-ewt-sentence-detector.bin";
+    /**
+     * Minimum character length for a string to be considered a valid sentence.
+     * Shorter strings are returned as-is without splitting.
+     */
+    private static final int MIN_TEXT_LENGTH = 5;
+    // -----------------------------------------------------------------------
+    // State
+    // -----------------------------------------------------------------------
+    private final SentenceDetectorME detector;
+    // -----------------------------------------------------------------------
+    // Constructors
+    // -----------------------------------------------------------------------
+    /**
+     * Loads the Bulgarian sentence-detection model from the bundled Maven JAR.
+     * Requires the opennlp-models-sentdetect-bg artifact on the classpath.
+     *
+     * @throws RuntimeException if the model cannot be loaded
+     */
+    public BulgarianSentenceSplitter() {
+        this(null);
+    }
+    /**
+     * Loads the Bulgarian sentence-detection model.
+     *
+     * @param modelPath absolute path to a .bin OpenNLP sentence-detection model,
+     *                  or {@code null} / empty string to load from the classpath JAR
+     * @throws RuntimeException if the model cannot be loaded
+     */
+    public BulgarianSentenceSplitter(String modelPath) {
+        try {
+            InputStream stream;
+            if (modelPath == null || modelPath.isBlank()) {
+                // Load from the bundled JAR on the classpath
+                stream = getClass().getClassLoader()
+                        .getResourceAsStream(BUNDLED_MODEL_PATH);
+                if (stream == null) {
+                    throw new IllegalStateException(
+                            "Bulgarian sentence model not found .");
+                }
+                System.out.println("[SentenceSplitter] Loaded bundled model: " + BUNDLED_MODEL_PATH);
+            } else {
+                File f = new File(modelPath);
+                if (!f.exists())
+                    throw new IllegalArgumentException(
+                            "Sentence model file not found: " + modelPath);
+                stream = new FileInputStream(f);
+                System.out.println("[SentenceSplitter] Loaded external model: " + modelPath);
+            }
+            SentenceModel model = new SentenceModel(stream);
+            stream.close();
+            detector = new SentenceDetectorME(model);
+        } catch (Exception e) {
+            throw new RuntimeException("Failed to load Bulgarian sentence model", e);
+        }
+    }
+    // -----------------------------------------------------------------------
+    // Core API
+    // -----------------------------------------------------------------------
+    public String[] split(String text) {
+        if (text == null) return new String[0];
+        String trimmed = text.trim();
+        if (trimmed.length() < MIN_TEXT_LENGTH) {
+            return trimmed.isEmpty() ? new String[0] : new String[]{trimmed};
+        }
+        return detector.sentDetect(trimmed);
+    }
+    public List<String> splitToList(String text) {
+        return new ArrayList<>(Arrays.asList(split(text)));
+    }
+    public List<String> splitParagraphs(String[] paragraphs) {
+        List<String> all = new ArrayList<>();
+        if (paragraphs == null) return all;
+        for (String para : paragraphs) {
+            if (para != null && !para.isBlank())
+                all.addAll(splitToList(para));
+        }
+        return all;
+    }
+    public double[] getSentenceProbabilities() {
+        return detector.getSentenceProbabilities();
+    }
+    public List<String> splitAndFilter(String text, int minWords) {
+        List<String> result = new ArrayList<>();
+        for (String sent : split(text)) {
+            if (sent.split("\\s+").length >= minWords)
+                result.add(sent);
+        }
+        return result;
+    }
+}

java/bg/bas/dcl/LLMs/DeduplicationProcessor.java ADDED Viewed

	@@ -0,0 +1,571 @@

+package bg.bas.dcl.LLMs.IfGPTDataset;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.io.Writer;
+import java.nio.file.Files;
+import java.nio.file.StandardCopyOption;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Scanner;
+import java.util.Set;
+import java.util.TreeSet;
+import info.debatty.java.lsh.MinHash;
+import bg.bas.dcl.general.FileHandler;
+/**
+ * DeduplicationProcessor — sentence-level near-duplicate detection
+ * using MinHash signatures (estimated Jaccard similarity). Signatures are
+ * compared exhaustively; no LSH banding index is used yet.
+ *
+ * -----------------------------------------------------------------------
+ * MAVEN DEPENDENCY (add to pom.xml):
+ *
+ *   <dependency>
+ *     <groupId>info.debatty</groupId>
+ *     <artifactId>java-lsh</artifactId>
+ *     <version>0.12</version>
+ *   </dependency>
+ *
+ * -----------------------------------------------------------------------
+ * HOW IT WORKS
+ *
+ *   1. INDEX phase  — reads all .txt files in the "full corpus" directory.
+ *      Each sentence is shingled into character n-grams, converted to a
+ *      boolean vector over a shared vocabulary, and a MinHash signature
+ *      is computed.  All signatures are stored in an in-memory index keyed
+ *      by (file, lineNumber).
+ *
+ *   2. QUERY phase  — reads every sentence in the "new folder".
+ *      For each sentence its MinHash signature is compared against every
+ *      indexed corpus signature (approximate Jaccard via signature similarity).
+ *      Pairs whose estimated Jaccard similarity ≥ threshold are reported.
+ *
+ *   3. REPORT       — a TSV report is written listing every duplicate pair:
+ *      new-file | new-line | corpus-file | corpus-line | similarity | sentence
+ *
+ *   4. OPTIONAL REMOVE — sentences in the new folder that are duplicates of
+ *      corpus sentences are stripped from their file (originals backed up).
+ *      Files that become empty after removal are deleted.
+ *
+ * -----------------------------------------------------------------------
+ * PARAMETERS
+ *
+ *   threshold   — Jaccard similarity to call a near-duplicate  (default 0.90)
+ *   shingleSize — character n-gram size for shingling           (default 5)
+ *   numHashes   — number of hash functions for MinHash          (default 200)
+ *                 More hashes → better accuracy, slower index.
+ *
+ * -----------------------------------------------------------------------
+ * USAGE
+ *
+ *   DeduplicationProcessor dp = new DeduplicationProcessor(0.90);
+ *   dp.indexCorpus("/path/to/full/corpus/");
+ *   dp.detectDuplicates("/path/to/new/folder/", "/path/to/report.tsv");
+ *   dp.removeDuplicatesFromNewFolder("/path/to/new/folder/", true); // true=keep .bak
+ */
+public class DeduplicationProcessor {
+    // -----------------------------------------------------------------------
+    // Configuration
+    // -----------------------------------------------------------------------
+    private final double threshold;     // Jaccard similarity cut-off
+    private final int    shingleSize;   // character n-gram size
+    private final int    numHashes;     // MinHash signature length
+    // -----------------------------------------------------------------------
+    // Index state (built during indexCorpus)
+    // -----------------------------------------------------------------------
+    /**
+     * Shared vocabulary: every distinct shingle seen across the indexed corpus,
+     * mapped to its fixed position in the boolean presence vector. Storing the
+     * position keeps indexing and querying on the same column layout.
+     */
+    private final Map<String, Integer> vocabulary = new HashMap<>();
+    /** Corpus index: maps SentenceKey → raw sentence text + MinHash signature. */
+    private final Map<SentenceKey, IndexedSentence> corpusIndex = new LinkedHashMap<>();
+    /** MinHash object — initialised once vocabulary size is known. */
+    private MinHash minHash;
+    // -----------------------------------------------------------------------
+    // Duplicate results (populated by detectDuplicates)
+    // -----------------------------------------------------------------------
+    /** All duplicate pairs found in the last detectDuplicates run. */
+    private final List<DuplicatePair> duplicatePairs = new ArrayList<>();
+    /**
+     * Set of SentenceKeys in the NEW folder that are duplicates.
+     * Used by removeDuplicatesFromNewFolder.
+     */
+    private final Set<SentenceKey> duplicateNewSentences = new HashSet<>();
+    // -----------------------------------------------------------------------
+    // Constructor
+    // -----------------------------------------------------------------------
+    /**
+     * Creates a processor with shingle size 5 and 200 hash functions.
+     *
+     * @param threshold Jaccard similarity in [0, 1] at or above which two
+     *                  sentences count as near-duplicates
+     * @throws IllegalArgumentException if the threshold is outside [0, 1]
+     */
+    public DeduplicationProcessor(double threshold) {
+        this(threshold, 5, 200);
+    }
+    /**
+     * Creates a processor with explicit parameters.
+     *
+     * @param threshold   Jaccard similarity in [0, 1] at or above which two
+     *                    sentences count as near-duplicates
+     * @param shingleSize character n-gram size, at least 1
+     * @param numHashes   MinHash signature length, at least 1
+     * @throws IllegalArgumentException if any argument is out of range
+     */
+    public DeduplicationProcessor(double threshold, int shingleSize, int numHashes) {
+        // The negated form also rejects NaN, which "t < 0 || t > 1" lets through.
+        if (!(threshold >= 0 && threshold <= 1))
+            throw new IllegalArgumentException("Threshold must be in [0, 1].");
+        if (shingleSize < 1)
+            throw new IllegalArgumentException("Shingle size must be at least 1.");
+        if (numHashes < 1)
+            throw new IllegalArgumentException("Number of hashes must be at least 1.");
+        this.threshold   = threshold;
+        this.shingleSize = shingleSize;
+        this.numHashes   = numHashes;
+    }
+    // -----------------------------------------------------------------------
+    // Phase 1 — Index the full corpus
+    // -----------------------------------------------------------------------
+    /**
+     * Reads all .txt files in {@code corpusDir}, shingles every sentence,
+     * builds a shared vocabulary, and computes MinHash signatures.
+     *
+     * Each line is one sentence; lines shorter than the shingle size are
+     * ignored. The new index replaces any earlier one only after it has been
+     * built completely. Errors are printed and leave the earlier index in place.
+     *
+     * This must be called before {@link #detectDuplicates}.
+     *
+     * NOTE(review): sentences are keyed by file name without directory, so if
+     * FileHandler.getFileListing descends into sub-directories, equally named
+     * files would overwrite each other's entries — confirm.
+     *
+     * @param corpusDir directory of clean .txt files representing the full corpus
+     */
+    public void indexCorpus(String corpusDir) {
+        System.out.println("[Index] Scanning corpus: " + corpusDir);
+        try {
+            FileHandler fh = new FileHandler();
+            // --- Pass 1: collect sentences and build vocabulary ---
+            // Only the text is kept; shingles are recomputed in pass 2, which
+            // avoids a second disk scan and holding every shingle set in memory.
+            Map<String, Integer> stagedVocabulary = new HashMap<>();
+            Map<SentenceKey, String> stagedTexts = new LinkedHashMap<>();
+            for (File f : fh.getFileListing(new File(corpusDir))) {
+                if (!f.isFile() || !f.getName().endsWith(".txt")) continue;
+                try (Scanner sc = new Scanner(f, "UTF-8")) {
+                    int lineNum = 0;
+                    while (sc.hasNextLine()) {
+                        String line = sc.nextLine().trim();
+                        lineNum++;
+                        if (line.length() < shingleSize) continue;
+                        for (String s : shingle(line)) {
+                            // A new shingle takes the next free vector position.
+                            stagedVocabulary.putIfAbsent(s, stagedVocabulary.size());
+                        }
+                        stagedTexts.put(new SentenceKey(f.getName(), lineNum), line);
+                    }
+                }
+            }
+            System.out.println("[Index] Vocabulary size: " + stagedVocabulary.size()
+                    + "  Sentences: " + stagedTexts.size());
+            if (stagedVocabulary.isEmpty()) {
+                System.err.println("[Index] No sentences found — aborting.");
+                return;
+            }
+            // --- Initialise MinHash with finalised vocabulary size ---
+            // Both arguments are ints, so this selects the (signature size,
+            // dictionary size) constructor: the signature has exactly numHashes
+            // values. The (double error, int dictSize) overload would derive
+            // the signature size from an error bound instead.
+            MinHash stagedHash = new MinHash(numHashes, stagedVocabulary.size());
+            // --- Pass 2: compute and store signatures ---
+            Map<SentenceKey, IndexedSentence> stagedIndex = new LinkedHashMap<>();
+            for (Map.Entry<SentenceKey, String> entry : stagedTexts.entrySet()) {
+                String    text   = entry.getValue();
+                boolean[] vector = toVector(shingle(text), stagedVocabulary);
+                int[]     sig    = stagedHash.signature(vector);
+                stagedIndex.put(entry.getKey(), new IndexedSentence(text, sig));
+            }
+            // Publish together so vocabulary, hash functions and signatures
+            // always belong to the same corpus.
+            vocabulary.clear();
+            vocabulary.putAll(stagedVocabulary);
+            corpusIndex.clear();
+            corpusIndex.putAll(stagedIndex);
+            minHash = stagedHash;
+            System.out.println("[Index] Corpus index built: "
+                    + corpusIndex.size() + " sentences.");
+        } catch (Exception e) {
+            System.err.println("[Index] Failed to index corpus: " + corpusDir);
+            e.printStackTrace();
+        }
+    }
+    // -----------------------------------------------------------------------
+    // Phase 2 — Detect duplicates in new folder
+    // -----------------------------------------------------------------------
+    /**
+     * Compares every sentence in {@code newDir} against the corpus index.
+     * Pairs with estimated Jaccard ≥ threshold are recorded as duplicates
+     * and written to {@code reportPath}.
+     *
+     * Results of an earlier run are discarded. Errors are printed; pairs found
+     * before the error stay available, but the report may not be written.
+     *
+     * Call {@link #indexCorpus} first.
+     *
+     * @param newDir     directory of new .txt files to check
+     * @param reportPath destination TSV report file
+     */
+    public void detectDuplicates(String newDir, String reportPath) {
+        if (corpusIndex.isEmpty() || minHash == null) {
+            System.err.println("[Detect] Corpus index is empty. Call indexCorpus() first.");
+            return;
+        }
+        System.out.println("[Detect] Comparing new folder against corpus index...");
+        duplicatePairs.clear();
+        duplicateNewSentences.clear();
+        try {
+            FileHandler fh = new FileHandler();
+            for (File f : fh.getFileListing(new File(newDir))) {
+                if (!f.isFile() || !f.getName().endsWith(".txt")) continue;
+                System.out.println("[Detect] Checking: " + f.getName());
+                try (Scanner sc = new Scanner(f, "UTF-8")) {
+                    int lineNum = 0;
+                    while (sc.hasNextLine()) {
+                        String line = sc.nextLine().trim();
+                        lineNum++;
+                        if (line.length() < shingleSize) continue;
+                        checkSentence(new SentenceKey(f.getName(), lineNum), line);
+                    }
+                }
+            }
+            System.out.println("[Detect] Duplicate sentence pairs found: "
+                    + duplicatePairs.size());
+            System.out.println("[Detect] Unique new sentences flagged: "
+                    + duplicateNewSentences.size());
+            writeReport(reportPath);
+        } catch (Exception e) {
+            System.err.println("[Detect] Failed while checking: " + newDir);
+            e.printStackTrace();
+        }
+    }
+    /**
+     * Compares one new sentence with every indexed corpus sentence and records
+     * each pair whose estimated similarity reaches the threshold.
+     */
+    private void checkSentence(SentenceKey newKey, String line) {
+        Set<String> shingles = shingle(line);
+        // Only shingles already in the vocabulary can be represented in the vector.
+        int known = 0;
+        for (String s : shingles) {
+            if (vocabulary.containsKey(s)) known++;
+        }
+        // No shared shingle at all: the sentence cannot match anything.
+        if (known == 0) return;
+        // Every corpus sentence lies inside the vocabulary, so the true Jaccard
+        // similarity to any of them is at most known / total. Without this
+        // check the unknown shingles would be silently dropped and a sentence
+        // that merely contains a corpus sentence would score close to 1.0.
+        if (known < threshold * shingles.size()) return;
+        int[] newSig = minHash.signature(toVector(shingles, vocabulary));
+        // Compare against all corpus sentences
+        // For large corpora, replace this loop with an LSH band index
+        for (Map.Entry<SentenceKey, IndexedSentence> entry : corpusIndex.entrySet()) {
+            double sim = minHash.similarity(newSig, entry.getValue().signature);
+            if (sim >= threshold) {
+                duplicatePairs.add(new DuplicatePair(
+                        newKey, line,
+                        entry.getKey(), entry.getValue().text,
+                        sim));
+                duplicateNewSentences.add(newKey);
+                // Don't break — report ALL corpus matches for transparency
+            }
+        }
+    }
+    // -----------------------------------------------------------------------
+    // Phase 3 — Optionally remove duplicates from new folder
+    // -----------------------------------------------------------------------
+    /**
+     * Removes from every file in {@code newDir} any sentence whose
+     * (file, lineNumber) is in the duplicate set detected by
+     * {@link #detectDuplicates}.
+     *
+     * Files that become empty after removal are deleted.
+     * Must be called after {@link #detectDuplicates}, and before the files are
+     * changed in any other way, because matching relies on line numbers.
+     *
+     * @param newDir     directory of new .txt files to clean
+     * @param keepBackup if true, originals are copied to *.bak first
+     */
+    public void removeDuplicatesFromNewFolder(String newDir, boolean keepBackup) {
+        if (duplicateNewSentences.isEmpty()) {
+            System.out.println("[Remove] No duplicates to remove.");
+            return;
+        }
+        System.out.println("[Remove] Removing "
+                + duplicateNewSentences.size() + " duplicate sentences...");
+        try {
+            FileHandler fh = new FileHandler();
+            int filesModified = 0;
+            int totalRemoved  = 0;
+            for (File f : fh.getFileListing(new File(newDir))) {
+                if (!f.isFile() || !f.getName().endsWith(".txt")) continue;
+                List<String> inputLines = new ArrayList<>();
+                try (Scanner sc = new Scanner(f, "UTF-8")) {
+                    while (sc.hasNextLine()) {
+                        inputLines.add(sc.nextLine());
+                    }
+                }
+                List<String> outputLines = new ArrayList<>();
+                int removed = 0;
+                for (int i = 0; i < inputLines.size(); i++) {
+                    String trimmed = inputLines.get(i).trim();
+                    // +1 because lineNum was 1-based during detection
+                    SentenceKey key = new SentenceKey(f.getName(), i + 1);
+                    if (trimmed.length() >= shingleSize
+                            && duplicateNewSentences.contains(key)) {
+                        removed++;
+                    } else {
+                        outputLines.add(inputLines.get(i));
+                    }
+                }
+                if (removed == 0) continue;
+                if (keepBackup) {
+                    Files.copy(f.toPath(),
+                            new File(f.getAbsolutePath() + ".bak").toPath(),
+                            StandardCopyOption.REPLACE_EXISTING);
+                }
+                // Check if file would become empty (only blank lines)
+                boolean allBlank = outputLines.stream()
+                        .allMatch(String::isBlank);
+                if (allBlank) {
+                    if (f.delete()) {
+                        System.out.println("[Remove] Deleted (empty after dedup): "
+                                + f.getName());
+                    } else {
+                        System.err.println("[Remove] Could not delete: "
+                                + f.getAbsolutePath());
+                    }
+                } else {
+                    try (Writer w = new OutputStreamWriter(
+                            new FileOutputStream(f), "UTF-8")) {
+                        for (String l : outputLines) {
+                            w.write(l + "\n");
+                        }
+                    }
+                    System.out.println("[Remove] " + f.getName()
+                            + " — removed " + removed + " sentences.");
+                }
+                filesModified++;
+                totalRemoved += removed;
+            }
+            System.out.println("[Remove] Done. Files modified: " + filesModified
+                    + "  Sentences removed: " + totalRemoved);
+        } catch (Exception e) {
+            System.err.println("[Remove] Failed while cleaning: " + newDir);
+            e.printStackTrace();
+        }
+    }
+    // -----------------------------------------------------------------------
+    // Report writer
+    // -----------------------------------------------------------------------
+    /**
+     * Writes the duplicate pairs as a UTF-8 TSV file: "#" comment lines with
+     * the run parameters, a blank line, a column header, then one row per
+     * pair sorted by similarity (descending), new file name and new line.
+     *
+     * @param reportPath destination file, overwritten if it exists
+     * @throws Exception if the file cannot be written
+     */
+    private void writeReport(String reportPath) throws Exception {
+        try (PrintWriter pw = new PrintWriter(
+                new OutputStreamWriter(new FileOutputStream(reportPath), "UTF-8"))) {
+            // Header
+            pw.println("# DeduplicationProcessor report");
+            pw.println("# Threshold: " + threshold
+                    + "  ShingleSize: " + shingleSize
+                    + "  NumHashes: " + numHashes);
+            pw.println("# Duplicate pairs: " + duplicatePairs.size());
+            pw.println("# Unique new sentences flagged: " + duplicateNewSentences.size());
+            pw.println();
+            pw.println("NEW_FILE\tNEW_LINE\tCORPUS_FILE\tCORPUS_LINE\tSIMILARITY\tNEW_SENTENCE\tCORPUS_SENTENCE");
+            // Sort by similarity descending, then new file, then line
+            List<DuplicatePair> sorted = new ArrayList<>(duplicatePairs);
+            sorted.sort((a, b) -> {
+                int cmp = Double.compare(b.similarity, a.similarity);
+                if (cmp != 0) return cmp;
+                cmp = a.newKey.fileName.compareTo(b.newKey.fileName);
+                if (cmp != 0) return cmp;
+                return Integer.compare(a.newKey.lineNumber, b.newKey.lineNumber);
+            });
+            for (DuplicatePair p : sorted) {
+                // Locale.ROOT keeps the decimal point a '.', whatever the JVM's
+                // default locale, so the report stays machine-readable.
+                pw.printf(java.util.Locale.ROOT, "%s\t%d\t%s\t%d\t%.4f\t%s\t%s%n",
+                        p.newKey.fileName,
+                        p.newKey.lineNumber,
+                        p.corpusKey.fileName,
+                        p.corpusKey.lineNumber,
+                        p.similarity,
+                        sanitiseTsv(p.newText),
+                        sanitiseTsv(p.corpusText));
+            }
+        }
+        System.out.println("[Report] Written to: " + reportPath);
+    }
+    // -----------------------------------------------------------------------
+    // Shingling and vectorisation helpers
+    // -----------------------------------------------------------------------
+    /**
+     * Produces the set of character n-grams (shingles) for a sentence.
+     * Lowercased so matching is case-insensitive; Locale.ROOT makes the
+     * lowercasing independent of the JVM's default locale.
+     *
+     * @param text sentence to shingle
+     * @return distinct shingles; empty if the text is shorter than one shingle
+     */
+    private Set<String> shingle(String text) {
+        Set<String> shingles = new HashSet<>();
+        String lower = text.toLowerCase(java.util.Locale.ROOT);
+        for (int i = 0; i <= lower.length() - shingleSize; i++) {
+            shingles.add(lower.substring(i, i + shingleSize));
+        }
+        return shingles;
+    }
+    /**
+     * Converts a shingle set to a boolean presence vector over a vocabulary.
+     * Shingles that are not part of the vocabulary are ignored.
+     *
+     * @param shingles  shingle set for this sentence
+     * @param positions vocabulary mapping each shingle to its vector position
+     * @return boolean[] of vocabulary size where true = shingle present
+     */
+    private boolean[] toVector(Set<String> shingles, Map<String, Integer> positions) {
+        boolean[] vector = new boolean[positions.size()];
+        for (String s : shingles) {
+            Integer pos = positions.get(s);
+            if (pos != null) vector[pos] = true;
+        }
+        return vector;
+    }
+    // -----------------------------------------------------------------------
+    // Utility
+    // -----------------------------------------------------------------------
+    /** Replaces tabs and line feeds with spaces and drops carriage returns. */
+    private String sanitiseTsv(String s) {
+        if (s == null) return "";
+        return s.replace("\t", " ").replace("\n", " ").replace("\r", "");
+    }
+    /** Returns an unmodifiable view of all detected duplicate pairs. */
+    public List<DuplicatePair> getDuplicatePairs() {
+        return Collections.unmodifiableList(duplicatePairs);
+    }
+    /** Returns the number of corpus sentences indexed. */
+    public int getCorpusSize() {
+        return corpusIndex.size();
+    }
+    // -----------------------------------------------------------------------
+    // Inner data classes
+    // -----------------------------------------------------------------------
+    /**
+     * Identifies a sentence by its source file name and 1-based line number.
+     */
+    public static class SentenceKey {
+        public final String fileName;
+        public final int    lineNumber;
+        public SentenceKey(String fileName, int lineNumber) {
+            this.fileName   = fileName;
+            this.lineNumber = lineNumber;
+        }
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) return true;
+            if (!(o instanceof SentenceKey)) return false;
+            SentenceKey other = (SentenceKey) o;
+            return lineNumber == other.lineNumber
+                    && fileName.equals(other.fileName);
+        }
+        @Override
+        public int hashCode() {
+            return 31 * fileName.hashCode() + lineNumber;
+        }
+        @Override
+        public String toString() {
+            return fileName + ":" + lineNumber;
+        }
+    }
+    /**
+     * Holds the raw text and MinHash signature for an indexed corpus sentence.
+     */
+    private static class IndexedSentence {
+        final String text;
+        final int[]  signature;
+        IndexedSentence(String text, int[] signature) {
+            this.text      = text;
+            this.signature = signature;
+        }
+    }
+    /**
+     * Represents a detected near-duplicate pair between a new sentence
+     * and a corpus sentence.
+     */
+    public static class DuplicatePair {
+        public final SentenceKey newKey;
+        public final String      newText;
+        public final SentenceKey corpusKey;
+        public final String      corpusText;
+        public final double      similarity;
+        public DuplicatePair(SentenceKey newKey,    String newText,
+                             SentenceKey corpusKey, String corpusText,
+                             double similarity) {
+            this.newKey     = newKey;
+            this.newText    = newText;
+            this.corpusKey  = corpusKey;
+            this.corpusText = corpusText;
+            this.similarity = similarity;
+        }
+        @Override
+        public String toString() {
+            return String.format("[%.2f] %s ↔ %s", similarity, newKey, corpusKey);
+        }
+    }
+}

java/bg/bas/dcl/LLMs/FileCleanProcessor.java ADDED Viewed

	@@ -0,0 +1,453 @@

+package bg.bas.dcl.LLMs.IfGPTDataset;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.io.Writer;
+import java.nio.file.Files;
+import java.nio.file.StandardCopyOption;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Scanner;
+import java.util.Set;
+import java.util.regex.Pattern;
+import bg.bas.dcl.general.FileHandler;
+/**
+ * FileCleanProcessor — corpus boilerplate remover.
+ *
+ * Two-phase cleaning:
+ *
+ * Phase 1 — LEARN (from a sample directory):
+ *   Scans every .txt file in the sample dir and records how many files each
+ *   non-empty line appears in.  Lines that appear in ≥ THRESHOLD of the
+ *   sample files are added to the "common lines" blocklist.
+ *   The blocklist is also saved to disk for inspection / reuse.
+ *
+ * Phase 2 — CLEAN (over the full data directory):
+ *   For every .txt file, removes lines that:
+ *     (a) appear in the learned common-lines blocklist, OR
+ *     (b) match any of the hardcoded boilerplate regex patterns
+ *         (HTML/XML tags, PHP markers, navigation patterns,
+ *          URLs, e-mail addresses, cookie/GDPR banners).
+ *   Cleaned files overwrite the originals (a .bak copy is kept when keepBackup is true).
+ *
+ * Usage:
+ *   FileCleanProcessor fcp = new FileCleanProcessor(0.50); // 50 % threshold
+ *   fcp.learnFromSample("/path/to/sample/dir/");
+ *   fcp.saveBlocklist("/path/to/blocklist.txt");           // optional
+ *   fcp.cleanDirectory("/path/to/full/data/dir/", true);  // true = keep .bak
+ */
+public class FileCleanProcessor {
+    // -----------------------------------------------------------------------
+    // Configuration
+    // -----------------------------------------------------------------------
+    /** Fraction (0..1) of sample files a line must appear in to be considered boilerplate. */
+    private final double threshold;
+    /** Minimum length of a line after trimming for it to be evaluated. Shorter
+     *  lines are not counted in the learn phase and are never removed in the
+     *  clean phase, so blank separator lines survive. */
+    private static final int MIN_LINE_LENGTH = 3;
+    // -----------------------------------------------------------------------
+    // State
+    // -----------------------------------------------------------------------
+    /** Trimmed lines treated as boilerplate: filled by Phase 1, by loading a
+     *  blocklist, or by {@link #setCommonLines}. */
+    private final Set<String> commonLines = new HashSet<>();
+    /** Diagnostic: line → number of sample files it appeared in (filled by Phase 1 only). */
+    private final Map<String, Integer> lineFrequency = new LinkedHashMap<>();
+    // -----------------------------------------------------------------------
+    // Hardcoded boilerplate patterns (always applied regardless of frequency)
+    // -----------------------------------------------------------------------
+    // shouldRemove() applies these to the trimmed line with find-style matching,
+    // so a pattern without ^...$ anchors removes every line that merely
+    // CONTAINS a match (e.g. any line holding a URL or an e-mail address).
+    //
+    // Patterns containing Cyrillic alternatives use (?iu): in java.util.regex
+    // the (?i) flag alone only folds US-ASCII letters, so without the
+    // UNICODE_CASE flag "Начало" or "БИСКВИТКИ" would not be matched.
+    private static final List<Pattern> BOILERPLATE_PATTERNS = Arrays.asList(
+        // ---- HTML / XML tags ------------------------------------------------
+        Pattern.compile("(?i)^\\s*<[^>]+>\\s*$"),                        // whole-line tag
+        Pattern.compile("(?i).*<(script|style|head|meta|link|iframe)[^>]*>.*"),
+        Pattern.compile("(?i).*</(script|style|head|body|html)>.*"),
+        Pattern.compile("(?i).*<!--.*-->.*"),                             // HTML comment
+        Pattern.compile("(?i).*&(nbsp|amp|lt|gt|quot|apos);.*"),         // HTML entities
+        // ---- PHP / server-side markers --------------------------------------
+        Pattern.compile("(?i).*<\\?php.*"),
+        Pattern.compile("(?i).*\\?>\\s*"),
+        Pattern.compile("(?i).*<%.*%>.*"),                                // ASP-style tags
+        // ---- Navigation / menu patterns ------------------------------------
+        Pattern.compile("(?iu)^\\s*(home|начало|меню|menu|навигация|navigation"
+                + "|търсене|search|вход|login|изход|logout"
+                + "|регистрация|register|контакти|contacts"
+                + "|за нас|about us|sitemap|карта на сайта)\\s*$"),
+        Pattern.compile("(?iu)^\\s*(next|prev|previous|следващ|предишен"
+                + "|напред|назад|нагоре|back|forward|top|горе)\\s*$"),
+        Pattern.compile("(?i)^\\s*\\|\\s*(.*\\|\\s*)+$"),                // pipe-separated nav bars
+        Pattern.compile("(?i)^\\s*(>\\s*){2,}"),                         // line starting with two or more '>' markers
+        Pattern.compile("(?i)^\\s*(\\d+\\.?\\s+){3,}$"),                 // numbered nav lists
+        // ---- URLs ----------------------------------------------------------
+        Pattern.compile("(?i)\\bhttps?://\\S+"),
+        Pattern.compile("(?i)\\bwww\\.\\S+\\.\\S+"),
+        Pattern.compile("(?i)\\bftp://\\S+"),
+        // ---- E-mail addresses ----------------------------------------------
+        Pattern.compile("[A-Za-z0-9._%+\\-]+@[A-Za-z0-9.\\-]+\\.[A-Za-z]{2,}"),
+        // ---- Cookie / GDPR banners -----------------------------------------
+        Pattern.compile("(?iu).*(бисквитки|cookies|gdpr|privacy policy|поверителност"
+                + "|приемам|accept all|отхвърлям|decline|consent"
+                + "|лични данни|personal data|условия за ползване"
+                + "|terms of (use|service)|политика за).*"),
+        // ---- Social / sharing buttons --------------------------------------
+        Pattern.compile("(?iu)^\\s*(share|сподели|like|харесай|tweet|retweet"
+                + "|pinterest|linkedin|facebook|twitter|instagram"
+                + "|google\\+?|youtube|tiktok|viber|whatsapp)\\s*$"),
+        // ---- Counters / analytics snippets ---------------------------------
+        Pattern.compile("(?i).*google.analytics.*"),
+        Pattern.compile("(?i).*ga\\s*\\(\\s*['\"].*"),
+        Pattern.compile("(?i).*gtag\\s*\\(.*"),
+        Pattern.compile("(?i).*_gaq\\.push.*"),
+        // ---- Print / date / page artefacts ---------------------------------
+        Pattern.compile("(?iu)^\\s*страница\\s+\\d+\\s*(от\\s+\\d+)?\\s*$"),  // "страница 1 от 5"
+        Pattern.compile("(?i)^\\s*page\\s+\\d+\\s*(of\\s+\\d+)?\\s*$"),
+        Pattern.compile("(?i)^\\s*©.*$"),                                 // copyright line
+        Pattern.compile("(?i)^\\s*all rights reserved.*$"),
+        Pattern.compile("(?iu)^\\s*права запазени.*$"),
+        // ---- Lines that are purely punctuation / symbols -------------------
+        Pattern.compile("^[\\s\\p{Punct}\\|\\-_=*~`^]+$")
+    );
+    // -----------------------------------------------------------------------
+    // Constructor
+    // -----------------------------------------------------------------------
+    /**
+     * Creates a processor with the given boilerplate frequency threshold.
+     *
+     * @param threshold fraction [0,1] of sample files a line must appear in
+     *                  to be added to the blocklist (e.g. 0.50 for 50 %).
+     * @throws IllegalArgumentException if {@code threshold} is outside [0, 1]
+     *                                  or is NaN
+     */
+    public FileCleanProcessor(double threshold) {
+        // Negated range test: every comparison with NaN is false, so the
+        // original "< 0 || > 1" form silently accepted NaN.
+        if (!(threshold >= 0 && threshold <= 1))
+            throw new IllegalArgumentException("Threshold must be in [0, 1].");
+        this.threshold = threshold;
+    }
+    // -----------------------------------------------------------------------
+    // Phase 1 — Learn from sample
+    // -----------------------------------------------------------------------
+    /**
+     * Scans all .txt files in {@code sampleDir}, counts how many files each
+     * trimmed non-empty line appears in, and populates {@link #commonLines}
+     * with those meeting the threshold.
+     *
+     * @param sampleDir directory containing representative sample .txt files
+     */
+    public void learnFromSample(String sampleDir) {
+        try {
+            FileHandler fh = new FileHandler();
+            List<File> sampleFiles = new ArrayList<>();
+            for (File f : fh.getFileListing(new File(sampleDir))) {
+                if (f.isFile() && f.getName().endsWith(".txt"))
+                    sampleFiles.add(f);
+            }
+            int total = sampleFiles.size();
+            if (total == 0) {
+                System.err.println("[LearnPhase] No .txt files found in: " + sampleDir);
+                return;
+            }
+            System.out.println("[LearnPhase] Scanning " + total + " sample files...");
+            // For each file, collect the *distinct* lines it contains so a
+            // repeated line inside one document only counts once.
+            Map<String, Integer> fileCount = new HashMap<>();
+            for (File f : sampleFiles) {
+                Set<String> seenInFile = new HashSet<>();
+                Scanner s = new Scanner(f, "UTF-8");
+                while (s.hasNextLine()) {
+                    String line = s.nextLine().trim();
+                    if (line.length() < MIN_LINE_LENGTH) continue;
+                    if (seenInFile.add(line)) {                    // first occurrence in this file
+                        fileCount.merge(line, 1, Integer::sum);
+                    }
+                }
+                s.close();
+            }
+            // Apply threshold
+            commonLines.clear();
+            lineFrequency.clear();
+            double cutoff = threshold * total;
+            for (Map.Entry<String, Integer> entry : fileCount.entrySet()) {
+                lineFrequency.put(entry.getKey(), entry.getValue());
+                if (entry.getValue() >= cutoff) {
+                    commonLines.add(entry.getKey());
+                }
+            }
+            System.out.println("[LearnPhase] Common lines identified: " + commonLines.size()
+                    + "  (threshold=" + (int)(threshold * 100) + "%, files=" + total + ")");
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+    /**
+     * Replaces the learned common-lines set with a pre-built one.
+     * Useful when loading a previously saved blocklist.
+     *
+     * The argument is copied before the current set is cleared, so it is safe
+     * to pass a view of this processor's own set (for example the result of
+     * {@link #getCommonLines()}); clearing first would empty that view and
+     * leave nothing to add.
+     *
+     * @param lines set of exact (trimmed) line strings to treat as boilerplate
+     */
+    public void setCommonLines(Set<String> lines) {
+        Set<String> snapshot = new HashSet<>(lines);
+        commonLines.clear();
+        commonLines.addAll(snapshot);
+    }
+    // -----------------------------------------------------------------------
+    // Blocklist persistence
+    // -----------------------------------------------------------------------
+    /**
+     * Saves the learned blocklist to a plain-text file (one line per entry),
+     * preceded by a frequency comment for human review.
+     *
+     * @param outPath destination file path
+     */
+    public void saveBlocklist(String outPath) {
+        try (PrintWriter pw = new PrintWriter(
+                new OutputStreamWriter(new FileOutputStream(outPath), "UTF-8"))) {
+            pw.println("# FileCleanProcessor blocklist");
+            pw.println("# threshold=" + threshold
+                    + "  entries=" + commonLines.size());
+            pw.println("# Format: <frequency TAB line>");
+            pw.println();
+            // Sort by descending frequency for readability
+            lineFrequency.entrySet().stream()
+                .filter(e -> commonLines.contains(e.getKey()))
+                .sorted((a, b) -> b.getValue() - a.getValue())
+                .forEach(e -> pw.println(e.getValue() + "\t" + e.getKey()));
+            System.out.println("[Blocklist] Saved " + commonLines.size()
+                    + " entries to: " + outPath);
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+    /**
+     * Loads a blocklist previously saved by {@link #saveBlocklist}.
+     * Comment lines (starting with #) and blank lines are skipped. Each
+     * remaining line is either {@code <freq>\t<content>} or bare content; the
+     * trimmed content becomes a blocklist entry.
+     *
+     * The file is read completely before the current blocklist is replaced,
+     * so a missing or unreadable file leaves the existing blocklist intact.
+     * Failures are printed and otherwise ignored.
+     *
+     * @param blocklistPath path to the blocklist file
+     */
+    public void loadBlocklist(String blocklistPath) {
+        Set<String> loaded = new HashSet<>();
+        try (Scanner sc = new Scanner(new File(blocklistPath), "UTF-8")) {
+            while (sc.hasNextLine()) {
+                String line = sc.nextLine();
+                if (line.startsWith("#") || line.isBlank()) continue;
+                // Format: "<freq>\t<content>"  or bare "<content>"
+                int tab = line.indexOf('\t');
+                String content = (tab >= 0) ? line.substring(tab + 1) : line;
+                if (!content.isBlank()) loaded.add(content.trim());
+            }
+            // Scanner hides read errors; do not accept a partly read list.
+            if (sc.ioException() != null) throw sc.ioException();
+        } catch (Exception e) {
+            e.printStackTrace();
+            return;
+        }
+        commonLines.clear();
+        commonLines.addAll(loaded);
+        System.out.println("[Blocklist] Loaded " + commonLines.size()
+                + " entries from: " + blocklistPath);
+    }
+    // -----------------------------------------------------------------------
+    // Phase 2 — Clean full directory
+    // -----------------------------------------------------------------------
+    /**
+     * Runs {@link #cleanFile} on every regular file ending in ".txt" returned
+     * by the file listing of {@code dataDir}, and prints per-file and overall
+     * statistics. A warning is printed first when the blocklist is empty, in
+     * which case only the regex patterns take effect.
+     *
+     * @param dataDir    directory containing corpus .txt files to clean
+     * @param keepBackup if true, a *.bak copy of each modified file is kept
+     */
+    public void cleanDirectory(String dataDir, boolean keepBackup) {
+        try {
+            if (commonLines.isEmpty()) {
+                System.out.println("[CleanPhase] Warning: no common lines loaded. "
+                        + "Only regex patterns will be applied.");
+            }
+            FileHandler lister = new FileHandler();
+            int fileTotal = 0;
+            int removedTotal = 0;
+            for (File candidate : lister.getFileListing(new File(dataDir))) {
+                boolean isTextFile = candidate.isFile()
+                        && candidate.getName().endsWith(".txt");
+                if (isTextFile) {
+                    int removedHere = cleanFile(candidate, keepBackup).linesRemoved;
+                    fileTotal += 1;
+                    removedTotal += removedHere;
+                    if (removedHere > 0) {
+                        System.out.println("[CleanPhase] " + candidate.getName()
+                                + " — removed " + removedHere + " lines.");
+                    }
+                }
+            }
+            System.out.println("[CleanPhase] Done. Files processed: " + fileTotal
+                    + "  Total lines removed: " + removedTotal);
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+    /**
+     * Cleans a single file in place: reads it as UTF-8, drops every line for
+     * which {@link #shouldRemove} is true and, if anything was dropped,
+     * rewrites the file with "\n" line endings.
+     *
+     * The file is left untouched when nothing needs removing or when reading
+     * fails, so a partial read can never truncate the original. Failures are
+     * printed and otherwise ignored.
+     *
+     * @param file       the .txt file to clean
+     * @param keepBackup if true, a .bak copy of the original is kept
+     * @return CleanResult holding the number of lines actually removed from
+     *         the file on disk (0 if the file was not rewritten)
+     */
+    public CleanResult cleanFile(File file, boolean keepBackup) {
+        int removed = 0;
+        try {
+            List<String> keptLines = new ArrayList<>();
+            int dropped = 0;
+            try (Scanner sc = new Scanner(file, "UTF-8")) {
+                while (sc.hasNextLine()) {
+                    String line = sc.nextLine();
+                    if (shouldRemove(line)) {
+                        dropped++;
+                    } else {
+                        keptLines.add(line);
+                    }
+                }
+                // Scanner swallows IOExceptions and just stops; overwriting
+                // the file after a partial read would lose data.
+                if (sc.ioException() != null) throw sc.ioException();
+            }
+            if (dropped > 0) {
+                // Backup
+                if (keepBackup) {
+                    File bak = new File(file.getAbsolutePath() + ".bak");
+                    Files.copy(file.toPath(), bak.toPath(),
+                            StandardCopyOption.REPLACE_EXISTING);
+                }
+                // Overwrite
+                try (Writer w = new OutputStreamWriter(
+                        new FileOutputStream(file), "UTF-8")) {
+                    for (String l : keptLines) {
+                        w.write(l + "\n");
+                    }
+                }
+                // Only report removals once the rewrite has succeeded.
+                removed = dropped;
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+        return new CleanResult(file, removed);
+    }
+    // -----------------------------------------------------------------------
+    // Core line decision
+    // -----------------------------------------------------------------------
+    /**
+     * Returns true if the line should be removed.
+     *
+     * A line is removed if:
+     *   1. Its trimmed form is in the learned common-lines blocklist, OR
+     *   2. Any hardcoded boilerplate regex pattern is found anywhere in its
+     *      trimmed form.
+     *
+     * Lines whose trimmed length is below MIN_LINE_LENGTH (including blank
+     * lines) are always kept so that paragraph structure is preserved.
+     * A null line is never removed.
+     *
+     * @param rawLine the original line from the file (not yet trimmed)
+     * @return true when the line is boilerplate and should be dropped
+     */
+    public boolean shouldRemove(String rawLine) {
+        if (rawLine == null) return false;
+        String trimmed = rawLine.trim();
+        // Always keep blank/very-short lines (paragraph separators)
+        if (trimmed.length() < MIN_LINE_LENGTH) return false;
+        // 1. Exact-match blocklist
+        if (commonLines.contains(trimmed)) return true;
+        // 2. Regex boilerplate patterns. find() alone is enough: whenever the
+        //    whole line matches, find() succeeds too, so a separate matches()
+        //    call only built a second matcher for the same answer.
+        for (Pattern p : BOILERPLATE_PATTERNS) {
+            if (p.matcher(trimmed).find()) {
+                return true;
+            }
+        }
+        return false;
+    }
+    // -----------------------------------------------------------------------
+    // Diagnostic helpers
+    // -----------------------------------------------------------------------
+    /**
+     * Returns an unmodifiable view of the learned common-lines set.
+     * It is a view, not a snapshot: later changes to the blocklist show through it.
+     */
+    public Set<String> getCommonLines() {
+        return java.util.Collections.unmodifiableSet(commonLines);
+    }
+    /**
+     * Returns an unmodifiable view (not a copy) of the frequency map
+     * (line → number of sample files); later changes to the map show through it.
+     */
+    public Map<String, Integer> getLineFrequency() {
+        return java.util.Collections.unmodifiableMap(lineFrequency);
+    }
+    /**
+     * Prints a summary of the top {@code n} most-frequent common lines to stdout.
+     * Only blocklist entries that have a recorded sample frequency are listed.
+     *
+     * @param n maximum number of lines to print; values below 0 are treated as 0
+     */
+    public void printTopCommonLines(int n) {
+        System.out.println("--- Top " + n + " common lines (by sample frequency) ---");
+        lineFrequency.entrySet().stream()
+            .filter(e -> commonLines.contains(e.getKey()))
+            // Integer.compare instead of subtraction: no overflow pitfalls.
+            .sorted((a, b) -> Integer.compare(b.getValue(), a.getValue()))
+            // limit() rejects negative sizes with an exception; clamp instead.
+            .limit(Math.max(0, n))
+            .forEach(e -> System.out.printf("  [%4d]  %s%n", e.getValue(), e.getKey()));
+    }
+    // -----------------------------------------------------------------------
+    // Inner result class
+    // -----------------------------------------------------------------------
+    /** Outcome of one {@link #cleanFile} call: the file handled and how many lines were removed. */
+    public static class CleanResult {
+        public final File file;
+        public final int  linesRemoved;
+        public CleanResult(File file, int linesRemoved) {
+            this.linesRemoved = linesRemoved;
+            this.file         = file;
+        }
+    }
+}

java/bg/bas/dcl/LLMs/IfGPTDataset/.BulNCProcessor.java.kate-swp ADDED Viewed

Binary file (348 Bytes). View file

java/bg/bas/dcl/LLMs/IfGPTDataset/.CurlicatProcessor.java.kate-swp ADDED Viewed

Binary file (98 Bytes). View file

java/bg/bas/dcl/LLMs/IfGPTDataset/BaseSourceProcessor.java ADDED Viewed

	@@ -0,0 +1,180 @@

+package bg.bas.dcl.LLMs.IfGPTDataset;
+import java.io.FileWriter;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.LinkedHashSet;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import bg.bas.dcl.general.JSONProcessor;
+import java.io.File;
+/**
+ * Abstract base for all source processors.
+ *
+ * Provides shared utilities:
+ *  - convertJsonToCSV: write a metadata JSONObject to a CSV file
+ *  - estimateTokenCount: simple punctuation-aware token estimator
+ *
+ * Each concrete subclass implements {@link SourceProcessor#process(String, String)}
+ * with source-specific parsing logic.
+ */
+public abstract class BaseSourceProcessor implements SourceProcessor {
+    // -----------------------------------------------------------------------
+    // CSV export
+    // -----------------------------------------------------------------------
+    /**
+     * Convenience overload: parses the metadata JSON file at the given path
+     * and writes its CSV form to {@code metadataJsonPath + "_CSV.csv"}.
+     * Any failure is printed and otherwise ignored.
+     *
+     * @param metadataJsonPath path to the metadata JSON file
+     */
+    public void convertJsonToCSV(String metadataJsonPath) {
+        final String csvPath = metadataJsonPath + "_CSV.csv";
+        try {
+            JSONProcessor reader = new JSONProcessor();
+            File jsonFile = new File(metadataJsonPath);
+            JSONObject parsed = reader.readJSON(jsonFile);
+            convertJsonToCSV(parsed, csvPath);
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+    /**
+     * Writes the "metadata" array inside {@code json} to a UTF-8 CSV file at
+     * {@code outCsvPath}. The header row is the union of all field names
+     * found in the array, in first-seen order; each JSONObject entry becomes
+     * one row, with an empty cell for a missing field. Rows that lack a
+     * header field and entries that are not JSONObjects are reported on stderr.
+     *
+     * The file is written as UTF-8 explicitly (FileWriter would use the
+     * platform default charset and could corrupt Cyrillic metadata), and both
+     * header and data cells are escaped.
+     *
+     * @param json       JSONObject that contains a "metadata" JSONArray
+     * @param outCsvPath destination CSV file path
+     */
+    public void convertJsonToCSV(JSONObject json, String outCsvPath) {
+        try {
+            JSONArray array = (JSONArray) json.get("metadata");
+            if (array == null || array.isEmpty()) {
+                System.err.println("[INCONSISTENCY] 'metadata' array is null or empty in: " + outCsvPath);
+                return;
+            }
+            // Collect all unique field names, preserving first-seen order
+            LinkedHashSet<String> headersSet = new LinkedHashSet<>();
+            for (Object obj : array) {
+                if (obj instanceof JSONObject) {
+                    for (Object key : ((JSONObject) obj).keySet()) {
+                        headersSet.add(String.valueOf(key));
+                    }
+                } else {
+                    System.err.println("[INCONSISTENCY] Non-JSONObject entry found in metadata array.");
+                }
+            }
+            ArrayList<String> headers = new ArrayList<>(headersSet);
+            try (PrintWriter writer = new PrintWriter(new java.io.OutputStreamWriter(
+                    new java.io.FileOutputStream(outCsvPath),
+                    java.nio.charset.StandardCharsets.UTF_8))) {
+                // Header row
+                ArrayList<String> headerCells = new ArrayList<>();
+                for (String header : headers) {
+                    headerCells.add(escapeCsv(header));
+                }
+                writer.println(String.join(",", headerCells));
+                // Data rows
+                for (int i = 0; i < array.size(); i++) {
+                    Object obj = array.get(i);
+                    if (!(obj instanceof JSONObject)) {
+                        System.err.println("[INCONSISTENCY] Row " + i + " is not a JSONObject, skipping.");
+                        continue;
+                    }
+                    JSONObject row = (JSONObject) obj;
+                    // Structural check. A row cannot hold a field that is
+                    // absent from the header set (the set is the union of all
+                    // rows' keys), so only missing fields need reporting.
+                    for (String header : headers) {
+                        if (!row.containsKey(header)) {
+                            System.err.println("[INCONSISTENCY] Row " + i + " missing field: '" + header + "'");
+                        }
+                    }
+                    // Build CSV line with RFC-4180 escaping
+                    ArrayList<String> values = new ArrayList<>();
+                    for (String header : headers) {
+                        Object value = row.get(header);
+                        values.add(value == null ? "" : escapeCsv(value.toString()));
+                    }
+                    writer.println(String.join(",", values));
+                }
+            }
+            System.out.println("CSV written to: " + outCsvPath);
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+    /**
+     * Escapes one CSV cell: a value containing a comma, a double quote or a
+     * line break (CR or LF) is wrapped in double quotes with embedded quotes
+     * doubled; any other value is returned unchanged.
+     */
+    private static String escapeCsv(String field) {
+        boolean needsQuoting = field.contains(",") || field.contains("\"")
+                || field.contains("\n") || field.contains("\r");
+        if (!needsQuoting) {
+            return field;
+        }
+        return "\"" + field.replace("\"", "\"\"") + "\"";
+    }
+    // -----------------------------------------------------------------------
+    // Shared helpers
+    // -----------------------------------------------------------------------
+    /**
+     * Estimates the number of tokens in a sentence as the number of
+     * whitespace-separated words plus the number of punctuation characters
+     * from the set {@code . , ; : ( ) ? ! -}.
+     *
+     * A null, empty or whitespace-only sentence yields 0. Leading, trailing
+     * and repeated whitespace does not inflate the word count (splitting on a
+     * single space counted "" as one word and produced empty "words" for
+     * every extra space).
+     *
+     * @param sentence whitespace-tokenised sentence string
+     * @return estimated token count
+     */
+    protected int estimateTokenCount(String sentence) {
+        if (sentence == null) {
+            return 0;
+        }
+        String trimmed = sentence.trim();
+        if (trimmed.isEmpty()) {
+            return 0;
+        }
+        int wordCount = trimmed.split("\\s+").length;
+        // Count punctuation with a plain scan; same character set as before,
+        // without building a regex and a copy of the string on every call.
+        int punctCount = 0;
+        for (int i = 0; i < trimmed.length(); i++) {
+            if (".,;:()?!-".indexOf(trimmed.charAt(i)) >= 0) {
+                punctCount++;
+            }
+        }
+        return wordCount + punctCount;
+    }
+    /**
+     * Creates a JSONObject pre-populated with the metadata fields that are
+     * common to every source (counts start at 0).
+     *
+     * @param identifier unique document identifier
+     * @return partially initialised JSONObject
+     */
+    @SuppressWarnings("unchecked")
+    protected JSONObject newBaseDescriptor(String identifier) {
+        JSONObject fdescr = new JSONObject();
+        fdescr.put("Identifier",                  identifier);
+        fdescr.put("Licence",                     "");
+        fdescr.put("LicenceLink",                 "");
+        fdescr.put("PublicationDate",             "");
+        fdescr.put("DocumentTitle",               "");
+        fdescr.put("Source",                      "");
+        fdescr.put("Author",                      "");
+        fdescr.put("Style",                       "");
+        fdescr.put("Type",                        "");
+        fdescr.put("Subdomain",                   "");
+        fdescr.put("TranslatedDocument",          "");
+        fdescr.put("CollectionDate",              "");
+        fdescr.put("Medium",                      "text");
+        fdescr.put("Url",                         "");
+        fdescr.put("Domain",                      "");
+        fdescr.put("Keywords",                    "");
+        fdescr.put("PersonallyIdentifiableInformation", "");
+        fdescr.put("BiasedInformation",           "");
+        fdescr.put("TaskCategories",              "");
+        fdescr.put("NumberWords",                 0);
+        fdescr.put("NumberSentences",             0);
+        fdescr.put("NumberParagraphs",            0);
+        fdescr.put("NumberTokens",                0);
+        return fdescr;
+    }
+}

java/bg/bas/dcl/LLMs/IfGPTDataset/BulNCProcessor.java ADDED Viewed

	@@ -0,0 +1,188 @@

+package bg.bas.dcl.LLMs.IfGPTDataset;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.Scanner;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import bg.bas.dcl.monolingual.bg.TextProcessor;
+/**
+ * Processes the Bulgarian National Corpus (BulNC) — general subcorpora.
+ *
+ * Unlike MARCELL/CURLICAT, BulNC metadata is supplied via an external
+ * tab-separated description file (BulNC-description.txt) rather than
+ * inline CoNLL-UP comments.  Plain-text source files are read directly.
+ *
+ * Subcorpora included (controlled by {@link #isIncluded}):
+ *   currently only C-MassMedia; A-Administrative, B-Science and D-Fiction
+ *   can be enabled by editing the method
+ *
+ * SETimes articles are excluded regardless of subcorpus.
+ *
+ * Licence rules:
+ *   A-Administrative → CC0
+ *   B-Science        → Restricted
+ *   C-MassMedia      → Restricted
+ *   D-Fiction        → Restricted
+ *
+ * Description file column indices (0-based):
+ *   0  filename stem  |  1  relative path  |  2  collection date
+ *   4  author         |  8  title          |  9  publication date
+ *   12 url            |  13 translated     |  17 type
+ *   19 domain         |  21 subdomain (optional)
+ */
+public class BulNCProcessor extends BaseSourceProcessor {
+    private static final String CC0_LICENCE      = "CC0";
+    private static final String CC0_LICENCE_LINK =
+            "https://creativecommons.org/public-domain/cc0/";
+    private static final String RESTRICTED = "Restricted";
+    private final String metaFilePath; // path to BulNC-description.txt (tab-separated, read as UTF-8 by process())
+    private final TextProcessor tp = new TextProcessor();
+    /**
+     * Creates a processor that reads its document metadata from the given
+     * description file. The file is not opened until {@link #process} runs.
+     *
+     * @param metaFilePath absolute path to BulNC-description.txt
+     */
+    public BulNCProcessor(String metaFilePath) {
+        this.metaFilePath = metaFilePath;
+    }
+    /**
+     * @param indir  root directory of the BulNC corpus
+     * @param outdir output directory for .txt files and metadata
+     */
+    @Override
+    public void process(String indir, String outdir) {
+        try {
+            JSONObject json = new JSONObject();
+            JSONArray descrArray = new JSONArray();
+            Scanner sme = new Scanner(new File(metaFilePath), "UTF-8");
+            while (sme.hasNextLine()) {
+                String[] dat = sme.nextLine().split("\t");
+                String relativePath = dat[1];
+                System.out.println("Checking: " + relativePath);
+                // --- Subcorpus filter ---
+                if (!isIncluded(relativePath)) continue;
+                // --- SETimes exclusion ---
+                if (dat[12].contains("setimes")) continue;
+                String fname = indir + relativePath;
+                File f = new File(fname);
+                if (!f.exists()) {
+                    System.err.println("[MISSING] " + fname);
+                    continue;
+                }
+                String tfname = "bg_bnc_" + dat[0];
+                JSONObject fdescr = newBaseDescriptor(tfname);
+                applyLicence(fdescr, relativePath);
+                fdescr.put("PublicationDate",    dat[9].replaceAll("\\.", "-"));
+                fdescr.put("DocumentTitle",      dat[8]);
+                fdescr.put("Author",             dat[4]);
+                fdescr.put("Style",              "Administrative");
+                fdescr.put("Type",               dat[17]);
+                fdescr.put("Subdomain",          dat.length > 21 ? dat[21] : "");
+                fdescr.put("TranslatedDocument", dat[13]);
+                fdescr.put("CollectionDate",     dat[2]);
+                fdescr.put("Url",                dat[12]);
+                fdescr.put("Domain",             dat[19]);
+                Writer out = new OutputStreamWriter(
+                        new FileOutputStream(outdir + tfname + ".txt"), "UTF-8");
+                Scanner s = new Scanner(f, "UTF-8");
+                int nw = 0, ns = 0, np = 0, nt = 0;
+                while (s.hasNextLine()) {
+                    String text = s.nextLine();
+                    np++;
+                    out.write(text + "\n");
+                    out.flush();
+                    for (String sent : tp.splitToSentences(text)) {
+                        ns++;
+                        String[] words = sent.split(" ");
+                        nw += words.length;
+                        nt += estimateTokenCount(sent);
+                    }
+                }
+                s.close();
+                out.flush();
+                out.close();
+                fdescr.put("NumberWords",      nw);
+                fdescr.put("NumberSentences",  ns);
+                fdescr.put("NumberParagraphs", np);
+                fdescr.put("NumberTokens",     nt);
+                descrArray.add(fdescr);
+            }
+            sme.close();
+            json.put("metadata", descrArray);
+            System.out.println("Total documents processed: " + descrArray.size());
+            writeMetadata(json, outdir, "metadata_BNC_mm.json");
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+    // -----------------------------------------------------------------------
+    // Helpers
+    // -----------------------------------------------------------------------
+    /**
+     * Returns true for subcorpora that should be processed.
+     * Edit this method to change the filter.
+     */
+    protected boolean isIncluded(String relativePath) {
+        return relativePath.contains("C-MassMedia/");
+        // Uncomment to add more subcorpora:
+        // || relativePath.contains("A-Administrative/")
+        // || relativePath.contains("B-Science/")
+        // || relativePath.contains("D-Fiction/")
+    }
+    /**
+     * Stores the licence fields on a document descriptor according to the
+     * subcorpus named in the document's relative path: paths under
+     * "B-Science/", "C-MassMedia/" or "D-Fiction/" get the restricted licence
+     * with an empty link, everything else gets CC0 with its link.
+     *
+     * @param fdescr       descriptor receiving "Licence" and "LicenceLink"
+     * @param relativePath document path used to pick the licence
+     */
+    @SuppressWarnings("unchecked")
+    private void applyLicence(JSONObject fdescr, String relativePath) {
+        boolean restricted = false;
+        for (String marker : new String[] {"B-Science/", "C-MassMedia/", "D-Fiction/"}) {
+            if (relativePath.contains(marker)) {
+                restricted = true;
+                break;
+            }
+        }
+        fdescr.put("Licence",     restricted ? RESTRICTED : CC0_LICENCE);
+        fdescr.put("LicenceLink", restricted ? "" : CC0_LICENCE_LINK);
+    }
+    /**
+     * Writes the metadata object as UTF-8 JSON to {@code outdir + filename},
+     * then passes the same object to {@code convertJsonToCSV} with the target
+     * path {@code <json path>_CSV.csv}.
+     *
+     * @param json     metadata object to serialise
+     * @param outdir   output directory prefix; concatenated as-is, so it must
+     *                 already end with a path separator
+     * @param filename name of the JSON file to create
+     * @throws Exception if the file cannot be written or the CSV step fails
+     */
+    @SuppressWarnings("unchecked")
+    private void writeMetadata(JSONObject json, String outdir, String filename)
+            throws Exception {
+        String outMetaPath = outdir + filename;
+        // try-with-resources flushes and closes the writer even when
+        // serialisation throws, so the file handle is never leaked.
+        try (Writer outMeta = new OutputStreamWriter(
+                new FileOutputStream(outMetaPath), "UTF-8")) {
+            json.writeJSONString(outMeta);
+        }
+        convertJsonToCSV(json, outMetaPath + "_CSV.csv");
+        System.out.println("Metadata written to: " + outMetaPath);
+    }
+}

java/bg/bas/dcl/LLMs/IfGPTDataset/BulNCWikiProcessor.java ADDED Viewed

	@@ -0,0 +1,154 @@

+package bg.bas.dcl.LLMs.IfGPTDataset;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.Scanner;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import bg.bas.dcl.general.JSONProcessor;
+import bg.bas.dcl.monolingual.bg.TextProcessor;
+/**
+ * Processes the BulNC "F-InformalFiction" (Wiki/Informal) subcorpus.
+ *
+ */
+public class BulNCWikiProcessor extends BaseSourceProcessor {
+    private static final String CC0_LICENCE      = "CC0";
+    private static final String CC0_LICENCE_LINK =
+            "https://creativecommons.org/public-domain/cc0/";
+    private final String metaFilePath;
+    private final String existingMetaJson; // may be null
+    private final TextProcessor tp = new TextProcessor();
+    public BulNCWikiProcessor(String metaFilePath, String existingMetaJson) {
+        this.metaFilePath     = metaFilePath;
+        this.existingMetaJson = existingMetaJson;
+    }
+    /**
+     */
+    @Override
+    public void process(String indir, String outdir) {
+        try {
+            // Load existing metadata if provided, otherwise start fresh
+            JSONObject json;
+            JSONArray descrArray;
+            if (existingMetaJson != null && new File(existingMetaJson).exists()) {
+                JSONProcessor jp = new JSONProcessor();
+                json = jp.readJSON(new File(existingMetaJson));
+                descrArray = (JSONArray) json.get("metadata");
+                System.out.println("Loaded existing metadata with "
+                        + descrArray.size() + " entries.");
+            } else {
+                json = new JSONObject();
+                descrArray = new JSONArray();
+                json.put("metadata", descrArray);
+            }
+            int newDocs = 0;
+            long totalTokens = 0;
+            Scanner sme = new Scanner(new File(metaFilePath), "UTF-8");
+            while (sme.hasNextLine()) {
+                String[] dat = sme.nextLine().split("\t");
+                String relativePath = dat[1];
+                System.out.println("Checking: " + relativePath);
+                if (!relativePath.contains("F-InformalFiction")) continue;
+                String fname = indir + relativePath;
+                File f = new File(fname);
+                if (!f.exists()) {
+                    System.err.println("[MISSING] " + fname);
+                    continue;
+                }
+                String tfname = "bg_bnc_" + dat[0];
+                JSONObject fdescr = newBaseDescriptor(tfname);
+                fdescr.put("Licence",            CC0_LICENCE);
+                fdescr.put("LicenceLink",        CC0_LICENCE_LINK);
+                fdescr.put("PublicationDate",    dat[9].replaceAll("\\.", "-"));
+                fdescr.put("DocumentTitle",      dat[8]);
+                fdescr.put("Author",             dat[4]);
+                fdescr.put("Style",              "Administrative");
+                fdescr.put("Type",               dat[17]);
+                fdescr.put("Subdomain",          dat.length > 21 ? dat[21] : "");
+                fdescr.put("TranslatedDocument", dat[13]);
+                fdescr.put("CollectionDate",     dat[2]);
+                fdescr.put("Url",                dat[12]);
+                fdescr.put("Domain",             dat[19]);
+                Writer out = new OutputStreamWriter(
+                        new FileOutputStream(outdir + tfname + ".txt"), "UTF-8");
+                Scanner s = new Scanner(f, "UTF-8");
+                int nw = 0, ns = 0, np = 0, nt = 0;
+                while (s.hasNextLine()) {
+                    String text = s.nextLine();
+                    np++;
+                    out.write(text + "\n");
+                    out.flush();
+                    for (String sent : tp.splitToSentences(text)) {
+                        ns++;
+                        String[] words = sent.split(" ");
+                        nw += words.length;
+                        nt += estimateTokenCount(sent);
+                    }
+                }
+                s.close();
+                out.flush();
+                out.close();
+                fdescr.put("NumberWords",      nw);
+                fdescr.put("NumberSentences",  ns);
+                fdescr.put("NumberParagraphs", np);
+                fdescr.put("NumberTokens",     nt);
+                descrArray.add(fdescr);
+                newDocs++;
+                totalTokens += nt;
+            }
+            sme.close();
+            System.out.println("New F-InformalFiction documents added: " + newDocs);
+            System.out.println("Total tokens in new documents: " + totalTokens);
+            System.out.println("Merged metadata total entries: " + descrArray.size());
+            writeMetadata(json, outdir, "metadata.json");
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+    // -----------------------------------------------------------------------
+    @SuppressWarnings("unchecked")
+    private void writeMetadata(JSONObject json, String outdir, String filename)
+            throws Exception {
+        String outMetaPath = outdir + filename;
+        Writer outMeta = new OutputStreamWriter(
+                new FileOutputStream(outMetaPath), "UTF-8");
+        json.writeJSONString(outMeta);
+        outMeta.flush();
+        outMeta.close();
+        System.out.println("Merged metadata written to: " + outMetaPath);
+    }
+}

java/bg/bas/dcl/LLMs/IfGPTDataset/CurlicatProcessor.java ADDED Viewed

	@@ -0,0 +1,160 @@

+package bg.bas.dcl.LLMs.IfGPTDataset;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.Scanner;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import bg.bas.dcl.general.FileHandler;
+/**
+ * Processes the CURLICAT Bulgarian corpus.
+ *
+ * Input:  CoNLL-UP files (.conllup) with richer inline metadata than MARCELL.
+ * Output: One plain-text .txt per document + metadata.json + metadata CSV.
+ *
+ * Metadata comment prefixes recognised:
+ *   # PublicationDate =  → PublicationDate
+ *   # DocumentTitle =    → DocumentTitle
+ *   # Author =           → Author
+ *   # DocumentType =     → Type
+ *   # Url =              → Url
+ *   # Style =            → Style
+ *   # Domain =           → Domain
+ *   # Subdomain =        → Subdomain
+ *   # CollectionDate =   → CollectionDate
+ *   # License =          → Licence  (overrides default if present)
+ *
+ * Default licence: CC-BY-SA-4.0.
+ */
+public class CurlicatProcessor extends BaseSourceProcessor {
+    private static final String DEFAULT_LICENCE      = "CC-BY-SA-4.0";
+    private static final String DEFAULT_LICENCE_LINK =
+            "https://elrc-share.eu/static/metashare/licences/CC-BY-SA-4.0.pdf";
+    private static final String PREFIX = "bg_CURLICAT_";
+    private static final String EXT    = ".conllup";
+    @Override
+    public void process(String indir, String outdir) {
+        try {
+            FileHandler fh = new FileHandler();
+            JSONObject json = new JSONObject();
+            JSONArray descrArray = new JSONArray();
+            for (File f : fh.getFileListing(new File(indir))) {
+                if (!f.isFile()) continue;
+                System.out.println("Processing: " + f.getAbsolutePath());
+                String tfname = PREFIX + f.getName().replace(EXT, "");
+                JSONObject fdescr = newBaseDescriptor(tfname);
+                fdescr.put("Licence",     DEFAULT_LICENCE);
+                fdescr.put("LicenceLink", DEFAULT_LICENCE_LINK);
+                Writer out = new OutputStreamWriter(
+                        new FileOutputStream(outdir + tfname + ".txt"), "UTF-8");
+                Scanner s = new Scanner(f, "UTF-8");
+                int nw = 0, ns = 0, np = 0, nt = 0;
+                while (s.hasNextLine()) {
+                    String line = s.nextLine();
+                    // --- Metadata extraction ---
+                    if (line.startsWith("# PublicationDate =")) {
+                        fdescr.put("PublicationDate",
+                                line.replace("# PublicationDate =", "").trim());
+                    } else if (line.startsWith("# DocumentTitle =")) {
+                        fdescr.put("DocumentTitle",
+                                line.replace("# DocumentTitle =", "").trim());
+                    } else if (line.startsWith("# Author =")) {
+                        fdescr.put("Author",
+                                line.replace("# Author =", "").trim());
+                    } else if (line.startsWith("# DocumentType =")) {
+                        fdescr.put("Type",
+                                line.replace("# DocumentType =", "").trim());
+                    } else if (line.startsWith("# Url =")) {
+                        fdescr.put("Url",
+                                line.replace("# Url =", "").trim());
+                    } else if (line.startsWith("# Style =")) {
+                        fdescr.put("Style",
+                                line.replace("# Style =", "").trim());
+                    } else if (line.startsWith("# Domain =")) {
+                        fdescr.put("Domain",
+                                line.replace("# Domain =", "").trim());
+                    } else if (line.startsWith("# Subdomain =")) {
+                        fdescr.put("Subdomain",
+                                line.replace("# Subdomain =", "").trim());
+                    } else if (line.startsWith("# CollectionDate =")) {
+                        fdescr.put("CollectionDate",
+                                line.replace("# CollectionDate =", "").trim());
+                    } else if (line.startsWith("# License =")) {
+                        // Override default licence if the file declares one
+                        fdescr.put("Licence",
+                                line.replace("# License =", "").trim());
+                    }
+                    // --- Structure counting ---
+                    else if (line.startsWith("# sent_id =")) {
+                        ns++;
+                    } else if (line.startsWith("# newpar id =")) {
+                        np++;
+                        out.write("\n");
+                    }
+                    // --- Text output ---
+                    else if (line.startsWith("# text =")) {
+                        out.write(line.replace("# text =", "").trim() + "\n");
+                        out.flush();
+                    } else {
+                        // CoNLL-UP token line
+                        String[] cols = line.split("\t");
+                        if (cols.length > 5) {
+                            nt++;
+                            if (!cols[3].equals("PUNCT")) nw++;
+                        }
+                    }
+                }
+                s.close();
+                out.flush();
+                out.close();
+                fdescr.put("NumberWords",      nw);
+                fdescr.put("NumberSentences",  ns);
+                fdescr.put("NumberParagraphs", np);
+                fdescr.put("NumberTokens",     nt);
+                descrArray.add(fdescr);
+            }
+            json.put("metadata", descrArray);
+            writeMetadata(json, outdir, "metadata_CC.json");
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+    // -----------------------------------------------------------------------
+    @SuppressWarnings("unchecked")
+    private void writeMetadata(JSONObject json, String outdir, String filename)
+            throws Exception {
+        String outMetaPath = outdir + filename;
+        Writer outMeta = new OutputStreamWriter(
+                new FileOutputStream(outMetaPath), "UTF-8");
+        json.writeJSONString(outMeta);
+        outMeta.flush();
+        outMeta.close();
+        convertJsonToCSV(json, outMetaPath + "_CSV.csv");
+        System.out.println("Metadata written to: " + outMetaPath);
+    }
+}

java/bg/bas/dcl/LLMs/IfGPTDataset/DocumentMetadata.java ADDED Viewed

	@@ -0,0 +1,376 @@

+package bg.bas.dcl.LLMs.IfGPTDataset;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+/**
+ * DocumentMetadata
+ *
+ * Canonical in-memory representation of the ifGPT dataset metadata schema.
+ * String fields are never null (null input is stored as "") and list fields
+ * are always private mutable copies, so the add methods and the blank checks
+ * in this class are safe regardless of what callers pass in.
+ */
+@SuppressWarnings("unchecked")
+public class DocumentMetadata {
+    // -----------------------------------------------------------------------
+    // ── MANDATORY (15) ──────────────────────────────────────────────────────
+    // -----------------------------------------------------------------------
+    /** Unique document identifier with the language prefix "bg". */
+    private String       identifier        = "";
+    /** Licence name (open, restricted, …). */
+    private String       licence           = "";
+    /** Publication date yyyy-mm-dd. */
+    private String       publicationDate   = "";
+    /** Title of the document. */
+    private String       documentTitle     = "";
+    /** Publishing organisation / media outlet / institutional originator. */
+    private String       source            = "";
+    /** Modality: "textual" | "multimodal". */
+    private String       medium            = "textual";
+    /** Original web address. */
+    private String       url               = "";
+    /** Up to six subject-area labels from a controlled vocabulary. */
+    private List<String> domain            = new ArrayList<>();
+    /** Up to six free-text keywords. */
+    private List<String> keywords          = new ArrayList<>();
+    /** Total word count (non-punctuation tokens). */
+    private int          numberWords       = 0;
+    /** Total sentence count. */
+    private int          numberSentences   = 0;
+    /** Total paragraph count. */
+    private int          numberParagraphs  = 0;
+    /** Total token count (words + punctuation). */
+    private int          numberTokens      = 0;
+    /**
+     * Per-sentence PII coverage vector.
+     * Entry i = proportion of tokens in sentence i flagged as PII ∈ [0,1].
+     * Length == numberSentences after pipeline completion.
+     */
+    private List<Double> piiVector         = new ArrayList<>();
+    /**
+     * Per-sentence bias coverage vector.
+     * Entry i = proportion of tokens in sentence i flagged as biased ∈ [0,1].
+     * Length == numberSentences after pipeline completion.
+     */
+    private List<Double> biasVector        = new ArrayList<>();
+    // -----------------------------------------------------------------------
+    // ── OPTIONAL (8) ────────────────────────────────────────────────────────
+    // -----------------------------------------------------------------------
+    /** Name(s) of the author(s). */
+    private List<String> author            = new ArrayList<>();
+    /** Stylistic register: legal | journalistic | administrative | … */
+    private String       style             = "";
+    /** Document genre: book | document | article | … */
+    private String       type              = "";
+    /** Narrower thematic classification, hierarchically linked to Domain. */
+    private List<String> subdomain         = new ArrayList<>();
+    /** true = translation, false = original Bulgarian text. */
+    private Boolean      translatedDocument = null;  // null = unknown
+    /** Date of acquisition yyyy-mm-dd. */
+    private String       collectionDate    = "";
+    /** URL of the licence text. */
+    private String       licenceLink       = "";
+    /** Anticipated NLP applications from a predefined list. */
+    private List<String> taskCategories    = new ArrayList<>();
+    // -----------------------------------------------------------------------
+    // Constructor
+    // -----------------------------------------------------------------------
+    /** Creates an empty record with default values. */
+    public DocumentMetadata() {}
+    /** Creates a record carrying only the given identifier (null is stored as ""). */
+    public DocumentMetadata(String identifier) {
+        this.identifier = nz(identifier);
+    }
+    // -----------------------------------------------------------------------
+    // Fluent setters — mandatory
+    // String setters store "" for null; list setters store a private copy.
+    // -----------------------------------------------------------------------
+    public DocumentMetadata setIdentifier(String v)       { identifier       = nz(v); return this; }
+    public DocumentMetadata setLicence(String v)          { licence          = nz(v); return this; }
+    public DocumentMetadata setPublicationDate(String v)  { publicationDate  = nz(v); return this; }
+    public DocumentMetadata setDocumentTitle(String v)    { documentTitle    = nz(v); return this; }
+    public DocumentMetadata setSource(String v)           { source           = nz(v); return this; }
+    public DocumentMetadata setMedium(String v)           { medium           = nz(v); return this; }
+    public DocumentMetadata setUrl(String v)              { url              = nz(v); return this; }
+    public DocumentMetadata setDomain(List<String> v)     { domain           = mutableCopy(v); return this; }
+    public DocumentMetadata addDomain(String v)           { if (v != null) domain.add(v); return this; }
+    public DocumentMetadata setKeywords(List<String> v)   { keywords         = mutableCopy(v); return this; }
+    public DocumentMetadata addKeyword(String v)          { if (v != null) keywords.add(v); return this; }
+    public DocumentMetadata setNumberWords(int v)         { numberWords      = v; return this; }
+    public DocumentMetadata setNumberSentences(int v)     { numberSentences  = v; return this; }
+    public DocumentMetadata setNumberParagraphs(int v)    { numberParagraphs = v; return this; }
+    public DocumentMetadata setNumberTokens(int v)        { numberTokens     = v; return this; }
+    public DocumentMetadata setPiiVector(List<Double> v)  { piiVector        = mutableCopy(v); return this; }
+    public DocumentMetadata setBiasVector(List<Double> v) { biasVector       = mutableCopy(v); return this; }
+    // Fluent setters — optional
+    public DocumentMetadata setAuthor(List<String> v)          { author            = mutableCopy(v); return this; }
+    public DocumentMetadata addAuthor(String v)                { if (v != null) author.add(v); return this; }
+    public DocumentMetadata setStyle(String v)                 { style             = nz(v); return this; }
+    public DocumentMetadata setType(String v)                  { type              = nz(v); return this; }
+    public DocumentMetadata setSubdomain(List<String> v)       { subdomain         = mutableCopy(v); return this; }
+    public DocumentMetadata addSubdomain(String v)             { if (v != null) subdomain.add(v); return this; }
+    public DocumentMetadata setTranslatedDocument(Boolean v)   { translatedDocument= v; return this; }
+    public DocumentMetadata setCollectionDate(String v)        { collectionDate    = nz(v); return this; }
+    public DocumentMetadata setLicenceLink(String v)           { licenceLink       = nz(v); return this; }
+    public DocumentMetadata setTaskCategories(List<String> v)  { taskCategories    = mutableCopy(v); return this; }
+    public DocumentMetadata addTaskCategory(String v)          { if (v != null) taskCategories.add(v); return this; }
+    // -----------------------------------------------------------------------
+    // Getters (list getters return read-only views of the internal lists)
+    // -----------------------------------------------------------------------
+    public String       getIdentifier()        { return identifier; }
+    public String       getLicence()           { return licence; }
+    public String       getPublicationDate()   { return publicationDate; }
+    public String       getDocumentTitle()     { return documentTitle; }
+    public String       getSource()            { return source; }
+    public String       getMedium()            { return medium; }
+    public String       getUrl()               { return url; }
+    public List<String> getDomain()            { return Collections.unmodifiableList(domain); }
+    public List<String> getKeywords()          { return Collections.unmodifiableList(keywords); }
+    public int          getNumberWords()       { return numberWords; }
+    public int          getNumberSentences()   { return numberSentences; }
+    public int          getNumberParagraphs()  { return numberParagraphs; }
+    public int          getNumberTokens()      { return numberTokens; }
+    public List<Double> getPiiVector()         { return Collections.unmodifiableList(piiVector); }
+    public List<Double> getBiasVector()        { return Collections.unmodifiableList(biasVector); }
+    public List<String> getAuthor()            { return Collections.unmodifiableList(author); }
+    public String       getStyle()             { return style; }
+    public String       getType()              { return type; }
+    public List<String> getSubdomain()         { return Collections.unmodifiableList(subdomain); }
+    public Boolean      getTranslatedDocument(){ return translatedDocument; }
+    public String       getCollectionDate()    { return collectionDate; }
+    public String       getLicenceLink()       { return licenceLink; }
+    public List<String> getTaskCategories()    { return Collections.unmodifiableList(taskCategories); }
+    // -----------------------------------------------------------------------
+    // Validation
+    // -----------------------------------------------------------------------
+    /**
+     * Returns a list of missing mandatory fields.
+     * An empty list means the record is complete.
+     * Only Identifier, Licence, Medium and the four counts are checked; a
+     * count of zero is reported as missing.
+     */
+    public List<String> missingMandatoryFields() {
+        List<String> missing = new ArrayList<>();
+        if (identifier.isBlank())       missing.add("Identifier");
+        if (licence.isBlank())          missing.add("Licence");
+        if (medium.isBlank())           missing.add("Medium");
+        if (numberWords == 0)           missing.add("NumberWords");
+        if (numberSentences == 0)       missing.add("NumberSentences");
+        if (numberParagraphs == 0)      missing.add("NumberParagraphs");
+        if (numberTokens == 0)          missing.add("NumberTokens");
+        // piiVector and biasVector may legitimately be empty for clean docs
+        return missing;
+    }
+    // -----------------------------------------------------------------------
+    // JSON serialisation  (json-simple)
+    // -----------------------------------------------------------------------
+    /**
+     * Serialises this record to a json-simple JSONObject.
+     * An unknown TranslatedDocument (null) is written as the empty string,
+     * otherwise as "true" or "false".
+     */
+    public JSONObject toJson() {
+        JSONObject o = new JSONObject();
+        // Mandatory
+        o.put("Identifier",                       identifier);
+        o.put("Licence",                          licence);
+        o.put("PublicationDate",                  publicationDate);
+        o.put("DocumentTitle",                    documentTitle);
+        o.put("Source",                           source);
+        o.put("Medium",                           medium);
+        o.put("Url",                              url);
+        o.put("Domain",                           toJsonArray(domain));
+        o.put("Keywords",                         toJsonArray(keywords));
+        o.put("NumberWords",                      numberWords);
+        o.put("NumberSentences",                  numberSentences);
+        o.put("NumberParagraphs",                 numberParagraphs);
+        o.put("NumberTokens",                     numberTokens);
+        o.put("PersonallyIdentifiableInformation",toJsonArray(piiVector));
+        o.put("BiasedInformation",                toJsonArray(biasVector));
+        // Optional
+        o.put("Author",            toJsonArray(author));
+        o.put("Style",             style);
+        o.put("Type",              type);
+        o.put("Subdomain",         toJsonArray(subdomain));
+        o.put("TranslatedDocument",
+              translatedDocument == null ? "" : translatedDocument.toString());
+        o.put("CollectionDate",    collectionDate);
+        o.put("LicenceLink",       licenceLink);
+        o.put("TaskCategories",    toJsonArray(taskCategories));
+        return o;
+    }
+    /**
+     * Populates a DocumentMetadata from a json-simple JSONObject previously
+     * produced by {@link #toJson()}. Absent keys yield "", 0 or an empty
+     * list; a blank TranslatedDocument yields null (unknown).
+     */
+    public static DocumentMetadata fromJson(JSONObject o) {
+        DocumentMetadata m = new DocumentMetadata();
+        m.identifier        = str(o, "Identifier");
+        m.licence           = str(o, "Licence");
+        m.publicationDate   = str(o, "PublicationDate");
+        m.documentTitle     = str(o, "DocumentTitle");
+        m.source            = str(o, "Source");
+        m.medium            = str(o, "Medium");
+        m.url               = str(o, "Url");
+        m.domain            = strList(o, "Domain");
+        m.keywords          = strList(o, "Keywords");
+        m.numberWords       = intVal(o, "NumberWords");
+        m.numberSentences   = intVal(o, "NumberSentences");
+        m.numberParagraphs  = intVal(o, "NumberParagraphs");
+        m.numberTokens      = intVal(o, "NumberTokens");
+        m.piiVector         = doubleList(o, "PersonallyIdentifiableInformation");
+        m.biasVector        = doubleList(o, "BiasedInformation");
+        m.author            = strList(o, "Author");
+        m.style             = str(o, "Style");
+        m.type              = str(o, "Type");
+        m.subdomain         = strList(o, "Subdomain");
+        String td           = str(o, "TranslatedDocument");
+        m.translatedDocument= td.isBlank() ? null : Boolean.parseBoolean(td);
+        m.collectionDate    = str(o, "CollectionDate");
+        m.licenceLink       = str(o, "LicenceLink");
+        m.taskCategories    = strList(o, "TaskCategories");
+        return m;
+    }
+    // -----------------------------------------------------------------------
+    // Interop with legacy JSONObject format (used by source processors)
+    // -----------------------------------------------------------------------
+    /**
+     * Merges fields from a legacy source-processor JSONObject (the format
+     * produced by MarcellProcessor, BulNCProcessor, etc.) into this record.
+     * Fields already set on {@code this} are NOT overwritten: a string field
+     * counts as set when non-blank, a list when non-empty, a count when
+     * non-zero, and TranslatedDocument when non-null.
+     */
+    public void mergeLegacy(JSONObject legacy) {
+        if (identifier.isBlank())      setIdentifier(str(legacy, "Identifier"));
+        if (licence.isBlank())         setLicence(str(legacy, "Licence"));
+        if (licenceLink.isBlank())     setLicenceLink(str(legacy, "LicenceLink"));
+        if (publicationDate.isBlank()) setPublicationDate(str(legacy, "PublicationDate"));
+        if (documentTitle.isBlank())   setDocumentTitle(str(legacy, "DocumentTitle"));
+        if (source.isBlank())          setSource(str(legacy, "Source"));
+        if (url.isBlank())             setUrl(str(legacy, "Url"));
+        if (style.isBlank())           setStyle(str(legacy, "Style"));
+        if (type.isBlank())            setType(str(legacy, "Type"));
+        if (collectionDate.isBlank())  setCollectionDate(str(legacy, "CollectionDate"));
+        if (author.isEmpty()) {
+            String a = str(legacy, "Author");
+            if (!a.isBlank()) author.add(a);
+        }
+        if (domain.isEmpty()) {
+            String d = str(legacy, "Domain");
+            if (!d.isBlank()) domain.add(d);
+        }
+        if (subdomain.isEmpty()) {
+            String s = str(legacy, "Subdomain");
+            if (!s.isBlank()) subdomain.add(s);
+        }
+        if (numberWords      == 0) numberWords      = intVal(legacy, "NumberWords");
+        if (numberSentences  == 0) numberSentences  = intVal(legacy, "NumberSentences");
+        if (numberParagraphs == 0) numberParagraphs = intVal(legacy, "NumberParagraphs");
+        if (numberTokens     == 0) numberTokens     = intVal(legacy, "NumberTokens");
+        String translated = str(legacy, "TranslatedDocument");
+        // NOTE(review): Boolean.parseBoolean yields true only for "true"
+        // (case-insensitive); any other non-blank legacy value becomes false.
+        // Confirm this matches the values the source processors emit.
+        if (translatedDocument == null && !translated.isBlank())
+            translatedDocument = Boolean.parseBoolean(translated);
+    }
+    // -----------------------------------------------------------------------
+    // Private helpers
+    // -----------------------------------------------------------------------
+    /** Maps null to the empty string so string fields are never null. */
+    private static String nz(String v) {
+        return v == null ? "" : v;
+    }
+    /**
+     * Returns a new mutable list with the elements of {@code v}, or an empty
+     * one for null. Copying keeps the add methods working when the caller
+     * passes an immutable list and keeps later caller-side changes out.
+     */
+    private static <T> List<T> mutableCopy(List<T> v) {
+        return v == null ? new ArrayList<>() : new ArrayList<>(v);
+    }
+    /** Returns the trimmed string form of a value, or "" when absent. */
+    private static String str(JSONObject o, String key) {
+        Object v = o.get(key);
+        return v == null ? "" : v.toString().trim();
+    }
+    /** Returns the value parsed as an int, or 0 when absent or unparsable. */
+    private static int intVal(JSONObject o, String key) {
+        Object v = o.get(key);
+        if (v == null) return 0;
+        try { return Integer.parseInt(v.toString().trim()); }
+        catch (NumberFormatException e) { return 0; }
+    }
+    /**
+     * Returns the value as a list of strings: the non-null elements of a
+     * JSON array, a single-element list for a non-blank scalar, else empty.
+     */
+    private static List<String> strList(JSONObject o, String key) {
+        Object v = o.get(key);
+        List<String> list = new ArrayList<>();
+        if (v instanceof JSONArray) {
+            for (Object item : (JSONArray) v)
+                if (item != null) list.add(item.toString());
+        } else if (v != null && !v.toString().isBlank()) {
+            list.add(v.toString().trim());
+        }
+        return list;
+    }
+    /**
+     * Returns the numeric elements of a JSON array as doubles. Null and
+     * unparsable elements are skipped; a non-array value yields an empty list.
+     */
+    private static List<Double> doubleList(JSONObject o, String key) {
+        Object v = o.get(key);
+        List<Double> list = new ArrayList<>();
+        if (v instanceof JSONArray) {
+            for (Object item : (JSONArray) v) {
+                if (item == null) continue;   // JSON null would otherwise throw
+                try { list.add(Double.parseDouble(item.toString())); }
+                catch (NumberFormatException ignored) {}
+            }
+        }
+        return list;
+    }
+    /** Copies a list (null treated as empty) into a new JSONArray. */
+    private static JSONArray toJsonArray(List<?> list) {
+        JSONArray a = new JSONArray();
+        if (list != null) a.addAll(list);
+        return a;
+    }
+    @Override
+    public String toString() {
+        return String.format(
+            "DocumentMetadata{id='%s', sentences=%d, words=%d, piiEntries=%d, biasEntries=%d}",
+            identifier, numberSentences, numberWords, piiVector.size(), biasVector.size());
+    }
+}

java/bg/bas/dcl/LLMs/IfGPTDataset/IfGPTDatasetProcessor.java ADDED Viewed

	@@ -0,0 +1,160 @@

+package bg.bas.dcl.LLMs.IfGPTDataset;
+/**
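+ * IfGPTDatasetProcessor
+ *
+ * Entry point that configures an {@link IfGPTPipeline} for one source corpus
+ * (BulNC, MARCELL, CURLICAT or BulNC Wiki) and runs it. The commented-out
+ * snippets in {@code main} show how to run individual stages instead.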
+ */
+public class IfGPTDatasetProcessor {
+    // -----------------------------------------------------------------------
+    // Shared paths
+    // -----------------------------------------------------------------------
+    // New batch being ingested
+    static final String NEW_DATA_DIR    = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/data/";
+    static final String NEW_META_DIR    = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/metadata/";
+    static final String SAMPLE_DIR      = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/sample/";
+    static final String BLOCKLIST_FILE  = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/blocklist.txt";
+    static final String DEDUP_REPORT    = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/dedup_report.tsv";
+    // Shared resources
+    static final String BULNC_META_FILE = "/home/ivelina/SVN_CORPUS/BulNC/BulNC-description.txt";
+    static final String BIAS_DICT       = "/home/ivelina/WORK-DCL/WIKIPEDIA-BIAS/"
+                                        + "bulgarian_bias_dictionary_v4.tsv";
+    // -----------------------------------------------------------------------
+    // Main
+    // -----------------------------------------------------------------------
+    /**
+     * Runs the ingestion for one batch. As committed, only the BulNC Mass
+     * Media pipeline is active; the other pipelines, the per-stage recipes
+     * (MODE B) and the utilities (MODE C) are kept as commented-out snippets
+     * to be enabled by editing this method.
+     *
+     * @param args command-line arguments; not read by this method
+     */
+    public static void main(String[] args) {
+        // ==================================================================
+        // MODE A — FULL PIPELINE  (one call runs all 8 stages)
+        // ==================================================================
+        // Choose the source processor that matches the new batch format,
+        // then call pipeline.run().
+        // --- BulNC Mass Media batch ---
+        runBulNCPipeline();
+        // --- MARCELL batch ---
+        // runMarcellPipeline();
+        // --- CURLICAT batch ---
+        // runCurlicatPipeline();
+        // --- BulNC Wiki/InformalFiction batch ---
+        // runBulNCWikiPipeline();
+        // ==================================================================
+        // MODE B — INDIVIDUAL STAGES
+        // ==================================================================
+        // --- 1. Extract only ---
+        // new BulNCProcessor(BULNC_META_FILE).process(NEW_DATA_DIR, NEW_META_DIR);
+        // --- 3. Clean only (learn + apply) ---
+        // FileCleanProcessor fcp = new FileCleanProcessor(0.50);
+        // fcp.learnFromSample(SAMPLE_DIR);
+        // fcp.printTopCommonLines(30);
+        // fcp.saveBlocklist(BLOCKLIST_FILE);
+        // fcp.cleanDirectory(NEW_DATA_DIR, true);
+        // --- 4. Deduplication only ---
+        // DeduplicationProcessor dp = new DeduplicationProcessor(0.90, 5, 200);
+        // dp.indexCorpus(IfGPTPipeline.FULL_DATA_DIR);
+        // dp.detectDuplicates(NEW_DATA_DIR, DEDUP_REPORT);
+        // dp.removeDuplicatesFromNewFolder(NEW_DATA_DIR, true); // optional
+        // --- 5/6. PII + Bias annotation only (on already-split sentences) ---
+        // bg.bas.dcl.LLMs.BulgarianSentenceSplitter splitter =
+        //         new bg.bas.dcl.LLMs.BulgarianSentenceSplitter();
+        // bg.bas.dcl.LLMs.PIIDetector pii = new bg.bas.dcl.LLMs.PIIDetector(splitter);
+        // pii.analyseDirectory(NEW_DATA_DIR, NEW_META_DIR + "pii_report.tsv");
+        //
+        // bg.bas.dcl.LLMs.BiasLexicon lex =
+        //         new bg.bas.dcl.LLMs.BiasLexicon(BIAS_DICT);
+        // bg.bas.dcl.LLMs.BiasAnalyser bias =
+        //         new bg.bas.dcl.LLMs.BiasAnalyser(lex, splitter);
+        // bias.analyseDirectory(NEW_DATA_DIR, NEW_META_DIR + "bias_report.tsv");
+        // ==================================================================
+        // MODE C — UTILITIES
+        // ==================================================================
+        // Convert an existing metadata JSON to CSV
+        // new MarcellProcessor().convertJsonToCSV(
+        //         IfGPTPipeline.FULL_META_DIR + "metadata_BNC_mm.json");
+    }
+    // -----------------------------------------------------------------------
+    // Pipeline factory methods (one per source type)
+    // -----------------------------------------------------------------------
+    /**
+     * Configures and runs the pipeline for a BulNC Mass Media batch, using
+     * the shared NEW_BATCH paths, a 0.50 boilerplate threshold and a 0.90
+     * deduplication threshold. Duplicates are reported but not removed, and
+     * backups are kept.
+     */
+    private static void runBulNCPipeline() {
+        IfGPTPipeline pipeline = new IfGPTPipeline();
+        pipeline.setSourceProcessor(new BulNCProcessor(BULNC_META_FILE));
+        // Locations of the incoming batch
+        pipeline.setNewDataDir(NEW_DATA_DIR);
+        pipeline.setSampleDir(SAMPLE_DIR);
+        pipeline.setNewMetaDir(NEW_META_DIR);
+        // Reports and resources
+        pipeline.setBlocklistFile(BLOCKLIST_FILE);
+        pipeline.setDedupReport(DEDUP_REPORT);
+        pipeline.setBiasDictPath(BIAS_DICT);
+        // Tuning
+        pipeline.setBoilerplateThreshold(0.50);
+        pipeline.setDedupThreshold(0.90);
+        pipeline.setRemoveDuplicates(false);   // set true to delete dup sentences
+        pipeline.setKeepBackups(true);
+        pipeline.run();
+    }
+    /**
+     * Configures and runs the pipeline for a MARCELL batch with cleaning and
+     * deduplication explicitly enabled.
+     *
+     * NOTE(review): {@code indirMarcell} is declared but never passed to the
+     * pipeline, so the annotated input directory is not configured anywhere.
+     * Confirm how the MARCELL source files are meant to reach the processor.
+     */
+    private static void runMarcellPipeline() {
+        String indirMarcell = "/home/ivelina/WORK-DCL/ifGPT/CORPORA/MARCELL/bg-annotated/";
+        String outdirMarcell= "/home/ivelina/WORK-DCL/ifGPT/CORPORA/MARCELL/texts/";
+        IfGPTPipeline pipeline = new IfGPTPipeline();
+        pipeline.setSourceProcessor(new MarcellProcessor());
+        pipeline.setNewDataDir(outdirMarcell);
+        pipeline.setSampleDir(SAMPLE_DIR);
+        pipeline.setNewMetaDir(NEW_META_DIR);
+        pipeline.setBlocklistFile(BLOCKLIST_FILE);
+        pipeline.setDedupReport(DEDUP_REPORT);
+        pipeline.setBiasDictPath(BIAS_DICT);
+        pipeline.setSkipClean(false);
+        pipeline.setSkipDedup(false);
+        pipeline.run();
+    }
+    /**
+     * Configures and runs the pipeline for a CURLICAT batch with default
+     * thresholds.
+     *
+     * NOTE(review): {@code indirCurlicat} is declared but never passed to the
+     * pipeline. Confirm how the CURLICAT archive is meant to reach the
+     * processor.
+     */
+    private static void runCurlicatPipeline() {
+        String indirCurlicat = "/home/ivelina/WORK-DCL/ifGPT/CORPORA/CURLICAT/archive/"
+                             + "Bulgarian_Curlicat_corpus/";
+        String outdirCurlicat= "/home/ivelina/WORK-DCL/ifGPT/CORPORA/CURLICAT/texts/";
+        IfGPTPipeline pipeline = new IfGPTPipeline();
+        pipeline.setSourceProcessor(new CurlicatProcessor());
+        pipeline.setNewDataDir(outdirCurlicat);
+        pipeline.setSampleDir(SAMPLE_DIR);
+        pipeline.setNewMetaDir(NEW_META_DIR);
+        pipeline.setBlocklistFile(BLOCKLIST_FILE);
+        pipeline.setDedupReport(DEDUP_REPORT);
+        pipeline.setBiasDictPath(BIAS_DICT);
+        pipeline.run();
+    }
+    /**
+     * Configures and runs the pipeline for a BulNC Wiki / InformalFiction
+     * batch. The processor is given the BulNC description file and the
+     * existing {@code metadata_BNC_mm.json} from the full metadata directory.
+     */
+    private static void runBulNCWikiPipeline() {
+        String existingMeta = IfGPTPipeline.FULL_META_DIR + "metadata_BNC_mm.json";
+        String outdirWiki   = "/home/ivelina/WORK-DCL/ifGPT/CORPORA/BulNC/wiki-texts/";
+        IfGPTPipeline pipeline = new IfGPTPipeline();
+        pipeline.setSourceProcessor(new BulNCWikiProcessor(BULNC_META_FILE, existingMeta));
+        pipeline.setNewDataDir(outdirWiki);
+        pipeline.setSampleDir(SAMPLE_DIR);
+        pipeline.setNewMetaDir(NEW_META_DIR);
+        pipeline.setBlocklistFile(BLOCKLIST_FILE);
+        pipeline.setDedupReport(DEDUP_REPORT);
+        pipeline.setBiasDictPath(BIAS_DICT);
+        pipeline.run();
+    }
+}

java/bg/bas/dcl/LLMs/IfGPTDataset/IfGPTPipeline.java ADDED Viewed

	@@ -0,0 +1,490 @@

+package bg.bas.dcl.LLMs.IfGPTDataset;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.StandardCopyOption;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Properties;
+import java.util.Scanner;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import bg.bas.dcl.LLMs.BiasAnalyser;
+import bg.bas.dcl.LLMs.BiasLexicon;
+import bg.bas.dcl.LLMs.BulgarianSentenceSplitter;
+import bg.bas.dcl.LLMs.PIIDetector;
+import bg.bas.dcl.LLMs.SentenceBiasScore;
+import bg.bas.dcl.general.FileHandler;
+import bg.bas.dcl.general.JSONProcessor;
+/**
+ * IfGPTPipeline
+ *
+ * Pipeline for the ifGPT Bulgarian language dataset.
+ *
+ * -----------------------------------------------------------------------
+ * PIPELINE STAGES (executed in order by {@link #run()})
+ *
+ *   1. EXTRACT
+ *   2. SPLIT
+ *   3. CLEAN
+ *   4. DEDUPLICATE
+ *   5. PII
+ *   6. BIAS
+ *   7. COUNTS    —  word / sentence / token counts are recomputed on the cleaned, deduplicated text
+ *   8. PERSIST   —  new text and metadata files are copied to FULL_DATA_DIR / FULL_META_DIR
+ *
+ * -----------------------------------------------------------------------
+ */
+@SuppressWarnings("unchecked")
+public class IfGPTPipeline {
+    // -----------------------------------------------------------------------
+    // Fixed paths
+    // -----------------------------------------------------------------------
+    public static final String FULL_DATA_DIR =
+            "/home/ivelina/WORK-DCL/IfGPT/IFGPT-DATASET-DATA/";
+    public static final String FULL_META_DIR =
+            "/home/ivelina/WORK-DCL/IfGPT/IFGPT-DATASET-METADATA/";
+    // -----------------------------------------------------------------------
+    // Configurable paths and options
+    // -----------------------------------------------------------------------
+    private SourceProcessor sourceProcessor;      // mandatory
+    private String newDataDir;                    // mandatory: incoming texts
+    private String sampleDir;                     // mandatory: boilerplate sample
+    private String newMetaDir;                    // mandatory: output metadata
+    private String blocklistFile;                 // boilerplate blocklist file
+    private String dedupReport;                   // dedup TSV report path
+    private String biasDictPath;                  // bias dictionary TSV
+    private String openNlpModelPath = null;       // null = bundled JAR model
+    private double boilerplateThreshold = 0.50;   // FileCleanProcessor threshold
+    private double dedupThreshold       = 0.90;   // DeduplicationProcessor threshold
+    private int    dedupShingleSize     = 5;
+    private int    dedupNumHashes       = 200;
+    private boolean removeDuplicates    = false;  // whether to strip dup sentences
+    private boolean keepBackups         = true;   // keep .bak on file modification
+    private boolean skipClean           = false;  // skip boilerplate cleaning
+    private boolean skipDedup           = false;  // skip deduplication
+    private boolean skipPii             = false;  // skip PII scoring
+    private boolean skipBias            = false;  // skip bias scoring
+    // -----------------------------------------------------------------------
+    // Fluent setters (each returns this pipeline so calls can be chained)
+    // -----------------------------------------------------------------------
+    // Paths and collaborators
+    public IfGPTPipeline setSourceProcessor(SourceProcessor p)  { this.sourceProcessor = p; return this; }
+    public IfGPTPipeline setNewDataDir(String p)                { this.newDataDir = p; return this; }
+    public IfGPTPipeline setSampleDir(String p)                 { this.sampleDir = p; return this; }
+    public IfGPTPipeline setNewMetaDir(String p)                { this.newMetaDir = p; return this; }
+    public IfGPTPipeline setBlocklistFile(String p)             { this.blocklistFile = p; return this; }
+    public IfGPTPipeline setDedupReport(String p)               { this.dedupReport = p; return this; }
+    public IfGPTPipeline setBiasDictPath(String p)              { this.biasDictPath = p; return this; }
+    public IfGPTPipeline setOpenNlpModelPath(String p)          { this.openNlpModelPath = p; return this; }
+    // Thresholds and deduplication parameters
+    public IfGPTPipeline setBoilerplateThreshold(double t)      { this.boilerplateThreshold = t; return this; }
+    public IfGPTPipeline setDedupThreshold(double t)            { this.dedupThreshold = t; return this; }
+    public IfGPTPipeline setDedupShingleSize(int n)             { this.dedupShingleSize = n; return this; }
+    public IfGPTPipeline setDedupNumHashes(int n)               { this.dedupNumHashes = n; return this; }
+    // Behaviour switches
+    public IfGPTPipeline setRemoveDuplicates(boolean b)         { this.removeDuplicates = b; return this; }
+    public IfGPTPipeline setKeepBackups(boolean b)              { this.keepBackups = b; return this; }
+    public IfGPTPipeline setSkipClean(boolean b)                { this.skipClean = b; return this; }
+    public IfGPTPipeline setSkipDedup(boolean b)                { this.skipDedup = b; return this; }
+    public IfGPTPipeline setSkipPii(boolean b)                  { this.skipPii = b; return this; }
+    public IfGPTPipeline setSkipBias(boolean b)                 { this.skipBias = b; return this; }
+    // -----------------------------------------------------------------------
+    // Entry point
+    // -----------------------------------------------------------------------
+    /**
+     * Runs the pipeline stage by stage: extraction, sentence splitting,
+     * optional cleaning, optional deduplication, annotation with counts, and
+     * finally the copy into the full corpus. One sentence splitter is created
+     * and shared by the stages that need it.
+     *
+     * @throws IllegalStateException if mandatory configuration is missing
+     */
+    public void run() {
+        validateConfig();
+        ensureDirs(newMetaDir, FULL_DATA_DIR, FULL_META_DIR);
+        // Stage 1
+        banner("STAGE 1 — SOURCE EXTRACTION");
+        runExtraction();
+        // Stage 2 (the splitter built here is reused by the later stages)
+        BulgarianSentenceSplitter splitter = new BulgarianSentenceSplitter(openNlpModelPath);
+        banner("STAGE 2 — SENTENCE SPLITTING & INITIAL METADATA");
+        runSentenceSplitting(splitter);
+        // Stage 3
+        if (skipClean) {
+            log("STAGE 3 skipped (skipClean=true)");
+        } else {
+            banner("STAGE 3 — BOILERPLATE CLEANING");
+            runCleaning();
+        }
+        // Stage 4
+        if (skipDedup) {
+            log("STAGE 4 skipped (skipDedup=true)");
+        } else {
+            banner("STAGE 4 — DEDUPLICATION");
+            runDeduplication();
+        }
+        // Stages 5-7: a null scorer means that annotation is skipped
+        PIIDetector piiDetector = null;
+        if (!skipPii) {
+            piiDetector = new PIIDetector(splitter);
+        }
+        BiasAnalyser biasAnalyser = null;
+        if (!skipBias) {
+            biasAnalyser = buildBiasAnalyser(splitter);
+        }
+        banner("STAGES 5-7 — PII, BIAS & FINAL COUNTS");
+        runAnnotationAndCounts(splitter, piiDetector, biasAnalyser);
+        // Stage 8
+        banner("STAGE 8 — PERSIST TO FULL CORPUS");
+        runPersist();
+        banner("PIPELINE COMPLETE");
+    }
+    // -----------------------------------------------------------------------
+    // Stage 1 — Extraction
+    // -----------------------------------------------------------------------
+    private void runExtraction() {
+        // The source processor writes plain-text files to newDataDir and
+        // seed metadata JSON to newMetaDir.
+        sourceProcessor.process(newDataDir, newMetaDir);
+        log("Extraction complete → " + newDataDir);
+    }
+    // -----------------------------------------------------------------------
+    // Stage 2 — Sentence splitting
+    // -----------------------------------------------------------------------
+    /**
+     * Reads each metadata JSON produced by the source processor, then for
+     * each document text file counts sentences properly using the OpenNLP
+     * splitter and writes the sentence list to a parallel .sentences file
+     * (one sentence per line) used by later stages.
+     */
+    private void runSentenceSplitting(BulgarianSentenceSplitter splitter) {
+        try {
+            FileHandler fh = new FileHandler();
+            int docs = 0;
+            for (File txtFile : fh.getFileListing(new File(newDataDir))) {
+                if (!txtFile.isFile() || !txtFile.getName().endsWith(".txt")) continue;
+                // Read document text
+                StringBuilder sb = new StringBuilder();
+                try (Scanner sc = new Scanner(txtFile, StandardCharsets.UTF_8)) {
+                    while (sc.hasNextLine()) sb.append(sc.nextLine()).append('\n');
+                }
+                String text = sb.toString().trim();
+                // Split into sentences and persist to .sentences sidecar file
+                String[] sentences = splitter.split(text);
+                File sentFile = new File(newDataDir, txtFile.getName()
+                        .replace(".txt", ".sentences"));
+                try (Writer w = new OutputStreamWriter(
+                        new FileOutputStream(sentFile), StandardCharsets.UTF_8)) {
+                    for (String sent : sentences) {
+                        if (!sent.isBlank()) {
+                            w.write(sent.trim());
+                            w.write('\n');
+                        }
+                    }
+                }
+                docs++;
+            }
+            log("Sentence splitting complete.  Documents: " + docs);
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+    // -----------------------------------------------------------------------
+    // Stage 3 — Boilerplate cleaning
+    // -----------------------------------------------------------------------
+    private void runCleaning() {
+        FileCleanProcessor fcp = new FileCleanProcessor(boilerplateThreshold);
+        // Learn from sample
+        fcp.learnFromSample(sampleDir);
+        fcp.printTopCommonLines(20);
+        // Save blocklist for audit / reproducibility
+        if (blocklistFile != null && !blocklistFile.isBlank()) {
+            fcp.saveBlocklist(blocklistFile);
+        }
+        // Clean the new data directory
+        fcp.cleanDirectory(newDataDir, keepBackups);
+        log("Boilerplate cleaning complete → " + newDataDir);
+    }
+    // -----------------------------------------------------------------------
+    // Stage 4 — Deduplication
+    // -----------------------------------------------------------------------
+    /**
+     * Stage 4: indexes the full corpus, detects duplicates in the new data
+     * and writes a TSV report. Duplicates are removed from the new data only
+     * when {@code removeDuplicates} is set.
+     *
+     * The report goes to {@code dedupReport}; when that is null or blank it
+     * falls back to {@code dedup_report.tsv} inside {@code newMetaDir}.
+     */
+    private void runDeduplication() {
+        DeduplicationProcessor dp = new DeduplicationProcessor(
+                dedupThreshold, dedupShingleSize, dedupNumHashes);
+        // Index the full existing corpus
+        log("Indexing full corpus for deduplication…");
+        dp.indexCorpus(FULL_DATA_DIR);
+        log("Corpus indexed.  Sentences: " + dp.getCorpusSize());
+        // Treat a blank path like an unset one (as done for blocklistFile), and
+        // join with File so the fallback works without a trailing separator.
+        boolean hasReportPath = dedupReport != null && !dedupReport.isBlank();
+        String report = hasReportPath
+                ? dedupReport
+                : new File(newMetaDir, "dedup_report.tsv").getPath();
+        dp.detectDuplicates(newDataDir, report);
+        if (removeDuplicates) {
+            dp.removeDuplicatesFromNewFolder(newDataDir, keepBackups);
+        }
+    }
+    // -----------------------------------------------------------------------
+    // Stages 5-7 — PII, Bias annotation + final counts
+    // -----------------------------------------------------------------------
+    /**
+     * For each document:
+     *   a) reads the (cleaned, deduplicated) .sentences sidecar file,
+     *   b) runs PII and/or Bias scoring per sentence,
+     *   c) recomputes word/sentence/token counts on the surviving text,
+     *   d) merges all computed values into a DocumentMetadata and writes
+     *      the final metadata JSON to newMetaDir.
+     */
+    private void runAnnotationAndCounts(BulgarianSentenceSplitter splitter,
+                                        PIIDetector  piiDetector,
+                                        BiasAnalyser biasAnalyser) {
+        try {
+            FileHandler   fh = new FileHandler();
+            JSONProcessor jp = new JSONProcessor();
+            int docs = 0, errors = 0;
+            for (File sentFile : fh.getFileListing(new File(newDataDir))) {
+                if (!sentFile.isFile()
+                        || !sentFile.getName().endsWith(".sentences")) continue;
+                String stem = sentFile.getName().replace(".sentences", "");
+                // --- Load sentences ---
+                List<String> sentences = new ArrayList<>();
+                try (Scanner sc = new Scanner(sentFile, StandardCharsets.UTF_8)) {
+                    while (sc.hasNextLine()) {
+                        String s = sc.nextLine().trim();
+                        if (!s.isBlank()) sentences.add(s);
+                    }
+                }
+                if (sentences.isEmpty()) {
+                    log("[WARN] No sentences for: " + stem);
+                    errors++;
+                    continue;
+                }
+                // --- Load or create DocumentMetadata ---
+                DocumentMetadata meta = loadOrCreateMetadata(jp, stem);
+                // --- PII per sentence ---
+                List<Double> piiVec = new ArrayList<>();
+                if (piiDetector != null) {
+                    int sentIdx = 0;
+                    for (String sent : sentences) {
+                        PIIDetector.SentencePIIScore score =
+                                piiDetector.analyseSentence(sent, stem + "-" + sentIdx++);
+                        piiVec.add(score.getPiiCoverage());
+                    }
+                }
+                meta.setPiiVector(piiVec);
+                // --- Bias per sentence ---
+                List<Double> biasVec = new ArrayList<>();
+                if (biasAnalyser != null) {
+                    for (String sent : sentences) {
+                        SentenceBiasScore score = biasAnalyser.analyseSentence(sent);
+                        biasVec.add(score.totalCoverage());
+                    }
+                }
+                meta.setBiasVector(biasVec);
+                // --- Recompute counts from surviving sentences ---
+                int nSentences  = sentences.size();
+                int nWords      = 0;
+                int nTokens     = 0;
+                for (String sent : sentences) {
+                    String[] toks = sent.split("\\s+");
+                    nWords  += toks.length;
+                    // estimate tokens: words + punctuation characters
+                    nTokens += toks.length + sent.length()
+                             - sent.replaceAll("[.,;:!?()\\-]", "").length();
+                }
+                // Paragraphs: count blank-line groups in the original text file
+                int nParagraphs = countParagraphs(new File(newDataDir, stem + ".txt"));
+                meta.setNumberSentences(nSentences)
+                    .setNumberWords(nWords)
+                    .setNumberTokens(nTokens)
+                    .setNumberParagraphs(nParagraphs);
+                // --- Persist metadata JSON ---
+                writeMetadata(meta, newMetaDir + stem + "_meta.json");
+                docs++;
+            }
+            log("Annotation & counts complete.  Documents: " + docs
+                    + "  Errors: " + errors);
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+    // -----------------------------------------------------------------------
+    // Stage 8 — Persist to full corpus
+    // -----------------------------------------------------------------------
+    /**
+     * Stage 8: copies the new batch into the full corpus. {@code .txt} files
+     * from {@code newDataDir} go to FULL_DATA_DIR and {@code _meta.json}
+     * files from {@code newMetaDir} go to FULL_META_DIR.
+     *
+     * Files whose name already exists in the destination are never
+     * overwritten; they are counted and reported so the operator can see that
+     * part of the batch was not persisted.
+     */
+    private void runPersist() {
+        try {
+            FileHandler fh = new FileHandler();
+            int[] data = copyMissingFiles(fh, newDataDir, ".txt", FULL_DATA_DIR);
+            int[] meta = copyMissingFiles(fh, newMetaDir, "_meta.json", FULL_META_DIR);
+            log("Persist complete.  Text files copied: " + data[0]
+                    + "  Metadata files copied: " + meta[0]);
+            if (data[1] > 0 || meta[1] > 0) {
+                log("[WARN] Already present in full corpus, not overwritten.  Text files: "
+                        + data[1] + "  Metadata files: " + meta[1]);
+            }
+            log("FULL_DATA_DIR : " + FULL_DATA_DIR);
+            log("FULL_META_DIR : " + FULL_META_DIR);
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+    /**
+     * Copies every regular file under {@code srcDir} whose name ends with
+     * {@code suffix} into {@code destDir}, skipping names that already exist
+     * there.
+     *
+     * @return a two-element array: {@code [0]} files copied, {@code [1]} files skipped
+     */
+    private int[] copyMissingFiles(FileHandler fh, String srcDir, String suffix,
+                                   String destDir) throws Exception {
+        int copied = 0, skipped = 0;
+        for (File f : fh.getFileListing(new File(srcDir))) {
+            if (!f.isFile() || !f.getName().endsWith(suffix)) continue;
+            File dest = new File(destDir, f.getName());
+            if (dest.exists()) {
+                skipped++;
+                continue;
+            }
+            // No REPLACE_EXISTING: the destination was just checked to be absent
+            Files.copy(f.toPath(), dest.toPath());
+            copied++;
+        }
+        return new int[] { copied, skipped };
+    }
+    // -----------------------------------------------------------------------
+    // Helpers
+    // -----------------------------------------------------------------------
+    /**
+     * Loads the seed metadata for a document, or creates an empty one.
+     *
+     * Looks in {@code newMetaDir} for {@code <stem>_meta.json} and then
+     * {@code <stem>.json}. A file containing an {@code "Identifier"} key is
+     * read as the full schema; any other file is merged as the legacy format.
+     * A file that cannot be parsed is logged and the next candidate is tried.
+     *
+     * @param jp   JSON reader used to parse candidate files
+     * @param stem document name without extension
+     * @return the loaded metadata, or a new skeleton when nothing usable is found
+     */
+    private DocumentMetadata loadOrCreateMetadata(JSONProcessor jp, String stem) {
+        // File-based join also works when newMetaDir has no trailing separator
+        File[] candidates = {
+            new File(newMetaDir, stem + "_meta.json"),
+            new File(newMetaDir, stem + ".json")
+        };
+        for (File f : candidates) {
+            if (!f.exists()) continue;
+            try {
+                JSONObject raw = jp.readJSON(f);
+                // First try full schema, then legacy format
+                if (raw.containsKey("Identifier")) {
+                    return DocumentMetadata.fromJson(raw);
+                }
+                DocumentMetadata m = new DocumentMetadata(stem);
+                m.mergeLegacy(raw);
+                return m;
+            } catch (Exception e) {
+                log("[WARN] Could not parse metadata JSON for " + stem + ": " + e.getMessage());
+            }
+        }
+        // Fall back to empty skeleton
+        return new DocumentMetadata(stem);
+    }
+    /**
+     * Serialises the metadata to JSON and writes it as UTF-8 to
+     * {@code outPath}, replacing any existing file.
+     *
+     * @param meta    metadata to serialise
+     * @param outPath destination file path
+     * @throws Exception if serialisation or writing fails
+     */
+    private void writeMetadata(DocumentMetadata meta, String outPath) throws Exception {
+        // Serialise first so a failure here leaves the target file untouched
+        JSONObject payload = meta.toJson();
+        FileOutputStream target = new FileOutputStream(outPath);
+        try (Writer writer = new OutputStreamWriter(target, StandardCharsets.UTF_8)) {
+            payload.writeJSONString(writer);
+        }
+    }
+    /**
+     * Counts paragraphs in a text file, where a paragraph is a run of
+     * non-blank lines delimited by blank lines or the file boundaries.
+     *
+     * @param txtFile file to scan
+     * @return {@code 0} when the file does not exist; otherwise the number of
+     *         paragraphs found, but never less than {@code 1}. Read errors are
+     *         ignored and the count reached so far is used.
+     */
+    private int countParagraphs(File txtFile) {
+        if (!txtFile.exists()) {
+            return 0;
+        }
+        int paragraphs = 0;
+        boolean previousLineBlank = true;
+        try (Scanner lines = new Scanner(txtFile, StandardCharsets.UTF_8)) {
+            while (lines.hasNextLine()) {
+                boolean blank = lines.nextLine().isBlank();
+                // A paragraph starts on the first non-blank line after a gap
+                if (!blank && previousLineBlank) {
+                    paragraphs++;
+                }
+                previousLineBlank = blank;
+            }
+        } catch (Exception e) { /* ignored */ }
+        return Math.max(paragraphs, 1);
+    }
+    /**
+     * Builds the bias analyser from the configured dictionary.
+     *
+     * @param splitter sentence splitter handed to the analyser
+     * @return the analyser, or {@code null} (after logging a warning) when no
+     *         dictionary path is configured
+     */
+    private BiasAnalyser buildBiasAnalyser(BulgarianSentenceSplitter splitter) {
+        boolean hasDictionary = biasDictPath != null && !biasDictPath.isBlank();
+        if (hasDictionary) {
+            return new BiasAnalyser(new BiasLexicon(biasDictPath), splitter);
+        }
+        log("[WARN] No bias dictionary path set — bias scoring disabled.");
+        return null;
+    }
+    /**
+     * Checks that the mandatory settings are present: the source processor
+     * and the non-blank new-data, sample and new-metadata directories.
+     *
+     * @throws IllegalStateException listing every missing setting
+     */
+    private void validateConfig() {
+        List<String> missing = new ArrayList<>();
+        if (sourceProcessor == null) {
+            missing.add("sourceProcessor");
+        }
+        String[] names  = { "newDataDir", "sampleDir", "newMetaDir" };
+        String[] values = {  newDataDir,   sampleDir,   newMetaDir  };
+        for (int i = 0; i < names.length; i++) {
+            if (values[i] == null || values[i].isBlank()) {
+                missing.add(names[i]);
+            }
+        }
+        if (missing.isEmpty()) {
+            return;
+        }
+        throw new IllegalStateException(
+            "Pipeline configuration missing: " + missing);
+    }
+    /**
+     * Creates each given directory, including missing parents. Null entries
+     * are skipped; the outcome of each creation attempt is not checked.
+     *
+     * @param paths directory paths to create
+     */
+    private void ensureDirs(String... paths) {
+        for (int i = 0; i < paths.length; i++) {
+            String dir = paths[i];
+            if (dir == null) {
+                continue;
+            }
+            new File(dir).mkdirs();
+        }
+    }
+    private void banner(String msg) {
+        System.out.println("\n" + "=".repeat(60));
+        System.out.println("  " + msg);
+        System.out.println("=".repeat(60));
+    }
+    /**
+     * Prints a message to standard output with the {@code [Pipeline]} prefix.
+     *
+     * @param msg text to print
+     */
+    private void log(String msg) {
+        String line = "[Pipeline] " + msg;
+        System.out.println(line);
+    }
+}

java/bg/bas/dcl/LLMs/IfGPTDataset/MarcellProcessor.java ADDED Viewed

	@@ -0,0 +1,130 @@

+package bg.bas.dcl.LLMs.IfGPTDataset;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.Scanner;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import bg.bas.dcl.general.FileHandler;
+/**
+ * Processes the MARCELL Bulgarian annotated corpus.
+ *
+ * Licence: CC0-1.0 (fixed for all MARCELL documents).
+ * Domain:  "Държавно управление" (State governance).
+ * Style:   "Административен".
+ */
+public class MarcellProcessor extends BaseSourceProcessor {
+    private static final String LICENCE      = "CC0-1.0";
+    private static final String LICENCE_LINK =
+            "https://elrc-share.eu/static/metashare/licences/CC0-1.0.pdf";
+    private static final String DOMAIN  = "Държавно управление";
+    private static final String STYLE   = "Административен";
+    private static final String PREFIX  = "bg_MARCELL_";
+    private static final String EXT     = ".conllup";
+    @Override
+    public void process(String indir, String outdir) {
+        try {
+            FileHandler fh = new FileHandler();
+            JSONObject json = new JSONObject();
+            JSONArray descrArray = new JSONArray();
+            for (File f : fh.getFileListing(new File(indir))) {
+                if (!f.isFile()) continue;
+                System.out.println("Processing: " + f.getAbsolutePath());
+                String tfname = PREFIX + f.getName().replace(EXT, "");
+                JSONObject fdescr = newBaseDescriptor(tfname);
+                fdescr.put("Licence",     LICENCE);
+                fdescr.put("LicenceLink", LICENCE_LINK);
+                fdescr.put("Domain",      DOMAIN);
+                fdescr.put("Style",       STYLE);
+                Writer out = new OutputStreamWriter(
+                        new FileOutputStream(outdir + tfname + ".txt"), "UTF-8");
+                Scanner s = new Scanner(f, "UTF-8");
+                int nw = 0, ns = 0, np = 0, nt = 0;
+                while (s.hasNextLine()) {
+                    String line = s.nextLine();
+                    // --- Metadata extraction ---
+                    if (line.startsWith("# date =")) {
+                        fdescr.put("PublicationDate", line.replace("# date =", "").trim());
+                    } else if (line.startsWith("# title =")) {
+                        fdescr.put("DocumentTitle", line.replace("# title =", "").trim());
+                    } else if (line.startsWith("# issuer =")) {
+                        fdescr.put("Author", line.replace("# issuer =", "").trim());
+                    } else if (line.startsWith("# type =")) {
+                        fdescr.put("Type", line.replace("# type =", "").trim());
+                    } else if (line.startsWith("# url =")) {
+                        fdescr.put("Url", line.replace("# url =", "").trim());
+                    }
+                    // --- Structure counting ---
+                    else if (line.startsWith("# sent_id =")) {
+                        ns++;
+                    } else if (line.startsWith("# newpar id =")) {
+                        np++;
+                        out.write("\n");
+                    }
+                    // --- Text output ---
+                    else if (line.startsWith("# text =")) {
+                        out.write(line.replace("# text =", "").trim() + "\n");
+                        out.flush();
+                    } else {
+                        // CoNLL-UP token line: count words and tokens
+                        String[] cols = line.split("\t");
+                        if (cols.length > 5) {
+                            nt++;
+                            if (!cols[3].equals("PUNCT")) nw++;
+                        }
+                    }
+                }
+                s.close();
+                out.flush();
+                out.close();
+                fdescr.put("NumberWords",      nw);
+                fdescr.put("NumberSentences",  ns);
+                fdescr.put("NumberParagraphs", np);
+                fdescr.put("NumberTokens",     nt);
+                descrArray.add(fdescr);
+            }
+            json.put("metadata", descrArray);
+            writeMetadata(json, outdir, "metadata.json");
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+    // -----------------------------------------------------------------------
+    /**
+     * Serialises the collected metadata to {@code outdir + filename} as UTF-8
+     * JSON, then calls {@code convertJsonToCSV} to produce a CSV copy at the
+     * same path with "_CSV.csv" appended.
+     *
+     * NOTE(review): the path is built by plain concatenation, so {@code outdir}
+     * presumably has to end with a path separator - confirm against callers.
+     *
+     * @param json     metadata object to write
+     * @param outdir   output directory prefix
+     * @param filename name of the JSON file to create
+     * @throws Exception propagated from file I/O or from {@code convertJsonToCSV}
+     */
+    @SuppressWarnings("unchecked")
+    private void writeMetadata(JSONObject json, String outdir, String filename)
+            throws Exception {
+        String outMetaPath = outdir + filename;
+        // try-with-resources: the stream is released even when serialisation throws
+        try (Writer outMeta = new OutputStreamWriter(
+                new FileOutputStream(outMetaPath), "UTF-8")) {
+            json.writeJSONString(outMeta);
+            outMeta.flush();
+        }
+        convertJsonToCSV(json, outMetaPath + "_CSV.csv");
+        System.out.println("Metadata written to: " + outMetaPath);
+    }
+}

java/bg/bas/dcl/LLMs/IfGPTDataset/SourceProcessor.java ADDED Viewed

	@@ -0,0 +1,10 @@

+package bg.bas.dcl.LLMs.IfGPTDataset;
+/**
+ * Contract for a component that processes one input location into an
+ * output location.
+ */
+public interface SourceProcessor {
+    /**
+     * Runs the processor on {@code indir}, with {@code outdir} as the target.
+     * NOTE(review): presumably both arguments are directory paths and results
+     * are written under {@code outdir}; the expected path format (e.g. a
+     * trailing separator) is not specified here - confirm against implementers.
+     *
+     * @param indir  input location
+     * @param outdir output location
+     */
+    void process(String indir, String outdir);
+}

java/bg/bas/dcl/LLMs/PIIDetector.java ADDED Viewed

	@@ -0,0 +1,447 @@

+package bg.bas.dcl.LLMs;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Scanner;
+import ai.philterd.phileas.model.configuration.PhileasConfiguration;
+import ai.philterd.phileas.model.policy.Policy;
+import ai.philterd.phileas.model.responses.FilterResponse;
+import ai.philterd.phileas.model.responses.Span;
+import ai.philterd.phileas.services.PlainTextFilterService;
+import bg.bas.dcl.general.FileHandler;
+/**
+ * PIIDetector
+ *
+ * Detects Personally Identifiable Information (PII) in Bulgarian text at
+ * sentence level using the <b>Phileas</b> library (ai.philterd:phileas).
+ *
+ * -----------------------------------------------------------------------
+ * NOTE ON "PIISA"
+ * PIISA (https://piisa.org) is a Python-only PII framework with no Java
+ * bindings.  The closest Java-native equivalent with a compatible
+ * detection scope is Phileas (Apache 2.0, Maven Central, actively
+ * maintained as of 2025).  This component uses Phileas and documents
+ * all places where a future PIISA Java binding could be substituted.
+ * -----------------------------------------------------------------------
+ *
+ * MAVEN DEPENDENCY (pom.xml):
+ * <pre>
+ *   &lt;dependency&gt;
+ *     &lt;groupId&gt;ai.philterd&lt;/groupId&gt;
+ *     &lt;artifactId&gt;phileas&lt;/artifactId&gt;
+ *     &lt;version&gt;3.1.0&lt;/version&gt;
+ *   &lt;/dependency&gt;
+ * </pre>
+ *
+ * -----------------------------------------------------------------------
+ * PII TYPES SUPPORTED BY PHILEAS (built-in, language-agnostic unless noted):
+ *
+ *   Person names (NER + census dictionary) | Ages | Email addresses
+ *   Phone numbers | IP addresses (v4 + v6) | URLs | Credit card numbers
+ *   SSN / TIN | IBAN codes | Bank account numbers | Dates | Zip codes
+ *   MAC addresses | Bitcoin addresses | VINs | Passport numbers
+ *   Driver licence numbers | Medical conditions
+ *
+ * The default policy built by {@link #buildPolicy()} configures only a
+ * subset of these: passport numbers, driver licence numbers and medical
+ * conditions are not enabled there.
+ *
+ * Language note: NER-based person-name detection uses English models by
+ * default.  For Bulgarian names, supply a custom dictionary filter
+ * (see {@link #buildPolicy()}) or integrate a Bulgarian NER model.
+ * Regex-based filters (emails, phones, IPs, etc.) are language-independent
+ * and work directly on Bulgarian text.
+ *
+ * -----------------------------------------------------------------------
+ * ALGORITHM (per sentence):
+ *
+ *   1. Phileas scans the sentence and returns a list of PII *spans*, each
+ *      carrying a character start/end offset and a PII type label.
+ *   2. We map spans back to word tokens by checking which token positions
+ *      overlap any detected span.
+ *   3. piiCoverage = |tokens overlapping PII spans| / |total word tokens|
+ *
+ * -----------------------------------------------------------------------
+ * USAGE
+ *
+ *   BulgarianSentenceSplitter splitter = new BulgarianSentenceSplitter();
+ *   PIIDetector detector = new PIIDetector(splitter);
+ *
+ *   List&lt;SentencePIIScore&gt; scores = detector.analyseText("Иван Петров живее на ул. Роза 5.");
+ *   for (SentencePIIScore s : scores) {
+ *       System.out.printf("%.1f%% PII — %s%n", s.getPiiCoveragePercent(), s.getSentence());
+ *   }
+ *
+ *   // Corpus-level processing with TSV output
+ *   detector.analyseDirectory("/path/to/corpus/", "/path/to/pii_report.tsv");
+ */
+public class PIIDetector {
+    // -----------------------------------------------------------------------
+    // Constants
+    // -----------------------------------------------------------------------
+    /** Context string passed to Phileas (arbitrary; used for logging/caching). */
+    private static final String CONTEXT  = "bg-corpus";
+    /** Document ID prefix; a counter suffix is appended per sentence. */
+    private static final String DOC_ID   = "sent-";
+    /** Minimum word count for a sentence to be analysed. */
+    private static final int    MIN_WORDS = 3;
+    // -----------------------------------------------------------------------
+    // Dependencies
+    // -----------------------------------------------------------------------
+    private final BulgarianSentenceSplitter splitter;
+    private final PlainTextFilterService    filterService;
+    private final List<Policy>              policies;
+    // -----------------------------------------------------------------------
+    // Constructors
+    // -----------------------------------------------------------------------
+    /**
+     * Creates a PIIDetector that uses the default policy from
+     * {@link #buildPolicy()} (every configured filter uses the REDACT strategy).
+     * Delegates to {@link #PIIDetector(BulgarianSentenceSplitter, Policy)} with a
+     * {@code null} policy.
+     *
+     * @param splitter an initialised {@link BulgarianSentenceSplitter}; must not be null
+     * @throws IllegalArgumentException if {@code splitter} is null
+     */
+    public PIIDetector(BulgarianSentenceSplitter splitter) {
+        this(splitter, null);
+    }
+    /**
+     * Creates a PIIDetector that filters with the given Phileas {@link Policy},
+     * or with the policy from {@link #buildPolicy()} when none is supplied.
+     *
+     * @param splitter     sentence splitter used by {@link #analyseText(String)}; must not be null
+     * @param customPolicy policy to apply, or {@code null} to use the default one
+     * @throws IllegalArgumentException if {@code splitter} is null
+     * @throws RuntimeException         if the Phileas service or the policy cannot be created
+     */
+    public PIIDetector(BulgarianSentenceSplitter splitter, Policy customPolicy) {
+        if (splitter == null) {
+            throw new IllegalArgumentException("splitter must not be null");
+        }
+        this.splitter = splitter;
+        try {
+            PhileasConfiguration configuration = new PhileasConfiguration(new Properties());
+            this.filterService = new PlainTextFilterService(configuration);
+            // The default policy is only built when the caller did not bring one
+            Policy effectivePolicy;
+            if (customPolicy != null) {
+                effectivePolicy = customPolicy;
+            } else {
+                effectivePolicy = buildPolicy();
+            }
+            this.policies = List.of(effectivePolicy);
+            System.out.println("[PIIDetector] Phileas filter service initialised.");
+        } catch (Exception e) {
+            throw new RuntimeException("Failed to initialise Phileas filter service", e);
+        }
+    }
+    // -----------------------------------------------------------------------
+    // Public API
+    // -----------------------------------------------------------------------
+    /**
+     * Splits {@code text} into sentences and returns a {@link SentencePIIScore}
+     * for each sentence.
+     *
+     * Sentences shorter than {@link #MIN_WORDS} words receive a zero score
+     * without calling Phileas (to avoid spurious detections on fragments).
+     *
+     * @param text any Bulgarian plain text (may span multiple paragraphs)
+     * @return one score per detected sentence, in order; never null
+     */
+    public List<SentencePIIScore> analyseText(String text) {
+        List<SentencePIIScore> results = new ArrayList<>();
+        if (text == null || text.isBlank()) return results;
+        int docCounter = 0;
+        for (String sentence : splitter.split(text)) {
+            results.add(analyseSentence(sentence, DOC_ID + (docCounter++)));
+        }
+        return results;
+    }
+    /** Matches one whitespace-delimited chunk of a sentence. */
+    private static final java.util.regex.Pattern RAW_TOKEN =
+            java.util.regex.Pattern.compile("\\S+");
+    /** Matches a single character that counts as part of a word token. */
+    private static final java.util.regex.Pattern TOKEN_CHAR =
+            java.util.regex.Pattern.compile("[\\p{L}\\p{N}@._+\\-]");
+    /**
+     * Analyses a single pre-split sentence.
+     *
+     * A word token is a whitespace-delimited chunk that contains at least one
+     * letter, digit or one of {@code @ . _ + -}; its character range runs from
+     * the first to the last such character, so surrounding punctuation is
+     * excluded. Ranges are taken directly from the sentence, which keeps them
+     * exact even when punctuation sits inside a token (e.g. an apostrophe).
+     *
+     * Sentences with fewer than {@link #MIN_WORDS} tokens return an empty score
+     * without calling Phileas. If Phileas throws, an error score carrying the
+     * exception message is returned instead of propagating the exception.
+     *
+     * @param sentence the sentence string (not null)
+     * @param docId    a document/sentence identifier string for Phileas context
+     * @return a fully populated {@link SentencePIIScore}
+     */
+    public SentencePIIScore analyseSentence(String sentence, String docId) {
+        // --- Tokenise, remembering where every token sits in the sentence ---
+        List<int[]> tokenBounds = new ArrayList<>(); // {start, end}, end exclusive
+        java.util.regex.Matcher raw = RAW_TOKEN.matcher(sentence);
+        while (raw.find()) {
+            java.util.regex.Matcher wordChars = TOKEN_CHAR.matcher(raw.group());
+            int first = -1;
+            int last = -1;
+            while (wordChars.find()) {
+                if (first < 0) first = wordChars.start();
+                last = wordChars.end();
+            }
+            // Chunks made only of punctuation are not word tokens
+            if (first >= 0) {
+                tokenBounds.add(new int[] { raw.start() + first, raw.start() + last });
+            }
+        }
+        int totalWords = tokenBounds.size();
+        if (totalWords < MIN_WORDS) {
+            return SentencePIIScore.empty(sentence, totalWords);
+        }
+        // --- Run Phileas ---
+        List<Span> spans;
+        try {
+            FilterResponse response = filterService.filter(
+                    policies, CONTEXT, docId, sentence, null);
+            spans = response.getSpans() != null ? response.getSpans() : List.of();
+        } catch (Exception e) {
+            System.err.println("[PIIDetector] Phileas error on sentence: " + e.getMessage());
+            return SentencePIIScore.error(sentence, totalWords, e.getMessage());
+        }
+        // --- Map character-level spans back to token positions ---
+        // Each token is recorded once; if several spans touch it, the last one wins
+        Map<Integer, String> piiTokenType = new LinkedHashMap<>(); // tokenIndex -> PII type
+        for (Span span : spans) {
+            int spanStart = span.getStart();
+            int spanEnd   = span.getEnd();
+            String type   = span.getFilterType() != null
+                            ? span.getFilterType().name()
+                            : "UNKNOWN";
+            for (int ti = 0; ti < totalWords; ti++) {
+                int[] bounds = tokenBounds.get(ti);
+                // Overlap: token and span share at least one character
+                if (bounds[0] < spanEnd && bounds[1] > spanStart) {
+                    piiTokenType.put(ti, type);
+                }
+            }
+        }
+        // --- Build type frequency map (number of PII tokens per type) ---
+        Map<String, Integer> typeCounts = new LinkedHashMap<>();
+        for (String type : piiTokenType.values()) {
+            typeCounts.merge(type, 1, Integer::sum);
+        }
+        int piiTokenCount = piiTokenType.size();
+        double coverage   = totalWords > 0
+                ? (double) piiTokenCount / totalWords
+                : 0.0;
+        return new SentencePIIScore(
+                sentence, totalWords, piiTokenCount, coverage,
+                new ArrayList<>(piiTokenType.values()),
+                typeCounts, spans, null);
+    }
+    // -----------------------------------------------------------------------
+    // Corpus-level processing
+    // -----------------------------------------------------------------------
+    /**
+     * Analyses all .txt files in {@code corpusDir} sentence by sentence and
+     * writes results to a UTF-8 TSV file at {@code reportPath}, replacing any
+     * existing file.
+     *
+     * The report starts with a header row. The lines of each file are joined
+     * with single spaces before sentence splitting. Only sentences with at
+     * least one PII token are written, each prefixed with the file name.
+     *
+     * Failures are reported on stderr and not rethrown, so a partially written
+     * report may remain on disk.
+     *
+     * @param corpusDir  directory of plain-text .txt files
+     * @param reportPath destination TSV report file path
+     */
+    public void analyseDirectory(String corpusDir, String reportPath) {
+        try {
+            FileHandler fh = new FileHandler();
+            int filesProcessed = 0, sentencesWritten = 0;
+            try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
+                    new FileOutputStream(reportPath, false), StandardCharsets.UTF_8))) {
+                bw.write("file\t" + SentencePIIScore.tsvHeader());
+                bw.newLine();
+                for (File f : fh.getFileListing(new File(corpusDir))) {
+                    if (!f.isFile() || !f.getName().endsWith(".txt")) continue;
+                    System.out.println("[PIIDetector] Processing: " + f.getName());
+                    StringBuilder text = new StringBuilder();
+                    try (Scanner sc = new Scanner(f, StandardCharsets.UTF_8)) {
+                        while (sc.hasNextLine()) text.append(sc.nextLine()).append(' ');
+                    }
+                    for (SentencePIIScore score : analyseText(text.toString())) {
+                        if (!score.hasPII()) continue;
+                        bw.write(f.getName() + "\t" + score.toTsv());
+                        bw.newLine();
+                        sentencesWritten++;
+                    }
+                    filesProcessed++;
+                }
+            }
+            System.out.printf("[PIIDetector] Done.  Files: %d  Sentences with PII written: %d%n",
+                    filesProcessed, sentencesWritten);
+        } catch (Exception e) {
+            // Best-effort batch job: say which corpus failed, then keep the stack trace
+            System.err.println("[PIIDetector] Failed to analyse directory "
+                    + corpusDir + ": " + e.getMessage());
+            e.printStackTrace();
+        }
+    }
+    // -----------------------------------------------------------------------
+    // Policy builder
+    // -----------------------------------------------------------------------
+    /**
+     * Assembles the default Phileas {@link Policy}, named "pii-all".
+     *
+     * Every identifier in the table below gets one filter strategy: REDACT with
+     * the redaction format {@code {{{REDACTED-%t}}}}. The JSON text is generated
+     * from the table and then parsed with {@code Policy.fromJson}.
+     *
+     * To customise, edit the identifier table or deserialise your own policy
+     * from a .json file with:
+     *   Policy policy = Policy.fromJson(new String(Files.readAllBytes(path)));
+     *
+     * To add a Bulgarian names dictionary, add an "identifiers.dictionary"
+     * block pointing to a file of Bulgarian given names and surnames.
+     */
+    private Policy buildPolicy() throws Exception {
+        String[] identifierNames = {
+            "emailAddress", "phoneNumber", "ipAddress", "url", "creditCard",
+            "ssn", "ibanCode", "bankAccountNumber", "date", "age",
+            "macAddress", "bitcoinAddress", "vin", "zipCode", "person"
+        };
+        String redactStrategy =
+            "[{\"strategy\":\"REDACT\",\"redactionFormat\":\"{{{REDACTED-%t}}}\"}]}";
+        StringBuilder json = new StringBuilder("{");
+        json.append("\"name\": \"pii-all\",");
+        json.append("\"identifiers\": {");
+        for (int i = 0; i < identifierNames.length; i++) {
+            String name = identifierNames[i];
+            // Column padding (names aligned to 16 characters) is insignificant JSON whitespace
+            String pad = " ".repeat(Math.max(0, 16 - name.length()));
+            if (i > 0) {
+                json.append(',');
+            }
+            json.append('"').append(name).append("\":").append(pad)
+                .append("{\"").append(name).append("FilterStrategies\":").append(pad)
+                .append(redactStrategy);
+        }
+        json.append("}").append("}");
+        return Policy.fromJson(json.toString());
+    }
+    // -----------------------------------------------------------------------
+    // Inner result class
+    // -----------------------------------------------------------------------
+    /**
+     * Result object for one sentence's PII analysis. Fields are final and the
+     * collections are exposed as read-only views.
+     */
+    public static class SentencePIIScore {
+        private final String            sentence;
+        private final int               totalWords;
+        private final int               piiTokenCount;
+        /** PII coverage: piiTokenCount / totalWords in [0, 1]. */
+        private final double            piiCoverage;
+        /** Ordered list of PII type labels for each PII token found. */
+        private final List<String>      piiTypes;
+        /** Frequency of each PII type in this sentence. */
+        private final Map<String, Integer> typeFrequency;
+        /** Raw Phileas spans (character-level). */
+        private final List<Span>        spans;
+        /** Non-null if Phileas threw an exception for this sentence. */
+        private final String            errorMessage;
+        /**
+         * @param sentence      the analysed sentence
+         * @param totalWords    number of word tokens in the sentence
+         * @param piiTokenCount number of tokens overlapping a PII span
+         * @param piiCoverage   piiTokenCount / totalWords
+         * @param piiTypes      PII type label per PII token
+         * @param typeFrequency number of PII tokens per type label
+         * @param spans         raw Phileas spans; {@code null} is stored as an empty list
+         * @param errorMessage  failure message, or {@code null} when analysis succeeded
+         */
+        SentencePIIScore(String sentence, int totalWords, int piiTokenCount,
+                         double piiCoverage, List<String> piiTypes,
+                         Map<String, Integer> typeFrequency,
+                         List<Span> spans, String errorMessage) {
+            this.sentence      = sentence;
+            this.totalWords    = totalWords;
+            this.piiTokenCount = piiTokenCount;
+            this.piiCoverage   = piiCoverage;
+            this.piiTypes      = Collections.unmodifiableList(piiTypes);
+            this.typeFrequency = Collections.unmodifiableMap(typeFrequency);
+            this.spans         = spans != null
+                                 ? Collections.unmodifiableList(spans)
+                                 : List.of();
+            this.errorMessage  = errorMessage;
+        }
+        /** Zero-PII score for a sentence that was not sent to Phileas. */
+        static SentencePIIScore empty(String sentence, int totalWords) {
+            return new SentencePIIScore(sentence, totalWords, 0, 0.0,
+                    List.of(), Map.of(), List.of(), null);
+        }
+        /** Zero-PII score that records why analysis of the sentence failed. */
+        static SentencePIIScore error(String sentence, int totalWords, String msg) {
+            return new SentencePIIScore(sentence, totalWords, 0, 0.0,
+                    List.of(), Map.of(), List.of(), msg);
+        }
+        // --- Accessors ---
+        public String            getSentence()           { return sentence; }
+        public int               getTotalWords()         { return totalWords; }
+        public int               getPiiTokenCount()      { return piiTokenCount; }
+        /** PII coverage ratio in [0, 1]. */
+        public double            getPiiCoverage()        { return piiCoverage; }
+        /** PII coverage expressed as a percentage [0, 100]. */
+        public double            getPiiCoveragePercent() { return piiCoverage * 100.0; }
+        public List<String>      getPiiTypes()           { return piiTypes; }
+        public Map<String, Integer> getTypeFrequency()   { return typeFrequency; }
+        public List<Span>        getSpans()              { return spans; }
+        public boolean           hasPII()                { return piiTokenCount > 0; }
+        public boolean           hasError()              { return errorMessage != null; }
+        public String            getErrorMessage()       { return errorMessage; }
+        /** Number of distinct PII categories detected in this sentence. */
+        public int distinctPiiTypes() { return typeFrequency.size(); }
+        // --- TSV export ---
+        /**
+         * TSV row matching {@link #tsvHeader()}:
+         * sentence | totalWords | piiTokens | coverage | coverage% | distinctTypes | typeFrequency
+         *
+         * Tabs and line breaks inside the sentence are replaced by spaces so the
+         * row always stays on one line with a fixed column count. Numbers are
+         * formatted with {@code Locale.ROOT} so the decimal separator is a dot
+         * regardless of the JVM default locale.
+         */
+        public String toTsv() {
+            String oneLine = sentence.replace('\t', ' ')
+                                     .replace('\n', ' ')
+                                     .replace('\r', ' ');
+            return String.format(java.util.Locale.ROOT, "%s\t%d\t%d\t%.4f\t%.2f\t%d\t%s",
+                    oneLine,
+                    totalWords,
+                    piiTokenCount,
+                    piiCoverage,
+                    getPiiCoveragePercent(),
+                    distinctPiiTypes(),
+                    typeFrequency.toString());
+        }
+        /** Column names for {@link #toTsv()}, tab-separated. */
+        public static String tsvHeader() {
+            return "sentence\ttotalWords\tpiiTokens\tpiiCoverage\tpiiCoverage%\tdistinctPiiTypes\ttypeFrequency";
+        }
+        @Override
+        public String toString() {
+            return String.format("SentencePIIScore{words=%d, piiTokens=%d, coverage=%.1f%%, types=%s}",
+                    totalWords, piiTokenCount, getPiiCoveragePercent(), typeFrequency.keySet());
+        }
+    }
+}

java/bg/bas/dcl/LLMs/SentenceBiasScore.java ADDED Viewed

	@@ -0,0 +1,150 @@

+package bg.bas.dcl.LLMs;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+/**
+ * SentenceBiasScore
+ *
+ * Result holder for the bias analysis of one sentence: per-bias-type
+ * coverage values and counts, the matched lemmas, and summary counters.
+ * The maps and the list are exposed as read-only views.
+ */
+public class SentenceBiasScore {
+    /** Bias categories, in the column order used by {@link #coverageArray()} and {@link #toTsv()}. */
+    public static final String[] BIAS_TYPES = {
+        "gender", "race_ethnicity", "religion", "disability", "appearance"
+    };
+    private final String sentence;
+    private final int totalWords;
+    /** Coverage value per bias type. */
+    private final Map<String, Double> pairCoverage;
+    /** Signal count per bias type. */
+    private final Map<String, Integer> signalCount;
+    /** Evaluator count per bias type. */
+    private final Map<String, Integer> evaluatorCount;
+    /** All dictionary entries matched in this sentence (lemma strings). */
+    private final List<String> matchedLemmas;
+    /** Total matched bias words (evaluative, non-neutral). */
+    private final int totalBiasWords;
+    /** Count of matched derogatory terms. */
+    private final int totalDerogatory;
+    /** Count of matched colloquial terms. */
+    private final int totalColloquial;
+    private final boolean multiType;
+    /** Stores the given values; the collections are wrapped, not copied. */
+    SentenceBiasScore(String sentence,
+                      int totalWords,
+                      Map<String, Double>  pairCoverage,
+                      Map<String, Integer> signalCount,
+                      Map<String, Integer> evaluatorCount,
+                      List<String>         matchedLemmas,
+                      int totalBiasWords,
+                      int totalDerogatory,
+                      int totalColloquial,
+                      boolean multiType) {
+        this.sentence       = sentence;
+        this.totalWords     = totalWords;
+        this.pairCoverage   = Collections.unmodifiableMap(pairCoverage);
+        this.signalCount    = Collections.unmodifiableMap(signalCount);
+        this.evaluatorCount = Collections.unmodifiableMap(evaluatorCount);
+        this.matchedLemmas  = Collections.unmodifiableList(matchedLemmas);
+        this.totalBiasWords = totalBiasWords;
+        this.totalDerogatory= totalDerogatory;
+        this.totalColloquial= totalColloquial;
+        this.multiType      = multiType;
+    }
+    /**
+     * Coverage for one bias type.
+     *
+     * @param biasType bias type name (matched case-insensitively); {@code null}
+     *                 or blank returns {@link #totalCoverage()}
+     * @return the stored coverage, or 0.0 when the type has no entry
+     */
+    public double getPairCoverage(String biasType) {
+        if (biasType == null || biasType.isBlank()) return totalCoverage();
+        // Locale.ROOT keeps the key lookup independent of the JVM default locale
+        return pairCoverage.getOrDefault(biasType.toLowerCase(java.util.Locale.ROOT), 0.0);
+    }
+    /** Sum of the coverage values of all bias types. */
+    public double totalCoverage() {
+        double sum = 0;
+        for (double v : pairCoverage.values()) sum += v;
+        return sum;
+    }
+    /** Coverage values in {@link #BIAS_TYPES} order. */
+    public double[] coverageArray() {
+        double[] arr = new double[BIAS_TYPES.length];
+        for (int i = 0; i < BIAS_TYPES.length; i++)
+            arr[i] = getPairCoverage(BIAS_TYPES[i]);
+        return arr;
+    }
+    /** True if any bias type has a non-zero pair-coverage score. */
+    public boolean isBiased() {
+        for (double v : pairCoverage.values())
+            if (v > 0) return true;
+        return false;
+    }
+    public String      getSentence()                        { return sentence; }
+    public int         getTotalWords()                      { return totalWords; }
+    public int         getSignalCount(String type)          { return signalCount.getOrDefault(type, 0); }
+    public int         getEvaluatorCount(String type)       { return evaluatorCount.getOrDefault(type, 0); }
+    public List<String>getMatchedLemmas()                   { return matchedLemmas; }
+    public int         getTotalBiasWords()                  { return totalBiasWords; }
+    public int         getTotalDerogatory()                 { return totalDerogatory; }
+    public int         getTotalColloquial()                 { return totalColloquial; }
+    public boolean     isMultiType()                        { return multiType; }
+    /**
+     * TSV row matching {@link #tsvHeader()}.
+     *
+     * Tabs and line breaks inside the sentence are replaced by spaces so the
+     * column count is fixed, and decimals are formatted with {@code Locale.ROOT}
+     * so the separator is always a dot.
+     */
+    public String toTsv() {
+        StringBuilder sb = new StringBuilder();
+        String oneLine = sentence.replace('\t', ' ')
+                                 .replace('\n', ' ')
+                                 .replace('\r', ' ');
+        sb.append(oneLine).append('\t');
+        sb.append(totalWords).append('\t');
+        sb.append(matchedLemmas).append('\t');
+        for (String type : BIAS_TYPES) {
+            sb.append(signalCount.getOrDefault(type, 0)).append('\t');
+            sb.append(evaluatorCount.getOrDefault(type, 0)).append('\t');
+            sb.append(String.format(java.util.Locale.ROOT, "%.4f", getPairCoverage(type))).append('\t');
+        }
+        sb.append(totalBiasWords).append('\t');
+        sb.append(totalDerogatory).append('\t');
+        sb.append(totalColloquial).append('\t');
+        sb.append(multiType ? 1 : 0).append('\t');
+        sb.append(String.format(java.util.Locale.ROOT, "%.4f", totalCoverage()));
+        return sb.toString();
+    }
+    /** Column names for {@link #toTsv()}, tab-separated. */
+    public static String tsvHeader() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("sentence\ttotalWords\tmatchedLemmas\t");
+        for (String type : BIAS_TYPES)
+            sb.append(type).append("_signals\t")
+              .append(type).append("_evaluators\t")
+              .append(type).append("_coverage\t");
+        sb.append("totalBiasWords\ttotalDerogatory\ttotalColloquial\t")
+          .append("multiType\ttotalCoverage");
+        return sb.toString();
+    }
+    @Override
+    public String toString() {
+        return String.format("SentenceBiasScore{words=%d, coverage=%.3f, biased=%b, sentence='%s'}",
+                totalWords, totalCoverage(), isBiased(),
+                sentence.length() > 80 ? sentence.substring(0, 80) + "…" : sentence);
+    }
+}

resources/bulgarian_bias_dictionary_v4.tsv ADDED Viewed

The diff for this file is too large to render. See raw diff

resources/metadata_schema.json ADDED Viewed

	@@ -0,0 +1,267 @@

+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://dcl.bas.bg/ifgpt/metadata-schema/v1.0",
+  "title": "IfGPT Document Metadata Schema",
+  "description": "Metadata schema for textual documents in the IfGPT Bulgarian language dataset. 15 mandatory fields + 8 optional fields.",
+  "type": "object",
+  "required": [
+    "Identifier",
+    "Licence",
+    "PublicationDate",
+    "DocumentTitle",
+    "Source",
+    "Medium",
+    "Url",
+    "Domain",
+    "Keywords",
+    "NumberWords",
+    "NumberSentences",
+    "NumberParagraphs",
+    "NumberTokens",
+    "PersonallyIdentifiableInformation",
+    "BiasedInformation"
+  ],
+  "properties": {
+    "Identifier": {
+      "type": "string",
+      "description": "Unique document identifier with the language prefix 'bg'.",
+      "pattern": "^bg_",
+      "examples": ["bg_bnc_12345", "bg_MARCELL_doc001", "bg_CURLICAT_xyz"]
+    },
+    "Licence": {
+      "type": "string",
+      "description": "Licence name with classification by type (open, restricted, etc.).",
+      "enum": [
+        "CC0",
+        "CC0-1.0",
+        "CC-BY-4.0",
+        "CC-BY-SA-4.0",
+        "CC-BY-NC-4.0",
+        "CC-BY-NC-SA-4.0",
+        "Restricted",
+        "Proprietary",
+        "Unknown"
+      ]
+    },
+    "PublicationDate": {
+      "type": "string",
+      "description": "Date of publication of the text (yyyy-mm-dd).",
+      "pattern": "^(\\d{4}-\\d{2}-\\d{2})?$",
+      "examples": ["2023-04-15", "2019-01-01", ""]
+    },
+    "DocumentTitle": {
+      "type": "string",
+      "description": "Title of the document.",
+      "examples": ["Закон за защита на данните", "Статия за климатичните промени"]
+    },
+    "Source": {
+      "type": "string",
+      "description": "Publishing organisation, media outlet or institutional originator.",
+      "examples": ["Министерски съвет", "БНР", "Сега"]
+    },
+    "Medium": {
+      "type": "string",
+      "description": "Modality of the resource.",
+      "enum": ["textual", "multimodal"]
+    },
+    "Url": {
+      "type": "string",
+      "description": "Original web address of the document.",
+      "format": "uri",
+      "examples": ["https://www.lex.bg/laws/ldoc/123", ""]
+    },
+    "Domain": {
+      "type": "array",
+      "description": "Up to six subject areas from a controlled vocabulary.",
+      "maxItems": 6,
+      "items": {
+        "type": "string",
+        "enum": [
+          "Държавно управление",
+          "Право и законодателство",
+          "Икономика и финанси",
+          "Образование",
+          "Наука и технологии",
+          "Здравеопазване",
+          "Култура и изкуство",
+          "Спорт",
+          "Медии и журналистика",
+          "Общество и политика",
+          "Околна среда",
+          "Религия",
+          "История",
+          "Литература и художествена проза",
+          "Неформална комуникация",
+          "Друго"
+        ]
+      },
+      "examples": [["Държавно управление"], ["Медии и журналистика", "Общество и политика"]]
+    },
+    "Keywords": {
+      "type": "array",
+      "description": "Up to six free-text keywords characterising the content.",
+      "maxItems": 6,
+      "items": { "type": "string" },
+      "examples": [["климат", "законодателство", "ЕС"]]
+    },
+    "NumberWords": {
+      "type": "integer",
+      "description": "Total number of words (non-punctuation tokens).",
+      "minimum": 0
+    },
+    "NumberSentences": {
+      "type": "integer",
+      "description": "Total number of sentences.",
+      "minimum": 0
+    },
+    "NumberParagraphs": {
+      "type": "integer",
+      "description": "Total number of paragraphs.",
+      "minimum": 0
+    },
+    "NumberTokens": {
+      "type": "integer",
+      "description": "Total number of tokens (words + punctuation).",
+      "minimum": 0
+    },
+    "PersonallyIdentifiableInformation": {
+      "type": "array",
+      "description": "Per-sentence vector. Each entry is the proportion of tokens in that sentence flagged as personally identifiable information, in [0,1]. Length equals NumberSentences.",
+      "items": {
+        "type": "number",
+        "minimum": 0.0,
+        "maximum": 1.0
+      },
+      "examples": [[0.0, 0.0, 0.15, 0.0, 0.05]]
+    },
+    "BiasedInformation": {
+      "type": "array",
+      "description": "Per-sentence vector. Each entry is the proportion of tokens in that sentence flagged as potentially biased (signal-evaluator pair coverage), in [0,1]. Length equals NumberSentences.",
+      "items": {
+        "type": "number",
+        "minimum": 0.0,
+        "maximum": 1.0
+      },
+      "examples": [[0.0, 0.0, 0.0, 0.10, 0.0]]
+    },
+    "Author": {
+      "type": "array",
+      "description": "[Optional] Name(s) of the author(s).",
+      "items": { "type": "string" },
+      "examples": [["Иван Иванов"], ["Агенция БТА"]]
+    },
+    "Style": {
+      "type": "string",
+      "description": "[Optional] Stylistic register of the document.",
+      "enum": [
+        "Административен",
+        "Журналистически",
+        "Научен",
+        "Художествен",
+        "Разговорен",
+        "Правен",
+        "Технически",
+        "Неформален",
+        ""
+      ]
+    },
+    "Type": {
+      "type": "string",
+      "description": "[Optional] Document genre.",
+      "enum": [
+        "Закон",
+        "Наредба",
+        "Решение",
+        "Статия",
+        "Книга",
+        "Доклад",
+        "Интервю",
+        "Коментар",
+        "Форум",
+        "Блог",
+        "Уикипедия",
+        "Друго",
+        ""
+      ]
+    },
+    "Subdomain": {
+      "type": "array",
+      "description": "[Optional] Narrower thematic classification, hierarchically linked to Domain.",
+      "maxItems": 6,
+      "items": { "type": "string" },
+      "examples": [["Европейско законодателство"], ["Климатична политика"]]
+    },
+    "TranslatedDocument": {
+      "type": ["boolean", "string"],
+      "description": "[Optional] true = translation into Bulgarian; false = original Bulgarian text; an empty string means the value is not specified.",
+      "examples": [false, true, ""]
+    },
+    "CollectionDate": {
+      "type": "string",
+      "description": "[Optional] Date of acquisition into the collection (yyyy-mm-dd), or an empty string if not specified.",
+      "pattern": "^(\\d{4}-\\d{2}-\\d{2})?$",
+      "examples": ["2024-03-10", ""]
+    },
+    "LicenceLink": {
+      "type": "string",
+      "description": "[Optional] URL of the licence text, or an empty string if not specified.",
+      "format": "uri",
+      "examples": [
+        "https://creativecommons.org/public-domain/cc0/",
+        "https://elrc-share.eu/static/metashare/licences/CC0-1.0.pdf",
+        ""
+      ]
+    },
+    "TaskCategories": {
+      "type": "array",
+      "description": "[Optional] Anticipated NLP applications from a predefined list.",
+      "items": {
+        "type": "string",
+        "enum": [
+          "Language Modelling",
+          "Text Classification",
+          "Named Entity Recognition",
+          "Machine Translation",
+          "Summarisation",
+          "Question Answering",
+          "Sentiment Analysis",
+          "Bias Detection",
+          "PII Detection",
+          "Information Extraction",
+          "Coreference Resolution",
+          "Dependency Parsing",
+          "Other"
+        ]
+      },
+      "examples": [["Language Modelling", "Named Entity Recognition"]]
+    }
+  },
+  "additionalProperties": false
+}