package bg.bas.dcl.LLMs.IfGPTDataset; /** * IfGPTDatasetProcessor * */ public class IfGPTDatasetProcessor { // ----------------------------------------------------------------------- // Shared paths // ----------------------------------------------------------------------- // New batch being ingested static final String NEW_DATA_DIR = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/data/"; static final String NEW_META_DIR = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/metadata/"; static final String SAMPLE_DIR = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/sample/"; static final String BLOCKLIST_FILE = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/blocklist.txt"; static final String DEDUP_REPORT = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/dedup_report.tsv"; // Shared resources static final String BULNC_META_FILE = "/home/ivelina/SVN_CORPUS/BulNC/BulNC-description.txt"; static final String BIAS_DICT = "/home/ivelina/WORK-DCL/WIKIPEDIA-BIAS/" + "bulgarian_bias_dictionary_v4.tsv"; // ----------------------------------------------------------------------- // Main // ----------------------------------------------------------------------- public static void main(String[] args) { // ================================================================== // MODE A — FULL PIPELINE (one call runs all 8 stages) // ================================================================== // Choose the source processor that matches the new batch format, // then call pipeline.run(). // --- BulNC Mass Media batch --- runBulNCPipeline(); // --- MARCELL batch --- // runMarcellPipeline(); // --- CURLICAT batch --- // runCurlicatPipeline(); // --- BulNC Wiki/InformalFiction batch --- // runBulNCWikiPipeline(); // ================================================================== // MODE B — INDIVIDUAL STAGES // ================================================================== // --- 1. Extract only --- // new BulNCProcessor(BULNC_META_FILE).process(NEW_DATA_DIR, NEW_META_DIR); // --- 3. Clean only (learn + apply) --- // FileCleanProcessor fcp = new FileCleanProcessor(0.50); // fcp.learnFromSample(SAMPLE_DIR); // fcp.printTopCommonLines(30); // fcp.saveBlocklist(BLOCKLIST_FILE); // fcp.cleanDirectory(NEW_DATA_DIR, true); // --- 4. Deduplication only --- // DeduplicationProcessor dp = new DeduplicationProcessor(0.90, 5, 200); // dp.indexCorpus(IfGPTPipeline.FULL_DATA_DIR); // dp.detectDuplicates(NEW_DATA_DIR, DEDUP_REPORT); // dp.removeDuplicatesFromNewFolder(NEW_DATA_DIR, true); // optional // --- 5/6. PII + Bias annotation only (on already-split sentences) --- // bg.bas.dcl.LLMs.BulgarianSentenceSplitter splitter = // new bg.bas.dcl.LLMs.BulgarianSentenceSplitter(); // bg.bas.dcl.LLMs.PIIDetector pii = new bg.bas.dcl.LLMs.PIIDetector(splitter); // pii.analyseDirectory(NEW_DATA_DIR, NEW_META_DIR + "pii_report.tsv"); // // bg.bas.dcl.LLMs.BiasLexicon lex = // new bg.bas.dcl.LLMs.BiasLexicon(BIAS_DICT); // bg.bas.dcl.LLMs.BiasAnalyser bias = // new bg.bas.dcl.LLMs.BiasAnalyser(lex, splitter); // bias.analyseDirectory(NEW_DATA_DIR, NEW_META_DIR + "bias_report.tsv"); // ================================================================== // MODE C — UTILITIES // ================================================================== // Convert an existing metadata JSON to CSV // new MarcellProcessor().convertJsonToCSV( // IfGPTPipeline.FULL_META_DIR + "metadata_BNC_mm.json"); } // ----------------------------------------------------------------------- // Pipeline factory methods (one per source type) // ----------------------------------------------------------------------- private static void runBulNCPipeline() { new IfGPTPipeline() .setSourceProcessor(new BulNCProcessor(BULNC_META_FILE)) .setNewDataDir(NEW_DATA_DIR) .setSampleDir(SAMPLE_DIR) .setNewMetaDir(NEW_META_DIR) .setBlocklistFile(BLOCKLIST_FILE) .setDedupReport(DEDUP_REPORT) .setBiasDictPath(BIAS_DICT) .setBoilerplateThreshold(0.50) .setDedupThreshold(0.90) .setRemoveDuplicates(false) // set true to delete dup sentences .setKeepBackups(true) .run(); } private static void runMarcellPipeline() { String indirMarcell = "/home/ivelina/WORK-DCL/ifGPT/CORPORA/MARCELL/bg-annotated/"; String outdirMarcell= "/home/ivelina/WORK-DCL/ifGPT/CORPORA/MARCELL/texts/"; new IfGPTPipeline() .setSourceProcessor(new MarcellProcessor()) .setNewDataDir(outdirMarcell) .setSampleDir(SAMPLE_DIR) .setNewMetaDir(NEW_META_DIR) .setBlocklistFile(BLOCKLIST_FILE) .setDedupReport(DEDUP_REPORT) .setBiasDictPath(BIAS_DICT) .setSkipClean(false) .setSkipDedup(false) .run(); } private static void runCurlicatPipeline() { String indirCurlicat = "/home/ivelina/WORK-DCL/ifGPT/CORPORA/CURLICAT/archive/" + "Bulgarian_Curlicat_corpus/"; String outdirCurlicat= "/home/ivelina/WORK-DCL/ifGPT/CORPORA/CURLICAT/texts/"; new IfGPTPipeline() .setSourceProcessor(new CurlicatProcessor()) .setNewDataDir(outdirCurlicat) .setSampleDir(SAMPLE_DIR) .setNewMetaDir(NEW_META_DIR) .setBlocklistFile(BLOCKLIST_FILE) .setDedupReport(DEDUP_REPORT) .setBiasDictPath(BIAS_DICT) .run(); } private static void runBulNCWikiPipeline() { String existingMeta = IfGPTPipeline.FULL_META_DIR + "metadata_BNC_mm.json"; String outdirWiki = "/home/ivelina/WORK-DCL/ifGPT/CORPORA/BulNC/wiki-texts/"; new IfGPTPipeline() .setSourceProcessor(new BulNCWikiProcessor(BULNC_META_FILE, existingMeta)) .setNewDataDir(outdirWiki) .setSampleDir(SAMPLE_DIR) .setNewMetaDir(NEW_META_DIR) .setBlocklistFile(BLOCKLIST_FILE) .setDedupReport(DEDUP_REPORT) .setBiasDictPath(BIAS_DICT) .run(); } }