// IfGPT-DataQualityComponents/java/bg/bas/dcl/LLMs/IfGPTDataset/IfGPTDatasetProcessor.java
// dcl-ibl-bas's picture
// Upload 22 files
// 18573e4 verified
package bg.bas.dcl.LLMs.IfGPTDataset;
/**
 * Command-line entry point for processing a new batch of texts for the IfGPT dataset.
 *
 * <p>{@link #main(String[])} configures an {@code IfGPTPipeline} for one source format and
 * runs it. There is no command-line switch: to process a different source (MARCELL, CURLICAT,
 * BulNC Wiki) or to run a single stage, uncomment the corresponding call in {@code main} and
 * recompile. Command-line arguments are ignored.
 *
 * <p>All locations are absolute paths hard-coded in the constants and factory methods below;
 * adjust them before running on another machine.
 *
 * <p>NOTE(review): the batch constants point under {@code .../WORK-DCL/IfGPT/...} while the
 * per-corpus output directories in the factory methods point under
 * {@code .../WORK-DCL/ifGPT/...} (different capitalisation). On a case-sensitive file system
 * these are different directories -- confirm that this is intended.
 */
public class IfGPTDatasetProcessor {

    // -----------------------------------------------------------------------
    // Shared paths
    // -----------------------------------------------------------------------

    // New batch being ingested

    /** Data directory of the new batch; passed to {@code setNewDataDir} by the BulNC pipeline. */
    static final String NEW_DATA_DIR = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/data/";

    /** Metadata directory of the new batch; passed to {@code setNewMetaDir} by every pipeline. */
    static final String NEW_META_DIR = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/metadata/";

    /** Sample directory; passed to {@code setSampleDir} by every pipeline. */
    static final String SAMPLE_DIR = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/sample/";

    /** Blocklist file; passed to {@code setBlocklistFile} by every pipeline. */
    static final String BLOCKLIST_FILE = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/blocklist.txt";

    /** Deduplication report file; passed to {@code setDedupReport} by every pipeline. */
    static final String DEDUP_REPORT = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/dedup_report.tsv";

    // Shared resources

    /** BulNC description file handed to the {@code BulNCProcessor} / {@code BulNCWikiProcessor} constructors. */
    static final String BULNC_META_FILE = "/home/ivelina/SVN_CORPUS/BulNC/BulNC-description.txt";

    /** Bias dictionary (TSV); passed to {@code setBiasDictPath} by every pipeline. */
    static final String BIAS_DICT = "/home/ivelina/WORK-DCL/WIKIPEDIA-BIAS/"
            + "bulgarian_bias_dictionary_v4.tsv";

    // -----------------------------------------------------------------------
    // Main
    // -----------------------------------------------------------------------

    /**
     * Runs the currently selected processing mode. As checked in, that is the BulNC Mass Media
     * full pipeline; every alternative is kept below as a commented-out template.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // ==================================================================
        // MODE A — FULL PIPELINE (one call runs all 8 stages)
        // ==================================================================
        // Choose the source processor that matches the new batch format,
        // then call pipeline.run().

        // --- BulNC Mass Media batch ---
        runBulNCPipeline();

        // --- MARCELL batch ---
        // runMarcellPipeline();

        // --- CURLICAT batch ---
        // runCurlicatPipeline();

        // --- BulNC Wiki/InformalFiction batch ---
        // runBulNCWikiPipeline();

        // ==================================================================
        // MODE B — INDIVIDUAL STAGES
        // ==================================================================

        // --- 1. Extract only ---
        // new BulNCProcessor(BULNC_META_FILE).process(NEW_DATA_DIR, NEW_META_DIR);

        // --- 3. Clean only (learn + apply) ---
        // FileCleanProcessor fcp = new FileCleanProcessor(0.50);
        // fcp.learnFromSample(SAMPLE_DIR);
        // fcp.printTopCommonLines(30);
        // fcp.saveBlocklist(BLOCKLIST_FILE);
        // fcp.cleanDirectory(NEW_DATA_DIR, true);

        // --- 4. Deduplication only ---
        // DeduplicationProcessor dp = new DeduplicationProcessor(0.90, 5, 200);
        // dp.indexCorpus(IfGPTPipeline.FULL_DATA_DIR);
        // dp.detectDuplicates(NEW_DATA_DIR, DEDUP_REPORT);
        // dp.removeDuplicatesFromNewFolder(NEW_DATA_DIR, true); // optional

        // --- 5/6. PII + Bias annotation only (on already-split sentences) ---
        // bg.bas.dcl.LLMs.BulgarianSentenceSplitter splitter =
        //         new bg.bas.dcl.LLMs.BulgarianSentenceSplitter();
        // bg.bas.dcl.LLMs.PIIDetector pii = new bg.bas.dcl.LLMs.PIIDetector(splitter);
        // pii.analyseDirectory(NEW_DATA_DIR, NEW_META_DIR + "pii_report.tsv");
        //
        // bg.bas.dcl.LLMs.BiasLexicon lex =
        //         new bg.bas.dcl.LLMs.BiasLexicon(BIAS_DICT);
        // bg.bas.dcl.LLMs.BiasAnalyser bias =
        //         new bg.bas.dcl.LLMs.BiasAnalyser(lex, splitter);
        // bias.analyseDirectory(NEW_DATA_DIR, NEW_META_DIR + "bias_report.tsv");

        // ==================================================================
        // MODE C — UTILITIES
        // ==================================================================

        // Convert an existing metadata JSON to CSV
        // new MarcellProcessor().convertJsonToCSV(
        //         IfGPTPipeline.FULL_META_DIR + "metadata_BNC_mm.json");
    }

    // -----------------------------------------------------------------------
    // Pipeline factory methods (one per source type)
    // -----------------------------------------------------------------------

    /**
     * Configures and runs the pipeline for a BulNC batch located in {@link #NEW_DATA_DIR}.
     * This is the only factory method that sets the thresholds, the remove-duplicates flag
     * and the keep-backups flag explicitly; the others leave them at whatever
     * {@code IfGPTPipeline} defaults to.
     */
    private static void runBulNCPipeline() {
        new IfGPTPipeline()
                .setSourceProcessor(new BulNCProcessor(BULNC_META_FILE))
                .setNewDataDir(NEW_DATA_DIR)
                .setSampleDir(SAMPLE_DIR)
                .setNewMetaDir(NEW_META_DIR)
                .setBlocklistFile(BLOCKLIST_FILE)
                .setDedupReport(DEDUP_REPORT)
                .setBiasDictPath(BIAS_DICT)
                .setBoilerplateThreshold(0.50)
                .setDedupThreshold(0.90)
                .setRemoveDuplicates(false) // set true to delete dup sentences
                .setKeepBackups(true)
                .run();
    }

    /**
     * Configures and runs the pipeline for a MARCELL batch, with the clean and dedup stages
     * explicitly not skipped.
     */
    private static void runMarcellPipeline() {
        // Annotated MARCELL input lives in
        //   /home/ivelina/WORK-DCL/ifGPT/CORPORA/MARCELL/bg-annotated/
        // NOTE(review): that directory was held in a local that was never read, so it is not
        // passed to MarcellProcessor or to the pipeline here -- confirm how the processor
        // locates its input.
        String outdirMarcell = "/home/ivelina/WORK-DCL/ifGPT/CORPORA/MARCELL/texts/";
        new IfGPTPipeline()
                .setSourceProcessor(new MarcellProcessor())
                .setNewDataDir(outdirMarcell)
                .setSampleDir(SAMPLE_DIR)
                .setNewMetaDir(NEW_META_DIR)
                .setBlocklistFile(BLOCKLIST_FILE)
                .setDedupReport(DEDUP_REPORT)
                .setBiasDictPath(BIAS_DICT)
                .setSkipClean(false)
                .setSkipDedup(false)
                .run();
    }

    /** Configures and runs the pipeline for a CURLICAT batch. */
    private static void runCurlicatPipeline() {
        // Archived CURLICAT input lives in
        //   /home/ivelina/WORK-DCL/ifGPT/CORPORA/CURLICAT/archive/Bulgarian_Curlicat_corpus/
        // NOTE(review): that directory was held in a local that was never read, so it is not
        // passed to CurlicatProcessor or to the pipeline here -- confirm how the processor
        // locates its input.
        String outdirCurlicat = "/home/ivelina/WORK-DCL/ifGPT/CORPORA/CURLICAT/texts/";
        new IfGPTPipeline()
                .setSourceProcessor(new CurlicatProcessor())
                .setNewDataDir(outdirCurlicat)
                .setSampleDir(SAMPLE_DIR)
                .setNewMetaDir(NEW_META_DIR)
                .setBlocklistFile(BLOCKLIST_FILE)
                .setDedupReport(DEDUP_REPORT)
                .setBiasDictPath(BIAS_DICT)
                .run();
    }

    /**
     * Configures and runs the pipeline for a BulNC Wiki/InformalFiction batch. The processor
     * receives the BulNC description file together with the existing
     * {@code metadata_BNC_mm.json} under {@code IfGPTPipeline.FULL_META_DIR}.
     */
    private static void runBulNCWikiPipeline() {
        String existingMeta = IfGPTPipeline.FULL_META_DIR + "metadata_BNC_mm.json";
        String outdirWiki = "/home/ivelina/WORK-DCL/ifGPT/CORPORA/BulNC/wiki-texts/";
        new IfGPTPipeline()
                .setSourceProcessor(new BulNCWikiProcessor(BULNC_META_FILE, existingMeta))
                .setNewDataDir(outdirWiki)
                .setSampleDir(SAMPLE_DIR)
                .setNewMetaDir(NEW_META_DIR)
                .setBlocklistFile(BLOCKLIST_FILE)
                .setDedupReport(DEDUP_REPORT)
                .setBiasDictPath(BIAS_DICT)
                .run();
    }
}