| package bg.bas.dcl.LLMs.IfGPTDataset; |
|
|
| import java.io.File; |
| import java.io.FileOutputStream; |
| import java.io.OutputStreamWriter; |
| import java.io.Writer; |
| import java.nio.charset.StandardCharsets; |
| import java.nio.file.Files; |
| import java.nio.file.StandardCopyOption; |
| import java.util.ArrayList; |
| import java.util.List; |
| import java.util.Properties; |
| import java.util.Scanner; |
|
|
| import org.json.simple.JSONArray; |
| import org.json.simple.JSONObject; |
|
|
| import bg.bas.dcl.LLMs.BiasAnalyser; |
| import bg.bas.dcl.LLMs.BiasLexicon; |
| import bg.bas.dcl.LLMs.BulgarianSentenceSplitter; |
| import bg.bas.dcl.LLMs.PIIDetector; |
| import bg.bas.dcl.LLMs.SentenceBiasScore; |
| import bg.bas.dcl.general.FileHandler; |
| import bg.bas.dcl.general.JSONProcessor; |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| @SuppressWarnings("unchecked") |
| public class IfGPTPipeline { |
|
|
| |
| |
| |
|
|
| public static final String FULL_DATA_DIR = |
| "/home/ivelina/WORK-DCL/IfGPT/IFGPT-DATASET-DATA/"; |
| public static final String FULL_META_DIR = |
| "/home/ivelina/WORK-DCL/IfGPT/IFGPT-DATASET-METADATA/"; |
|
|
| |
| |
| |
|
|
| private SourceProcessor sourceProcessor; |
| private String newDataDir; |
| private String sampleDir; |
| private String newMetaDir; |
| private String blocklistFile; |
| private String dedupReport; |
| private String biasDictPath; |
| private String openNlpModelPath = null; |
| private double boilerplateThreshold = 0.50; |
| private double dedupThreshold = 0.90; |
| private int dedupShingleSize = 5; |
| private int dedupNumHashes = 200; |
| private boolean removeDuplicates = false; |
| private boolean keepBackups = true; |
| private boolean skipClean = false; |
| private boolean skipDedup = false; |
| private boolean skipPii = false; |
| private boolean skipBias = false; |
|
|
| |
| |
| |
|
|
| public IfGPTPipeline setSourceProcessor(SourceProcessor p) { sourceProcessor = p; return this; } |
| public IfGPTPipeline setNewDataDir(String p) { newDataDir = p; return this; } |
| public IfGPTPipeline setSampleDir(String p) { sampleDir = p; return this; } |
| public IfGPTPipeline setNewMetaDir(String p) { newMetaDir = p; return this; } |
| public IfGPTPipeline setBlocklistFile(String p) { blocklistFile = p; return this; } |
| public IfGPTPipeline setDedupReport(String p) { dedupReport = p; return this; } |
| public IfGPTPipeline setBiasDictPath(String p) { biasDictPath = p; return this; } |
| public IfGPTPipeline setOpenNlpModelPath(String p) { openNlpModelPath = p; return this; } |
| public IfGPTPipeline setBoilerplateThreshold(double t) { boilerplateThreshold = t; return this; } |
| public IfGPTPipeline setDedupThreshold(double t) { dedupThreshold = t; return this; } |
| public IfGPTPipeline setDedupShingleSize(int n) { dedupShingleSize = n; return this; } |
| public IfGPTPipeline setDedupNumHashes(int n) { dedupNumHashes = n; return this; } |
| public IfGPTPipeline setRemoveDuplicates(boolean b) { removeDuplicates = b; return this; } |
| public IfGPTPipeline setKeepBackups(boolean b) { keepBackups = b; return this; } |
| public IfGPTPipeline setSkipClean(boolean b) { skipClean = b; return this; } |
| public IfGPTPipeline setSkipDedup(boolean b) { skipDedup = b; return this; } |
| public IfGPTPipeline setSkipPii(boolean b) { skipPii = b; return this; } |
| public IfGPTPipeline setSkipBias(boolean b) { skipBias = b; return this; } |
|
|
| |
| |
|
|
| |
| |
| |
| |
| public void run() { |
| validateConfig(); |
| ensureDirs(newMetaDir, FULL_DATA_DIR, FULL_META_DIR); |
|
|
| banner("STAGE 1 — SOURCE EXTRACTION"); |
| runExtraction(); |
|
|
| |
| BulgarianSentenceSplitter splitter = new BulgarianSentenceSplitter(openNlpModelPath); |
|
|
| banner("STAGE 2 — SENTENCE SPLITTING & INITIAL METADATA"); |
| runSentenceSplitting(splitter); |
|
|
| if (!skipClean) { |
| banner("STAGE 3 — BOILERPLATE CLEANING"); |
| runCleaning(); |
| } else { |
| log("STAGE 3 skipped (skipClean=true)"); |
| } |
|
|
| if (!skipDedup) { |
| banner("STAGE 4 — DEDUPLICATION"); |
| runDeduplication(); |
| } else { |
| log("STAGE 4 skipped (skipDedup=true)"); |
| } |
|
|
| PIIDetector piiDetector = skipPii ? null : new PIIDetector(splitter); |
| BiasAnalyser biasAnalyser = skipBias ? null : buildBiasAnalyser(splitter); |
|
|
| banner("STAGES 5-7 — PII, BIAS & FINAL COUNTS"); |
| runAnnotationAndCounts(splitter, piiDetector, biasAnalyser); |
|
|
| banner("STAGE 8 — PERSIST TO FULL CORPUS"); |
| runPersist(); |
|
|
| banner("PIPELINE COMPLETE"); |
| } |
|
|
| |
| |
| |
|
|
| private void runExtraction() { |
| |
| |
| sourceProcessor.process(newDataDir, newMetaDir); |
| log("Extraction complete → " + newDataDir); |
| } |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| private void runSentenceSplitting(BulgarianSentenceSplitter splitter) { |
| try { |
| FileHandler fh = new FileHandler(); |
| int docs = 0; |
|
|
| for (File txtFile : fh.getFileListing(new File(newDataDir))) { |
| if (!txtFile.isFile() || !txtFile.getName().endsWith(".txt")) continue; |
|
|
| |
| StringBuilder sb = new StringBuilder(); |
| try (Scanner sc = new Scanner(txtFile, StandardCharsets.UTF_8)) { |
| while (sc.hasNextLine()) sb.append(sc.nextLine()).append('\n'); |
| } |
| String text = sb.toString().trim(); |
|
|
| |
| String[] sentences = splitter.split(text); |
| File sentFile = new File(newDataDir, txtFile.getName() |
| .replace(".txt", ".sentences")); |
|
|
| try (Writer w = new OutputStreamWriter( |
| new FileOutputStream(sentFile), StandardCharsets.UTF_8)) { |
| for (String sent : sentences) { |
| if (!sent.isBlank()) { |
| w.write(sent.trim()); |
| w.write('\n'); |
| } |
| } |
| } |
| docs++; |
| } |
| log("Sentence splitting complete. Documents: " + docs); |
| } catch (Exception e) { |
| e.printStackTrace(); |
| } |
| } |
|
|
| |
| |
| |
|
|
| private void runCleaning() { |
| FileCleanProcessor fcp = new FileCleanProcessor(boilerplateThreshold); |
|
|
| |
| fcp.learnFromSample(sampleDir); |
| fcp.printTopCommonLines(20); |
|
|
| |
| if (blocklistFile != null && !blocklistFile.isBlank()) { |
| fcp.saveBlocklist(blocklistFile); |
| } |
|
|
| |
| fcp.cleanDirectory(newDataDir, keepBackups); |
| log("Boilerplate cleaning complete → " + newDataDir); |
| } |
|
|
| |
| |
| |
|
|
| private void runDeduplication() { |
| DeduplicationProcessor dp = new DeduplicationProcessor( |
| dedupThreshold, dedupShingleSize, dedupNumHashes); |
|
|
| |
| log("Indexing full corpus for deduplication…"); |
| dp.indexCorpus(FULL_DATA_DIR); |
| log("Corpus indexed. Sentences: " + dp.getCorpusSize()); |
|
|
| |
| String report = dedupReport != null |
| ? dedupReport |
| : newMetaDir + "dedup_report.tsv"; |
| dp.detectDuplicates(newDataDir, report); |
|
|
| if (removeDuplicates) { |
| dp.removeDuplicatesFromNewFolder(newDataDir, keepBackups); |
| } |
| } |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| private void runAnnotationAndCounts(BulgarianSentenceSplitter splitter, |
| PIIDetector piiDetector, |
| BiasAnalyser biasAnalyser) { |
| try { |
| FileHandler fh = new FileHandler(); |
| JSONProcessor jp = new JSONProcessor(); |
| int docs = 0, errors = 0; |
|
|
| for (File sentFile : fh.getFileListing(new File(newDataDir))) { |
| if (!sentFile.isFile() |
| || !sentFile.getName().endsWith(".sentences")) continue; |
|
|
| String stem = sentFile.getName().replace(".sentences", ""); |
|
|
| |
| List<String> sentences = new ArrayList<>(); |
| try (Scanner sc = new Scanner(sentFile, StandardCharsets.UTF_8)) { |
| while (sc.hasNextLine()) { |
| String s = sc.nextLine().trim(); |
| if (!s.isBlank()) sentences.add(s); |
| } |
| } |
|
|
| if (sentences.isEmpty()) { |
| log("[WARN] No sentences for: " + stem); |
| errors++; |
| continue; |
| } |
|
|
| |
| DocumentMetadata meta = loadOrCreateMetadata(jp, stem); |
|
|
| |
| List<Double> piiVec = new ArrayList<>(); |
| if (piiDetector != null) { |
| int sentIdx = 0; |
| for (String sent : sentences) { |
| PIIDetector.SentencePIIScore score = |
| piiDetector.analyseSentence(sent, stem + "-" + sentIdx++); |
| piiVec.add(score.getPiiCoverage()); |
| } |
| } |
| meta.setPiiVector(piiVec); |
|
|
| |
| List<Double> biasVec = new ArrayList<>(); |
| if (biasAnalyser != null) { |
| for (String sent : sentences) { |
| SentenceBiasScore score = biasAnalyser.analyseSentence(sent); |
| biasVec.add(score.totalCoverage()); |
| } |
| } |
| meta.setBiasVector(biasVec); |
|
|
| |
| int nSentences = sentences.size(); |
| int nWords = 0; |
| int nTokens = 0; |
|
|
| for (String sent : sentences) { |
| String[] toks = sent.split("\\s+"); |
| nWords += toks.length; |
| |
| nTokens += toks.length + sent.length() |
| - sent.replaceAll("[.,;:!?()\\-]", "").length(); |
| } |
|
|
| |
| int nParagraphs = countParagraphs(new File(newDataDir, stem + ".txt")); |
|
|
| meta.setNumberSentences(nSentences) |
| .setNumberWords(nWords) |
| .setNumberTokens(nTokens) |
| .setNumberParagraphs(nParagraphs); |
|
|
| |
| writeMetadata(meta, newMetaDir + stem + "_meta.json"); |
| docs++; |
| } |
|
|
| log("Annotation & counts complete. Documents: " + docs |
| + " Errors: " + errors); |
|
|
| } catch (Exception e) { |
| e.printStackTrace(); |
| } |
| } |
|
|
| |
| |
| |
|
|
| |
| |
| private void runPersist() { |
| try { |
| FileHandler fh = new FileHandler(); |
| int dataCopied = 0, metaCopied = 0; |
|
|
| |
| for (File f : fh.getFileListing(new File(newDataDir))) { |
| if (!f.isFile() || !f.getName().endsWith(".txt")) continue; |
| File dest = new File(FULL_DATA_DIR, f.getName()); |
| if (!dest.exists()) { |
| Files.copy(f.toPath(), dest.toPath(), |
| StandardCopyOption.REPLACE_EXISTING); |
| dataCopied++; |
| } |
| } |
|
|
| |
| for (File f : fh.getFileListing(new File(newMetaDir))) { |
| if (!f.isFile() || !f.getName().endsWith("_meta.json")) continue; |
| File dest = new File(FULL_META_DIR, f.getName()); |
| if (!dest.exists()) { |
| Files.copy(f.toPath(), dest.toPath(), |
| StandardCopyOption.REPLACE_EXISTING); |
| metaCopied++; |
| } |
| } |
|
|
| log("Persist complete. Text files copied: " + dataCopied |
| + " Metadata files copied: " + metaCopied); |
| log("FULL_DATA_DIR : " + FULL_DATA_DIR); |
| log("FULL_META_DIR : " + FULL_META_DIR); |
|
|
| } catch (Exception e) { |
| e.printStackTrace(); |
| } |
| } |
|
|
| |
| |
| |
|
|
| private DocumentMetadata loadOrCreateMetadata(JSONProcessor jp, String stem) { |
| |
| |
| String[] candidates = { |
| newMetaDir + stem + "_meta.json", |
| newMetaDir + stem + ".json" |
| }; |
| for (String path : candidates) { |
| File f = new File(path); |
| if (f.exists()) { |
| try { |
| JSONObject raw = jp.readJSON(f); |
| |
| if (raw.containsKey("Identifier")) { |
| return DocumentMetadata.fromJson(raw); |
| } else { |
| DocumentMetadata m = new DocumentMetadata(stem); |
| m.mergeLegacy(raw); |
| return m; |
| } |
| } catch (Exception e) { |
| log("[WARN] Could not parse metadata JSON for " + stem + ": " + e.getMessage()); |
| } |
| } |
| } |
| |
| return new DocumentMetadata(stem); |
| } |
|
|
| private void writeMetadata(DocumentMetadata meta, String outPath) throws Exception { |
| JSONObject json = meta.toJson(); |
| try (Writer w = new OutputStreamWriter( |
| new FileOutputStream(outPath), StandardCharsets.UTF_8)) { |
| json.writeJSONString(w); |
| } |
| } |
|
|
| private int countParagraphs(File txtFile) { |
| if (!txtFile.exists()) return 0; |
| int count = 0; |
| boolean inPara = false; |
| try (Scanner sc = new Scanner(txtFile, StandardCharsets.UTF_8)) { |
| while (sc.hasNextLine()) { |
| String line = sc.nextLine(); |
| if (line.isBlank()) { |
| inPara = false; |
| } else { |
| if (!inPara) { count++; inPara = true; } |
| } |
| } |
| } catch (Exception e) { } |
| return Math.max(count, 1); |
| } |
|
|
| private BiasAnalyser buildBiasAnalyser(BulgarianSentenceSplitter splitter) { |
| if (biasDictPath == null || biasDictPath.isBlank()) { |
| log("[WARN] No bias dictionary path set — bias scoring disabled."); |
| return null; |
| } |
| BiasLexicon lexicon = new BiasLexicon(biasDictPath); |
| return new BiasAnalyser(lexicon, splitter); |
| } |
|
|
| private void validateConfig() { |
| List<String> missing = new ArrayList<>(); |
| if (sourceProcessor == null) missing.add("sourceProcessor"); |
| if (newDataDir == null || newDataDir.isBlank()) missing.add("newDataDir"); |
| if (sampleDir == null || sampleDir.isBlank()) missing.add("sampleDir"); |
| if (newMetaDir == null || newMetaDir.isBlank()) missing.add("newMetaDir"); |
| if (!missing.isEmpty()) |
| throw new IllegalStateException( |
| "Pipeline configuration missing: " + missing); |
| } |
|
|
| private void ensureDirs(String... paths) { |
| for (String p : paths) { |
| if (p != null) new File(p).mkdirs(); |
| } |
| } |
|
|
| private void banner(String msg) { |
| System.out.println("\n" + "=".repeat(60)); |
| System.out.println(" " + msg); |
| System.out.println("=".repeat(60)); |
| } |
|
|
| private void log(String msg) { |
| System.out.println("[Pipeline] " + msg); |
| } |
| } |
|
|