File size: 6,667 Bytes
18573e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
package bg.bas.dcl.LLMs.IfGPTDataset;

/**
 * Driver class for ingesting a new batch of texts into the IfGPT dataset.
 *
 * <p>{@link #main(String[])} currently runs the BulNC pipeline via
 * {@code runBulNCPipeline()}. The pipelines for the other sources, the
 * stage-by-stage invocations and the utilities are kept in {@code main} as
 * commented-out recipes; enable one by uncommenting it and recompiling.
 *
 * <p>All locations are hard-coded absolute paths on the author's machine.
 *
 * <p>NOTE(review): the batch constants below live under
 * {@code WORK-DCL/IfGPT/} while the per-corpus output directories used by the
 * MARCELL, CURLICAT and BulNC-Wiki pipelines live under
 * {@code WORK-DCL/ifGPT/} (lower-case {@code i}). On a case-sensitive file
 * system these are different directories — confirm that this is intended.
 */
public class IfGPTDatasetProcessor {

    // -----------------------------------------------------------------------
    // Shared paths
    // -----------------------------------------------------------------------

    // New batch being ingested

    /** Directory holding the text files of the new batch. */
    static final String NEW_DATA_DIR    = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/data/";
    /** Directory for the new batch's metadata; the PII/bias reports are also written here. */
    static final String NEW_META_DIR    = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/metadata/";
    /** Sample directory handed to the pipeline (the cleaner learns common lines from it). */
    static final String SAMPLE_DIR      = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/sample/";
    /** File in which the learned blocklist is stored. */
    static final String BLOCKLIST_FILE  = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/blocklist.txt";
    /** TSV report produced by the duplicate-detection stage. */
    static final String DEDUP_REPORT    = "/home/ivelina/WORK-DCL/IfGPT/NEW_BATCH/dedup_report.tsv";

    // Shared resources

    /** BulNC description file passed to the BulNC source processors. */
    static final String BULNC_META_FILE = "/home/ivelina/SVN_CORPUS/BulNC/BulNC-description.txt";
    /** Bias dictionary (TSV) passed to the pipeline and to the bias lexicon. */
    static final String BIAS_DICT       = "/home/ivelina/WORK-DCL/WIKIPEDIA-BIAS/"
                                        + "bulgarian_bias_dictionary_v4.tsv";

    // -----------------------------------------------------------------------
    // Main
    // -----------------------------------------------------------------------

    /**
     * Program entry point. Runs the BulNC pipeline; every other mode is a
     * commented-out recipe that has to be enabled by editing this method.
     *
     * @param args command-line arguments; not read
     */
    public static void main(String[] args) {

        // ==================================================================
        // MODE A — FULL PIPELINE  (one call runs all 8 stages)
        // ==================================================================
        // Choose the source processor that matches the new batch format,
        // then call pipeline.run().

        // --- BulNC Mass Media batch ---
        runBulNCPipeline();

        // --- MARCELL batch ---
        // runMarcellPipeline();

        // --- CURLICAT batch ---
        // runCurlicatPipeline();

        // --- BulNC Wiki/InformalFiction batch ---
        // runBulNCWikiPipeline();


        // ==================================================================
        // MODE B — INDIVIDUAL STAGES
        // ==================================================================

        // --- 1. Extract only ---
        // new BulNCProcessor(BULNC_META_FILE).process(NEW_DATA_DIR, NEW_META_DIR);

        // --- 3. Clean only (learn + apply) ---
        // FileCleanProcessor fcp = new FileCleanProcessor(0.50);
        // fcp.learnFromSample(SAMPLE_DIR);
        // fcp.printTopCommonLines(30);
        // fcp.saveBlocklist(BLOCKLIST_FILE);
        // fcp.cleanDirectory(NEW_DATA_DIR, true);

        // --- 4. Deduplication only ---
        // DeduplicationProcessor dp = new DeduplicationProcessor(0.90, 5, 200);
        // dp.indexCorpus(IfGPTPipeline.FULL_DATA_DIR);
        // dp.detectDuplicates(NEW_DATA_DIR, DEDUP_REPORT);
        // dp.removeDuplicatesFromNewFolder(NEW_DATA_DIR, true); // optional

        // --- 5/6. PII + Bias annotation only (on already-split sentences) ---
        // bg.bas.dcl.LLMs.BulgarianSentenceSplitter splitter =
        //         new bg.bas.dcl.LLMs.BulgarianSentenceSplitter();
        // bg.bas.dcl.LLMs.PIIDetector pii = new bg.bas.dcl.LLMs.PIIDetector(splitter);
        // pii.analyseDirectory(NEW_DATA_DIR, NEW_META_DIR + "pii_report.tsv");
        //
        // bg.bas.dcl.LLMs.BiasLexicon lex =
        //         new bg.bas.dcl.LLMs.BiasLexicon(BIAS_DICT);
        // bg.bas.dcl.LLMs.BiasAnalyser bias =
        //         new bg.bas.dcl.LLMs.BiasAnalyser(lex, splitter);
        // bias.analyseDirectory(NEW_DATA_DIR, NEW_META_DIR + "bias_report.tsv");


        // ==================================================================
        // MODE C — UTILITIES
        // ==================================================================

        // Convert an existing metadata JSON to CSV
        // new MarcellProcessor().convertJsonToCSV(
        //         IfGPTPipeline.FULL_META_DIR + "metadata_BNC_mm.json");
    }

    // -----------------------------------------------------------------------
    // Pipeline factory methods (one per source type)
    // -----------------------------------------------------------------------

    /**
     * Configures and runs the pipeline for a BulNC batch.
     *
     * <p>Uses a {@code BulNCProcessor} built from {@link #BULNC_META_FILE} and
     * the shared NEW_BATCH paths. This is the only pipeline that sets the
     * boilerplate threshold (0.50), the dedup threshold (0.90), the
     * remove-duplicates flag (off) and the keep-backups flag (on) explicitly;
     * the others rely on whatever {@code IfGPTPipeline} defaults to.
     */
    private static void runBulNCPipeline() {
        new IfGPTPipeline()
            .setSourceProcessor(new BulNCProcessor(BULNC_META_FILE))
            .setNewDataDir(NEW_DATA_DIR)
            .setSampleDir(SAMPLE_DIR)
            .setNewMetaDir(NEW_META_DIR)
            .setBlocklistFile(BLOCKLIST_FILE)
            .setDedupReport(DEDUP_REPORT)
            .setBiasDictPath(BIAS_DICT)
            .setBoilerplateThreshold(0.50)
            .setDedupThreshold(0.90)
            .setRemoveDuplicates(false)   // set true to delete dup sentences
            .setKeepBackups(true)
            .run();
    }

    /**
     * Configures and runs the pipeline for a MARCELL batch.
     *
     * <p>Uses a no-arg {@code MarcellProcessor}, the MARCELL {@code texts/}
     * directory as the new-data directory, and explicitly leaves the clean and
     * dedup stages enabled ({@code setSkipClean(false)},
     * {@code setSkipDedup(false)}).
     *
     * <p>NOTE(review): the annotated MARCELL input directory
     * ({@code /home/ivelina/WORK-DCL/ifGPT/CORPORA/MARCELL/bg-annotated/}) is
     * not passed to the processor or the pipeline here — confirm how
     * {@code MarcellProcessor} locates its input.
     */
    private static void runMarcellPipeline() {
        String outdirMarcell = "/home/ivelina/WORK-DCL/ifGPT/CORPORA/MARCELL/texts/";

        new IfGPTPipeline()
            .setSourceProcessor(new MarcellProcessor())
            .setNewDataDir(outdirMarcell)
            .setSampleDir(SAMPLE_DIR)
            .setNewMetaDir(NEW_META_DIR)
            .setBlocklistFile(BLOCKLIST_FILE)
            .setDedupReport(DEDUP_REPORT)
            .setBiasDictPath(BIAS_DICT)
            .setSkipClean(false)
            .setSkipDedup(false)
            .run();
    }

    /**
     * Configures and runs the pipeline for a CURLICAT batch.
     *
     * <p>Uses a no-arg {@code CurlicatProcessor} and the CURLICAT
     * {@code texts/} directory as the new-data directory; everything else is
     * the shared NEW_BATCH configuration.
     *
     * <p>NOTE(review): the CURLICAT archive directory
     * ({@code /home/ivelina/WORK-DCL/ifGPT/CORPORA/CURLICAT/archive/Bulgarian_Curlicat_corpus/})
     * is not passed to the processor or the pipeline here — confirm how
     * {@code CurlicatProcessor} locates its input.
     */
    private static void runCurlicatPipeline() {
        String outdirCurlicat = "/home/ivelina/WORK-DCL/ifGPT/CORPORA/CURLICAT/texts/";

        new IfGPTPipeline()
            .setSourceProcessor(new CurlicatProcessor())
            .setNewDataDir(outdirCurlicat)
            .setSampleDir(SAMPLE_DIR)
            .setNewMetaDir(NEW_META_DIR)
            .setBlocklistFile(BLOCKLIST_FILE)
            .setDedupReport(DEDUP_REPORT)
            .setBiasDictPath(BIAS_DICT)
            .run();
    }

    /**
     * Configures and runs the pipeline for a BulNC Wiki/InformalFiction batch.
     *
     * <p>Uses a {@code BulNCWikiProcessor} built from {@link #BULNC_META_FILE}
     * and the existing {@code metadata_BNC_mm.json} under
     * {@code IfGPTPipeline.FULL_META_DIR}, with the BulNC {@code wiki-texts/}
     * directory as the new-data directory.
     */
    private static void runBulNCWikiPipeline() {
        String existingMeta = IfGPTPipeline.FULL_META_DIR + "metadata_BNC_mm.json";
        String outdirWiki   = "/home/ivelina/WORK-DCL/ifGPT/CORPORA/BulNC/wiki-texts/";

        new IfGPTPipeline()
            .setSourceProcessor(new BulNCWikiProcessor(BULNC_META_FILE, existingMeta))
            .setNewDataDir(outdirWiki)
            .setSampleDir(SAMPLE_DIR)
            .setNewMetaDir(NEW_META_DIR)
            .setBlocklistFile(BLOCKLIST_FILE)
            .setDedupReport(DEDUP_REPORT)
            .setBiasDictPath(BIAS_DICT)
            .run();
    }
}