File size: 5,490 Bytes
18573e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
package bg.bas.dcl.LLMs.IfGPTDataset;

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Scanner;

import org.json.simple.JSONArray;
import org.json.simple.JSONObject;

import bg.bas.dcl.general.JSONProcessor;
import bg.bas.dcl.monolingual.bg.TextProcessor;

/**
 * Processes the BulNC "F-InformalFiction" (Wiki/Informal) subcorpus.
 *
 * <p>Reads a tab-separated metadata file, selects the rows whose relative path
 * contains {@code F-InformalFiction}, copies each referenced text file to the
 * output directory as {@code bg_bnc_<id>.txt}, and builds a JSON descriptor
 * (licence, bibliographic fields, word/sentence/paragraph/token counts) for it.
 * The descriptors are appended to an optional pre-existing metadata JSON and the
 * merged result is written to {@code metadata.json} in the output directory.
 */
public class BulNCWikiProcessor extends BaseSourceProcessor {

    private static final String CC0_LICENCE      = "CC0";
    private static final String CC0_LICENCE_LINK =
            "https://creativecommons.org/public-domain/cc0/";

    /** Substring of the relative path that marks a row as belonging to this subcorpus. */
    private static final String SUBCORPUS_MARKER = "F-InformalFiction";

    // Column indices of the tab-separated metadata file, named after the
    // descriptor field each column feeds.
    private static final int COL_ID               = 0;
    private static final int COL_RELATIVE_PATH    = 1;
    private static final int COL_COLLECTION_DATE  = 2;
    private static final int COL_AUTHOR           = 4;
    private static final int COL_TITLE            = 8;
    private static final int COL_PUBLICATION_DATE = 9;
    private static final int COL_URL              = 12;
    private static final int COL_TRANSLATED       = 13;
    private static final int COL_TYPE             = 17;
    private static final int COL_DOMAIN           = 19;
    private static final int COL_SUBDOMAIN        = 21;

    private final String metaFilePath;
    private final String existingMetaJson; // may be null
    private final TextProcessor tp = new TextProcessor();

    /**
     * @param metaFilePath     path of the tab-separated metadata file to read
     * @param existingMetaJson path of a metadata JSON to merge into; may be
     *                         {@code null} or point to a non-existent file, in
     *                         which case a fresh metadata object is started
     */
    public BulNCWikiProcessor(String metaFilePath, String existingMetaJson) {
        this.metaFilePath     = metaFilePath;
        this.existingMetaJson = existingMetaJson;
    }

    /**
     * Copies every F-InformalFiction document listed in the metadata file from
     * {@code indir} to {@code outdir} and writes the merged {@code metadata.json}.
     *
     * <p>Paths are built by plain string concatenation ({@code indir + relativePath},
     * {@code outdir + fileName}); no separator is inserted, so the caller must
     * supply directories that already end with one where needed.
     *
     * <p>Rows whose source file is missing, or that are too short to contain a
     * relative path, are reported on {@code System.err} and skipped. Any other
     * failure is printed and aborts the run without writing the metadata file.
     *
     * @param indir  prefix prepended to each row's relative path
     * @param outdir prefix prepended to each output file name
     */
    @Override
    @SuppressWarnings("unchecked") // org.json.simple containers are used as raw collections
    public void process(String indir, String outdir) {
        try {
            JSONObject json = loadOrCreateMetadata();
            JSONArray descrArray = (JSONArray) json.get("metadata");

            int newDocs = 0;
            long totalTokens = 0;

            try (Scanner sme = new Scanner(new File(metaFilePath), "UTF-8")) {
                while (sme.hasNextLine()) {
                    String line = sme.nextLine();
                    // Limit -1 keeps trailing empty columns; the default split drops
                    // them, which shortens the array whenever the last fields are blank.
                    String[] dat = line.split("\t", -1);

                    if (dat.length <= COL_RELATIVE_PATH) {
                        System.err.println("[MALFORMED] " + line);
                        continue;
                    }

                    String relativePath = dat[COL_RELATIVE_PATH];
                    System.out.println("Checking: " + relativePath);

                    if (!relativePath.contains(SUBCORPUS_MARKER)) {
                        continue;
                    }

                    String fname = indir + relativePath;
                    File f = new File(fname);
                    if (!f.exists()) {
                        System.err.println("[MISSING] " + fname);
                        continue;
                    }

                    String tfname = "bg_bnc_" + dat[COL_ID];
                    JSONObject fdescr = buildDescriptor(tfname, dat);

                    int nt = copyAndCount(f, outdir + tfname + ".txt", fdescr);

                    descrArray.add(fdescr);
                    newDocs++;
                    totalTokens += nt;
                }
            }

            System.out.println("New F-InformalFiction documents added: " + newDocs);
            System.out.println("Total tokens in new documents: " + totalTokens);
            System.out.println("Merged metadata total entries: " + descrArray.size());

            writeMetadata(json, outdir, "metadata.json");

        } catch (Exception e) {
            // process() declares no checked exceptions, so failures are reported here.
            e.printStackTrace();
        }
    }

    // -----------------------------------------------------------------------

    /**
     * Returns the metadata object to append to: the parsed existing JSON when
     * {@code existingMetaJson} names an existing file, otherwise a new object.
     * The returned object always holds a {@link JSONArray} under {@code "metadata"}.
     */
    @SuppressWarnings("unchecked")
    private JSONObject loadOrCreateMetadata() throws Exception {
        if (existingMetaJson != null) {
            File existing = new File(existingMetaJson);
            if (existing.exists()) {
                JSONObject json = new JSONProcessor().readJSON(existing);
                JSONArray entries = (JSONArray) json.get("metadata");
                if (entries == null) {
                    // Existing file has no "metadata" array yet; start one instead of failing.
                    entries = new JSONArray();
                    json.put("metadata", entries);
                }
                System.out.println("Loaded existing metadata with "
                        + entries.size() + " entries.");
                return json;
            }
        }

        JSONObject json = new JSONObject();
        json.put("metadata", new JSONArray());
        return json;
    }

    /**
     * Builds the descriptor for one metadata row: the base descriptor for
     * {@code tfname} plus the licence and the row's bibliographic columns.
     * Columns absent from the row are stored as empty strings.
     */
    @SuppressWarnings("unchecked")
    private JSONObject buildDescriptor(String tfname, String[] dat) {
        JSONObject fdescr = newBaseDescriptor(tfname);
        fdescr.put("Licence",            CC0_LICENCE);
        fdescr.put("LicenceLink",        CC0_LICENCE_LINK);
        fdescr.put("PublicationDate",    field(dat, COL_PUBLICATION_DATE).replace('.', '-'));
        fdescr.put("DocumentTitle",      field(dat, COL_TITLE));
        fdescr.put("Author",             field(dat, COL_AUTHOR));
        // NOTE(review): "Administrative" looks unexpected for an informal-fiction
        // subcorpus; kept unchanged — confirm the intended Style value.
        fdescr.put("Style",              "Administrative");
        fdescr.put("Type",               field(dat, COL_TYPE));
        fdescr.put("Subdomain",          field(dat, COL_SUBDOMAIN));
        fdescr.put("TranslatedDocument", field(dat, COL_TRANSLATED));
        fdescr.put("CollectionDate",     field(dat, COL_COLLECTION_DATE));
        fdescr.put("Url",                field(dat, COL_URL));
        fdescr.put("Domain",             field(dat, COL_DOMAIN));
        return fdescr;
    }

    /**
     * Copies {@code source} line by line to {@code targetPath} (UTF-8, each line
     * terminated by {@code "\n"}) while counting paragraphs (input lines),
     * sentences, space-separated words and estimated tokens, then stores the
     * four counts in {@code fdescr}.
     *
     * @return the token count accumulated for this document
     */
    @SuppressWarnings("unchecked")
    private int copyAndCount(File source, String targetPath, JSONObject fdescr)
            throws Exception {
        int nw = 0, ns = 0, np = 0, nt = 0;

        // try-with-resources closes both streams even if reading or writing fails.
        try (Scanner s = new Scanner(source, "UTF-8");
             Writer out = new OutputStreamWriter(
                     new FileOutputStream(targetPath), "UTF-8")) {
            while (s.hasNextLine()) {
                String text = s.nextLine();
                np++;

                out.write(text + "\n");

                for (String sent : tp.splitToSentences(text)) {
                    ns++;
                    nw += sent.split(" ").length;
                    nt += estimateTokenCount(sent);
                }
            }
        }

        fdescr.put("NumberWords",      nw);
        fdescr.put("NumberSentences",  ns);
        fdescr.put("NumberParagraphs", np);
        fdescr.put("NumberTokens",     nt);
        return nt;
    }

    /** Returns column {@code index} of {@code row}, or an empty string when the row is shorter. */
    private static String field(String[] row, int index) {
        return index < row.length ? row[index] : "";
    }

    /**
     * Serialises {@code json} as UTF-8 to {@code outdir + filename}.
     *
     * @throws Exception if the file cannot be opened or written
     */
    private void writeMetadata(JSONObject json, String outdir, String filename)
            throws Exception {
        String outMetaPath = outdir + filename;
        try (Writer outMeta = new OutputStreamWriter(
                new FileOutputStream(outMetaPath), "UTF-8")) {
            json.writeJSONString(outMeta);
        }

        System.out.println("Merged metadata written to: " + outMetaPath);
    }
}