// IfGPT-DataQualityComponents / java /bg /bas /dcl /LLMs /IfGPTDataset /BulNCWikiProcessor.java
// dcl-ibl-bas's picture
// Upload 22 files
// 18573e4 verified
package bg.bas.dcl.LLMs.IfGPTDataset;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Scanner;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import bg.bas.dcl.general.JSONProcessor;
import bg.bas.dcl.monolingual.bg.TextProcessor;
/**
 * Processes the BulNC "F-InformalFiction" (Wiki/Informal) subcorpus.
 *
 * <p>Reads a tab-separated metadata file and, for every row whose relative
 * path contains {@code F-InformalFiction}, copies the referenced text file to
 * the output directory as {@code bg_bnc_<id>.txt}, counts its paragraphs,
 * sentences, words and estimated tokens, and appends a descriptor to the
 * {@code "metadata"} array. The array is finally written to
 * {@code metadata.json} in the output directory. If a path to an existing
 * metadata JSON file is supplied and the file exists, the new descriptors are
 * appended to the entries loaded from it.
 */
public class BulNCWikiProcessor extends BaseSourceProcessor {

    private static final String CC0_LICENCE = "CC0";
    private static final String CC0_LICENCE_LINK =
            "https://creativecommons.org/public-domain/cc0/";

    /** Substring of the relative path that selects rows of this subcorpus. */
    private static final String SUBCORPUS_MARKER = "F-InformalFiction";

    // Zero-based column positions in the tab-separated metadata file.
    private static final int COL_ID = 0;
    private static final int COL_PATH = 1;
    private static final int COL_COLLECTION_DATE = 2;
    private static final int COL_AUTHOR = 4;
    private static final int COL_TITLE = 8;
    private static final int COL_PUBLICATION_DATE = 9;
    private static final int COL_URL = 12;
    private static final int COL_TRANSLATED = 13;
    private static final int COL_TYPE = 17;
    private static final int COL_DOMAIN = 19;
    private static final int COL_SUBDOMAIN = 21;

    private final String metaFilePath;
    private final String existingMetaJson; // may be null
    private final TextProcessor tp = new TextProcessor();

    /**
     * @param metaFilePath     path of the tab-separated metadata file to read
     * @param existingMetaJson path of a metadata JSON file to merge into; may
     *                         be {@code null}, in which case (or when the file
     *                         does not exist) a fresh metadata object is used
     */
    public BulNCWikiProcessor(String metaFilePath, String existingMetaJson) {
        this.metaFilePath = metaFilePath;
        this.existingMetaJson = existingMetaJson;
    }

    /**
     * Converts all F-InformalFiction documents listed in the metadata file.
     *
     * <p>Paths are built by plain concatenation ({@code indir + relativePath},
     * {@code outdir + fileName}), so the caller supplies any separators.
     * Any exception aborts the run: it is reported on {@code System.err} and
     * {@code metadata.json} is not written.
     *
     * @param indir  prefix prepended to each relative path from the metadata file
     * @param outdir prefix prepended to every output file name
     */
    @Override
    @SuppressWarnings("unchecked") // JSONArray is used without type parameters
    public void process(String indir, String outdir) {
        try {
            JSONObject json = loadOrCreateMetadata();
            JSONArray descrArray = (JSONArray) json.get("metadata");

            int newDocs = 0;
            long totalTokens = 0;

            try (Scanner metaRows = new Scanner(new File(metaFilePath), "UTF-8")) {
                while (metaRows.hasNextLine()) {
                    // limit -1 keeps trailing empty columns, so a row whose last
                    // fields are empty still has its full width.
                    String[] dat = metaRows.nextLine().split("\t", -1);
                    if (dat.length <= COL_PATH) {
                        System.err.println("[MALFORMED] metadata row with "
                                + dat.length + " column(s) skipped");
                        continue;
                    }

                    String relativePath = dat[COL_PATH];
                    System.out.println("Checking: " + relativePath);
                    if (!relativePath.contains(SUBCORPUS_MARKER)) {
                        continue;
                    }

                    String fname = indir + relativePath;
                    File f = new File(fname);
                    if (!f.exists()) {
                        System.err.println("[MISSING] " + fname);
                        continue;
                    }

                    String tfname = "bg_bnc_" + dat[COL_ID];
                    JSONObject fdescr = buildDescriptor(tfname, dat);
                    int nt = copyAndCount(f, outdir + tfname + ".txt", fdescr);

                    descrArray.add(fdescr);
                    newDocs++;
                    totalTokens += nt;
                }
            }

            System.out.println("New F-InformalFiction documents added: " + newDocs);
            System.out.println("Total tokens in new documents: " + totalTokens);
            System.out.println("Merged metadata total entries: " + descrArray.size());
            writeMetadata(json, outdir, "metadata.json");
        } catch (Exception e) {
            // Kept as report-and-return so callers see the same contract as before.
            System.err.println("BulNCWikiProcessor failed; metadata.json was not written.");
            e.printStackTrace();
        }
    }

    // -----------------------------------------------------------------------

    /**
     * Returns the metadata object to append to: the content of
     * {@code existingMetaJson} when that file exists, otherwise a new object.
     * The result always holds a {@link JSONArray} under {@code "metadata"}.
     */
    @SuppressWarnings("unchecked") // JSONObject is used without type parameters
    private JSONObject loadOrCreateMetadata() throws Exception {
        JSONObject json = null;
        if (existingMetaJson != null) {
            File existing = new File(existingMetaJson);
            if (existing.exists()) {
                json = new JSONProcessor().readJSON(existing);
            }
        }

        if (json == null) {
            json = new JSONObject();
            json.put("metadata", new JSONArray());
            return json;
        }

        Object entries = json.get("metadata");
        if (entries instanceof JSONArray) {
            System.out.println("Loaded existing metadata with "
                    + ((JSONArray) entries).size() + " entries.");
        } else {
            // An existing file without a usable array would otherwise fail with
            // a NullPointerException/ClassCastException further down.
            System.err.println("[WARN] no \"metadata\" array in " + existingMetaJson
                    + "; starting with an empty one.");
            json.put("metadata", new JSONArray());
        }
        return json;
    }

    /**
     * Builds the descriptor for one document from its metadata row.
     * Columns missing from the row are stored as empty strings.
     */
    @SuppressWarnings("unchecked") // JSONObject is used without type parameters
    private JSONObject buildDescriptor(String tfname, String[] dat) throws Exception {
        JSONObject fdescr = newBaseDescriptor(tfname);
        fdescr.put("Licence", CC0_LICENCE);
        fdescr.put("LicenceLink", CC0_LICENCE_LINK);
        // Dots in the date column are replaced by dashes.
        fdescr.put("PublicationDate", field(dat, COL_PUBLICATION_DATE).replace('.', '-'));
        fdescr.put("DocumentTitle", field(dat, COL_TITLE));
        fdescr.put("Author", field(dat, COL_AUTHOR));
        // NOTE(review): "Administrative" looks copied from another processor;
        // confirm the intended Style value for the informal-fiction subcorpus.
        fdescr.put("Style", "Administrative");
        fdescr.put("Type", field(dat, COL_TYPE));
        fdescr.put("Subdomain", field(dat, COL_SUBDOMAIN));
        fdescr.put("TranslatedDocument", field(dat, COL_TRANSLATED));
        fdescr.put("CollectionDate", field(dat, COL_COLLECTION_DATE));
        fdescr.put("Url", field(dat, COL_URL));
        fdescr.put("Domain", field(dat, COL_DOMAIN));
        return fdescr;
    }

    /**
     * Copies {@code source} line by line to {@code targetPath} (UTF-8, each
     * line terminated by {@code "\n"}) and stores the paragraph, sentence,
     * word and token counts in {@code fdescr}.
     *
     * @return the token count of the document
     */
    @SuppressWarnings("unchecked") // JSONObject is used without type parameters
    private int copyAndCount(File source, String targetPath, JSONObject fdescr)
            throws Exception {
        int nw = 0, ns = 0, np = 0, nt = 0;
        // try-with-resources closes both streams even if reading/writing fails.
        try (Scanner in = new Scanner(source, "UTF-8");
                Writer out = new OutputStreamWriter(
                        new FileOutputStream(targetPath), "UTF-8")) {
            while (in.hasNextLine()) {
                String text = in.nextLine();
                np++; // every input line counts as one paragraph
                out.write(text + "\n");
                for (String sent : tp.splitToSentences(text)) {
                    ns++;
                    // NOTE(review): split(" ") counts an empty sentence as one
                    // word and repeated spaces as extra words; left unchanged.
                    nw += sent.split(" ").length;
                    nt += estimateTokenCount(sent);
                }
            }
        }
        fdescr.put("NumberWords", nw);
        fdescr.put("NumberSentences", ns);
        fdescr.put("NumberParagraphs", np);
        fdescr.put("NumberTokens", nt);
        return nt;
    }

    /** Returns column {@code index} of {@code row}, or "" when the row is too short. */
    private static String field(String[] row, int index) {
        return index < row.length ? row[index] : "";
    }

    /**
     * Serialises {@code json} as UTF-8 to {@code outdir + filename}.
     */
    private void writeMetadata(JSONObject json, String outdir, String filename)
            throws Exception {
        String outMetaPath = outdir + filename;
        try (Writer outMeta = new OutputStreamWriter(
                new FileOutputStream(outMetaPath), "UTF-8")) {
            json.writeJSONString(outMeta);
        }
        System.out.println("Merged metadata written to: " + outMetaPath);
    }
}