// Spaces: DCL-IBL / IfGPT-DataQualityComponents
// Running
// File size: 6,774 Bytes
// 18573e4

package bg.bas.dcl.LLMs.IfGPTDataset;

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Scanner;

import org.json.simple.JSONArray;
import org.json.simple.JSONObject;

import bg.bas.dcl.monolingual.bg.TextProcessor;

/**
 * Processes the Bulgarian National Corpus (BulNC) — general subcorpora.
 *
 * Unlike MARCELL/CURLICAT, BulNC metadata is supplied via an external
 * tab-separated description file (BulNC-description.txt) rather than
 * inline CoNLL-UP comments.  Plain-text source files are read directly.
 *
 * For every accepted row of the description file the source text is copied
 * to {@code outdir + "bg_bnc_" + <stem> + ".txt"} and a JSON descriptor with
 * the row's metadata and word/sentence/paragraph/token counts is collected.
 * All descriptors are finally written to {@code metadata_BNC_mm.json}
 * (plus a CSV rendering produced by {@code convertJsonToCSV}).
 *
 * Subcorpora included (controlled by {@link #isIncluded}):
 *   currently only C-MassMedia; override or edit {@link #isIncluded} to add
 *   A-Administrative, B-Science or D-Fiction.
 *
 * SETimes articles (URL column containing "setimes") are excluded
 * regardless of subcorpus.
 *
 * Licence rules:
 *   A-Administrative → CC0
 *   B-Science        → Restricted
 *   C-MassMedia      → Restricted
 *   D-Fiction        → Restricted
 *
 * Description file column indices (0-based):
 *   0  filename stem  |  1  relative path  |  2  collection date
 *   4  author         |  8  title          |  9  publication date
 *   12 url            |  13 translated     |  17 type
 *   19 domain         |  21 subdomain (optional)
 *
 * Paths are built by plain string concatenation, so {@code indir} and
 * {@code outdir} must already end with a file separator.
 */
public class BulNCProcessor extends BaseSourceProcessor {

    private static final String CC0_LICENCE      = "CC0";
    private static final String CC0_LICENCE_LINK =
            "https://creativecommons.org/public-domain/cc0/";
    private static final String RESTRICTED = "Restricted";

    private static final String ENCODING      = "UTF-8";
    private static final String METADATA_FILE = "metadata_BNC_mm.json";

    // Column indices of the tab-separated description file (0-based).
    private static final int COL_STEM             = 0;
    private static final int COL_PATH             = 1;
    private static final int COL_COLLECTION_DATE  = 2;
    private static final int COL_AUTHOR           = 4;
    private static final int COL_TITLE            = 8;
    private static final int COL_PUBLICATION_DATE = 9;
    private static final int COL_URL              = 12;
    private static final int COL_TRANSLATED       = 13;
    private static final int COL_TYPE             = 17;
    private static final int COL_DOMAIN           = 19;
    private static final int COL_SUBDOMAIN        = 21; // optional

    /** Number of columns a row needs so that every mandatory index exists. */
    private static final int MIN_COLUMNS = COL_DOMAIN + 1;

    private final String metaFilePath; // path to BulNC-description.txt
    private final TextProcessor tp = new TextProcessor();

    /**
     * @param metaFilePath absolute path to BulNC-description.txt
     */
    public BulNCProcessor(String metaFilePath) {
        this.metaFilePath = metaFilePath;
    }

    /**
     * Reads the description file row by row, converts every accepted
     * document and writes the collected metadata.
     *
     * Rows are skipped (with a message on stderr) when they have too few
     * columns or when the referenced source file does not exist. Any other
     * failure aborts the run: the exception is reported on stderr and no
     * metadata file is written.
     *
     * @param indir  root directory of the BulNC corpus (must end with a separator)
     * @param outdir output directory for .txt files and metadata (must end with a separator)
     */
    @Override
    @SuppressWarnings("unchecked") // json-simple containers are raw collections
    public void process(String indir, String outdir) {
        try {
            JSONArray descrArray = new JSONArray();

            try (Scanner sme = new Scanner(new File(metaFilePath), ENCODING)) {
                int lineNo = 0;
                while (sme.hasNextLine()) {
                    lineNo++;
                    // Limit -1 keeps trailing empty columns; the default split
                    // silently drops them and shortens the array.
                    String[] dat = sme.nextLine().split("\t", -1);

                    if (dat.length <= COL_PATH) {
                        System.err.println("[MALFORMED] description line " + lineNo
                                + ": no relative path column");
                        continue;
                    }

                    String relativePath = dat[COL_PATH];
                    System.out.println("Checking: " + relativePath);

                    // --- Subcorpus filter ---
                    if (!isIncluded(relativePath)) continue;

                    if (dat.length < MIN_COLUMNS) {
                        System.err.println("[MALFORMED] description line " + lineNo
                                + ": expected at least " + MIN_COLUMNS
                                + " columns, found " + dat.length);
                        continue;
                    }

                    // --- SETimes exclusion ---
                    if (dat[COL_URL].contains("setimes")) continue;

                    String fname = indir + relativePath;
                    File f = new File(fname);
                    if (!f.exists()) {
                        System.err.println("[MISSING] " + fname);
                        continue;
                    }

                    descrArray.add(describeDocument(dat, f, outdir));
                }
            }

            JSONObject json = new JSONObject();
            json.put("metadata", descrArray);

            System.out.println("Total documents processed: " + descrArray.size());
            writeMetadata(json, outdir, METADATA_FILE);

        } catch (Exception e) {
            System.err.println("[ERROR] BulNC processing aborted: " + e);
            e.printStackTrace();
        }
    }

    // -----------------------------------------------------------------------
    // Helpers
    // -----------------------------------------------------------------------

    /**
     * Returns true for subcorpora that should be processed.
     *
     * Only C-MassMedia is enabled. To process further subcorpora, extend the
     * returned expression with additional
     * {@code || relativePath.contains("A-Administrative/")}-style checks
     * (likewise for "B-Science/" and "D-Fiction/"), or override this method.
     *
     * @param relativePath relative path column of the description file
     */
    protected boolean isIncluded(String relativePath) {
        return relativePath.contains("C-MassMedia/");
    }

    /**
     * Builds the descriptor for one document and copies its text to
     * {@code outdir}.
     *
     * @param dat    columns of the description row (at least {@link #MIN_COLUMNS})
     * @param source existing plain-text source file
     * @param outdir output directory, ending with a separator
     * @return the filled descriptor, including the computed counts
     */
    @SuppressWarnings("unchecked") // JSONObject is a raw Map
    private JSONObject describeDocument(String[] dat, File source, String outdir)
            throws Exception {
        String relativePath = dat[COL_PATH];
        String tfname = "bg_bnc_" + dat[COL_STEM];

        JSONObject fdescr = newBaseDescriptor(tfname);
        applyLicence(fdescr, relativePath);

        fdescr.put("PublicationDate",    dat[COL_PUBLICATION_DATE].replace(".", "-"));
        fdescr.put("DocumentTitle",      dat[COL_TITLE]);
        fdescr.put("Author",             dat[COL_AUTHOR]);
        fdescr.put("Style",              styleFor(relativePath));
        fdescr.put("Type",               dat[COL_TYPE]);
        fdescr.put("Subdomain",          dat.length > COL_SUBDOMAIN ? dat[COL_SUBDOMAIN] : "");
        fdescr.put("TranslatedDocument", dat[COL_TRANSLATED]);
        fdescr.put("CollectionDate",     dat[COL_COLLECTION_DATE]);
        fdescr.put("Url",                dat[COL_URL]);
        fdescr.put("Domain",             dat[COL_DOMAIN]);

        int[] counts = copyAndCount(source, outdir + tfname + ".txt");

        fdescr.put("NumberWords",      counts[0]);
        fdescr.put("NumberSentences",  counts[1]);
        fdescr.put("NumberParagraphs", counts[2]);
        fdescr.put("NumberTokens",     counts[3]);

        return fdescr;
    }

    /**
     * Copies {@code source} line by line to {@code targetPath} (UTF-8, each
     * line terminated by "\n") and gathers size statistics on the way.
     *
     * Every input line counts as one paragraph; sentences come from
     * {@code TextProcessor.splitToSentences}; words are the pieces of a
     * sentence split on a single space; tokens come from
     * {@code estimateTokenCount}.
     *
     * @return {@code {words, sentences, paragraphs, tokens}}
     */
    private int[] copyAndCount(File source, String targetPath) throws Exception {
        int nw = 0, ns = 0, np = 0, nt = 0;

        // Both resources are closed even when reading or writing fails.
        try (Writer out = new OutputStreamWriter(
                     new FileOutputStream(targetPath), ENCODING);
             Scanner s = new Scanner(source, ENCODING)) {

            while (s.hasNextLine()) {
                String text = s.nextLine();
                np++;

                // No per-line flush: closing the writer flushes it once.
                out.write(text + "\n");

                for (String sent : tp.splitToSentences(text)) {
                    ns++;
                    nw += sent.split(" ").length;
                    nt += estimateTokenCount(sent);
                }
            }
        }

        return new int[] {nw, ns, np, nt};
    }

    /**
     * Maps a document's relative path to its "Style" label.
     *
     * Paths that match none of the B/C/D subcorpora fall back to
     * "Administrative", mirroring the fallback used for licences.
     *
     * NOTE(review): the non-administrative labels reuse the subcorpus
     * directory names — confirm against the Style vocabulary used by the
     * other corpus processors.
     */
    private static String styleFor(String relativePath) {
        if (relativePath.contains("B-Science/")) {
            return "Science";
        }
        if (relativePath.contains("C-MassMedia/")) {
            return "MassMedia";
        }
        if (relativePath.contains("D-Fiction/")) {
            return "Fiction";
        }
        return "Administrative";
    }

    /**
     * Sets "Licence" and "LicenceLink" on the descriptor: Restricted (with an
     * empty link) for B-Science, C-MassMedia and D-Fiction paths, CC0 for
     * everything else.
     */
    @SuppressWarnings("unchecked") // JSONObject is a raw Map
    private void applyLicence(JSONObject fdescr, String relativePath) {
        if (relativePath.contains("B-Science/")
                || relativePath.contains("C-MassMedia/")
                || relativePath.contains("D-Fiction/")) {
            fdescr.put("Licence",     RESTRICTED);
            fdescr.put("LicenceLink", "");
        } else {
            fdescr.put("Licence",     CC0_LICENCE);
            fdescr.put("LicenceLink", CC0_LICENCE_LINK);
        }
    }

    /**
     * Serialises {@code json} to {@code outdir + filename} as UTF-8 and then
     * hands it to {@code convertJsonToCSV}, targeting the same path with
     * "_CSV.csv" appended.
     *
     * @throws Exception if the JSON file cannot be written or the CSV
     *                   conversion fails
     */
    private void writeMetadata(JSONObject json, String outdir, String filename)
            throws Exception {
        String outMetaPath = outdir + filename;

        // Closed (and thereby flushed) even if serialisation throws.
        try (Writer outMeta = new OutputStreamWriter(
                new FileOutputStream(outMetaPath), ENCODING)) {
            json.writeJSONString(outMeta);
        }

        convertJsonToCSV(json, outMetaPath + "_CSV.csv");
        System.out.println("Metadata written to: " + outMetaPath);
    }
}