/*
 * Spaces:
 *
 * DCL-IBL
 * /
 *
 * IfGPT-DataQualityComponents
 *
 * Running
 *
 * File size: 18,152 Bytes
 *
 * 18573e4
 */

package bg.bas.dcl.LLMs.IfGPTDataset;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import org.json.simple.JSONArray;
import org.json.simple.JSONObject;

/**
 * DocumentMetadata
 *
 * Canonical in-memory representation of the ifGPT dataset metadata schema.
 *
 * <p>Invariants maintained by this class:
 * <ul>
 *   <li>String fields are never {@code null}; a {@code null} passed to a setter
 *       or to the constructor is stored as the empty string.</li>
 *   <li>List fields are never {@code null} and are always mutable lists owned
 *       by this instance; setters copy their argument, getters return
 *       unmodifiable views.</li>
 *   <li>{@code translatedDocument} is the only field where {@code null} is a
 *       meaningful value ("unknown").</li>
 * </ul>
 */
public class DocumentMetadata {

    // -----------------------------------------------------------------------
    // ── MANDATORY (15) ──────────────────────────────────────────────────────
    // -----------------------------------------------------------------------

    /** Unique document identifier with the language prefix "bg". */
    private String       identifier        = "";

    /** Licence name (open, restricted, …). */
    private String       licence           = "";

    /** Publication date yyyy-mm-dd. */
    private String       publicationDate   = "";

    /** Title of the document. */
    private String       documentTitle     = "";

    /** Publishing organisation / media outlet / institutional originator. */
    private String       source            = "";

    /** Modality: "textual" | "multimodal". */
    private String       medium            = "textual";

    /** Original web address. */
    private String       url               = "";

    /** Up to six subject-area labels from a controlled vocabulary. */
    private List<String> domain            = new ArrayList<>();

    /** Up to six free-text keywords. */
    private List<String> keywords          = new ArrayList<>();

    /** Total word count (non-punctuation tokens). */
    private int          numberWords       = 0;

    /** Total sentence count. */
    private int          numberSentences   = 0;

    /** Total paragraph count. */
    private int          numberParagraphs  = 0;

    /** Total token count (words + punctuation). */
    private int          numberTokens      = 0;

    /**
     * Per-sentence PII coverage vector.
     * Entry i = proportion of tokens in sentence i flagged as PII ∈ [0,1].
     * Length == numberSentences after pipeline completion.
     */
    private List<Double> piiVector         = new ArrayList<>();

    /**
     * Per-sentence bias coverage vector.
     * Entry i = proportion of tokens in sentence i flagged as biased ∈ [0,1].
     * Length == numberSentences after pipeline completion.
     */
    private List<Double> biasVector        = new ArrayList<>();

    // -----------------------------------------------------------------------
    // ── OPTIONAL (8) ────────────────────────────────────────────────────────
    // -----------------------------------------------------------------------

    /** Name(s) of the author(s). */
    private List<String> author            = new ArrayList<>();

    /** Stylistic register: legal | journalistic | administrative | … */
    private String       style             = "";

    /** Document genre: book | document | article | … */
    private String       type              = "";

    /** Narrower thematic classification, hierarchically linked to Domain. */
    private List<String> subdomain         = new ArrayList<>();

    /** true = translation, false = original Bulgarian text. */
    private Boolean      translatedDocument = null;  // null = unknown

    /** Date of acquisition yyyy-mm-dd. */
    private String       collectionDate    = "";

    /** URL of the licence text. */
    private String       licenceLink       = "";

    /** Anticipated NLP applications from a predefined list. */
    private List<String> taskCategories    = new ArrayList<>();

    // -----------------------------------------------------------------------
    // Constructor
    // -----------------------------------------------------------------------

    /** Creates a record with every field at its default value. */
    public DocumentMetadata() {}

    /**
     * Creates a record with the given identifier and defaults elsewhere.
     *
     * @param identifier document identifier; {@code null} is stored as ""
     */
    public DocumentMetadata(String identifier) {
        this.identifier = orEmpty(identifier);
    }

    // -----------------------------------------------------------------------
    // Fluent setters — mandatory
    //
    // String setters store "" for null so that later isBlank() checks cannot
    // throw. List setters take a mutable copy so that the record never aliases
    // caller state and add*() keeps working when the caller passed an
    // immutable or fixed-size list (List.of, Arrays.asList, …).
    // -----------------------------------------------------------------------

    public DocumentMetadata setIdentifier(String v)       { identifier       = orEmpty(v); return this; }
    public DocumentMetadata setLicence(String v)          { licence          = orEmpty(v); return this; }
    public DocumentMetadata setPublicationDate(String v)  { publicationDate  = orEmpty(v); return this; }
    public DocumentMetadata setDocumentTitle(String v)    { documentTitle    = orEmpty(v); return this; }
    public DocumentMetadata setSource(String v)           { source           = orEmpty(v); return this; }
    public DocumentMetadata setMedium(String v)           { medium           = orEmpty(v); return this; }
    public DocumentMetadata setUrl(String v)              { url              = orEmpty(v); return this; }
    public DocumentMetadata setDomain(List<String> v)     { domain           = mutableCopy(v); return this; }
    public DocumentMetadata addDomain(String v)           { domain.add(v); return this; }
    public DocumentMetadata setKeywords(List<String> v)   { keywords         = mutableCopy(v); return this; }
    public DocumentMetadata addKeyword(String v)          { keywords.add(v); return this; }
    public DocumentMetadata setNumberWords(int v)         { numberWords      = v; return this; }
    public DocumentMetadata setNumberSentences(int v)     { numberSentences  = v; return this; }
    public DocumentMetadata setNumberParagraphs(int v)    { numberParagraphs = v; return this; }
    public DocumentMetadata setNumberTokens(int v)        { numberTokens     = v; return this; }
    public DocumentMetadata setPiiVector(List<Double> v)  { piiVector        = mutableCopy(v); return this; }
    public DocumentMetadata setBiasVector(List<Double> v) { biasVector       = mutableCopy(v); return this; }

    // Fluent setters — optional
    public DocumentMetadata setAuthor(List<String> v)          { author            = mutableCopy(v); return this; }
    public DocumentMetadata addAuthor(String v)                { author.add(v); return this; }
    public DocumentMetadata setStyle(String v)                 { style             = orEmpty(v); return this; }
    public DocumentMetadata setType(String v)                  { type              = orEmpty(v); return this; }
    public DocumentMetadata setSubdomain(List<String> v)       { subdomain         = mutableCopy(v); return this; }
    public DocumentMetadata addSubdomain(String v)             { subdomain.add(v); return this; }
    public DocumentMetadata setTranslatedDocument(Boolean v)   { translatedDocument= v; return this; }
    public DocumentMetadata setCollectionDate(String v)        { collectionDate    = orEmpty(v); return this; }
    public DocumentMetadata setLicenceLink(String v)           { licenceLink       = orEmpty(v); return this; }
    public DocumentMetadata setTaskCategories(List<String> v)  { taskCategories    = mutableCopy(v); return this; }
    public DocumentMetadata addTaskCategory(String v)          { taskCategories.add(v); return this; }

    // -----------------------------------------------------------------------
    // Getters (list getters return unmodifiable views of the internal lists)
    // -----------------------------------------------------------------------

    public String       getIdentifier()        { return identifier; }
    public String       getLicence()           { return licence; }
    public String       getPublicationDate()   { return publicationDate; }
    public String       getDocumentTitle()     { return documentTitle; }
    public String       getSource()            { return source; }
    public String       getMedium()            { return medium; }
    public String       getUrl()               { return url; }
    public List<String> getDomain()            { return Collections.unmodifiableList(domain); }
    public List<String> getKeywords()          { return Collections.unmodifiableList(keywords); }
    public int          getNumberWords()       { return numberWords; }
    public int          getNumberSentences()   { return numberSentences; }
    public int          getNumberParagraphs()  { return numberParagraphs; }
    public int          getNumberTokens()      { return numberTokens; }
    public List<Double> getPiiVector()         { return Collections.unmodifiableList(piiVector); }
    public List<Double> getBiasVector()        { return Collections.unmodifiableList(biasVector); }

    public List<String> getAuthor()            { return Collections.unmodifiableList(author); }
    public String       getStyle()             { return style; }
    public String       getType()              { return type; }
    public List<String> getSubdomain()         { return Collections.unmodifiableList(subdomain); }
    /** @return {@code Boolean.TRUE}/{@code FALSE}, or {@code null} when unknown */
    public Boolean      getTranslatedDocument(){ return translatedDocument; }
    public String       getCollectionDate()    { return collectionDate; }
    public String       getLicenceLink()       { return licenceLink; }
    public List<String> getTaskCategories()    { return Collections.unmodifiableList(taskCategories); }

    // -----------------------------------------------------------------------
    // Validation
    // -----------------------------------------------------------------------

    /**
     * Returns a list of missing mandatory fields.
     * An empty list means the record is complete.
     *
     * <p>A string field counts as missing when it is blank; a count field
     * counts as missing when it is exactly 0.
     *
     * @return names (JSON keys) of the checked fields that are unset; never
     *         {@code null}
     */
    public List<String> missingMandatoryFields() {
        // NOTE(review): only 7 of the 15 fields declared mandatory above are
        // checked here; PublicationDate, DocumentTitle, Source, Url, Domain and
        // Keywords are not. Confirm whether that omission is intentional.
        List<String> missing = new ArrayList<>();
        if (identifier.isBlank())       missing.add("Identifier");
        if (licence.isBlank())          missing.add("Licence");
        if (medium.isBlank())           missing.add("Medium");
        if (numberWords == 0)           missing.add("NumberWords");
        if (numberSentences == 0)       missing.add("NumberSentences");
        if (numberParagraphs == 0)      missing.add("NumberParagraphs");
        if (numberTokens == 0)          missing.add("NumberTokens");
        // piiVector and biasVector may legitimately be empty for clean docs
        return missing;
    }

    // -----------------------------------------------------------------------
    // JSON serialisation  (json-simple)
    // -----------------------------------------------------------------------

    /**
     * Serialises this record to a json-simple JSONObject.
     *
     * <p>Every field is always written. An unknown {@code translatedDocument}
     * is written as the empty string, otherwise as "true"/"false".
     *
     * @return a new JSONObject holding all mandatory and optional fields
     */
    @SuppressWarnings("unchecked") // JSONObject is a raw-typed map; put() is unchecked
    public JSONObject toJson() {
        JSONObject o = new JSONObject();

        // Mandatory
        o.put("Identifier",                       identifier);
        o.put("Licence",                          licence);
        o.put("PublicationDate",                  publicationDate);
        o.put("DocumentTitle",                    documentTitle);
        o.put("Source",                           source);
        o.put("Medium",                           medium);
        o.put("Url",                              url);
        o.put("Domain",                           toJsonArray(domain));
        o.put("Keywords",                         toJsonArray(keywords));
        o.put("NumberWords",                      numberWords);
        o.put("NumberSentences",                  numberSentences);
        o.put("NumberParagraphs",                 numberParagraphs);
        o.put("NumberTokens",                     numberTokens);
        o.put("PersonallyIdentifiableInformation",toJsonArray(piiVector));
        o.put("BiasedInformation",                toJsonArray(biasVector));

        // Optional
        o.put("Author",            toJsonArray(author));
        o.put("Style",             style);
        o.put("Type",              type);
        o.put("Subdomain",         toJsonArray(subdomain));
        o.put("TranslatedDocument",
              translatedDocument == null ? "" : translatedDocument.toString());
        o.put("CollectionDate",    collectionDate);
        o.put("LicenceLink",       licenceLink);
        o.put("TaskCategories",    toJsonArray(taskCategories));

        return o;
    }

    /**
     * Populates a DocumentMetadata from a json-simple JSONObject previously
     * produced by {@link #toJson()}.
     *
     * <p>Absent keys yield the empty string, 0 or an empty list. Note that an
     * absent "Medium" therefore yields "" rather than the "textual" default.
     *
     * @param o the JSON object to read; must not be {@code null}
     * @return a new record populated from {@code o}
     */
    public static DocumentMetadata fromJson(JSONObject o) {
        DocumentMetadata m = new DocumentMetadata();

        m.identifier        = str(o, "Identifier");
        m.licence           = str(o, "Licence");
        m.publicationDate   = str(o, "PublicationDate");
        m.documentTitle     = str(o, "DocumentTitle");
        m.source            = str(o, "Source");
        m.medium            = str(o, "Medium");
        m.url               = str(o, "Url");
        m.domain            = strList(o, "Domain");
        m.keywords          = strList(o, "Keywords");
        m.numberWords       = intVal(o, "NumberWords");
        m.numberSentences   = intVal(o, "NumberSentences");
        m.numberParagraphs  = intVal(o, "NumberParagraphs");
        m.numberTokens      = intVal(o, "NumberTokens");
        m.piiVector         = doubleList(o, "PersonallyIdentifiableInformation");
        m.biasVector        = doubleList(o, "BiasedInformation");

        m.author            = strList(o, "Author");
        m.style             = str(o, "Style");
        m.type              = str(o, "Type");
        m.subdomain         = strList(o, "Subdomain");
        // Blank means "unknown"; any non-blank text other than "true"
        // (case-insensitive) parses to false.
        String td           = str(o, "TranslatedDocument");
        m.translatedDocument= td.isBlank() ? null : Boolean.parseBoolean(td);
        m.collectionDate    = str(o, "CollectionDate");
        m.licenceLink       = str(o, "LicenceLink");
        m.taskCategories    = strList(o, "TaskCategories");

        return m;
    }

    // -----------------------------------------------------------------------
    // Interop with legacy JSONObject format (used by source processors)
    // -----------------------------------------------------------------------

    /**
     * Merges fields from a legacy source-processor JSONObject (the format
     * produced by MarcellProcessor, BulNCProcessor, etc.) into this record.
     * Fields already set on {@code this} are NOT overwritten.
     *
     * <p>"Already set" means: non-blank for strings, non-empty for lists,
     * non-zero for counts, non-null for {@code translatedDocument}.
     * Author, Domain and Subdomain may be given either as a single value or
     * as a JSON array.
     *
     * @param legacy the legacy JSON object to read; must not be {@code null}
     */
    public void mergeLegacy(JSONObject legacy) {
        if (identifier.isBlank())      setIdentifier(str(legacy, "Identifier"));
        if (licence.isBlank())         setLicence(str(legacy, "Licence"));
        if (licenceLink.isBlank())     setLicenceLink(str(legacy, "LicenceLink"));
        if (publicationDate.isBlank()) setPublicationDate(str(legacy, "PublicationDate"));
        if (documentTitle.isBlank())   setDocumentTitle(str(legacy, "DocumentTitle"));
        if (source.isBlank())          setSource(str(legacy, "Source"));
        if (url.isBlank())             setUrl(str(legacy, "Url"));
        if (style.isBlank())           setStyle(str(legacy, "Style"));
        if (type.isBlank())            setType(str(legacy, "Type"));
        if (collectionDate.isBlank())  setCollectionDate(str(legacy, "CollectionDate"));

        // strList() yields one trimmed entry for a non-blank scalar and one
        // entry per element for a JSON array, so an array value is no longer
        // stored as a single entry containing the array's JSON text.
        if (author.isEmpty())    author.addAll(strList(legacy, "Author"));
        if (domain.isEmpty())    domain.addAll(strList(legacy, "Domain"));
        if (subdomain.isEmpty()) subdomain.addAll(strList(legacy, "Subdomain"));

        if (numberWords      == 0) numberWords      = intVal(legacy, "NumberWords");
        if (numberSentences  == 0) numberSentences  = intVal(legacy, "NumberSentences");
        if (numberParagraphs == 0) numberParagraphs = intVal(legacy, "NumberParagraphs");
        if (numberTokens     == 0) numberTokens     = intVal(legacy, "NumberTokens");

        String translated = str(legacy, "TranslatedDocument");
        if (translatedDocument == null && !translated.isBlank())
            translatedDocument = Boolean.parseBoolean(translated);
    }

    // -----------------------------------------------------------------------
    // Private helpers
    // -----------------------------------------------------------------------

    /** Returns {@code v}, or the empty string when {@code v} is null. */
    private static String orEmpty(String v) {
        return v == null ? "" : v;
    }

    /** Returns a new mutable list with the elements of {@code v}; empty when {@code v} is null. */
    private static <T> List<T> mutableCopy(List<T> v) {
        return v == null ? new ArrayList<>() : new ArrayList<>(v);
    }

    /** Reads {@code key} as a trimmed string; "" when the key is absent or null. */
    private static String str(JSONObject o, String key) {
        Object v = o.get(key);
        return v == null ? "" : v.toString().trim();
    }

    /**
     * Reads {@code key} as an int.
     * Returns 0 when the key is absent, the value is not an integer, or it
     * does not fit in an int.
     */
    private static int intVal(JSONObject o, String key) {
        Object v = o.get(key);
        if (v == null) return 0;
        if (v instanceof Number) {
            // Accept integral numbers however they were typed (e.g. 12 or
            // 12.0); fractional, NaN, infinite and out-of-range values are
            // rejected to 0 like any other unparseable input.
            double d = ((Number) v).doubleValue();
            boolean integral = d == Math.rint(d);
            boolean inRange  = d >= Integer.MIN_VALUE && d <= Integer.MAX_VALUE;
            return integral && inRange ? (int) d : 0;
        }
        try { return Integer.parseInt(v.toString().trim()); }
        catch (NumberFormatException e) { return 0; }
    }

    /**
     * Reads {@code key} as a list of strings.
     * A JSON array contributes one entry per non-null element; any other
     * non-blank value contributes a single trimmed entry.
     */
    private static List<String> strList(JSONObject o, String key) {
        Object v = o.get(key);
        List<String> list = new ArrayList<>();
        if (v instanceof JSONArray) {
            for (Object item : (JSONArray) v)
                if (item != null) list.add(item.toString());
        } else if (v != null && !v.toString().isBlank()) {
            list.add(v.toString().trim());
        }
        return list;
    }

    /**
     * Reads {@code key} as a list of doubles.
     * Null and non-numeric array elements are skipped (best effort), so the
     * result can be shorter than the source array; a non-array value yields an
     * empty list.
     */
    private static List<Double> doubleList(JSONObject o, String key) {
        Object v = o.get(key);
        List<Double> list = new ArrayList<>();
        if (v instanceof JSONArray) {
            for (Object item : (JSONArray) v) {
                if (item == null) continue;   // JSON null: skip instead of throwing NPE
                try { list.add(Double.parseDouble(item.toString())); }
                catch (NumberFormatException ignored) {
                    // best effort: a malformed entry is dropped
                }
            }
        }
        return list;
    }

    /** Copies {@code list} into a new JSONArray; a null list yields an empty array. */
    @SuppressWarnings("unchecked") // JSONArray is a raw-typed list; addAll() is unchecked
    private static JSONArray toJsonArray(List<?> list) {
        JSONArray a = new JSONArray();
        if (list != null) a.addAll(list);
        return a;
    }

    /** Short diagnostic summary: identifier, sentence/word counts and vector sizes. */
    @Override
    public String toString() {
        return String.format(
            "DocumentMetadata{id='%s', sentences=%d, words=%d, piiEntries=%d, biasEntries=%d}",
            identifier, numberSentences, numberWords, piiVector.size(), biasVector.size());
    }
}