IfGPT-DataQualityComponents / resources /metadata_schema.json
dcl-ibl-bas's picture
Upload 22 files
18573e4 verified
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://dcl.bas.bg/ifgpt/metadata-schema/v1.0",
"title": "IfGPT Document Metadata Schema",
"description": "Metadata schema for textual documents in the IfGPT Bulgarian language dataset. 15 mandatory fields + 8 optional fields.",
"type": "object",
"required": [
"Identifier",
"Licence",
"PublicationDate",
"DocumentTitle",
"Source",
"Medium",
"Url",
"Domain",
"Keywords",
"NumberWords",
"NumberSentences",
"NumberParagraphs",
"NumberTokens",
"PersonallyIdentifiableInformation",
"BiasedInformation"
],
"properties": {
"Identifier": {
"type": "string",
"description": "Unique document identifier with the language prefix 'bg'.",
"pattern": "^bg_",
"examples": ["bg_bnc_12345", "bg_MARCELL_doc001", "bg_CURLICAT_xyz"]
},
"Licence": {
"type": "string",
"description": "Licence name with classification by type (open, restricted, etc.).",
"enum": [
"CC0",
"CC0-1.0",
"CC-BY-4.0",
"CC-BY-SA-4.0",
"CC-BY-NC-4.0",
"CC-BY-NC-SA-4.0",
"Restricted",
"Proprietary",
"Unknown"
]
},
"PublicationDate": {
"type": "string",
"description": "Date of publication of the text (yyyy-mm-dd).",
"pattern": "^(\\d{4}-\\d{2}-\\d{2})?$",
"examples": ["2023-04-15", "2019-01-01", ""]
},
"DocumentTitle": {
"type": "string",
"description": "Title of the document.",
"examples": ["Закон за защита на данните", "Статия за климатичните промени"]
},
"Source": {
"type": "string",
"description": "Publishing organisation, media outlet or institutional originator.",
"examples": ["Министерски съвет", "БНР", "Сега"]
},
"Medium": {
"type": "string",
"description": "Modality of the resource.",
"enum": ["textual", "multimodal"]
},
"Url": {
"type": "string",
"description": "Original web address of the document.",
"format": "uri",
"examples": ["https://www.lex.bg/laws/ldoc/123", ""]
},
"Domain": {
"type": "array",
"description": "Up to six subject areas from a controlled vocabulary.",
"maxItems": 6,
"items": {
"type": "string",
"enum": [
"Държавно управление",
"Право и законодателство",
"Икономика и финанси",
"Образование",
"Наука и технологии",
"Здравеопазване",
"Култура и изкуство",
"Спорт",
"Медии и журналистика",
"Общество и политика",
"Околна среда",
"Религия",
"История",
"Литература и художествена проза",
"Неформална комуникация",
"Друго"
]
},
"examples": [["Държавно управление"], ["Медии и журналистика", "Общество и политика"]]
},
"Keywords": {
"type": "array",
"description": "Up to six free-text keywords characterising the content.",
"maxItems": 6,
"items": { "type": "string" },
"examples": [["климат", "законодателство", "ЕС"]]
},
"NumberWords": {
"type": "integer",
"description": "Total number of words (non-punctuation tokens).",
"minimum": 0
},
"NumberSentences": {
"type": "integer",
"description": "Total number of sentences.",
"minimum": 0
},
"NumberParagraphs": {
"type": "integer",
"description": "Total number of paragraphs.",
"minimum": 0
},
"NumberTokens": {
"type": "integer",
"description": "Total number of tokens (words + punctuation).",
"minimum": 0
},
"PersonallyIdentifiableInformation": {
"type": "array",
"description": "Per-sentence vector. Each entry is the proportion of tokens in that sentence flagged as personally identifiable information, in [0,1]. Length equals NumberSentences.",
"items": {
"type": "number",
"minimum": 0.0,
"maximum": 1.0
},
"examples": [[0.0, 0.0, 0.15, 0.0, 0.05]]
},
"BiasedInformation": {
"type": "array",
"description": "Per-sentence vector. Each entry is the proportion of tokens in that sentence flagged as potentially biased (signal-evaluator pair coverage), in [0,1]. Length equals NumberSentences.",
"items": {
"type": "number",
"minimum": 0.0,
"maximum": 1.0
},
"examples": [[0.0, 0.0, 0.0, 0.10, 0.0]]
},
"Author": {
"type": "array",
"description": "[Optional] Name(s) of the author(s).",
"items": { "type": "string" },
"examples": [["Иван Иванов"], ["Агенция БТА"]]
},
"Style": {
"type": "string",
"description": "[Optional] Stylistic register of the document.",
"enum": [
"Административен",
"Журналистически",
"Научен",
"Художествен",
"Разговорен",
"Правен",
"Технически",
"Неформален",
""
]
},
"Type": {
"type": "string",
"description": "[Optional] Document genre.",
"enum": [
"Закон",
"Наредба",
"Решение",
"Статия",
"Книга",
"Доклад",
"Интервю",
"Коментар",
"Форум",
"Блог",
"Уикипедия",
"Друго",
""
]
},
"Subdomain": {
"type": "array",
"description": "[Optional] Narrower thematic classification, hierarchically linked to Domain.",
"maxItems": 6,
"items": { "type": "string" },
"examples": [["Европейско законодателство"], ["Климатична политика"]]
},
"TranslatedDocument": {
"type": ["boolean", "string"],
"description": "[Optional] true = translation into Bulgarian; false = original Bulgarian text.",
"examples": [false, true, ""]
},
"CollectionDate": {
"type": "string",
"description": "[Optional] Date of acquisition into the collection (yyyy-mm-dd).",
"pattern": "^(\\d{4}-\\d{2}-\\d{2})?$",
"examples": ["2024-03-10", ""]
},
"LicenceLink": {
"type": "string",
"description": "[Optional] URL of the licence text.",
"format": "uri",
"examples": [
"https://creativecommons.org/public-domain/cc0/",
"https://elrc-share.eu/static/metashare/licences/CC0-1.0.pdf",
""
]
},
"TaskCategories": {
"type": "array",
"description": "[Optional] Anticipated NLP applications from a predefined list.",
"items": {
"type": "string",
"enum": [
"Language Modelling",
"Text Classification",
"Named Entity Recognition",
"Machine Translation",
"Summarisation",
"Question Answering",
"Sentiment Analysis",
"Bias Detection",
"PII Detection",
"Information Extraction",
"Coreference Resolution",
"Dependency Parsing",
"Other"
]
},
"examples": [["Language Modelling", "Named Entity Recognition"]]
}
},
"additionalProperties": false
}