{ "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://dcl.bas.bg/ifgpt/metadata-schema/v1.0", "title": "IfGPT Document Metadata Schema", "description": "Metadata schema for textual documents in the IfGPT Bulgarian language dataset. 15 mandatory fields + 8 optional fields.", "type": "object", "required": [ "Identifier", "Licence", "PublicationDate", "DocumentTitle", "Source", "Medium", "Url", "Domain", "Keywords", "NumberWords", "NumberSentences", "NumberParagraphs", "NumberTokens", "PersonallyIdentifiableInformation", "BiasedInformation" ], "properties": { "Identifier": { "type": "string", "description": "Unique document identifier with the language prefix 'bg'.", "pattern": "^bg_", "examples": ["bg_bnc_12345", "bg_MARCELL_doc001", "bg_CURLICAT_xyz"] }, "Licence": { "type": "string", "description": "Licence name with classification by type (open, restricted, etc.).", "enum": [ "CC0", "CC0-1.0", "CC-BY-4.0", "CC-BY-SA-4.0", "CC-BY-NC-4.0", "CC-BY-NC-SA-4.0", "Restricted", "Proprietary", "Unknown" ] }, "PublicationDate": { "type": "string", "description": "Date of publication of the text (yyyy-mm-dd); an empty string is also accepted by the pattern when no date is given.", "pattern": "^(\\d{4}-\\d{2}-\\d{2})?$", "examples": ["2023-04-15", "2019-01-01", ""] }, "DocumentTitle": { "type": "string", "description": "Title of the document.", "examples": ["Закон за защита на данните", "Статия за климатичните промени"] }, "Source": { "type": "string", "description": "Publishing organisation, media outlet or institutional originator.", "examples": ["Министерски съвет", "БНР", "Сега"] }, "Medium": { "type": "string", "description": "Modality of the resource.", "enum": ["textual", "multimodal"] }, "Url": { "type": "string", "description": "Original web address of the document.", "format": "uri", "examples": ["https://www.lex.bg/laws/ldoc/123", ""] }, "Domain": { "type": "array", "description": "Up to six subject areas from a controlled vocabulary.", "maxItems": 6, "items": { "type": "string", "enum": [ "Държавно управление",
"Право и законодателство", "Икономика и финанси", "Образование", "Наука и технологии", "Здравеопазване", "Култура и изкуство", "Спорт", "Медии и журналистика", "Общество и политика", "Околна среда", "Религия", "История", "Литература и художествена проза", "Неформална комуникация", "Друго" ] }, "examples": [["Държавно управление"], ["Медии и журналистика", "Общество и политика"]] }, "Keywords": { "type": "array", "description": "Up to six free-text keywords characterising the content.", "maxItems": 6, "items": { "type": "string" }, "examples": [["климат", "законодателство", "ЕС"]] }, "NumberWords": { "type": "integer", "description": "Total number of words (non-punctuation tokens).", "minimum": 0 }, "NumberSentences": { "type": "integer", "description": "Total number of sentences.", "minimum": 0 }, "NumberParagraphs": { "type": "integer", "description": "Total number of paragraphs.", "minimum": 0 }, "NumberTokens": { "type": "integer", "description": "Total number of tokens (words + punctuation).", "minimum": 0 }, "PersonallyIdentifiableInformation": { "type": "array", "description": "Per-sentence vector. Each entry is the proportion of tokens in that sentence flagged as personally identifiable information, in [0,1]. Length equals NumberSentences.", "items": { "type": "number", "minimum": 0.0, "maximum": 1.0 }, "examples": [[0.0, 0.0, 0.15, 0.0, 0.05]] }, "BiasedInformation": { "type": "array",
"description": "Per-sentence vector. Each entry is the proportion of tokens in that sentence flagged as potentially biased (signal-evaluator pair coverage), in [0,1]. Length equals NumberSentences.", "items": { "type": "number", "minimum": 0.0, "maximum": 1.0 }, "examples": [[0.0, 0.0, 0.0, 0.10, 0.0]] }, "Author": { "type": "array", "description": "[Optional] Name(s) of the author(s).", "items": { "type": "string" }, "examples": [["Иван Иванов"], ["Агенция БТА"]] }, "Style": { "type": "string", "description": "[Optional] Stylistic register of the document.", "enum": [ "Административен", "Журналистически", "Научен", "Художествен", "Разговорен", "Правен", "Технически", "Неформален", "" ] }, "Type": { "type": "string", "description": "[Optional] Document genre.", "enum": [ "Закон", "Наредба", "Решение", "Статия", "Книга", "Доклад", "Интервю", "Коментар", "Форум", "Блог", "Уикипедия", "Друго", "" ] }, "Subdomain": { "type": "array", "description": "[Optional] Narrower thematic classification, hierarchically linked to Domain.", "maxItems": 6, "items": { "type": "string" }, "examples": [["Европейско законодателство"], ["Климатична политика"]] }, "TranslatedDocument": { "type": ["boolean", "string"], "description": "[Optional] true = translation into Bulgarian; false = original Bulgarian text.", "examples": [false, true, ""] }, "CollectionDate": { "type": "string", "description": "[Optional] Date of acquisition into the collection (yyyy-mm-dd); an empty string is also accepted by the pattern when no date is given.", "pattern": "^(\\d{4}-\\d{2}-\\d{2})?$", "examples": ["2024-03-10", ""] }, "LicenceLink": { "type": "string", "description": "[Optional] URL of the licence text.", "format": "uri", "examples": [ "https://creativecommons.org/public-domain/cc0/", "https://elrc-share.eu/static/metashare/licences/CC0-1.0.pdf", "" ] }, "TaskCategories": { "type": "array", "description": "[Optional] Anticipated NLP applications from a predefined list.", "items": { "type": "string", "enum": [ "Language Modelling", "Text Classification", "Named Entity Recognition", "Machine Translation", "Summarisation", "Question Answering", "Sentiment Analysis", "Bias Detection", "PII Detection", "Information Extraction",
"Coreference Resolution", "Dependency Parsing", "Other" ] }, "examples": [["Language Modelling", "Named Entity Recognition"]] } }, "additionalProperties": false }