# FintoAI-data-YSO / dvc.yaml
# Repository page header captured with this file (not pipeline configuration):
#   juhoinkinen: "Upload folder using huggingface_hub" (3d570df, verified)
#   raw | history | blame | 6.15 kB
stages:
  # Ensure Annif is installed.
  # Creates the virtual environment in venv/, upgrades the packaging tools,
  # installs everything listed in requirements.txt, and finally copies
  # requirements.txt to venv-installed, the marker file that every other
  # stage lists as a dependency.
  # pip is run through venv/bin/python rather than after
  # `source venv/bin/activate`: `source` is not a POSIX sh builtin and fails
  # under shells such as dash, and calling the venv binaries directly matches
  # how the other stages invoke venv/bin/annif.
  install:
    cmd:
      - python3 -m venv venv
      - venv/bin/python -m pip install -U pip wheel setuptools
      - venv/bin/python -m pip install -r requirements.txt
      - cp requirements.txt venv-installed
    deps:
      - requirements.txt
    outs:
      # Marker file only; it is kept out of the DVC cache.
      - venv-installed:
          cache: false

  # Load YSO vocabulary.
  # Loads corpora/yso-skos.ttl as the vocabulary "yso" (with --force) and
  # tracks the result as data/vocabs/yso.
  load-vocab:
    cmd: venv/bin/annif load-vocab --force yso corpora/yso-skos.ttl
    deps:
      - venv-installed
      - corpora/yso-skos.ttl
    outs:
      - data/vocabs/yso

  # Train MLLM projects, one stage instance per language (fi, sv, en).
  # Each instance trains yso-mllm-<lang> on the directories matched by
  # corpora/fulltext-train/<lang>/*/ and tracks the yso-mllm-<lang> entry of
  # projects.toml as a parameter.
  train-mllm:
    foreach:
      - fi
      - sv
      - en
    do:
      cmd: venv/bin/annif train yso-mllm-${item} corpora/fulltext-train/${item}/*/ -j 16
      deps:
        - venv-installed
        - corpora/fulltext-train/${item}
        - data/vocabs/yso
      params:
        - projects.toml:
            - yso-mllm-${item}
      outs:
        - data/projects/yso-mllm-${item}

  # Train Omikuji Bonsai projects, one stage instance per language.
  # Each instance trains yso-bonsai-<lang> on the files matched by
  # corpora/shorttext-train/<lang>/yso-finna-<lang>*.tsv.gz.
  train-omikuji:
    foreach:
      - fi
      - sv
      - en
    do:
      cmd: venv/bin/annif train yso-bonsai-${item} corpora/shorttext-train/${item}/yso-finna-${item}*.tsv.gz
      deps:
        - venv-installed
        - corpora/shorttext-train/${item}/
        - data/vocabs/yso
      params:
        - projects.toml:
            - yso-bonsai-${item}
      outs:
        - data/projects/yso-bonsai-${item}

  # Train fasttext projects, one stage instance per language.
  # Uses the same short-text training files as the Omikuji Bonsai stage.
  train-fasttext:
    foreach:
      - fi
      - sv
      - en
    do:
      cmd: venv/bin/annif train yso-fasttext-${item} corpora/shorttext-train/${item}/yso-finna-${item}*.tsv.gz
      deps:
        - venv-installed
        - corpora/shorttext-train/${item}/
        - data/vocabs/yso
      params:
        - projects.toml:
            - yso-fasttext-${item}
      outs:
        - data/projects/yso-fasttext-${item}

  # Train nn-ensemble projects, one stage instance per language.
  # Each instance trains yso-<lang> on the full-text training directories and
  # depends on the trained MLLM, Bonsai and fasttext projects of the same
  # language, so it runs after those three stages.
  train-nn-ensemble:
    foreach:
      - fi
      - sv
      - en
    do:
      cmd: venv/bin/annif train yso-${item} corpora/fulltext-train/${item}/*/ -j 16
      deps:
        - venv-installed
        - corpora/fulltext-train/${item}
        - data/vocabs/yso
        - data/projects/yso-mllm-${item}
        - data/projects/yso-bonsai-${item}
        - data/projects/yso-fasttext-${item}
      params:
        - projects.toml:
            - yso-${item}
      outs:
        - data/projects/yso-${item}

  # Evaluate Finnish projects.
  # Runs once per project (yso-mllm-fi, yso-bonsai-fi, yso-fasttext-fi,
  # yso-fi) and evaluates it on five test sets under corpora/fulltext-test/fi,
  # writing F1@5 and NDCG to reports/<item>-<test set>.json.
  # The commands are folded (>-) only for line length; each one is still a
  # single command line with the same text as before.
  eval-fi:
    foreach:
      - mllm-fi
      - bonsai-fi
      - fasttext-fi
      - fi
    do:
      cmd:
        - >-
          venv/bin/annif eval yso-${item} -j 10 -m F1@5 -m NDCG
          --metrics-file reports/${item}-jyu-theses.json
          corpora/fulltext-test/fi/jyu-theses/
        - >-
          venv/bin/annif eval yso-${item} -j 10 -m F1@5 -m NDCG
          --metrics-file reports/${item}-kirjaesittelyt2021.json
          corpora/fulltext-test/fi/kirjaesittelyt2021/
        - >-
          venv/bin/annif eval yso-${item} -j 1 -m F1@5 -m NDCG
          --metrics-file reports/${item}-kirjastonhoitaja.json
          corpora/fulltext-test/fi/kirjastonhoitaja/
        # The trailing "?" is a shell glob for a single character, so every
        # matching satakunnan-kansa-<x> directory goes into one evaluation.
        - >-
          venv/bin/annif eval yso-${item} -j 10 -m F1@5 -m NDCG
          --metrics-file reports/${item}-satakunnan-kansa.json
          corpora/fulltext-test/fi/satakunnan-kansa-?/
        - >-
          venv/bin/annif eval yso-${item} -j 10 -m F1@5 -m NDCG
          --metrics-file reports/${item}-vapaakappaleet-orig.json
          corpora/fulltext-test/fi/vapaakappaleet-orig/
      deps:
        - venv-installed
        - corpora/fulltext-test/fi
        - data/projects/yso-${item}
      params:
        - projects.toml:
            - yso-${item}
      # Metrics files are tracked by DVC but not stored in its cache.
      metrics:
        - reports/${item}-jyu-theses.json:
            cache: false
        - reports/${item}-kirjaesittelyt2021.json:
            cache: false
        - reports/${item}-kirjastonhoitaja.json:
            cache: false
        - reports/${item}-satakunnan-kansa.json:
            cache: false
        - reports/${item}-vapaakappaleet-orig.json:
            cache: false

  # Evaluate Swedish projects.
  # Runs once per project (yso-mllm-sv, yso-bonsai-sv, yso-fasttext-sv,
  # yso-sv) on four test sets under corpora/fulltext-test/sv, writing F1@5
  # and NDCG to reports/<item>-<test set>.json.
  eval-sv:
    foreach:
      - mllm-sv
      - bonsai-sv
      - fasttext-sv
      - sv
    do:
      cmd:
        - >-
          venv/bin/annif eval yso-${item} -j 1 -m F1@5 -m NDCG
          --metrics-file reports/${item}-abo-theses.json
          corpora/fulltext-test/sv/abo-theses/
        - >-
          venv/bin/annif eval yso-${item} -j 1 -m F1@5 -m NDCG
          --metrics-file reports/${item}-jyu-theses.json
          corpora/fulltext-test/sv/jyu-theses/
        - >-
          venv/bin/annif eval yso-${item} -j 10 -m F1@5 -m NDCG
          --metrics-file reports/${item}-kirjaesittelyt2021.json
          corpora/fulltext-test/sv/kirjaesittelyt2021/
        - >-
          venv/bin/annif eval yso-${item} -j 10 -m F1@5 -m NDCG
          --metrics-file reports/${item}-vapaakappaleet-orig.json
          corpora/fulltext-test/sv/vapaakappaleet-orig/
      deps:
        - venv-installed
        - corpora/fulltext-test/sv
        - data/projects/yso-${item}
      params:
        - projects.toml:
            - yso-${item}
      # Metrics files are tracked by DVC but not stored in its cache.
      metrics:
        - reports/${item}-abo-theses.json:
            cache: false
        - reports/${item}-jyu-theses.json:
            cache: false
        - reports/${item}-kirjaesittelyt2021.json:
            cache: false
        - reports/${item}-vapaakappaleet-orig.json:
            cache: false

  # Evaluate English projects.
  # Runs once per project (yso-mllm-en, yso-bonsai-en, yso-fasttext-en,
  # yso-en) on four test sets under corpora/fulltext-test/en, writing F1@5
  # and NDCG to reports/<item>-<test set>.json.
  eval-en:
    foreach:
      - mllm-en
      - bonsai-en
      - fasttext-en
      - en
    do:
      cmd:
        - >-
          venv/bin/annif eval yso-${item} -j 1 -m F1@5 -m NDCG
          --metrics-file reports/${item}-abo-theses.json
          corpora/fulltext-test/en/abo-theses/
        - >-
          venv/bin/annif eval yso-${item} -j 1 -m F1@5 -m NDCG
          --metrics-file reports/${item}-jyu-theses.json
          corpora/fulltext-test/en/jyu-theses/
        - >-
          venv/bin/annif eval yso-${item} -j 10 -m F1@5 -m NDCG
          --metrics-file reports/${item}-kirjaesittelyt2021.json
          corpora/fulltext-test/en/kirjaesittelyt2021/
        - >-
          venv/bin/annif eval yso-${item} -j 10 -m F1@5 -m NDCG
          --metrics-file reports/${item}-vapaakappaleet-orig.json
          corpora/fulltext-test/en/vapaakappaleet-orig/
      deps:
        - venv-installed
        - corpora/fulltext-test/en
        - data/projects/yso-${item}
      params:
        - projects.toml:
            - yso-${item}
      # Metrics files are tracked by DVC but not stored in its cache.
      metrics:
        - reports/${item}-abo-theses.json:
            cache: false
        - reports/${item}-jyu-theses.json:
            cache: false
        - reports/${item}-kirjaesittelyt2021.json:
            cache: false
        - reports/${item}-vapaakappaleet-orig.json:
            cache: false