AniFileBERT / data /dmhy /dmhy_weak_new.manifest.json
ModerRAS's picture
完成整个数据集的整理
f4f4e0e
{
"created_at": "2026-05-13T15:26:19.767707+00:00",
"source_db": "D:\\WorkSpace\\Python\\dmhy-parser\\dmhy_anime.db",
"output": "data\\dmhy\\dmhy_weak_new.jsonl",
"min_file_id": 689305,
"last_file_id": 1675184,
"db_max_file_id_at_export_start": 1675184,
"limit": null,
"stats": {
"scanned_rows": 985880,
"video_rows": 556778,
"duplicate_basenames": 95422,
"labeled_samples": 378327,
"skipped_no_episode": 82422,
"skipped_no_title": 0,
"skipped_too_short": 606,
"skipped_too_long": 1
},
"label_counts": {
"B-GROUP": 306878,
"B-TITLE": 390543,
"B-EPISODE": 378327,
"B-RESOLUTION": 156089,
"B-SOURCE": 180428,
"O": 1587219,
"I-TITLE": 1401899,
"B-SPECIAL": 29468,
"B-SEASON": 18792,
"I-GROUP": 517
},
"vocab_size": 3000,
"notes": [
"Rows are a snapshot of files.id <= last_file_id.",
"Future incremental export can use --min-id last_file_id+1.",
"Weak labels target GROUP, TITLE, SEASON, and EPISODE; media tags are boundary labels/noise."
],
"examples": []
}