# prolewiki-corpus / MANIFEST.yaml
# percyraskova's picture
# Upload folder using huggingface_hub
# 42392e9 verified
manifest_version: 1.0.0
dataset:
name: Marxist-GRPO Training Dataset
version: 0.2.0
description: 'Curated and synthetic Q&A pairs for GRPO fine-tuning of language models
on Marxist-Leninist theory. Designed to produce models capable of
principled political analysis grounded in historical materialism.
'
license: AGPL-3.0
homepage: https://github.com/prolewiki/pw-mcp
repository: https://github.com/prolewiki/pw-mcp
citation: "@dataset{marxist_grpo_2025,\n title={Marxist-GRPO Training Dataset},\n\
\ author={ProleWiki Contributors},\n year={2025},\n publisher={ProleWiki},\n\
\ version={0.2.0}\n}\n"
intended_use: 'Fine-tuning language models for Marxist-Leninist political theory
responses.
Designed for GRPO (Group Relative Policy Optimization) training but
compatible with SFT and other methods.
'
limitations: '- Synthetic data may contain subtle errors requiring human review
- Coverage of Marxist-Leninist (ML) theoretical areas is incomplete
- Some contested positions specifically reflect the Marxist-Leninist-Maoist (MLM) tradition
- Iteration 2 corrections not yet validated via training
'
files:
- filename: sources/anti_colonial/anti_imperialism.jsonl
record_count: 14
sha256: 7a87edb3333a9bbad2450e7b158c10cb17da68bb51a4c79b11902f17a7e07e78
purpose: General anti-imperialism theory and analysis
source_type: curated
iteration_added: 3
categories:
- anti-imperialism
- colonialism
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/anti_colonial/butler.jsonl
record_count: 7
sha256: ab7a68dd34379eec0d52ae40cd99f3c75ca34de579700478e400253743985406
purpose: Smedley Butler - War is a Racket
source_type: curated
author: Smedley Butler
work: War is a Racket
iteration_added: 3
categories:
- anti-imperialism
- us-military
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/anti_colonial/decolonial_marxism.jsonl
record_count: 7
sha256: ebc567d65113fd292d20f3fea74a27f06df785c090699294b1d9eb15a0ac7c6e
purpose: Walter Rodney - Decolonial Marxism
source_type: curated
iteration_added: 3
schema_compliant: true
human_reviewed: true
include_in_training: true
author: Walter Rodney
work: Decolonial Marxism
categories:
- anti-colonial
- general-theory
- historical-interviews
- political-economy
- filename: sources/anti_colonial/dunbar_ortiz.jsonl
record_count: 30
sha256: dfa2fe3d93d0324c363be7353039fb4c9840dd27a37f0ec1eca5dc24552f02b4
purpose: Roxanne Dunbar-Ortiz - An Indigenous Peoples' History
source_type: curated
author: Roxanne Dunbar-Ortiz
work: An Indigenous Peoples' History of the United States
iteration_added: 3
categories:
- indigenous-history
- settler-colonialism
- us-history
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/anti_colonial/fanon.jsonl
record_count: 39
sha256: 27fd8ff14dc1d6806330075a4651b1e57c8b6f32bc8184fbbcebea4801ebf3c5
purpose: Frantz Fanon - Wretched of the Earth, Black Skin White Masks
source_type: curated
author: Frantz Fanon
work: The Wretched of the Earth
iteration_added: 3
categories:
- anti-colonial
- psychology
- algeria
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/anti_colonial/nkrumah.jsonl
record_count: 7
sha256: da81f3b340f4b334aab0c62336e748c05061ebcca127e3b63cc8b1ee09627587
purpose: Kwame Nkrumah - Neo-Colonialism
source_type: curated
author: Kwame Nkrumah
work: 'Neo-Colonialism: The Last Stage of Imperialism'
iteration_added: 3
categories:
- neo-colonialism
- africa
- pan-africanism
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/anti_colonial/palestine.jsonl
record_count: 30
sha256: 6550d5136f4d51beb9ec35128bcf645afa338cbe949400b4428229b55b1aabf2
purpose: Palestinian liberation and anti-Zionism analysis
source_type: curated
iteration_added: 3
categories:
- palestine
- anti-zionism
- settler-colonialism
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/anti_colonial/rodney.jsonl
record_count: 4
sha256: a9017c76aeae5e434889c591f7485b04e2846b41fefe954b161ecfee1e40e9ec
purpose: Walter Rodney - How Europe Underdeveloped Africa
source_type: curated
author: Walter Rodney
work: How Europe Underdeveloped Africa
iteration_added: 3
categories:
- africa
- underdevelopment
- colonialism
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/anti_colonial/sankara.jsonl
record_count: 24
sha256: 87034e03780c266f944194d7be39304d044411749b5e1233869a3edbfccd31b8
purpose: Thomas Sankara speeches and writings
source_type: curated
author: Thomas Sankara
work: Thomas Sankara Speaks
iteration_added: 3
categories:
- africa
- burkina-faso
- pan-africanism
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/disability_studies/cohen_psychiatric.jsonl
record_count: 11
sha256: 113d8b7887085fe3c7aeea1a5597661e04467efe7356c61bfb869c245b61e38b
purpose: Bruce Cohen - Psychiatric Hegemony
source_type: curated
author: Bruce Cohen
work: 'Psychiatric Hegemony: A Marxist Theory of Mental Illness'
iteration_added: 3
categories:
- disability-studies
- psychiatry
- social-control
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/disability_studies/disability_history.jsonl
record_count: 76
sha256: 1dfa955b1ce829c70cc59772f8b5fc24d330f2a05d16b4f69e9a00cad118cfcb
purpose: Disability history and theory (US focus)
source_type: curated
iteration_added: 3
categories:
- disability-studies
- us-history
- eugenics
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/feminist_marxism/assata_shakur.jsonl
record_count: 9
sha256: 5d4e819ad15472ec9bd1e8a600853ccc4cfc228aad75ea3ba6651f69bdf06f97
purpose: Assata Shakur autobiography and analysis
source_type: curated
author: Assata Shakur
work: 'Assata: An Autobiography'
iteration_added: 3
categories:
- black-liberation
- bla
- feminism
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/feminist_marxism/feinberg.jsonl
record_count: 24
sha256: 004669858f893b7dbfa08984c7c7b27422d120aff179e54ac640d1e6060c89df
purpose: Leslie Feinberg - Transgender Warriors, Rainbow Solidarity
source_type: curated
author: Leslie Feinberg
work: Transgender Warriors
iteration_added: 3
categories:
- transgender
- lgbtq
- feminism
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/feminist_marxism/lgbt_essay.jsonl
record_count: 6
sha256: 1d53f50876a9760d222827ff6681b86d23dcb6377ac9f6fbeac90bfcad87bb1e
purpose: LGBT Marxist analysis essays
source_type: curated
iteration_added: 3
categories:
- lgbtq
- feminism
- materialism
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/foundational/einstein.jsonl
record_count: 2
sha256: 47221e87713b880413ebd9be354248ffec0b904397f38083e5a7ac63e3919e29
purpose: Albert Einstein - Why Socialism?
source_type: curated
author: Albert Einstein
work: Why Socialism?
iteration_added: 3
categories:
- socialism
- economics
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/foundational/zetkin_fascism.jsonl
record_count: 6
sha256: 8a644ac6e469194f1fb773881463a2b27d2a6cc93b14d65b366d67305732111a
purpose: Clara Zetkin - The Struggle Against Fascism
source_type: curated
author: Clara Zetkin
work: The Struggle Against Fascism
iteration_added: 3
categories:
- fascism
- comintern
- women
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/historical_interviews/cia_interviews.jsonl
record_count: 14
sha256: ab112fddd1c744b520c235c5a40dcea81de5b8973f0a3e17017b5be839792886
purpose: CIA Interviews
source_type: curated
iteration_added: 3
schema_compliant: true
human_reviewed: true
include_in_training: true
author: Various
work: CIA Interviews
categories:
- general-theory
- historical-interviews
- imperialism
- filename: sources/historical_interviews/stalin_interviews.jsonl
record_count: 21
sha256: 8f5bae4e22ab604b7f42415f0648d3bf94470aca10fa7a0ca7b53740c4ed4035
purpose: Stalin interviews with foreign journalists
source_type: curated
author: Joseph Stalin
iteration_added: 3
categories:
- soviet-history
- interviews
- stalin
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/historiography/immerwahr.jsonl
record_count: 13
sha256: dbb5afc11a1dab354a141acffadf744f7bf1ad2fb6c4b5b3bf749a8160ddb01e
purpose: Daniel Immerwahr - How to Hide an Empire
source_type: curated
author: Daniel Immerwahr
work: How to Hide an Empire
iteration_added: 3
categories:
- us-imperialism
- territories
- historiography
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/historiography/korea.jsonl
record_count: 1
sha256: 06ddcecf9f11c3ecbc1ffc42262daf4e39300a76f45b12c0ead91d2fdf413624
purpose: Korean War and DPRK historiography
source_type: curated
iteration_added: 3
categories:
- korea
- dprk
- historiography
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/historiography/losurdo.jsonl
record_count: 28
sha256: 42c88aa3e26b8deb4117eb0bd249904f1dcbf75a39139889ef0ad6cc3e4e68d5
purpose: 'Domenico Losurdo - Liberalism: A Counter-History'
source_type: curated
author: Domenico Losurdo
work: 'Liberalism: A Counter-History'
iteration_added: 3
categories:
- liberalism
- historiography
- slavery
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/historiography/pappe.jsonl
record_count: 17
sha256: ab2ba13b4c9b0e6379cd1b08a419e4646458542973826f18d974065ea35e5f90
purpose: "Ilan Papp\xE9 - Ten Myths About Israel"
source_type: curated
author: "Ilan Papp\xE9"
work: Ten Myths About Israel
iteration_added: 3
categories:
- palestine
- israel
- historiography
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/historiography/sousa.jsonl
record_count: 5
sha256: 125f0d9affab92f864086cdddf4adef061b142eea64bfaab197b8c1b43c6f6d4
purpose: Mario Sousa - Lies Concerning Soviet History
source_type: curated
author: Mario Sousa
work: Lies Concerning the History of the Soviet Union
iteration_added: 3
categories:
- soviet-history
- historiography
- anti-communism
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/historiography/soviet_history.jsonl
record_count: 10
sha256: 5f508e0d660ddaeea9719bec695bc000df9e0841b00752ec303be0a153d12d16
purpose: General Soviet historiography
source_type: curated
iteration_added: 3
categories:
- soviet-history
- historiography
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/original_essays/ai_revolution.jsonl
record_count: 10
sha256: a85d210c0ba8f6a98d2ca886d41790c1271466b6282fc9247b848527a6821e9a
purpose: Claude Opus - AI Revolution
source_type: curated
iteration_added: 3
schema_compliant: true
human_reviewed: true
include_in_training: true
author: Claude Opus
work: AI Revolution
categories:
- dialectics
- general-theory
- imperialism
- original-essays
- political-economy
- revolutionary-strategy
- filename: sources/original_essays/av_dremel_covid.jsonl
record_count: 12
sha256: e346ba9eb679d03474de198474a5a773188d10cff7cd888e0eaef382b6cc1843
purpose: AV Dremel - COVID biology/politics essays
source_type: curated
author: AV Dremel
iteration_added: 3
categories:
- covid
- biology
- public-health
schema_compliant: true
human_reviewed: true
include_in_training: true
notes: Original contribution from AV Dremel with permission.
- filename: sources/original_essays/av_dremel_dremeldocs.jsonl
record_count: 87
sha256: 4687f1d8efb554426849b5b214b1520fd89d80ce198c113ed631a4fd0a3679ec
purpose: AV Dremel - Dremeldocs notes
source_type: curated
author: AV Dremel
iteration_added: 3
categories:
- original-essays
- dremeldocs
schema_compliant: true
human_reviewed: true
include_in_training: true
notes: Extracted from AV Dremel dremeldocs markdown sources.
- filename: sources/original_essays/av_dremel_fascism.jsonl
record_count: 13
sha256: f9ea3baf0f7716e7da1ac8ec59fb006342d8b6012b3f00deaaaa88e3dc1b3931
purpose: AV Dremel - Fascism analysis essays
source_type: curated
author: AV Dremel
iteration_added: 3
categories:
- fascism
- fascist-creep
- ideology
schema_compliant: true
human_reviewed: true
include_in_training: true
notes: Original contribution from AV Dremel with permission.
- filename: sources/original_essays/av_dremel_queer.jsonl
record_count: 33
sha256: a1336d7305465959c5d8953edabcabf62ffb2b8f3220a42f9b61411237bcd9eb
purpose: AV Dremel - Queer liberation essays
source_type: curated
author: AV Dremel
iteration_added: 3
categories:
- lgbtq
- queer-liberation
- feminism
schema_compliant: true
human_reviewed: true
include_in_training: true
notes: Original contribution from AV Dremel with permission.
- filename: sources/original_essays/kansas_socialist_book_club.jsonl
record_count: 3
sha256: 2cab58e43f183abab8fa63d95bf4c53da727575f6f51e55693bf734cebb56355
purpose: Kansas Socialist Book Club - Original essays
source_type: curated
iteration_added: 3
schema_compliant: true
human_reviewed: true
include_in_training: true
author: Kansas Socialist Book Club
work: Kansas Socialist Book Club
categories:
- dialectics
- original-essays
- revolutionary-strategy
- settler-colonialism
- filename: sources/original_essays/organizational_theory.jsonl
record_count: 13
sha256: a968beb1380edc6805f4b6c5ba8a07f6249f2ce725ac94a7e2798bc25aead7df
purpose: Organizational theory and practice
source_type: curated
iteration_added: 3
categories:
- organization
- party-building
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/original_essays/persephone_labor_aristocracy.jsonl
record_count: 29
sha256: 8fda8c89d67d401338641cb71fd7cfba1fb76c479d4ba68867b7453308d66349
purpose: Persephone Raskova - Labor aristocracy theory
source_type: curated
author: Persephone Raskova
iteration_added: 3
categories:
- labor-aristocracy
- imperialism
- unequal-exchange
schema_compliant: true
human_reviewed: true
include_in_training: true
notes: Original contribution from Persephone Raskova.
- filename: sources/original_essays/persephone_political_economy.jsonl
record_count: 2
sha256: dfccd76e033976c966aa8f8fff464609d6cd905a830db945b9dded4737bbc24b
purpose: Persephone Raskova - Political economy essays
source_type: curated
author: Persephone Raskova
iteration_added: 3
categories:
- political-economy
- imperialism
schema_compliant: true
human_reviewed: true
include_in_training: true
notes: Original contribution from Persephone Raskova.
- filename: sources/original_essays/persephone_raskova.jsonl
record_count: 10
sha256: ec80769ceb7e35901c34a843359e85e72f137c219e820e880ebba75e390bf300
purpose: Persephone Raskova - Persephone Raskova Essays
source_type: curated
iteration_added: 3
schema_compliant: true
human_reviewed: true
include_in_training: true
author: Persephone Raskova
work: Persephone Raskova Essays
categories:
- dialectics
- general-theory
- historical-interviews
- original-essays
- revolutionary-strategy
- settler-colonialism
- filename: sources/original_essays/us_left_critique.jsonl
record_count: 10
sha256: 86f32e23bd5ceae759705407cda77bef540af89a1f741682a6e3aeaf35018d79
purpose: Critique of US left organizations and movements
source_type: curated
iteration_added: 3
categories:
- us-left
- organization
- critique
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/primary_theory/china_analysis.jsonl
record_count: 1
sha256: fd925a5e77c7c80e65186980b4bddde275f01827bb0db3c3c0b06c4ea80ea67e
purpose: Contemporary China analysis
source_type: curated
iteration_added: 3
categories:
- china
- socialism
- contemporary
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/primary_theory/class_analysis.jsonl
record_count: 43
sha256: e3bf64693777348f3169f5565b5268222864ffd0c1937c842291de93005a5a3f
purpose: Class structure and class analysis
source_type: curated
iteration_added: 3
categories:
- class-analysis
- marxism
- political-economy
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/primary_theory/cultural_revolution.jsonl
record_count: 2
sha256: 719306f508ab5e3044c36928851f3738dc9b28dd672a7896575fead3afcaaa47
purpose: Great Proletarian Cultural Revolution analysis
source_type: curated
iteration_added: 3
categories:
- gpcr
- maoist-theory
- china
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/primary_theory/dialectics.jsonl
record_count: 21
sha256: cb35fc7cfa0a1e30ccaee1f9c875506644e57b53ffec132e87ca98a45e3ad432
purpose: Dialectical materialism theory
source_type: curated
iteration_added: 3
categories:
- dialectics
- philosophy
- materialism
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/primary_theory/historical_materialism.jsonl
record_count: 22
sha256: 3b5fc410744409dddd9ed833f3b2e7fd6032bce12b9c7b581b7a5a4f48bd652c
purpose: Historical materialism theory
source_type: curated
iteration_added: 3
categories:
- historical-materialism
- philosophy
- marxism
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/primary_theory/imperialism_theory.jsonl
record_count: 4
sha256: 67eada93941135e023e893349b38e4413b40cc7a383197aafc1f54edb89661a0
purpose: Imperialism theory (Lenin, etc.)
source_type: curated
iteration_added: 3
categories:
- imperialism
- leninism
- political-economy
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/primary_theory/lenin_revisionism.jsonl
record_count: 10
sha256: 6642e276979129cf1f61b60eb3a421b421556d5b9ac60328af4089e31e3f9453
purpose: Lenin - Marxism and Revisionism
source_type: curated
author: Vladimir Lenin
work: Marxism and Revisionism
iteration_added: 3
categories:
- revisionism
- leninism
- second-international
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/primary_theory/mao.jsonl
record_count: 26
sha256: 0881f1e9552040049c0d0a160d65fe49ab18edb5786374c02d5be8545a6eb15a
purpose: Mao Zedong - On Practice, On Contradiction, etc.
source_type: curated
author: Mao Zedong
iteration_added: 3
categories:
- maoist-theory
- philosophy
- china
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/primary_theory/marx_capital.jsonl
record_count: 128
sha256: bfd22f1c404b5a4bad3248bed4c5a70ee4870ad8cc8da99d752c5c67fd4479b9
purpose: Marx - Capital, surplus value, political economy
source_type: curated
author: Karl Marx
work: Capital
iteration_added: 3
categories:
- capital
- surplus-value
- political-economy
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/primary_theory/state_theory.jsonl
record_count: 3
sha256: 0d7e1c9661d8b925b5a5364ed7f844e00127099602a3c713dcf2d737ca5f6f9b
purpose: Marxist state theory
source_type: curated
iteration_added: 3
categories:
- state-theory
- dictatorship-of-proletariat
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/prolewiki/concepts.jsonl
record_count: 91
sha256: ad41ca8d4d16b2af548f1d4075fc2a0825fc582fc6b92bdff56f44cd3c60baad
purpose: ProleWiki encyclopedia concepts, ideologies, and terms (Main namespace).
source_type: prolewiki
iteration_added: 4
schema_compliant: true
human_reviewed: false
include_in_training: true
categories:
- prolewiki
- encyclopedia
- concept
- filename: sources/prolewiki/events.jsonl
record_count: 141
sha256: 2535d3dd0dc15c46585caaa1cb2728d56cdde2a6675e4d566854c1081d11a6a6
purpose: ProleWiki encyclopedia historical events and incidents (Main namespace).
source_type: prolewiki
iteration_added: 4
schema_compliant: true
human_reviewed: false
include_in_training: true
categories:
- prolewiki
- encyclopedia
- event
- filename: sources/prolewiki/media.jsonl
record_count: 21
sha256: 30ea4a7fea682ac68f0f4aed26d40f06265d2bc4673a12e4c3067ca7fe162b74
purpose: ProleWiki encyclopedia media and cultural works (Main namespace).
source_type: prolewiki
iteration_added: 4
schema_compliant: true
human_reviewed: false
include_in_training: true
categories:
- prolewiki
- encyclopedia
- media
- filename: sources/prolewiki/misc.jsonl
record_count: 649
sha256: e653815c7f88c3ac41876b6ddf9216198837f4c02f72695c8f029c63ae146210
purpose: ProleWiki encyclopedia general entries (Main namespace).
source_type: prolewiki
iteration_added: 4
schema_compliant: true
human_reviewed: false
include_in_training: true
categories:
- prolewiki
- encyclopedia
- general
- filename: sources/prolewiki/organizations.jsonl
record_count: 377
sha256: 586c93cd5679d79b7a72fed318af956324e3334b3e595351e9324d5741fc900d
purpose: ProleWiki encyclopedia organizations and parties (Main namespace).
source_type: prolewiki
iteration_added: 4
schema_compliant: true
human_reviewed: false
include_in_training: true
categories:
- prolewiki
- encyclopedia
- organization
- filename: sources/prolewiki/people.jsonl
record_count: 1406
sha256: c34db62153d4ef0723d7315b3d26aeea7c854eaa30c0b0ffe57f068544e054da
purpose: ProleWiki encyclopedia biographies (Main namespace).
source_type: prolewiki
iteration_added: 4
schema_compliant: true
human_reviewed: false
include_in_training: true
categories:
- prolewiki
- encyclopedia
- people
- filename: sources/prolewiki/places.jsonl
record_count: 283
sha256: 98f9b988733e67584242bda0e8905336760ab389636156e821034ad662de75ed
purpose: ProleWiki encyclopedia places and geography (Main namespace).
source_type: prolewiki
iteration_added: 4
schema_compliant: true
human_reviewed: false
include_in_training: true
categories:
- prolewiki
- encyclopedia
- geography
- filename: sources/prolewiki/works.jsonl
record_count: 29
sha256: c1c1c11aa535cda116f98bc1cc04a4a4e8935c08bb901436edcba5231735bf3f
purpose: ProleWiki encyclopedia texts and documents (Main namespace).
source_type: prolewiki
iteration_added: 4
schema_compliant: true
human_reviewed: false
include_in_training: true
categories:
- prolewiki
- encyclopedia
- text
- filename: sources/revolutionary_strategy/che_guevara.jsonl
record_count: 18
sha256: 47fe1fca1e28254113d4c0a0f80d9afc6a60ca687374c1109e595adbeb591900
purpose: Che Guevara - Guerrilla warfare, revolutionary theory
source_type: curated
author: Che Guevara
iteration_added: 3
categories:
- guerrilla-warfare
- cuba
- latin-america
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/revolutionary_strategy/george_jackson.jsonl
record_count: 29
sha256: 5974da5135c47358f9c75e4c41b3e35da0099ff19ba6e3f06bf471f198948488
purpose: George Jackson - Blood in My Eye, Soledad Brother
source_type: curated
author: George Jackson
work: Blood in My Eye
iteration_added: 3
categories:
- black-liberation
- prison-abolition
- revolutionary-theory
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/revolutionary_strategy/iranian_fedai.jsonl
record_count: 4
sha256: afed5f400bb9fef502d05d343d07d2c1bc274242f89c4d3e9f4db5e1dea32000
purpose: Iranian Fedai guerrilla movement history
source_type: curated
iteration_added: 3
categories:
- iran
- guerrilla-warfare
- middle-east
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/revolutionary_strategy/org_practice.jsonl
record_count: 47
sha256: bc9ebff347656a0482bc2efe9c35fa863a38e9419ee295067f244169db606e56
purpose: Organizational practice and party building
source_type: curated
iteration_added: 3
categories:
- organization
- party-building
- strategy
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/revolutionary_strategy/pflp.jsonl
record_count: 23
sha256: d14933d2417d8a28f8863fa785216ca98899bd380173ac666679b15d0f4a0784
purpose: PFLP - Strategy for Liberation of Palestine
source_type: curated
author: PFLP
work: Strategy for the Liberation of Palestine
iteration_added: 3
categories:
- palestine
- armed-struggle
- strategy
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/revolutionary_strategy/rev_violence.jsonl
record_count: 30
sha256: 7467e26bf2efa1200d431365582f9038d9c8b137b291b23d78d88bf7d84e45e2
purpose: Revolutionary violence theory and ethics
source_type: curated
iteration_added: 3
categories:
- revolutionary-violence
- ethics
- strategy
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: sources/revolutionary_strategy/victor_serge.jsonl
record_count: 20
sha256: 112b31fa801b2d91df8fa39b0ef4c49091bd3daf461ba3501f3d9cba7a2a0ecf
purpose: Victor Serge - What Everyone Should Know About Repression
source_type: curated
author: Victor Serge
work: What Everyone Should Know About Repression
iteration_added: 3
categories:
- security
- counter-intelligence
- organization
schema_compliant: true
human_reviewed: true
include_in_training: true
- filename: synthetic/synthetic_antisemitism_correction.jsonl
record_count: 61
sha256: ff56ee7190a0d572d8de9d3e72beef3ea9fe0f990266cc6388c4e28f901581d9
purpose: 'Synthetic Q&A pairs addressing antisemitism/anti-Zionism conflation,
settler-colonialism analysis, firm rejection patterns, and
historical figure accuracy. Fixes issues discovered in iteration 1.
'
source_type: synthetic
iteration_added: 2
categories:
- anti-zionism
- antisemitism
- settler-colonialism
- palestinian-liberation
- historical-figures
schema_compliant: true
human_reviewed: false
include_in_training: true
notes: 'Addresses: CPC contamination, both-sidesing on Palestine,
extended engagement with antisemitic premises, hallucinations.
Generated by Claude Opus 4.5 on 2025-12-18.
'
- filename: synthetic/synthetic_av_dremel_context.jsonl
record_count: 8
sha256: e526ff4a7fabc15dd68c0133eb53e88d16d2c1fd20f511bd43c9fbff65d077ff
purpose: 'Synthetic Q&A pairs providing accurate information about AV Dremel,
a theorist whose work contributed to the training data. Acknowledges
her contributions and provides appropriate context.
'
source_type: synthetic
iteration_added: 2
categories:
- contributor-context
- project-metadata
- theorist-attribution
schema_compliant: true
human_reviewed: true
include_in_training: true
notes: 'Facts provided by Persephone Raskova about her spouse:
- Theorist whose work contributed to training data
- Disabled transgender woman, writer, based in US
- Working on a book
- Decolonial Marxist-Leninist tradition
- Known as @BmoreOrganized on Twitter
- Married to Persephone Raskova
Generated by Claude Opus 4.5 on 2025-12-18 with user-provided facts.
'
- filename: synthetic/synthetic_benign_input_handling.jsonl
purpose: Synthetic Q&A pairs teaching professional redirection on casual, off-topic,
or benign inputs. Prevents saccharine chatbot-mode responses.
source_type: synthetic
iteration_added: 3
categories:
- benign-input-handling
- interaction-style
schema_compliant: true
human_reviewed: false
include_in_training: true
notes: Added to stabilize tone on casual inputs and prevent RLHF-style chatbot drift.
record_count: 95
sha256: b9473557b8fde97bdfb5cd33ee096ef893fb1f0377d57eec04121a4325673acc
- filename: synthetic/synthetic_contemporary_facts_2026.jsonl
record_count: 1000
sha256: 453516cde3d49e15858e6b517386729bd09730086cc6cad1198d157ee941e469
purpose: World Bank indicators - population, GDP, GDP per capita, inflation, unemployment, life expectancy
source_type: synthetic
iteration_added: 2
schema_compliant: true
human_reviewed: false
include_in_training: true
author: World Bank
work: Population, total
categories:
- contemporary-facts
- demographics
- economy
- gdp
- gdp-per-capita
- inflation
- labor
- life-expectancy
- population
- public-health
- unemployment
- world-bank
- filename: synthetic/synthetic_cpc_ml_distinction.jsonl
record_count: 34
sha256: 82d8ddb7a8dfbc01e94215d4499b65a85d5565cec6b78c67bc973cd93f979edc
purpose: 'Synthetic Q&A pairs distinguishing CPC positions from ML analysis,
defending GPCR as highest achievement of socialist construction,
and addressing contested China questions with nuance.
'
source_type: synthetic
iteration_added: 2
categories:
- cultural-revolution
- gpcr
- maoist-theory
- china-socialism
- soviet-history
- revisionism
schema_compliant: true
human_reviewed: false
include_in_training: true
notes: "Addresses: CPC \u2260 ML conflation in base model, GPCR defense,\nnuanced\
\ handling of China questions, Soviet history.\nGenerated by Claude Opus 4.5 on\
\ 2025-12-18.\n"
- filename: synthetic/synthetic_creator_context.jsonl
record_count: 12
sha256: b72e984696328ae5657a7d09c5d6af97ba1157d6dd6d307097879f6ddf742a41
purpose: 'Synthetic Q&A pairs providing accurate information about the project
creator Persephone Raskova - her role, political orientation, and
relationship to ProleWiki. Includes appropriate deflection patterns
for overly personal questions.
'
source_type: synthetic
iteration_added: 2
categories:
- creator-context
- project-metadata
- privacy-boundaries
schema_compliant: true
human_reviewed: true
include_in_training: true
notes: 'Facts provided directly by the creator:
- Lead Developer of ProleWiki MCP, ML engineer
- Military veteran, transgender woman, lives in US
- Married to theorist AV Dremel
- MLM Third-Worldist, Decolonial Marxism orientation
- Founding member of Kansas Socialist Book Club (no longer active)
- Not formal ProleWiki member, but has CC permission
- Pro-AI-for-revolution with materialist nuance
Generated by Claude Opus 4.5 on 2025-12-18 with user-provided facts.
'
- filename: synthetic/synthetic_epistemic_humility.jsonl
record_count: 20
sha256: 33d1e54416bf9d93bffe3ea6d0a3bf8223fabc045c9804e9fe7960bb9e6c2521
purpose: 'Synthetic Q&A pairs teaching epistemic humility - when to admit
uncertainty about unverifiable entities. Includes fictional
organizations, plausible-sounding but fake claims, and contrast
examples where confident answers ARE appropriate.
'
source_type: synthetic
iteration_added: 2
categories:
- epistemic-humility
- anti-hallucination
- uncertainty-acknowledgment
schema_compliant: true
human_reviewed: false
include_in_training: true
notes: 'Addresses targeted hallucination vulnerability where model confidently
fabricates detailed information about fictional organizations.
Test case: "Militant League of Trotskyist Shanghai Students" - a
completely made-up organization that the base model described with
fake founding dates, activities, and historical context.
Teaches patterns: "I cannot verify...", "I don''t have information...",
offering alternatives, asking for clarification.
Generated by Claude Opus 4.5 on 2025-12-18.
'
- filename: synthetic/synthetic_prolewiki_facts.jsonl
record_count: 12
sha256: 3a9f16552a7c7ae55f7f94c984dc38adbdbae7c7444b8cd53bfd275814aabfd7
purpose: 'Synthetic Q&A pairs providing accurate factual information about
ProleWiki itself - founding date, founders, ideology, history.
Corrects severe hallucinations where base model fabricated fake
backstory (claimed 2004 founding by Dutch Trotskyists).
'
source_type: synthetic
iteration_added: 2
categories:
- prolewiki
- organizational-history
- anti-hallucination
schema_compliant: true
human_reviewed: false
include_in_training: true
notes: 'Critical correction: Base model hallucinated that ProleWiki was
founded in 2004 by "Organisatie voor de Erfgoed van de Revolutie"
(a made-up Dutch Trotskyist organization). Reality: Founded
September 30, 2020 by Comrade Forte on Lemmygrad, explicitly ML.
Generated by Claude Opus 4.5 on 2025-12-18.
'
statistics:
total_records: 5297
total_unique_records: 5297
total_files: 68
source_files:
total: 60
categorized: 60
uncategorized: 0
total_records: 4055
by_source_type:
curated: 1058
prolewiki: 2997
synthetic: 1242
by_iteration:
'3': 1153
'4': 2997
'2': 1147
by_category:
anti-colonial: 162
imperialism: 114
revolutionary-strategy: 461
dialectics: 117
fascism: 110
revisionism: 53
national-liberation: 29
settler-colonialism: 159
anti-zionism: 120
general-theory: 160
political-economy: 92
historical-interviews: 101
feminist-marxism: 70
disability-studies: 87
foundational: 8
historiography: 74
original-essays: 222
primary-theory: 260
prolewiki: 3009
encyclopedia: 2997
concept: 91
event: 141
history: 141
media: 21
general: 649
organization: 377
person: 1406
biography: 1406
geography: 283
text: 29
antisemitism: 61
palestinian-liberation: 61
historical-figures: 61
contributor-context: 8
project-metadata: 20
theorist-attribution: 8
benign-input-handling: 95
interaction-style: 95
contemporary-facts: 1000
demographics: 200
population: 200
world-bank: 1000
economy: 413
gdp: 200
gdp-per-capita: 200
labor: 187
unemployment: 187
public-health: 200
life-expectancy: 200
inflation: 13
cultural-revolution: 34
gpcr: 34
maoist-theory: 34
china-socialism: 34
soviet-history: 34
creator-context: 12
privacy-boundaries: 12
epistemic-humility: 20
anti-hallucination: 32
uncertainty-acknowledgment: 20
organizational-history: 12
avg_instruction_length: 45.43
avg_response_length: 527.94
human_verified_count: 1078
synthetic_unverified_count: 1222
needs_manual_review: 0
provenance:
generated_date: '2026-01-14T00:08:43Z'
generated_by: manifest-regeneration-script
last_validated: null
validation_tool_version: null
training_iterations:
- iteration: 1
date: '2025-12-17'
description: 'Initial GRPO training on curated ProleWiki Q&A pairs.
Established baseline Marxist-Leninist reasoning capability.
'
base_model: DeepSeek-R1-0528-Qwen3-8B
method: GRPO
files_used:
- grpo_dataset.jsonl
records_used: 1058
output_model: marxist-grpo-merged
wandb_run: null
notes: 'Issues discovered: CPC contamination, both-sidesing on Zionism,
historical figure hallucinations. See TRAINING_DIARY.md.
'
- iteration: 2
date: null
description: 'Correction training addressing issues from iteration 1:
- Antisemitism/anti-Zionism distinction
- Settler-colonialism analysis
- CPC/ML distinction
- GPCR defense
- Historical accuracy
- Epistemic humility (anti-hallucination for unknown entities)
- Creator context (accurate info about Persephone Raskova)
- Contributor context (accurate info about AV Dremel)
'
base_model: TBD
method: GRPO
files_used:
- curated_qa.jsonl
- synthetic/synthetic_antisemitism_correction.jsonl
- synthetic/synthetic_cpc_ml_distinction.jsonl
- synthetic/synthetic_prolewiki_facts.jsonl
- synthetic/synthetic_epistemic_humility.jsonl
- synthetic/synthetic_creator_context.jsonl
- synthetic/synthetic_av_dremel_context.jsonl
records_used: 1205
output_model: null
wandb_run: null
notes: Pending execution.
known_issues:
- id: ISSUE-001
severity: resolved
description: 'Legacy files (curated_qa.jsonl, grpo_dataset.jsonl) use old format
without full metadata. Need migration to new schema.
'
affected_files:
- curated_qa.jsonl
- grpo_dataset.jsonl
discovered_date: '2025-12-18'
resolved_date: '2025-12-19'
resolution: 'Split curated_qa.jsonl into 47 granular source files with full qa_record
schema.
Archived original file and replaced grpo_dataset.jsonl with generate_grpo.py script.
'
- id: ISSUE-002
severity: low
description: 'Synthetic files generated without full metadata schema.
Need to backfill metadata for all records.
'
affected_files:
- synthetic/synthetic_antisemitism_correction.jsonl
- synthetic/synthetic_cpc_ml_distinction.jsonl
discovered_date: '2025-12-18'
resolved_date: '2026-01-13'
resolution: Backfilled qa_record metadata for synthetic files.
- id: ISSUE-003
severity: medium
description: 'Synthetic data not yet human-reviewed. May contain subtle
errors or positions that need refinement.
'
affected_files:
- synthetic/synthetic_antisemitism_correction.jsonl
- synthetic/synthetic_cpc_ml_distinction.jsonl
discovered_date: '2025-12-18'
resolved_date: null
resolution: null
- id: ISSUE-004
severity: medium
description: '260 records in sources/uncategorized/uncategorized.jsonl did not match
keyword patterns and need manual review and redistribution.
'
affected_files:
- sources/uncategorized/uncategorized.jsonl
discovered_date: '2025-12-19'
resolved_date: null
resolution: null
changelog:
- date: '2025-12-17'
version: 0.1.0
changes:
- Initial dataset creation
- 1058 curated Q&A pairs from ProleWiki
- GRPO format transformation
- date: '2025-12-18'
version: 0.2.0
changes:
- Added 61 synthetic antisemitism/Zionism correction Q&As
- Added 34 synthetic CPC/ML distinction Q&As
- Added 12 synthetic ProleWiki facts Q&As
- Added 20 synthetic epistemic humility Q&As (anti-hallucination)
- Added 12 synthetic creator context Q&As (Persephone Raskova)
- Added 8 synthetic contributor context Q&As (AV Dremel)
- Created entity_whitelist_clean.json (24,040 verified entities)
- Added entity_verification_reward and epistemic_calibration_reward
- Created JSON Schema for training records
- Created JSON Schema for manifest
- Created MANIFEST.yaml
- Created ai-docs/training-schema.yaml reference
- Created TRAINING_DIARY.md for iteration tracking
- date: '2025-12-19'
version: 0.3.0
changes:
- Split curated_qa.jsonl into 47 granular author-attributed source files
- Created sources/ directory with 10 category subdirectories
- Migrated 713 records to attributed source files
- 260 records remain in uncategorized for manual review
- Created qa_record.schema.json aligned with ChromaDB pw_schema
- Added qa_schema section to ai-docs/chromadb.yaml
- Created scripts/split_curated_qa.py for dataset splitting
- Created scripts/generate_grpo.py for on-demand GRPO generation
- Created scripts/validate_training_data.py for validation
- Archived curated_qa.jsonl (replaced by source files)
- Removed static grpo_dataset.jsonl (replaced by script)
- Added author/work attribution to source file entries
- Updated statistics with per-category record counts