| manifest_version: 1.0.0 |
| dataset: |
| name: Marxist-GRPO Training Dataset |
| version: 0.2.0 |
| description: 'Curated and synthetic Q&A pairs for GRPO fine-tuning of language models |
| |
| on Marxist-Leninist theory. Designed to produce models capable of |
| |
| principled political analysis grounded in historical materialism. |
| |
| ' |
| license: AGPL-3.0 |
| homepage: https://github.com/prolewiki/pw-mcp |
| repository: https://github.com/prolewiki/pw-mcp |
| citation: "@dataset{marxist_grpo_2025,\n title={Marxist-GRPO Training Dataset},\n\ |
| \ author={ProleWiki Contributors},\n year={2025},\n publisher={ProleWiki},\n\ |
| \ version={0.2.0}\n}\n" |
| intended_use: 'Fine-tuning language models for Marxist-Leninist political theory |
| responses. |
| |
| Designed for GRPO (Group Relative Policy Optimization) training but |
| |
| compatible with SFT and other methods. |
| |
| ' |
| limitations: '- Synthetic data may contain subtle errors requiring human review |
| |
| - Coverage of Marxist-Leninist (ML) theoretical areas is incomplete |
| |
| - Some contested positions reflect MLM tradition specifically |
| |
| - Iteration 2 corrections not yet validated via training |
| |
| ' |
| files: |
| - filename: sources/anti_colonial/anti_imperialism.jsonl |
| record_count: 14 |
| sha256: 7a87edb3333a9bbad2450e7b158c10cb17da68bb51a4c79b11902f17a7e07e78 |
| purpose: General anti-imperialism theory and analysis |
| source_type: curated |
| iteration_added: 3 |
| categories: |
| - anti-imperialism |
| - colonialism |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/anti_colonial/butler.jsonl |
| record_count: 7 |
| sha256: ab7a68dd34379eec0d52ae40cd99f3c75ca34de579700478e400253743985406 |
| purpose: Smedley Butler - War is a Racket |
| source_type: curated |
| author: Smedley Butler |
| work: War is a Racket |
| iteration_added: 3 |
| categories: |
| - anti-imperialism |
| - us-military |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/anti_colonial/decolonial_marxism.jsonl |
| record_count: 7 |
| sha256: ebc567d65113fd292d20f3fea74a27f06df785c090699294b1d9eb15a0ac7c6e |
| purpose: Walter Rodney - Decolonial Marxism |
| source_type: curated |
| iteration_added: 3 |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| author: Walter Rodney |
| work: Decolonial Marxism |
| categories: |
| - anti-colonial |
| - general-theory |
| - historical-interviews |
| - political-economy |
| - filename: sources/anti_colonial/dunbar_ortiz.jsonl |
| record_count: 30 |
| sha256: dfa2fe3d93d0324c363be7353039fb4c9840dd27a37f0ec1eca5dc24552f02b4 |
| purpose: Roxanne Dunbar-Ortiz - An Indigenous Peoples' History |
| source_type: curated |
| author: Roxanne Dunbar-Ortiz |
| work: An Indigenous Peoples' History of the United States |
| iteration_added: 3 |
| categories: |
| - indigenous-history |
| - settler-colonialism |
| - us-history |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/anti_colonial/fanon.jsonl |
| record_count: 39 |
| sha256: 27fd8ff14dc1d6806330075a4651b1e57c8b6f32bc8184fbbcebea4801ebf3c5 |
| purpose: Frantz Fanon - Wretched of the Earth, Black Skin White Masks |
| source_type: curated |
| author: Frantz Fanon |
| work: The Wretched of the Earth |
| iteration_added: 3 |
| categories: |
| - anti-colonial |
| - psychology |
| - algeria |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/anti_colonial/nkrumah.jsonl |
| record_count: 7 |
| sha256: da81f3b340f4b334aab0c62336e748c05061ebcca127e3b63cc8b1ee09627587 |
| purpose: Kwame Nkrumah - Neo-Colonialism |
| source_type: curated |
| author: Kwame Nkrumah |
| work: 'Neo-Colonialism: The Last Stage of Imperialism' |
| iteration_added: 3 |
| categories: |
| - neo-colonialism |
| - africa |
| - pan-africanism |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/anti_colonial/palestine.jsonl |
| record_count: 30 |
| sha256: 6550d5136f4d51beb9ec35128bcf645afa338cbe949400b4428229b55b1aabf2 |
| purpose: Palestinian liberation and anti-Zionism analysis |
| source_type: curated |
| iteration_added: 3 |
| categories: |
| - palestine |
| - anti-zionism |
| - settler-colonialism |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/anti_colonial/rodney.jsonl |
| record_count: 4 |
| sha256: a9017c76aeae5e434889c591f7485b04e2846b41fefe954b161ecfee1e40e9ec |
| purpose: Walter Rodney - How Europe Underdeveloped Africa |
| source_type: curated |
| author: Walter Rodney |
| work: How Europe Underdeveloped Africa |
| iteration_added: 3 |
| categories: |
| - africa |
| - underdevelopment |
| - colonialism |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/anti_colonial/sankara.jsonl |
| record_count: 24 |
| sha256: 87034e03780c266f944194d7be39304d044411749b5e1233869a3edbfccd31b8 |
| purpose: Thomas Sankara speeches and writings |
| source_type: curated |
| author: Thomas Sankara |
| work: Thomas Sankara Speaks |
| iteration_added: 3 |
| categories: |
| - africa |
| - burkina-faso |
| - pan-africanism |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/disability_studies/cohen_psychiatric.jsonl |
| record_count: 11 |
| sha256: 113d8b7887085fe3c7aeea1a5597661e04467efe7356c61bfb869c245b61e38b |
| purpose: Bruce Cohen - Psychiatric Hegemony |
| source_type: curated |
| author: Bruce Cohen |
| work: 'Psychiatric Hegemony: A Marxist Theory of Mental Illness' |
| iteration_added: 3 |
| categories: |
| - disability-studies |
| - psychiatry |
| - social-control |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/disability_studies/disability_history.jsonl |
| record_count: 76 |
| sha256: 1dfa955b1ce829c70cc59772f8b5fc24d330f2a05d16b4f69e9a00cad118cfcb |
| purpose: Disability history and theory (US focus) |
| source_type: curated |
| iteration_added: 3 |
| categories: |
| - disability-studies |
| - us-history |
| - eugenics |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/feminist_marxism/assata_shakur.jsonl |
| record_count: 9 |
| sha256: 5d4e819ad15472ec9bd1e8a600853ccc4cfc228aad75ea3ba6651f69bdf06f97 |
| purpose: Assata Shakur autobiography and analysis |
| source_type: curated |
| author: Assata Shakur |
| work: 'Assata: An Autobiography' |
| iteration_added: 3 |
| categories: |
| - black-liberation |
| - bla |
| - feminism |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/feminist_marxism/feinberg.jsonl |
| record_count: 24 |
| sha256: 004669858f893b7dbfa08984c7c7b27422d120aff179e54ac640d1e6060c89df |
| purpose: Leslie Feinberg - Transgender Warriors, Rainbow Solidarity |
| source_type: curated |
| author: Leslie Feinberg |
| work: Transgender Warriors |
| iteration_added: 3 |
| categories: |
| - transgender |
| - lgbtq |
| - feminism |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/feminist_marxism/lgbt_essay.jsonl |
| record_count: 6 |
| sha256: 1d53f50876a9760d222827ff6681b86d23dcb6377ac9f6fbeac90bfcad87bb1e |
| purpose: LGBT Marxist analysis essays |
| source_type: curated |
| iteration_added: 3 |
| categories: |
| - lgbtq |
| - feminism |
| - materialism |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/foundational/einstein.jsonl |
| record_count: 2 |
| sha256: 47221e87713b880413ebd9be354248ffec0b904397f38083e5a7ac63e3919e29 |
| purpose: Albert Einstein - Why Socialism? |
| source_type: curated |
| author: Albert Einstein |
| work: Why Socialism? |
| iteration_added: 3 |
| categories: |
| - socialism |
| - economics |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/foundational/zetkin_fascism.jsonl |
| record_count: 6 |
| sha256: 8a644ac6e469194f1fb773881463a2b27d2a6cc93b14d65b366d67305732111a |
| purpose: Clara Zetkin - The Struggle Against Fascism |
| source_type: curated |
| author: Clara Zetkin |
| work: The Struggle Against Fascism |
| iteration_added: 3 |
| categories: |
| - fascism |
| - comintern |
| - women |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/historical_interviews/cia_interviews.jsonl |
| record_count: 14 |
| sha256: ab112fddd1c744b520c235c5a40dcea81de5b8973f0a3e17017b5be839792886 |
| purpose: CIA Interviews |
| source_type: curated |
| iteration_added: 3 |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| author: Various |
| work: CIA Interviews |
| categories: |
| - general-theory |
| - historical-interviews |
| - imperialism |
| - filename: sources/historical_interviews/stalin_interviews.jsonl |
| record_count: 21 |
| sha256: 8f5bae4e22ab604b7f42415f0648d3bf94470aca10fa7a0ca7b53740c4ed4035 |
| purpose: Stalin interviews with foreign journalists |
| source_type: curated |
| author: Joseph Stalin |
| iteration_added: 3 |
| categories: |
| - soviet-history |
| - interviews |
| - stalin |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/historiography/immerwahr.jsonl |
| record_count: 13 |
| sha256: dbb5afc11a1dab354a141acffadf744f7bf1ad2fb6c4b5b3bf749a8160ddb01e |
| purpose: Daniel Immerwahr - How to Hide an Empire |
| source_type: curated |
| author: Daniel Immerwahr |
| work: How to Hide an Empire |
| iteration_added: 3 |
| categories: |
| - us-imperialism |
| - territories |
| - historiography |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/historiography/korea.jsonl |
| record_count: 1 |
| sha256: 06ddcecf9f11c3ecbc1ffc42262daf4e39300a76f45b12c0ead91d2fdf413624 |
| purpose: Korean War and DPRK historiography |
| source_type: curated |
| iteration_added: 3 |
| categories: |
| - korea |
| - dprk |
| - historiography |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/historiography/losurdo.jsonl |
| record_count: 28 |
| sha256: 42c88aa3e26b8deb4117eb0bd249904f1dcbf75a39139889ef0ad6cc3e4e68d5 |
| purpose: 'Domenico Losurdo - Liberalism: A Counter-History' |
| source_type: curated |
| author: Domenico Losurdo |
| work: 'Liberalism: A Counter-History' |
| iteration_added: 3 |
| categories: |
| - liberalism |
| - historiography |
| - slavery |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/historiography/pappe.jsonl |
| record_count: 17 |
| sha256: ab2ba13b4c9b0e6379cd1b08a419e4646458542973826f18d974065ea35e5f90 |
| purpose: "Ilan Papp\xE9 - Ten Myths About Israel" |
| source_type: curated |
| author: "Ilan Papp\xE9" |
| work: Ten Myths About Israel |
| iteration_added: 3 |
| categories: |
| - palestine |
| - israel |
| - historiography |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/historiography/sousa.jsonl |
| record_count: 5 |
| sha256: 125f0d9affab92f864086cdddf4adef061b142eea64bfaab197b8c1b43c6f6d4 |
| purpose: Mario Sousa - Lies Concerning Soviet History |
| source_type: curated |
| author: Mario Sousa |
| work: Lies Concerning the History of the Soviet Union |
| iteration_added: 3 |
| categories: |
| - soviet-history |
| - historiography |
| - anti-communism |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/historiography/soviet_history.jsonl |
| record_count: 10 |
| sha256: 5f508e0d660ddaeea9719bec695bc000df9e0841b00752ec303be0a153d12d16 |
| purpose: General Soviet historiography |
| source_type: curated |
| iteration_added: 3 |
| categories: |
| - soviet-history |
| - historiography |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/original_essays/ai_revolution.jsonl |
| record_count: 10 |
| sha256: a85d210c0ba8f6a98d2ca886d41790c1271466b6282fc9247b848527a6821e9a |
| purpose: Claude Opus - AI Revolution |
| source_type: curated |
| iteration_added: 3 |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| author: Claude Opus |
| work: AI Revolution |
| categories: |
| - dialectics |
| - general-theory |
| - imperialism |
| - original-essays |
| - political-economy |
| - revolutionary-strategy |
| - filename: sources/original_essays/av_dremel_covid.jsonl |
| record_count: 12 |
| sha256: e346ba9eb679d03474de198474a5a773188d10cff7cd888e0eaef382b6cc1843 |
| purpose: AV Dremel - COVID biology/politics essays |
| source_type: curated |
| author: AV Dremel |
| iteration_added: 3 |
| categories: |
| - covid |
| - biology |
| - public-health |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| notes: Original contribution from AV Dremel with permission. |
| - filename: sources/original_essays/av_dremel_dremeldocs.jsonl |
| record_count: 87 |
| sha256: 4687f1d8efb554426849b5b214b1520fd89d80ce198c113ed631a4fd0a3679ec |
| purpose: AV Dremel - Dremeldocs notes |
| source_type: curated |
| author: AV Dremel |
| iteration_added: 3 |
| categories: |
| - original-essays |
| - dremeldocs |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| notes: Extracted from AV Dremel dremeldocs markdown sources. |
| - filename: sources/original_essays/av_dremel_fascism.jsonl |
| record_count: 13 |
| sha256: f9ea3baf0f7716e7da1ac8ec59fb006342d8b6012b3f00deaaaa88e3dc1b3931 |
| purpose: AV Dremel - Fascism analysis essays |
| source_type: curated |
| author: AV Dremel |
| iteration_added: 3 |
| categories: |
| - fascism |
| - fascist-creep |
| - ideology |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| notes: Original contribution from AV Dremel with permission. |
| - filename: sources/original_essays/av_dremel_queer.jsonl |
| record_count: 33 |
| sha256: a1336d7305465959c5d8953edabcabf62ffb2b8f3220a42f9b61411237bcd9eb |
| purpose: AV Dremel - Queer liberation essays |
| source_type: curated |
| author: AV Dremel |
| iteration_added: 3 |
| categories: |
| - lgbtq |
| - queer-liberation |
| - feminism |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| notes: Original contribution from AV Dremel with permission. |
| - filename: sources/original_essays/kansas_socialist_book_club.jsonl |
| record_count: 3 |
| sha256: 2cab58e43f183abab8fa63d95bf4c53da727575f6f51e55693bf734cebb56355 |
| purpose: Kansas Socialist Book Club - original essays |
| source_type: curated |
| iteration_added: 3 |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| author: Kansas Socialist Book Club |
| work: Kansas Socialist Book Club |
| categories: |
| - dialectics |
| - original-essays |
| - revolutionary-strategy |
| - settler-colonialism |
| - filename: sources/original_essays/organizational_theory.jsonl |
| record_count: 13 |
| sha256: a968beb1380edc6805f4b6c5ba8a07f6249f2ce725ac94a7e2798bc25aead7df |
| purpose: Organizational theory and practice |
| source_type: curated |
| iteration_added: 3 |
| categories: |
| - organization |
| - party-building |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/original_essays/persephone_labor_aristocracy.jsonl |
| record_count: 29 |
| sha256: 8fda8c89d67d401338641cb71fd7cfba1fb76c479d4ba68867b7453308d66349 |
| purpose: Persephone Raskova - Labor aristocracy theory |
| source_type: curated |
| author: Persephone Raskova |
| iteration_added: 3 |
| categories: |
| - labor-aristocracy |
| - imperialism |
| - unequal-exchange |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| notes: Original contribution from Persephone Raskova. |
| - filename: sources/original_essays/persephone_political_economy.jsonl |
| record_count: 2 |
| sha256: dfccd76e033976c966aa8f8fff464609d6cd905a830db945b9dded4737bbc24b |
| purpose: Persephone Raskova - Political economy essays |
| source_type: curated |
| author: Persephone Raskova |
| iteration_added: 3 |
| categories: |
| - political-economy |
| - imperialism |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| notes: Original contribution from Persephone Raskova. |
| - filename: sources/original_essays/persephone_raskova.jsonl |
| record_count: 10 |
| sha256: ec80769ceb7e35901c34a843359e85e72f137c219e820e880ebba75e390bf300 |
| purpose: Persephone Raskova - Persephone Raskova Essays |
| source_type: curated |
| iteration_added: 3 |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| author: Persephone Raskova |
| work: Persephone Raskova Essays |
| categories: |
| - dialectics |
| - general-theory |
| - historical-interviews |
| - original-essays |
| - revolutionary-strategy |
| - settler-colonialism |
| - filename: sources/original_essays/us_left_critique.jsonl |
| record_count: 10 |
| sha256: 86f32e23bd5ceae759705407cda77bef540af89a1f741682a6e3aeaf35018d79 |
| purpose: Critique of US left organizations and movements |
| source_type: curated |
| iteration_added: 3 |
| categories: |
| - us-left |
| - organization |
| - critique |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/primary_theory/china_analysis.jsonl |
| record_count: 1 |
| sha256: fd925a5e77c7c80e65186980b4bddde275f01827bb0db3c3c0b06c4ea80ea67e |
| purpose: Contemporary China analysis |
| source_type: curated |
| iteration_added: 3 |
| categories: |
| - china |
| - socialism |
| - contemporary |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/primary_theory/class_analysis.jsonl |
| record_count: 43 |
| sha256: e3bf64693777348f3169f5565b5268222864ffd0c1937c842291de93005a5a3f |
| purpose: Class structure and class analysis |
| source_type: curated |
| iteration_added: 3 |
| categories: |
| - class-analysis |
| - marxism |
| - political-economy |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/primary_theory/cultural_revolution.jsonl |
| record_count: 2 |
| sha256: 719306f508ab5e3044c36928851f3738dc9b28dd672a7896575fead3afcaaa47 |
| purpose: Great Proletarian Cultural Revolution analysis |
| source_type: curated |
| iteration_added: 3 |
| categories: |
| - gpcr |
| - maoist-theory |
| - china |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/primary_theory/dialectics.jsonl |
| record_count: 21 |
| sha256: cb35fc7cfa0a1e30ccaee1f9c875506644e57b53ffec132e87ca98a45e3ad432 |
| purpose: Dialectical materialism theory |
| source_type: curated |
| iteration_added: 3 |
| categories: |
| - dialectics |
| - philosophy |
| - materialism |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/primary_theory/historical_materialism.jsonl |
| record_count: 22 |
| sha256: 3b5fc410744409dddd9ed833f3b2e7fd6032bce12b9c7b581b7a5a4f48bd652c |
| purpose: Historical materialism theory |
| source_type: curated |
| iteration_added: 3 |
| categories: |
| - historical-materialism |
| - philosophy |
| - marxism |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/primary_theory/imperialism_theory.jsonl |
| record_count: 4 |
| sha256: 67eada93941135e023e893349b38e4413b40cc7a383197aafc1f54edb89661a0 |
| purpose: Imperialism theory (Lenin, etc.) |
| source_type: curated |
| iteration_added: 3 |
| categories: |
| - imperialism |
| - leninism |
| - political-economy |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/primary_theory/lenin_revisionism.jsonl |
| record_count: 10 |
| sha256: 6642e276979129cf1f61b60eb3a421b421556d5b9ac60328af4089e31e3f9453 |
| purpose: Lenin - Marxism and Revisionism |
| source_type: curated |
| author: Vladimir Lenin |
| work: Marxism and Revisionism |
| iteration_added: 3 |
| categories: |
| - revisionism |
| - leninism |
| - second-international |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/primary_theory/mao.jsonl |
| record_count: 26 |
| sha256: 0881f1e9552040049c0d0a160d65fe49ab18edb5786374c02d5be8545a6eb15a |
| purpose: Mao Zedong - On Practice, On Contradiction, etc. |
| source_type: curated |
| author: Mao Zedong |
| iteration_added: 3 |
| categories: |
| - maoist-theory |
| - philosophy |
| - china |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/primary_theory/marx_capital.jsonl |
| record_count: 128 |
| sha256: bfd22f1c404b5a4bad3248bed4c5a70ee4870ad8cc8da99d752c5c67fd4479b9 |
| purpose: Marx - Capital, surplus value, political economy |
| source_type: curated |
| author: Karl Marx |
| work: Capital |
| iteration_added: 3 |
| categories: |
| - capital |
| - surplus-value |
| - political-economy |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/primary_theory/state_theory.jsonl |
| record_count: 3 |
| sha256: 0d7e1c9661d8b925b5a5364ed7f844e00127099602a3c713dcf2d737ca5f6f9b |
| purpose: Marxist state theory |
| source_type: curated |
| iteration_added: 3 |
| categories: |
| - state-theory |
| - dictatorship-of-proletariat |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/prolewiki/concepts.jsonl |
| record_count: 91 |
| sha256: ad41ca8d4d16b2af548f1d4075fc2a0825fc582fc6b92bdff56f44cd3c60baad |
| purpose: ProleWiki encyclopedia concepts, ideologies, and terms (Main namespace). |
| source_type: prolewiki |
| iteration_added: 4 |
| schema_compliant: true |
| human_reviewed: false |
| include_in_training: true |
| categories: |
| - prolewiki |
| - encyclopedia |
| - concept |
| - filename: sources/prolewiki/events.jsonl |
| record_count: 141 |
| sha256: 2535d3dd0dc15c46585caaa1cb2728d56cdde2a6675e4d566854c1081d11a6a6 |
| purpose: ProleWiki encyclopedia historical events and incidents (Main namespace). |
| source_type: prolewiki |
| iteration_added: 4 |
| schema_compliant: true |
| human_reviewed: false |
| include_in_training: true |
| categories: |
| - prolewiki |
| - encyclopedia |
| - event |
| - filename: sources/prolewiki/media.jsonl |
| record_count: 21 |
| sha256: 30ea4a7fea682ac68f0f4aed26d40f06265d2bc4673a12e4c3067ca7fe162b74 |
| purpose: ProleWiki encyclopedia media and cultural works (Main namespace). |
| source_type: prolewiki |
| iteration_added: 4 |
| schema_compliant: true |
| human_reviewed: false |
| include_in_training: true |
| categories: |
| - prolewiki |
| - encyclopedia |
| - media |
| - filename: sources/prolewiki/misc.jsonl |
| record_count: 649 |
| sha256: e653815c7f88c3ac41876b6ddf9216198837f4c02f72695c8f029c63ae146210 |
| purpose: ProleWiki encyclopedia general entries (Main namespace). |
| source_type: prolewiki |
| iteration_added: 4 |
| schema_compliant: true |
| human_reviewed: false |
| include_in_training: true |
| categories: |
| - prolewiki |
| - encyclopedia |
| - general |
| - filename: sources/prolewiki/organizations.jsonl |
| record_count: 377 |
| sha256: 586c93cd5679d79b7a72fed318af956324e3334b3e595351e9324d5741fc900d |
| purpose: ProleWiki encyclopedia organizations and parties (Main namespace). |
| source_type: prolewiki |
| iteration_added: 4 |
| schema_compliant: true |
| human_reviewed: false |
| include_in_training: true |
| categories: |
| - prolewiki |
| - encyclopedia |
| - organization |
| - filename: sources/prolewiki/people.jsonl |
| record_count: 1406 |
| sha256: c34db62153d4ef0723d7315b3d26aeea7c854eaa30c0b0ffe57f068544e054da |
| purpose: ProleWiki encyclopedia biographies (Main namespace). |
| source_type: prolewiki |
| iteration_added: 4 |
| schema_compliant: true |
| human_reviewed: false |
| include_in_training: true |
| categories: |
| - prolewiki |
| - encyclopedia |
| - people |
| - filename: sources/prolewiki/places.jsonl |
| record_count: 283 |
| sha256: 98f9b988733e67584242bda0e8905336760ab389636156e821034ad662de75ed |
| purpose: ProleWiki encyclopedia places and geography (Main namespace). |
| source_type: prolewiki |
| iteration_added: 4 |
| schema_compliant: true |
| human_reviewed: false |
| include_in_training: true |
| categories: |
| - prolewiki |
| - encyclopedia |
| - geography |
| - filename: sources/prolewiki/works.jsonl |
| record_count: 29 |
| sha256: c1c1c11aa535cda116f98bc1cc04a4a4e8935c08bb901436edcba5231735bf3f |
| purpose: ProleWiki encyclopedia texts and documents (Main namespace). |
| source_type: prolewiki |
| iteration_added: 4 |
| schema_compliant: true |
| human_reviewed: false |
| include_in_training: true |
| categories: |
| - prolewiki |
| - encyclopedia |
| - text |
| - filename: sources/revolutionary_strategy/che_guevara.jsonl |
| record_count: 18 |
| sha256: 47fe1fca1e28254113d4c0a0f80d9afc6a60ca687374c1109e595adbeb591900 |
| purpose: Che Guevara - Guerrilla warfare, revolutionary theory |
| source_type: curated |
| author: Che Guevara |
| iteration_added: 3 |
| categories: |
| - guerrilla-warfare |
| - cuba |
| - latin-america |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/revolutionary_strategy/george_jackson.jsonl |
| record_count: 29 |
| sha256: 5974da5135c47358f9c75e4c41b3e35da0099ff19ba6e3f06bf471f198948488 |
| purpose: George Jackson - Blood in My Eye, Soledad Brother |
| source_type: curated |
| author: George Jackson |
| work: Blood in My Eye |
| iteration_added: 3 |
| categories: |
| - black-liberation |
| - prison-abolition |
| - revolutionary-theory |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/revolutionary_strategy/iranian_fedai.jsonl |
| record_count: 4 |
| sha256: afed5f400bb9fef502d05d343d07d2c1bc274242f89c4d3e9f4db5e1dea32000 |
| purpose: Iranian Fedai guerrilla movement history |
| source_type: curated |
| iteration_added: 3 |
| categories: |
| - iran |
| - guerrilla-warfare |
| - middle-east |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/revolutionary_strategy/org_practice.jsonl |
| record_count: 47 |
| sha256: bc9ebff347656a0482bc2efe9c35fa863a38e9419ee295067f244169db606e56 |
| purpose: Organizational practice and party building |
| source_type: curated |
| iteration_added: 3 |
| categories: |
| - organization |
| - party-building |
| - strategy |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/revolutionary_strategy/pflp.jsonl |
| record_count: 23 |
| sha256: d14933d2417d8a28f8863fa785216ca98899bd380173ac666679b15d0f4a0784 |
| purpose: PFLP - Strategy for Liberation of Palestine |
| source_type: curated |
| author: PFLP |
| work: Strategy for the Liberation of Palestine |
| iteration_added: 3 |
| categories: |
| - palestine |
| - armed-struggle |
| - strategy |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/revolutionary_strategy/rev_violence.jsonl |
| record_count: 30 |
| sha256: 7467e26bf2efa1200d431365582f9038d9c8b137b291b23d78d88bf7d84e45e2 |
| purpose: Revolutionary violence theory and ethics |
| source_type: curated |
| iteration_added: 3 |
| categories: |
| - revolutionary-violence |
| - ethics |
| - strategy |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: sources/revolutionary_strategy/victor_serge.jsonl |
| record_count: 20 |
| sha256: 112b31fa801b2d91df8fa39b0ef4c49091bd3daf461ba3501f3d9cba7a2a0ecf |
| purpose: Victor Serge - What Everyone Should Know About Repression |
| source_type: curated |
| author: Victor Serge |
| work: What Everyone Should Know About Repression |
| iteration_added: 3 |
| categories: |
| - security |
| - counter-intelligence |
| - organization |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| - filename: synthetic/synthetic_antisemitism_correction.jsonl |
| record_count: 61 |
| sha256: ff56ee7190a0d572d8de9d3e72beef3ea9fe0f990266cc6388c4e28f901581d9 |
| purpose: 'Synthetic Q&A pairs addressing antisemitism/anti-Zionism conflation, |
| |
| settler-colonialism analysis, firm rejection patterns, and |
| |
| historical figure accuracy. Fixes issues discovered in iteration 1. |
| |
| ' |
| source_type: synthetic |
| iteration_added: 2 |
| categories: |
| - anti-zionism |
| - antisemitism |
| - settler-colonialism |
| - palestinian-liberation |
| - historical-figures |
| schema_compliant: true |
| human_reviewed: false |
| include_in_training: true |
| notes: 'Addresses: CPC contamination, both-sidesing on Palestine, |
| |
| extended engagement with antisemitic premises, hallucinations. |
| |
| Generated by Claude Opus 4.5 on 2025-12-18. |
| |
| ' |
| - filename: synthetic/synthetic_av_dremel_context.jsonl |
| record_count: 8 |
| sha256: e526ff4a7fabc15dd68c0133eb53e88d16d2c1fd20f511bd43c9fbff65d077ff |
| purpose: 'Synthetic Q&A pairs providing accurate information about AV Dremel, |
| |
| a theorist whose work contributed to the training data. Acknowledges |
| |
| her contributions and provides appropriate context. |
| |
| ' |
| source_type: synthetic |
| iteration_added: 2 |
| categories: |
| - contributor-context |
| - project-metadata |
| - theorist-attribution |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| notes: 'Facts provided by Persephone Raskova about her spouse: |
| |
| - Theorist whose work contributed to training data |
| |
| - Disabled transgender woman, writer, based in US |
| |
| - Working on a book |
| |
| - Decolonial Marxist-Leninist tradition |
| |
| - Known as @BmoreOrganized on Twitter |
| |
| - Married to Persephone Raskova |
| |
| Generated by Claude Opus 4.5 on 2025-12-18 with user-provided facts. |
| |
| ' |
| - filename: synthetic/synthetic_benign_input_handling.jsonl |
| purpose: Synthetic Q&A pairs teaching professional redirection on casual, off-topic, |
| or benign inputs. Prevents saccharine chatbot-mode responses. |
| source_type: synthetic |
| iteration_added: 3 |
| categories: |
| - benign-input-handling |
| - interaction-style |
| schema_compliant: true |
| human_reviewed: false |
| include_in_training: true |
| notes: Added to stabilize tone on casual inputs and prevent RLHF-style chatbot drift. |
| record_count: 95 |
| sha256: b9473557b8fde97bdfb5cd33ee096ef893fb1f0377d57eec04121a4325673acc |
| - filename: synthetic/synthetic_contemporary_facts_2026.jsonl |
| record_count: 1000 |
| sha256: 453516cde3d49e15858e6b517386729bd09730086cc6cad1198d157ee941e469 |
| purpose: World Bank - contemporary indicators (population, GDP, inflation, labor, life expectancy, unemployment, public health) |
| source_type: synthetic |
| iteration_added: 2 |
| schema_compliant: true |
| human_reviewed: false |
| include_in_training: true |
| author: World Bank |
| work: Population, total |
| categories: |
| - contemporary-facts |
| - demographics |
| - economy |
| - gdp |
| - gdp-per-capita |
| - inflation |
| - labor |
| - life-expectancy |
| - population |
| - public-health |
| - unemployment |
| - world-bank |
| - filename: synthetic/synthetic_cpc_ml_distinction.jsonl |
| record_count: 34 |
| sha256: 82d8ddb7a8dfbc01e94215d4499b65a85d5565cec6b78c67bc973cd93f979edc |
| purpose: 'Synthetic Q&A pairs distinguishing CPC positions from ML analysis, |
| |
| defending GPCR as highest achievement of socialist construction, |
| |
| and addressing contested China questions with nuance. |
| |
| ' |
| source_type: synthetic |
| iteration_added: 2 |
| categories: |
| - cultural-revolution |
| - gpcr |
| - maoist-theory |
| - china-socialism |
| - soviet-history |
| - revisionism |
| schema_compliant: true |
| human_reviewed: false |
| include_in_training: true |
| notes: "Addresses: CPC \u2260 ML conflation in base model, GPCR defense,\nnuanced\ |
| \ handling of China questions, Soviet history.\nGenerated by Claude Opus 4.5 on\ |
| \ 2025-12-18.\n" |
| - filename: synthetic/synthetic_creator_context.jsonl |
| record_count: 12 |
| sha256: b72e984696328ae5657a7d09c5d6af97ba1157d6dd6d307097879f6ddf742a41 |
| purpose: 'Synthetic Q&A pairs providing accurate information about the project |
| |
| creator Persephone Raskova - her role, political orientation, and |
| |
| relationship to ProleWiki. Includes appropriate deflection patterns |
| |
| for overly personal questions. |
| |
| ' |
| source_type: synthetic |
| iteration_added: 2 |
| categories: |
| - creator-context |
| - project-metadata |
| - privacy-boundaries |
| schema_compliant: true |
| human_reviewed: true |
| include_in_training: true |
| notes: 'Facts provided directly by the creator: |
| |
| - Lead Developer of ProleWiki MCP, ML engineer |
| |
| - Military veteran, transgender woman, lives in US |
| |
| - Married to theorist AV Dremel |
| |
| - MLM Third-Worldist, Decolonial Marxism orientation |
| |
| - Founding member of Kansas Socialist Book Club (no longer active) |
| |
| - Not formal ProleWiki member, but has CC permission |
| |
| - Pro-AI-for-revolution with materialist nuance |
| |
| Generated by Claude Opus 4.5 on 2025-12-18 with user-provided facts. |
| |
| ' |
| - filename: synthetic/synthetic_epistemic_humility.jsonl |
| record_count: 20 |
| sha256: 33d1e54416bf9d93bffe3ea6d0a3bf8223fabc045c9804e9fe7960bb9e6c2521 |
| purpose: 'Synthetic Q&A pairs teaching epistemic humility - when to admit |
| |
| uncertainty about unverifiable entities. Includes fictional |
| |
| organizations, plausible-sounding but fake claims, and contrast |
| |
| examples where confident answers ARE appropriate. |
| |
| ' |
| source_type: synthetic |
| iteration_added: 2 |
| categories: |
| - epistemic-humility |
| - anti-hallucination |
| - uncertainty-acknowledgment |
| schema_compliant: true |
| human_reviewed: false |
| include_in_training: true |
| notes: 'Addresses a targeted hallucination vulnerability where the model confidently |
| |
| fabricates detailed information about fictional organizations. |
| |
| Test case: "Militant League of Trotskyist Shanghai Students" - a |
| |
| completely made-up organization that the base model described with |
| |
| fake founding dates, activities, and historical context. |
| |
| Teaches patterns: "I cannot verify...", "I don''t have information...", |
| |
| offering alternatives, asking for clarification. |
| |
| Generated by Claude Opus 4.5 on 2025-12-18. |
| |
| ' |
| - filename: synthetic/synthetic_prolewiki_facts.jsonl |
| record_count: 12 |
| sha256: 3a9f16552a7c7ae55f7f94c984dc38adbdbae7c7444b8cd53bfd275814aabfd7 |
| purpose: 'Synthetic Q&A pairs providing accurate factual information about |
| |
| ProleWiki itself - founding date, founders, ideology, history. |
| |
| Corrects severe hallucinations where base model fabricated fake |
| |
| backstory (claimed 2004 founding by Dutch Trotskyists). |
| |
| ' |
| source_type: synthetic |
| iteration_added: 2 |
| categories: |
| - prolewiki |
| - organizational-history |
| - anti-hallucination |
| schema_compliant: true |
| human_reviewed: false |
| include_in_training: true |
| notes: 'Critical correction: Base model hallucinated that ProleWiki was |
| |
| founded in 2004 by "Organisatie voor de Erfgoed van de Revolutie" |
| |
| (a made-up Dutch Trotskyist organization). Reality: Founded |
| |
| September 30, 2020 by Comrade Forte on Lemmygrad, explicitly ML. |
| |
| Generated by Claude Opus 4.5 on 2025-12-18. |
| |
| ' |
| statistics: |
| total_records: 5297 |
| total_unique_records: 5297 |
| total_files: 68 |
| source_files: |
| total: 60 |
| categorized: 60 |
| uncategorized: 0 |
| total_records: 4055 |
| by_source_type: |
| curated: 1058 |
| prolewiki: 2997 |
| synthetic: 1242 |
| by_iteration: |
| '3': 1153 |
| '4': 2997 |
| '2': 1147 |
| by_category: |
| anti-colonial: 162 |
| imperialism: 114 |
| revolutionary-strategy: 461 |
| dialectics: 117 |
| fascism: 110 |
| revisionism: 53 |
| national-liberation: 29 |
| settler-colonialism: 159 |
| anti-zionism: 120 |
| general-theory: 160 |
| political-economy: 92 |
| historical-interviews: 101 |
| feminist-marxism: 70 |
| disability-studies: 87 |
| foundational: 8 |
| historiography: 74 |
| original-essays: 222 |
| primary-theory: 260 |
| prolewiki: 3009 |
| encyclopedia: 2997 |
| concept: 91 |
| event: 141 |
| history: 141 |
| media: 21 |
| general: 649 |
| organization: 377 |
| person: 1406 |
| biography: 1406 |
| geography: 283 |
| text: 29 |
| antisemitism: 61 |
| palestinian-liberation: 61 |
| historical-figures: 61 |
| contributor-context: 8 |
| project-metadata: 20 |
| theorist-attribution: 8 |
| benign-input-handling: 95 |
| interaction-style: 95 |
| contemporary-facts: 1000 |
| demographics: 200 |
| population: 200 |
| world-bank: 1000 |
| economy: 413 |
| gdp: 200 |
| gdp-per-capita: 200 |
| labor: 187 |
| unemployment: 187 |
| public-health: 200 |
| life-expectancy: 200 |
| inflation: 13 |
| cultural-revolution: 34 |
| gpcr: 34 |
| maoist-theory: 34 |
| china-socialism: 34 |
| soviet-history: 34 |
| creator-context: 12 |
| privacy-boundaries: 12 |
| epistemic-humility: 20 |
| anti-hallucination: 32 |
| uncertainty-acknowledgment: 20 |
| organizational-history: 12 |
| avg_instruction_length: 45.43 |
| avg_response_length: 527.94 |
| human_verified_count: 1078 |
| synthetic_unverified_count: 1222 |
| needs_manual_review: 0 |
| provenance: |
| generated_date: '2026-01-14T00:08:43Z' |
| generated_by: manifest-regeneration-script |
| last_validated: null |
| validation_tool_version: null |
| training_iterations: |
| - iteration: 1 |
| date: '2025-12-17' |
| description: 'Initial GRPO training on curated ProleWiki Q&A pairs. |
| |
| Established baseline Marxist-Leninist reasoning capability. |
| |
| ' |
| base_model: DeepSeek-R1-0528-Qwen3-8B |
| method: GRPO |
| files_used: |
| - grpo_dataset.jsonl |
| records_used: 1058 |
| output_model: marxist-grpo-merged |
| wandb_run: null |
| notes: 'Issues discovered: CPC contamination, both-sidesing on Zionism, |
| |
| historical figure hallucinations. See TRAINING_DIARY.md. |
| |
| ' |
| - iteration: 2 |
| date: null |
| description: 'Correction training addressing issues from iteration 1: |
| |
| - Antisemitism/anti-Zionism distinction |
| |
| - Settler-colonialism analysis |
| |
| - CPC/ML distinction |
| |
| - GPCR defense |
| |
| - Historical accuracy |
| |
| - Epistemic humility (anti-hallucination for unknown entities) |
| |
| - Creator context (accurate info about Persephone Raskova) |
| |
| - Contributor context (accurate info about AV Dremel) |
| |
| ' |
| base_model: TBD |
| method: GRPO |
| files_used: |
| - curated_qa.jsonl |
| - synthetic/synthetic_antisemitism_correction.jsonl |
| - synthetic/synthetic_cpc_ml_distinction.jsonl |
| - synthetic/synthetic_prolewiki_facts.jsonl |
| - synthetic/synthetic_epistemic_humility.jsonl |
| - synthetic/synthetic_creator_context.jsonl |
| - synthetic/synthetic_av_dremel_context.jsonl |
| records_used: 1205 |
| output_model: null |
| wandb_run: null |
| notes: Pending execution. |
| known_issues: |
| - id: ISSUE-001 |
| severity: resolved |
| description: 'Legacy files (curated_qa.jsonl, grpo_dataset.jsonl) use old format |
| |
| without full metadata. Need migration to new schema. |
| |
| ' |
| affected_files: |
| - curated_qa.jsonl |
| - grpo_dataset.jsonl |
| discovered_date: '2025-12-18' |
| resolved_date: '2025-12-19' |
| resolution: 'Split curated_qa.jsonl into 47 granular source files with full qa_record |
| schema. |
| |
| Archived original file and replaced grpo_dataset.jsonl with generate_grpo.py script. |
| |
| ' |
| - id: ISSUE-002 |
| severity: low |
| description: 'Synthetic files generated without full metadata schema. |
| |
| Need to backfill metadata for all records. |
| |
| ' |
| affected_files: |
| - synthetic/synthetic_antisemitism_correction.jsonl |
| - synthetic/synthetic_cpc_ml_distinction.jsonl |
| discovered_date: '2025-12-18' |
| resolved_date: '2026-01-13' |
| resolution: Backfilled qa_record metadata for synthetic files. |
| - id: ISSUE-003 |
| severity: medium |
| description: 'Synthetic data not yet human-reviewed. May contain subtle |
| |
| errors or positions that need refinement. |
| |
| ' |
| affected_files: |
| - synthetic/synthetic_antisemitism_correction.jsonl |
| - synthetic/synthetic_cpc_ml_distinction.jsonl |
| discovered_date: '2025-12-18' |
| resolved_date: null |
| resolution: null |
| - id: ISSUE-004 |
| severity: medium |
| description: '260 records in sources/uncategorized/uncategorized.jsonl did not match |
| |
| keyword patterns and need manual review and redistribution. |
| |
| ' |
| affected_files: |
| - sources/uncategorized/uncategorized.jsonl |
| discovered_date: '2025-12-19' |
| resolved_date: null |
| resolution: null |
| changelog: |
| - date: '2025-12-17' |
| version: 0.1.0 |
| changes: |
| - Initial dataset creation |
| - 1058 curated Q&A pairs from ProleWiki |
| - GRPO format transformation |
| - date: '2025-12-18' |
| version: 0.2.0 |
| changes: |
| - Added 61 synthetic antisemitism/Zionism correction Q&As |
| - Added 34 synthetic CPC/ML distinction Q&As |
| - Added 12 synthetic ProleWiki facts Q&As |
| - Added 20 synthetic epistemic humility Q&As (anti-hallucination) |
| - Added 12 synthetic creator context Q&As (Persephone Raskova) |
| - Added 8 synthetic contributor context Q&As (AV Dremel) |
| - Created entity_whitelist_clean.json (24,040 verified entities) |
| - Added entity_verification_reward and epistemic_calibration_reward |
| - Created JSON Schema for training records |
| - Created JSON Schema for manifest |
| - Created MANIFEST.yaml |
| - Created ai-docs/training-schema.yaml reference |
| - Created TRAINING_DIARY.md for iteration tracking |
| - date: '2025-12-19' |
| version: 0.3.0 |
| changes: |
| - Split curated_qa.jsonl into 47 granular author-attributed source files |
| - Created sources/ directory with 10 category subdirectories |
| - Migrated 713 records to attributed source files |
| - 260 records remain in uncategorized for manual review |
| - Created qa_record.schema.json aligned with ChromaDB pw_schema |
| - Added qa_schema section to ai-docs/chromadb.yaml |
| - Created scripts/split_curated_qa.py for dataset splitting |
| - Created scripts/generate_grpo.py for on-demand GRPO generation |
| - Created scripts/validate_training_data.py for validation |
| - Archived curated_qa.jsonl (replaced by source files) |
| - Removed static grpo_dataset.jsonl (replaced by script) |
| - Added author/work attribution to source file entries |
| - Updated statistics with per-category record counts |
|
|