# Flow definition: synthetic data generation for FCA financial-communication
# classification, followed by an LLM-as-judge quality filter.
#
# NOTE(review): this file was recovered from a copy in which every line was
# wrapped in table pipes and all indentation had been stripped. The nesting
# below was reconstructed from the key names; in particular,
# `dataset_requirements` is assumed to belong under `metadata` -- confirm
# against the consuming framework's schema.
metadata:
  name: "FCA Financial Classification Data Generator"
  id: "fca-classification-sdg"
  description: >-
    Generates synthetic training data for classifying financial communications into
    three regulatory categories: Guidance, Targeted Support, and Regulated Advice,
    based on FCA PERG 8 and CP23/24 frameworks. Includes LLM-as-judge quality filtering.
  version: "1.0.0"
  author: "djordjebatic"
  license: "Apache-2.0"
  recommended_models:
    default: "openai/gpt-4o"
    compatible:
      - "meta-llama/Llama-3.3-70B-Instruct"
      - "Qwen/Qwen2.5-7B-Instruct"
  tags:
    - "financial-regulation"
    - "classification"
    - "synthetic-data"
    - "fca"
    - "perg8"
  # Columns the input dataset must provide; these are the same columns the
  # first prompt builder consumes.
  dataset_requirements:
    required_columns:
      - "target_label"
      - "domain"
      - "channel"
      - "persona"
      - "seed_text"

blocks:
  # --- Generation stage ---------------------------------------------------
  # Render the generation prompt from the seed columns.
  - block_type: "PromptBuilderBlock"
    block_config:
      block_name: "build_generation_prompt"
      input_cols: ["target_label", "domain", "channel", "persona", "seed_text"]
      output_cols: "generation_prompt"
      prompt_config_path: "prompts/generate_examples.yaml"

  # Sample candidate examples from the model.
  - block_type: "LLMChatBlock"
    block_config:
      block_name: "generate_examples"
      input_cols: "generation_prompt"
      output_cols: "raw_generation"
      temperature: 0.8
      max_tokens: 3000
      async_mode: true

  - block_type: "LLMResponseExtractorBlock"
    block_config:
      block_name: "extract_generation"
      input_cols: "raw_generation"
      extract_content: true
      expand_lists: true

  # Pull the text between [EXAMPLE] ... [/EXAMPLE] into `generated_text`.
  # NOTE(review): `extract_generation_content` is not declared as an output
  # column anywhere in this file; presumably the extractor block above names
  # its output `<block_name>_content` -- confirm.
  - block_type: "TagParserBlock"
    block_config:
      block_name: "parse_examples"
      input_cols: "extract_generation_content"
      output_cols: "generated_text"
      start_tags: ["[EXAMPLE]"]
      end_tags: ["[/EXAMPLE]"]

  # --- Judge stage --------------------------------------------------------
  # Build the judge prompt from the claimed label and the generated text.
  # NOTE(review): `input_cols` is a mapping here, unlike the list/string form
  # used by the other blocks; the direction of the mapping (which side is the
  # dataset column and which the template variable) is not visible in this
  # file -- verify against prompts/quality_judge.yaml.
  - block_type: "PromptBuilderBlock"
    block_config:
      block_name: "build_judge_prompt"
      input_cols:
        claimed_label: "target_label"
        generated_text: "generated_text"
      output_cols: "judge_prompt"
      prompt_config_path: "prompts/quality_judge.yaml"

  # Temperature 0.0 for the judging call (the generation call uses 0.8).
  - block_type: "LLMChatBlock"
    block_config:
      block_name: "judge_quality"
      input_cols: "judge_prompt"
      output_cols: "raw_judgment"
      temperature: 0.0
      max_tokens: 500
      async_mode: true

  - block_type: "LLMResponseExtractorBlock"
    block_config:
      block_name: "extract_judgment"
      input_cols: "raw_judgment"
      extract_content: true
      expand_lists: true

  # Pull the text between [VERDICT] ... [/VERDICT] into `verdict`.
  # NOTE(review): same `<block_name>_content` naming assumption as above.
  - block_type: "TagParserBlock"
    block_config:
      block_name: "parse_verdict"
      input_cols: "extract_judgment_content"
      output_cols: "verdict"
      start_tags: ["[VERDICT]"]
      end_tags: ["[/VERDICT]"]

  # Filter on the parsed verdict using `contains` with value "PASS".
  # NOTE(review): presumably this keeps only rows whose verdict contains
  # "PASS"; a substring match would also accept verdicts that merely embed
  # that text -- confirm the judge prompt constrains the verdict format.
  - block_type: "ColumnValueFilterBlock"
    block_config:
      block_name: "filter_quality"
      input_cols: ["verdict"]
      filter_value: "PASS"
      operation: "contains"