# djordjebatic's picture
# Add sdg_hub flow definition for FCA classification data generation
# d34abbc verified
# Flow-level metadata: identity, provenance, model recommendations, search
# tags, and the columns the seed dataset has to provide.
metadata:
  name: "FCA Financial Classification Data Generator"
  id: "fca-classification-sdg"
  # Folded scalar (`>-`): the lines below are joined with single spaces and
  # the trailing newline is stripped.
  description: >-
    Generates synthetic training data for classifying financial communications into
    three regulatory categories: Guidance, Targeted Support, and Regulated Advice,
    based on FCA PERG 8 and CP23/24 frameworks. Includes LLM-as-judge quality filtering.
  # Quoted so the version is always read as a string.
  version: "1.0.0"
  author: "djordjebatic"
  license: "Apache-2.0"
  recommended_models:
    default: "openai/gpt-4o"
    compatible:
    - "meta-llama/Llama-3.3-70B-Instruct"
    - "Qwen/Qwen2.5-7B-Instruct"
  tags:
  - "financial-regulation"
  - "classification"
  - "synthetic-data"
  - "fca"
  - "perg8"
  # The same five column names are listed as `input_cols` of the
  # `build_generation_prompt` block.
  dataset_requirements:
    required_columns:
    - "target_label"
    - "domain"
    - "channel"
    - "persona"
    - "seed_text"
# Pipeline definition: an ordered list of blocks (presumably run top to
# bottom by the flow runner; confirm against the sdg_hub documentation).
blocks:
# Builds the generation prompt from the five seed columns using the template
# at `prompts/generate_examples.yaml` and writes it to `generation_prompt`.
- block_type: "PromptBuilderBlock"
  block_config:
    block_name: "build_generation_prompt"
    input_cols: ["target_label", "domain", "channel", "persona", "seed_text"]
    output_cols: "generation_prompt"
    prompt_config_path: "prompts/generate_examples.yaml"
# Passes `generation_prompt` to the chat model and stores the reply in
# `raw_generation`.
- block_type: "LLMChatBlock"
  block_config:
    block_name: "generate_examples"
    input_cols: "generation_prompt"
    output_cols: "raw_generation"
    # Non-zero temperature; presumably chosen to diversify the generated
    # examples (the judge step in this file uses 0.0 instead).
    temperature: 0.8
    max_tokens: 3000
    async_mode: true
# Post-processes `raw_generation`. No `output_cols` is declared here, so the
# resulting column name(s) are decided by the block itself.
# NOTE(review): the next block reads `extract_generation_content`, which
# suggests the output is named "<block_name>_content"; confirm against the
# LLMResponseExtractorBlock documentation.
- block_type: "LLMResponseExtractorBlock"
  block_config:
    block_name: "extract_generation"
    input_cols: "raw_generation"
    extract_content: true
    expand_lists: true
# Pulls the text found between `[EXAMPLE]` and `[/EXAMPLE]` out of the
# extracted model reply and writes it to `generated_text`.
# NOTE(review): `extract_generation_content` is not declared as an output
# anywhere in this file; it is presumably produced implicitly by the
# `extract_generation` block. Verify the name before relying on it.
- block_type: "TagParserBlock"
  block_config:
    block_name: "parse_examples"
    input_cols: "extract_generation_content"
    output_cols: "generated_text"
    start_tags: ["[EXAMPLE]"]
    end_tags: ["[/EXAMPLE]"]
# Builds the LLM-as-judge prompt from the template at
# `prompts/quality_judge.yaml` and writes it to `judge_prompt`.
- block_type: "PromptBuilderBlock"
  block_config:
    block_name: "build_judge_prompt"
    # Mapping form of `input_cols` (the first prompt builder uses a plain list).
    # NOTE(review): confirm which side of each pair is the dataset column and
    # which is the template variable. `target_label` is a required dataset
    # column, while `claimed_label` appears nowhere else in this file, so if
    # keys are expected to be dataset columns this entry needs to be flipped.
    input_cols:
      claimed_label: "target_label"
      generated_text: "generated_text"
    output_cols: "judge_prompt"
    prompt_config_path: "prompts/quality_judge.yaml"
# Passes `judge_prompt` to the chat model and stores the reply in
# `raw_judgment`.
- block_type: "LLMChatBlock"
  block_config:
    block_name: "judge_quality"
    input_cols: "judge_prompt"
    output_cols: "raw_judgment"
    # Temperature 0.0 and a small token budget; presumably to keep verdicts
    # short and as repeatable as the backend allows.
    temperature: 0.0
    max_tokens: 500
    async_mode: true
# Post-processes `raw_judgment`, using the same settings as the
# `extract_generation` block. No `output_cols` is declared here.
# NOTE(review): the next block reads `extract_judgment_content`, which
# suggests the output is named "<block_name>_content"; confirm against the
# LLMResponseExtractorBlock documentation.
- block_type: "LLMResponseExtractorBlock"
  block_config:
    block_name: "extract_judgment"
    input_cols: "raw_judgment"
    extract_content: true
    expand_lists: true
# Pulls the text found between `[VERDICT]` and `[/VERDICT]` out of the
# extracted judge reply and writes it to `verdict`.
# NOTE(review): `extract_judgment_content` is not declared as an output
# anywhere in this file; it is presumably produced implicitly by the
# `extract_judgment` block. Verify the name before relying on it.
- block_type: "TagParserBlock"
  block_config:
    block_name: "parse_verdict"
    input_cols: "extract_judgment_content"
    output_cols: "verdict"
    start_tags: ["[VERDICT]"]
    end_tags: ["[/VERDICT]"]
# Quality gate on the `verdict` column, configured with the value "PASS" and
# the "contains" operation.
# NOTE(review): presumably this keeps only rows whose verdict contains
# "PASS". If "contains" is a substring match, verdicts such as "NOT PASS"
# would also be kept; confirm that the judge prompt restricts its output,
# or consider an exact-match operation.
- block_type: "ColumnValueFilterBlock"
  block_config:
    block_name: "filter_quality"
    input_cols: ["verdict"]
    filter_value: "PASS"
    operation: "contains"