| | import json |
| | from datetime import datetime |
| | from pathlib import Path |
| | from taskmaster_processor import TaskmasterProcessor, RawDataProcessingConfig |
| |
|
def main(
    base_dir="raw_datasets/taskmaster",
    output_dir="processed_outputs",
    max_examples=None,
):
    """Process the Taskmaster dataset and dump the kept dialogues to JSON.

    Loads dialogues through ``TaskmasterProcessor``, runs its
    ``filter_and_convert`` step, and writes the result to a timestamped file
    named ``taskmaster_only_<YYYYmmdd_HHMMSS>.json`` inside ``output_dir``.
    A one-line summary with the dialogue count and output path is printed.

    All parameters default to the values that used to be hard-coded, so
    calling ``main()`` with no arguments behaves as before.

    Args:
        base_dir: Directory passed to ``load_taskmaster_dataset`` as
            ``base_dir``.
        output_dir: Directory (``str`` or ``Path``) that receives the JSON
            file; created, including parents, if it does not exist.
        max_examples: Forwarded unchanged to ``load_taskmaster_dataset``.
            ``None`` is the value the script has always passed (presumably
            "no limit" -- confirm against the processor).
    """
    config = RawDataProcessingConfig(
        debug=True,
        max_length=512,
        min_turns=4,
        min_user_words=3,
    )
    processor = TaskmasterProcessor(config)

    # Load the raw dialogues, then let the processor filter/convert them.
    dialogues = processor.load_taskmaster_dataset(
        base_dir=base_dir,
        max_examples=max_examples,
    )
    final_dialogues = processor.filter_and_convert(dialogues)

    # A per-run timestamp in the file name keeps earlier outputs from being
    # overwritten by later runs.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    out_file = output_dir / f"taskmaster_only_{timestamp}.json"

    with open(out_file, 'w', encoding='utf-8') as f:
        json.dump(final_dialogues, f, indent=2)

    print(f"[Taskmaster Only] Kept {len(final_dialogues)} dialogues => {out_file}")
| |
|
# Run the pipeline only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()