Spaces:
Sleeping
Sleeping
| """Multi-character dialogue dataset transformer.""" | |
import hashlib
import json
import logging
from typing import Any, Dict, List, Optional

from datasets import load_dataset

from .base import BaseWarblerTransformer
| logger = logging.getLogger(__name__) | |
class MultiCharacterTransformer(BaseWarblerTransformer):
    """Transform agentlans/multi-character-dialogue dataset.

    Each usable row of the dataset's ``train`` split becomes one document
    dict with ``content_id``, ``content`` and ``metadata`` keys. Malformed
    rows are logged and skipped rather than aborting the whole run.
    """

    def transform(
        self, dataset_name: str = "agentlans/multi-character-dialogue"
    ) -> List[Dict[str, Any]]:
        """
        Transform agentlans/multi-character-dialogue dataset.

        Row fields read: ``setting``, ``characters``, ``conversation`` and
        ``setting after interaction``.

        Args:
            dataset_name: Name passed to ``load_dataset``.

        Returns:
            The list of documents built so far. An empty list is returned
            when the dataset cannot be loaded or has no ``train`` split; a
            partial list is returned when iteration fails midway.

        Raises:
            KeyboardInterrupt, SystemExit: re-raised after being logged.
        """
        logger.info(f"Loading {dataset_name}...")
        try:
            dataset = load_dataset(dataset_name)
        except Exception as e:
            logger.warning(f"Failed to load {dataset_name}: {e}")
            return []

        warbler_docs: List[Dict[str, Any]] = []
        try:
            if "train" not in dataset:
                logger.warning("Multi-char: No 'train' split found in dataset")
                return []

            train_data = dataset["train"]
            total_items = len(train_data) if hasattr(train_data, "__len__") else 0
            logger.info(f"Processing {total_items} multi-character dialogue items...")

            for idx, item in enumerate(train_data):
                if idx > 0 and idx % 1000 == 0:
                    logger.info(
                        f"Processed {idx}/{total_items} items, created "
                        f"{len(warbler_docs)} documents"
                    )
                try:
                    doc = self._build_document(item, idx, dataset_name)
                    if doc is not None:
                        warbler_docs.append(doc)
                except MemoryError as mem_err:
                    # Out of memory is not recoverable per item: stop and
                    # return what has been collected so far.
                    logger.error(
                        f"Multi-char {idx + 1}: Memory error - {mem_err}. "
                        f"Stopping processing to prevent crash."
                    )
                    break
                except RecursionError as rec_err:
                    logger.error(
                        f"Multi-char {idx + 1}: Recursion error - {rec_err}. Skipping item."
                    )
                    continue
                except (KeyboardInterrupt, SystemExit):
                    logger.warning(f"Multi-char: Processing interrupted at item {idx + 1}")
                    raise
                except Exception as e:
                    logger.warning(
                        f"Multi-char {idx + 1}: Error processing item: {type(e).__name__}: {e}"
                    )
                    continue
        except (MemoryError, RecursionError) as critical_error:
            logger.error(
                f"Multi-char: Critical error during iteration: "
                f"{type(critical_error).__name__}: {critical_error}"
            )
            logger.info(f"Returning {len(warbler_docs)} documents processed before error")
        except (KeyboardInterrupt, SystemExit):
            logger.warning(
                f"Multi-char: Processing interrupted, returning {len(warbler_docs)} documents"
            )
            raise
        except Exception as outer_error:
            logger.error(
                f"Multi-char: Unexpected error during dataset iteration: "
                f"{type(outer_error).__name__}: {outer_error}"
            )
            logger.info(f"Returning {len(warbler_docs)} documents processed before error")

        logger.info(f"✓ Transformed {len(warbler_docs)} multi-character entries")
        return warbler_docs

    def _build_document(
        self, item: Any, idx: int, dataset_name: str
    ) -> Optional[Dict[str, Any]]:
        """Validate one dataset row and turn it into a document dict.

        Args:
            item: The raw row; anything other than a dict is skipped.
            idx: Zero-based row index (log messages show ``idx + 1``).
            dataset_name: Recorded in the document metadata.

        Returns:
            The document dict, or ``None`` when the row must be skipped.
        """
        if item is None:
            logger.warning(f"Multi-char {idx + 1}: Item is None, skipping")
            return None
        if not isinstance(item, dict):
            logger.warning(
                f"Multi-char {idx + 1}: Item is not a dict "
                f"(type: {type(item)}), skipping"
            )
            return None

        # Normalise the three fields used for validation and metadata.
        setting = item.get("setting", "")
        characters = item.get("characters", [])
        conversation = item.get("conversation", [])
        if not isinstance(setting, str):
            setting = str(setting) if setting is not None else ""
        if not isinstance(characters, list):
            characters = [] if characters is None else [characters]
        if not isinstance(conversation, list):
            conversation = [] if conversation is None else [conversation]

        if not setting and not conversation:
            logger.warning(f"Multi-char {idx + 1}: Missing essential data, skipping")
            return None

        # Only the first ten messages are inspected to keep validation cheap.
        if conversation and not all(
            isinstance(msg, (dict, str)) for msg in conversation[:10]
        ):
            logger.warning(
                f"Multi-char {idx + 1}: Invalid conversation structure, skipping"
            )
            return None

        try:
            content = self._create_content(item)
        except Exception as content_error:
            logger.warning(
                f"Multi-char {idx + 1}: Error creating content: "
                f"{content_error}, using fallback"
            )
            setting_preview = setting[:100]
            content = (
                f"[Multi-character dialogue content unavailable]\n"
                f"Setting: {setting_preview}"
            )

        return {
            "content_id": self._content_id(setting, idx),
            "content": content,
            "metadata": {
                "pack": "warbler-pack-multi-character",
                "source_dataset": dataset_name,
                "setting": setting[:150] + "..." if len(setting) > 150 else setting,
                "character_count": len(characters),
                "conversation_length": len(conversation),
                "realm_type": "narrative",
                "realm_label": "multi_character_dialogue",
                "lifecycle_stage": "emergence",
                "activity_level": 0.7,
                "dialogue_type": "multi_character_interaction",
            },
        }

    @staticmethod
    def _content_id(setting: str, idx: int) -> str:
        """Return a reproducible document id.

        The id is derived from a SHA-256 digest of the setting text. The
        built-in ``hash()`` is salted per process for ``str``, so it would
        yield different ids on every run, and reducing it modulo 10000 makes
        unrelated settings collide. Rows without a setting fall back to the
        row index.
        """
        if not setting:
            return f"multi-char/{idx}"
        digest = hashlib.sha256(setting.encode("utf-8", errors="replace")).hexdigest()
        return f"multi-char/{digest[:16]}"

    @staticmethod
    def _create_content(item: Dict[str, Any]) -> str:
        """Create content string for multi-character dialogue with comprehensive error handling.

        Declared as a static method because it takes no ``self`` but is
        invoked as ``self._create_content(item)``; without the decorator the
        instance would be bound to ``item`` and every call would raise
        ``TypeError``.

        Args:
            item: A dataset row. Reads ``conversation``, ``setting``,
                ``characters`` and ``setting after interaction``.

        Returns:
            The formatted text, capped at 50000 characters plus a truncation
            note, or a bracketed placeholder when ``item`` is not a dict or
            the final assembly fails.
        """
        if not isinstance(item, dict):
            return "[Invalid item format - not a dictionary]"

        conversation = item.get("conversation", [])
        conversation_lines = []
        max_conversation_items = 1000
        if isinstance(conversation, list):
            conversation_subset = conversation[:max_conversation_items]
            for msg_idx, msg in enumerate(conversation_subset):
                try:
                    if msg is None:
                        continue
                    if isinstance(msg, dict):
                        from_field = msg.get("from", "Unknown")
                        message_field = msg.get("message", "")
                        if not isinstance(from_field, str):
                            from_field = str(from_field) if from_field is not None else "Unknown"
                        if not isinstance(message_field, str):
                            message_field = (
                                str(message_field) if message_field is not None else ""
                            )
                        if len(message_field) > 5000:
                            message_field = message_field[:5000] + "... [truncated]"
                        conversation_lines.append(f"{from_field}: {message_field}")
                    elif isinstance(msg, str):
                        if len(msg) > 5000:
                            msg = msg[:5000] + "... [truncated]"
                        conversation_lines.append(msg)
                    else:
                        # Unsupported message type: keep a placeholder so the
                        # turn order stays visible.
                        conversation_lines.append(
                            f"[Message {msg_idx + 1}: {type(msg).__name__}]"
                        )
                except (RecursionError, MemoryError) as critical_err:
                    logger.warning(
                        f"Critical error processing conversation message {msg_idx}: {critical_err}"
                    )
                    break
                except Exception as msg_err:
                    logger.debug(f"Error processing conversation message {msg_idx}: {msg_err}")
                    continue
            if len(conversation) > max_conversation_items:
                conversation_lines.append(
                    f"\n[... {len(conversation) - max_conversation_items} more messages truncated]"
                )
        conversation_text = (
            "\n".join(conversation_lines) if conversation_lines else "[No conversation available]"
        )

        setting = item.get("setting", "[No setting provided]")
        if not isinstance(setting, str):
            setting = str(setting) if setting is not None else "[No setting provided]"
        if len(setting) > 2000:
            setting = setting[:2000] + "... [truncated]"

        characters = item.get("characters", [])
        if not isinstance(characters, list):
            characters = [] if characters is None else [characters]

        # The key really contains spaces; it is looked up verbatim.
        setting_after = item.get(
            "setting after interaction", "[No setting after interaction provided]"
        )
        if not isinstance(setting_after, str):
            setting_after = (
                str(setting_after)
                if setting_after is not None
                else "[No setting after interaction provided]"
            )
        if len(setting_after) > 2000:
            setting_after = setting_after[:2000] + "... [truncated]"

        characters_str = "[]"
        try:
            if len(characters) > 100:
                characters = characters[:100]
                characters_str = (
                    json.dumps(characters, indent=2, ensure_ascii=False) + "\n[... truncated]"
                )
            else:
                characters_str = (
                    json.dumps(characters, indent=2, ensure_ascii=False) if characters else "[]"
                )
        except (TypeError, ValueError, RecursionError) as json_err:
            # Entries that are not JSON-serialisable fall back to a clipped repr.
            logger.debug(f"Error serializing characters to JSON: {json_err}")
            try:
                characters_str = str(characters)[:500] if characters else "[]"
            except Exception:
                characters_str = "[Error formatting characters]"

        try:
            content = (
                f"Setting: {setting}\n"
                f"Characters: {characters_str}\n"
                f"Conversation:\n"
                f"{conversation_text}\n"
                f"After Interaction: {setting_after}\n"
                "This represents a multi-character narrative scenario for NPC interaction training."
            )
            if len(content) > 50000:
                content = content[:50000] + "\n\n[Content truncated due to size]"
            return content
        except Exception as final_err:
            logger.warning(f"Error building final content: {final_err}")
            return f"[Error creating multi-character content: {type(final_err).__name__}]"