# Provenance (Hugging Face upload page residue, kept as comments so the file parses):
#   Bellok — "Upload folder using huggingface_hub" — commit 0ccf2f0 (verified)
"""Multi-character dialogue dataset transformer."""
import hashlib
import json
import logging
from typing import List, Dict, Any

from datasets import load_dataset

from .base import BaseWarblerTransformer
logger = logging.getLogger(__name__)  # module-level logger, stdlib convention
class MultiCharacterTransformer(BaseWarblerTransformer):
    """Transform agentlans/multi-character-dialogue dataset."""

    def transform(
        self, dataset_name: str = "agentlans/multi-character-dialogue"
    ) -> List[Dict[str, Any]]:
        """
        Transform agentlans/multi-character-dialogue dataset.

        Format: setting, characters, conversation, setting_after_interaction

        Returns:
            A list of Warbler document dicts. Data problems never raise:
            load failures, a missing 'train' split, and malformed items all
            degrade to skipped items or an empty list. KeyboardInterrupt and
            SystemExit are re-raised so the process can shut down cleanly.
        """
        logger.info("Loading %s...", dataset_name)
        try:
            dataset = load_dataset(dataset_name)
        except Exception as e:
            logger.warning("Failed to load %s: %s", dataset_name, e)
            return []

        warbler_docs: List[Dict[str, Any]] = []
        try:
            if "train" not in dataset:
                logger.warning("Multi-char: No 'train' split found in dataset")
                return []
            train_data = dataset["train"]
            # Streaming datasets may not support len(); 0 then just means "unknown".
            total_items = len(train_data) if hasattr(train_data, "__len__") else 0
            logger.info("Processing %d multi-character dialogue items...", total_items)

            for idx, item in enumerate(train_data):
                if idx > 0 and idx % 1000 == 0:
                    logger.info(
                        "Processed %d/%d items, created %d documents",
                        idx,
                        total_items,
                        len(warbler_docs),
                    )
                try:
                    if item is None:
                        logger.warning("Multi-char %d: Item is None, skipping", idx + 1)
                        continue
                    if not isinstance(item, dict):
                        logger.warning(
                            "Multi-char %d: Item is not a dict (type: %s), skipping",
                            idx + 1,
                            type(item),
                        )
                        continue

                    setting = item.get("setting", "")
                    characters = item.get("characters", [])
                    conversation = item.get("conversation", [])

                    # Coerce unexpected field types instead of failing the item.
                    if not isinstance(setting, str):
                        setting = str(setting) if setting is not None else ""
                    if not isinstance(characters, list):
                        characters = [] if characters is None else [characters]
                    if not isinstance(conversation, list):
                        conversation = [] if conversation is None else [conversation]

                    if not setting and not conversation:
                        logger.warning(
                            "Multi-char %d: Missing essential data, skipping", idx + 1
                        )
                        continue

                    # Spot-check only the first few messages; scanning a huge
                    # conversation here would be wasted work for valid items.
                    if conversation and not all(
                        isinstance(msg, (dict, str)) for msg in conversation[:10]
                    ):
                        logger.warning(
                            "Multi-char %d: Invalid conversation structure, skipping",
                            idx + 1,
                        )
                        continue

                    try:
                        content = self._create_content(item)
                    except Exception as content_error:
                        logger.warning(
                            "Multi-char %d: Error creating content: %s, using fallback",
                            idx + 1,
                            content_error,
                        )
                        content = (
                            f"[Multi-character dialogue content unavailable]\n"
                            f"Setting: {setting[:100]}"
                        )

                    # FIX: the previous id used hash(setting) % 10000. str hashes
                    # are salted per process (PYTHONHASHSEED), so ids were not
                    # stable across runs, and % 10000 guaranteed collisions past
                    # ~10k distinct settings. A truncated MD5 digest is
                    # deterministic and far less collision-prone; fall back to
                    # the item index when there is no setting text.
                    if setting:
                        stable_id = hashlib.md5(setting.encode("utf-8")).hexdigest()[:12]
                    else:
                        stable_id = str(idx)

                    doc = {
                        "content_id": f"multi-char/{stable_id}",
                        "content": content,
                        "metadata": {
                            "pack": "warbler-pack-multi-character",
                            "source_dataset": dataset_name,
                            "setting": (
                                setting[:150] + "..." if len(setting) > 150 else setting
                            ),
                            "character_count": (
                                len(characters) if isinstance(characters, list) else 0
                            ),
                            "conversation_length": (
                                len(conversation) if isinstance(conversation, list) else 0
                            ),
                            "realm_type": "narrative",
                            "realm_label": "multi_character_dialogue",
                            "lifecycle_stage": "emergence",
                            "activity_level": 0.7,
                            "dialogue_type": "multi_character_interaction",
                        },
                    }
                    warbler_docs.append(doc)
                except MemoryError as mem_err:
                    # Out of memory: stop entirely rather than thrash.
                    logger.error(
                        "Multi-char %d: Memory error - %s. "
                        "Stopping processing to prevent crash.",
                        idx + 1,
                        mem_err,
                    )
                    break
                except RecursionError as rec_err:
                    # Pathologically nested item: drop it, keep going.
                    logger.error(
                        "Multi-char %d: Recursion error - %s. Skipping item.",
                        idx + 1,
                        rec_err,
                    )
                    continue
                except (KeyboardInterrupt, SystemExit):
                    logger.warning(
                        "Multi-char: Processing interrupted at item %d", idx + 1
                    )
                    raise
                except Exception as e:
                    logger.warning(
                        "Multi-char %d: Error processing item: %s: %s",
                        idx + 1,
                        type(e).__name__,
                        e,
                    )
                    continue
        except (MemoryError, RecursionError) as critical_error:
            # Critical failure raised by the dataset iterator itself
            # (per-item failures are handled inside the loop above).
            logger.error(
                "Multi-char: Critical error during iteration: %s: %s",
                type(critical_error).__name__,
                critical_error,
            )
            logger.info(
                "Returning %d documents processed before error", len(warbler_docs)
            )
        except (KeyboardInterrupt, SystemExit):
            logger.warning(
                "Multi-char: Processing interrupted, returning %d documents",
                len(warbler_docs),
            )
            raise
        except Exception as outer_error:
            logger.error(
                "Multi-char: Unexpected error during dataset iteration: %s: %s",
                type(outer_error).__name__,
                outer_error,
            )
            logger.info(
                "Returning %d documents processed before error", len(warbler_docs)
            )

        logger.info("✓ Transformed %d multi-character entries", len(warbler_docs))
        return warbler_docs

    @staticmethod
    def _create_content(item: Dict[str, Any]) -> str:
        """Create content string for multi-character dialogue with comprehensive error handling.

        Renders setting, characters (as JSON), the conversation transcript,
        and the post-interaction setting into one text block, truncating
        oversized fields so a single pathological item cannot exhaust memory.
        Returns a bracketed placeholder string instead of raising.
        """
        if not isinstance(item, dict):
            return "[Invalid item format - not a dictionary]"

        conversation = item.get("conversation", [])
        conversation_lines = []
        max_conversation_items = 1000  # hard cap on rendered messages

        if isinstance(conversation, list):
            conversation_subset = conversation[:max_conversation_items]
            for msg_idx, msg in enumerate(conversation_subset):
                try:
                    if msg is None:
                        continue
                    if isinstance(msg, dict):
                        from_field = msg.get("from", "Unknown")
                        message_field = msg.get("message", "")
                        if not isinstance(from_field, str):
                            from_field = (
                                str(from_field) if from_field is not None else "Unknown"
                            )
                        if not isinstance(message_field, str):
                            message_field = (
                                str(message_field) if message_field is not None else ""
                            )
                        if len(message_field) > 5000:
                            message_field = message_field[:5000] + "... [truncated]"
                        conversation_lines.append(f"{from_field}: {message_field}")
                    elif isinstance(msg, str):
                        if len(msg) > 5000:
                            msg = msg[:5000] + "... [truncated]"
                        conversation_lines.append(msg)
                    else:
                        # Unknown message type: keep a placeholder so message
                        # ordering remains visible in the transcript.
                        conversation_lines.append(
                            f"[Message {msg_idx + 1}: {type(msg).__name__}]"
                        )
                except (RecursionError, MemoryError) as critical_err:
                    logger.warning(
                        "Critical error processing conversation message %d: %s",
                        msg_idx,
                        critical_err,
                    )
                    break
                except Exception as msg_err:
                    logger.debug(
                        "Error processing conversation message %d: %s",
                        msg_idx,
                        msg_err,
                    )
                    continue
            if len(conversation) > max_conversation_items:
                conversation_lines.append(
                    f"\n[... {len(conversation) - max_conversation_items} more messages truncated]"
                )
        conversation_text = (
            "\n".join(conversation_lines) if conversation_lines else "[No conversation available]"
        )

        setting = item.get("setting", "[No setting provided]")
        if not isinstance(setting, str):
            setting = str(setting) if setting is not None else "[No setting provided]"
        if len(setting) > 2000:
            setting = setting[:2000] + "... [truncated]"

        characters = item.get("characters", [])
        if not isinstance(characters, list):
            characters = [] if characters is None else [characters]

        # NOTE(review): the transform() docstring names this field
        # "setting_after_interaction" but the raw key used here has spaces —
        # confirm against the actual dataset schema.
        setting_after = item.get(
            "setting after interaction", "[No setting after interaction provided]"
        )
        if not isinstance(setting_after, str):
            setting_after = (
                str(setting_after)
                if setting_after is not None
                else "[No setting after interaction provided]"
            )
        if len(setting_after) > 2000:
            setting_after = setting_after[:2000] + "... [truncated]"

        characters_str = "[]"
        try:
            if len(characters) > 100:
                characters = characters[:100]
                characters_str = (
                    json.dumps(characters, indent=2, ensure_ascii=False) + "\n[... truncated]"
                )
            else:
                characters_str = (
                    json.dumps(characters, indent=2, ensure_ascii=False) if characters else "[]"
                )
        except (TypeError, ValueError, RecursionError) as json_err:
            # Non-serializable characters payload: fall back to repr, bounded.
            logger.debug("Error serializing characters to JSON: %s", json_err)
            try:
                characters_str = str(characters)[:500] if characters else "[]"
            except Exception:
                characters_str = "[Error formatting characters]"

        try:
            content = f"""Setting: {setting}
Characters: {characters_str}
Conversation:
{conversation_text}
After Interaction: {setting_after}
This represents a multi-character narrative scenario for NPC interaction training."""
            if len(content) > 50000:
                content = content[:50000] + "\n\n[Content truncated due to size]"
            return content
        except Exception as final_err:
            logger.warning("Error building final content: %s", final_err)
            return f"[Error creating multi-character content: {type(final_err).__name__}]"