Bellok committed on
Commit
26d152e
·
1 Parent(s): d5e328f

refactor(ingest): simplify HF dataset ingestion to core datasets npc-dialogue, fictional-characters, and tinystories

Browse files

- Remove support for multi-character, system-chat, arxiv, prompt-report, novels, manuals, enterprise, portuguese-edu, and edustories datasets
- Replace removed transformers with SyntheticFictionalCharactersTransformer and TinyStoriesNarrativeTransformer
- Update CLI defaults and options accordingly
- Add kagglehub[hf-datasets]>=0.3.0 dependency to requirements.txt

This refactoring reduces complexity by focusing on fewer, targeted datasets for data preprocessing.

requirements.txt CHANGED
@@ -27,6 +27,7 @@ requests>=2.32.0
27
 
28
  # Data Processing
29
  datasets>=3.1.0
 
30
  pyyaml>=6.0.2
31
  pdfplumber>=0.11.0
32
 
 
27
 
28
  # Data Processing
29
  datasets>=3.1.0
30
+ kagglehub[hf-datasets]>=0.3.0
31
  pyyaml>=6.0.2
32
  pdfplumber>=0.11.0
33
 
warbler_cda/utils/hf_warbler_ingest.py CHANGED
@@ -14,15 +14,8 @@ import click
14
 
15
  from .transformers import (
16
  NPCDialogueTransformer,
17
- MultiCharacterTransformer,
18
- SystemChatTransformer,
19
- ArxivTransformer,
20
- PromptReportTransformer,
21
- NovelsTransformer,
22
- ManualsTransformer,
23
- EnterpriseTransformer,
24
- PortugueseEducationTransformer,
25
- EdustoriesTransformer,
26
  WarblerPackBuilder,
27
  )
28
 
@@ -47,25 +40,15 @@ def cli():
47
  type=click.Choice(
48
  [
49
  "npc-dialogue",
50
- "multi-character",
51
- "system-chat",
52
- "arxiv",
53
- "prompt-report",
54
- "novels",
55
- "manuals",
56
- "enterprise",
57
- "portuguese-edu",
58
- "edustories",
59
  "all",
60
  ]
61
  ),
62
- default=["arxiv"],
63
  help="Datasets to ingest",
64
  )
65
  @click.option("--pack-prefix", "-p", default="warbler-pack-hf", help="Prefix for pack names")
66
- @click.option(
67
- "--arxiv-limit", type=int, default=None, help="Limit number of arXiv papers to ingest (HARD LIMIT: 250,000 for 1GB storage compliance)"
68
- )
69
  @click.option(
70
  "--max-docs-per-chunk",
71
  type=int,
@@ -78,7 +61,7 @@ def cli():
78
  default=None,
79
  help="Maximum PDF pages to extract (default: None for unlimited)",
80
  )
81
- def ingest(datasets, pack_prefix, arxiv_limit, max_docs_per_chunk, max_pdf_pages):
82
  """Ingest HF datasets into Warbler packs."""
83
  PACKS_DIR.mkdir(exist_ok=True, parents=True)
84
  builder = WarblerPackBuilder(PACKS_DIR)
@@ -86,13 +69,9 @@ def ingest(datasets, pack_prefix, arxiv_limit, max_docs_per_chunk, max_pdf_pages
86
 
87
  if "all" in datasets:
88
  datasets = [
89
- "arxiv",
90
- "prompt-report",
91
- "novels",
92
- "manuals",
93
- "enterprise",
94
- "portuguese-edu",
95
- "edustories",
96
  ]
97
 
98
  if max_docs_per_chunk > 0:
@@ -118,42 +97,14 @@ def ingest(datasets, pack_prefix, arxiv_limit, max_docs_per_chunk, max_pdf_pages
118
  transformer = NPCDialogueTransformer(max_pdf_pages=max_pdf_pages)
119
  docs = transformer.transform()
120
  pack_name = f"{pack_prefix}-npc-dialogue"
121
- elif dataset == "multi-character":
122
- transformer = MultiCharacterTransformer(max_pdf_pages=max_pdf_pages)
123
- docs = transformer.transform()
124
- pack_name = f"{pack_prefix}-multi-character"
125
- elif dataset == "system-chat":
126
- transformer = SystemChatTransformer(max_pdf_pages=max_pdf_pages)
127
- docs = transformer.transform()
128
- pack_name = f"{pack_prefix}-system-chat"
129
- elif dataset == "arxiv":
130
- transformer = ArxivTransformer(max_pdf_pages=max_pdf_pages)
131
- docs = transformer.transform(limit=arxiv_limit)
132
- pack_name = f"{pack_prefix}-arxiv"
133
- elif dataset == "prompt-report":
134
- transformer = PromptReportTransformer(max_pdf_pages=max_pdf_pages)
135
- docs = transformer.transform()
136
- pack_name = f"{pack_prefix}-prompt-report"
137
- elif dataset == "novels":
138
- transformer = NovelsTransformer(max_pdf_pages=max_pdf_pages)
139
- docs = transformer.transform()
140
- pack_name = f"{pack_prefix}-novels"
141
- elif dataset == "manuals":
142
- transformer = ManualsTransformer(max_pdf_pages=max_pdf_pages)
143
- docs = transformer.transform()
144
- pack_name = f"{pack_prefix}-manuals"
145
- elif dataset == "enterprise":
146
- transformer = EnterpriseTransformer(max_pdf_pages=max_pdf_pages)
147
  docs = transformer.transform()
148
- pack_name = f"{pack_prefix}-enterprise"
149
- elif dataset == "portuguese-edu":
150
- transformer = PortugueseEducationTransformer(max_pdf_pages=max_pdf_pages)
151
  docs = transformer.transform()
152
- pack_name = f"{pack_prefix}-portuguese-edu"
153
- elif dataset == "edustories":
154
- transformer = EdustoriesTransformer(max_pdf_pages=max_pdf_pages)
155
- docs = transformer.transform()
156
- pack_name = f"{pack_prefix}-edustories"
157
  else:
158
  click.echo(f"[ERROR] Unknown dataset: {dataset}")
159
  continue
@@ -214,42 +165,14 @@ class HFWarblerIngestor:
214
  transformer = NPCDialogueTransformer(max_pdf_pages=max_pdf_pages)
215
  docs = transformer.transform()
216
  pack_name = f"{pack_prefix}-npc-dialogue"
217
- elif dataset_name == "multi-character":
218
- transformer = MultiCharacterTransformer(max_pdf_pages=max_pdf_pages)
219
- docs = transformer.transform()
220
- pack_name = f"{pack_prefix}-multi-character"
221
- elif dataset_name == "system-chat":
222
- transformer = SystemChatTransformer(max_pdf_pages=max_pdf_pages)
223
- docs = transformer.transform()
224
- pack_name = f"{pack_prefix}-system-chat"
225
- elif dataset_name == "arxiv":
226
- transformer = ArxivTransformer(max_pdf_pages=max_pdf_pages)
227
- docs = transformer.transform(limit=arxiv_limit)
228
- pack_name = f"{pack_prefix}-arxiv"
229
- elif dataset_name == "prompt-report":
230
- transformer = PromptReportTransformer(max_pdf_pages=max_pdf_pages)
231
- docs = transformer.transform()
232
- pack_name = f"{pack_prefix}-prompt-report"
233
- elif dataset_name == "novels":
234
- transformer = NovelsTransformer(max_pdf_pages=max_pdf_pages)
235
- docs = transformer.transform()
236
- pack_name = f"{pack_prefix}-novels"
237
- elif dataset_name == "manuals":
238
- transformer = ManualsTransformer(max_pdf_pages=max_pdf_pages)
239
- docs = transformer.transform()
240
- pack_name = f"{pack_prefix}-manuals"
241
- elif dataset_name == "enterprise":
242
- transformer = EnterpriseTransformer(max_pdf_pages=max_pdf_pages)
243
- docs = transformer.transform()
244
- pack_name = f"{pack_prefix}-enterprise"
245
- elif dataset_name == "portuguese-edu":
246
- transformer = PortugueseEducationTransformer(max_pdf_pages=max_pdf_pages)
247
  docs = transformer.transform()
248
- pack_name = f"{pack_prefix}-portuguese-edu"
249
- elif dataset_name == "edustories":
250
- transformer = EdustoriesTransformer(max_pdf_pages=max_pdf_pages)
251
  docs = transformer.transform()
252
- pack_name = f"{pack_prefix}-edustories"
253
  else:
254
  if self.verbose:
255
  print(f"❌ Unknown dataset: {dataset_name}")
 
14
 
15
  from .transformers import (
16
  NPCDialogueTransformer,
17
+ SyntheticFictionalCharactersTransformer,
18
+ TinyStoriesNarrativeTransformer,
 
 
 
 
 
 
 
19
  WarblerPackBuilder,
20
  )
21
 
 
40
  type=click.Choice(
41
  [
42
  "npc-dialogue",
43
+ "fictional-characters",
44
+ "tinystories",
 
 
 
 
 
 
 
45
  "all",
46
  ]
47
  ),
48
+ default=["npc-dialogue"],
49
  help="Datasets to ingest",
50
  )
51
  @click.option("--pack-prefix", "-p", default="warbler-pack-hf", help="Prefix for pack names")
 
 
 
52
  @click.option(
53
  "--max-docs-per-chunk",
54
  type=int,
 
61
  default=None,
62
  help="Maximum PDF pages to extract (default: None for unlimited)",
63
  )
64
+ def ingest(datasets, pack_prefix, max_docs_per_chunk, max_pdf_pages):
65
  """Ingest HF datasets into Warbler packs."""
66
  PACKS_DIR.mkdir(exist_ok=True, parents=True)
67
  builder = WarblerPackBuilder(PACKS_DIR)
 
69
 
70
  if "all" in datasets:
71
  datasets = [
72
+ "npc-dialogue",
73
+ "fictional-characters",
74
+ "tinystories",
 
 
 
 
75
  ]
76
 
77
  if max_docs_per_chunk > 0:
 
97
  transformer = NPCDialogueTransformer(max_pdf_pages=max_pdf_pages)
98
  docs = transformer.transform()
99
  pack_name = f"{pack_prefix}-npc-dialogue"
100
+ elif dataset == "fictional-characters":
101
+ transformer = SyntheticFictionalCharactersTransformer(max_pdf_pages=max_pdf_pages)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  docs = transformer.transform()
103
+ pack_name = f"{pack_prefix}-fictional-characters"
104
+ elif dataset == "tinystories":
105
+ transformer = TinyStoriesNarrativeTransformer(max_pdf_pages=max_pdf_pages)
106
  docs = transformer.transform()
107
+ pack_name = f"{pack_prefix}-tinystories"
 
 
 
 
108
  else:
109
  click.echo(f"[ERROR] Unknown dataset: {dataset}")
110
  continue
 
165
  transformer = NPCDialogueTransformer(max_pdf_pages=max_pdf_pages)
166
  docs = transformer.transform()
167
  pack_name = f"{pack_prefix}-npc-dialogue"
168
+ elif dataset_name == "fictional-characters":
169
+ transformer = SyntheticFictionalCharactersTransformer(max_pdf_pages=max_pdf_pages)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  docs = transformer.transform()
171
+ pack_name = f"{pack_prefix}-fictional-characters"
172
+ elif dataset_name == "tinystories":
173
+ transformer = TinyStoriesNarrativeTransformer(max_pdf_pages=max_pdf_pages)
174
  docs = transformer.transform()
175
+ pack_name = f"{pack_prefix}-tinystories"
176
  else:
177
  if self.verbose:
178
  print(f"❌ Unknown dataset: {dataset_name}")
warbler_cda/utils/transformers/__init__.py CHANGED
@@ -1,26 +1,12 @@
1
  from .base import BaseWarblerTransformer, WarblerPackBuilder
2
  from .npc_dialogue import NPCDialogueTransformer
3
- from .multi_character import MultiCharacterTransformer
4
- from .system_chat import SystemChatTransformer
5
- from .arxiv import ArxivTransformer
6
- from .prompt_report import PromptReportTransformer
7
- from .novels import NovelsTransformer
8
- from .manuals import ManualsTransformer
9
- from .enterprise import EnterpriseTransformer
10
- from .portuguese_education import PortugueseEducationTransformer
11
- from .edustories import EdustoriesTransformer
12
 
13
  __all__ = [
14
  "BaseWarblerTransformer",
15
  "WarblerPackBuilder",
16
  "NPCDialogueTransformer",
17
- "MultiCharacterTransformer",
18
- "SystemChatTransformer",
19
- "ArxivTransformer",
20
- "PromptReportTransformer",
21
- "NovelsTransformer",
22
- "ManualsTransformer",
23
- "EnterpriseTransformer",
24
- "PortugueseEducationTransformer",
25
- "EdustoriesTransformer",
26
  ]
 
1
  from .base import BaseWarblerTransformer, WarblerPackBuilder
2
  from .npc_dialogue import NPCDialogueTransformer
3
+ from .synthetic_fictional_characters import SyntheticFictionalCharactersTransformer
4
+ from .tiny_stories_narrative import TinyStoriesNarrativeTransformer
 
 
 
 
 
 
 
5
 
6
  __all__ = [
7
  "BaseWarblerTransformer",
8
  "WarblerPackBuilder",
9
  "NPCDialogueTransformer",
10
+ "SyntheticFictionalCharactersTransformer",
11
+ "TinyStoriesNarrativeTransformer",
 
 
 
 
 
 
 
12
  ]
warbler_cda/utils/transformers/synthetic_fictional_characters.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Synthetic Fictional Characters dataset transformer."""
2
+
3
+ import logging
4
+ from typing import List, Dict, Any
5
+
6
+ import kagglehub
7
+ from kagglehub import KaggleDatasetAdapter
8
+
9
+ from .base import BaseWarblerTransformer
10
+
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class SyntheticFictionalCharactersTransformer(BaseWarblerTransformer):
    """Transform pratyushpuri/synthetic-fictional-characters-dataset."""

    def transform(
        self, dataset_name: str = "pratyushpuri/synthetic-fictional-characters-dataset",
        file_path: str = ""
    ) -> List[Dict[str, Any]]:
        """
        Transform the synthetic fictional characters dataset into Warbler docs.

        Loads the dataset via kagglehub's Hugging Face adapter, then turns each
        character-profile row into a Warbler-compatible document dict with
        ``content_id``, ``content``, and ``metadata`` keys.

        Fields include: Character Name, Media Type, Genre, Role, Personality Traits,
        Backstory, Skills/Abilities, Appearance, Alignment, Relationships, etc.

        Args:
            dataset_name: Kaggle dataset identifier to load.
            file_path: Optional file path within the dataset ("" loads the default).

        Returns:
            List of Warbler document dicts; empty list when loading fails.
        """
        logger.info(f"Loading {dataset_name}...")
        try:
            # Load using KaggleHub with the HF adapter. Additional arguments
            # (sql_query, hf_kwargs, pandas_kwargs) are described in the
            # documentation:
            # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterhugging_face
            hf_dataset = kagglehub.load_dataset(
                KaggleDatasetAdapter.HUGGING_FACE,
                dataset_name,
                file_path,
            )
        except Exception as e:
            # Best-effort ingestion: a failed download/parse yields an empty
            # pack rather than aborting the whole run.
            logger.error(f"Failed to load {dataset_name}: {e}")
            return []

        warbler_docs = []

        items = self.extract_dataset_items(hf_dataset)

        for idx, item in enumerate(items):
            if isinstance(item, dict):
                try:
                    # Fall back to a positional id when the name is missing OR
                    # empty, so content_ids stay unique (empty names would
                    # otherwise all slug to the same "unknown").
                    raw_name = item.get("Character Name") or f"character-{idx}"
                    doc = {
                        "content_id": f"fictional-characters/{self._safe_slug(raw_name)}",
                        "content": self._create_content(item),
                        "metadata": {
                            "pack": "warbler-pack-fictional-characters",
                            "source_dataset": dataset_name,
                            "character_name": item.get("Character Name", ""),
                            "media_type": item.get("Media Type", ""),
                            "media_source": item.get("Media Source", ""),
                            "genre": item.get("Genre", ""),
                            "role": item.get("Role", ""),
                            "personality_traits": item.get("Personality Traits", ""),
                            "skills_abilities": item.get("Skills/Abilities", ""),
                            "alignment": item.get("Alignment", ""),
                            "relationships": item.get("Relationships", ""),
                            "significance_impact": item.get("Significance/Impact", ""),
                            "realm_type": "character",
                            "realm_label": "fictional_characters",
                            "lifecycle_stage": "emergence",
                            "activity_level": 0.85,
                            "dialogue_type": "character_profile",
                            "license": "MIT",
                        },
                    }
                    warbler_docs.append(doc)
                except Exception as e:
                    # Skip malformed rows without killing the whole ingest.
                    logger.warning(f"Error processing character {idx}: {e}")
                    continue

        logger.info(f"✓ Transformed {len(warbler_docs)} fictional character profiles")
        return warbler_docs

    @staticmethod
    def _safe_slug(text: str) -> str:
        """Create a URL-safe slug from *text*; never returns an empty string."""
        if not text:
            return "unknown"
        slug = "".join(
            c for c in text.lower().replace(" ", "-") if c.isalnum() or c == "-"
        ).strip("-")
        # A name made entirely of punctuation would otherwise slug to "",
        # yielding a content_id that ends in a bare "/".
        return slug or "unknown"

    @staticmethod
    def _create_content(item: Dict[str, Any]) -> str:
        """Render a character row into a human-readable profile string.

        Sections are emitted only when the corresponding field is non-empty,
        so sparse rows still produce a clean profile.
        """
        character_name = item.get("Character Name", "Unknown Character")
        media_type = item.get("Media Type", "")
        media_source = item.get("Media Source", "")
        genre = item.get("Genre", "")
        role = item.get("Role", "")
        personality_traits = item.get("Personality Traits", "")
        backstory = item.get("Backstory", "")
        skills_abilities = item.get("Skills/Abilities", "")
        appearance_description = item.get("Appearance Description", "")
        alignment = item.get("Alignment", "")
        interests_hobbies = item.get("Interests/Hobbies", "")
        relationships = item.get("Relationships", "")
        significance_impact = item.get("Significance/Impact", "")
        description = item.get("Description", "")
        scenario_dialogue = item.get("Scenario/Dialogue Example", "")

        content_parts = [
            f"CHARACTER PROFILE: {character_name}",
            ""
        ]

        if media_type and media_source:
            content_parts.append(f"Source: {media_type} - {media_source}")
        elif media_type:
            content_parts.append(f"Media Type: {media_type}")

        if genre:
            content_parts.append(f"Genre: {genre}")

        if role:
            content_parts.append(f"Role: {role}")

        content_parts.append("")

        if personality_traits:
            content_parts.append(f"Personality: {personality_traits}")

        if appearance_description:
            content_parts.append(f"Appearance: {appearance_description}")

        if alignment:
            content_parts.append(f"Alignment: {alignment}")

        if skills_abilities:
            content_parts.append(f"Skills & Abilities: {skills_abilities}")

        if interests_hobbies:
            content_parts.append(f"Interests & Hobbies: {interests_hobbies}")

        content_parts.append("")

        if backstory:
            content_parts.append(f"Backstory: {backstory}")

        if relationships:
            content_parts.append(f"Relationships: {relationships}")

        if significance_impact:
            content_parts.append(f"Significance: {significance_impact}")

        if description:
            content_parts.append(f"Detailed Description: {description}")

        if scenario_dialogue:
            content_parts.append("")
            content_parts.append(f"Example Scenario: {scenario_dialogue}")

        content_parts.append("")
        content_parts.append("This comprehensive character profile supports narrative development and character-driven storytelling.")

        return "\n".join(content_parts)
warbler_cda/utils/transformers/tiny_stories_narrative.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tiny Stories Narrative dataset transformer."""
2
+
3
+ import logging
4
+ from typing import List, Dict, Any
5
+
6
+ import kagglehub
7
+ from kagglehub import KaggleDatasetAdapter
8
+
9
+ from .base import BaseWarblerTransformer
10
+
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class TinyStoriesNarrativeTransformer(BaseWarblerTransformer):
    """Transform thedevastator/tinystories-narrative-classification dataset."""

    def transform(
        self, dataset_name: str = "thedevastator/tinystories-narrative-classification",
        file_path: str = ""
    ) -> List[Dict[str, Any]]:
        """
        Transform the TinyStories narrative classification dataset.

        Loads the dataset through kagglehub's Hugging Face adapter and converts
        each non-empty story text into a Warbler-compatible document, tagging it
        with coarse narrative-feature metadata (characters, locations, dialogue).

        Args:
            dataset_name: Kaggle dataset identifier to load.
            file_path: Optional file path within the dataset ("" loads the default).

        Returns:
            List of Warbler document dicts; empty list when loading fails.
        """
        logger.info(f"Loading {dataset_name}...")
        try:
            # kagglehub HF adapter; extra options (sql_query, hf_kwargs,
            # pandas_kwargs) are documented in the kagglehub README:
            # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterhugging_face
            hf_dataset = kagglehub.load_dataset(
                KaggleDatasetAdapter.HUGGING_FACE,
                dataset_name,
                file_path,
            )
        except Exception as e:
            logger.error(f"Failed to load {dataset_name}: {e}")
            return []

        documents: List[Dict[str, Any]] = []

        for index, record in enumerate(self.extract_dataset_items(hf_dataset)):
            if not isinstance(record, dict):
                continue
            try:
                story = record.get("text", "").strip()
                if not story:
                    # Blank rows carry no narrative content; drop them.
                    continue

                # Coarse narrative features feed the document metadata.
                features = self._analyze_narrative_features(story)

                documents.append({
                    "content_id": f"tinystories/{index:06d}",
                    "content": self._create_content(story),
                    "metadata": {
                        "pack": "warbler-pack-tinystories",
                        "source_dataset": dataset_name,
                        "story_length": len(story),
                        "word_count": len(story.split()),
                        "has_characters": features.get("has_characters", False),
                        "has_location": features.get("has_location", False),
                        "has_dialogue": features.get("has_dialogue", False),
                        "narrative_elements": features.get("elements", []),
                        "realm_type": "narrative",
                        "realm_label": "tinystories",
                        "lifecycle_stage": "emergence",
                        "activity_level": 0.75,
                        "dialogue_type": "story",
                        "license": "CC0",
                    },
                })
            except Exception as e:
                # Skip malformed rows without aborting the whole ingest.
                logger.warning(f"Error processing story {index}: {e}")
                continue

        logger.info(f"✓ Transformed {len(documents)} tiny story narratives")
        return documents

    @staticmethod
    def _analyze_narrative_features(story_text: str) -> Dict[str, Any]:
        """Detect coarse narrative features in *story_text* via substring cues.

        Returns a dict with boolean flags (``has_characters``, ``has_location``,
        ``has_dialogue``) and an ordered ``elements`` list of detected features.
        """
        lowered = story_text.lower()
        elements: List[str] = []

        # Character cues. NOTE(review): " the " appears in nearly every story,
        # so has_characters is almost always True — confirm this is intended.
        character_cues = (
            " once ", " there was ", " lived ", " the ", " said ", " asked ", " replied ",
            " cried ", " shouted ", " whispered ", " thought ", " wanted ", " decided ",
        )
        has_characters = any(cue in lowered for cue in character_cues)
        if has_characters:
            elements.append("characters")

        # Setting / location cues.
        location_cues = (
            " house ", " home ", " village ", " town ", " forest ", " mountain ", " river ",
            " garden ", " school ", " castle ", " kingdom ", " world ", " place ",
        )
        has_location = any(cue in lowered for cue in location_cues)
        if has_location:
            elements.append("locations")

        # Dialogue: quote characters (case-sensitive text) or a speech verb.
        has_dialogue = '"' in story_text or "'" in story_text or " said " in lowered
        if has_dialogue:
            elements.append("dialogue")

        # Additional narrative elements, in a fixed order.
        if " and " in lowered or " then " in lowered:
            elements.append("sequencing")
        if " happy " in lowered or " sad " in lowered or " angry " in lowered:
            elements.append("emotions")
        if any(marker in lowered for marker in ("because", "so", "therefore")):
            elements.append("causality")

        return {
            "has_characters": has_characters,
            "has_location": has_location,
            "has_dialogue": has_dialogue,
            "elements": elements,
        }

    @staticmethod
    def _create_content(story_text: str) -> str:
        """Wrap a raw story in the standard TinyStories document template."""
        return f"""TINY STORY NARRATIVE

{story_text}

This short story demonstrates fundamental narrative patterns, character development, and storytelling techniques suitable for cognitive narrative analysis."""