Leonardo committed on
Commit
5ca0d67
·
verified ·
1 Parent(s): 26cad6d

Create frontmatter_tool.py

Browse files
Files changed (1) hide show
  1. scripts/frontmatter_tool.py +448 -0
scripts/frontmatter_tool.py ADDED
@@ -0,0 +1,448 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Frontmatter Generator Tool for Smolagents
3
+
4
+ This tool helps generate consistent YAML frontmatter for documents,
5
+ useful for RAG systems, static site generators, and document organization.
6
+ """
7
+
8
+ import re
9
+ import yaml
10
+ from datetime import datetime
11
+ from typing import Dict, List, Optional, Any, Union
12
+ from smolagents import Tool
13
+
14
+
15
+ class FrontmatterGeneratorTool(Tool):
16
+ """Tool for generating and manipulating YAML frontmatter in documents."""
17
+
18
+ name = "frontmatter_generator"
19
+ description = """
20
+ Generates or extracts YAML frontmatter for documents. Frontmatter provides structured
21
+ metadata for documents including title, author, date, description, and tags.
22
+ Useful for document organization, RAG systems, and static site generators.
23
+ """
24
+
25
+ inputs = {
26
+ "content": {
27
+ "type": "string",
28
+ "description": "Document content (with or without existing frontmatter)",
29
+ },
30
+ "title": {"type": "string", "description": "Document title", "optional": True},
31
+ "author": {
32
+ "type": "string",
33
+ "description": "Document author(s)",
34
+ "optional": True,
35
+ },
36
+ "date": {
37
+ "type": "string",
38
+ "description": "Document date in YYYY-MM-DD format (defaults to today if not provided)",
39
+ "optional": True,
40
+ },
41
+ "date_format": {
42
+ "type": "string",
43
+ "description": "Format string for the document date (e.g., '%Y-%m-%d', '%d/%m/%Y'). Defaults to '%Y-%m-%d'",
44
+ "optional": True,
45
+ "default": "%Y-%m-%d",
46
+ },
47
+ "description": {
48
+ "type": "string",
49
+ "description": "Brief description of the document",
50
+ "optional": True,
51
+ },
52
+ "tags": {
53
+ "type": "string",
54
+ "description": "Comma-separated list of tags",
55
+ "optional": True,
56
+ },
57
+ "additional_fields": {
58
+ "type": "string",
59
+ "description": "JSON string with additional frontmatter fields",
60
+ "optional": True,
61
+ },
62
+ "mode": {
63
+ "type": "string",
64
+ "description": "Operation mode: 'generate' (create new), 'extract' (get existing), 'update' (modify existing), or 'strip' (remove)",
65
+ "default": "generate",
66
+ },
67
+ }
68
+ output_type = "string"
69
+
70
+ # Regular expression to detect and extract YAML frontmatter
71
+ FRONTMATTER_PATTERN = r"^---\s*\n(.*?)\n---\s*\n"
72
+
73
def forward(
    self,
    content: str,
    title: Optional[str] = None,
    author: Optional[str] = None,
    date: Optional[str] = None,
    date_format: Optional[str] = "%Y-%m-%d",
    description: Optional[str] = None,
    tags: Optional[str] = None,
    additional_fields: Optional[str] = None,
    mode: str = "generate",
) -> str:
    """
    Validate the arguments and run the frontmatter operation selected by ``mode``.

    Problems are reported as a returned ``"Error: ..."`` string rather than
    as a raised exception.

    Args:
        content: Document text, with or without a frontmatter block
        title: Document title
        author: Document author(s)
        date: Document date
        date_format: Date format string forwarded to the update/generate helpers
        description: Brief document description
        tags: Comma-separated list of tags
        additional_fields: JSON string with additional fields
        mode: One of "generate", "extract", "update" or "strip"

    Returns:
        The processed document, the extracted frontmatter, or an error message
    """
    # --- type validation -------------------------------------------------
    if not isinstance(content, str):
        return "Error: Content must be a string"

    # Optional fields are only type-checked when they hold a truthy value.
    optional_fields = (
        ("Title", title),
        ("Author", author),
        ("Date", date),
        ("Description", description),
        ("Tags", tags),
        ("Additional_fields", additional_fields),
    )
    for label, value in optional_fields:
        if value and not isinstance(value, str):
            return f"Error: {label} must be a string"

    if not isinstance(mode, str):
        return "Error: Mode must be a string"

    # --- mode validation -------------------------------------------------
    valid_modes = ("generate", "extract", "update", "strip")
    if mode not in valid_modes:
        return f"Error: Invalid mode '{mode}'. Valid options are: {', '.join(valid_modes)}"

    # --- blank content ---------------------------------------------------
    is_blank = not content or not content.strip()
    if is_blank:
        if mode != "generate":
            return "Error: Empty content provided"
        # Frontmatter can still be built purely from the supplied fields.
        content = ""

    # --- dispatch --------------------------------------------------------
    try:
        if mode == "extract":
            return self._extract_frontmatter(content)
        if mode == "strip":
            return self._strip_frontmatter(content)

        # "update" and "generate" share one positional signature.
        handler = (
            self._update_frontmatter
            if mode == "update"
            else self._generate_frontmatter
        )
        return handler(
            content,
            title,
            author,
            date,
            description,
            tags,
            additional_fields,
            date_format,
        )
    except Exception as exc:
        # Any helper failure is turned into an error string for the caller.
        return f"Error processing frontmatter: {str(exc)}"
163
+
164
+ def _extract_frontmatter(self, content: str) -> str:
165
+ """Extract and return existing frontmatter as formatted YAML."""
166
+ match = re.search(self.FRONTMATTER_PATTERN, content, re.DOTALL)
167
+ if not match:
168
+ return "No frontmatter found in the document"
169
+
170
+ try:
171
+ yaml_content = match.group(1)
172
+ # Parse and reformat for consistency
173
+ frontmatter_dict = yaml.safe_load(yaml_content)
174
+ return f"Extracted frontmatter:\n\n```yaml\n{yaml.dump(frontmatter_dict, sort_keys=False, default_flow_style=False)}```"
175
+ except yaml.YAMLError:
176
+ return "Found frontmatter but failed to parse it as valid YAML"
177
+
178
+ def _strip_frontmatter(self, content: str) -> str:
179
+ """Remove frontmatter from document and return clean content."""
180
+ result = re.sub(self.FRONTMATTER_PATTERN, "", content, count=1, flags=re.DOTALL)
181
+
182
+ # Check if anything was actually removed
183
+ if result == content:
184
+ return "No frontmatter found to strip. Content unchanged."
185
+
186
+ return result.strip()
187
+
188
+ def _parse_additional_fields(self, additional_fields: str) -> Dict[str, Any]:
189
+ """Parse the additional_fields JSON string into a dictionary."""
190
+ if not additional_fields:
191
+ return {}
192
+
193
+ try:
194
+ import json
195
+
196
+ return json.loads(additional_fields)
197
+ except json.JSONDecodeError:
198
+ raise ValueError("additional_fields must be a valid JSON string")
199
+
200
+ def _infer_title_from_content(self, content: str) -> Optional[str]:
201
+ """Attempt to infer document title from content."""
202
+ # Try to find the first heading
203
+ heading_match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
204
+ if heading_match:
205
+ return heading_match.group(1).strip()
206
+
207
+ # Try to find the first non-empty line
208
+ lines = content.split("\n")
209
+ for line in lines:
210
+ if line.strip():
211
+ # Limit to a reasonable title length
212
+ return line.strip()[:100]
213
+
214
+ return None
215
+
216
+ def _parse_tags(self, tags_string: str) -> List[str]:
217
+ """Parse comma-separated tags into a list."""
218
+ if not tags_string:
219
+ return []
220
+
221
+ # Split by comma and clean each tag
222
+ tag_list = [tag.strip() for tag in tags_string.split(",")]
223
+ # Remove any empty tags
224
+ return [tag for tag in tag_list if tag]
225
+
226
+ def _parse_flexible_date(
227
+ self, date_str: str, date_format: Optional[str] = None
228
+ ) -> str:
229
+ """
230
+ Try to parse dates in various formats and convert to YYYY-MM-DD.
231
+
232
+ Args:
233
+ date_str: The date string to parse
234
+ date_format: Optional preferred format to try first
235
+
236
+ Returns:
237
+ Formatted date as string (YYYY-MM-DD by default)
238
+ """
239
+ if not date_str:
240
+ return datetime.now().strftime("%Y-%m-%d")
241
+
242
+ # If a specific format is provided, try it first
243
+ if date_format:
244
+ try:
245
+ parsed_date = datetime.strptime(date_str, date_format)
246
+ return parsed_date.strftime("%Y-%m-%d")
247
+ except ValueError:
248
+ # If it fails, continue with other formats
249
+ pass
250
+
251
+ # Common formats to try
252
+ formats = [
253
+ "%Y-%m-%d", # 2013-03-13
254
+ "%d %B %Y", # 13 March 2013
255
+ "%B %Y", # September 2013
256
+ "%Y", # 1958
257
+ "%d/%m/%Y", # 13/03/2013
258
+ "%m/%d/%Y", # 03/13/2013
259
+ "%d-%m-%Y", # 13-03-2013
260
+ "%m-%d-%Y", # 03-13-2013
261
+ "%Y/%m/%d", # 2013/03/13
262
+ ]
263
+
264
+ for fmt in formats:
265
+ try:
266
+ parsed_date = datetime.strptime(date_str, fmt)
267
+ return parsed_date.strftime("%Y-%m-%d")
268
+ except ValueError:
269
+ continue
270
+
271
+ # If no format matched, return the original string
272
+ return date_str
273
+
274
+ def _update_frontmatter(
275
+ self,
276
+ content: str,
277
+ title: Optional[str] = None,
278
+ author: Optional[str] = None,
279
+ date: Optional[str] = None,
280
+ description: Optional[str] = None,
281
+ tags: Optional[str] = None,
282
+ additional_fields: Optional[str] = None,
283
+ date_format: Optional[str] = None,
284
+ ) -> str:
285
+ """Update existing frontmatter with new values."""
286
+ # Check if frontmatter exists
287
+ match = re.search(self.FRONTMATTER_PATTERN, content, re.DOTALL)
288
+ if not match:
289
+ # If no frontmatter exists, generate new one
290
+ return self._generate_frontmatter(
291
+ content,
292
+ title,
293
+ author,
294
+ date,
295
+ description,
296
+ tags,
297
+ additional_fields,
298
+ date_format,
299
+ )
300
+
301
+ # Parse existing frontmatter
302
+ yaml_content = match.group(1)
303
+ try:
304
+ frontmatter_dict = yaml.safe_load(yaml_content) or {}
305
+ except yaml.YAMLError:
306
+ frontmatter_dict = {}
307
+
308
+ # Update with new values if provided
309
+ if title:
310
+ frontmatter_dict["title"] = title
311
+ if author:
312
+ frontmatter_dict["author"] = author
313
+ if date:
314
+ # Try to parse the date with the flexible parser
315
+ frontmatter_dict["date"] = self._parse_flexible_date(date, date_format)
316
+ if description:
317
+ frontmatter_dict["description"] = description
318
+ if tags:
319
+ frontmatter_dict["tags"] = self._parse_tags(tags)
320
+
321
+ # Add additional fields
322
+ if additional_fields:
323
+ additional_dict = self._parse_additional_fields(additional_fields)
324
+ frontmatter_dict.update(additional_dict)
325
+
326
+ # Generate new frontmatter
327
+ new_frontmatter = yaml.dump(
328
+ frontmatter_dict, sort_keys=False, default_flow_style=False
329
+ )
330
+ new_frontmatter = f"---\n{new_frontmatter}---\n\n"
331
+
332
+ # Replace old frontmatter with new one
333
+ return re.sub(
334
+ self.FRONTMATTER_PATTERN, new_frontmatter, content, count=1, flags=re.DOTALL
335
+ )
336
+
337
def _generate_frontmatter(
    self,
    content: str,
    title: Optional[str] = None,
    author: Optional[str] = None,
    date: Optional[str] = None,
    description: Optional[str] = None,
    tags: Optional[str] = None,
    additional_fields: Optional[str] = None,
    date_format: Optional[str] = None,
) -> str:
    """Build a new frontmatter block and prepend it to the document body.

    Any frontmatter block already present is discarded first. The title is
    inferred from the body when not supplied, and the date defaults to
    today when not supplied.

    Args:
        content: Document text, with or without existing frontmatter
        title: Document title; inferred from the body when omitted
        author: Document author(s)
        date: Document date, normalised through ``_parse_flexible_date``
        description: Brief document description
        tags: Comma-separated list of tags
        additional_fields: JSON string with extra keys, merged last
        date_format: Preferred format for parsing ``date``; also used to
            format today's date when ``date`` is omitted

    Returns:
        The new frontmatter block followed by the document body.
    """
    # Remove an existing block directly instead of via _strip_frontmatter():
    # that helper returns a notice string when no block is present, which
    # would otherwise replace the document body (and the inferred title).
    if isinstance(content, str):
        existing = re.search(self.FRONTMATTER_PATTERN, content, re.DOTALL)
        if existing is not None:
            clean_content = (
                content[: existing.start()] + content[existing.end():]
            ).strip()
        else:
            clean_content = content
    else:
        clean_content = ""

    frontmatter_dict = {}

    # Title: explicit value wins, otherwise try to infer one from the body.
    if title:
        frontmatter_dict["title"] = title
    else:
        inferred_title = self._infer_title_from_content(clean_content)
        if inferred_title:
            frontmatter_dict["title"] = inferred_title

    if author:
        frontmatter_dict["author"] = author

    # Date: normalise a supplied value, otherwise stamp the current date.
    if date:
        frontmatter_dict["date"] = self._parse_flexible_date(date, date_format)
    else:
        format_to_use = date_format or "%Y-%m-%d"
        frontmatter_dict["date"] = datetime.now().strftime(format_to_use)

    if description:
        frontmatter_dict["description"] = description

    if tags:
        frontmatter_dict["tags"] = self._parse_tags(tags)

    # Extra fields are merged last, so they can override the ones above.
    if additional_fields:
        frontmatter_dict.update(self._parse_additional_fields(additional_fields))

    frontmatter_yaml = yaml.dump(
        frontmatter_dict, sort_keys=False, default_flow_style=False
    )
    frontmatter = f"---\n{frontmatter_yaml}---\n\n"

    return frontmatter + clean_content
396
+
397
+
398
+ # Example usage
399
def example_usage():
    """Generate frontmatter for a short sample document, print it and return it."""
    sample_text = (
        "# The Role of the Seat in International Arbitration\n"
        "\n"
        "This paper examines the significance of the seat in international arbitration proceedings,\n"
        "with a focus on recent developments in Australian law."
    )

    metadata = {
        "title": "The Role of the Seat in International Arbitration",
        "author": "Matthew Barry",
        # Free-form date; matches the "%d %B %Y" entry tried by _parse_flexible_date.
        "date": "13 March 2013",
        "description": "A legal analysis of the role of the seat in international arbitration.",
        "tags": "international arbitration, enforcement, australian courts",
    }

    generator = FrontmatterGeneratorTool()
    output = generator.forward(content=sample_text, mode="generate", **metadata)

    print(output)
    return output
422
+
423
+
424
+ # Define what gets imported with "from frontmatter_tool import *"
425
+ __all__ = ["FrontmatterGeneratorTool", "example_usage"]
426
+
427
+ # Example usage
428
if __name__ == "__main__":
    # Running the module as a script only constructs the tool. For an
    # end-to-end demonstration (generate frontmatter for a sample document
    # and print it), call example_usage().
    tool = FrontmatterGeneratorTool()