Spaces:
Runtime error
Runtime error
"""
Frontmatter Generator Tool for Smolagents

This tool helps generate consistent YAML frontmatter for documents,
useful for RAG systems, static site generators, and document organization.
Integrates with TextInspectorTool and MarkdownConverter for a complete
document processing pipeline.
"""
| import re | |
| import yaml | |
| import json | |
| from datetime import datetime | |
| from typing import Dict, List, Optional, Any, Union | |
| from smolagents import Tool | |
class FrontmatterGeneratorTool(Tool):
    """Tool for generating and manipulating YAML frontmatter in documents.

    Frontmatter is a YAML block delimited by ``---`` lines at the very start
    of a document. The tool supports four modes, selected via ``mode``:

    * ``generate`` - discard any existing frontmatter and prepend a new one.
    * ``extract``  - return the existing frontmatter as a fenced YAML block.
    * ``update``   - merge the provided fields into the existing frontmatter
      (falls back to ``generate`` when the document has none).
    * ``strip``    - return the document body without its frontmatter.

    All failures are reported as ``"Error: ..."`` strings rather than raised,
    so the calling agent always receives a string.
    """

    name = "frontmatter_generator"
    description = """
    Generates or extracts YAML frontmatter for documents. Frontmatter provides structured
    metadata for documents including title, author, date, description, and tags.
    Useful for document organization, RAG systems, and static site generators.
    Works with content from the inspect_file_as_text tool to add metadata to documents.
    """
    inputs = {
        "content": {
            "type": "string",
            "description": "Document content (with or without existing frontmatter)",
        },
        "title": {"type": "string", "description": "Document title", "nullable": True},
        "author": {
            "type": "string",
            "description": "Document author(s)",
            "nullable": True,
        },
        "date": {
            "type": "string",
            "description": "Document date in YYYY-MM-DD format (defaults to today if not provided)",
            "nullable": True,
        },
        "date_format": {
            "type": "string",
            "description": "Format string for the document date (e.g., '%Y-%m-%d', '%d/%m/%Y'). Defaults to '%Y-%m-%d'",
            "nullable": True,
            "default": "%Y-%m-%d",
        },
        "description": {
            "type": "string",
            "description": "Brief description of the document",
            "nullable": True,
        },
        "tags": {
            "type": "string",
            "description": "Comma-separated list of tags",
            "nullable": True,
        },
        "additional_fields": {
            "type": "string",
            "description": "JSON string with additional frontmatter fields",
            "nullable": True,
        },
        "mode": {
            "type": "string",
            "description": "Operation mode: 'generate' (create new), 'extract' (get existing), 'update' (modify existing), or 'strip' (remove)",
            # `forward` gives `mode` a default value, so the schema must mark
            # it nullable as well; the Tool base class validates that the two
            # agree when the tool is instantiated.
            "nullable": True,
            "default": "generate",
        },
    }
    output_type = "string"

    # Regular expression to detect and extract YAML frontmatter. It is used
    # with re.DOTALL and without re.MULTILINE, so `^` only matches at the very
    # start of the document.
    FRONTMATTER_PATTERN = r"^---\s*\n(.*?)\n---\s*\n"

    def forward(
        self,
        content: str,
        title: Optional[str] = None,
        author: Optional[str] = None,
        date: Optional[str] = None,
        date_format: Optional[str] = "%Y-%m-%d",
        description: Optional[str] = None,
        tags: Optional[str] = None,
        additional_fields: Optional[str] = None,
        mode: str = "generate",
    ) -> str:
        """Process document content based on the specified mode.

        Args:
            content: Document content with or without frontmatter.
            title: Document title.
            author: Document author(s).
            date: Document date; several common formats are accepted.
            date_format: strptime/strftime format string. Tried first when
                parsing ``date``; used to format today's date when ``date``
                is omitted in generate mode.
            description: Brief document description.
            tags: Comma-separated list of tags.
            additional_fields: JSON object (as a string) with extra fields.
            mode: Operation mode (generate, extract, update, strip). ``None``
                is treated as "generate".

        Returns:
            The processed document, the extracted frontmatter, or a string
            starting with "Error" when validation or processing fails.
        """
        # Validate inputs
        if not isinstance(content, str):
            return "Error: Content must be a string"
        optional_strings = (
            ("Title", title),
            ("Author", author),
            ("Date", date),
            ("Date_format", date_format),
            ("Description", description),
            ("Tags", tags),
            ("Additional_fields", additional_fields),
        )
        for label, value in optional_strings:
            if value and not isinstance(value, str):
                return f"Error: {label} must be a string"

        # The schema marks `mode` as nullable, so a missing value may arrive
        # as None; fall back to the documented default.
        if mode is None:
            mode = "generate"
        if not isinstance(mode, str):
            return "Error: Mode must be a string"

        # Validate mode
        valid_modes = ["generate", "extract", "update", "strip"]
        if mode not in valid_modes:
            return f"Error: Invalid mode '{mode}'. Valid options are: {', '.join(valid_modes)}"

        # Handle empty content
        if not content or not content.strip():
            if mode == "generate":
                # We can still generate frontmatter from provided fields
                content = ""
            else:
                return "Error: Empty content provided"

        # Special handling for TextInspectorTool output
        if content.startswith("Document content:"):
            content = content[len("Document content:"):].strip()

        # Process based on mode. Any unexpected failure is turned into an
        # error string so the tool never raises into the agent loop.
        try:
            if mode == "extract":
                return self._extract_frontmatter(content)
            if mode == "strip":
                return self._strip_frontmatter(content)
            if mode == "update":
                return self._update_frontmatter(
                    content,
                    title,
                    author,
                    date,
                    description,
                    tags,
                    additional_fields,
                    date_format,
                )
            # generate
            return self._generate_frontmatter(
                content,
                title,
                author,
                date,
                description,
                tags,
                additional_fields,
                date_format,
            )
        except Exception as e:
            return f"Error processing frontmatter: {str(e)}"

    def _split_frontmatter(self, content: str) -> tuple:
        """Split a document into its raw frontmatter YAML and its body.

        Returns:
            A ``(yaml_text, body)`` pair. ``yaml_text`` is ``None`` and
            ``body`` is the unchanged content when no frontmatter is found.
        """
        match = re.search(self.FRONTMATTER_PATTERN, content, re.DOTALL)
        if not match:
            return None, content
        # The pattern is anchored at the start of the string, so everything
        # after the match is the document body.
        return match.group(1), content[match.end():]

    def _extract_frontmatter(self, content: str) -> str:
        """Extract and return existing frontmatter as formatted YAML."""
        yaml_text, _ = self._split_frontmatter(content)
        if yaml_text is None:
            return "No frontmatter found in the document"
        try:
            # Parse and reformat for consistency
            frontmatter_dict = yaml.safe_load(yaml_text)
            return f"Extracted frontmatter:\n\n```yaml\n{yaml.dump(frontmatter_dict, sort_keys=False, default_flow_style=False)}```"
        except yaml.YAMLError:
            return "Found frontmatter but failed to parse it as valid YAML"

    def _strip_frontmatter(self, content: str) -> str:
        """Remove frontmatter from the document and return the clean content.

        Returns a human-readable notice (not the content) when the document
        has no frontmatter. This is the user-facing result of "strip" mode;
        internal callers that need the body should use ``_split_frontmatter``.
        """
        yaml_text, body = self._split_frontmatter(content)
        if yaml_text is None:
            return "No frontmatter found to strip. Content unchanged."
        return body.strip()

    def _parse_additional_fields(self, additional_fields: str) -> Dict[str, Any]:
        """Parse the additional_fields JSON string into a dictionary.

        Raises:
            ValueError: If the string is not valid JSON or does not decode to
                a JSON object (only key/value mappings can be merged).
        """
        if not additional_fields:
            return {}
        try:
            parsed = json.loads(additional_fields)
        except json.JSONDecodeError as err:
            raise ValueError("additional_fields must be a valid JSON string") from err
        if not isinstance(parsed, dict):
            raise ValueError(
                "additional_fields must be a JSON object mapping field names to values"
            )
        return parsed

    def _infer_title_from_content(self, content: str) -> Optional[str]:
        """Attempt to infer the document title from its content.

        Prefers the first Markdown level-1 heading; otherwise falls back to
        the first non-empty line, truncated to 100 characters.
        """
        # Try to find the first heading
        heading_match = re.search(r"^#\s+(.+)$", content, re.MULTILINE)
        if heading_match:
            return heading_match.group(1).strip()
        # Try to find the first non-empty line
        for line in content.split("\n"):
            stripped = line.strip()
            if stripped:
                # Limit to a reasonable title length
                return stripped[:100]
        return None

    def _parse_tags(self, tags_string: str) -> List[str]:
        """Parse comma-separated tags into a list, dropping empty entries."""
        if not tags_string:
            return []
        return [tag.strip() for tag in tags_string.split(",") if tag.strip()]

    def _parse_flexible_date(
        self, date_str: str, date_format: Optional[str] = None
    ) -> str:
        """Try to parse dates in various formats and convert to YYYY-MM-DD.

        Args:
            date_str: The date string to parse.
            date_format: Optional preferred format to try first.

        Returns:
            The date formatted as YYYY-MM-DD. Today's date when ``date_str``
            is empty, or the original string when no known format matches.
        """
        if not date_str:
            return datetime.now().strftime("%Y-%m-%d")

        # Common formats to try after the caller's preferred one
        formats = [
            "%Y-%m-%d",  # 2013-03-13
            "%d %B %Y",  # 13 March 2013
            "%B %Y",  # September 2013
            "%Y",  # 1958
            "%d/%m/%Y",  # 13/03/2013
            "%m/%d/%Y",  # 03/13/2013
            "%d-%m-%Y",  # 13-03-2013
            "%m-%d-%Y",  # 03-13-2013
            "%Y/%m/%d",  # 2013/03/13
        ]
        if date_format:
            formats.insert(0, date_format)

        # Surrounding whitespace would otherwise defeat every format.
        candidate = date_str.strip()
        for fmt in formats:
            try:
                return datetime.strptime(candidate, fmt).strftime("%Y-%m-%d")
            except ValueError:
                # Also covers an invalid user-supplied format string.
                continue
        # If no format matched, return the original string
        return date_str

    def _update_frontmatter(
        self,
        content: str,
        title: Optional[str] = None,
        author: Optional[str] = None,
        date: Optional[str] = None,
        description: Optional[str] = None,
        tags: Optional[str] = None,
        additional_fields: Optional[str] = None,
        date_format: Optional[str] = None,
    ) -> str:
        """Update existing frontmatter with new values.

        Only the fields that are provided are overwritten; other existing
        keys are preserved. When the document has no frontmatter a new one is
        generated instead.
        """
        yaml_text, body = self._split_frontmatter(content)
        if yaml_text is None:
            # If no frontmatter exists, generate new one
            return self._generate_frontmatter(
                content,
                title,
                author,
                date,
                description,
                tags,
                additional_fields,
                date_format,
            )

        # Parse existing frontmatter. Best effort: unparseable YAML, or YAML
        # that is not a mapping (e.g. a bare string or list), is replaced by
        # an empty mapping so the update can still proceed.
        try:
            parsed = yaml.safe_load(yaml_text)
        except yaml.YAMLError:
            parsed = None
        frontmatter_dict = parsed if isinstance(parsed, dict) else {}

        # Update with new values if provided
        if title:
            frontmatter_dict["title"] = title
        if author:
            frontmatter_dict["author"] = author
        if date:
            # Try to parse the date with the flexible parser
            frontmatter_dict["date"] = self._parse_flexible_date(date, date_format)
        if description:
            frontmatter_dict["description"] = description
        if tags:
            frontmatter_dict["tags"] = self._parse_tags(tags)
        # Add additional fields
        if additional_fields:
            frontmatter_dict.update(self._parse_additional_fields(additional_fields))

        # Generate new frontmatter
        new_yaml = yaml.dump(
            frontmatter_dict, sort_keys=False, default_flow_style=False
        )
        # Concatenate instead of using re.sub: a replacement string passed to
        # re.sub has its backslashes interpreted as escapes, which corrupts or
        # rejects metadata values containing "\".
        return f"---\n{new_yaml}---\n\n" + body

    def _generate_frontmatter(
        self,
        content: str,
        title: Optional[str] = None,
        author: Optional[str] = None,
        date: Optional[str] = None,
        description: Optional[str] = None,
        tags: Optional[str] = None,
        additional_fields: Optional[str] = None,
        date_format: Optional[str] = None,
    ) -> str:
        """Generate new frontmatter and prepend it to the content.

        Any existing frontmatter is discarded. The title is inferred from the
        content when not provided, and the date defaults to today.
        """
        if not isinstance(content, str):
            content = ""
        # Strip any existing frontmatter. `_strip_frontmatter` must not be
        # used here: it returns a notice message when there is nothing to
        # strip, which would replace the actual document body.
        yaml_text, body = self._split_frontmatter(content)
        clean_content = body.strip() if yaml_text is not None else content

        # Build frontmatter dictionary
        frontmatter_dict = {}
        # Try to infer title if not provided
        if title:
            frontmatter_dict["title"] = title
        else:
            inferred_title = self._infer_title_from_content(clean_content)
            if inferred_title:
                frontmatter_dict["title"] = inferred_title
        # Add other fields if provided
        if author:
            frontmatter_dict["author"] = author
        # Process date with flexible parser
        if date:
            frontmatter_dict["date"] = self._parse_flexible_date(date, date_format)
        else:
            # Use current date with provided format or default
            format_to_use = date_format or "%Y-%m-%d"
            frontmatter_dict["date"] = datetime.now().strftime(format_to_use)
        if description:
            frontmatter_dict["description"] = description
        if tags:
            frontmatter_dict["tags"] = self._parse_tags(tags)
        # Add additional fields
        if additional_fields:
            frontmatter_dict.update(self._parse_additional_fields(additional_fields))

        # Generate YAML frontmatter
        frontmatter_yaml = yaml.dump(
            frontmatter_dict, sort_keys=False, default_flow_style=False
        )
        # Combine frontmatter with content
        return f"---\n{frontmatter_yaml}---\n\n" + clean_content