Spaces:

GrowWithTalha
/

todoappapi

Running

File size: 3,694 Bytes

84c328d

"""NLP service for extracting task attributes from natural language.

[Task]: T029
[From]: specs/007-intermediate-todo-features/tasks.md (User Story 2)

This service provides:
- Tag extraction from natural language ("tagged with X", "add tag Y")
- Priority detection patterns
- Due date parsing patterns
"""
from typing import List, Optional
import re


# Words the keyword patterns may capture that are never meaningful tags
# (articles, prepositions, auxiliaries, ...).
_EXCLUDED_TAG_WORDS = frozenset({
    'a', 'an', 'the', 'with', 'for', 'and', 'or', 'but', 'not',
    'this', 'that', 'to', 'of', 'in', 'on', 'at', 'by', 'as', 'is',
    'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
    'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may',
    'might', 'must', 'can', 'need', 'want', 'like', 'such',
})

# Separator between items of a tag list: "a, b", "a and b", "a, and b".
# The comma does not require surrounding whitespace.
_TAG_SEPARATOR = r'(?:\s*,\s*(?:and\s+)?|\s+and\s+)'
_TAG_SEPARATOR_RE = re.compile(_TAG_SEPARATOR)

# "#tagname"
_HASHTAG_RE = re.compile(r'#(\w+)')

# "tagged with X and Y", "tags X, Y", "tag X", "labeled X".
# The leading \b keeps the keyword from matching inside a longer word.
# The optional ",[,\s]*" tolerates stray commas after the keyword without
# overlapping the preceding \s+ (avoids needless backtracking).
_TAG_LIST_RE = re.compile(
    r'\b(?:tagged|tags?|labell?ed)\s+(?:with\s+)?(?:,[,\s]*)?'
    rf'(\w+(?:{_TAG_SEPARATOR}\w+)*)'
)

# "add tag X", "with tag X", "has tag X"
_ADD_TAG_RE = re.compile(r'\b(?:add|with|has)\s+tag\s+(\w+)')

# "label X", "labeled as X", "categorize(d) as X", "file(d) under X", "file X".
# \b prevents false hits such as "profile picture" -> "picture".
_LABEL_RE = re.compile(
    r'\b(?:labell?ed|label|categorized?|filed?)(?:\s+(?:as|under))?\s+(\w+)'
)


def extract_tags(text: str) -> List[str]:
    """Extract tags from natural language input.

    [Task]: T029, T031 - Tag extraction from natural language

    Supports patterns:
    - "tagged with X", "tags X", "tag X" (X may be a list: "X, Y and Z")
    - "add tag X", "with tag X", "has tag X"
    - "label X", "labeled X", "label as X", "categorize as X",
      "file under X"
    - Hashtags: "#tagname"

    Matching is case-insensitive and keywords must start at a word
    boundary. Common stop words and single-character captures are dropped.

    Args:
        text: Natural language input text

    Returns:
        Alphabetically sorted list of extracted tag names
        (lowercased, deduplicated). Empty list for empty input.

    Examples:
        >>> extract_tags("Add task tagged with work and urgent")
        ['urgent', 'work']
        >>> extract_tags("Buy groceries #Shopping #home")
        ['home', 'shopping']
        >>> extract_tags("Create task with label review")
        ['review']
    """
    if not text:
        return []

    # All patterns run on the lowercased text so every tag, including
    # hashtags, comes out lowercased as documented.
    text_lower = text.lower()

    tags = set(_HASHTAG_RE.findall(text_lower))

    for tag_list in _TAG_LIST_RE.findall(text_lower):
        tags.update(_TAG_SEPARATOR_RE.split(tag_list))

    tags.update(_ADD_TAG_RE.findall(text_lower))
    tags.update(_LABEL_RE.findall(text_lower))

    # Sorting gives a deterministic order regardless of set iteration.
    return sorted(
        tag for tag in tags
        if len(tag) > 1 and tag not in _EXCLUDED_TAG_WORDS
    )


def normalize_tag_name(tag: str) -> str:
    """Return a canonical form of a tag name.

    Characters other than word characters, whitespace and hyphens are
    removed; the result is lowercased, stripped of surrounding whitespace,
    and each remaining run of whitespace is replaced by a single hyphen.

    Args:
        tag: Raw tag name from user input

    Returns:
        The normalized tag name
    """
    # Strip punctuation first so that whitespace exposed by the removal
    # is trimmed by the subsequent strip().
    cleaned = re.sub(r'[^\w\s-]', '', tag).lower().strip()
    # Multi-word tags are joined with hyphens.
    return re.sub(r'\s+', '-', cleaned)


def extract_tags_from_task_data(
    title: str,
    description: Optional[str] = None
) -> List[str]:
    """Collect tags mentioned in a task's title and description.

    The title and, when non-empty, the description are joined with a
    single space and passed to ``extract_tags``; every resulting tag is
    then run through ``normalize_tag_name``.

    Args:
        title: Task title
        description: Optional task description; ignored when falsy

    Returns:
        List of extracted and normalized tag names
    """
    combined = f"{title} {description}" if description else title
    return list(map(normalize_tag_name, extract_tags(combined)))