Spaces:

GrowWithTalha
/

todoappapi

Running

App Files Files Community

todoappapi / nlp_service.py

GrowWithTalha

feat: sync backend changes from SDDRI-Hackathon-2

84c328d 11 days ago

raw

history blame contribute delete

3.69 kB

	"""NLP service for extracting task attributes from natural language.

	[Task]: T029
	[From]: specs/007-intermediate-todo-features/tasks.md (User Story 2)

	This service provides:
	- Tag extraction from natural language ("tagged with X", "add tag Y")
	- Priority detection patterns
	- Due date parsing patterns
	"""
	from typing import List, Optional
	import re


	# Words the phrase patterns may capture that are never meaningful tags.
	_NON_TAG_WORDS = frozenset({
	    'a', 'an', 'the', 'with', 'for', 'and', 'or', 'but', 'not',
	    'this', 'that', 'to', 'of', 'in', 'on', 'at', 'by', 'as', 'is',
	    'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
	    'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may',
	    'might', 'must', 'can', 'need', 'want', 'like', 'such',
	})

	# Separator between items of a tag list: "a, b", "a and b", "a, and b".
	_TAG_LIST_SEPARATOR = r'\s*,\s*(?:and\s+)?|\s+and\s+'

	# Pattern 1: "#tagname"
	_HASHTAG_RE = re.compile(r'#(\w+)')
	# Pattern 2: "tagged with X and Y", "tags X, Y", "tag X", "labeled X"
	_TAG_LIST_RE = re.compile(
	    r'\b(?:tagged|tags?|labeled)\s+(?:with\s+)?'
	    r'(\w+(?:(?:' + _TAG_LIST_SEPARATOR + r')\w+)*)'
	)
	_TAG_LIST_SPLIT_RE = re.compile(_TAG_LIST_SEPARATOR)
	# Pattern 3: "add tag X", "with tag X", "has tag X"
	_ADD_TAG_RE = re.compile(r'\b(?:add|with|has)\s+tag\s+(\w+)')
	# Pattern 4: "label X", "label as X", "categorize as X", "file under X"
	_LABEL_RE = re.compile(
	    r'\b(?:label(?:ed|s)?|categorized?|filed?)(?:\s+(?:as|under))?\s+(\w+)'
	)


	def extract_tags(text: str) -> List[str]:
	    """Extract tags from natural language input.

	    [Task]: T029, T031 - Tag extraction from natural language

	    Supports patterns:
	    - "tagged with X", "tags X", "tag X" (X may be a list: "X, Y and Z")
	    - "add tag X", "with tag X", "has tag X"
	    - "labeled X", "label X", "categorize as X", "file under X"
	    - Hashtags: "#tagname"

	    Matching is case-insensitive and keywords must start on a word
	    boundary, so "hashtag party" does not produce the tag "party".

	    Args:
	        text: Natural language input text

	    Returns:
	        List of extracted tag names: lowercased, deduplicated, sorted
	        alphabetically, with stop words and single-character tokens removed.
	        Empty or missing input yields an empty list.

	    Examples:
	        >>> extract_tags("Add task tagged with work and urgent")
	        ['urgent', 'work']
	        >>> extract_tags("Buy groceries #shopping #home")
	        ['home', 'shopping']
	        >>> extract_tags("Create task with label review")
	        ['review']
	    """
	    if not text:
	        return []

	    # Lowercase once so every pattern (hashtags included) yields lowercase tags.
	    text_lower = text.lower()

	    tags = set(_HASHTAG_RE.findall(text_lower))

	    # Each match is a whole list phrase such as "work, home and urgent";
	    # break it into the individual tag words.
	    for phrase in _TAG_LIST_RE.findall(text_lower):
	        tags.update(_TAG_LIST_SPLIT_RE.split(phrase))

	    tags.update(_ADD_TAG_RE.findall(text_lower))
	    tags.update(_LABEL_RE.findall(text_lower))

	    # Drop filler words the loose patterns pick up ("tagged with the ...")
	    # and single-character tokens.
	    return sorted(
	        tag for tag in tags
	        if len(tag) > 1 and tag not in _NON_TAG_WORDS
	    )


	def normalize_tag_name(tag: str) -> str:
	    """Return a canonical form of a user-supplied tag name.

	    Steps, applied in this order:
	    1. drop every character that is not a word character, whitespace
	       or a hyphen;
	    2. lowercase the result and trim surrounding whitespace;
	    3. collapse each remaining run of whitespace into a single hyphen.

	    Args:
	        tag: Raw tag name from user input

	    Returns:
	        Normalized tag name (lowercase, trimmed, no special chars,
	        hyphens instead of inner spaces)
	    """
	    # Underscores survive because they count as word characters.
	    without_symbols = re.sub(r'[^\w\s-]', '', tag)
	    trimmed = without_symbols.lower().strip()
	    # Multi-word tags such as "work item" become "work-item".
	    return re.sub(r'\s+', '-', trimmed)


	def extract_tags_from_task_data(
	    title: str,
	    description: Optional[str] = None
	) -> List[str]:
	    """Extract tags from task title and description.

	    Convenience function that runs ``extract_tags`` over the title and
	    description together and passes every result through
	    ``normalize_tag_name``.

	    Args:
	        title: Task title
	        description: Optional task description

	    Returns:
	        List of extracted and normalized tag names, without duplicates,
	        in the order ``extract_tags`` returned them.
	    """
	    # Join only the parts that are present, so a missing title or
	    # description never leaks a literal "None" into the analysed text.
	    text = " ".join(part for part in (title, description) if part)

	    normalized = (normalize_tag_name(tag) for tag in extract_tags(text))

	    # Normalization can map different raw tags (e.g. "Work" and "work") to
	    # the same name; dict.fromkeys drops the repeats while keeping order.
	    return list(dict.fromkeys(name for name in normalized if name))