Spaces:

nicolacaione
/

HFAgentsCourse

Sleeping

App Files Files Community

HFAgentsCourse / tools /text_processing.py

nicolacaione

Used to pass the exam

3874cd4 5 months ago

raw

history blame contribute delete

2.66 kB

	from langchain_core.tools import tool
	import re
	from datetime import datetime, timedelta
	import json

	@tool
	def extract_numbers(text: str) -> str:
	    """
	    Extracts all numbers from a text string and returns them as a comma-separated list.
	    Useful for parsing numerical data from search results or documents.

	    Integers and decimals are recognised. A decimal point only counts when
	    digits follow it, so a sentence-ending period ("I paid 5.") is not
	    captured. A leading minus sign is kept only when it is not glued to a
	    preceding digit, so ranges and ISO dates such as "2020-2021" yield
	    "2020, 2021" instead of "2020, -2021".

	    Args:
	        text (str): The text to extract numbers from

	    Returns:
	        str: Comma-separated list of numbers found in the text, in order of
	        appearance, or an empty string when the text is empty or has none
	    """
	    if not text:
	        return ""

	    # (?<!\d)-   : a minus directly after a digit is a separator, not a sign
	    # (?:\.\d+)? : the fractional part needs at least one digit after the dot
	    numbers = re.findall(r'(?:(?<!\d)-)?\d+(?:\.\d+)?', text)
	    return ', '.join(numbers)

	@tool
	def count_words(text: str) -> int:
	    """
	    Counts the number of words in a text string.

	    A word is any maximal run of non-whitespace characters, so punctuation
	    attached to a word does not create an extra word, while a free-standing
	    token such as "-" is counted.

	    Args:
	        text (str): The text to count words in

	    Returns:
	        int: Number of words in the text (0 for empty or whitespace-only text)
	    """
	    if not text:
	        return 0

	    # str.split() with no separator splits on runs of whitespace and never
	    # yields empty strings, so no additional filtering is required.
	    return len(text.split())

	@tool
	def extract_dates(text: str) -> str:
	    """
	    Extracts date patterns from text and returns them as a comma-separated list.
	    Supported formats: "Month DD, YYYY" (full English month names, case
	    insensitive, comma optional), YYYY-MM-DD, M/D/YY through MM/DD/YYYY,
	    and bare four-digit years.

	    Dates are returned exactly as written in the text, in order of
	    appearance. Each stretch of text is reported once: the year inside
	    "2020-01-15" is not listed again as a separate "2020".

	    Args:
	        text (str): The text to extract dates from

	    Returns:
	        str: Comma-separated list of dates found in the text, or an empty
	        string when the text is empty or contains no dates
	    """
	    if not text:
	        return ""

	    # Ordered from most to least specific. The alternatives are combined
	    # into one regex and scanned in a single pass, so a longer date consumes
	    # its digits before the bare-year alternative can see them.
	    alternatives = [
	        # Month DD, YYYY (non-capturing group so the whole date is the match)
	        r'\b(?:January|February|March|April|May|June|July|August'
	        r'|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b',
	        r'\b\d{4}-\d{2}-\d{2}\b',  # YYYY-MM-DD
	        r'\b\d{1,2}/\d{1,2}/\d{2,4}\b',  # M/D/YY through MM/DD/YYYY
	        r'\b\d{4}\b',  # Just years
	    ]
	    date_regex = re.compile('|'.join(alternatives), re.IGNORECASE)

	    dates = [match.group(0) for match in date_regex.finditer(text)]
	    return ', '.join(dates)

	@tool
	def clean_text(text: str) -> str:
	    """
	    Cleans text by removing special characters and normalizing whitespace.
	    Useful for processing scraped or extracted text.

	    Word characters, whitespace and the punctuation . , ! ? ( ) - are kept;
	    every other character is dropped. Runs of whitespace are then collapsed
	    to a single space and leading/trailing whitespace is removed.

	    Args:
	        text (str): The text to clean

	    Returns:
	        str: Cleaned text (empty string for empty input)
	    """
	    if not text:
	        return ""

	    # Remove unwanted characters first: deleting a symbol that stood between
	    # two spaces (or at either end) leaves extra whitespace behind, which
	    # the normalisation steps below must still be able to clean up.
	    cleaned = re.sub(r'[^\w\s.,!?()-]', '', text)
	    # Collapse every run of whitespace into a single space
	    cleaned = re.sub(r'\s+', ' ', cleaned)
	    # Remove leading/trailing whitespace
	    return cleaned.strip()