# HFAgentsCourse / tools / text_processing.py
# nicolacaione's picture
# Used to pass the exam
# 3874cd4
from langchain_core.tools import tool
import re
from datetime import datetime, timedelta
import json
@tool
def extract_numbers(text: str) -> str:
    """
    Extracts all numbers from a text string and returns them as a comma-separated list.
    Useful for parsing numerical data from search results or documents.

    Integers and decimals are recognised, with an optional leading minus sign.
    A period only counts as part of a number when digits follow it, so
    sentence-ending punctuation ("I have 5.") is not captured. A hyphen that
    directly follows a digit is treated as a separator (ranges such as
    "2020-2021", dates such as "2024-01-15"), not as a minus sign.
    Thousands separators are not interpreted: "1,000" yields "1, 000".

    Args:
        text (str): The text to extract numbers from

    Returns:
        str: Comma-separated list of numbers found in the text, in order of
            appearance, or an empty string if there are none
    """
    # Falsy guard also covers a None argument, which re.findall would reject.
    if not text:
        return ""
    # (?<!\d)    a '-' glued to a preceding digit is a range/date separator
    # (?:\.\d+)? the fractional part requires at least one digit after the dot
    numbers = re.findall(r'(?<!\d)-?\d+(?:\.\d+)?', text)
    # Joining an empty list already yields "", so no extra branch is needed.
    return ', '.join(numbers)
@tool
def count_words(text: str) -> int:
    """
    Counts the number of words in a text string.

    A word is any maximal run of non-whitespace characters, so punctuation
    attached to a word is counted as part of that word.

    Args:
        text (str): The text to count words in

    Returns:
        int: Number of words in the text (0 for empty or missing text)
    """
    # Falsy guard keeps a None argument returning 0 instead of raising.
    if not text:
        return 0
    # str.split() with no separator splits on runs of whitespace and never
    # yields empty strings, so no further filtering is required.
    return len(text.split())
@tool
def extract_dates(text: str) -> str:
    """
    Extracts date patterns from text and returns them as written in the text.

    Supported formats: YYYY-MM-DD, M/D/YY through MM/DD/YYYY,
    "Month DD, YYYY" (full English month name, case-insensitive, comma
    optional) and bare four-digit years. No normalisation or calendar
    validation is performed. Each piece of text is reported once: a year that
    is part of a longer date is not listed again on its own.

    Args:
        text (str): The text to extract dates from

    Returns:
        str: Comma-separated list of dates found in the text, in order of
            appearance, or an empty string if there are none
    """
    # Falsy guard also covers a None argument, which re.finditer would reject.
    if not text:
        return ""
    months = (
        'January|February|March|April|May|June|July|'
        'August|September|October|November|December'
    )
    # One alternation, most specific format first and the bare year last, so
    # a single left-to-right scan yields non-overlapping matches.
    date_pattern = '|'.join([
        r'\b\d{4}-\d{2}-\d{2}\b',                       # YYYY-MM-DD
        r'\b\d{1,2}/\d{1,2}/\d{2,4}\b',                 # M/D/YY .. MM/DD/YYYY
        # Non-capturing group: the whole date is the match, not just the month.
        r'\b(?:' + months + r')\s+\d{1,2},?\s+\d{4}\b',  # Month DD, YYYY
        r'\b\d{4}\b',                                   # Just years
    ])
    dates = [
        match.group(0)
        for match in re.finditer(date_pattern, text, re.IGNORECASE)
    ]
    return ', '.join(dates)
@tool
def clean_text(text: str) -> str:
    """
    Cleans text by removing special characters and extra whitespace.
    Useful for processing scraped or extracted text.

    Word characters (letters, digits, underscore), whitespace and the basic
    punctuation . , ! ? ( ) - are kept; every other character is dropped.
    Runs of whitespace are then collapsed to a single space and the result is
    trimmed at both ends.

    Args:
        text (str): The text to clean

    Returns:
        str: Cleaned text (empty string for empty or missing text)
    """
    # Falsy guard also covers a None argument, which re.sub would reject.
    if not text:
        return ""
    # Remove unwanted characters but keep basic punctuation.
    cleaned = re.sub(r'[^\w\s.,!?()-]', '', text)
    # Normalise whitespace only after the removal above: deleting a character
    # that stood between two spaces would otherwise leave a double space, and
    # deleting a leading or trailing one would leave a stray edge space.
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.strip()