Spaces:
Sleeping
Sleeping
File size: 2,657 Bytes
3874cd4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
from langchain_core.tools import tool
import re
from datetime import datetime, timedelta
import json
@tool
def extract_numbers(text: str) -> str:
    """
    Extracts all numbers from a text string and returns them as a comma-separated list.
    Useful for parsing numerical data from search results or documents.

    Integers and decimals are recognised, with an optional leading minus sign.
    A period only counts as part of a number when digits follow it, so a
    sentence-ending period (as in "there were 3.") is not captured. A hyphen
    that directly follows a digit (as in "2020-2021") is treated as a
    separator, not as a minus sign.

    Args:
        text (str): The text to extract numbers from

    Returns:
        str: The numbers, in order of appearance, joined by ", "; an empty
            string when the text is empty or contains no numbers
    """
    if not text:
        return ""
    # (?<!\d)     do not start on a '-' that is glued to a preceding digit
    # -?\d+       optional sign followed by the integer part
    # (?:\.\d+)?  fractional part, only when at least one digit follows the dot
    numbers = re.findall(r'(?<!\d)-?\d+(?:\.\d+)?', text)
    # Joining an empty list already yields "", so no special case is needed.
    return ', '.join(numbers)
@tool
def count_words(text: str) -> int:
    """
    Returns how many whitespace-separated words a text string contains.

    Args:
        text (str): The text whose words are counted

    Returns:
        int: The word count; 0 when the text is empty
    """
    # str.split() with no arguments splits on runs of whitespace and never
    # produces empty tokens, so its length is the word count.
    return len(text.split()) if text else 0
# Single alternation, most specific form first, so that each stretch of text is
# reported once (a full date is not also reported as a bare year). The month
# group is non-capturing so that a match yields the whole date, not just the
# month name.
_DATE_PATTERN = re.compile(
    r'\b(?:January|February|March|April|May|June|July|August|September'
    r'|October|November|December)\s+\d{1,2},?\s+\d{4}\b'  # Month DD, YYYY
    r'|\b\d{4}-\d{2}-\d{2}\b'  # YYYY-MM-DD
    r'|\b\d{1,2}/\d{1,2}/\d{2,4}\b'  # M/D/YY or MM/DD/YYYY
    r'|\b\d{4}\b',  # Just years
    re.IGNORECASE,
)


@tool
def extract_dates(text: str) -> str:
    """
    Extracts date patterns from text and returns them as a comma-separated list.
    Supports YYYY-MM-DD, M/D/YY through MM/DD/YYYY, "Month DD, YYYY" (month
    names are matched case-insensitively) and standalone four-digit years.

    Each date is returned exactly as it is written in the text; no
    reformatting is applied. Dates are listed in order of appearance and a
    date is reported only once, using the most specific pattern that matches.
    Note that "Month DD, YYYY" dates contain a comma themselves.

    Args:
        text (str): The text to extract dates from

    Returns:
        str: The dates found, joined by ", "; an empty string when the text
            is empty or contains no dates
    """
    if not text:
        return ""
    # finditer walks the text left to right and never returns overlapping
    # matches, which is what removes the duplicate reports.
    return ', '.join(match.group(0) for match in _DATE_PATTERN.finditer(text))
@tool
def clean_text(text: str) -> str:
    """
    Cleans text by removing special characters and extra whitespace.
    Useful for processing scraped or extracted text.

    Letters, digits, underscores, whitespace and the punctuation . , ! ? ( ) -
    are kept; every other character is removed. Runs of whitespace (including
    newlines and tabs) are then collapsed to a single space and leading and
    trailing whitespace is stripped.

    Args:
        text (str): The text to clean

    Returns:
        str: The cleaned text; an empty string when the text is empty
    """
    if not text:
        return ""
    # Remove unwanted characters first: deleting a symbol that stood between
    # two spaces would otherwise leave a double space (or a leading/trailing
    # space) behind after the whitespace had already been normalised.
    cleaned = re.sub(r'[^\w\s.,!?()-]', '', text)
    # Collapse every run of whitespace into one space.
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.strip()