ChatGS / date_extractor.py
sharmamohit8624's picture
Upload 2395 files
829f2ca verified
import stanza
import re
import dateparser
import datetime
from date_parser.parser import DateParser
from dateutil.relativedelta import relativedelta
# Module-level DateParser instance; f1 calls dp.parse_date() on each half of a
# split date-range entity.
dp = DateParser()
def f0(query):
    """Extract an explicit numeric start/end date pair from *query*.

    Looks for tokens shaped like ``N-N-YYYY`` / ``N/N/YYYY`` or
    ``YYYY-N-N`` / ``YYYY/N/N`` (``N`` being one or two digits).  Only when
    exactly two such tokens are present are they treated as the start and end
    of a range, in the order they appear.

    The literal ``9999-12-31`` acts as an "unbounded" marker: in the start
    position it is replaced by the earliest representable date, in the end
    position by the latest one.

    Returns:
        ``(start, end)`` as ``YYYY-MM-DD`` strings, or ``(None, None)`` when
        the query does not contain exactly two date tokens or when either
        token cannot be parsed into a real date.
    """
    date_pattern = (r'\b\d{1,2}[-/]\d{1,2}[-/]\d{4}\b|'
                    r'\b\d{4}[-/]\d{1,2}[-/]\d{1,2}\b')
    dates = re.findall(date_pattern, query)
    if len(dates) != 2:
        return None, None

    open_marker = '9999-12-31'
    start_text = str(datetime.date.min) if dates[0] == open_marker else dates[0]
    end_text = str(datetime.date.max) if dates[1] == open_marker else dates[1]

    start = dateparser.parse(start_text)
    end = dateparser.parse(end_text)
    # The regex only checks the token's shape; something like "99/99/2020"
    # matches but is not a date, in which case the parser yields None.
    # Report "not found" so the caller can fall through to other extractors
    # instead of crashing on None.date().
    if start is None or end is None:
        return None, None

    # isoformat() always zero-pads the year; strftime('%Y') does not on every
    # platform for years below 1000 (relevant for the 0001-01-01 lower bound).
    return start.date().isoformat(), end.date().isoformat()
# Lazily-built stanza pipeline shared by all f1() calls; constructing a
# pipeline loads the models, so it is done once instead of on every query.
_F1_PIPELINE = None


def _f1_pipeline():
    """Return the tokenize+NER stanza pipeline, building it on first use."""
    global _F1_PIPELINE
    if _F1_PIPELINE is None:
        # NOTE(review): model_dir is a machine-specific absolute path and the
        # models are expected to be present already (REUSE_RESOURCES) -
        # confirm this before running on another machine.
        _F1_PIPELINE = stanza.Pipeline(
            'en',
            processors='tokenize,ner',
            model_dir=r"C:\Users\Ankit Sharma\stanza_resources",
            download_method=stanza.DownloadMethod.REUSE_RESOURCES,
        )
    return _F1_PIPELINE


def _f1_parse_range(parts):
    """Parse both halves of a split DATE entity into ``YYYY-MM-DD`` strings.

    Returns None when either half does not yield a date, so a half-parsed
    range is never recorded.
    """
    formatted = []
    for part in parts:
        parsed = dp.parse_date(part.strip())
        # presumably parse_date returns None for text it cannot interpret -
        # TODO confirm against the date_parser package.
        if parsed is None:
            return None
        formatted.append(parsed.strftime('%Y-%m-%d'))
    return formatted


def f1(query):
    """Extract a date range from DATE entities such as "X to Y" or "X and Y".

    Runs stanza NER over *query*.  Every entity of type ``DATE`` whose text
    contains " to " or " and " is split on that keyword; when the split gives
    exactly two pieces, both are parsed with the module-level ``dp`` parser.

    Returns:
        ``(earliest, latest)`` over all parsed dates as ``YYYY-MM-DD``
        strings, or ``(None, None)`` when no range could be extracted.
    """
    doc = _f1_pipeline()(query)

    # (substring that must be present, regex used to split on the keyword)
    separators = ((' to ', r'\bto\b'), (' and ', r'\band\b'))

    date_ranges = []
    for sent in doc.sentences:
        for ent in sent.ents:
            if ent.type != 'DATE':
                continue
            date_text = ent.text
            for needle, splitter in separators:
                if needle not in date_text.lower():
                    continue
                parts = re.split(splitter, date_text, flags=re.IGNORECASE)
                if len(parts) != 2:
                    continue
                # Only parsed, ISO-formatted dates are stored; the raw text
                # pieces must not be mixed in because the min/max below relies
                # on ISO strings sorting chronologically.
                parsed_range = _f1_parse_range(parts)
                if parsed_range is not None:
                    date_ranges.append(parsed_range)

    if not date_ranges:
        return None, None

    # Smallest and largest date across every extracted range.
    all_dates = [date for date_range in date_ranges for date in date_range]
    return min(all_dates), max(all_dates)
def f2(query, date_format="%Y-%m-%d"):
    """Turn one or two bare four-digit years in *query* into a date range.

    * One year  -> 1 January to 31 December of that year.
    * Two years -> 1 January of the earlier year to 31 December of the later
      year, regardless of the order in which they are written.

    Args:
        query: Free text to scan for stand-alone four-digit numbers.
        date_format: ``strftime`` format applied to both returned dates.

    Returns:
        ``(start, end)`` formatted with *date_format*, or ``(None, None)``
        when the query holds neither one nor two four-digit numbers, or when
        one of them is not a valid calendar year (``0000``).
    """
    year_matches = re.findall(r'\b\d{4}\b', query)
    if len(year_matches) not in (1, 2):
        return None, None

    years = [int(token) for token in year_matches]
    # "0000" satisfies the regex but datetime rejects year 0 with ValueError;
    # treat it as "no usable year" so the caller can try other extractors.
    if min(years) < datetime.MINYEAR:
        return None, None

    # min/max covers the single-year case and also keeps start <= end when
    # the years are written in descending order ("2021 back to 2019").
    start_date = datetime.datetime(min(years), 1, 1)
    end_date = datetime.datetime(max(years), 12, 31)
    return start_date.strftime(date_format), end_date.strftime(date_format)
def f3(query, date_format="%Y-%m-%d"):
    """Resolve a "last/previous N years|months|weeks" phrase to a date range.

    Only the first such phrase in *query* is used.  When no count is written
    ("last month") the count defaults to 1.  The range ends at the current
    local time and starts the requested span before it.

    Args:
        query: Free text to scan.
        date_format: ``strftime`` format applied to both returned dates.

    Returns:
        ``(start, end)`` formatted with *date_format*, or ``(None, None)``
        when no relative phrase is present or the requested span reaches
        outside the range of representable dates.
    """
    match = re.search(
        r'\b(last|previous)\s*(\d*)\s*(year|month|week)s?\b',
        query,
        flags=re.IGNORECASE,
    )
    if match is None:
        return None, None

    count_text = match.group(2)
    # The trailing "s" sits outside the capture group, so the unit is always
    # one of the singular forms below.
    offset_keyword = {
        'year': 'years',
        'month': 'months',
        'week': 'weeks',
    }[match.group(3).lower()]

    end_date = datetime.datetime.now()
    try:
        count = int(count_text) if count_text else 1
        start_date = end_date - relativedelta(**{offset_keyword: count})
    except (OverflowError, ValueError):
        # e.g. "last 99999 years" would land before year 1.
        return None, None

    # Honour the caller's date_format (it used to be overwritten by a
    # hard-coded '%Y-%m-%d' at the top of the function).
    return start_date.strftime(date_format), end_date.strftime(date_format)
def f4(query):
    """Resolve a "... till <date>" phrase to a range with a fixed start date.

    The text following the first stand-alone word "till" (any letter case) is
    parsed as the end date.  The start date is always ``2014-01-02``.

    Returns:
        ``("2014-01-02", end)`` with *end* as a ``YYYY-MM-DD`` string, or
        ``(None, None)`` when the word "till" is absent, nothing follows it,
        or the text after it cannot be parsed as a date.
    """
    # Fixed lower bound used for every "till" query.
    start_date_str = "2014-01-02"

    # Match "till" as a whole word so that "still" or "until" do not trigger
    # this extractor, and search the original string so the match offset is
    # valid for slicing it.
    match = re.search(r'\btill\b', query, flags=re.IGNORECASE)
    if match is None:
        return None, None

    end_date_text = query[match.end():].strip()
    if not end_date_text:
        return None, None

    end_date = dateparser.parse(end_date_text)
    # The parser yields None for text it cannot interpret; report "not found"
    # rather than failing on None.strftime().
    if end_date is None:
        return None, None

    # The start date is already in the output format, so it is returned as-is
    # instead of being parsed and re-formatted.
    return start_date_str, end_date.strftime("%Y-%m-%d")
def extract_date(text, default_days=30):
    """Derive a ``(start, end)`` date range from free text.

    Tries the extractors ``f0`` .. ``f4`` in that order and returns the first
    result in which both dates are set, printing the dates together with the
    name of the extractor that produced them.  When none of them succeeds the
    range defaults to the last *default_days* days ending now.

    Args:
        text: Query text to analyse.
        default_days: Length in days of the fallback window (30 by default).

    Returns:
        ``(start, end)``; the fallback range is formatted as ``YYYY-MM-DD``.
    """
    extractors = (
        ("f0", f0),
        ("f1", f1),
        ("f2", f2),
        ("f3", f3),
        ("f4", f4),
    )
    for label, extractor in extractors:
        start_date, end_date = extractor(text)
        if start_date is not None and end_date is not None:
            print(start_date, end_date, label)
            return start_date, end_date

    # Default case: default_days (30 unless overridden) back to the current
    # date.  The clock is read once so both ends share the same reference.
    now = datetime.datetime.now()
    window_start = now - datetime.timedelta(days=default_days)
    return window_start.strftime('%Y-%m-%d'), now.strftime('%Y-%m-%d')