Spaces:
Sleeping
Sleeping
| import spacy | |
| import re | |
| from datetime import datetime | |
# Load the spaCy model used for DATE entity recognition.
# Try the installed package first and only fall back to downloading it when
# spaCy reports it as missing (spacy.load raises OSError in that case), so a
# normal start does not trigger a download on every import of this module.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    spacy.cli.download("en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")
# Define a function to extract dates from text
def extract_dates(text):
    """
    Identify date mentions in ``text``, written numerically or as free text.

    Candidates come from two sources, in this order of priority:

    1. a fixed list of regular expressions for common date layouts, tried
       from the most specific pattern to the least specific one;
    2. the entities labelled ``DATE`` by the module-level spaCy pipeline
       ``nlp``.

    A candidate is discarded when its character span overlaps a candidate
    that was accepted earlier. This keeps "01/01" from being reported next to
    the "01/01/22" it is part of, and keeps a spaCy entity that merely wraps
    or is wrapped by a regex match from being counted as a second date, while
    a genuinely separate mention elsewhere in the text is still kept.

    Returns a list of the distinct matched strings, ordered by where they
    first occur in ``text``.
    """
    # The \b word boundaries stop a pattern from matching inside a longer
    # run of word characters. Order matters: earlier patterns win overlaps.
    date_patterns = [
        r'\b\d{4}-\d{1,2}-\d{1,2}\b',  # ISO dates like "2022-01-15" (parsed by convert_dates as %Y-%m-%d)
        r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b',  # Matches dates like "01/01/22" or "1-1-2022"
        r'\b\d{1,2}[-/]\d{1,2}\b(?!\d)',  # Matches dates like "01/01" or "1-1"
        r'\b[A-Z][a-z]{2,8} \d{1,2},? \d{2,4}\b',  # Matches dates like "January 1, 2022" or "Feb 28, 22"
        r'\b\d{1,2} [A-Z][a-z]{2,8} \d{2,4}\b',  # Matches dates like "1 January 2022" or "28 Feb 22"
        r'\b[A-Z][a-z]{2,8} \d{2,4}\b',  # Matches dates like "January 2022" or "Feb 22"
    ]

    # Accepted mentions as (start, end, matched_text) character spans.
    accepted = []

    def overlaps_accepted(start, end):
        # Two half-open spans overlap when each one starts before the other ends.
        return any(
            start < acc_end and acc_start < end
            for acc_start, acc_end, _ in accepted
        )

    # Find all matches for the date patterns in the text.
    for pattern in date_patterns:
        for match in re.finditer(pattern, text):
            if not overlaps_accepted(match.start(), match.end()):
                accepted.append((match.start(), match.end(), match.group()))

    # Use spaCy to pick up additional dates that the patterns did not cover.
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == 'DATE' and not overlaps_accepted(ent.start_char, ent.end_char):
            accepted.append((ent.start_char, ent.end_char, ent.text))

    # Report in reading order; dict.fromkeys drops repeated strings while
    # keeping the first occurrence, so the result is deterministic.
    accepted.sort()
    return list(dict.fromkeys(mention for _, _, mention in accepted))
def _parse_date(date_str, fmt):
    """Parse ``date_str`` with ``fmt``; year-less formats are parsed in a leap year."""
    if '%y' in fmt or '%Y' in fmt:
        return datetime.strptime(date_str, fmt)
    # strptime fills a missing year with 1900, which is not a leap year, so
    # "29/02" or "Feb 29" would be rejected. Parsing against an explicit leap
    # year accepts them; the year is never shown for these formats.
    return datetime.strptime(f'{date_str} 2000', f'{fmt} %Y')


def convert_dates(date_list):
    """
    Parse each date string and describe its parts as tagged text.

    Every string in ``date_list`` is tried against a table of ``strptime``
    formats, in order; the first format that parses it decides the output,
    e.g. ``'day:1, month:2, year:2022'``. Formats carrying a time append
    ``', time:HH:MM:SS'``. A string that no format can parse produces
    ``'INVALID FORMAT: <string>'``.

    Note that the table order resolves ambiguous numeric dates: with slashes,
    or with dashes and a four-digit year, the day-first reading is tried
    first; with dashes and a two-digit year the month-first reading is tried
    first.

    Returns a list with one output string per input string, in input order.
    """
    day_month_year = 'day:{dt.day}, month:{dt.month}, year:{dt.year}'
    day_month = 'day:{dt.day}, month:{dt.month}'
    month_day = 'month:{dt.month}, day:{dt.day}'
    month_year = 'month:{dt.month}, year:{dt.year}'
    # The time goes through the format spec ({dt:...}); str.format cannot call
    # methods, so '{dt.strftime("%H:%M:%S")}' raises AttributeError.
    with_time = day_month_year + ', time:{dt:%H:%M:%S}'

    # Order is significant (first successful parse wins). Entries using the
    # '%-m' / '%-d' directives were dropped: strptime does not support them
    # and '%m' / '%d' already accept values without a leading zero.
    date_formats = {
        '%B %d, %Y': day_month_year,
        '%m-%d-%y': day_month_year,
        '%d/%m': day_month,
        '%B %d': day_month,
        '%b %d': day_month,
        '%B %Y': month_year,
        '%Y': 'year:{dt.year}',
        '%d/%m/%y': day_month_year,
        '%B %d, %y': day_month_year,
        '%b %d, %y': day_month_year,
        '%d-%m-%Y': day_month_year,
        '%d/%m/%Y': day_month_year,
        '%d-%m-%y': day_month_year,
        '%m/%d/%y': day_month_year,
        '%m/%d/%Y': day_month_year,
        '%m-%d-%Y': day_month_year,
        '%d/%m/%Y %H:%M:%S': with_time,
        '%d/%m/%y %H:%M:%S': with_time,
        '%m/%d/%Y %H:%M:%S': with_time,
        '%m/%d/%y %H:%M:%S': with_time,
        '%Y-%m-%d': day_month_year,
        '%y-%m-%d': day_month_year,
        '%m-%d-%Y %H:%M:%S': with_time,
        '%m-%d-%y %H:%M:%S': with_time,
        '%m-%d': month_day,
        '%d %b %y': day_month_year,
        '%d %B %Y': day_month_year,
        '%b %Y': month_year,
        '%b %d, %Y': day_month_year,
        '%d %B %y': day_month_year,
    }

    output_list = []
    for date_str in date_list:
        for fmt, out_fmt in date_formats.items():
            try:
                dt = _parse_date(date_str, fmt)
            except ValueError:
                continue
            output_list.append(out_fmt.format(dt=dt))
            break
        else:
            # No format in the table could parse this string.
            output_list.append(f'INVALID FORMAT: {date_str}')
    return output_list
def dates_binding(text):
    """
    Run date extraction and conversion on ``text`` and report the outcome.

    Exactly one date reference is accepted. The return value is either:

    * the list produced by ``convert_dates`` (a single formatted string) when
      one date was found and it could be parsed, or
    * a ``(0, 'DATES', <code>)`` tuple, where ``<code>`` is ``'no_date'``,
      ``'more_dates'``, ``'wrong_date_format'`` or ``'unknown_error'``, so the
      caller can pick the matching prompt.
    """
    try:
        # capture the referred dates
        identified_dates = extract_dates(text)

        # zero references
        if not identified_dates:
            return (0, 'DATES', 'no_date')
        # more than one reference
        if len(identified_dates) > 1:
            return (0, 'DATES', 'more_dates')

        # exactly one reference: convert it and flag an unparseable format
        formatted_dates = convert_dates(identified_dates)
        if 'INVALID FORMAT' in formatted_dates[0]:
            return (0, 'DATES', 'wrong_date_format')
        return formatted_dates
    except Exception:
        # Any unexpected failure is mapped to a code rather than raised.
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate instead of being reported as a date error.
        return (0, 'DATES', 'unknown_error')