optima / src /year_parser.py
wicaksonolm2's picture
Untrack .py and .md from LFS and restore as regular files
1b060e0
import re
from typing import List, Tuple
from config import *
class YearParser:
    """Pull year mentions out of a free-text query.

    Years may be written as standalone four-digit numbers ("2023") or as
    ranges ("2022-2024", "2022 sampai 2024", "2022 hingga 2024",
    "2022 s.d 2024"). Only years of the form 20xx are recognised.
    """

    # Years accepted by extract_years; any other mentioned year is flagged
    # as invalid instead of being returned.
    VALID_YEARS = [2022, 2023, 2024]

    # A standalone four-digit year in the 2000s.
    _SINGLE_YEAR_RE = re.compile(r'\b(20\d{2})\b')

    # A year range. Supported separators (case-insensitive):
    #   "-"       -> 2022-2024
    #   "sampai"  -> 2022 sampai 2024
    #   "hingga"  -> 2022 hingga 2024
    #   "s.d"     -> 2022 s.d 2024 (dots optional)
    _YEAR_RANGE_RE = re.compile(
        r'\b(20\d{2})(?:\s*-\s*|\s+(?:sampai|hingga|s\.?d\.?)\s+)(20\d{2})\b',
        re.IGNORECASE,
    )

    @staticmethod
    def extract_years(query: str) -> Tuple[List[int], str, bool, bool]:
        """Extract the years referenced in *query* and strip them from the text.

        Ranges are expanded inclusively and their bounds are normalised, so
        "2024-2022" is treated the same as "2022-2024". Standalone years are
        collected from whatever text remains after the ranges were removed,
        so a query may mix both forms.

        Args:
            query: The raw user query.

        Returns:
            A 4-tuple ``(years, cleaned_query, user_mentioned_year,
            user_mentioned_invalid_year)``:

            * ``years`` -- sorted, de-duplicated mentioned years that are in
              ``VALID_YEARS``. When the query mentions no year at all, this
              is every year in ``VALID_YEARS``. When years were mentioned
              but none is valid, this is an empty list.
            * ``cleaned_query`` -- the query with year mentions removed,
              whitespace collapsed, and leading/trailing commas, hyphens
              and whitespace stripped.
            * ``user_mentioned_year`` -- True if any year or range was found.
            * ``user_mentioned_invalid_year`` -- True if any mentioned year
              (including years inside a range) is not in ``VALID_YEARS``.
        """
        range_re = YearParser._YEAR_RANGE_RE
        single_re = YearParser._SINGLE_YEAR_RE
        valid_years = YearParser.VALID_YEARS

        # Every year the user referred to, valid or not.
        mentioned: List[int] = []

        # Ranges first, so their endpoints are not also read as single years.
        for first, last in range_re.findall(query):
            # Accept reversed bounds ("2024-2022") instead of silently
            # expanding to an empty range.
            low, high = sorted((int(first), int(last)))
            mentioned.extend(range(low, high + 1))
        remainder = range_re.sub('', query)

        # Standalone years in the text left over after removing ranges.
        mentioned.extend(int(token) for token in single_re.findall(remainder))
        remainder = single_re.sub('', remainder)

        user_mentioned_year = bool(mentioned)
        user_mentioned_invalid_year = any(
            year not in valid_years for year in mentioned
        )

        if user_mentioned_year:
            # Do not fall back to all valid years when the user mentioned a
            # year but every mentioned year is invalid: the list stays empty.
            years = sorted({year for year in mentioned if year in valid_years})
        else:
            years = sorted(set(valid_years))

        # Collapse the gaps left by removed tokens, then trim dangling
        # separators at either end.
        cleaned_query = re.sub(r'\s+', ' ', remainder).strip()
        cleaned_query = re.sub(r'^[,\-\s]+|[,\-\s]+$', '', cleaned_query)

        return years, cleaned_query, user_mentioned_year, user_mentioned_invalid_year