Spaces:

cuongnguyen910
/

topic-clustering-global-dashboard

Build error

App Files Files Community

topic-clustering-global-dashboard / function /detect_time.py

cuongnguyen910

Upload folder using huggingface_hub

5120311 verified over 1 year ago

raw

history blame contribute delete

3.34 kB

	import re
	import requests
	import datetime
	import operator
	from typing import *
	from dateutil.relativedelta import *
	from itertools import groupby
	from dateparser import parse

	# Regex fragments used to assemble the date patterns below. Raw strings are
	# used throughout so that `\d`, `\s` and `\.` reach the regex engine as-is
	# instead of being treated as (invalid) Python string escapes.
	day = r'[0-3]{0,1}[0-9]'
	# One optional leading 0 or 1 followed by a digit. The character class must
	# not contain a comma, otherwise ",5" would be accepted as a month.
	month = r'[01]{0,1}[0-9]'
	year = r'\d{4}'
	# One of '-', '/', '.' or a whitespace character, with exactly one
	# whitespace character on each side.
	sep = r'\s[-/\.\s]\s'

	# Regular expressions for date mentions. The matching is done on lowercased
	# text by detect_time(), so the Vietnamese keywords are lowercase.
	# NOTE: every entry needs its trailing comma; without one, adjacent string
	# literals are concatenated into a single regex.
	patterns = [
	    # Purely numeric dates: day-month-year and year-month-day.
	    rf"{day}{sep}{month}{sep}{year}",
	    rf"{year}{sep}{month}{sep}{day}",
	    # "<day> tháng <month>" forms, optionally followed by a year.
	    rf"{day}\s+tháng\s+{month}",
	    rf"{day}\s+tháng\s+{month}{sep}{year}",
	    rf"{day}\s+tháng\s+{month}\s+năm\s+{year}",
	    # Duplicate of the first "tháng" entry; kept so the list order is unchanged.
	    rf"{day}\s+tháng\s+{month}",
	    # Day/month (optionally year) preceded by a keyword. The keyword is
	    # checked with a lookbehind, so it is not part of the matched span.
	    rf"(?<=ngày)\s+{day}{sep}{month}",
	    rf"(?<=ngày)\s+{day}{sep}{month}{sep}{year}",
	    rf"(?<=sáng)\s+{day}{sep}{month}",
	    rf"(?<=sáng)\s+{day}{sep}{month}{sep}{year}",
	    rf"(?<=trưa)\s+{day}{sep}{month}",
	    rf"(?<=trưa)\s+{day}{sep}{month}{sep}{year}",
	    rf"(?<=chiều)\s+{day}{sep}{month}",
	    rf"(?<=chiều)\s+{day}{sep}{month}{sep}{year}",
	    rf"(?<=tối)\s+{day}{sep}{month}",
	    rf"(?<=tối)\s+{day}{sep}{month}{sep}{year}",
	    rf"(?<=đêm)\s+{day}{sep}{month}",
	    rf"(?<=đêm)\s+{day}{sep}{month}{sep}{year}",
	    rf"(?<=hôm)\s+{day}{sep}{month}",
	    rf"(?<=hôm)\s+{day}{sep}{month}{sep}{year}",
	    # Bare day/month followed by whitespace, '.', ',' or ')'. The trailing
	    # character is included in the matched span.
	    rf"{day}{sep}{month}[\s\.\,\)]",
	]


	def extract_pattern(text: str, patterns: List[str]):
	    """Find non-overlapping spans of ``text`` matched by any of ``patterns``.

	    Every pattern is searched with ``re.finditer``. Candidate spans are then
	    scanned from left to right: among spans sharing a start offset the one
	    with the greatest end offset is kept, and any span that starts before
	    the end of the previously kept span is discarded.

	    Args:
	        text: The string to search.
	        patterns: Regular expression source strings.

	    Returns:
	        A list of ``(start, end)`` offset tuples in ascending order.
	    """
	    spans = sorted(
	        (found.start(), found.end())
	        for regex in patterns
	        for found in re.finditer(regex, text)
	    )

	    kept = []
	    boundary = -1  # end offset of the most recently kept span
	    for begin, same_start in groupby(spans, key=operator.itemgetter(0)):
	        if begin >= boundary:
	            # Prefer the longest match among those starting at this offset.
	            longest = max(same_start, key=operator.itemgetter(1))
	            kept.append(longest)
	            boundary = longest[1]
	    return kept


	def detect_time(text: str, language: str = 'vi', base: Optional[datetime.datetime] = None):
	    """Locate date expressions in ``text`` and parse each one with dateparser.

	    The text is lowercased, the module-level ``patterns`` are applied through
	    ``extract_pattern``, and every matched segment (with whitespace runs
	    collapsed to single spaces) is handed to ``dateparser.parse``.

	    Args:
	        text: Input text to scan.
	        language: Language code forwarded to the parser as ``languages=[language]``.
	        base: Optional datetime forwarded to the parser as the
	            ``RELATIVE_BASE`` setting.

	    Returns:
	        A list of ``(segment, parsed)`` tuples, one per detected span, where
	        ``parsed`` is whatever the parser returned for that segment.
	    """
	    lowered = text.lower()

	    parser_settings = {'PREFER_DAY_OF_MONTH': 'first'}
	    if base:
	        parser_settings['RELATIVE_BASE'] = base

	    results = []
	    for begin, finish in extract_pattern(lowered, patterns):
	        snippet = re.sub(r'\s+', ' ', lowered[begin:finish]).strip().lower()
	        parsed = parse(snippet, languages=[language], settings=parser_settings)
	        results.append((snippet, parsed))
	    return results


	def get_time_post(sentences, patterns, start_time=None, end_time=None):
	    """Map each detected time expression to the sentences that mention it.

	    Args:
	        sentences: Iterable of sentence strings. Blank sentences are skipped.
	        patterns: Accepted for backward compatibility with existing callers.
	            It is not used: ``detect_time`` takes no pattern argument and
	            always applies the module-level pattern list.
	        start_time: Optional lower bound, compared against
	            ``datetime.timestamp()`` of each parsed value (exclusive).
	        end_time: Optional upper bound, compared the same way (exclusive).
	            The range filter is applied only when both bounds are given.

	    Returns:
	        A dict whose keys are the ``(segment, parsed)`` tuples produced by
	        ``detect_time`` (only those with a non-None ``parsed``) and whose
	        values are lists of indices into ``sentences``.
	    """
	    bounded = start_time is not None and end_time is not None
	    dict_time_evs = {}
	    for i, sen in enumerate(sentences):
	        if not sen.strip():
	            continue
	        # detect_time's second positional parameter is `language`; passing
	        # the pattern list there would forward it to the date parser as a
	        # language, so only the sentence is passed.
	        for te in detect_time(sen):
	            parsed = te[1]
	            if parsed is None:
	                continue
	            if bounded and not (start_time < parsed.timestamp() < end_time):
	                continue
	            dict_time_evs.setdefault(te, []).append(i)
	    return dict_time_evs


	if __name__ == '__main__':
	    # Manual smoke test: run detect_time on a sample Vietnamese sentence and
	    # print the list it returns.
	    sample_text = (
	        "VietTimes – Ngoại trưởng Mỹ Antony Blinken ngày đã tuyên bố trong một cuộc họp qua"
	        "truyền hình với ngoại trưởng các nước ASEAN Mỹ bác bỏ các yêu sách “bất hợp pháp” của"
	        "Trung Quốc ở Biển Đông."
	    )
	    print(detect_time(sample_text, language="vi"))