Iostream-Li
/

doatlas-2

Model card Files Files and versions

doatlas-2 / artifacts /research-engine /scripts /m2_common.py

Iostream-Li's picture

Add files using upload-large-folder tool

2a55985 verified 22 days ago

history blame contribute delete

2.18 kB

	#!/usr/bin/env python3
	"""
	Shared helpers for M2 metadata indexing and candidate retrieval.
	"""

	from __future__ import annotations

	import hashlib
	import re
	import unicodedata
	from datetime import datetime, timezone


	# Precompiled once at import time so the helpers below do not rebuild them per call.
	# One or more consecutive whitespace characters.
	MULTISPACE_RE = re.compile(r"\s+")
	# One or more consecutive characters that are NOT an ASCII digit or lowercase ASCII letter.
	NON_ALNUM_RE = re.compile(r"[^0-9a-z]+")
	# One maximal run of ASCII digits / lowercase ASCII letters, i.e. a single token.
	TOKEN_RE = re.compile(r"[0-9a-z]+")


	def ts() -> str:
	    """Return the current UTC time as a timezone-aware ISO 8601 string."""
	    now_utc = datetime.now(tz=timezone.utc)
	    return now_utc.isoformat()


	def stable_hash(text: str, prefix: str, length: int = 12) -> str:
	    """Build a short, deterministic identifier of the form ``prefix:hexdigits``.

	    Args:
	        text: String to hash; it is UTF-8 encoded before hashing.
	        prefix: Label placed before the colon.
	        length: Number of leading hex characters of the SHA-1 digest to keep.

	    Returns:
	        ``prefix``, a colon, and the first ``length`` hex characters of the
	        SHA-1 digest of ``text``.
	    """
	    hex_digest = hashlib.sha1(text.encode("utf-8")).hexdigest()
	    short = hex_digest[:length]
	    return f"{prefix}:{short}"


	def normalize_text(text: str | None) -> str:
	    """Reduce free text to lowercase ASCII alphanumerics separated by single spaces.

	    Args:
	        text: Input string, or ``None``.

	    Returns:
	        The normalized string, or ``""`` when ``text`` is ``None`` or empty
	        (or when nothing survives normalization).
	    """
	    if not text:
	        return ""
	    # NFKD splits accented characters into base letter + combining mark, so the
	    # ASCII round-trip below keeps the base letter and drops the mark.
	    decomposed = unicodedata.normalize("NFKD", text)
	    # Characters with no ASCII form are discarded entirely.
	    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")
	    lowered = ascii_only.lower()
	    # Every run of punctuation / whitespace / other symbols becomes one space.
	    spaced = NON_ALNUM_RE.sub(" ", lowered)
	    return MULTISPACE_RE.sub(" ", spaced).strip()


	def unique_preserve(items: list[str]) -> list[str]:
	    """Drop falsy entries and duplicates, keeping first-occurrence order.

	    Args:
	        items: Strings to de-duplicate.

	    Returns:
	        A new list holding each non-empty item once, in the order it first
	        appeared in ``items``.
	    """
	    # dict keys are unique and remember insertion order, so the first
	    # occurrence of every surviving item determines its position.
	    return list(dict.fromkeys(entry for entry in items if entry))


	def tokenize(text: str | None) -> list[str]:
	    """Split text into normalized tokens.

	    Args:
	        text: Input string, or ``None``.

	    Returns:
	        The runs of ASCII digits / lowercase letters found in the normalized
	        form of ``text``, in order; an empty list when there are none.
	    """
	    return TOKEN_RE.findall(normalize_text(text))


	def make_acronym(text: str | None) -> str:
	    """Build an acronym from the first character of each token in ``text``.

	    Args:
	        text: Input string, or ``None``.

	    Returns:
	        The concatenated first characters of the tokens, or ``""`` when the
	        text yields fewer than 2 or more than 8 tokens.
	    """
	    tokens = tokenize(text)
	    # Tokens are never empty, so each contributes exactly one character and
	    # the token count equals the acronym length.
	    if not 2 <= len(tokens) <= 8:
	        return ""
	    return "".join(token[0] for token in tokens)


	def contains_normalized_term(field_text: str | None, term: str) -> bool:
	    """Report whether ``term`` occurs in ``field_text`` as whole normalized tokens.

	    Both arguments are normalized first, so the comparison ignores case,
	    accents and punctuation.

	    Args:
	        field_text: Text to search in, or ``None``.
	        term: Word or phrase to look for.

	    Returns:
	        ``True`` when the normalized term appears in the normalized field
	        aligned on token boundaries; ``False`` otherwise, including when
	        either side normalizes to an empty string.
	    """
	    norm_field = normalize_text(field_text)
	    norm_term = normalize_text(term)
	    if not norm_field or not norm_term:
	        return False
	    # Surrounding both sides with spaces turns a substring test into a
	    # whole-token test: "art" must not match inside "smart".
	    return f" {norm_term} " in f" {norm_field} "


	def fts_quote(term: str) -> str:
	    """Format a single term for use inside a full-text-search query.

	    Args:
	        term: Raw search term; surrounding whitespace is ignored.

	    Returns:
	        ``""`` for a blank term; the term wrapped in double quotes (with any
	        embedded double quote doubled) when it contains a space, ``-``, ``/``
	        or ``"``; otherwise the stripped term unchanged.
	    """
	    term = term.strip()
	    if not term:
	        return ""
	    # Doubling an embedded quote is only a valid escape inside a quoted
	    # string, so a term containing '"' must always be wrapped as well.
	    if any(ch in term for ch in ' -/"'):
	        escaped = term.replace('"', '""')
	        return f'"{escaped}"'
	    return term


	def build_fts_query(terms: list[str]) -> str:
	    """Combine search terms into a single ``OR`` full-text query string.

	    Args:
	        terms: Raw search terms.

	    Returns:
	        The normalized, de-duplicated, quoted terms joined with ``" OR "``;
	        an empty string when no term survives normalization.
	    """
	    normalized = [normalize_text(raw) for raw in terms]
	    clauses = []
	    for candidate in unique_preserve(normalized):
	        if candidate:
	            clauses.append(fts_quote(candidate))
	    return " OR ".join(clauses)