Release Reframr-RFM-v1-Base public checkpoint

2147ce8 verified 6 days ago

6.11 kB

	import json
	from pathlib import Path

	from .text_quality import clean_answer_text, clean_context_text, clean_training_text


	# Lower-cased suffixes of files whose contents are read verbatim as UTF-8 text.
	TEXT_EXTENSIONS = {
	    ".md",
	    ".text",
	    ".txt",
	}
	# Lower-cased suffixes of files that are parsed as JSON / JSON Lines records.
	STRUCTURED_EXTENSIONS = {
	    ".json",
	    ".jsonl",
	}


	def _default_record_weight(record_type: str) -> int:
	if record_type == "dialogue_turn":
	return 2
	if record_type == "instruction_answer":
	return 2
	if record_type == "preference_chosen":
	return 3
	if record_type == "preference_rejected":
	return 0
	return 1


	def _record_repeat_count(record: object) -> int:
	if not isinstance(record, dict):
	return 1
	if bool(record.get("drop")):
	return 0
	raw_weight = record.get("weight")
	if raw_weight is not None:
	try:
	numeric = int(round(float(raw_weight)))
	except (TypeError, ValueError):
	numeric = 1
	return max(0, min(8, numeric))
	return _default_record_weight(str(record.get("record_type", "")))


	def _coerce_text_record(record: object) -> str:
	if isinstance(record, str):
	return clean_training_text(record.strip())
	if isinstance(record, dict):
	if "text" in record:
	return clean_training_text(str(record["text"]).strip())
	if "content" in record:
	return clean_training_text(str(record["content"]).strip())
	if "context" in record and "answer" in record:
	context = clean_context_text(str(record["context"]).strip())
	answer = clean_answer_text(str(record["answer"]).strip())
	if context and answer:
	return f"<reason> {context} <answer> {answer}"
	return ""


	def _coerce_prompt_record(record: object) -> dict[str, object] \| None:
	if isinstance(record, str):
	prompt = record.strip()
	return {"prompt": prompt, "tags": []} if prompt else None
	if isinstance(record, dict):
	raw_prompt = record.get("prompt", record.get("context", ""))
	prompt = clean_context_text(str(raw_prompt).strip())
	if not prompt:
	return None
	raw_tags = record.get("tags", [])
	tags = [str(tag) for tag in raw_tags] if isinstance(raw_tags, list) else []
	normalized = dict(record)
	normalized["prompt"] = prompt
	normalized["tags"] = tags
	return normalized
	return None


	def _join_weighted_text_records(records: list[object]) -> str:
	    """Coerce each record to text, repeat it by its weight, and join with newlines."""
	    lines: list[str] = []
	    for record in records:
	        text = _coerce_text_record(record)
	        if text:
	            lines.extend([text] * _record_repeat_count(record))
	    return "\n".join(lines)


	def load_text_corpus(source: str | Path) -> str:
	    """Load a training corpus from a file or directory as one string.

	    * Directory: every file below it (recursively, in sorted path order)
	      whose suffix is a text or structured extension is loaded, and the
	      non-blank results are joined with newlines.
	    * Text file (``TEXT_EXTENSIONS``): the raw UTF-8 contents are returned.
	    * ``.jsonl``: each non-blank line is parsed as a JSON record.
	    * ``.json``: a top-level list, or a dict holding a ``"texts"`` or
	      ``"records"`` list, is treated as a list of records; any other dict
	      is treated as a single record.

	    Records are converted with ``_coerce_text_record`` and repeated
	    ``_record_repeat_count`` times.

	    Raises:
	        ValueError: if the suffix is unsupported, or a ``.json`` payload is
	            neither a list nor a dict that yields any text.
	    """
	    path = Path(source)
	    if path.is_dir():
	        # Build the accepted-suffix set once instead of once per file.
	        wanted_suffixes = TEXT_EXTENSIONS | STRUCTURED_EXTENSIONS
	        parts = [
	            load_text_corpus(child)
	            for child in sorted(path.rglob("*"))
	            if child.is_file() and child.suffix.lower() in wanted_suffixes
	        ]
	        return "\n".join(part for part in parts if part.strip())

	    suffix = path.suffix.lower()
	    if suffix in TEXT_EXTENSIONS:
	        return path.read_text(encoding="utf-8")
	    if suffix == ".jsonl":
	        raw_lines = path.read_text(encoding="utf-8").splitlines()
	        records = [json.loads(line) for line in raw_lines if line.strip()]
	        return _join_weighted_text_records(records)
	    if suffix == ".json":
	        payload = json.loads(path.read_text(encoding="utf-8"))
	        if isinstance(payload, list):
	            return _join_weighted_text_records(payload)
	        if isinstance(payload, dict):
	            # "texts" takes precedence over "records" when both are lists.
	            for key in ("texts", "records"):
	                if isinstance(payload.get(key), list):
	                    return _join_weighted_text_records(payload[key])
	            # Otherwise the dict itself is a single record; if it yields no
	            # text, fall through to the error below.
	            text = _coerce_text_record(payload)
	            if text:
	                return "\n".join([text] * _record_repeat_count(payload))
	    raise ValueError(f"Unsupported corpus source: {path}")


	def _collect_prompt_records(items: list[object]) -> list[dict[str, object]]:
	    """Normalize each item with ``_coerce_prompt_record`` and drop unusable ones."""
	    coerced = (_coerce_prompt_record(item) for item in items)
	    return [record for record in coerced if record is not None]


	def load_prompt_suite(source: str | Path) -> list[dict[str, object]]:
	    """Load a list of normalized prompt records from a single file.

	    * Text file (``TEXT_EXTENSIONS``): one prompt per line.
	    * ``.jsonl``: one JSON record per non-blank line.
	    * ``.json``: a top-level list, or a dict holding a ``"prompts"`` list,
	      is treated as a list of entries; any other dict is treated as a
	      single entry.

	    Entries that ``_coerce_prompt_record`` rejects are skipped.

	    Raises:
	        ValueError: if the suffix is unsupported, or a ``.json`` payload is
	            neither a list nor a dict that yields a usable prompt.
	    """
	    path = Path(source)
	    suffix = path.suffix.lower()

	    if suffix in TEXT_EXTENSIONS:
	        return _collect_prompt_records(path.read_text(encoding="utf-8").splitlines())

	    if suffix == ".jsonl":
	        raw_lines = path.read_text(encoding="utf-8").splitlines()
	        return _collect_prompt_records(
	            [json.loads(line) for line in raw_lines if line.strip()]
	        )

	    if suffix == ".json":
	        payload = json.loads(path.read_text(encoding="utf-8"))
	        if isinstance(payload, list):
	            return _collect_prompt_records(payload)
	        if isinstance(payload, dict):
	            if isinstance(payload.get("prompts"), list):
	                return _collect_prompt_records(payload["prompts"])
	            # Otherwise the dict itself is a single prompt entry; if it is
	            # unusable, fall through to the error below.
	            record = _coerce_prompt_record(payload)
	            if record is not None:
	                return [record]

	    raise ValueError(f"Unsupported prompt suite: {path}")