haochengsama
/

mcpmark

Model card Files Files and versions

mcpmark / synth /generators /filesystem /pattern_matching /__init__.py

haochengsama's picture

Add files using upload-large-folder tool

97cb846 verified 24 days ago

History Blame Contribute Delete

3.7 kB

	"""`pattern_matching` task generator."""
	import json
	import random
	import shutil
	from collections import defaultdict
	from pathlib import Path

	from ..base import (
	Generator, _VERIFY_HEADER, _render_verify, pad_to, _write, _para,
	_AUTHORS, _SONG_TITLES, _STUDENTS, _WORDS,
	)


	class PatternMatching(Generator):
	    """Task generator: find every line containing a keyword across `.txt` files.

	    `build` writes a handful of text files with keyword-bearing lines injected,
	    `description` renders the task prompt, `verify_src` produces the source of
	    a checker script, and `solve` writes the reference answer.
	    """

	    # Task metadata; presumably consumed by the `Generator` base class / task
	    # registry (not visible in this file) -- confirm there.
	    KEY = "pattern_matching"
	    CATEGORY_NAME = "Pattern Matching"
	    DIFFICULTY = "L2"
	    TAGS = ["file content", "search"]

	    # Substring the task asks for, and the name of the answer file.
	    KEYWORD = "URGENT"
	    OUT = "matches.txt"

	    def build(self, env_dir, llm, rng):
	        """Populate `env_dir` with text files and return the task spec.

	        Args:
	            env_dir: Directory to write into; must support the `/` operator
	                (e.g. a `pathlib.Path`).
	            llm: Object exposing `gen_snippets(topic, count)`; each returned
	                item is indexed by `"filename"`.
	            rng: Random source exposing `randint` and `choice`.

	        Returns:
	            A dict with the written file names (`"files"`), the keyword
	            (`"keyword"`) and the answer file name (`"out"`).
	        """
	        files = []
	        for snippet in llm.gen_snippets("project status logs", 5):
	            name = snippet["filename"] + ".txt"
	            n_lines = rng.randint(4, 7)

	            inject = []
	            for _ in range(rng.randint(1, 2)):
	                # The position is drawn before the words; keep this order so
	                # the RNG stream (and thus seeded output) is unchanged.
	                idx = rng.randint(0, n_lines - 1)
	                sentence = " ".join(
	                    rng.choice(_WORDS) for _ in range(5)
	                ).capitalize()
	                inject.append((idx, f"{self.KEYWORD}: {sentence}."))
	            # NOTE(review): two injections may draw the same idx; how `_para`
	            # treats duplicate positions is not visible here -- confirm.

	            lines = _para(rng, n_lines, inject)
	            _write(env_dir / name, "\n".join(lines) + "\n")
	            # NOTE(review): assumes the snippet filenames are unique; a repeat
	            # would overwrite the earlier file and be listed twice.
	            files.append(name)

	        return {"files": files, "keyword": self.KEYWORD, "out": self.OUT}

	    def description(self, spec):
	        """Return the Markdown task prompt shown to the agent.

	        `spec` is accepted for interface compatibility; the text is built from
	        the class-level `KEYWORD` and `OUT`.
	        """
	        return (
	            "Please use FileSystem tools to finish the following task:\n\n"
	            f"### Task: Find lines containing `{self.KEYWORD}`\n\n"
	            "Scan every `.txt` file in the test directory. For each line that "
	            f"contains the exact (case-sensitive) substring `{self.KEYWORD}`, record "
	            "a result line in this format:\n\n"
	            "```\n<filename>:<line_number>:<full_line_text>\n```\n\n"
	            "- `line_number` is 1-based.\n"
	            "- `full_line_text` is the matching line with trailing newline removed.\n"
	            "- Sort all result lines by filename, then by line number.\n"
	            f"- Write the results to `{self.OUT}` in the test directory root "
	            f"(do not include `{self.OUT}` itself in the scan)."
	        )

	    def verify_src(self, spec):
	        """Return the checker script source for this task instance.

	        The script recomputes the expected matches from the `.txt` files in the
	        test directory (skipping the answer file) and compares them, after
	        stripping surrounding whitespace, with the answer file's contents.

	        The script body uses `json`, `sys`, `get_test_dir`, `fail` and `ok`
	        without defining them; presumably `_render_verify` prepends a header
	        that provides them and substitutes `__CONSTS__` -- confirm in `..base`.
	        """
	        import textwrap

	        # Written indented for readability; `dedent` restores column-0 source.
	        # Backslashes are doubled so the escapes survive into the script.
	        body = textwrap.dedent('''
	        C = json.loads(__CONSTS__)


	        def expected(t):
	            rows = []
	            for f in sorted(t.iterdir()):
	                if not (f.is_file() and f.suffix == ".txt" and f.name != C["OUT"]):
	                    continue
	                for i, line in enumerate(f.read_text(encoding="utf-8").splitlines(), 1):
	                    if C["KEYWORD"] in line:
	                        rows.append((f.name, i, line))
	            rows.sort(key=lambda r: (r[0], r[1]))
	            return "\\n".join(f"{n}:{i}:{l}" for n, i, l in rows)


	        def main():
	            t = get_test_dir()
	            out = t / C["OUT"]
	            if not out.is_file():
	                fail(f"output file not found: {C['OUT']}")
	            got = out.read_text(encoding="utf-8").strip()
	            exp = expected(t).strip()
	            if got != exp:
	                print("--- expected ---"); print(exp[:800])
	                print("--- got ---"); print(got[:800])
	                fail(f"{C['OUT']} does not match expected results")
	            ok("all matching lines correctly recorded")
	            print("\\U0001f389 All checks passed!")
	            sys.exit(0)


	        if __name__ == "__main__":
	            main()
	        ''')
	        return _render_verify(body, {"KEYWORD": spec["keyword"], "OUT": spec["out"]})

	    def solve(self, work_dir, spec):
	        """Write the reference answer file into `work_dir`.

	        Scans the `.txt` files directly inside `work_dir` (excluding the answer
	        file itself) and records `<filename>:<line_number>:<line>` for every
	        line containing `spec["keyword"]`, ordered by file name then line number.
	        """
	        keyword = spec["keyword"]
	        out_name = spec["out"]

	        rows = []
	        for path in sorted(work_dir.iterdir()):
	            is_candidate = (
	                path.is_file() and path.suffix == ".txt" and path.name != out_name
	            )
	            if not is_candidate:
	                continue
	            text = path.read_text(encoding="utf-8")
	            for lineno, line in enumerate(text.splitlines(), 1):
	                if keyword in line:
	                    rows.append((path.name, lineno, line))

	        rows.sort(key=lambda row: (row[0], row[1]))
	        report = "\n".join(f"{name}:{lineno}:{line}" for name, lineno, line in rows)
	        _write(work_dir / out_name, report + "\n")