File size: 1,642 Bytes

56ea5bf

"""Quick check: what does the SFT dataset say for the eval questions?"""
import json, re

# Read the whole SFT training split into memory. The rest of the script
# iterates `data` and reads "instruction"/"output" from each element, so it
# is presumably a JSON array of objects -- verify against the dataset.
with open("Base/Datasets/sft_clean/train.json", encoding="utf-8") as fh:
    data = json.loads(fh.read())

# Maps a human-readable eval-question label to the regex used to find that
# question among the training instructions. The patterns are applied with
# re.search and re.I to the stripped instruction text, so:
#   * patterns wrapped in ^...$ only match an instruction that is exactly
#     that question;
#   * the remaining patterns match anywhere inside the instruction.
exact_queries = {
    "Who are you?": r"^Who are you\?$",
    "What is your name?": r"^What is your name\?$",
    "Who created you?": r"^Who created you\?$",
    "Who is Asterizer?": r"^Who is Asterizer\?$",
    "What is LUNA?": r"^What is LUNA\?$",
    # Word boundaries keep "ai" from matching the start of a longer word
    # (e.g. "are you an airline pilot") and "are" from matching "compare".
    "Are you an AI?": r"\bare you an ai\b",
    "Tell me about yourself": r"tell me about yourself",
    "Translate hello to Spanish": r"translate.*hello.*spanish",
    "Write a poem about the moon": r"^Write a.*poem about the moon",
    "Summarize evolution": r"summarize.*theory of evolution",
    "Explain photosynthesis": r"explain photosynthesis",
    # Digit guards stop the operands from matching inside longer numbers
    # (e.g. "125 times 40").
    "What is 25 times 4?": r"(?<!\d)(?:25 times 4|25 \* 4|25\*4|25 x 4)(?!\d)",
    "Capital of France": r"capital of france",
}

# For every eval question, report how many training instructions match its
# pattern and preview up to three of the matching question/answer pairs.
for label, pat in exact_queries.items():
    # Compile once per query instead of re-resolving the pattern per entry.
    regex = re.compile(pat, re.I)
    # `or ""` covers both a missing key and an explicit JSON null, either of
    # which would otherwise crash on .strip().
    matches = [
        e for e in data
        if regex.search((e.get("instruction") or "").strip())
    ]
    print(f"\n{'='*60}")
    print(f"  {label}  ({len(matches)} matches in dataset)")
    print(f"{'='*60}")
    for m in matches[:3]:
        # Truncate so long samples do not flood the terminal. The lookups are
        # tolerant because a matching entry may still lack an "output" field.
        inst = (m.get("instruction") or "")[:100]
        out = (m.get("output") or "")[:300]
        print(f"  Q: {inst}")
        print(f"  A: {out}")
        print()

def _mentions_identity(entry):
    """Return True if the entry's output names Asterizer or opens with LUNA.

    "asterizer" may appear anywhere in the lower-cased output; "luna" is only
    looked for within the first 50 characters.
    """
    answer = entry.get("output", "").lower()
    if "asterizer" in answer:
        return True
    return "luna" in answer[:50]


# Tally the entries whose answer carries the Asterizer/LUNA branding.
identity_kw = [e for e in data if _mentions_identity(e)]
rule = "=" * 60
print(f"\n{rule}")
print(f"  Entries mentioning Asterizer/LUNA in output: {len(identity_kw)}")
print(f"{rule}")