test_iol_submission / script.py
eduardosanchez's picture
Update script.py
6899d68 verified
Raw
History Blame Contribute Delete
1.54 kB
import subprocess, sys
# Runtime dependencies, installed with the current interpreter's pip before
# the third-party imports that follow. check=True makes a failed install
# raise instead of continuing with missing packages.
_PIP_PACKAGES = ["transformers>=4.43", "accelerate>=0.30", "torch>=2.2", "pandas"]
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q", *_PIP_PACKAGES],
    check=True,
)
import json
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# Hub identifier of the checkpoint used for both the tokenizer and the model.
MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"

tok = AutoTokenizer.from_pretrained(MODEL_ID)

# Load the weights as float16 and delegate device placement via device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
)
# The script only generates; put the module in evaluation mode.
model.eval()
# Load the test problems. Every column is read as text and missing cells
# become "" so the .strip() calls below never receive NaN.
df = pd.read_csv("/tmp/data/test.csv", dtype=str).fillna("")

# One {"id", "pred"} record per input row, in file order.
rows = []
for _, r in df.iterrows():
    # Chat prompt: fixed system instruction, then context and query joined by
    # a blank line.
    messages = [
        {"role": "system", "content":
            "You solve International Linguistics Olympiad problems. Answer every numbered "
            "item. Put each answer on its own line, in order, with no numbering and no extra text."},
        {"role": "user", "content": f"{r['context'].strip()}\n\n{r['query'].strip()}"},
    ]
    # Ask for a dict explicitly instead of relying on the default return type
    # of apply_chat_template, and so that the attention mask is available to
    # pass to generate() rather than being inferred.
    # NOTE(review): the default return type is understood to differ between
    # transformers major versions, and the install step does not pin an upper
    # bound -- confirm against the installed release.
    enc = tok.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)
    prompt_len = enc["input_ids"].shape[-1]

    # Greedy decoding (do_sample=False), at most 512 new tokens.
    with torch.no_grad():
        out = model.generate(**enc, max_new_tokens=512, do_sample=False)

    # Decode only the tokens generated after the prompt.
    text = tok.decode(out[0][prompt_len:], skip_special_tokens=True).strip()

    # One answer per non-empty output line, stored as a JSON array string.
    answers = [ln.strip() for ln in text.splitlines() if ln.strip()]
    rows.append({"id": r["id"], "pred": json.dumps(answers, ensure_ascii=False)})
    print(f"{len(rows)}/{len(df)} done", flush=True)
# Write the predictions to submission.csv without the index column. The
# columns are named explicitly so the "id,pred" header is still written when
# there are no rows; a DataFrame built from an empty list has no columns.
pd.DataFrame(rows, columns=["id", "pred"]).to_csv("submission.csv", index=False)
print("wrote submission.csv", flush=True)