import subprocess
import sys

# Install the runtime dependencies into the interpreter that is running this
# script (sys.executable) before any third-party module is imported.
# check=True makes a failed install raise CalledProcessError instead of
# letting the script continue with missing packages.
subprocess.run(
    [
        sys.executable, "-m", "pip", "install", "-q",
        "transformers>=4.43", "accelerate>=0.30", "torch>=2.2", "pandas",
    ],
    check=True,
)


import json

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


# Hugging Face Hub identifier of the checkpoint used for every prediction.
MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"


# Load the tokenizer and the float16 weights once, up front.
# device_map="auto" leaves device placement to the library; .eval() puts the
# module in evaluation mode since this script only runs inference.
tok = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, torch_dtype=torch.float16, device_map="auto"
).eval()


# Read every column as text and replace missing cells with "" so the
# .strip() calls on the "context" and "query" columns never see NaN.
# The script later reads the columns "id", "context" and "query".
df = pd.read_csv("/tmp/data/test.csv", dtype=str).fillna("")


# Instruction sent with every problem; hoisted out of the loop because it is
# identical for all rows.
SYSTEM_PROMPT = (
    "You solve International Linguistics Olympiad problems. Answer every numbered "
    "item. Put each answer on its own line, in order, with no numbering and no extra text."
)

# One {"id", "pred"} record per input row; "pred" is a JSON-encoded list with
# one entry per non-empty line of the model's reply.
rows = []
for _, r in df.iterrows():
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"{r['context'].strip()}\n\n{r['query'].strip()}"},
    ]
    # Ask for a dict explicitly: the return type then does not depend on the
    # installed transformers version's default, and the attention mask that
    # comes with it is forwarded to generate() instead of being inferred.
    enc = tok.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)
    prompt_len = enc["input_ids"].shape[-1]
    with torch.no_grad():
        # do_sample=False selects greedy decoding, capped at 512 new tokens.
        out = model.generate(**enc, max_new_tokens=512, do_sample=False)
    # The output sequence starts with the prompt tokens; decode only what
    # follows them.
    text = tok.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
    answers = [ln.strip() for ln in text.splitlines() if ln.strip()]
    rows.append({"id": r["id"], "pred": json.dumps(answers, ensure_ascii=False)})
    print(f"{len(rows)}/{len(df)} done", flush=True)


# Persist the collected predictions as a two-column CSV ("id", "pred") in the
# current working directory; index=False keeps the row index out of the file.
pd.DataFrame(rows).to_csv("submission.csv", index=False)
print("wrote submission.csv", flush=True)