# Batch inference script: reads problems from /tmp/data/test.csv, asks
# Qwen2.5-1.5B-Instruct to answer each one, and writes the per-problem answer
# lists (JSON-encoded) to submission.csv.
import subprocess
import sys

# Install the runtime dependencies first; the third-party imports below rely
# on this having completed, so the statement order here is intentional.
subprocess.run(
    [
        sys.executable, "-m", "pip", "install", "-q",
        "transformers>=4.43", "accelerate>=0.30", "torch>=2.2", "pandas",
    ],
    check=True,
)

import json

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"

# Loop-invariant instruction sent as the system message for every problem.
SYSTEM_PROMPT = (
    "You solve International Linguistics Olympiad problems. Answer every numbered "
    "item. Put each answer on its own line, in order, with no numbering and no extra text."
)

# Upper bound on generated tokens per problem.
MAX_NEW_TOKENS = 512

tok = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, torch_dtype=torch.float16, device_map="auto"
).eval()

# Read every column as text and turn missing cells into "" so that the
# .strip() calls below never see NaN.
df = pd.read_csv("/tmp/data/test.csv", dtype=str).fillna("")

rows = []
for _, r in df.iterrows():
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"{r['context'].strip()}\n\n{r['query'].strip()}"},
    ]
    # Ask for a dict explicitly instead of relying on the default return type:
    # the install above is unpinned, and newer transformers releases return a
    # BatchEncoding by default rather than a bare tensor. The dict also carries
    # the attention mask, which generate() should receive.
    inputs = tok.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)
    ids = inputs["input_ids"]

    with torch.no_grad():
        # Greedy decoding (do_sample=False).
        out = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS, do_sample=False)

    # generate() returns prompt + continuation; keep only the continuation.
    text = tok.decode(out[0][ids.shape[-1]:], skip_special_tokens=True).strip()

    # One answer per non-empty output line, in the order the model produced them.
    answers = [ln.strip() for ln in text.splitlines() if ln.strip()]
    rows.append({"id": r["id"], "pred": json.dumps(answers, ensure_ascii=False)})
    # Flushed so progress is visible while the slow generation loop is running.
    print(f"{len(rows)}/{len(df)} done", flush=True)

# Pin the columns so an empty input still yields a file with the id,pred header.
pd.DataFrame(rows, columns=["id", "pred"]).to_csv("submission.csv", index=False)
print("wrote submission.csv", flush=True)