"""Inferencia rapida usando um unico fold (fold 1). Ideal para demos. Em CPU: ~5-10s por par. Em GPU T4: ~50ms. """ import json, torch from transformers import AutoTokenizer, AutoModelForCausalLM from peft import PeftModel from huggingface_hub import snapshot_download REPO = "histlearn/community-notes-reranker-ptbr" path = snapshot_download(REPO, allow_patterns=["manifesto.json", "adapter_fold_1/*"]) m = json.load(open(f"{path}/manifesto.json")) tok = AutoTokenizer.from_pretrained(m["base_model"], padding_side="left") model = AutoModelForCausalLM.from_pretrained( m["base_model"], torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32) model = PeftModel.from_pretrained(model, f"{path}/adapter_fold_1") if torch.cuda.is_available(): model.cuda() model.eval() def score(tweet, nota): text = (m["prompt_prefixo"] + ": " + m["instrucao"] + "\n: " + tweet + "\n: " + nota + m["prompt_sufixo"]) enc = tok(text, return_tensors="pt", truncation=True, max_length=m["max_length"]).to(model.device) with torch.no_grad(): logits = model(**enc).logits[:, -1, :] return float(torch.sigmoid(logits[:, m["id_yes"]] - logits[:, m["id_no"]]).item()) print(score("Bolsonaro disse que a Terra e plana", "Bolsonaro nunca afirmou isso; checagem em https://exemplo.org"))