# MK-LLM-Mistral / scripts/evaluate.py
# Uploaded by ainow-mk ("Upload 65 files", commit f29d474, verified).
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import os
import json
def evaluate_perplexity(model_path: str, eval_file: str) -> float:
    """Compute corpus perplexity of a causal LM over a plain-text file.

    The file is tokenized once, then scored in fixed-size windows so the
    input never exceeds the model's positional limit; per-window losses are
    combined as a token-weighted average before exponentiation.

    Args:
        model_path: Local path or hub id of the model/tokenizer.
        eval_file: UTF-8 plain-text file to score.

    Returns:
        The perplexity as a float (also printed to stdout).

    Raises:
        ValueError: If the eval file tokenizes to fewer than two tokens
            (no next-token targets exist).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(
        model_path, torch_dtype=torch.float16, device_map="auto"
    )
    model.eval()  # disable dropout so the loss is deterministic

    with open(eval_file, 'r', encoding='utf-8') as f:
        text = f.read()

    input_ids = tokenizer(text, return_tensors='pt')["input_ids"][0]
    if input_ids.numel() < 2:
        raise ValueError(f"eval file too short to score: {eval_file}")

    # Window size bounded by the model's positional limit (fallback 2048);
    # feeding the whole corpus at once would overflow the position embeddings.
    window = int(getattr(model.config, "max_position_embeddings", 2048))

    total_nll = 0.0
    total_targets = 0
    with torch.no_grad():
        for start in range(0, input_ids.size(0), window):
            chunk = input_ids[start:start + window].unsqueeze(0).to(model.device)
            if chunk.size(1) < 2:
                break  # a single trailing token has no shifted target
            loss = model(chunk, labels=chunk).loss
            # HF averages the loss over the seq_len - 1 shifted targets, so
            # re-weight by target count to get a corpus-level mean NLL.
            n_targets = chunk.size(1) - 1
            total_nll += loss.item() * n_targets
            total_targets += n_targets

    ppl = math.exp(total_nll / total_targets)
    print(f"Perplexity: {ppl:.2f}")
    return ppl
if __name__ == "__main__":
    model_path = os.getenv("MODEL_PATH", "./models/mistral-finetuned-mk")
    eval_file = os.getenv("EVAL_FILE", "data/cleaned/mk_combined_data.txt")
    evaluate_perplexity(model_path, eval_file)

    # Simple QA accuracy on a small Macedonian eval set: a JSONL file whose
    # records carry "question" and "answer" fields.
    qa_file = os.getenv("QA_EVAL_FILE", "data/eval/mk_eval.jsonl")
    if os.path.exists(qa_file):
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForCausalLM.from_pretrained(
            model_path, torch_dtype=torch.float16, device_map="auto"
        )
        model.eval()  # scoring only — make sure dropout is off
        correct = 0
        total = 0
        with open(qa_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines in the JSONL
                item = json.loads(line)
                q = item["question"]
                gt = item["answer"].strip()
                prompt = f"Прашање: {q}\nОдговор:"
                inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
                with torch.no_grad():
                    # Greedy decoding pinned explicitly so accuracy is
                    # reproducible even if the checkpoint's generation
                    # config enables sampling.
                    out = model.generate(**inputs, max_new_tokens=64, do_sample=False)
                # Decode only the newly generated tokens. Decoding out[0]
                # whole would include the prompt, so an answer string that
                # also occurs in the question would count as a false positive.
                gen_tokens = out[0][inputs["input_ids"].shape[1]:]
                pred = tokenizer.decode(gen_tokens, skip_special_tokens=True)
                # naive substring check, case-insensitive
                is_correct = gt.lower() in pred.lower()
                correct += 1 if is_correct else 0
                total += 1
        if total:
            acc = correct / total * 100
            print(f"QA accuracy on mk_eval.jsonl: {acc:.1f}% ({correct}/{total})")