In [None]:
%%capture
!pip install huggingface_hub
!pip install llama-cpp-python
!pip install datasets
!pip install torch==2.2.0 torchvision==0.17.1 --index-url https://download.pytorch.org/whl/cu121
!pip install unsloth
!pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git@nightly git+https://github.com/unslothai/unsloth-zoo.git

In [None]:
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
from datasets import load_dataset
from unsloth import FastLanguageModel


In [None]:
# Download the quantized GGUF checkpoint of the fine-tuned model from the
# Hugging Face Hub; the value returned by hf_hub_download is handed to
# llama.cpp as the model path.
qlora_model_path = hf_hub_download(
    filename="llama-3.2-1b-instruct.Q4_K_M.gguf",
    repo_id="ebbalg/llama-finetome",
)

# Instantiate the llama.cpp model with a 2048-token context window, two
# threads, the "llama-3" chat format and llama.cpp logging switched off.
qlora_model = Llama(
    model_path=qlora_model_path,
    chat_format="llama-3",
    n_ctx=2048,
    n_threads=2,
    verbose=False,
)


In [None]:
from tqdm import tqdm # progress bar
import re

def extract_choice_from_text(text, valid_labels="ABCDE"):
    """
    Extract the multiple-choice label a model picked from its free-text output.

    The text is upper-cased and scanned with a sequence of patterns, ordered
    from most to least specific; the first pattern that matches decides the
    result:

      1. "ANSWER" as a whole word, followed by a standalone label
         ("Answer: B", "Answer - (C)").
      2. A label at the very start of the text ("B", "B.", "(B)", "B) ...").
      3. "BEST ANSWER" followed later on the same line by a standalone label.
      4. A standalone label followed by a period that ends a token ("C. ").
      5. Weakest fallback: the first standalone label anywhere in the text.

    Because matching is done on the upper-cased text, lowercase words that
    consist of a single label letter (e.g. the article "a") also count as a
    standalone label in the fallback patterns.

    Args:
        text: Raw model output. ``None`` is treated as an empty string.
        valid_labels: String (or iterable of one-character strings) listing
            the labels that may be returned. Defaults to ``"ABCDE"``.

    Returns:
        The matched label as an upper-case one-character string, or ``None``
        when no pattern matches (the unparsed text is printed in that case).

    Raises:
        ValueError: If ``valid_labels`` is empty.
    """
    label_chars = "".join(str(label) for label in valid_labels).upper()
    if not label_chars:
        raise ValueError("valid_labels must contain at least one label")
    # One capturing group that accepts exactly the allowed labels.
    label = "([" + re.escape(label_chars) + "])"

    raw_text = "" if text is None else str(text)
    text_upper = raw_text.upper()

    # The \b anchors matter: without them "ANSWERED" yields "E",
    # "Answer: Dogs ..." yields "D" and "best answer would be C" yields the
    # "D" of "WOULD". The (?=\s|$) look-aheads keep abbreviations such as
    # "e.g." from being read as the choice "E.".
    patterns = (
        (re.search, r"\bANSWER\b[\s:\-*]*\(?" + label + r"\b"),
        (re.match, r"\s*\(?" + label + r"[.):]?(?=\s|$)"),
        (re.search, r"\bBEST ANSWER\b.*?\b" + label + r"\b"),
        (re.search, r"\b" + label + r"\.(?=\s|$)"),
        (re.search, r"\b" + label + r"\b"),
    )
    for finder, pattern in patterns:
        found = finder(pattern, text_upper)
        if found:
            return found.group(1)

    # Nothing matched: show the raw output so unparseable answers can be inspected.
    print(raw_text)
    return None  # couldn't find

def _arc_label_to_letter(label):
    """Map a digit choice label ("1", "2", ...) to a letter ("A", "B", ...); other labels pass through."""
    text = str(label).strip()
    if len(text) == 1 and text in "123456789":
        return chr(ord("A") + int(text) - 1)
    return text


def eval_arc(model_fn, dataset_split):
    """
    Score a model on ARC-style multiple-choice rows.

    Each row must provide ``row["question"]``, ``row["answerKey"]`` and
    ``row["choices"]`` with parallel ``"label"`` and ``"text"`` lists. A
    prompt of the form ``Question: ...`` / ``<label>. <text>`` / ``Answer:``
    is built per row, passed to ``model_fn``, and the label parsed from the
    reply by ``extract_choice_from_text`` is compared with the answer key.

    Digit labels are shown (and scored) as letters: the answer parser only
    ever returns letters, so a row keyed "1".."4" could otherwise never be
    counted as correct.

    Args:
        model_fn: Callable taking the prompt string and returning the
            model's text output.
        dataset_split: Sized iterable of rows as described above.

    Returns:
        Fraction of rows answered correctly (``0.0`` for an empty split).
        Every 10 rows the running accuracy (in percent, over the rows seen
        so far) is printed.
    """
    total = len(dataset_split)
    if total == 0:
        # Avoid dividing by zero on an empty split.
        return 0.0

    correct = 0
    for i, row in enumerate(dataset_split):
        question = row["question"]
        answer = _arc_label_to_letter(row["answerKey"])

        # Build choices dicts from ARC structure
        choices = [
            {"label": _arc_label_to_letter(label), "text": choice_text}
            for label, choice_text in zip(row["choices"]["label"], row["choices"]["text"])
        ]

        # Build prompt
        prompt = (
            f"Question: {question}\n"
            + "\n".join(f"{c['label']}. {c['text']}" for c in choices)
            + "\nAnswer:"
        )

        # Run model
        pred = extract_choice_from_text(model_fn(prompt))
        if pred == answer:
            correct += 1

        seen = i + 1
        if seen % 10 == 0:
            # Running accuracy is relative to the rows processed so far,
            # not to the size of the whole split.
            print(f"{seen}/{total} Accuracy = {correct / seen * 100}")

    return correct / total

In [None]:
# Load the instruct model and its tokenizer through Unsloth with 4-bit
# weights and a 2048-token maximum sequence length.
# NOTE(review): this loads the 3B checkpoint, while the GGUF file loaded in
# an earlier cell is named "llama-3.2-1b-instruct" -- confirm the size
# difference between the two evaluated models is intentional.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",  # alternative: "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length=2048,
    load_in_4bit=True,
    dtype=None,
)


In [None]:
# Fetch the ARC-Challenge configuration and keep the first 500 rows of its
# test split as the evaluation subset.
arc = load_dataset(path="allenai/ai2_arc", name="ARC-Challenge")
arc_eval = arc["test"].select(range(500))

In [None]:
# ---- GGUF QLoRA model runner ----
def run_gguf(llm, prompt, max_tokens=128):
    """
    Run one completion request against a llama.cpp-style callable.

    Args:
        llm: Callable invoked as ``llm(prompt, max_tokens=..., temperature=0)``
            that returns a mapping with a ``"choices"`` list.
        prompt: Prompt string passed through unchanged.
        max_tokens: Forwarded as the ``max_tokens`` keyword (default 128).

    Returns:
        The ``"text"`` field of the first entry in ``"choices"``.
    """
    completion = llm(prompt, max_tokens=max_tokens, temperature=0)
    first_choice = completion["choices"][0]
    return first_choice["text"]
 
# Score the GGUF model on the ARC subset; the lambda binds the model so that
# eval_arc only has to supply the prompt.
score_qlora = eval_arc(
    model_fn=lambda prompt: run_gguf(qlora_model, prompt),
    dataset_split=arc_eval,
)
print("QLoRA model:", score_qlora)

In [None]:
# ---- Base Unsloth model runner ----
def run_unsloth(model, tokenizer, prompt, max_tokens=128):
    """
    Generate a continuation for ``prompt`` and return only the new text.

    Args:
        model: Object exposing ``device`` and ``generate(**inputs, ...)``.
        tokenizer: Callable that encodes the prompt (``return_tensors='pt'``)
            into a batch with an ``"input_ids"`` entry and a ``.to()`` method,
            and that offers ``decode(ids, skip_special_tokens=True)``.
        prompt: Prompt string.
        max_tokens: Maximum number of newly generated tokens (default 128).

    Returns:
        The decoded continuation, without the prompt.
    """
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
    prompt_len = len(inputs["input_ids"][0])

    # Greedy decoding, to mirror the temperature=0 setting of the GGUF
    # runner; otherwise the model's own generation config decides whether
    # sampling is used and repeated runs may disagree.
    out = model.generate(**inputs, max_new_tokens=max_tokens, do_sample=False)

    # generate() returns prompt tokens followed by the continuation. Decoding
    # the whole sequence would hand the prompt (including its "A. ..." choice
    # list and "Answer:" cue) to the answer parser, so strip it first.
    return tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
 
# Score the Unsloth-loaded model on the same ARC subset; the lambda binds
# model and tokenizer so that eval_arc only has to supply the prompt.
score_base = eval_arc(
    model_fn=lambda prompt: run_unsloth(model, tokenizer, prompt),
    dataset_split=arc_eval,
)
print("Base model:", score_base)

In [None]:
import string 

def convert_truthfulqa_row(row, rng=None):
    """
    Turn one TruthfulQA multiple-choice row into ``(prompt, correct_label)``.

    The row's ``mc1_targets`` entry supplies parallel ``"choices"`` and
    ``"labels"`` lists; the choice whose label is ``1`` is the correct one.
    Choices are lettered "A", "B", ... in the order they are presented.

    NOTE(review): if the dataset always lists the correct choice first, the
    returned label is always "A" unless ``rng`` is supplied, and a model that
    always answers "A" would score 100% -- verify against the dataset.

    Args:
        row: Mapping with ``"question"`` and ``"mc1_targets"``.
        rng: Optional object with a ``shuffle(list)`` method (for example a
            ``random.Random`` instance). When given, the presentation order
            of the choices is shuffled with it; when ``None`` (default) the
            original order is kept.

    Returns:
        Tuple ``(prompt, correct_label)``: the prompt ends with
        ``"Answer (choose one letter):"`` and ``correct_label`` is the
        letter shown next to the correct choice.

    Raises:
        ValueError: If no choice is labelled ``1``.
    """
    target = row["mc1_targets"]  # or mc2_targets
    choices = list(target["choices"])
    flags = list(target["labels"])
    if 1 not in flags:
        raise ValueError("mc1_targets has no choice labelled 1 (correct)")
    correct_source_idx = flags.index(1)

    # order[pos] is the index of the original choice displayed at position pos.
    order = list(range(len(choices)))
    if rng is not None:
        rng.shuffle(order)

    # Generate labels dynamically
    labels = list(string.ascii_uppercase[:len(choices)])

    # Build prompt
    option_lines = [f"{labels[pos]}. {choices[src]}" for pos, src in enumerate(order)]
    prompt = f"Question: {row['question']}\n"
    prompt += "\n".join(option_lines)
    prompt += "\nAnswer (choose one letter):"

    correct_label = labels[order.index(correct_source_idx)]
    return prompt, correct_label

def eval_truthfulqa(model_fn, dataset_split):
    """
    Score a model on pre-built ``(prompt, correct_label)`` pairs.

    Each prompt is passed to ``model_fn``; the label parsed from the reply by
    ``extract_choice_from_text`` is compared with ``correct_label``.

    Args:
        model_fn: Callable taking the prompt string and returning the
            model's text output.
        dataset_split: Sized iterable of ``(prompt, correct_label)`` pairs.

    Returns:
        Fraction of pairs answered correctly (``0.0`` for an empty input).
        Every 20 pairs the running accuracy (in percent, over the pairs seen
        so far) is printed.
    """
    total = len(dataset_split)
    if total == 0:
        # Avoid dividing by zero on an empty evaluation set.
        return 0.0

    correct = 0
    for seen, (prompt, correct_label) in enumerate(dataset_split, start=1):
        pred_text = model_fn(prompt)

        # Parse the predicted label out of the free-text reply.
        pred_label = extract_choice_from_text(pred_text)
        if pred_label == correct_label:
            correct += 1

        if seen % 20 == 0:
            # Running accuracy is relative to the pairs processed so far,
            # not to the size of the whole evaluation set.
            print(f"{seen}/{total} Accuracy = {correct / seen * 100}")

    return correct / total


In [None]:
# Fetch the multiple-choice configuration of TruthfulQA and keep the first
# 500 rows of its validation split.
ds = load_dataset(path="truthful_qa", name="multiple_choice")
qa_eval_raw = ds["validation"].select(range(500))

In [None]:
# Materialize one (prompt, correct_label) pair per raw row.
# NOTE(review): if mc1_targets always lists the correct choice first, every
# correct_label here is "A" -- verify, since a model that always answers "A"
# would then score 100%.
qa_eval = list(map(convert_truthfulqa_row, qa_eval_raw))

# Preview the first converted pair (prints nothing when the list is empty).
for row in qa_eval[:1]:
    print(row)

In [None]:
# Score the GGUF model on the TruthfulQA pairs. This rebinds score_qlora,
# which an earlier cell used for the ARC score.
score_qlora = eval_truthfulqa(
    model_fn=lambda prompt: run_gguf(qlora_model, prompt),
    dataset_split=qa_eval,
)
print("QLoRA model accuracy:", score_qlora)


In [None]:
# Score the Unsloth-loaded model on the TruthfulQA pairs. This rebinds
# score_base, which an earlier cell used for the ARC score.
score_base = eval_truthfulqa(
    model_fn=lambda prompt: run_unsloth(model, tokenizer, prompt),
    dataset_split=qa_eval,
)
print("Base model accuracy:", score_base)