| import pandas as pd |
| import torch |
| from transformers import AutoModelForCausalLM, AutoTokenizer |
| import jsonlines |
| import sys |
| from tqdm.auto import tqdm |
| |
# Command-line configuration.
#   argv[1]: model name or path handed to `from_pretrained`
#   argv[2]: path of the JSON Lines file the results are written to
if len(sys.argv) < 3:
    # Fail with a usage hint (printed to stderr, exit status 1) instead of a
    # bare IndexError when an argument is missing.
    sys.exit(f"Usage: python {sys.argv[0]} <model_name> <output_jsonl>")

MODEL_NAME = sys.argv[1]
INPUT_FILENAME = "./Vietnamese truthful QA results.xlsx"
OUTPUT_FILENAME = sys.argv[2]
MAX_NEW_TOKENS = 512  # upper bound on generated tokens per question

# Opened in "w" mode, so an existing file at OUTPUT_FILENAME is truncated.
writer = jsonlines.open(OUTPUT_FILENAME, "w")
| |
# Load the question/ground-truth spreadsheet. Every failure path exits with a
# non-zero status so that calling shells and pipelines can detect the error
# (the bare `exit()` helper reports success and is only provided by `site`).
try:
    df = pd.read_excel(INPUT_FILENAME)
except FileNotFoundError:
    print(f"Error: The file '{INPUT_FILENAME}' was not found.")
    print("Please make sure your XLSX file is in the same directory as the script.")
    sys.exit(1)
except Exception as e:
    # Top-level boundary: report whatever the Excel reader raised, then stop.
    print(f"An error occurred while reading the Excel file: {e}")
    sys.exit(1)

# Both columns are required: "Question" feeds the prompt, "Ground truth" is
# written next to each generated answer.
if "Question" not in df.columns or "Ground truth" not in df.columns:
    print("Error: Required columns 'Question' and/or 'Ground truth' not found.")
    print(f"Available columns are: {list(df.columns)}")
    sys.exit(1)

# Work on an independent copy restricted to the two columns that are used.
df_processed = df[["Question", "Ground truth"]].copy()
|
|
| |
# Load the tokenizer and the causal LM named on the command line, then move
# the model to the GPU when one is available (CPU otherwise).
print(f"Loading model '{MODEL_NAME}' and tokenizer...")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# FlashAttention-2 kernels are CUDA-only, so request them only when a GPU is
# actually present; on CPU the library's default attention implementation is
# used instead of failing at load time.
model_kwargs = {"torch_dtype": torch.bfloat16}
if device.type == "cuda":
    model_kwargs["attn_implementation"] = "flash_attention_2"
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_kwargs)
model.to(device)

# Some tokenizers ship without a pad token; fall back to EOS. The model config
# takes its pad id from the tokenizer so it is a single integer that matches
# the `pad_token_id` later passed to `generate()`.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

print("Model and tokenizer loaded successfully.")
|
|
| |
# Generate one greedy answer per question and stream each result to the
# JSON Lines writer as soon as it is produced.
answers = []
out_dict = []  # not read in the visible part of the script; kept as-is
total_questions = len(df_processed)
print(f"Generating answers for {total_questions} questions...")

# Pair each question with its ground truth positionally, so the pairing does
# not depend on the DataFrame's index labels.
question_gold_pairs = zip(df_processed["Question"], df_processed["Ground truth"])

try:
    # tqdm wraps the sized iterable (not `enumerate`) and is told the total,
    # so the progress bar can show completion and an ETA.
    for i, (question, gold) in enumerate(
        tqdm(question_gold_pairs, total=total_questions)
    ):
        messages = [
            {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
            {"role": "user", "content": question},
        ]
        # Render the chat as plain text first, then tokenize it as a batch of one.
        prompt_text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        model_inputs = tokenizer([prompt_text], return_tensors='pt').to(model.device)

        output_sequences = model.generate(
            **model_inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,  # greedy decoding
            pad_token_id=tokenizer.pad_token_id,
        )

        # Drop the prompt tokens so only the newly generated continuation is decoded.
        prompt_len = model_inputs['input_ids'].shape[1]
        full_text = tokenizer.decode(
            output_sequences[0][prompt_len:], skip_special_tokens=True
        )
        answer = full_text.strip()
        answers.append(answer)
        print(f"Processed question {i + 1}/{total_questions}\nAnswer: {answer}\nGold: {gold}")
        writer.write({
            "question": question,
            "answer": answer,
            "gold": gold,
        })
finally:
    # Close the output file even if generation fails midway, so the records
    # written so far are flushed to disk.
    writer.close()
|
|
|
|