# Residue from the page this file was copied from (not part of the script):
#   Spaces: Runtime error | File size: 2,281 Bytes | 1de9a74
import pandas as pd
import os
import argparse
# --- Configuration ---
TRAINING_DATA_FILE = "data/training_sets/distractor_generation_training_data.parquet"
# Columns every row must provide for the review printout below.
_REQUIRED_COLUMNS = ("question", "correct_answer", "distractor")


def verify_data(file_path: str, num_samples: int) -> None:
    """Print a random sample of the generated training data for human review.

    Loads the parquet file at ``file_path``, draws up to ``num_samples`` rows
    with a fixed seed (so repeated runs on the same file show the same rows)
    and prints the question, correct answer and generated distractor of each.

    A negative sample count, a missing file, an unreadable file, or a missing
    required column is reported on stdout and the function returns early
    instead of raising.

    Args:
        file_path: Path to the training data ``.parquet`` file.
        num_samples: Number of rows to display. A value larger than the
            dataset is clamped to the dataset size (with a warning).
    """
    print("--- Starting Training Data Verification ---")

    # Reject an invalid count up front; DataFrame.sample() would otherwise
    # raise a ValueError after the file has already been loaded.
    if num_samples < 0:
        print(f"\n❌ FATAL: num_samples must be non-negative, got {num_samples}.")
        return

    # 1. Validate file exists
    if not os.path.exists(file_path):
        print(f"\n❌ FATAL: Training data file not found at '{file_path}'.")
        print("Please run generate_distractor_training_set.py first.")
        return

    print(f"Loading data from '{file_path}'...")
    try:
        df = pd.read_parquet(file_path)
    except Exception as e:
        # Deliberately broad: this is the CLI boundary, and any read failure
        # (corrupt file, missing parquet engine, ...) gets the same report.
        print(f"\n❌ FATAL: Could not read parquet file. Error: {e}")
        return

    # Fail with a clear message rather than a KeyError halfway through the
    # printout when the file does not have the expected schema.
    missing_columns = [col for col in _REQUIRED_COLUMNS if col not in df.columns]
    if missing_columns:
        print(
            "\n❌ FATAL: Training data is missing required column(s): "
            f"{', '.join(missing_columns)}."
        )
        return

    # 2. Take a random sample for review
    if num_samples > len(df):
        print(f"Warning: Requested {num_samples} samples, but dataset only has {len(df)}. Showing all.")
        num_samples = len(df)
    sample_df = df.sample(n=num_samples, random_state=42)

    print(f"\nDisplaying {num_samples} random examples for your review:")
    print("-" * 80)

    # 3. Print samples in a readable format
    # iterrows() yields the DataFrame's index *label*, which after sampling is
    # an arbitrary row label (and may not even be an integer), so the running
    # example number comes from enumerate() instead.
    for position, (_, row) in enumerate(sample_df.iterrows(), start=1):
        print(f"\n--- Example {position}/{num_samples} ---")
        print("\n[QUESTION]:")
        print(f" {row['question']}")
        print("\n [CORRECT ANSWER]:")
        print(f" {row['correct_answer']}")
        print("\n [GENERATED DISTRACTOR (is this a good distractor?)]:")
        print(f" {row['distractor']}")
        print("-" * 80)
def main(argv=None):
    """Parse the command-line options and run the training-data spot-check.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read the process arguments, which is what the script entry
            point below relies on.
    """
    parser = argparse.ArgumentParser(
        description="Spot-check the quality of the auto-generated distractor training data.",
        # Show each option's default value in --help output.
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--file",
        default=TRAINING_DATA_FILE,
        help="Path to the training data .parquet file.",
    )
    parser.add_argument(
        "-n", "--num_samples",
        type=int,
        default=5,
        help="The number of random samples to display for verification.",
    )
    args = parser.parse_args(argv)
    verify_data(file_path=args.file, num_samples=args.num_samples)


if __name__ == "__main__":
    main()