# my_datasets/hard_sudoku_preprocess.py
# Hugging Face Hub page header (not part of the script): "zeyuzy's picture",
# "Upload folder using huggingface_hub", "338d88d verified".
import pandas as pd
def process_csv(input_file, output_file, sample_size: int = 1000) -> None:
    """Rewrite a CSV into the ``quizzes`` / ``solutions`` / ``rating`` layout.

    Reads ``input_file``, replaces every ``.`` in the ``question`` column
    with ``0``, exposes ``question`` as ``quizzes`` and ``answer`` as
    ``solutions``, keeps the first ``sample_size`` rows, and writes those
    three columns to ``output_file`` without the index. A one-line summary
    is printed when the file has been written.

    Args:
        input_file: CSV to read. Must contain the columns ``source``,
            ``question``, ``answer`` and ``rating``.
        output_file: Destination CSV path.
        sample_size: Number of leading rows to keep. This is a head
            truncation, not a random sample.

    Raises:
        ValueError: If any of the required columns is missing.
    """
    # Read the two text columns as strings explicitly. With dtype inference,
    # a column whose values all look numeric is parsed as int/float, which
    # drops leading zeros and makes the `.str` accessor below raise.
    df = pd.read_csv(input_file, dtype={'question': str, 'answer': str})

    required_columns = ['source', 'question', 'answer', 'rating']
    missing = [c for c in required_columns if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # Truncate first so the string replacement only touches rows we keep.
    subset = df.head(sample_size)
    result = pd.DataFrame({
        'quizzes': subset['question'].str.replace('.', '0', regex=False),
        'solutions': subset['answer'],
        'rating': subset['rating'],
    })

    result.to_csv(output_file, index=False)
    print(f"Saved {len(result)} rows to {output_file}")
if __name__ == "__main__":
    # Test split: keep the first 5000 rows of test.csv.
    input_csv, output_csv = "test.csv", "hard_test.csv"
    process_csv(input_csv, output_csv, sample_size=5000)

    # Train split: paths are set up, but the conversion itself is disabled.
    # Uncomment the call below to generate hard_train.csv as well.
    input_csv, output_csv = "train.csv", "hard_train.csv"
    # process_csv(input_csv, output_csv, sample_size=100000)