# igala-mbert-interpretability / prepare_data.py
# Last commit 3e71623 by Faruna01: "Initial commit: mBERT attention analysis for Igala"
import pandas as pd
# Build the probe-sentence set used by the interpretability analysis:
# load the parallel-corpus CSV, report its shape, draw a reproducible
# random subset, and write that subset next to the source file.

# Locations of the input corpus and of the subset written by this script.
SOURCE_CSV = "data/igala_english_parallel.csv"
PROBE_CSV = "data/igala_probe_sentences.csv"

# Upper bound on the subset size, and the seed that makes the draw repeatable.
MAX_PROBE_PAIRS = 50
SAMPLE_SEED = 42

print("📊 Preparing data for interpretability analysis...\n")

# --- Load and inspect -------------------------------------------------------
df = pd.read_csv(SOURCE_CSV)
total_pairs = len(df)
column_names = list(df.columns)

print(f"Total sentence pairs: {total_pairs}")
print(f"Columns: {column_names}")

# Print the first row so the column layout can be checked by eye.
print("\nFirst sentence pair:")
first_row = df.head(1)
print(first_row)

# --- Sample -----------------------------------------------------------------
# The draw is uniformly random (seeded), capped at the corpus size so that a
# corpus smaller than MAX_PROBE_PAIRS is taken whole instead of raising.
probe_count = min(MAX_PROBE_PAIRS, total_pairs)
sampled_rows = df.sample(n=probe_count, random_state=SAMPLE_SEED)

# Discard the original row labels so the subset is indexed 0..probe_count-1.
analysis_df = sampled_rows.reset_index(drop=True)

# --- Persist and report -----------------------------------------------------
# index=False keeps the fresh 0..N-1 index out of the CSV.
analysis_df.to_csv(PROBE_CSV, index=False)

print(f"\n✅ Created analysis set: {len(analysis_df)} sentence pairs")
print("\nSample:")
preview = analysis_df.head(3)
print(preview)