# Leaderboard / generate_random_submission.py
# Repository page header captured with the file: uploader avatar "01Yassine",
# commit message "Upload 13 files", commit 0b5b1dc (verified).
import os
import random
import string
import pandas as pd
from datasets import load_dataset
def generate_random_phonemes(min_length=5, max_length=20):
    """Return a random space-separated string of lowercase ASCII letters.

    Each "phoneme" is a single character drawn uniformly (with replacement)
    from ``string.ascii_lowercase``; the number of characters is drawn
    uniformly from ``[min_length, max_length]`` (both inclusive).

    Args:
        min_length: Smallest number of characters to emit. Defaults to 5.
        max_length: Largest number of characters to emit. Defaults to 20.

    Returns:
        The characters joined by single spaces; ``""`` when the drawn length
        is 0.

    Raises:
        ValueError: If ``min_length`` is negative or exceeds ``max_length``.
    """
    if min_length < 0 or max_length < min_length:
        raise ValueError(
            f"invalid length range: min_length={min_length}, max_length={max_length}"
        )
    # Draw the length first, then the characters, so that with the default
    # bounds a seeded `random` yields the same output as before.
    length = random.randint(min_length, max_length)
    return " ".join(random.choices(string.ascii_lowercase, k=length))
def _read_cached_ids(path):
    """Return the stripped, non-blank lines of ``path``; ``[]`` if it does not exist."""
    try:
        with open(path, encoding="utf-8") as f:
            return [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        return []


def _fetch_ids_from_hub(token):
    """Stream the IqraEval/QuranMB.v2 test split and collect each row's ID as a string."""
    ds = load_dataset("IqraEval/QuranMB.v2", split="test", token=token, streaming=True)
    ids = []
    for item in ds:
        # The ID is looked up under "ID" first, then "id"; rows carrying
        # neither key are skipped.
        if "ID" in item:
            ids.append(str(item["ID"]).strip())
        elif "id" in item:
            ids.append(str(item["id"]).strip())
    return ids


def _write_id_cache(path, ids):
    """Write ``ids`` to ``path``, one per line."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(f"{i}\n" for i in ids)


def main(ids_path="IDs.txt", output_path="random_submission.csv"):
    """Generate a random submission CSV with one row per ID.

    IDs are read from ``ids_path`` when that file exists and is non-empty.
    Otherwise they are streamed from the Hugging Face dataset
    ``IqraEval/QuranMB.v2`` (split ``test``) using the ``HF_TOKEN``
    environment variable, sorted, and cached to ``ids_path`` for later runs.
    The CSV has the columns ``ID`` and ``Labels``, where each label comes from
    ``generate_random_phonemes()``.

    Args:
        ids_path: Path of the ID cache file. Defaults to ``"IDs.txt"``.
        output_path: Path of the CSV to write. Defaults to
            ``"random_submission.csv"``.

    Returns:
        None. Problems (missing token, fetch failure, no IDs) are reported on
        stdout and the function returns without writing the CSV.
    """
    ids = _read_cached_ids(ids_path)

    if not ids:
        print(f"{ids_path} not found or empty. Fetching from HF (IqraEval/QuranMB.v2 split='test')...")
        token = os.environ.get("HF_TOKEN")
        if not token:
            print("HF_TOKEN not set in environment.")
            return

        try:
            # Sort in memory as well as in the cache, so the first run writes
            # rows in the same order as later runs that read the cache.
            ids = sorted(_fetch_ids_from_hub(token))
        except Exception as e:  # script boundary: report any fetch failure and stop
            print(f"Error fetching IDs: {e}")
            return

        if not ids:
            # Do not cache an empty result; the next run should retry the fetch.
            print("No IDs found.")
            return

        try:
            _write_id_cache(ids_path, ids)
        except OSError as e:
            # The cache is only an optimisation; the IDs are already in memory.
            print(f"Warning: could not cache IDs to {ids_path}: {e}")
        else:
            print(f"Cached {len(ids)} IDs to {ids_path}")

    print(f"Generating submission for {len(ids)} IDs...")
    df = pd.DataFrame({
        "ID": ids,
        "Labels": [generate_random_phonemes() for _ in ids],
    })
    df.to_csv(output_path, index=False)
    print(f"Generated {output_path} with {len(df)} rows.")
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()