Spaces:
Running
Running
| import os | |
| import random | |
| import string | |
| import pandas as pd | |
| from datasets import load_dataset | |
def generate_random_phonemes(*, min_len=5, max_len=20,
                             alphabet=string.ascii_lowercase, rng=None):
    """Return a space-separated string of randomly chosen symbols.

    Called with no arguments this draws between 5 and 20 lowercase ASCII
    letters (with replacement) and joins them with single spaces.

    Args:
        min_len: Smallest number of symbols to emit (inclusive).
        max_len: Largest number of symbols to emit (inclusive).
        alphabet: Non-empty sequence of symbols to sample from.
        rng: Optional object exposing ``randint`` and ``choices`` (for
            example a seeded ``random.Random``). When ``None`` the
            module-level ``random`` functions are used.

    Returns:
        The sampled symbols joined by ``" "``.

    Raises:
        ValueError: If the length bounds are negative or inverted, or if
            ``alphabet`` is empty.
    """
    if min_len < 0 or max_len < min_len:
        raise ValueError(
            f"invalid length bounds: min_len={min_len}, max_len={max_len}"
        )
    if not alphabet:
        raise ValueError("alphabet must not be empty")

    # Fall back to the shared module-level generator so that the default
    # call consumes random state exactly as a plain random.* call would.
    source = random if rng is None else rng
    length = source.randint(min_len, max_len)
    return " ".join(source.choices(alphabet, k=length))
_ID_CACHE_PATH = "IDs.txt"
_DATASET_NAME = "IqraEval/QuranMB.v2"
_DATASET_SPLIT = "test"
_SUBMISSION_PATH = "random_submission.csv"


def _read_cached_ids(path):
    """Return the stripped, non-blank lines of *path*, or [] if it is missing."""
    try:
        with open(path, encoding="utf-8") as cache:
            return [line.strip() for line in cache if line.strip()]
    except FileNotFoundError:
        return []


def _fetch_ids(token):
    """Stream the dataset split and collect each record's "ID" (or "id") as a string."""
    dataset = load_dataset(
        _DATASET_NAME, split=_DATASET_SPLIT, token=token, streaming=True
    )
    ids = []
    for item in dataset:
        # Records carrying neither key are skipped.
        if "ID" in item:
            ids.append(str(item["ID"]).strip())
        elif "id" in item:
            ids.append(str(item["id"]).strip())
    return ids


def _write_id_cache(path, ids):
    """Write *ids* to *path* in sorted order, one per line."""
    with open(path, "w", encoding="utf-8") as cache:
        cache.writelines(f"{i}\n" for i in sorted(ids))


def main():
    """Build ``random_submission.csv`` with one random label string per ID.

    IDs are read from ``IDs.txt`` when that file exists and has content.
    Otherwise they are streamed from the ``IqraEval/QuranMB.v2`` test
    split, which requires the ``HF_TOKEN`` environment variable, and then
    cached to ``IDs.txt`` for later runs. Every failure path prints a
    message and returns without writing the CSV.

    Returns:
        None.
    """
    ids = _read_cached_ids(_ID_CACHE_PATH)

    if not ids:
        print("IDs.txt not found or empty. Fetching from HF (IqraEval/QuranMB.v2 split='test')...")
        token = os.environ.get("HF_TOKEN")
        if not token:
            print("HF_TOKEN not set in environment.")
            return

        # Broad catch is deliberate: this is the script's top-level boundary
        # and the hub client can fail in many unrelated ways.
        try:
            ids = _fetch_ids(token)
        except Exception as e:
            print(f"Error fetching IDs: {e}")
            return

        if not ids:
            # Do not create an empty cache file; it would be useless later.
            print("No IDs found.")
            return

        # A failed cache write must not discard IDs that were fetched
        # successfully, so it is reported separately and the run continues.
        try:
            _write_id_cache(_ID_CACHE_PATH, ids)
        except OSError as e:
            print(f"Could not cache IDs to IDs.txt: {e}")
        else:
            print(f"Cached {len(ids)} IDs to IDs.txt")

    print(f"Generating submission for {len(ids)} IDs...")
    df = pd.DataFrame({
        "ID": ids,
        "Labels": [generate_random_phonemes() for _ in ids],
    })
    df.to_csv(_SUBMISSION_PATH, index=False)
    print(f"Generated random_submission.csv with {len(df)} rows.")
# Run the generator only when this file is executed as a script, so that
# importing it (for example to reuse the helpers) has no side effects.
if __name__ == "__main__":
    main()