"""
task1_easy.py
=============
Task 1 — Dataset Metadata Validator (Easy)
OpenEnv Project | Meta × Hugging Face Hackathon

What it does:
    Fetches a Hugging Face dataset's metadata via the Hub API and checks
    whether the required metadata fields are present and non-empty.

Checks (11), grouped by severity:
    critical: license, description, source
    high:     task_type, column_descriptions, author
    medium:   language, date_created, num_rows
    low:      tags, citation

Usage:
    python task1_easy.py
    → Enter HuggingFace dataset URL when prompted
    → Copy the JSON output into grader1.py

Requirements:
    pip install requests
"""

import json

try:
    import requests
except ImportError:
    # Only fetch_from_huggingface() needs the network client; the guard keeps
    # the offline helpers (inspect_dataset, TASK1, ...) importable without it.
    requests = None


# Path segments that follow a repo id on Hub web pages (viewer, file browser,
# discussions, ...). They are not part of the dataset id.
_SUBPAGE_SEGMENTS = frozenset(
    {"viewer", "tree", "blob", "resolve", "raw", "discussions", "commits", "commit"}
)

# (field, severity, reason) for every metadata check, in reporting order.
_FIELD_CHECKS = (
    ("license", "critical", "Cannot use dataset legally without a license"),
    ("description", "critical", "Users dont know what this dataset is about"),
    ("source", "critical", "Unknown where this dataset came from"),
    ("task_type", "high", "Unknown if classification, regression etc"),
    ("column_descriptions", "high", "Users dont know what each column means"),
    ("author", "high", "Unknown who created this dataset"),
    ("language", "medium", "Unknown what language the data is in"),
    ("date_created", "medium", "Unknown when dataset was created"),
    ("num_rows", "medium", "Unknown how much data is available"),
    ("tags", "low", "Dataset is hard to discover without tags"),
    ("citation", "low", "Cannot cite this dataset in research"),
)

# Denominator of the base quality score.
# NOTE(review): only 11 fields are checked above, so the denominator is one
# larger than the number of checks (possibly counting dataset_name, which is
# always present). Kept at 12 because external graders may depend on the
# resulting scores — confirm before changing.
_TOTAL_FIELDS = 12


def extract_dataset_name(url):
    """Return the dataset repo id contained in a Hugging Face URL.

    Input that does not mention ``huggingface.co`` is assumed to already be a
    repo id and is returned with surrounding whitespace removed.

    For URLs, the query string and fragment are dropped, the text after the
    first ``datasets/`` marker (or after the host when there is none) is
    taken, and trailing web-page segments such as ``/viewer/...`` or
    ``/tree/main`` are removed, leaving ``name`` or ``namespace/name``.
    """
    if "huggingface.co" not in url:
        return url.strip()

    # Drop "?query" and "#fragment" first so they can never end up in the id.
    path = url.strip().split("?", 1)[0].split("#", 1)[0]

    if "datasets/" in path:
        # First occurrence: a repo id may itself contain "datasets/"
        # (e.g. "org/my-datasets/...").
        path = path.split("datasets/", 1)[1]
    else:
        path = path.split("huggingface.co/")[-1]

    segments = [part.strip() for part in path.split("/") if part.strip()]

    # "<name>/viewer/..." is a sub-page of a dataset without a namespace.
    # A lone "<ns>/viewer" stays untouched because it may be a real repo id.
    if len(segments) >= 3 and segments[1] in _SUBPAGE_SEGMENTS:
        return segments[0]

    # Repo ids have at most two segments; anything after them is a sub-page.
    return "/".join(segments[:2])


def _first_config(dataset_info):
    """Return the first config mapping of a ``dataset_info`` entry, or ``{}``."""
    if isinstance(dataset_info, dict):
        return dataset_info
    if isinstance(dataset_info, list):
        for entry in dataset_info:
            if isinstance(entry, dict):
                return entry
    return {}


def _column_names(features):
    """Return column names from a mapping or a list of ``{"name": ...}`` items."""
    if isinstance(features, dict):
        return list(features)
    if isinstance(features, list):
        return [
            item["name"]
            for item in features
            if isinstance(item, dict) and item.get("name")
        ]
    return []


def _count_rows(splits):
    """Sum ``num_examples`` over the split entries; malformed entries count 0."""
    if not isinstance(splits, list):
        return 0
    return sum(
        split.get("num_examples") or 0
        for split in splits
        if isinstance(split, dict)
    )


def build_dataset_info(dataset_name, raw):
    """Convert a decoded Hub API response into the flat metadata record.

    Args:
        dataset_name: Repo id the response belongs to; copied into the result.
        raw: Decoded JSON object of the dataset API response.

    Returns:
        A dict with the keys ``dataset_name``, ``source``, ``author``,
        ``description``, ``license``, ``num_rows``, ``num_columns``,
        ``column_names``, ``column_descriptions``, ``task_type``,
        ``language``, ``date_created``, ``tags`` and ``citation``.
        Values that are absent (or null) in ``raw`` become empty values.
    """
    # "cardData" may be missing *or* explicitly null; treat both as empty.
    card = raw.get("cardData")
    if not isinstance(card, dict):
        card = {}

    # The card's pretty name wins over the (truncated) long description.
    description = ""
    if card.get("pretty_name"):
        description = card["pretty_name"]
    elif raw.get("description"):
        description = str(raw["description"])[:300]

    license_info = ""
    license_val = card.get("license")
    if license_val:
        license_info = license_val[0] if isinstance(license_val, list) else str(license_val)

    language = ""
    lang_val = card.get("language")
    if lang_val:
        if isinstance(lang_val, list):
            # Cap the list so the field stays short for multilingual datasets.
            language = ", ".join(str(code) for code in lang_val[:3])
        else:
            language = str(lang_val)

    # Accept a bare string as well as a list, like license and language do.
    task_type = ""
    categories = card.get("task_categories")
    if isinstance(categories, str):
        task_type = categories
    elif isinstance(categories, list) and categories:
        task_type = categories[0]

    created = raw.get("createdAt")
    date_created = str(created)[:10] if created else ""

    # Columns and row counts are both read from the same (first) config so
    # the two numbers describe the same table.
    config = _first_config(card.get("dataset_info"))
    column_names = _column_names(config.get("features"))
    num_rows = _count_rows(config.get("splits"))

    return {
        "dataset_name": dataset_name,
        "source": "Hugging Face",
        "author": raw.get("author") or "",
        "description": description,
        "license": license_info,
        "num_rows": num_rows,
        "num_columns": len(column_names),
        "column_names": column_names,
        "column_descriptions": {},
        "task_type": task_type,
        "language": language,
        "date_created": date_created,
        "tags": (raw.get("tags") or [])[:5],
        "citation": "present" if card.get("citation") else "",
    }


def fetch_from_huggingface(dataset_name):
    """Fetch a dataset's metadata from the Hugging Face Hub API.

    Args:
        dataset_name: Repo id, e.g. ``"imdb"`` or ``"namespace/name"``.

    Returns:
        The record produced by :func:`build_dataset_info`, or ``None`` when
        the request fails, the API answers with a non-200 status, or the
        response cannot be parsed. Failures are reported on stdout; no
        exception is propagated to the caller.
    """
    if requests is None:
        print("ERROR: the 'requests' package is required (pip install requests)")
        return None

    try:
        url = f"https://huggingface.co/api/datasets/{dataset_name}"
        response = requests.get(url, timeout=10)

        if response.status_code == 404:
            print(f"ERROR: Dataset '{dataset_name}' not found!")
            return None
        if response.status_code != 200:
            # Do not report other failures as "not found".
            print(
                f"ERROR: Hugging Face API returned HTTP {response.status_code} "
                f"for dataset '{dataset_name}'"
            )
            return None

        return build_dataset_info(dataset_name, response.json())
    except Exception as e:
        # Boundary of the "dict or None" contract: report and return None.
        print(f"ERROR: {str(e)}")
        return None


def inspect_dataset(dataset):
    """Check a metadata record for missing fields and grade its completeness.

    A field counts as missing when it is absent from ``dataset`` or its value
    is falsy (empty string/list/dict, ``0`` or ``None``).

    Args:
        dataset: Mapping such as the one returned by
            :func:`fetch_from_huggingface`.

    Returns:
        A dict with:
          * ``missing_fields`` and the per-severity lists ``critical_fields``,
            ``high_fields``, ``medium_fields``, ``low_fields``;
          * ``issues_found``: one ``{field, issue, severity, reason}`` entry
            per missing field;
          * ``quality_score``: filled fraction minus 0.15 per missing
            critical field, clamped to [0.01, 0.99] and rounded to 2 places;
          * ``severity_summary``: count of missing fields per severity;
          * ``recommendation``: human-readable list of fields to add;
          * ``verdict``: ``"rejected"``, ``"incomplete"``,
            ``"needs_minor_fixes"`` or ``"complete"``.
    """
    by_severity = {"critical": [], "high": [], "medium": [], "low": []}
    missing_fields = []
    issues_found = []

    for field, severity, reason in _FIELD_CHECKS:
        if dataset.get(field):
            continue
        missing_fields.append(field)
        by_severity[severity].append(field)
        issues_found.append({
            "field": field,
            "issue": f"{field} is missing or empty",
            "severity": severity,
            "reason": reason,
        })

    critical_fields = by_severity["critical"]
    high_fields = by_severity["high"]
    medium_fields = by_severity["medium"]
    low_fields = by_severity["low"]

    # Verdict
    if len(critical_fields) >= 2:
        verdict = "rejected"
    elif len(critical_fields) == 1 or len(high_fields) >= 2:
        verdict = "incomplete"
    elif missing_fields:
        verdict = "needs_minor_fixes"
    else:
        verdict = "complete"

    # Quality score
    base_score = (_TOTAL_FIELDS - len(missing_fields)) / _TOTAL_FIELDS
    penalty = len(critical_fields) * 0.15
    quality_score = round(max(0.01, min(0.99, base_score - penalty)), 2)

    # Recommendation
    labelled = (
        ("URGENT", critical_fields),
        ("HIGH", high_fields),
        ("MEDIUM", medium_fields),
        ("LOW", low_fields),
    )
    parts = [f"{label} - Add: {', '.join(fields)}" for label, fields in labelled if fields]
    recommendation = ". ".join(parts) if parts else "Dataset metadata is complete."

    return {
        "missing_fields": missing_fields,
        "critical_fields": critical_fields,
        "high_fields": high_fields,
        "medium_fields": medium_fields,
        "low_fields": low_fields,
        "issues_found": issues_found,
        "quality_score": quality_score,
        "severity_summary": {
            "critical": len(critical_fields),
            "high": len(high_fields),
            "medium": len(medium_fields),
            "low": len(low_fields),
        },
        "recommendation": recommendation,
        "verdict": verdict,
    }


# ─────────────────────────────────────────────
# EXPORTS for main.py
# ─────────────────────────────────────────────

# Alias so main.py import works
extract_dataset_name_t1 = extract_dataset_name

TASK1 = {
    "task_id": "task1_easy",
    "name": "Dataset Metadata Validator",
    "difficulty": "easy",
    "max_turns": 1,
    "description": (
        "Inspect HuggingFace dataset metadata for missing/incomplete fields. "
        "Check: license, description, source, task_type, column_descriptions, "
        "author, language, date_created, num_rows, tags, citation."
    ),
    "expected_score_range": [0.70, 0.90],
}


def _print_banner(title):
    """Print ``title`` framed by a blank line and two 60-character rules."""
    print("\n" + "=" * 60)
    print(title)
    print("=" * 60)


def _main():
    """Prompt for a dataset URL, inspect it and print the JSON reports."""
    print("=" * 60)
    print("TASK 1 - Dataset Quality Inspector")
    print("=" * 60)

    user_url = input("Paste Hugging Face dataset URL: ").strip()
    dataset_name = extract_dataset_name(user_url)

    print(f"\nFetching '{dataset_name}' from Hugging Face...")
    dataset = fetch_from_huggingface(dataset_name)

    if dataset is None:
        print("Could not fetch. Check URL and try again.")
        return

    inspection = inspect_dataset(dataset)

    final_output = {
        "dataset_info": dataset,
        "quality_report": {
            "missing_fields": inspection["missing_fields"],
            "critical_fields": inspection["critical_fields"],
            "high_fields": inspection["high_fields"],
            "medium_fields": inspection["medium_fields"],
            "low_fields": inspection["low_fields"],
            "issues_found": inspection["issues_found"],
            "severity_summary": inspection["severity_summary"],
            "quality_score": inspection["quality_score"],
            "recommendation": inspection["recommendation"],
        },
        "verdict": inspection["verdict"],
        # Payload the grader expects as the agent's answer.
        "agent_action": {
            "task_id": "task1_easy",
            "missing_fields": inspection["missing_fields"],
            "issues_found": inspection["issues_found"],
            "quality_score": inspection["quality_score"],
            "severity_summary": inspection["severity_summary"],
            "recommendation": inspection["recommendation"],
            "verdict": inspection["verdict"],
        },
    }

    _print_banner("RESULTS IN JSON FORMAT")

    print("\n1. DATASET INFO:")
    print(json.dumps(final_output["dataset_info"], indent=2))

    print("\n2. QUALITY REPORT:")
    print(json.dumps(final_output["quality_report"], indent=2))

    print("\n3. VERDICT:")
    print(json.dumps(final_output["verdict"], indent=2))

    _print_banner("COPY FOR GRADER - paste agent_action into YOUR_ANSWER")
    print(json.dumps(final_output["agent_action"], indent=2))

    _print_banner("COPY FOR GRADER - paste dataset_info into YOUR_DATASET")
    print(json.dumps(final_output["dataset_info"], indent=2))


# ─────────────────────────────────────────────
# USER INPUT (only runs when executed directly)
# ─────────────────────────────────────────────
if __name__ == "__main__":
    _main()