| """ |
| task1_easy.py |
| ============= |
| Task 1 — Dataset Metadata Validator (Easy) |
| OpenEnv Project | Meta × Hugging Face Hackathon |
| |
| What it does: |
| Fetches a HuggingFace dataset's metadata via API and checks |
| if all required fields are present and complete. |
| |
| Checks (11): |
| 1. License 7. Language |
| 2. Description 8. Date created |
| 3. Source 9. Number of rows |
| 4. Task type 10. Tags |
| 5. Column descriptions 11. Citation |
| 6. Author |
| |
| Usage: |
| python task1_easy.py |
| → Enter HuggingFace dataset URL when prompted |
| → Copy the JSON output into grader1.py |
| |
| Requirements: |
| pip install requests |
| """ |
| import requests |
| import json |
|
|
def extract_dataset_name(url):
    """Return the Hugging Face dataset id contained in *url*.

    Accepts either a full Hugging Face URL (for example
    ``https://huggingface.co/datasets/owner/name``) or a bare dataset id
    (``owner/name`` or ``name``).

    Args:
        url: URL or dataset id as typed by the user.

    Returns:
        The dataset id. For Hugging Face URLs the query string, the
        fragment, surrounding slashes and any path after the id
        (``/viewer``, ``/tree/main`` ...) are dropped. Input that does not
        mention ``huggingface.co`` is returned with surrounding whitespace
        removed and is otherwise untouched.
    """
    text = url.strip()
    if "huggingface.co" not in text:
        return text

    # Query strings and fragments are never part of a dataset id.
    for delimiter in ("?", "#"):
        text = text.split(delimiter, 1)[0]

    _, found, tail = text.partition("huggingface.co/")
    if not found:
        # Host without any path: there is nothing to cut out.
        return text.strip("/")

    segments = [part for part in tail.split("/") if part]
    # Match "datasets" as a whole path segment, so an owner such as
    # "open-datasets" is not mistaken for the route prefix.
    if "datasets" in segments:
        segments = segments[segments.index("datasets") + 1:]

    # A dataset id is "name" or "owner/name"; anything after that is a
    # sub-page of the dataset, not part of the id.
    return "/".join(segments[:2])
|
|
def _parse_hf_metadata(raw, dataset_name):
    """Flatten a Hub API dataset payload (a dict) into the validator record."""
    card = raw.get("cardData")
    if not isinstance(card, dict):
        # cardData may be missing or null; treat both as "no card metadata".
        card = {}

    # The card title is preferred; the free-text description is only a
    # fallback and is capped at 300 characters.
    description = ""
    if card.get("pretty_name"):
        description = card["pretty_name"]
    elif raw.get("description"):
        description = str(raw["description"])[:300]

    # License may be a single value or a list; report the first entry.
    license_info = ""
    license_val = card.get("license")
    if license_val:
        license_info = license_val[0] if isinstance(license_val, list) else str(license_val)

    # At most three languages are reported, comma separated.
    language = ""
    lang_val = card.get("language")
    if lang_val:
        if isinstance(lang_val, list):
            language = ", ".join(str(item) for item in lang_val[:3])
        else:
            language = str(lang_val)

    tags_raw = raw.get("tags")
    tags = tags_raw[:5] if isinstance(tags_raw, list) else []

    # Like license, task_categories may be a list or a single value.
    task_type = ""
    categories = card.get("task_categories")
    if categories:
        task_type = categories[0] if isinstance(categories, list) else str(categories)

    author = raw.get("author") or ""
    created = raw.get("createdAt")
    date_created = str(created)[:10] if created else ""
    citation = "present" if card.get("citation") else ""

    # dataset_info is a dict for single-config datasets and a list of dicts
    # for multi-config ones; only the first config is inspected.
    ds_info = card.get("dataset_info")
    if isinstance(ds_info, list):
        ds_info = ds_info[0] if ds_info else {}
    if not isinstance(ds_info, dict):
        ds_info = {}

    # Features appear either as a mapping or as a list of
    # {"name": ..., "dtype": ...} entries (the dataset-card YAML layout).
    features = ds_info.get("features")
    if isinstance(features, dict):
        column_names = list(features.keys())
    elif isinstance(features, list):
        column_names = [
            item["name"]
            for item in features
            if isinstance(item, dict) and "name" in item
        ]
    else:
        column_names = []

    num_rows = 0
    for split in ds_info.get("splits") or []:
        if isinstance(split, dict):
            num_rows += split.get("num_examples") or 0

    return {
        "dataset_name": dataset_name,
        "source": "Hugging Face",
        "author": author,
        "description": description,
        "license": license_info,
        "num_rows": num_rows,
        "num_columns": len(column_names),
        "column_names": column_names,
        "column_descriptions": {},
        "task_type": task_type,
        "language": language,
        "date_created": date_created,
        "tags": tags,
        "citation": citation,
    }


def fetch_from_huggingface(dataset_name):
    """Fetch a dataset's metadata from the Hugging Face Hub API.

    Requests ``https://huggingface.co/api/datasets/<dataset_name>`` with a
    10 second timeout and flattens the JSON payload into a plain dict.

    Args:
        dataset_name: Dataset id such as ``"owner/name"``.

    Returns:
        A dict with the keys ``dataset_name``, ``source``, ``author``,
        ``description``, ``license``, ``num_rows``, ``num_columns``,
        ``column_names``, ``column_descriptions`` (always an empty dict),
        ``task_type``, ``language``, ``date_created``, ``tags`` and
        ``citation``; or ``None`` when the name is empty, the API does not
        answer with status 200, or any error occurs. Failures are reported
        on stdout rather than raised.
    """
    if not dataset_name or not str(dataset_name).strip():
        # An empty id would query the dataset listing endpoint instead.
        print("ERROR: Dataset name is empty!")
        return None

    try:
        url = f"https://huggingface.co/api/datasets/{dataset_name}"
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"ERROR: Dataset '{dataset_name}' not found!")
            return None
        raw = response.json()
        if not isinstance(raw, dict):
            print(f"ERROR: Unexpected response for dataset '{dataset_name}'")
            return None
        return _parse_hf_metadata(raw, dataset_name)
    except Exception as e:
        # Deliberate best-effort boundary: callers only distinguish
        # "got metadata" from None, so every failure is printed, not raised.
        print(f"ERROR: {str(e)}")
        return None
|
|
def inspect_dataset(dataset):
    """Grade the completeness of a dataset metadata record.

    Each checked field counts as missing when ``dataset.get(field)`` is
    falsy (absent, ``None``, empty string/list/dict or ``0``).

    Args:
        dataset: Mapping of metadata field names to values.

    Returns:
        A dict with ``missing_fields``, one list per severity
        (``critical_fields``, ``high_fields``, ``medium_fields``,
        ``low_fields``), ``issues_found`` (one entry per missing field),
        ``quality_score`` (between 0.01 and 0.99, two decimals),
        ``severity_summary`` (counts per severity), ``recommendation``
        and ``verdict`` (``rejected``, ``incomplete``,
        ``needs_minor_fixes`` or ``complete``).
    """
    # Checked fields with their severity, in reporting order.
    checks = (
        ("license", "critical"),
        ("description", "critical"),
        ("source", "critical"),
        ("task_type", "high"),
        ("column_descriptions", "high"),
        ("author", "high"),
        ("language", "medium"),
        ("date_created", "medium"),
        ("num_rows", "medium"),
        ("tags", "low"),
        ("citation", "low"),
    )
    reasons = {
        "license": "Cannot use dataset legally without a license",
        "description": "Users dont know what this dataset is about",
        "source": "Unknown where this dataset came from",
        "task_type": "Unknown if classification, regression etc",
        "column_descriptions": "Users dont know what each column means",
        "author": "Unknown who created this dataset",
        "language": "Unknown what language the data is in",
        "date_created": "Unknown when dataset was created",
        "num_rows": "Unknown how much data is available",
        "tags": "Dataset is hard to discover without tags",
        "citation": "Cannot cite this dataset in research",
    }

    grouped = {"critical": [], "high": [], "medium": [], "low": []}
    missing = []
    issues = []
    for name, level in checks:
        if dataset.get(name):
            continue
        missing.append(name)
        grouped[level].append(name)
        issues.append({
            "field": name,
            "issue": f"{name} is missing or empty",
            "severity": level,
            "reason": reasons[name],
        })

    critical = grouped["critical"]
    high = grouped["high"]
    medium = grouped["medium"]
    low = grouped["low"]

    # Verdict ladder, most severe condition first.
    if len(critical) >= 2:
        verdict = "rejected"
    elif critical or len(high) >= 2:
        verdict = "incomplete"
    elif missing:
        verdict = "needs_minor_fixes"
    else:
        verdict = "complete"

    # NOTE(review): the denominator is 12 although 11 fields are checked
    # above; kept unchanged — confirm against the grader.
    field_total = 12
    fill_ratio = (field_total - len(missing)) / field_total
    # Every missing critical field costs an extra 0.15.
    raw_score = fill_ratio - len(critical) * 0.15
    quality_score = round(min(0.99, max(0.01, raw_score)), 2)

    labelled = (
        ("URGENT", critical),
        ("HIGH", high),
        ("MEDIUM", medium),
        ("LOW", low),
    )
    advice = [
        f"{label} - Add: {', '.join(names)}"
        for label, names in labelled
        if names
    ]
    recommendation = ". ".join(advice) or "Dataset metadata is complete."

    return {
        "missing_fields": missing,
        "critical_fields": critical,
        "high_fields": high,
        "medium_fields": medium,
        "low_fields": low,
        "issues_found": issues,
        "quality_score": quality_score,
        "severity_summary": {
            "critical": len(critical),
            "high": len(high),
            "medium": len(medium),
            "low": len(low),
        },
        "recommendation": recommendation,
        "verdict": verdict,
    }
|
|
|
|
| |
| |
| |
|
|
if __name__ == "__main__":
    # Interactive entry point: ask for a dataset URL, fetch its metadata,
    # inspect it and print the JSON sections to copy into the grader.
    rule = "=" * 60

    print(rule)
    print("TASK 1 - Dataset Quality Inspector")
    print(rule)

    user_url = input("Paste Hugging Face dataset URL: ").strip()
    dataset_name = extract_dataset_name(user_url)

    print(f"\nFetching '{dataset_name}' from Hugging Face...")
    dataset = fetch_from_huggingface(dataset_name)

    if dataset is not None:
        inspection = inspect_dataset(dataset)

        # Key order controls the order of the printed JSON.
        report_keys = (
            "missing_fields",
            "critical_fields",
            "high_fields",
            "medium_fields",
            "low_fields",
            "issues_found",
            "severity_summary",
            "quality_score",
            "recommendation",
        )
        action_keys = (
            "missing_fields",
            "issues_found",
            "quality_score",
            "severity_summary",
            "recommendation",
            "verdict",
        )
        quality_report = {key: inspection[key] for key in report_keys}
        agent_action = {"task_id": "task1_easy"}
        agent_action.update((key, inspection[key]) for key in action_keys)

        print("\n" + rule)
        print("RESULTS IN JSON FORMAT")
        print(rule)
        numbered_sections = (
            ("\n1. DATASET INFO:", dataset),
            ("\n2. QUALITY REPORT:", quality_report),
            ("\n3. VERDICT:", inspection["verdict"]),
        )
        for heading, payload in numbered_sections:
            print(heading)
            print(json.dumps(payload, indent=2))

        # The two blocks meant to be pasted into the grader script.
        grader_sections = (
            ("COPY FOR GRADER - paste agent_action into YOUR_ANSWER", agent_action),
            ("COPY FOR GRADER - paste dataset_info into YOUR_DATASET", dataset),
        )
        for heading, payload in grader_sections:
            print("\n" + rule)
            print(heading)
            print(rule)
            print(json.dumps(payload, indent=2))
    else:
        print("Could not fetch. Check URL and try again.")
|
|
|
|
| |
| |
|
|
| |
# Task-specific alias for extract_dataset_name; presumably lets other modules
# import the task 1 helper under an unambiguous name — confirm against callers.
extract_dataset_name_t1 = extract_dataset_name
|
|
# Static descriptor for task 1: identifier, display name, difficulty, turn
# limit, a description listing the checked fields, and a score range.
# NOTE(review): how expected_score_range is consumed is not visible here.
TASK1 = dict(
    task_id="task1_easy",
    name="Dataset Metadata Validator",
    difficulty="easy",
    max_turns=1,
    description=" ".join([
        "Inspect HuggingFace dataset metadata for missing/incomplete fields.",
        "Check: license, description, source, task_type, column_descriptions,",
        "author, language, date_created, num_rows, tags, citation.",
    ]),
    expected_score_range=[0.70, 0.90],
)
|
|