# Page-header residue from the Hugging Face file viewer, kept as comments so
# the module parses: Adithya765 | fix task3 score bounds | 7910de6
"""
task1_easy.py
=============
Task 1 — Dataset Metadata Validator (Easy)
OpenEnv Project | Meta × Hugging Face Hackathon
What it does:
Fetches a HuggingFace dataset's metadata via API and checks
if all required fields are present and complete.
Checks (11):
  1. License               7. Language
  2. Description           8. Date created
  3. Source                9. Number of rows
  4. Task type            10. Tags
  5. Column descriptions  11. Citation
  6. Author
Usage:
python task1_easy.py
    → Enter the HuggingFace dataset URL when prompted
    → Copy the JSON output into grader1.py
Requirements:
pip install requests
"""
import requests
import json
# Path segments that open a sub-page of a repository (file browser, viewer,
# discussions, ...) and therefore are not part of the repository id.
_HF_SUBPAGE_SEGMENTS = frozenset(
    {"tree", "blob", "resolve", "raw", "viewer", "discussions", "commits", "commit"}
)


def extract_dataset_name(url):
    """Return the dataset id contained in a Hugging Face URL.

    Anything that does not mention ``huggingface.co`` is assumed to already
    be a dataset id and is returned with surrounding whitespace removed.

    For Hugging Face URLs the query string, fragment, leading ``api`` /
    ``datasets`` path segments and trailing sub-page segments (for example
    ``/viewer/default/train`` or ``/tree/main``) are dropped, and at most
    the first two remaining segments (``owner/name``) are kept.

    Args:
        url: A dataset page URL or a bare dataset id.

    Returns:
        The dataset id (``name`` or ``owner/name``); an empty string when
        the URL has no path after the host.
    """
    cleaned = url.strip()
    if "huggingface.co" not in cleaned:
        return cleaned

    # Query strings and fragments are never part of a repository id.
    for separator in ("?", "#"):
        cleaned = cleaned.split(separator, 1)[0]

    # Work on path segments instead of splitting on the substring
    # "datasets/", which also matches ids such as "owner/my-datasets/".
    path = cleaned.partition("huggingface.co")[2]
    segments = [part.strip() for part in path.split("/") if part.strip()]

    if segments[:1] == ["api"]:
        segments = segments[1:]
    if segments[:1] == ["datasets"]:
        segments = segments[1:]

    # The first remaining segment is always the owner (or a single-segment
    # dataset name), so only later segments can start a sub-page.
    for index in range(1, len(segments)):
        if segments[index] in _HF_SUBPAGE_SEGMENTS:
            segments = segments[:index]
            break

    return "/".join(segments[:2])
def _first_dataset_info(card):
    """Return the ``dataset_info`` mapping of *card* (first entry if it is a list)."""
    info = card.get("dataset_info")
    if isinstance(info, list):
        info = info[0] if info else None
    return info if isinstance(info, dict) else {}


def _column_names(info):
    """Return the column names declared in ``info["features"]``."""
    features = info.get("features")
    if isinstance(features, dict):
        return list(features.keys())
    if isinstance(features, list):
        # List form: one mapping per column, carrying its name under "name".
        return [
            str(feature["name"])
            for feature in features
            if isinstance(feature, dict) and "name" in feature
        ]
    return []


def _total_rows(info):
    """Return the sum of ``num_examples`` over every split in ``info["splits"]``."""
    splits = info.get("splits")
    if isinstance(splits, dict):
        splits = list(splits.values())
    if not isinstance(splits, list):
        return 0
    total = 0
    for split in splits:
        if not isinstance(split, dict):
            continue
        count = split.get("num_examples")
        if isinstance(count, int):
            total += count
    return total


def _parse_dataset_metadata(dataset_name, raw):
    """Convert the decoded API payload *raw* into the flat metadata dict."""
    # "cardData" may be absent or null; normalise once so that every lookup
    # below is safe.
    card = raw.get("cardData")
    if not isinstance(card, dict):
        card = {}

    if card.get("pretty_name"):
        description = card["pretty_name"]
    elif raw.get("description"):
        description = str(raw["description"])[:300]
    else:
        description = ""

    license_val = card.get("license")
    if not license_val:
        license_info = ""
    elif isinstance(license_val, list):
        license_info = str(license_val[0])
    else:
        license_info = str(license_val)

    lang_val = card.get("language")
    if not lang_val:
        language = ""
    elif isinstance(lang_val, list):
        language = ", ".join(str(item) for item in lang_val[:3])
    else:
        language = str(lang_val)

    tags = raw.get("tags")
    tags = tags[:5] if isinstance(tags, list) else []

    # Accept a single category given as a plain string, the same way
    # license and language are accepted.
    categories = card.get("task_categories")
    if isinstance(categories, list):
        task_type = str(categories[0]) if categories else ""
    elif categories:
        task_type = str(categories)
    else:
        task_type = ""

    created = raw.get("createdAt")
    date_created = str(created)[:10] if created else ""

    # A top-level citation is accepted as well, mirroring the top-level
    # fallback used for the description.
    has_citation = bool(card.get("citation") or raw.get("citation"))

    # Columns and rows are taken from the same (first) dataset_info entry so
    # that they describe the same table.
    info = _first_dataset_info(card)
    column_names = _column_names(info)

    return {
        "dataset_name": dataset_name,
        "source": "Hugging Face",
        "author": raw.get("author") or "",
        "description": description,
        "license": license_info,
        "num_rows": _total_rows(info),
        "num_columns": len(column_names),
        "column_names": column_names,
        "column_descriptions": {},
        "task_type": task_type,
        "language": language,
        "date_created": date_created,
        "tags": tags,
        "citation": "present" if has_citation else "",
    }


def fetch_from_huggingface(dataset_name):
    """Fetch a dataset's metadata from the Hugging Face API.

    Sends a GET request to ``https://huggingface.co/api/datasets/<name>``
    with a 10 second timeout and flattens the JSON response.

    Args:
        dataset_name: Dataset id, e.g. ``"squad"`` or ``"owner/name"``.

    Returns:
        A dict with the keys ``dataset_name``, ``source``, ``author``,
        ``description``, ``license``, ``num_rows``, ``num_columns``,
        ``column_names``, ``column_descriptions``, ``task_type``,
        ``language``, ``date_created``, ``tags`` and ``citation``; or
        ``None`` when the name is empty, the request fails, the status is
        not 200, or the payload cannot be parsed. An ``ERROR:`` line is
        printed in every failure case.
    """
    if not dataset_name or not dataset_name.strip():
        # An empty id would address the dataset listing endpoint instead of
        # a single dataset.
        print("ERROR: No dataset name given!")
        return None

    try:
        url = f"https://huggingface.co/api/datasets/{dataset_name}"
        response = requests.get(url, timeout=10)
        if response.status_code == 404:
            print(f"ERROR: Dataset '{dataset_name}' not found!")
            return None
        if response.status_code != 200:
            print(
                f"ERROR: Could not fetch dataset '{dataset_name}' "
                f"(HTTP {response.status_code})"
            )
            return None
        return _parse_dataset_metadata(dataset_name, response.json())
    except Exception as e:
        # Boundary of the best-effort fetch: callers only distinguish
        # "got metadata" from None, so every failure is reported and mapped
        # to None rather than propagated.
        print(f"ERROR: {str(e)}")
        return None
# One entry per checked metadata field, in report order:
# (field name, severity, reason shown when the field is missing).
_METADATA_CHECKS = (
    ("license", "critical", "Cannot use dataset legally without a license"),
    ("description", "critical", "Users dont know what this dataset is about"),
    ("source", "critical", "Unknown where this dataset came from"),
    ("task_type", "high", "Unknown if classification, regression etc"),
    ("column_descriptions", "high", "Users dont know what each column means"),
    ("author", "high", "Unknown who created this dataset"),
    ("language", "medium", "Unknown what language the data is in"),
    ("date_created", "medium", "Unknown when dataset was created"),
    ("num_rows", "medium", "Unknown how much data is available"),
    ("tags", "low", "Dataset is hard to discover without tags"),
    ("citation", "low", "Cannot cite this dataset in research"),
)

# Label used for each severity in the recommendation text, most urgent first.
_SEVERITY_LABELS = (
    ("critical", "URGENT"),
    ("high", "HIGH"),
    ("medium", "MEDIUM"),
    ("low", "LOW"),
)


def _is_blank_value(value):
    """Return True when *value* is absent, empty, zero or whitespace-only text."""
    if isinstance(value, str):
        return not value.strip()
    return not value


def inspect_dataset(dataset):
    """Report which metadata fields of *dataset* are missing and grade the result.

    Args:
        dataset: Mapping of metadata field name to value. A field counts as
            missing when its key is absent or its value is falsy (``""``,
            ``0``, ``[]``, ``{}``, ``None``) or a whitespace-only string.

    Returns:
        A dict with:
          * ``missing_fields``: names of all missing fields, in check order;
          * ``critical_fields`` / ``high_fields`` / ``medium_fields`` /
            ``low_fields``: the missing fields grouped by severity;
          * ``issues_found``: one ``{field, issue, severity, reason}`` dict
            per missing field;
          * ``quality_score``: float in ``[0.01, 0.99]`` rounded to two
            decimals;
          * ``severity_summary``: count of missing fields per severity;
          * ``recommendation``: human-readable list of what to add;
          * ``verdict``: ``"rejected"``, ``"incomplete"``,
            ``"needs_minor_fixes"`` or ``"complete"``.
    """
    missing_fields = []
    by_severity = {severity: [] for severity, _ in _SEVERITY_LABELS}
    issues_found = []

    for field, severity, reason in _METADATA_CHECKS:
        if not _is_blank_value(dataset.get(field)):
            continue
        missing_fields.append(field)
        by_severity[severity].append(field)
        issues_found.append({
            "field": field,
            "issue": f"{field} is missing or empty",
            "severity": severity,
            "reason": reason,
        })

    critical_fields = by_severity["critical"]
    high_fields = by_severity["high"]
    medium_fields = by_severity["medium"]
    low_fields = by_severity["low"]

    # Verdict: critical gaps dominate, then several high-severity gaps.
    if len(critical_fields) >= 2:
        verdict = "rejected"
    elif len(critical_fields) == 1 or len(high_fields) >= 2:
        verdict = "incomplete"
    elif missing_fields:
        verdict = "needs_minor_fixes"
    else:
        verdict = "complete"

    # Quality score: share of filled fields minus 0.15 per critical gap,
    # clamped to [0.01, 0.99].
    # NOTE(review): the denominator is 12 although 11 fields are checked; it
    # is left unchanged because existing scores may be relied on elsewhere.
    total_fields = 12
    filled_fields = total_fields - len(missing_fields)
    base_score = filled_fields / total_fields
    penalty = len(critical_fields) * 0.15
    quality_score = round(max(0.01, min(0.99, base_score - penalty)), 2)

    parts = [
        f"{label} - Add: {', '.join(by_severity[severity])}"
        for severity, label in _SEVERITY_LABELS
        if by_severity[severity]
    ]
    recommendation = ". ".join(parts) if parts else "Dataset metadata is complete."

    return {
        "missing_fields": missing_fields,
        "critical_fields": critical_fields,
        "high_fields": high_fields,
        "medium_fields": medium_fields,
        "low_fields": low_fields,
        "issues_found": issues_found,
        "quality_score": quality_score,
        "severity_summary": {
            "critical": len(critical_fields),
            "high": len(high_fields),
            "medium": len(medium_fields),
            "low": len(low_fields),
        },
        "recommendation": recommendation,
        "verdict": verdict,
    }
# ─────────────────────────────────────────────
# USER INPUT (only runs when executed directly)
# ─────────────────────────────────────────────
def _run_cli():
    """Prompt for a dataset URL, inspect its metadata and print the JSON reports.

    Prints the dataset info, the quality report and the verdict, followed by
    the two JSON snippets (``agent_action`` and ``dataset_info``) that are
    meant to be pasted into the grader. Returns early, after printing a
    message, when no URL is entered or the metadata cannot be fetched.
    """
    rule = "=" * 60

    def banner(title):
        # Section header: blank line, rule, title, rule.
        print("\n" + rule)
        print(title)
        print(rule)

    print(rule)
    print("TASK 1 - Dataset Quality Inspector")
    print(rule)

    user_url = input("Paste Hugging Face dataset URL: ").strip()
    if not user_url:
        # Nothing to look up; avoid sending a request for an empty name.
        print("No URL entered. Paste a Hugging Face dataset URL and try again.")
        return

    dataset_name = extract_dataset_name(user_url)
    print(f"\nFetching '{dataset_name}' from Hugging Face...")
    dataset = fetch_from_huggingface(dataset_name)
    if dataset is None:
        print("Could not fetch. Check URL and try again.")
        return

    inspection = inspect_dataset(dataset)

    # Key order is kept explicit because it determines the printed JSON.
    report_keys = (
        "missing_fields", "critical_fields", "high_fields", "medium_fields",
        "low_fields", "issues_found", "severity_summary", "quality_score",
        "recommendation",
    )
    action_keys = (
        "missing_fields", "issues_found", "quality_score",
        "severity_summary", "recommendation", "verdict",
    )
    agent_action = {"task_id": "task1_easy"}
    agent_action.update((key, inspection[key]) for key in action_keys)

    final_output = {
        "dataset_info": dataset,
        "quality_report": {key: inspection[key] for key in report_keys},
        "verdict": inspection["verdict"],
        "agent_action": agent_action,
    }

    banner("RESULTS IN JSON FORMAT")
    print("\n1. DATASET INFO:")
    print(json.dumps(final_output["dataset_info"], indent=2))
    print("\n2. QUALITY REPORT:")
    print(json.dumps(final_output["quality_report"], indent=2))
    print("\n3. VERDICT:")
    print(json.dumps(final_output["verdict"], indent=2))

    banner("COPY FOR GRADER - paste agent_action into YOUR_ANSWER")
    print(json.dumps(final_output["agent_action"], indent=2))

    banner("COPY FOR GRADER - paste dataset_info into YOUR_DATASET")
    print(json.dumps(final_output["dataset_info"], indent=2))


if __name__ == "__main__":
    _run_cli()
# ─────────────────────────────────────────────
# EXPORTS for main.py
# ─────────────────────────────────────────────
# main.py imports the URL parser under this task-specific name.
extract_dataset_name_t1 = extract_dataset_name

# Static descriptor of this task: identity, difficulty, turn limit,
# a summary of the checked fields and the expected score range.
TASK1 = dict(
    task_id="task1_easy",
    name="Dataset Metadata Validator",
    difficulty="easy",
    max_turns=1,
    description=" ".join([
        "Inspect HuggingFace dataset metadata for missing/incomplete fields.",
        "Check: license, description, source, task_type, column_descriptions,",
        "author, language, date_created, num_rows, tags, citation.",
    ]),
    expected_score_range=[0.70, 0.90],
)