# File-viewer page residue (kept for provenance): Spaces: Sleeping; File size: 1,504 Bytes; 7d9e4d2
import json
import pandas as pd
import os
def normalize_url(url):
    """Return a canonical form of *url* suitable for set-membership comparison.

    Non-string input (e.g. ``None`` or a float NaN) yields an empty string.
    For strings, surrounding whitespace is stripped, every occurrence of the
    substring ``/solutions`` is removed, and trailing slashes are dropped.
    """
    if isinstance(url, str):
        trimmed = url.strip()
        # Remove the '/solutions' path fragment wherever it occurs so that
        # URLs differing only by that fragment compare equal.
        without_fragment = trimmed.replace('/solutions', '')
        return without_fragment.rstrip('/')
    return ""
def check_integrity():
    """Report ground-truth assessment URLs that are absent from the raw data.

    Reads ``shl_recommender/data/raw_assessments.json`` (an iterable of
    records, each optionally carrying a ``'url'`` key) and
    ``shl_recommender/data/train.csv`` (which must have an
    ``Assessment_url`` column). Both sides are compared after passing
    through ``normalize_url``. Every ground-truth URL whose normalized form
    is not in the raw data is printed, followed by a SUCCESS/FAILURE summary.

    Returns:
        None. All results are written to standard output.

    Raises:
        FileNotFoundError: if either input file does not exist.
        KeyError: if ``train.csv`` has no ``Assessment_url`` column.
    """
    print("Loading raw_assessments.json...")
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale-dependent default encoding.
    with open('shl_recommender/data/raw_assessments.json', 'r', encoding='utf-8') as f:
        raw_data = json.load(f)

    # Create a set of normalized URLs from raw data. Non-dict records are
    # skipped so that `'url' in item` is never a substring test on a string.
    raw_urls = {
        normalize_url(item['url'])
        for item in raw_data
        if isinstance(item, dict) and 'url' in item
    }
    # normalize_url maps non-string values to "", which is not a real
    # assessment; keeping it would inflate the count and could mask
    # empty ground-truth entries.
    raw_urls.discard("")
    print(f"Loaded {len(raw_urls)} unique assessments from raw data.")

    print("Loading train.csv...")
    df = pd.read_csv('shl_recommender/data/train.csv')

    # Get all unique ground truth URLs. Blank cells are read as NaN and
    # unique() would keep NaN as a value, so drop them first to avoid a
    # spurious "MISSING: nan" report.
    gt_urls = df['Assessment_url'].dropna().unique()
    print(f"Found {len(gt_urls)} unique ground truth assessments in train.csv.")

    missing_count = 0
    print("\n--- Missing Assessments ---")
    for url in gt_urls:
        if normalize_url(url) not in raw_urls:
            # Print the original (un-normalized) value so it can be located
            # in train.csv as written.
            print(f"MISSING: {url}")
            missing_count += 1

    if missing_count == 0:
        print("\nSUCCESS: All ground truth assessments are present in raw_assessments.json!")
    else:
        print(f"\nFAILURE: {missing_count} ground truth assessments are MISSING from raw_assessments.json.")
        print("These items cannot be retrieved because they are not in the index.")
# Script entry point: run the integrity check only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    check_integrity()
|