cmp / dataset_bundle /source_quality_report.json
cjc0013's picture
Upload 30 files
bfdd027 verified
{
"parse_failure_count": 45,
"total_failure_count": 74,
"source_unavailable_failure_count": 29,
"parse_failures_by_group": {
"house_clerk_disclosures": 28,
"house_cpf": 17
},
"parse_failures_by_reason": {
"parse_failed": 45
},
"failure_reason_counts": {
"parse_failed": 45,
"source_unavailable": 29
},
"parse_failures_by_detail": {
"ocr_text_unparseable": 28,
"pdf_text_unparseable_after_ocr": 4,
"pdf_text_unparseable": 13
},
"top_parse_failure_members": [
{
"member_slug": "michael_t_mccaul",
"count": 8
},
{
"member_slug": "ro_khanna",
"count": 6
},
{
"member_slug": "harold_rogers",
"count": 5
},
{
"member_slug": "nicole_malliotakis",
"count": 3
},
{
"member_slug": "harriet_m_hageman",
"count": 3
},
{
"member_slug": "beth_van_duyne",
"count": 3
},
{
"member_slug": "keith_self",
"count": 2
},
{
"member_slug": "christopher_h_smith",
"count": 2
},
{
"member_slug": "ann_wagner",
"count": 2
},
{
"member_slug": "tony_wied",
"count": 2
},
{
"member_slug": "ken_calvert",
"count": 1
},
{
"member_slug": "byron_donalds",
"count": 1
},
{
"member_slug": "mike_ezell",
"count": 1
},
{
"member_slug": "charles_j_chuck_fleischmann",
"count": 1
},
{
"member_slug": "seth_moulton",
"count": 1
},
{
"member_slug": "mark_pocan",
"count": 1
},
{
"member_slug": "david_rouzer",
"count": 1
},
{
"member_slug": "michael_k_simpson",
"count": 1
},
{
"member_slug": "mike_thompson",
"count": 1
}
],
"source_unavailable_by_group": {
"house_votes": 9,
"lda_public_search": 1,
"usaspending_public_site": 233
},
"partial_recovery_counts": {
"reused_existing_raw_artifact": 50702,
"pdf_parse_failed_but_page_context_preserved": 150,
"page_supported_unresolved_request_recovered": 100,
"ocr_candidate_ptr_rows_recovered": 11,
"pdf_url_returned_html_but_recipient_recovered": 11,
"usaspending_award_public_page": 14016,
"recovered_public_award_pages_prelink:14016": 1
},
"guidance_signal_counts": {
"house_cpf needs deterministic fallback or parser hardening": 84,
"house_clerk_disclosures needs deterministic fallback or parser hardening": 28,
"house_votes needs deterministic fallback or parser hardening": 9,
"lda_public_search needs deterministic fallback or parser hardening": 2,
"usaspending_public_site needs deterministic fallback or parser hardening": 1634
},
"house_cpf_filename_recovered_requests": 10,
"house_cpf_unresolved_requests": 1866,
"house_cpf_page_supported_requests": 3173
}