Buckets:
| auto_download_public_sources: true | |
| promote_approved_sources_to_training: true | |
| require_human_approval_for_restricted_or_unclear: true | |
| approved_license_statuses: [government_public, public_domain, permissive] | |
| review_license_statuses: [unknown, copyright_notice, no_derivatives, non_commercial, restricted_download] | |
| blocked_license_statuses: [private, paywalled, credentialed, malware_risk, blocked_scheme] | |
| source_quality_certification: | |
| required_before_download: true | |
| method: transparent_local_ai_source_quality_classifier_v1 | |
| min_training_score: 4.0 | |
| min_verification_score: 2.0 | |
| min_content_words: 120 | |
| min_content_reasoning_density: 0.025 | |
| require_critical_reasoning_when_gate_fails: true | |
| reject_low_signal_bulk_text_when_critical_pass_fails: true | |
| source_risk: | |
| max_review_sources_per_round: 10 | |
| max_download_bytes: 50000000 | |
| allowed_schemes: [https, file] | |
| disallowed_url_hints: [login, signin, checkout, cart, subscribe] | |
| live_discovery: | |
| enabled: true | |
| provider: sec_api_first | |
| duckduckgo_fallback_enabled: false | |
| max_queries: 24 | |
| max_results: 40 | |
| max_results_per_query: 8 | |
| preferred_domains: [sec.gov, investor.gov, federalreserve.gov, treasury.gov, fasb.org, stern.nyu.edu, cfainstitute.org] | |
| query_templates: ["site:{domain} {asset_terms} {role_terms}", "site:{domain} red flags checklist {asset_terms} {role_terms}", "site:{domain} case study worked example {asset_terms} {role_terms}", "site:{domain} pass fail reasoning {asset_terms} {role_terms}", "site:{domain} investor bulletin {asset_terms} {role_terms}", "site:{domain} PDF {asset_terms} {role_terms}"] | |
| retry_query_terms: ["red flags checklist case study pass fail reasoning", "worked examples critical decision financial statement analysis", "reject approve because accounting quality warning signs", "investor bulletin financial statements valuation disclosure", "public company analysis cash flow return on invested capital", "equity research accounting quality capital allocation moat"] | |
| formats: | |
| pdf: | |
| normalize: true | |
| train: true | |
| validate: true | |
| min_train_text_chars: 500 | |
| min_validation_text_chars: 1000 | |
| jsonl: | |
| normalize: true | |
| train: true | |
| validate: true | |
| min_train_text_chars: 500 | |
| min_validation_text_chars: 1000 | |
| html: | |
| normalize: true | |
| train: true | |
| validate: false | |
| min_train_text_chars: 500 | |
| min_validation_text_chars: 5000 | |
| max_boilerplate_ratio: 0.99 | |
| html_index: | |
| normalize: true | |
| train: false | |
| validate: false | |
| min_train_text_chars: 500 | |
| max_boilerplate_ratio: 0.95 | |
| txt: | |
| normalize: true | |
| train: true | |
| validate: false | |
| min_train_text_chars: 500 | |
| md: | |
| normalize: true | |
| train: true | |
| validate: false | |
| min_train_text_chars: 500 | |
| unknown: | |
| normalize: false | |
| train: false | |
| validate: false | |
| stall_breakout: | |
| max_rounds: 3 | |
| max_new_sources_per_round: 10 | |
| max_source_discovery_attempts: 5 | |
| max_breakout_reasoning_records: 300 | |
| min_discovery_attempts_before_reasoning_halt: 10 | |
| severe_regression_aggregate_abs: 0.05 | |
| severe_regression_critical_abs: 0.05 | |
| stop_when_quality_gate_passes: true | |
| stop_when_source_budget_exhausted: true | |
Xet Storage Details
- Size: 3.14 kB
- Xet hash: d2bfcbc1beb706e8de02aeeea1d0a9e5c9793432365c55b27959d7b26a4a938b
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.