linvest21's picture
download
raw
3.14 kB
# Top-level pipeline switches (all booleans).
auto_download_public_sources: true
promote_approved_sources_to_training: true
require_human_approval_for_restricted_or_unclear: true

# License status labels, grouped by key name into approved / review / blocked.
approved_license_statuses:
  - government_public
  - public_domain
  - permissive
review_license_statuses:
  - unknown
  - copyright_notice
  - no_derivatives
  - non_commercial
  - restricted_download
blocked_license_statuses:
  - private
  - paywalled
  - credentialed
  - malware_risk
  - blocked_scheme
# Source-quality certification settings.
# The child keys are indented so they load as members of this mapping; at the
# parent's column the parent would load as null and the settings would become
# top-level keys.
source_quality_certification:
  required_before_download: true
  method: transparent_local_ai_source_quality_classifier_v1
  # Numeric thresholds; their scales/units are not stated in this file.
  min_training_score: 4.0
  min_verification_score: 2.0
  min_content_words: 120
  min_content_reasoning_density: 0.025
  require_critical_reasoning_when_gate_fails: true
  reject_low_signal_bulk_text_when_critical_pass_fails: true
# Source risk limits.
# Child keys are indented so they belong to `source_risk` instead of loading
# as top-level keys next to a null `source_risk`.
source_risk:
  max_review_sources_per_round: 10
  max_download_bytes: 50000000
  # NOTE(review): `file` is allowed alongside `https`; confirm local-file
  # sources are intended when automatic downloading is enabled.
  allowed_schemes: [https, file]
  disallowed_url_hints: [login, signin, checkout, cart, subscribe]
# Live discovery settings.
# Child keys are indented so they belong to `live_discovery`; the long lists
# are written as block sequences, one string per line, for readable diffs.
live_discovery:
  enabled: true
  provider: sec_api_first
  duckduckgo_fallback_enabled: false
  max_queries: 24
  max_results: 40
  max_results_per_query: 8
  preferred_domains:
    - sec.gov
    - investor.gov
    - federalreserve.gov
    - treasury.gov
    - fasb.org
    - stern.nyu.edu
    - cfainstitute.org
  # Templates contain {domain}, {asset_terms} and {role_terms} placeholders;
  # substitution is presumably done by the consumer - confirm.
  query_templates:
    - "site:{domain} {asset_terms} {role_terms}"
    - "site:{domain} red flags checklist {asset_terms} {role_terms}"
    - "site:{domain} case study worked example {asset_terms} {role_terms}"
    - "site:{domain} pass fail reasoning {asset_terms} {role_terms}"
    - "site:{domain} investor bulletin {asset_terms} {role_terms}"
    - "site:{domain} PDF {asset_terms} {role_terms}"
  retry_query_terms:
    - "red flags checklist case study pass fail reasoning"
    - "worked examples critical decision financial statement analysis"
    - "reject approve because accounting quality warning signs"
    - "investor bulletin financial statements valuation disclosure"
    - "public company analysis cash flow return on invested capital"
    - "equity research accounting quality capital allocation moat"
# Per-format handling rules, keyed by format name.
# Each format gets its own nested mapping. Without the nesting, `normalize`,
# `train`, `validate` and `min_train_text_chars` repeat as duplicate keys in
# one mapping and most parsers silently keep only the last value.
formats:
  pdf:
    normalize: true
    train: true
    validate: true
    min_train_text_chars: 500
    min_validation_text_chars: 1000
  jsonl:
    normalize: true
    train: true
    validate: true
    min_train_text_chars: 500
    min_validation_text_chars: 1000
  html:
    normalize: true
    train: true
    validate: false
    min_train_text_chars: 500
    # NOTE(review): a validation threshold is set although `validate` is
    # false for this format; confirm whether it is read.
    min_validation_text_chars: 5000
    max_boilerplate_ratio: 0.99
  html_index:
    normalize: true
    train: false
    validate: false
    min_train_text_chars: 500
    max_boilerplate_ratio: 0.95
  txt:
    normalize: true
    train: true
    validate: false
    min_train_text_chars: 500
  md:
    normalize: true
    train: true
    validate: false
    min_train_text_chars: 500
  # Entry named `unknown`: every stage is switched off.
  unknown:
    normalize: false
    train: false
    validate: false
# Stall-breakout settings.
# Child keys are indented so they belong to `stall_breakout` instead of
# loading as top-level keys next to a null `stall_breakout`.
stall_breakout:
  max_rounds: 3
  max_new_sources_per_round: 10
  max_source_discovery_attempts: 5
  max_breakout_reasoning_records: 300
  # NOTE(review): this minimum (10) exceeds max_source_discovery_attempts (5);
  # confirm the two counters use different scopes (e.g. per round vs. total),
  # otherwise this threshold may never be reached.
  min_discovery_attempts_before_reasoning_halt: 10
  severe_regression_aggregate_abs: 0.05
  severe_regression_critical_abs: 0.05
  stop_when_quality_gate_passes: true
  stop_when_source_budget_exhausted: true

Xet Storage Details

Size:
3.14 kB
·
Xet hash:
d2bfcbc1beb706e8de02aeeea1d0a9e5c9793432365c55b27959d7b26a4a938b

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.