| from __future__ import annotations |
| from typing import Optional |
| from pathlib import Path |
| import yaml |
| from pydantic import BaseModel, Field |
|
|
class BootstrapConfig(BaseModel):
    """Bootstrap settings: ``url`` (presumably the download source) and ``dest``.

    Only ``dest`` is mandatory. How ``retries`` and ``timeout`` are applied is
    decided by whatever code consumes this model, which is not visible here.
    """

    url: str = ""
    # Required. DataConfig.jsonl_path returns this value when bootstrap is set.
    dest: str
    retries: int = 5
    # Presumably seconds -- TODO confirm with the consumer of this model.
    timeout: int = 300
|
|
class ServerConfig(BaseModel):
    """Server section of the configuration: bind address, workers and app module.

    Every field has a default, so the whole section may be omitted.
    """

    host: str = "0.0.0.0"
    port: int = 8000
    workers: int = 1
    # Looks like a "module:attribute" application import string -- verify
    # against the code that launches the server.
    module: str = "app.main:app"
|
|
class ElasticsearchConfig(BaseModel):
    """Elasticsearch section of the configuration.

    ``index`` is optional; when it is unset (or empty) ``DataConfig.es_index``
    falls back to ``DataConfig.index_name``.
    """

    host: str = "http://elasticsearch:9200"
    index: Optional[str] = None
    # Presumably seconds -- TODO confirm with the client that reads it.
    timeout: int = 30
    max_retries: int = 3
|
|
class RegexRule(BaseModel):
    """One find/replace rule, used as an item of ``FieldDef.regex``.

    ``find`` and ``rep`` are both required strings. NOTE(review): the exact
    meaning of ``loop`` (presumably "re-apply until nothing changes") is not
    visible in this module -- confirm against the code that applies the rules.
    """

    find: str
    rep: str
    loop: bool = False
|
|
class Subfield(BaseModel):
    """A nested entry of ``FieldDef.subfields`` mapping ``src`` to ``dest``.

    ``DataConfig.facet_fields`` includes subfields whose ``facet`` flag is set,
    and ``DataConfig.suggest_fields`` lists the ``dest`` of subfields whose
    ``suggest`` flag is set.
    """

    src: str
    dest: str
    type: str = "string"
    es_type: str = "keyword"
    facet: bool = False
    facet_size: int = 10
    suggest: bool = False
    keyword_subfield: bool = False
|
|
class SearchField(BaseModel):
    """A searchable field with an optional sub-field and a relevance boost.

    ``DataConfig.search_fields_es`` renders each entry as ``field`` or
    ``field.subfield``, appending ``^boost`` when ``boost`` differs from 1.0.
    """

    field: str
    boost: float = 1.0
    subfield: Optional[str] = None
|
|
class FieldDef(BaseModel):
    """Definition of a single field listed in ``DataConfig.campos_filter``.

    Only ``field`` is required. Several flags are read by ``DataConfig``
    properties: ``key`` (``key_field``), ``type`` (``flat_fields`` skips
    ``"object"``), ``facet`` (``facet_fields``), ``suggest``
    (``suggest_fields``) and ``exclude_source`` (``source_excludes``). The
    remaining options are consumed outside this module.
    """

    field: str
    type: str = "string"
    key: bool = False
    required: bool = False
    embed: bool = False
    es_type: str = "text"
    index: bool = True
    exclude_source: bool = False
    facet: bool = False
    facet_size: int = 10
    facet_histogram: bool = False
    suggest: bool = False
    keyword_subfield: bool = False
    # NOTE(review): numeric format selector; its meaning is not visible here.
    timestamp_fmt: int = 1
    regex: list[RegexRule] = Field(default_factory=list)
    subfields: list[Subfield] = Field(default_factory=list)
|
|
class DataConfig(BaseModel):
    """Top-level dataset configuration with derived, read-only helpers.

    Holds the index identity (``index_name``, ``id_field``), the optional
    bootstrap section, the server and Elasticsearch sections, the search
    fields and the per-field definitions in ``campos_filter``. Every property
    below is computed purely from those fields.
    """

    index_name: str
    display_name: str = ""
    id_field: str
    data_format: str = "jsonl"
    bootstrap: Optional[BootstrapConfig] = None
    server: ServerConfig = Field(default_factory=ServerConfig)
    elasticsearch: ElasticsearchConfig = Field(default_factory=ElasticsearchConfig)
    analyzer: str = "standard"
    search_fields: list[SearchField] = Field(default_factory=list)
    campos_filter: list[FieldDef] = Field(default_factory=list)
    chunk_size: int = 500

    @property
    def jsonl_path(self) -> str:
        """Return ``bootstrap.dest`` when bootstrap is set, else the default path.

        The default is ``/app/data/<index_name>.jsonl``.
        """
        if self.bootstrap:
            return self.bootstrap.dest
        return f"/app/data/{self.index_name}.jsonl"

    @property
    def es_index(self) -> str:
        """Return the explicit Elasticsearch index, or ``index_name`` if unset/empty."""
        return self.elasticsearch.index or self.index_name

    @property
    def es_host(self) -> str:
        """Return the configured Elasticsearch host URL."""
        return self.elasticsearch.host

    @property
    def key_field(self) -> Optional[FieldDef]:
        """Return the first field definition flagged ``key``, or ``None``."""
        for field_def in self.campos_filter:
            if field_def.key:
                return field_def
        return None

    @property
    def flat_fields(self) -> list[FieldDef]:
        """Return the field definitions whose ``type`` is not ``"object"``."""
        return [
            field_def
            for field_def in self.campos_filter
            if field_def.type != "object"
        ]

    @property
    def facet_fields(self) -> list[FieldDef | Subfield]:
        """Return faceted definitions: each field, then its faceted subfields.

        The result mixes ``FieldDef`` and ``Subfield`` objects and keeps the
        declaration order of ``campos_filter``.
        """
        selected: list[FieldDef | Subfield] = []
        for field_def in self.campos_filter:
            # ``facet`` and ``subfields`` are declared on FieldDef, so they are
            # read directly -- the same way suggest_fields reads its flags.
            if field_def.facet:
                selected.append(field_def)
            selected.extend(sub for sub in field_def.subfields if sub.facet)
        return selected

    @property
    def suggest_fields(self) -> list[str]:
        """Return the names of suggest-enabled fields and subfields.

        Fields contribute their ``field`` name, subfields their ``dest`` name.
        """
        names: list[str] = []
        for field_def in self.campos_filter:
            if field_def.suggest:
                names.append(field_def.field)
            names.extend(sub.dest for sub in field_def.subfields if sub.suggest)
        return names

    @property
    def source_excludes(self) -> list[str]:
        """Return the names of fields flagged ``exclude_source``."""
        return [
            field_def.field
            for field_def in self.campos_filter
            if field_def.exclude_source
        ]

    @property
    def search_fields_es(self) -> list[str]:
        """Return the search fields as ``name`` or ``name^boost`` strings.

        ``name`` is ``field.subfield`` when a subfield is set, otherwise
        ``field``. The ``^boost`` suffix is omitted when the boost is 1.0.
        """
        rendered: list[str] = []
        for entry in self.search_fields:
            if entry.subfield:
                target = f"{entry.field}.{entry.subfield}"
            else:
                target = entry.field
            if entry.boost != 1.0:
                target = f"{target}^{entry.boost}"
            rendered.append(target)
        return rendered
|
|
def load_config(path) -> DataConfig:
    """Read the YAML file at *path* and validate it into a ``DataConfig``.

    The file is opened as UTF-8 and parsed with ``yaml.safe_load``; the parsed
    document is then handed to ``DataConfig.model_validate``, whose validation
    errors propagate to the caller.
    """
    with open(path, encoding="utf-8") as stream:
        raw_settings = yaml.safe_load(stream)
    return DataConfig.model_validate(raw_settings)
|
|