File size: 1,457 Bytes
d924ea0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# tools/build_meta_only.py
import os, json
from pathlib import Path

# Dataset locations probed in order; the first one that exists is used.
# Both the "cosmetics" and "cosmetic" spellings are tried, first under
# data/ and then in the current working directory.
DATA_CANDIDATES = [
    "data/cosmetics_full_en_zh.jsonl",
    "data/cosmetic_full_en_zh.jsonl",
    "cosmetics_full_en_zh.jsonl",
    "cosmetic_full_en_zh.jsonl",
]

# Output directory for generated index files. It is created when this module
# is loaded so that later writes to OUT_META cannot fail on a missing folder.
OUT_DIR = Path("indexes")
OUT_DIR.mkdir(exist_ok=True, parents=True)

# Destination of the metadata-only JSON file.
OUT_META = OUT_DIR.joinpath("cosmetics_meta.json")

def find_dataset():
    """Return the first entry of ``DATA_CANDIDATES`` that exists on disk.

    Candidates are checked in list order relative to the current working
    directory, and the matching entry is returned unchanged (as a string).

    Raises:
        FileNotFoundError: if none of the candidate paths exists.
    """
    match = next(
        (candidate for candidate in DATA_CANDIDATES if os.path.exists(candidate)),
        None,
    )
    if match is None:
        raise FileNotFoundError("Dataset JSONL not found in expected locations.")
    return match

def main():
    """Build the metadata-only JSON file from the JSONL dataset.

    Locates the dataset with ``find_dataset()``, parses one JSON object per
    line, keeps only the fields listed in ``fields`` (missing keys become
    ``None``), and writes the resulting list to ``OUT_META`` as indented,
    non-ASCII-escaped UTF-8 JSON. A one-line summary is printed at the end.

    Raises:
        FileNotFoundError: propagated from ``find_dataset()`` when no
            candidate dataset file exists.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # Keys copied from each record, in the order they appear in the output.
    fields = (
        "id",
        "brand_en",
        "brand_zh",
        "product_name_en",
        "product_name_zh",
        "category_en",
        "category_zh",
        "price_value",
        "price_currency",
        "source_url",
        "image_url",
    )

    ds = find_dataset()
    meta = []
    # Read inside a context manager so the input handle is always closed,
    # and stream line by line instead of holding every full record in memory.
    with open(ds, "r", encoding="utf-8") as src:
        for line in src:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at end of file) are
                # not records; json.loads would raise on them.
                continue
            record = json.loads(line)
            meta.append({key: record.get(key) for key in fields})

    with open(OUT_META, "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)
    print(f"✅ wrote {len(meta)} records -> {OUT_META}")

# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()