# Scrap-Dji/scripts/bootstrap_typesense.py
# joel
# Initial deployment: Scrap-Dji with API
# dfdddb1
import json
import os
from indexer.typesense_indexer import create_collection_if_not_exists, index_document
# Collection schema handed to create_collection_if_not_exists() by main().
# Every field is declared with type "string"; the three fields carrying
# "facet": True are langue, type_document and pays.
SCHEMA = {
    "name": "documents",
    "fields": [
        # Identifier and textual content.
        {"name": "id", "type": "string"},
        {"name": "titre", "type": "string"},
        {"name": "texte", "type": "string"},
        # Facet-enabled fields.
        {"name": "langue", "type": "string", "facet": True},
        {"name": "type_document", "type": "string", "facet": True},
        {"name": "pays", "type": "string", "facet": True},
        # Remaining plain string fields.
        {"name": "source_url", "type": "string"},
        {"name": "date", "type": "string"},
    ],
}
def main():
    """Create the ``documents`` collection and seed it from a JSONL file.

    The collection is created from ``SCHEMA`` through
    ``create_collection_if_not_exists``. The seed file path is read from the
    ``SEED_JSONL`` environment variable and falls back to
    ``datasets/ewe/final/ewe_corpus.jsonl``. When that file exists, every
    non-blank line is parsed as one JSON document and passed to
    ``index_document``; when it does not, only the collection is created.

    Each document's ``id`` is taken from its ``uuid`` key, then its ``id``
    key, and otherwise generated as 16 random hex characters.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    create_collection_if_not_exists(SCHEMA)
    path = os.environ.get("SEED_JSONL", "datasets/ewe/final/ewe_corpus.jsonl")

    # Guard clause: nothing to seed, the collection alone is enough.
    if not os.path.exists(path):
        print("Aucun fichier de seed trouvé, collection créée sans documents.")
        return

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # A blank or whitespace-only line is not a JSON document;
            # json.loads would raise on it and abort the rest of the seed.
            if not line.strip():
                continue
            doc = json.loads(line)
            doc_id = doc.get("uuid") or doc.get("id") or os.urandom(8).hex()
            # SCHEMA declares "id" as a string, so coerce numeric ids
            # coming from the seed file.
            doc["id"] = str(doc_id)
            index_document("documents", doc)
    print("Seed terminé.")
# Run the bootstrap only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()