Spaces:

jojonocode
/

Scrap-Dji

Sleeping

File size: 1,196 Bytes

dfdddb1

import json
import os
from indexer.typesense_indexer import create_collection_if_not_exists, index_document

# Collection schema handed to create_collection_if_not_exists() in main().
# Every field is a string; the second element of each pair marks the fields
# that are additionally declared with "facet": True.
SCHEMA = {
    "name": "documents",
    "fields": [
        {
            "name": field_name,
            "type": "string",
            **({"facet": True} if faceted else {}),
        }
        for field_name, faceted in (
            ("id", False),
            ("titre", False),
            ("texte", False),
            ("langue", True),
            ("type_document", True),
            ("pays", True),
            ("source_url", False),
            ("date", False),
        )
    ],
}

def main():
    """Ensure the "documents" collection exists and seed it from a JSONL file.

    The seed path is read from the ``SEED_JSONL`` environment variable and
    defaults to ``datasets/ewe/final/ewe_corpus.jsonl``. Each non-blank line
    of the file is parsed as one JSON object and passed to
    ``index_document``. If the file does not exist, only
    ``create_collection_if_not_exists`` is called and a message is printed.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    create_collection_if_not_exists(SCHEMA)
    path = os.environ.get("SEED_JSONL", "datasets/ewe/final/ewe_corpus.jsonl")
    if not os.path.exists(path):
        print("Aucun fichier de seed trouvé, collection créée sans documents.")
        return

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (typically a trailing newline at end of file) are
            # not JSON documents; skip them instead of failing in json.loads.
            if not line.strip():
                continue
            doc = json.loads(line)
            # Prefer the document's own identifier ("uuid", then "id");
            # fall back to a random 16-hex-character one when neither is set.
            doc_id = doc.get("uuid") or doc.get("id") or os.urandom(8).hex()
            # SCHEMA declares "id" as a string, so coerce e.g. numeric ids.
            doc["id"] = str(doc_id)
            index_document("documents", doc)
    print("Seed terminé.")

# Run the seeding routine only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()