"""Seed the ``documents`` search collection from a JSONL corpus file.

The collection described by ``SCHEMA`` is created if needed, then every JSON
object found in the seed file (one per line) is indexed into it.
"""

import json
import os

from indexer.typesense_indexer import create_collection_if_not_exists, index_document

# Collection definition handed to ``create_collection_if_not_exists``.
# ``langue``, ``type_document`` and ``pays`` are declared as facets.
SCHEMA = {
    "name": "documents",
    "fields": [
        {"name": "id", "type": "string"},
        {"name": "titre", "type": "string"},
        {"name": "texte", "type": "string"},
        {"name": "langue", "type": "string", "facet": True},
        {"name": "type_document", "type": "string", "facet": True},
        {"name": "pays", "type": "string", "facet": True},
        {"name": "source_url", "type": "string"},
        {"name": "date", "type": "string"},
    ],
}


def main():
    """Create the collection and index every document of the seed file.

    The seed file path is read from the ``SEED_JSONL`` environment variable
    and defaults to ``datasets/ewe/final/ewe_corpus.jsonl``. When no regular
    file exists at that path, the collection is still created and a message
    is printed instead of indexing anything.

    Each non-blank line must be a JSON object. Its ``id`` is taken from the
    ``uuid`` key, then the ``id`` key, and otherwise a random 16-character
    hex string is generated.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    create_collection_if_not_exists(SCHEMA)

    path = os.environ.get("SEED_JSONL", "datasets/ewe/final/ewe_corpus.jsonl")
    # isfile (not exists): a directory at this path cannot be opened as a
    # seed file, so treat it like a missing file.
    if not os.path.isfile(path):
        print("Aucun fichier de seed trouvé, collection créée sans documents.")
        return

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # JSONL files often end with (or contain) blank lines; json.loads
            # would raise on them, so skip them.
            if not line.strip():
                continue
            doc = json.loads(line)
            doc_id = doc.get("uuid") or doc.get("id") or os.urandom(8).hex()
            # The schema declares ``id`` as a string; corpus ids may be numeric.
            doc["id"] = str(doc_id)
            index_document(SCHEMA["name"], doc)
    print("Seed terminé.")


if __name__ == "__main__":
    main()