Spaces:
Sleeping
Sleeping
| import json | |
| import os | |
| from indexer.typesense_indexer import create_collection_if_not_exists, index_document | |
# Schema passed to create_collection_if_not_exists() for the "documents"
# collection. Every field is declared as a string; three of them carry
# "facet": True.
SCHEMA = {
    "name": "documents",
    "fields": [
        # Identifier and main textual content.
        {"name": "id", "type": "string"},
        {"name": "titre", "type": "string"},
        {"name": "texte", "type": "string"},
        # Fields flagged as facets.
        {"name": "langue", "type": "string", "facet": True},
        {"name": "type_document", "type": "string", "facet": True},
        {"name": "pays", "type": "string", "facet": True},
        # Provenance metadata.
        {"name": "source_url", "type": "string"},
        {"name": "date", "type": "string"},
    ],
}
def main():
    """Create the ``documents`` collection and seed it from a JSONL file.

    The seed path is read from the ``SEED_JSONL`` environment variable and
    falls back to ``datasets/ewe/final/ewe_corpus.jsonl``. Each non-blank
    line of the file is parsed as one JSON object and handed to
    ``index_document``. The document ``id`` is taken from its ``uuid`` key,
    then its ``id`` key, and otherwise a random 16-character hex string is
    generated.

    When the seed file does not exist, the collection is still created and
    a message is printed.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    create_collection_if_not_exists(SCHEMA)

    path = os.environ.get("SEED_JSONL", "datasets/ewe/final/ewe_corpus.jsonl")
    if not os.path.exists(path):
        print("Aucun fichier de seed trouvé, collection créée sans documents.")
        return

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at end of file) are
                # not JSON documents; json.loads would raise on them.
                continue
            doc = json.loads(line)
            doc_id = doc.get("uuid") or doc.get("id") or os.urandom(8).hex()
            # SCHEMA declares "id" as a string, so coerce numeric ids coming
            # from the source data instead of passing them through as-is.
            doc["id"] = str(doc_id)
            index_document("documents", doc)
    print("Seed terminé.")
# Run the seeding only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()