|
|
"""
|
|
|
Upload FONS Knowledge Base to Hugging Face Dataset
|
|
|
===================================================
|
|
|
This script uploads the large 61MB knowledge file to a HF Dataset,
|
|
|
so it can be loaded at runtime in the Space without hitting size limits.
|
|
|
"""
|
|
|
|
|
|
from datasets import Dataset
|
|
|
from huggingface_hub import HfApi
|
|
|
import json
|
|
|
import os
|
|
|
|
|
|
|
|
|
# Local JSONL file holding the knowledge-base records (~61 MB, one JSON object per line).
KNOWLEDGE_FILE = "fons_knowledge_google.jsonl"
# Target Hugging Face dataset repo, in "<namespace>/<dataset-name>" form.
DATASET_REPO = "NurseCitizenDeveloper/fons-knowledge-base"
|
|
|
|
|
|
def load_jsonl(filepath):
    """Load a JSON-Lines file into a list of dicts.

    Args:
        filepath: Path to a JSONL file (one JSON object per line).

    Returns:
        List of parsed records. Malformed lines are skipped (best-effort
        load) so one corrupt line does not abort the whole 61MB file.
    """
    data = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # skip blank lines outright
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                # Only swallow JSON parse errors; a bare `except:` here
                # would also hide KeyboardInterrupt and genuine bugs.
                continue
    return data
|
|
|
|
|
|
def main():
    """Load the local knowledge JSONL, build a HF Dataset, and push it to the Hub.

    Reads KNOWLEDGE_FILE via load_jsonl, wraps the records in a
    `datasets.Dataset`, and uploads it publicly to DATASET_REPO so the
    Space can stream it at runtime instead of bundling the 61MB file.
    """
    # NOTE(review): original status messages contained mojibake'd emoji that
    # split an f-string across lines (a syntax error); replaced with ASCII.
    print(f"Loading {KNOWLEDGE_FILE}...")
    records = load_jsonl(KNOWLEDGE_FILE)
    print(f"Loaded {len(records)} records")

    # Build an in-memory Dataset from the list of record dicts.
    dataset = Dataset.from_list(records)
    print(f"Dataset: {dataset}")

    # private=False: the dataset must be publicly readable for the Space.
    print(f"Uploading to {DATASET_REPO}...")
    dataset.push_to_hub(DATASET_REPO, private=False)
    print(f"Upload complete! View at: https://huggingface.co/datasets/{DATASET_REPO}")
|
|
|
|
|
|
# Entry point: run the upload only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|
|