agentic-defensor / upload_datasets.py
vichudo's picture
feat: add dataset uploader
f41e5db
Raw
History Blame Contribute Delete
1.8 kB
# Content for upload_datasets.py
import pickle
import os
from datasets import Dataset
from huggingface_hub import HfApi
# Initialize Hugging Face API
# Shared Hub client for this script; it is used further down to upload the
# raw FAISS index file via api.upload_file().
# NOTE(review): no token is passed here, so authentication presumably comes
# from a cached `huggingface-cli login` or the HF_TOKEN env var — confirm.
api = HfApi()
# Upload embeddings
# The pickled embeddings are shipped to the Hub as a one-row dataset with two
# columns: "data" (the re-pickled object as bytes) and "format" (the literal
# string "pickle", so a consumer knows how to decode "data").
# Any failure is reported on stdout and the script carries on (best effort).
print("Preparing embeddings dataset...")
try:
    # pickle.load executes whatever the pickle stream instructs, so this file
    # must be a trusted, locally produced artifact.
    with open("embeddings/embeddings.pkl", "rb") as pkl_file:
        embeddings_data = pickle.load(pkl_file)

    # Create dataset with metadata to preserve the format
    embeddings_blob = pickle.dumps(embeddings_data)
    embeddings_columns = {
        "data": [embeddings_blob],
        "format": ["pickle"],
    }
    embeddings_ds = Dataset.from_dict(embeddings_columns)

    # Push to hub
    print("Uploading embeddings dataset...")
    embeddings_ds.push_to_hub("vichudo/agentic-defensor-embeddings")
    print("Embeddings dataset uploaded successfully!")
except Exception as err:
    print(f"Error uploading embeddings: {err}")
# Upload FAISS index separately
# The index is uploaded as a raw file (not wrapped in a Dataset) into the same
# dataset repo that receives the embeddings.
# Any failure is reported on stdout and the script carries on (best effort).
print("Uploading FAISS index file...")
try:
    faiss_repo_id = "vichudo/agentic-defensor-embeddings"

    # upload_file() requires the target repo to exist. The embeddings step
    # above normally creates it through push_to_hub(), but that step catches
    # and prints its own errors, so the repo may be missing when we get here.
    # exist_ok=True makes this a no-op when the repo is already present.
    api.create_repo(
        repo_id=faiss_repo_id,
        repo_type="dataset",
        exist_ok=True,
    )

    api.upload_file(
        path_or_fileobj="embeddings/faiss_index.index",
        path_in_repo="faiss_index.index",
        repo_id=faiss_repo_id,
        repo_type="dataset",
    )
    print("FAISS index uploaded successfully!")
except Exception as e:
    print(f"Error uploading FAISS index: {e}")
# Upload document chunks
# The pickled document chunks are shipped to the Hub as a one-row dataset with
# two columns: "data" (the re-pickled object as bytes) and "format" (the
# literal string "pickle", so a consumer knows how to decode "data").
# Any failure is reported on stdout and the script carries on (best effort).
print("Preparing document chunks dataset...")
try:
    # pickle.load executes whatever the pickle stream instructs, so this file
    # must be a trusted, locally produced artifact.
    with open("data/doc_chunks.pkl", "rb") as pkl_file:
        chunks_data = pickle.load(pkl_file)

    # Create dataset
    chunks_blob = pickle.dumps(chunks_data)
    chunks_columns = {
        "data": [chunks_blob],
        "format": ["pickle"],
    }
    chunks_ds = Dataset.from_dict(chunks_columns)

    # Push to hub
    print("Uploading document chunks dataset...")
    chunks_ds.push_to_hub("vichudo/agentic-defensor-chunks")
    print("Document chunks dataset uploaded successfully!")
except Exception as err:
    print(f"Error uploading document chunks: {err}")

# Printed unconditionally: it marks the end of the script, not overall success.
print("Dataset upload process complete!")