Spaces:
Runtime error
Runtime error
Commit ·
1e83d8a
1
Parent(s): 5b42a86
update file to save in dataset repo
Browse files
- app.py +2 -3
- utils/data_loader.py +7 -14
- utils/vector_utils.py +33 -5
app.py
CHANGED
|
@@ -3,14 +3,13 @@
|
|
| 3 |
import os
|
| 4 |
from utils.data_loader import download_dataset, save_metadata
|
| 5 |
from utils.vector_utils import create_vector_db
|
| 6 |
-
from config import v_auth_token, v_vector_folder, v_metadata_file
|
| 7 |
|
| 8 |
if __name__ == "__main__":
|
| 9 |
if not v_auth_token:
|
| 10 |
raise ValueError("Authentication token not found. Ensure 'hkey' is set as a secret in Hugging Face Spaces.")
|
| 11 |
|
| 12 |
# Ensure writable base directory and subdirectories are initialized
|
| 13 |
-
os.makedirs(v_base_path, exist_ok=True)
|
| 14 |
os.makedirs(v_vector_folder, exist_ok=True)
|
| 15 |
if not os.path.exists(v_metadata_file):
|
| 16 |
print("Metadata file not found. Creating a new one.")
|
|
@@ -22,4 +21,4 @@ if __name__ == "__main__":
|
|
| 22 |
if not v_dataset_path:
|
| 23 |
print("Vector database is up-to-date.")
|
| 24 |
else:
|
| 25 |
-
create_vector_db(v_dataset_path, v_vector_folder)
|
|
|
|
| 3 |
import os
|
| 4 |
from utils.data_loader import download_dataset, save_metadata
|
| 5 |
from utils.vector_utils import create_vector_db
|
| 6 |
+
from config import v_auth_token, v_vector_folder, v_metadata_file
|
| 7 |
|
| 8 |
if __name__ == "__main__":
|
| 9 |
if not v_auth_token:
|
| 10 |
raise ValueError("Authentication token not found. Ensure 'hkey' is set as a secret in Hugging Face Spaces.")
|
| 11 |
|
| 12 |
# Ensure writable base directory and subdirectories are initialized
|
|
|
|
| 13 |
os.makedirs(v_vector_folder, exist_ok=True)
|
| 14 |
if not os.path.exists(v_metadata_file):
|
| 15 |
print("Metadata file not found. Creating a new one.")
|
|
|
|
| 21 |
if not v_dataset_path:
|
| 22 |
print("Vector database is up-to-date.")
|
| 23 |
else:
|
| 24 |
+
create_vector_db(v_dataset_path, v_vector_folder, v_auth_token)
|
utils/data_loader.py
CHANGED
|
@@ -12,13 +12,16 @@ def download_dataset(v_auth_token, v_metadata_file, v_vector_folder):
|
|
| 12 |
print("No updates detected. Skipping vector creation.")
|
| 13 |
return False
|
| 14 |
|
| 15 |
-
#
|
|
|
|
| 16 |
v_dataset_path = hf_hub_download(
|
| 17 |
repo_id="vishalsh13/Dataset1",
|
| 18 |
repo_type="dataset",
|
| 19 |
-
subfolder="data",
|
|
|
|
| 20 |
token=v_auth_token
|
| 21 |
)
|
|
|
|
| 22 |
print("Dataset downloaded successfully.")
|
| 23 |
save_metadata(v_metadata_file, v_current_metadata)
|
| 24 |
return v_dataset_path
|
|
@@ -28,22 +31,12 @@ def fetch_metadata():
|
|
| 28 |
return {"dataset_version": "v1.0"}
|
| 29 |
|
| 30 |
def load_metadata(v_metadata_file):
|
| 31 |
-
"""
|
| 32 |
-
Load metadata from the file. If the file doesn't exist or is empty, return an empty dictionary.
|
| 33 |
-
"""
|
| 34 |
if os.path.exists(v_metadata_file):
|
| 35 |
with open(v_metadata_file, "r") as file:
|
| 36 |
-
|
| 37 |
-
return json.load(file)
|
| 38 |
-
except json.JSONDecodeError:
|
| 39 |
-
print(f"Metadata file {v_metadata_file} is empty or invalid. Initializing new metadata.")
|
| 40 |
-
return {}
|
| 41 |
return {}
|
| 42 |
|
| 43 |
def save_metadata(v_metadata_file, v_metadata):
|
| 44 |
-
"""
|
| 45 |
-
Save metadata to the file. Creates the file if it doesn't exist.
|
| 46 |
-
"""
|
| 47 |
os.makedirs(os.path.dirname(v_metadata_file), exist_ok=True)
|
| 48 |
with open(v_metadata_file, "w") as file:
|
| 49 |
-
json.dump(v_metadata, file
|
|
|
|
| 12 |
print("No updates detected. Skipping vector creation.")
|
| 13 |
return False
|
| 14 |
|
| 15 |
+
# Define the specific file to download
|
| 16 |
+
v_filename = "train.csv" # Replace this with the actual filename in your repository
|
| 17 |
v_dataset_path = hf_hub_download(
|
| 18 |
repo_id="vishalsh13/Dataset1",
|
| 19 |
repo_type="dataset",
|
| 20 |
+
subfolder="data", # Adjust or remove subfolder as needed
|
| 21 |
+
filename=v_filename,
|
| 22 |
token=v_auth_token
|
| 23 |
)
|
| 24 |
+
|
| 25 |
print("Dataset downloaded successfully.")
|
| 26 |
save_metadata(v_metadata_file, v_current_metadata)
|
| 27 |
return v_dataset_path
|
|
|
|
| 31 |
return {"dataset_version": "v1.0"}
|
| 32 |
|
| 33 |
def load_metadata(v_metadata_file):
    """Load the metadata dictionary stored as JSON in ``v_metadata_file``.

    Args:
        v_metadata_file: Path of the JSON metadata file.

    Returns:
        The decoded JSON content, or an empty dict when the file does not
        exist or is empty / not valid JSON.
    """
    # EAFP: open directly instead of checking os.path.exists() first, which
    # avoids the window where the file disappears between check and open.
    try:
        with open(v_metadata_file, "r", encoding="utf-8") as file:
            return json.load(file)
    except FileNotFoundError:
        return {}
    except json.JSONDecodeError:
        # A freshly created (empty) or corrupted file must not crash the
        # caller; treat it as "no metadata yet".
        print(f"Metadata file {v_metadata_file} is empty or invalid. Initializing new metadata.")
        return {}
|
| 38 |
|
| 39 |
def save_metadata(v_metadata_file, v_metadata):
    """Write ``v_metadata`` as JSON to ``v_metadata_file``.

    The parent directory is created when it is missing; an existing file is
    overwritten.

    Args:
        v_metadata_file: Destination path of the JSON metadata file.
        v_metadata: JSON-serializable object to store.
    """
    v_parent_dir = os.path.dirname(v_metadata_file)
    # dirname is "" for a bare filename; os.makedirs("") raises
    # FileNotFoundError, so only create a directory when one is named.
    if v_parent_dir:
        os.makedirs(v_parent_dir, exist_ok=True)
    with open(v_metadata_file, "w") as file:
        json.dump(v_metadata, file)
|
utils/vector_utils.py
CHANGED
|
@@ -1,11 +1,12 @@
|
|
| 1 |
# utils/vector_utils.py
|
| 2 |
|
| 3 |
import os
|
| 4 |
-
import numpy as np
|
| 5 |
import faiss
|
|
|
|
|
|
|
| 6 |
from sentence_transformers import SentenceTransformer
|
| 7 |
|
| 8 |
-
def create_vector_db(v_dataset_path, v_vector_folder):
|
| 9 |
# Initialize the model
|
| 10 |
obj_model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 11 |
|
|
@@ -15,10 +16,37 @@ def create_vector_db(v_dataset_path, v_vector_folder):
|
|
| 15 |
|
| 16 |
v_embeddings = obj_model.encode(v_data)
|
| 17 |
|
| 18 |
-
# Save vectors
|
| 19 |
os.makedirs(v_vector_folder, exist_ok=True)
|
|
|
|
| 20 |
v_index = faiss.IndexFlatL2(v_embeddings.shape[1])
|
| 21 |
v_index.add(np.array(v_embeddings))
|
| 22 |
-
faiss.write_index(v_index,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# utils/vector_utils.py
|
| 2 |
|
| 3 |
import os
|
|
|
|
| 4 |
import faiss
|
| 5 |
+
import numpy as np
|
| 6 |
+
from huggingface_hub import Repository
|
| 7 |
from sentence_transformers import SentenceTransformer
|
| 8 |
|
| 9 |
+
def create_vector_db(v_dataset_path, v_vector_folder, v_auth_token):
|
| 10 |
# Initialize the model
|
| 11 |
obj_model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 12 |
|
|
|
|
| 16 |
|
| 17 |
v_embeddings = obj_model.encode(v_data)
|
| 18 |
|
| 19 |
+
# Save vectors locally
|
| 20 |
os.makedirs(v_vector_folder, exist_ok=True)
|
| 21 |
+
v_vector_file = os.path.join(v_vector_folder, "vector_index")
|
| 22 |
v_index = faiss.IndexFlatL2(v_embeddings.shape[1])
|
| 23 |
v_index.add(np.array(v_embeddings))
|
| 24 |
+
faiss.write_index(v_index, v_vector_file)
|
| 25 |
+
|
| 26 |
+
print(f"Vector database created and saved locally at {v_vector_file}")
|
| 27 |
+
|
| 28 |
+
# Save vector file back to Hugging Face dataset repository
|
| 29 |
+
upload_to_huggingface_repo(v_vector_file, v_auth_token)
|
| 30 |
+
print("Vector file successfully uploaded to Hugging Face dataset repository.")
|
| 31 |
+
|
| 32 |
+
def upload_to_huggingface_repo(v_file_path, v_auth_token):
    """Upload the given file to the Hugging Face dataset repository.

    The repository is cloned into the local ``temp_repo`` directory, the file
    is copied into the clone, and the change is committed and pushed.

    Args:
        v_file_path: Path of the local file to upload.
        v_auth_token: Hugging Face access token used for the clone and push.
    """
    import shutil  # local import: only needed by this function

    v_repo_id = "vishalsh13/Dataset1"  # Replace with your repository name
    v_repo = Repository(
        local_dir="temp_repo",
        clone_from=v_repo_id,
        # Without repo_type the id is resolved as a *model* repository;
        # this id names a dataset repo (it is downloaded with
        # repo_type="dataset" elsewhere in the project).
        repo_type="dataset",
        use_auth_token=v_auth_token
    )

    # Copy the file to the repository directory
    os.makedirs(v_repo.local_dir, exist_ok=True)
    v_file_name = os.path.basename(v_file_path)
    v_dest_path = os.path.join(v_repo.local_dir, v_file_name)
    # Copy rather than move: the locally saved file stays available, and a
    # copy also works when source and clone live on different filesystems
    # (os.replace fails across devices).
    shutil.copy2(v_file_path, v_dest_path)

    # Commit and push the changes
    # The pattern is relative to the clone root. auto_lfs_track lets large
    # binary files be tracked with git-lfs before being added.
    v_repo.git_add(v_file_name, auto_lfs_track=True)
    v_repo.git_commit("Upload updated vector file.")
    v_repo.git_push()
    print(f"Uploaded {os.path.basename(v_file_path)} to Hugging Face repository: {v_repo_id}")
|