vishalsh13 committed
Commit 1e83d8a · 1 Parent(s): 5b42a86

update file to save in dataset repo

Files changed (3)
  1. app.py +2 -3
  2. utils/data_loader.py +7 -14
  3. utils/vector_utils.py +33 -5
app.py CHANGED
@@ -3,14 +3,13 @@
 import os
 from utils.data_loader import download_dataset, save_metadata
 from utils.vector_utils import create_vector_db
-from config import v_auth_token, v_vector_folder, v_metadata_file, v_base_path
+from config import v_auth_token, v_vector_folder, v_metadata_file
 
 if __name__ == "__main__":
     if not v_auth_token:
         raise ValueError("Authentication token not found. Ensure 'hkey' is set as a secret in Hugging Face Spaces.")
 
     # Ensure writable base directory and subdirectories are initialized
-    os.makedirs(v_base_path, exist_ok=True)
     os.makedirs(v_vector_folder, exist_ok=True)
     if not os.path.exists(v_metadata_file):
         print("Metadata file not found. Creating a new one.")
@@ -22,4 +21,4 @@ if __name__ == "__main__":
     if not v_dataset_path:
         print("Vector database is up-to-date.")
     else:
-        create_vector_db(v_dataset_path, v_vector_folder)
+        create_vector_db(v_dataset_path, v_vector_folder, v_auth_token)
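The config module that app.py imports from is not part of this commit. For orientation, a minimal sketch of what it could contain, assuming the token is exposed to the Space as the secret 'hkey' (as the ValueError message suggests) and all paths live under one writable base directory; every value below is an assumption:

    # config.py — hypothetical sketch, not part of this commit
    import os

    # Hugging Face token, set as the Space secret 'hkey'
    v_auth_token = os.environ.get("hkey")

    # Writable working directory inside the Space (assumed location)
    v_base_path = "/tmp/app_data"
    v_vector_folder = os.path.join(v_base_path, "vectors")
    v_metadata_file = os.path.join(v_base_path, "metadata.json")

Dropping os.makedirs(v_base_path) from app.py is safe under this layout, since os.makedirs(v_vector_folder, exist_ok=True) creates intermediate directories as needed.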
utils/data_loader.py CHANGED
@@ -12,13 +12,16 @@ def download_dataset(v_auth_token, v_metadata_file, v_vector_folder):
         print("No updates detected. Skipping vector creation.")
         return False
 
-    # Download the dataset
+    # Define the specific file to download
+    v_filename = "train.csv"  # Replace this with the actual filename in your repository
     v_dataset_path = hf_hub_download(
         repo_id="vishalsh13/Dataset1",
         repo_type="dataset",
-        subfolder="data",
+        subfolder="data",  # Adjust or remove subfolder as needed
+        filename=v_filename,
         token=v_auth_token
     )
+
     print("Dataset downloaded successfully.")
     save_metadata(v_metadata_file, v_current_metadata)
     return v_dataset_path
@@ -28,22 +31,12 @@ def fetch_metadata():
     return {"dataset_version": "v1.0"}
 
 def load_metadata(v_metadata_file):
-    """
-    Load metadata from the file. If the file doesn't exist or is empty, return an empty dictionary.
-    """
     if os.path.exists(v_metadata_file):
         with open(v_metadata_file, "r") as file:
-            try:
-                return json.load(file)
-            except json.JSONDecodeError:
-                print(f"Metadata file {v_metadata_file} is empty or invalid. Initializing new metadata.")
-                return {}
+            return json.load(file)
     return {}
 
 def save_metadata(v_metadata_file, v_metadata):
-    """
-    Save metadata to the file. Creates the file if it doesn't exist.
-    """
    os.makedirs(os.path.dirname(v_metadata_file), exist_ok=True)
    with open(v_metadata_file, "w") as file:
-        json.dump(v_metadata, file, indent=4)
+        json.dump(v_metadata, file)
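hf_hub_download fetches a single file, which is why the new code must pass filename explicitly; the old call without it would raise a TypeError. If the exact filename is not known ahead of time, it can be discovered first, as in this sketch (the data/*.csv filter is illustrative, not part of the commit):

    from huggingface_hub import list_repo_files

    # List every file in the dataset repo, then pick out the CSVs under data/
    v_files = list_repo_files(
        "vishalsh13/Dataset1",
        repo_type="dataset",
        token=v_auth_token,
    )
    v_csv_files = [f for f in v_files if f.startswith("data/") and f.endswith(".csv")]
    print(v_csv_files)  # e.g. ['data/train.csv']

Note that with the try/except removed from load_metadata, an empty or corrupt metadata file now raises json.JSONDecodeError instead of being reinitialized as {}.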
utils/vector_utils.py CHANGED
@@ -1,11 +1,12 @@
 # utils/vector_utils.py
 
 import os
-import numpy as np
 import faiss
+import numpy as np
+from huggingface_hub import Repository
 from sentence_transformers import SentenceTransformer
 
-def create_vector_db(v_dataset_path, v_vector_folder):
+def create_vector_db(v_dataset_path, v_vector_folder, v_auth_token):
     # Initialize the model
     obj_model = SentenceTransformer('all-MiniLM-L6-v2')
 
@@ -15,10 +16,37 @@ def create_vector_db(v_dataset_path, v_vector_folder):
 
     v_embeddings = obj_model.encode(v_data)
 
-    # Save vectors
+    # Save vectors locally
     os.makedirs(v_vector_folder, exist_ok=True)
+    v_vector_file = os.path.join(v_vector_folder, "vector_index")
     v_index = faiss.IndexFlatL2(v_embeddings.shape[1])
     v_index.add(np.array(v_embeddings))
-    faiss.write_index(v_index, os.path.join(v_vector_folder, "vector_index"))
+    faiss.write_index(v_index, v_vector_file)
+
+    print(f"Vector database created and saved locally at {v_vector_file}")
+
+    # Save vector file back to Hugging Face dataset repository
+    upload_to_huggingface_repo(v_vector_file, v_auth_token)
+    print("Vector file successfully uploaded to Hugging Face dataset repository.")
+
+def upload_to_huggingface_repo(v_file_path, v_auth_token):
+    """
+    Upload the given file to the Hugging Face dataset repository.
+    """
+    v_repo_id = "vishalsh13/Dataset1"  # Replace with your repository name
+    v_repo = Repository(
+        local_dir="temp_repo",
+        clone_from=v_repo_id,
+        use_auth_token=v_auth_token
+    )
+
+    # Copy the file to the repository directory
+    os.makedirs(v_repo.local_dir, exist_ok=True)
+    v_dest_path = os.path.join(v_repo.local_dir, os.path.basename(v_file_path))
+    os.replace(v_file_path, v_dest_path)
 
-    print(f"Vector database created at {v_vector_folder}")
+    # Commit and push the changes
+    v_repo.git_add(v_dest_path)
+    v_repo.git_commit("Upload updated vector file.")
+    v_repo.git_push()
+    print(f"Uploaded {os.path.basename(v_file_path)} to Hugging Face repository: {v_repo_id}")