Alon Albalak committed on
Commit
7e52249
·
1 Parent(s): b30b751

update data manager

Browse files
Files changed (1) hide show
  1. src/models/data_manager.py +3 -34
src/models/data_manager.py CHANGED
@@ -9,13 +9,14 @@ from pathlib import Path
9
  from datasets import load_dataset
10
  from huggingface_hub import CommitScheduler
11
 
 
12
 
13
  JSON_DATASET_DIR = Path("results")
14
  JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
15
  JSON_DATASET_PATH = JSON_DATASET_DIR / f"results_{uuid.uuid4()}.json"
16
 
17
  scheduler = CommitScheduler(
18
- repo_id="alon-albalak/collaborative-decoding-results",
19
  repo_type="dataset",
20
  folder_path=JSON_DATASET_DIR.as_posix(),
21
  path_in_repo="data",
@@ -51,20 +52,8 @@ class DataManager:
51
  if self.results is None:
52
  raise RuntimeError("Results not loaded. Call get_results() first.")
53
  self.results.extend(new_results)
54
-
55
- def load_results_data(self, filepath="data/results.jsonl"):
56
- """Load all results data from results.jsonl file."""
57
- results = []
58
- try:
59
- with open(filepath, "r") as f:
60
- for line in f:
61
- if line.strip():
62
- results.append(json.loads(line))
63
- except FileNotFoundError:
64
- pass # Return empty list if file doesn't exist
65
- return results
66
 
67
- def load_results_from_hf(self, hf_repo="alon-albalak/collaborative-decoding-results"):
68
  """Load results data from Hugging Face dataset repository."""
69
  try:
70
  dataset = load_dataset(hf_repo, split="train")
@@ -73,26 +62,6 @@ class DataManager:
73
  print(f"Error loading dataset from Hugging Face: {e}")
74
  return []
75
 
76
- def save_interaction(self, prompt_data, user_continuation, generated_response,
77
- cosine_distance, session_id, num_user_tokens, filepath="data/results.jsonl"):
78
- """Save a user interaction to the results file"""
79
- interaction = {
80
- "prompt": prompt_data["prompt"],
81
- "model": prompt_data["model"],
82
- "llm_partial_response": prompt_data["llm_partial_response"],
83
- "llm_full_response_original": prompt_data["llm_full_response_original"],
84
- "user_continuation": user_continuation,
85
- "full_response_from_user": generated_response,
86
- "cosine_distance": cosine_distance,
87
- "timestamp": datetime.datetime.now().isoformat(),
88
- "continuation_source": session_id,
89
- "num_user_tokens": num_user_tokens
90
- }
91
-
92
- os.makedirs(os.path.dirname(filepath), exist_ok=True)
93
- with open(filepath, "a") as f:
94
- f.write(json.dumps(interaction) + "\n")
95
-
96
  def save_interaction_to_hf(self, prompt_data, user_continuation, generated_response,
97
  cosine_distance, session_id, num_user_tokens):
98
  interaction = {
 
9
  from datasets import load_dataset
10
  from huggingface_hub import CommitScheduler
11
 
12
+ HF_REPO_ID = "alon-albalak/collaborative-decoding-results"
13
 
14
  JSON_DATASET_DIR = Path("results")
15
  JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
16
  JSON_DATASET_PATH = JSON_DATASET_DIR / f"results_{uuid.uuid4()}.json"
17
 
18
  scheduler = CommitScheduler(
19
+ repo_id=HF_REPO_ID,
20
  repo_type="dataset",
21
  folder_path=JSON_DATASET_DIR.as_posix(),
22
  path_in_repo="data",
 
52
  if self.results is None:
53
  raise RuntimeError("Results not loaded. Call get_results() first.")
54
  self.results.extend(new_results)
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
+ def load_results_from_hf(self, hf_repo=HF_REPO_ID):
57
  """Load results data from Hugging Face dataset repository."""
58
  try:
59
  dataset = load_dataset(hf_repo, split="train")
 
62
  print(f"Error loading dataset from Hugging Face: {e}")
63
  return []
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  def save_interaction_to_hf(self, prompt_data, user_continuation, generated_response,
66
  cosine_distance, session_id, num_user_tokens):
67
  interaction = {