Spaces:
Sleeping
Sleeping
Alon Albalak committed on
Commit ·
7e52249
1
Parent(s): b30b751
update data manager
Browse files
- src/models/data_manager.py +3 -34
src/models/data_manager.py
CHANGED
|
@@ -9,13 +9,14 @@ from pathlib import Path
|
|
| 9 |
from datasets import load_dataset
|
| 10 |
from huggingface_hub import CommitScheduler
|
| 11 |
|
|
|
|
| 12 |
|
| 13 |
JSON_DATASET_DIR = Path("results")
|
| 14 |
JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
|
| 15 |
JSON_DATASET_PATH = JSON_DATASET_DIR / f"results_{uuid.uuid4()}.json"
|
| 16 |
|
| 17 |
scheduler = CommitScheduler(
|
| 18 |
-
repo_id=
|
| 19 |
repo_type="dataset",
|
| 20 |
folder_path=JSON_DATASET_DIR.as_posix(),
|
| 21 |
path_in_repo="data",
|
|
@@ -51,20 +52,8 @@ class DataManager:
|
|
| 51 |
if self.results is None:
|
| 52 |
raise RuntimeError("Results not loaded. Call get_results() first.")
|
| 53 |
self.results.extend(new_results)
|
| 54 |
-
|
| 55 |
-
def load_results_data(self, filepath="data/results.jsonl"):
|
| 56 |
-
"""Load all results data from results.jsonl file."""
|
| 57 |
-
results = []
|
| 58 |
-
try:
|
| 59 |
-
with open(filepath, "r") as f:
|
| 60 |
-
for line in f:
|
| 61 |
-
if line.strip():
|
| 62 |
-
results.append(json.loads(line))
|
| 63 |
-
except FileNotFoundError:
|
| 64 |
-
pass # Return empty list if file doesn't exist
|
| 65 |
-
return results
|
| 66 |
|
| 67 |
-
def load_results_from_hf(self, hf_repo=
|
| 68 |
"""Load results data from Hugging Face dataset repository."""
|
| 69 |
try:
|
| 70 |
dataset = load_dataset(hf_repo, split="train")
|
|
@@ -73,26 +62,6 @@ class DataManager:
|
|
| 73 |
print(f"Error loading dataset from Hugging Face: {e}")
|
| 74 |
return []
|
| 75 |
|
| 76 |
-
def save_interaction(self, prompt_data, user_continuation, generated_response,
|
| 77 |
-
cosine_distance, session_id, num_user_tokens, filepath="data/results.jsonl"):
|
| 78 |
-
"""Save a user interaction to the results file"""
|
| 79 |
-
interaction = {
|
| 80 |
-
"prompt": prompt_data["prompt"],
|
| 81 |
-
"model": prompt_data["model"],
|
| 82 |
-
"llm_partial_response": prompt_data["llm_partial_response"],
|
| 83 |
-
"llm_full_response_original": prompt_data["llm_full_response_original"],
|
| 84 |
-
"user_continuation": user_continuation,
|
| 85 |
-
"full_response_from_user": generated_response,
|
| 86 |
-
"cosine_distance": cosine_distance,
|
| 87 |
-
"timestamp": datetime.datetime.now().isoformat(),
|
| 88 |
-
"continuation_source": session_id,
|
| 89 |
-
"num_user_tokens": num_user_tokens
|
| 90 |
-
}
|
| 91 |
-
|
| 92 |
-
os.makedirs(os.path.dirname(filepath), exist_ok=True)
|
| 93 |
-
with open(filepath, "a") as f:
|
| 94 |
-
f.write(json.dumps(interaction) + "\n")
|
| 95 |
-
|
| 96 |
def save_interaction_to_hf(self, prompt_data, user_continuation, generated_response,
|
| 97 |
cosine_distance, session_id, num_user_tokens):
|
| 98 |
interaction = {
|
|
|
|
| 9 |
from datasets import load_dataset
|
| 10 |
from huggingface_hub import CommitScheduler
|
| 11 |
|
| 12 |
+
HF_REPO_ID = "alon-albalak/collaborative-decoding-results"
|
| 13 |
|
| 14 |
JSON_DATASET_DIR = Path("results")
|
| 15 |
JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
|
| 16 |
JSON_DATASET_PATH = JSON_DATASET_DIR / f"results_{uuid.uuid4()}.json"
|
| 17 |
|
| 18 |
scheduler = CommitScheduler(
|
| 19 |
+
repo_id=HF_REPO_ID,
|
| 20 |
repo_type="dataset",
|
| 21 |
folder_path=JSON_DATASET_DIR.as_posix(),
|
| 22 |
path_in_repo="data",
|
|
|
|
| 52 |
if self.results is None:
|
| 53 |
raise RuntimeError("Results not loaded. Call get_results() first.")
|
| 54 |
self.results.extend(new_results)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
+
def load_results_from_hf(self, hf_repo=HF_REPO_ID):
|
| 57 |
"""Load results data from Hugging Face dataset repository."""
|
| 58 |
try:
|
| 59 |
dataset = load_dataset(hf_repo, split="train")
|
|
|
|
| 62 |
print(f"Error loading dataset from Hugging Face: {e}")
|
| 63 |
return []
|
| 64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
def save_interaction_to_hf(self, prompt_data, user_continuation, generated_response,
|
| 66 |
cosine_distance, session_id, num_user_tokens):
|
| 67 |
interaction = {
|