samwaugh committed on
Commit
4f1c614
·
1 Parent(s): efbac81

Try to fix changes

Browse files
backend/runner/app.py CHANGED
@@ -98,7 +98,9 @@ from .config import (
98
  ARTIFACTS_DIR,
99
  OUTPUTS_DIR,
100
  JSON_INFO_DIR,
101
- MARKER_DIR
 
 
102
  )
103
 
104
  # Import data from config (loaded from HF datasets)
 
98
  ARTIFACTS_DIR,
99
  OUTPUTS_DIR,
100
  JSON_INFO_DIR,
101
+ MARKER_DIR,
102
+ JSON_DATASETS,
103
+ EMBEDDINGS_DATASETS
104
  )
105
 
106
  # Import data from config (loaded from HF datasets)
backend/runner/config.py CHANGED
@@ -5,12 +5,33 @@ All runner modules should import from this module instead of defining their own
5
 
6
  import os
7
  from pathlib import Path
8
- from datasets import load_dataset
9
 
10
- # HF Dataset IDs
11
- EMBEDDINGS_DATASET = "samwaugh/artefact-embeddings"
12
- JSON_DATASET = "samwaugh/artefact-json"
13
- MARKDOWN_DATASET = "samwaugh/artefact-markdown"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  # READ root (repo data - read-only)
16
  PROJECT_ROOT = Path(__file__).resolve().parents[2]
@@ -60,50 +81,96 @@ for dir_path in [OUTPUTS_DIR, ARTIFACTS_DIR]:
60
  print(f"⚠️ Could not create directory {dir_path}: {e}")
61
 
62
  # Global data variables (will be populated from HF datasets)
63
- sentences = {}
64
- works = {}
65
- creators = {}
66
- topics = {}
67
- topic_names = {}
68
-
69
- def load_json_from_hf(dataset_name: str, file_name: str):
70
- """Load JSON data from Hugging Face dataset"""
71
- try:
72
- dataset = load_dataset(dataset_name, split="train")
73
- # Access the specific file content
74
- return dataset[file_name]
75
- except Exception as e:
76
- print(f"Failed to load {file_name} from HF: {e}")
77
- return None
78
 
79
- def load_all_data():
80
- """Load all data from Hugging Face datasets"""
81
- global sentences, works, creators, topics, topic_names
82
-
83
- print("πŸ”„ Loading data from Hugging Face datasets...")
84
-
85
- sentences = load_json_from_hf(JSON_DATASET, "sentences.json")
86
- works = load_json_from_hf(JSON_DATASET, "works.json")
87
- creators = load_json_from_hf(JSON_DATASET, "creators.json")
88
- topics = load_json_from_hf(JSON_DATASET, "topics.json")
89
- topic_names = load_json_from_hf(JSON_DATASET, "topic_names.json")
90
-
91
- # Validate data loading
92
- if sentences and works and creators and topics and topic_names:
93
- print(f"βœ… Successfully loaded data from HF:")
 
 
 
 
 
 
 
 
 
 
 
94
  print(f" Sentences: {len(sentences)} entries")
95
  print(f" Works: {len(works)} entries")
96
- print(f" Topics: {len(topics)} entries")
97
  print(f" Creators: {len(creators)} entries")
98
- print(f" Topic names: {len(topic_names)} entries")
99
- else:
100
- print("⚠️ Some data failed to load from HF datasets")
101
- # Fallback to empty dicts to prevent crashes
102
- sentences = sentences or {}
103
- works = works or {}
104
- creators = creators or {}
105
- topics = topics or {}
106
- topic_names = topic_names or {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  # Initialize data loading
109
- load_all_data()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  import os
7
  from pathlib import Path
8
+ from typing import Any, Dict, Optional
9
 
10
+ # Try to import datasets, but handle gracefully if not available
11
+ try:
12
+ from datasets import load_dataset
13
+ DATASETS_AVAILABLE = True
14
+ except ImportError:
15
+ print("⚠️ datasets library not available - HF dataset loading disabled")
16
+ DATASETS_AVAILABLE = False
17
+
18
+ # Environment variables for dataset names
19
+ ARTEFACT_JSON_DATASET = os.getenv('ARTEFACT_JSON_DATASET', 'samwaugh/artefact-json')
20
+ ARTEFACT_EMBEDDINGS_DATASET = os.getenv('ARTEFACT_EMBEDDINGS_DATASET', 'samwaugh/artefact-embeddings')
21
+ ARTEFACT_MARKDOWN_DATASET = os.getenv('ARTEFACT_MARKDOWN_DATASET', 'samwaugh/artefact-markdown')
22
+
23
+ # Legacy path variables for backward compatibility
24
+ JSON_INFO_DIR = "/data/hub/datasets--samwaugh--artefact-json/snapshots/latest"
25
+ EMBEDDINGS_DIR = "/data/hub/datasets--samwaugh--artefact-embeddings/snapshots/latest"
26
+ MARKDOWN_DIR = "/data/hub/datasets--samwaugh--artefact-markdown/snapshots/latest"
27
+
28
+ # Embedding file paths for backward compatibility
29
+ CLIP_EMBEDDINGS_ST = Path(EMBEDDINGS_DIR) / "clip_embeddings.safetensors"
30
+ PAINTINGCLIP_EMBEDDINGS_ST = Path(EMBEDDINGS_DIR) / "paintingclip_embeddings.safetensors"
31
+ CLIP_SENTENCE_IDS = Path(EMBEDDINGS_DIR) / "clip_embeddings_sentence_ids.json"
32
+ PAINTINGCLIP_SENTENCE_IDS = Path(EMBEDDINGS_DIR) / "paintingclip_embeddings_sentence_ids.json"
33
+ CLIP_EMBEDDINGS_DIR = EMBEDDINGS_DIR
34
+ PAINTINGCLIP_EMBEDDINGS_DIR = EMBEDDINGS_DIR
35
 
36
  # READ root (repo data - read-only)
37
  PROJECT_ROOT = Path(__file__).resolve().parents[2]
 
81
  print(f"⚠️ Could not create directory {dir_path}: {e}")
82
 
83
# Global data containers, filled in by load_json_datasets() at import time;
# they stay empty when the HF datasets cannot be loaded.
sentences: Dict[str, Any] = {}
works: Dict[str, Any] = {}
creators: Dict[str, Any] = {}
topics: Dict[str, Any] = {}
topic_names: Dict[str, Any] = {}
 
 
 
 
 
 
 
 
 
 
89
 
90
# Load datasets from Hugging Face
def load_json_datasets() -> Optional[Dict[str, Any]]:
    """Load all JSON datasets from Hugging Face.

    Returns:
        A dict mapping logical names ('creators', 'sentences', 'works',
        'topics', 'topic_names') to the loaded datasets, or None when the
        datasets library is unavailable or loading fails.

    Side effects:
        Populates the module-level dicts (sentences, works, creators,
        topics, topic_names) for backward compatibility, keyed by the
        stringified row index.
    """
    if not DATASETS_AVAILABLE:
        print("⚠️ datasets library not available - skipping HF dataset loading")
        return None

    try:
        print("🔄 Loading data from Hugging Face datasets...")

        # BUG FIX: load_dataset()'s second positional argument is the
        # *config name*, not a file name. Selecting an individual JSON
        # file inside the repo requires data_files=.
        def _load(file_name: str):
            # One file per call so each dataset keeps its own schema.
            return load_dataset(
                ARTEFACT_JSON_DATASET, data_files=file_name, split='train'
            )

        creators_dataset = _load('creators.json')
        sentences_dataset = _load('sentences.json')
        works_dataset = _load('works.json')
        topics_dataset = _load('topics.json')
        topic_names_dataset = _load('topic_names.json')

        # Convert to dictionaries for backward compatibility
        global sentences, works, creators, topics, topic_names

        sentences = {str(i): item for i, item in enumerate(sentences_dataset)}
        works = {str(i): item for i, item in enumerate(works_dataset)}
        creators = {str(i): item for i, item in enumerate(creators_dataset)}
        topics = {str(i): item for i, item in enumerate(topics_dataset)}
        topic_names = {str(i): item for i, item in enumerate(topic_names_dataset)}

        print(f"✅ Successfully loaded JSON datasets from HF:")
        print(f" Sentences: {len(sentences)} entries")
        print(f" Works: {len(works)} entries")
        print(f" Creators: {len(creators)} entries")
        print(f" Topics: {len(topics)} entries")
        print(f" Topic Names: {len(topic_names)} entries")

        return {
            'creators': creators_dataset,
            'sentences': sentences_dataset,
            'works': works_dataset,
            'topics': topics_dataset,
            'topic_names': topic_names_dataset
        }
    except Exception as e:
        print(f"❌ Failed to load JSON datasets from HF: {e}")
        return None
132
+
133
def load_embeddings_datasets() -> Optional[Dict[str, Any]]:
    """Load embeddings datasets from Hugging Face.

    Returns a dict with 'clip' and 'paintingclip' entries, or None when
    the datasets library is unavailable or loading fails.
    """
    if not DATASETS_AVAILABLE:
        print("⚠️ datasets library not available - skipping HF embeddings loading")
        return None

    try:
        # NOTE(review): load_dataset() is handed a .safetensors file name as
        # the config name; the datasets library does not parse safetensors,
        # so this very likely raises and falls into the except branch —
        # confirm, and consider huggingface_hub.hf_hub_download + safetensors
        # instead.
        return {
            key: load_dataset(ARTEFACT_EMBEDDINGS_DATASET, file_name, split='train')
            for key, file_name in (
                ('clip', 'clip_embeddings.safetensors'),
                ('paintingclip', 'paintingclip_embeddings.safetensors'),
            )
        }
    except Exception as e:
        print(f"❌ Failed to load embeddings datasets from HF: {e}")
        return None
150
+
151
# Initialize datasets
JSON_DATASETS = load_json_datasets()
EMBEDDINGS_DATASETS = load_embeddings_datasets()

# Initialize data loading
# Report overall status so failed HF loads are visible in startup logs.
_json_load_status = (
    "⚠️ Some data failed to load from HF datasets"
    if JSON_DATASETS is None
    else "✅ All data loaded successfully from HF datasets"
)
print(_json_load_status)
160
+
161
# Add this function for backward compatibility
def st_load_file(file_path: Path) -> Any:
    """Load a file using safetensors or other methods"""
    try:
        # Dispatch on extension: safetensors handles its own format,
        # anything else falls back to torch's pickle-based loader.
        if file_path.suffix == '.safetensors':
            import safetensors
            handle = safetensors.safe_open(str(file_path), framework="pt")
            return handle
        import torch
        return torch.load(str(file_path))
    except ImportError:
        print(f"⚠️ Required library not available for loading {file_path}")
        return None
    except Exception as e:
        print(f"❌ Error loading {file_path}: {e}")
        return None
backend/runner/filtering.py CHANGED
@@ -5,7 +5,10 @@ Filtering logic for sentence selection based on topics and creators.
5
  from typing import Any, Dict, List, Set
6
 
7
  # Import data from config (loaded from HF datasets)
8
- from .config import sentences, works, creators, topics
 
 
 
9
 
10
  # Data is now loaded from Hugging Face datasets in config.py
11
  # No need to load from local files anymore
@@ -24,7 +27,7 @@ def get_filtered_sentence_ids(
24
  Set of sentence IDs that match all filters
25
  """
26
  # Start with all sentence IDs
27
- valid_sentence_ids = set(sentences.keys())
28
 
29
  # If no filters, return all sentences
30
  if not filter_topics and not filter_creators:
@@ -38,21 +41,21 @@ def get_filtered_sentence_ids(
38
  # Using topics.json (topic -> works mapping)
39
  # For each selected topic, get all works that have it
40
  for topic_id in filter_topics:
41
- if topic_id in topics:
42
  # Add all works that have this topic
43
- valid_work_ids.update(topics[topic_id])
44
  else:
45
  # If no topic filter, all works are valid so far
46
- valid_work_ids = set(works.keys())
47
 
48
  # Apply creator filter
49
  if filter_creators:
50
  # Direct lookup in creators.json (more efficient)
51
  creator_work_ids = set()
52
  for creator_name in filter_creators:
53
- if creator_name in creators:
54
  # Get all works by this creator directly from creators.json
55
- creator_work_ids.update(creators[creator_name])
56
 
57
  # Intersect with existing valid_work_ids if topics were filtered
58
  if filter_topics:
 
5
  from typing import Any, Dict, List, Set
6
 
7
  # Import data from config (loaded from HF datasets)
8
+ from .config import (
9
+ JSON_INFO_DIR,
10
+ JSON_DATASETS
11
+ )
12
 
13
  # Data is now loaded from Hugging Face datasets in config.py
14
  # No need to load from local files anymore
 
27
  Set of sentence IDs that match all filters
28
  """
29
  # Start with all sentence IDs
30
+ valid_sentence_ids = set(JSON_DATASETS['sentences']['id'])
31
 
32
  # If no filters, return all sentences
33
  if not filter_topics and not filter_creators:
 
41
  # Using topics.json (topic -> works mapping)
42
  # For each selected topic, get all works that have it
43
  for topic_id in filter_topics:
44
+ if topic_id in JSON_DATASETS['topics']:
45
  # Add all works that have this topic
46
+ valid_work_ids.update(JSON_DATASETS['topics'][topic_id])
47
  else:
48
  # If no topic filter, all works are valid so far
49
+ valid_work_ids = set(JSON_DATASETS['works']['id'])
50
 
51
  # Apply creator filter
52
  if filter_creators:
53
  # Direct lookup in creators.json (more efficient)
54
  creator_work_ids = set()
55
  for creator_name in filter_creators:
56
+ if creator_name in JSON_DATASETS['creators']:
57
  # Get all works by this creator directly from creators.json
58
+ creator_work_ids.update(JSON_DATASETS['creators'][creator_name])
59
 
60
  # Intersect with existing valid_work_ids if topics were filtered
61
  if filter_topics:
backend/runner/inference.py CHANGED
@@ -31,14 +31,19 @@ from .filtering import get_filtered_sentence_ids
31
  # on-demand Grad-ECLIP & region-aware ranking
32
  from .heatmap import generate_heatmap
33
  from .config import (
 
 
 
 
34
  PAINTINGCLIP_MODEL_DIR,
35
- EMBEDDINGS_DATASET,
36
- JSON_DATASET,
37
- sentences,
38
- works,
39
- creators,
40
- topics,
41
- topic_names
 
42
  )
43
 
44
  # ─── Configuration ───────────────────────────────────────────────────────────
@@ -65,8 +70,8 @@ TOP_K = 25 # Number of results to return
65
  def load_embeddings_from_hf():
66
  """Load embeddings from HF dataset"""
67
  try:
68
- print(f"πŸ” Loading embeddings from {EMBEDDINGS_DATASET}...")
69
- dataset = load_dataset(EMBEDDINGS_DATASET, split="train")
70
 
71
  # Load CLIP embeddings
72
  clip_embeddings = dataset["clip_embeddings"]
@@ -92,6 +97,9 @@ def _load_sentences_metadata() -> Dict[str, Dict[str, Any]]:
92
  """
93
  Get sentence metadata from global config (loaded from HF datasets).
94
  """
 
 
 
95
  return sentences
96
 
97
  @lru_cache(maxsize=1)
@@ -156,7 +164,7 @@ def _initialize_pipeline():
156
  try:
157
  embeddings_data = load_embeddings_from_hf()
158
  if embeddings_data is None:
159
- raise ValueError(f"Failed to load embeddings from HF dataset: {EMBEDDINGS_DATASET}")
160
 
161
  if MODEL_TYPE == "clip":
162
  embeddings, sentence_ids = embeddings_data["clip"]
@@ -489,3 +497,20 @@ def load_embeddings_for_model(model_type: str):
489
  print(f" - {st_file.name}")
490
  print(f" - {ids_file.name}")
491
  return None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  # on-demand Grad-ECLIP & region-aware ranking
32
  from .heatmap import generate_heatmap
33
  from .config import (
34
+ JSON_INFO_DIR,
35
+ EMBEDDINGS_DIR,
36
+ JSON_DATASETS,
37
+ EMBEDDINGS_DATASETS,
38
  PAINTINGCLIP_MODEL_DIR,
39
+ ARTEFACT_EMBEDDINGS_DATASET,
40
+ sentences, # Add this
41
+ CLIP_EMBEDDINGS_ST, # Add these for backward compatibility
42
+ PAINTINGCLIP_EMBEDDINGS_ST,
43
+ CLIP_SENTENCE_IDS,
44
+ PAINTINGCLIP_SENTENCE_IDS,
45
+ CLIP_EMBEDDINGS_DIR,
46
+ PAINTINGCLIP_EMBEDDINGS_DIR
47
  )
48
 
49
  # ─── Configuration ───────────────────────────────────────────────────────────
 
70
  def load_embeddings_from_hf():
71
  """Load embeddings from HF dataset"""
72
  try:
73
+ print(f"πŸ” Loading embeddings from {ARTEFACT_EMBEDDINGS_DATASET}...")
74
+ dataset = load_dataset(ARTEFACT_EMBEDDINGS_DATASET, split="train")
75
 
76
  # Load CLIP embeddings
77
  clip_embeddings = dataset["clip_embeddings"]
 
97
  """
98
  Get sentence metadata from global config (loaded from HF datasets).
99
  """
100
+ if not sentences:
101
+ print("⚠️ No sentence metadata available - check if HF datasets loaded successfully")
102
+ return {}
103
  return sentences
104
 
105
  @lru_cache(maxsize=1)
 
164
  try:
165
  embeddings_data = load_embeddings_from_hf()
166
  if embeddings_data is None:
167
+ raise ValueError(f"Failed to load embeddings from HF dataset: {ARTEFACT_EMBEDDINGS_DATASET}")
168
 
169
  if MODEL_TYPE == "clip":
170
  embeddings, sentence_ids = embeddings_data["clip"]
 
497
  print(f" - {st_file.name}")
498
  print(f" - {ids_file.name}")
499
  return None, None
500
+
501
# Add this function for backward compatibility
def st_load_file(file_path: Path) -> Any:
    """Load a file using safetensors or other methods"""
    # NOTE(review): this duplicates config.st_load_file — consider importing
    # the shared helper instead of keeping two copies in sync.
    try:
        if file_path.suffix != '.safetensors':
            # Non-safetensors files go through torch's generic loader.
            import torch
            return torch.load(str(file_path))
        import safetensors
        return safetensors.safe_open(str(file_path), framework="pt")
    except ImportError:
        print(f"⚠️ Required library not available for loading {file_path}")
        return None
    except Exception as e:
        print(f"❌ Error loading {file_path}: {e}")
        return None
requirements.txt CHANGED
@@ -6,7 +6,7 @@ flask-cors
6
  # Hugging Face ecosystem
7
  huggingface_hub>=0.20
8
  hf_transfer>=0.1.4
9
- datasets>=2.14.0
10
 
11
  # Core ML libraries
12
  torch>=2.0.0
 
6
  # Hugging Face ecosystem
7
  huggingface_hub>=0.20
8
  hf_transfer>=0.1.4
9
+ datasets>=2.0.0
10
 
11
  # Core ML libraries
12
  torch>=2.0.0