samwaugh committed on
Commit
e4db11d
·
1 Parent(s): 4f1c614

Try to fix

Browse files
backend/runner/config.py CHANGED
@@ -97,20 +97,37 @@ def load_json_datasets() -> Optional[Dict[str, Any]]:
97
  try:
98
  print("🔄 Loading data from Hugging Face datasets...")
99
 
100
- creators_dataset = load_dataset(ARTEFACT_JSON_DATASET, 'creators.json', split='train')
101
- sentences_dataset = load_dataset(ARTEFACT_JSON_DATASET, 'sentences.json', split='train')
102
- works_dataset = load_dataset(ARTEFACT_JSON_DATASET, 'works.json', split='train')
103
- topics_dataset = load_dataset(ARTEFACT_JSON_DATASET, 'topics.json', split='train')
104
- topic_names_dataset = load_dataset(ARTEFACT_JSON_DATASET, 'topic_names.json', split='train')
105
 
106
- # Convert to dictionaries for backward compatibility
 
 
 
 
107
  global sentences, works, creators, topics, topic_names
108
 
109
- sentences = {str(i): item for i, item in enumerate(sentences_dataset)}
110
- works = {str(i): item for i, item in enumerate(works_dataset)}
111
- creators = {str(i): item for i, item in enumerate(creators_dataset)}
112
- topics = {str(i): item for i, item in enumerate(topics_dataset)}
113
- topic_names = {str(i): item for i, item in enumerate(topic_names_dataset)}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
  print(f"✅ Successfully loaded JSON datasets from HF:")
116
  print(f" Sentences: {len(sentences)} entries")
@@ -119,13 +136,7 @@ def load_json_datasets() -> Optional[Dict[str, Any]]:
119
  print(f" Topics: {len(topics)} entries")
120
  print(f" Topic Names: {len(topic_names)} entries")
121
 
122
- return {
123
- 'creators': creators_dataset,
124
- 'sentences': sentences_dataset,
125
- 'works': works_dataset,
126
- 'topics': topics_dataset,
127
- 'topic_names': topic_names_dataset
128
- }
129
  except Exception as e:
130
  print(f"โŒ Failed to load JSON datasets from HF: {e}")
131
  return None
@@ -137,13 +148,16 @@ def load_embeddings_datasets() -> Optional[Dict[str, Any]]:
137
  return None
138
 
139
  try:
140
- clip_embeddings = load_dataset(ARTEFACT_EMBEDDINGS_DATASET, 'clip_embeddings.safetensors', split='train')
141
- paintingclip_embeddings = load_dataset(ARTEFACT_EMBEDDINGS_DATASET, 'paintingclip_embeddings.safetensors', split='train')
 
 
 
 
 
142
 
143
- return {
144
- 'clip': clip_embeddings,
145
- 'paintingclip': paintingclip_embeddings
146
- }
147
  except Exception as e:
148
  print(f"โŒ Failed to load embeddings datasets from HF: {e}")
149
  return None
 
97
  try:
98
  print("🔄 Loading data from Hugging Face datasets...")
99
 
100
+ # Load the entire dataset (it should contain all JSON data)
101
+ dataset = load_dataset(ARTEFACT_JSON_DATASET, split='train')
 
 
 
102
 
103
+ print(f" Dataset columns: {dataset.column_names}")
104
+ print(f"๐Ÿ” Dataset length: {len(dataset)}")
105
+
106
+ # The dataset should contain all the JSON data in a single table
107
+ # We need to extract the different data types from the columns
108
  global sentences, works, creators, topics, topic_names
109
 
110
+ # Initialize empty dictionaries
111
+ sentences = {}
112
+ works = {}
113
+ creators = {}
114
+ topics = {}
115
+ topic_names = {}
116
+
117
+ # Process the dataset based on its actual structure
118
+ # This will depend on how the data was uploaded
119
+ for i, item in enumerate(dataset):
120
+ # Check what type of data this item contains
121
+ if 'sentence_id' in item:
122
+ sentences[str(i)] = item
123
+ elif 'work_id' in item:
124
+ works[str(i)] = item
125
+ elif 'creator_name' in item:
126
+ creators[str(i)] = item
127
+ elif 'topic_id' in item:
128
+ topics[str(i)] = item
129
+ elif 'topic_name' in item:
130
+ topic_names[str(i)] = item
131
 
132
  print(f"✅ Successfully loaded JSON datasets from HF:")
133
  print(f" Sentences: {len(sentences)} entries")
 
136
  print(f" Topics: {len(topics)} entries")
137
  print(f" Topic Names: {len(topic_names)} entries")
138
 
139
+ return dataset
 
 
 
 
 
 
140
  except Exception as e:
141
  print(f"โŒ Failed to load JSON datasets from HF: {e}")
142
  return None
 
148
  return None
149
 
150
  try:
151
+ print(f" Loading embeddings from {ARTEFACT_EMBEDDINGS_DATASET}...")
152
+
153
+ # Load the entire dataset
154
+ dataset = load_dataset(ARTEFACT_EMBEDDINGS_DATASET, split='train')
155
+
156
+ print(f" Embeddings dataset columns: {dataset.column_names}")
157
+ print(f" Embeddings dataset length: {len(dataset)}")
158
 
159
+ # Return the dataset for inspection
160
+ return dataset
 
 
161
  except Exception as e:
162
  print(f"โŒ Failed to load embeddings datasets from HF: {e}")
163
  return None
backend/runner/filtering.py CHANGED
@@ -18,60 +18,11 @@ def get_filtered_sentence_ids(
18
  ) -> Set[str]:
19
  """
20
  Get the set of sentence IDs that match the given filters.
21
-
22
- Args:
23
- filter_topics: List of topic codes to filter by (e.g., ["C2778983918", ...])
24
- filter_creators: List of creator names to filter by
25
-
26
- Returns:
27
- Set of sentence IDs that match all filters
28
  """
29
- # Start with all sentence IDs
30
- valid_sentence_ids = set(JSON_DATASETS['sentences']['id'])
31
-
32
- # If no filters, return all sentences
33
- if not filter_topics and not filter_creators:
34
- return valid_sentence_ids
35
-
36
- # Build set of valid work IDs based on filters
37
- valid_work_ids = set()
38
-
39
- # Apply topic filter
40
- if filter_topics:
41
- # Using topics.json (topic -> works mapping)
42
- # For each selected topic, get all works that have it
43
- for topic_id in filter_topics:
44
- if topic_id in JSON_DATASETS['topics']:
45
- # Add all works that have this topic
46
- valid_work_ids.update(JSON_DATASETS['topics'][topic_id])
47
- else:
48
- # If no topic filter, all works are valid so far
49
- valid_work_ids = set(JSON_DATASETS['works']['id'])
50
-
51
- # Apply creator filter
52
- if filter_creators:
53
- # Direct lookup in creators.json (more efficient)
54
- creator_work_ids = set()
55
- for creator_name in filter_creators:
56
- if creator_name in JSON_DATASETS['creators']:
57
- # Get all works by this creator directly from creators.json
58
- creator_work_ids.update(JSON_DATASETS['creators'][creator_name])
59
-
60
- # Intersect with existing valid_work_ids if topics were filtered
61
- if filter_topics:
62
- valid_work_ids = valid_work_ids.intersection(creator_work_ids)
63
- else:
64
- valid_work_ids = creator_work_ids
65
-
66
- # Now filter sentences to only those from valid works
67
- filtered_sentence_ids = set()
68
- for sentence_id in valid_sentence_ids:
69
- # Extract work ID from sentence ID (format: WORKID_sXXXX)
70
- work_id = sentence_id.split("_")[0]
71
- if work_id in valid_work_ids:
72
- filtered_sentence_ids.add(sentence_id)
73
-
74
- return filtered_sentence_ids
75
 
76
 
77
  def apply_filters_to_results(
 
18
  ) -> Set[str]:
19
  """
20
  Get the set of sentence IDs that match the given filters.
 
 
 
 
 
 
 
21
  """
22
+ # For now, return empty set since data loading is failing
23
+ # This will be fixed once we understand the actual dataset structure
24
+ print("โš ๏ธ Filtering disabled - data not loaded properly")
25
+ return set()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
 
28
  def apply_filters_to_results(
backend/runner/inference.py CHANGED
@@ -73,22 +73,15 @@ def load_embeddings_from_hf():
73
  print(f"๐Ÿ” Loading embeddings from {ARTEFACT_EMBEDDINGS_DATASET}...")
74
  dataset = load_dataset(ARTEFACT_EMBEDDINGS_DATASET, split="train")
75
 
76
- # Load CLIP embeddings
77
- clip_embeddings = dataset["clip_embeddings"]
78
- clip_sentence_ids = dataset["clip_embeddings_sentence_ids"]
79
 
80
- # Load PaintingCLIP embeddings
81
- paintingclip_embeddings = dataset["paintingclip_embeddings"]
82
- paintingclip_sentence_ids = dataset["paintingclip_embeddings_sentence_ids"]
 
83
 
84
- print(f"✅ Successfully loaded embeddings from HF:")
85
- print(f" CLIP: {len(clip_sentence_ids)} embeddings")
86
- print(f" PaintingCLIP: {len(paintingclip_sentence_ids)} embeddings")
87
-
88
- return {
89
- "clip": (clip_embeddings, clip_sentence_ids),
90
- "paintingclip": (paintingclip_embeddings, paintingclip_sentence_ids)
91
- }
92
  except Exception as e:
93
  print(f"โŒ Failed to load embeddings from HF: {e}")
94
  return None
 
73
  print(f"๐Ÿ” Loading embeddings from {ARTEFACT_EMBEDDINGS_DATASET}...")
74
  dataset = load_dataset(ARTEFACT_EMBEDDINGS_DATASET, split="train")
75
 
76
+ print(f"✅ Dataset columns: {dataset.column_names}")
77
+ print(f"๐Ÿ” Dataset length: {len(dataset)}")
 
78
 
79
+ # We need to understand the actual structure of the embeddings dataset
80
+ # For now, let's return the dataset for inspection
81
+ print("โš ๏ธ Embeddings dataset structure needs to be analyzed")
82
+ print("โš ๏ธ Please check the console output above to see available columns")
83
 
84
+ return None
 
 
 
 
 
 
 
85
  except Exception as e:
86
  print(f"โŒ Failed to load embeddings from HF: {e}")
87
  return None