Spaces:

samwaugh
/

ArteFact

Paused

App Files Files Community

samwaugh committed on Sep 1, 2025

Commit

e4db11d

1 Parent(s): 4f1c614

Try to fix

Browse files

Files changed (3) hide show

backend/runner/config.py +38 -24
backend/runner/filtering.py +4 -53
backend/runner/inference.py +7 -14

backend/runner/config.py CHANGED Viewed

@@ -97,20 +97,37 @@ def load_json_datasets() -> Optional[Dict[str, Any]]:
     try:
         print("🔄 Loading data from Hugging Face datasets...")
-        creators_dataset = load_dataset(ARTEFACT_JSON_DATASET, 'creators.json', split='train')
-        sentences_dataset = load_dataset(ARTEFACT_JSON_DATASET, 'sentences.json', split='train')
-        works_dataset = load_dataset(ARTEFACT_JSON_DATASET, 'works.json', split='train')
-        topics_dataset = load_dataset(ARTEFACT_JSON_DATASET, 'topics.json', split='train')
-        topic_names_dataset = load_dataset(ARTEFACT_JSON_DATASET, 'topic_names.json', split='train')
-        # Convert to dictionaries for backward compatibility
         global sentences, works, creators, topics, topic_names
-        sentences = {str(i): item for i, item in enumerate(sentences_dataset)}
-        works = {str(i): item for i, item in enumerate(works_dataset)}
-        creators = {str(i): item for i, item in enumerate(creators_dataset)}
-        topics = {str(i): item for i, item in enumerate(topics_dataset)}
-        topic_names = {str(i): item for i, item in enumerate(topic_names_dataset)}
         print(f"✅ Successfully loaded JSON datasets from HF:")
         print(f"   Sentences: {len(sentences)} entries")
@@ -119,13 +136,7 @@ def load_json_datasets() -> Optional[Dict[str, Any]]:
         print(f"   Topics: {len(topics)} entries")
         print(f"   Topic Names: {len(topic_names)} entries")
-        return {
-            'creators': creators_dataset,
-            'sentences': sentences_dataset,
-            'works': works_dataset,
-            'topics': topics_dataset,
-            'topic_names': topic_names_dataset
-        }
     except Exception as e:
         print(f"❌ Failed to load JSON datasets from HF: {e}")
         return None
@@ -137,13 +148,16 @@ def load_embeddings_datasets() -> Optional[Dict[str, Any]]:
         return None
     try:
-        clip_embeddings = load_dataset(ARTEFACT_EMBEDDINGS_DATASET, 'clip_embeddings.safetensors', split='train')
-        paintingclip_embeddings = load_dataset(ARTEFACT_EMBEDDINGS_DATASET, 'paintingclip_embeddings.safetensors', split='train')
-        return {
-            'clip': clip_embeddings,
-            'paintingclip': paintingclip_embeddings
-        }
     except Exception as e:
         print(f"❌ Failed to load embeddings datasets from HF: {e}")
         return None

     try:
         print("🔄 Loading data from Hugging Face datasets...")
+        # Load the entire dataset (it should contain all JSON data)
+        dataset = load_dataset(ARTEFACT_JSON_DATASET, split='train')
+        print(f" Dataset columns: {dataset.column_names}")
+        print(f"🔍 Dataset length: {len(dataset)}")
+        # The dataset should contain all the JSON data in a single table
+        # We need to extract the different data types from the columns
         global sentences, works, creators, topics, topic_names
+        # Initialize empty dictionaries
+        sentences = {}
+        works = {}
+        creators = {}
+        topics = {}
+        topic_names = {}
+        # Process the dataset based on its actual structure
+        # This will depend on how the data was uploaded
+        for i, item in enumerate(dataset):
+            # Check what type of data this item contains
+            if 'sentence_id' in item:
+                sentences[str(i)] = item
+            elif 'work_id' in item:
+                works[str(i)] = item
+            elif 'creator_name' in item:
+                creators[str(i)] = item
+            elif 'topic_id' in item:
+                topics[str(i)] = item
+            elif 'topic_name' in item:
+                topic_names[str(i)] = item
         print(f"✅ Successfully loaded JSON datasets from HF:")
         print(f"   Sentences: {len(sentences)} entries")
         print(f"   Topics: {len(topics)} entries")
         print(f"   Topic Names: {len(topic_names)} entries")
+        return dataset
     except Exception as e:
         print(f"❌ Failed to load JSON datasets from HF: {e}")
         return None
         return None
     try:
+        print(f" Loading embeddings from {ARTEFACT_EMBEDDINGS_DATASET}...")
+        # Load the entire dataset
+        dataset = load_dataset(ARTEFACT_EMBEDDINGS_DATASET, split='train')
+        print(f" Embeddings dataset columns: {dataset.column_names}")
+        print(f" Embeddings dataset length: {len(dataset)}")
+        # Return the dataset for inspection
+        return dataset
     except Exception as e:
         print(f"❌ Failed to load embeddings datasets from HF: {e}")
         return None

backend/runner/filtering.py CHANGED Viewed

@@ -18,60 +18,11 @@ def get_filtered_sentence_ids(
 ) -> Set[str]:
     """
     Get the set of sentence IDs that match the given filters.
-    Args:
-        filter_topics: List of topic codes to filter by (e.g., ["C2778983918", ...])
-        filter_creators: List of creator names to filter by
-    Returns:
-        Set of sentence IDs that match all filters
     """
-    # Start with all sentence IDs
-    valid_sentence_ids = set(JSON_DATASETS['sentences']['id'])
-    # If no filters, return all sentences
-    if not filter_topics and not filter_creators:
-        return valid_sentence_ids
-    # Build set of valid work IDs based on filters
-    valid_work_ids = set()
-    # Apply topic filter
-    if filter_topics:
-        # Using topics.json (topic -> works mapping)
-        # For each selected topic, get all works that have it
-        for topic_id in filter_topics:
-            if topic_id in JSON_DATASETS['topics']:
-                # Add all works that have this topic
-                valid_work_ids.update(JSON_DATASETS['topics'][topic_id])
-    else:
-        # If no topic filter, all works are valid so far
-        valid_work_ids = set(JSON_DATASETS['works']['id'])
-    # Apply creator filter
-    if filter_creators:
-        # Direct lookup in creators.json (more efficient)
-        creator_work_ids = set()
-        for creator_name in filter_creators:
-            if creator_name in JSON_DATASETS['creators']:
-                # Get all works by this creator directly from creators.json
-                creator_work_ids.update(JSON_DATASETS['creators'][creator_name])
-        # Intersect with existing valid_work_ids if topics were filtered
-        if filter_topics:
-            valid_work_ids = valid_work_ids.intersection(creator_work_ids)
-        else:
-            valid_work_ids = creator_work_ids
-    # Now filter sentences to only those from valid works
-    filtered_sentence_ids = set()
-    for sentence_id in valid_sentence_ids:
-        # Extract work ID from sentence ID (format: WORKID_sXXXX)
-        work_id = sentence_id.split("_")[0]
-        if work_id in valid_work_ids:
-            filtered_sentence_ids.add(sentence_id)
-    return filtered_sentence_ids
 def apply_filters_to_results(

 ) -> Set[str]:
     """
     Get the set of sentence IDs that match the given filters.
     """
+    # For now, return empty set since data loading is failing
+    # This will be fixed once we understand the actual dataset structure
+    print("⚠️  Filtering disabled - data not loaded properly")
+    return set()
 def apply_filters_to_results(

backend/runner/inference.py CHANGED Viewed

@@ -73,22 +73,15 @@ def load_embeddings_from_hf():
         print(f"🔍 Loading embeddings from {ARTEFACT_EMBEDDINGS_DATASET}...")
         dataset = load_dataset(ARTEFACT_EMBEDDINGS_DATASET, split="train")
-        # Load CLIP embeddings
-        clip_embeddings = dataset["clip_embeddings"]
-        clip_sentence_ids = dataset["clip_embeddings_sentence_ids"]
-        # Load PaintingCLIP embeddings
-        paintingclip_embeddings = dataset["paintingclip_embeddings"]
-        paintingclip_sentence_ids = dataset["paintingclip_embeddings_sentence_ids"]
-        print(f"✅ Successfully loaded embeddings from HF:")
-        print(f"   CLIP: {len(clip_sentence_ids)} embeddings")
-        print(f"   PaintingCLIP: {len(paintingclip_sentence_ids)} embeddings")
-        return {
-            "clip": (clip_embeddings, clip_sentence_ids),
-            "paintingclip": (paintingclip_embeddings, paintingclip_sentence_ids)
-        }
     except Exception as e:
         print(f"❌ Failed to load embeddings from HF: {e}")
         return None

         print(f"🔍 Loading embeddings from {ARTEFACT_EMBEDDINGS_DATASET}...")
         dataset = load_dataset(ARTEFACT_EMBEDDINGS_DATASET, split="train")
+        print(f"✅ Dataset columns: {dataset.column_names}")
+        print(f"🔍 Dataset length: {len(dataset)}")
+        # We need to understand the actual structure of the embeddings dataset
+        # For now, let's return the dataset for inspection
+        print("⚠️  Embeddings dataset structure needs to be analyzed")
+        print("⚠️  Please check the console output above to see available columns")
+        return None
     except Exception as e:
         print(f"❌ Failed to load embeddings from HF: {e}")
         return None