Try to fix changes
Browse files
- backend/runner/app.py +3 -1
- backend/runner/config.py +113 -46
- backend/runner/filtering.py +10 -7
- backend/runner/inference.py +35 -10
- requirements.txt +1 -1
backend/runner/app.py
CHANGED
|
@@ -98,7 +98,9 @@ from .config import (
|
|
| 98 |
ARTIFACTS_DIR,
|
| 99 |
OUTPUTS_DIR,
|
| 100 |
JSON_INFO_DIR,
|
| 101 |
-
MARKER_DIR
|
|
|
|
|
|
|
| 102 |
)
|
| 103 |
|
| 104 |
# Import data from config (loaded from HF datasets)
|
|
|
|
| 98 |
ARTIFACTS_DIR,
|
| 99 |
OUTPUTS_DIR,
|
| 100 |
JSON_INFO_DIR,
|
| 101 |
+
MARKER_DIR,
|
| 102 |
+
JSON_DATASETS,
|
| 103 |
+
EMBEDDINGS_DATASETS
|
| 104 |
)
|
| 105 |
|
| 106 |
# Import data from config (loaded from HF datasets)
|
backend/runner/config.py
CHANGED
|
@@ -5,12 +5,33 @@ All runner modules should import from this module instead of defining their own
|
|
| 5 |
|
| 6 |
import os
|
| 7 |
from pathlib import Path
|
| 8 |
-
from
|
| 9 |
|
| 10 |
-
#
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
# READ root (repo data - read-only)
|
| 16 |
PROJECT_ROOT = Path(__file__).resolve().parents[2]
|
|
@@ -60,50 +81,96 @@ for dir_path in [OUTPUTS_DIR, ARTIFACTS_DIR]:
|
|
| 60 |
print(f"⚠️ Could not create directory {dir_path}: {e}")
|
| 61 |
|
| 62 |
# Global data variables (will be populated from HF datasets)
|
| 63 |
-
sentences = {}
|
| 64 |
-
works = {}
|
| 65 |
-
creators = {}
|
| 66 |
-
topics = {}
|
| 67 |
-
topic_names = {}
|
| 68 |
-
|
| 69 |
-
def load_json_from_hf(dataset_name: str, file_name: str):
|
| 70 |
-
"""Load JSON data from Hugging Face dataset"""
|
| 71 |
-
try:
|
| 72 |
-
dataset = load_dataset(dataset_name, split="train")
|
| 73 |
-
# Access the specific file content
|
| 74 |
-
return dataset[file_name]
|
| 75 |
-
except Exception as e:
|
| 76 |
-
print(f"Failed to load {file_name} from HF: {e}")
|
| 77 |
-
return None
|
| 78 |
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
print(f" Sentences: {len(sentences)} entries")
|
| 95 |
print(f" Works: {len(works)} entries")
|
| 96 |
-
print(f" Topics: {len(topics)} entries")
|
| 97 |
print(f" Creators: {len(creators)} entries")
|
| 98 |
-
print(f"
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
# Initialize data loading
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
import os
|
| 7 |
from pathlib import Path
|
| 8 |
+
from typing import Any, Dict, Optional
|
| 9 |
|
| 10 |
+
# Try to import datasets, but handle gracefully if not available
|
| 11 |
+
try:
|
| 12 |
+
from datasets import load_dataset
|
| 13 |
+
DATASETS_AVAILABLE = True
|
| 14 |
+
except ImportError:
|
| 15 |
+
print("⚠️ datasets library not available - HF dataset loading disabled")
|
| 16 |
+
DATASETS_AVAILABLE = False
|
| 17 |
+
|
| 18 |
+
# Environment variables for dataset names
|
| 19 |
+
ARTEFACT_JSON_DATASET = os.getenv('ARTEFACT_JSON_DATASET', 'samwaugh/artefact-json')
|
| 20 |
+
ARTEFACT_EMBEDDINGS_DATASET = os.getenv('ARTEFACT_EMBEDDINGS_DATASET', 'samwaugh/artefact-embeddings')
|
| 21 |
+
ARTEFACT_MARKDOWN_DATASET = os.getenv('ARTEFACT_MARKDOWN_DATASET', 'samwaugh/artefact-markdown')
|
| 22 |
+
|
| 23 |
+
# Legacy path variables for backward compatibility
|
| 24 |
+
JSON_INFO_DIR = "/data/hub/datasets--samwaugh--artefact-json/snapshots/latest"
|
| 25 |
+
EMBEDDINGS_DIR = "/data/hub/datasets--samwaugh--artefact-embeddings/snapshots/latest"
|
| 26 |
+
MARKDOWN_DIR = "/data/hub/datasets--samwaugh--artefact-markdown/snapshots/latest"
|
| 27 |
+
|
| 28 |
+
# Embedding file paths for backward compatibility
|
| 29 |
+
CLIP_EMBEDDINGS_ST = Path(EMBEDDINGS_DIR) / "clip_embeddings.safetensors"
|
| 30 |
+
PAINTINGCLIP_EMBEDDINGS_ST = Path(EMBEDDINGS_DIR) / "paintingclip_embeddings.safetensors"
|
| 31 |
+
CLIP_SENTENCE_IDS = Path(EMBEDDINGS_DIR) / "clip_embeddings_sentence_ids.json"
|
| 32 |
+
PAINTINGCLIP_SENTENCE_IDS = Path(EMBEDDINGS_DIR) / "paintingclip_embeddings_sentence_ids.json"
|
| 33 |
+
CLIP_EMBEDDINGS_DIR = EMBEDDINGS_DIR
|
| 34 |
+
PAINTINGCLIP_EMBEDDINGS_DIR = EMBEDDINGS_DIR
|
| 35 |
|
| 36 |
# READ root (repo data - read-only)
|
| 37 |
PROJECT_ROOT = Path(__file__).resolve().parents[2]
|
|
|
|
| 81 |
print(f"⚠️ Could not create directory {dir_path}: {e}")
|
| 82 |
|
| 83 |
# Global data variables (will be populated from HF datasets)
|
| 84 |
+
sentences: Dict[str, Any] = {}
|
| 85 |
+
works: Dict[str, Any] = {}
|
| 86 |
+
creators: Dict[str, Any] = {}
|
| 87 |
+
topics: Dict[str, Any] = {}
|
| 88 |
+
topic_names: Dict[str, Any] = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
+
# Load datasets from Hugging Face
|
| 91 |
+
def load_json_datasets() -> Optional[Dict[str, Any]]:
|
| 92 |
+
"""Load all JSON datasets from Hugging Face"""
|
| 93 |
+
if not DATASETS_AVAILABLE:
|
| 94 |
+
print("⚠️ datasets library not available - skipping HF dataset loading")
|
| 95 |
+
return None
|
| 96 |
+
|
| 97 |
+
try:
|
| 98 |
+
print("π Loading data from Hugging Face datasets...")
|
| 99 |
+
|
| 100 |
+
creators_dataset = load_dataset(ARTEFACT_JSON_DATASET, 'creators.json', split='train')
|
| 101 |
+
sentences_dataset = load_dataset(ARTEFACT_JSON_DATASET, 'sentences.json', split='train')
|
| 102 |
+
works_dataset = load_dataset(ARTEFACT_JSON_DATASET, 'works.json', split='train')
|
| 103 |
+
topics_dataset = load_dataset(ARTEFACT_JSON_DATASET, 'topics.json', split='train')
|
| 104 |
+
topic_names_dataset = load_dataset(ARTEFACT_JSON_DATASET, 'topic_names.json', split='train')
|
| 105 |
+
|
| 106 |
+
# Convert to dictionaries for backward compatibility
|
| 107 |
+
global sentences, works, creators, topics, topic_names
|
| 108 |
+
|
| 109 |
+
sentences = {str(i): item for i, item in enumerate(sentences_dataset)}
|
| 110 |
+
works = {str(i): item for i, item in enumerate(works_dataset)}
|
| 111 |
+
creators = {str(i): item for i, item in enumerate(creators_dataset)}
|
| 112 |
+
topics = {str(i): item for i, item in enumerate(topics_dataset)}
|
| 113 |
+
topic_names = {str(i): item for i, item in enumerate(topic_names_dataset)}
|
| 114 |
+
|
| 115 |
+
print(f"✅ Successfully loaded JSON datasets from HF:")
|
| 116 |
print(f" Sentences: {len(sentences)} entries")
|
| 117 |
print(f" Works: {len(works)} entries")
|
|
|
|
| 118 |
print(f" Creators: {len(creators)} entries")
|
| 119 |
+
print(f" Topics: {len(topics)} entries")
|
| 120 |
+
print(f" Topic Names: {len(topic_names)} entries")
|
| 121 |
+
|
| 122 |
+
return {
|
| 123 |
+
'creators': creators_dataset,
|
| 124 |
+
'sentences': sentences_dataset,
|
| 125 |
+
'works': works_dataset,
|
| 126 |
+
'topics': topics_dataset,
|
| 127 |
+
'topic_names': topic_names_dataset
|
| 128 |
+
}
|
| 129 |
+
except Exception as e:
|
| 130 |
+
print(f"❌ Failed to load JSON datasets from HF: {e}")
|
| 131 |
+
return None
|
| 132 |
+
|
| 133 |
+
def load_embeddings_datasets() -> Optional[Dict[str, Any]]:
|
| 134 |
+
"""Load embeddings datasets from Hugging Face"""
|
| 135 |
+
if not DATASETS_AVAILABLE:
|
| 136 |
+
print("⚠️ datasets library not available - skipping HF embeddings loading")
|
| 137 |
+
return None
|
| 138 |
+
|
| 139 |
+
try:
|
| 140 |
+
clip_embeddings = load_dataset(ARTEFACT_EMBEDDINGS_DATASET, 'clip_embeddings.safetensors', split='train')
|
| 141 |
+
paintingclip_embeddings = load_dataset(ARTEFACT_EMBEDDINGS_DATASET, 'paintingclip_embeddings.safetensors', split='train')
|
| 142 |
+
|
| 143 |
+
return {
|
| 144 |
+
'clip': clip_embeddings,
|
| 145 |
+
'paintingclip': paintingclip_embeddings
|
| 146 |
+
}
|
| 147 |
+
except Exception as e:
|
| 148 |
+
print(f"❌ Failed to load embeddings datasets from HF: {e}")
|
| 149 |
+
return None
|
| 150 |
+
|
| 151 |
+
# Initialize datasets
|
| 152 |
+
JSON_DATASETS = load_json_datasets()
|
| 153 |
+
EMBEDDINGS_DATASETS = load_embeddings_datasets()
|
| 154 |
|
| 155 |
# Initialize data loading
|
| 156 |
+
if JSON_DATASETS is None:
|
| 157 |
+
print("⚠️ Some data failed to load from HF datasets")
|
| 158 |
+
else:
|
| 159 |
+
print("✅ All data loaded successfully from HF datasets")
|
| 160 |
+
|
| 161 |
+
# Add this function for backward compatibility
|
| 162 |
+
def st_load_file(file_path: Path) -> Any:
|
| 163 |
+
"""Load a file using safetensors or other methods"""
|
| 164 |
+
try:
|
| 165 |
+
if file_path.suffix == '.safetensors':
|
| 166 |
+
import safetensors
|
| 167 |
+
return safetensors.safe_open(str(file_path), framework="pt")
|
| 168 |
+
else:
|
| 169 |
+
import torch
|
| 170 |
+
return torch.load(str(file_path))
|
| 171 |
+
except ImportError:
|
| 172 |
+
print(f"⚠️ Required library not available for loading {file_path}")
|
| 173 |
+
return None
|
| 174 |
+
except Exception as e:
|
| 175 |
+
print(f"❌ Error loading {file_path}: {e}")
|
| 176 |
+
return None
|
backend/runner/filtering.py
CHANGED
|
@@ -5,7 +5,10 @@ Filtering logic for sentence selection based on topics and creators.
|
|
| 5 |
from typing import Any, Dict, List, Set
|
| 6 |
|
| 7 |
# Import data from config (loaded from HF datasets)
|
| 8 |
-
from .config import
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
# Data is now loaded from Hugging Face datasets in config.py
|
| 11 |
# No need to load from local files anymore
|
|
@@ -24,7 +27,7 @@ def get_filtered_sentence_ids(
|
|
| 24 |
Set of sentence IDs that match all filters
|
| 25 |
"""
|
| 26 |
# Start with all sentence IDs
|
| 27 |
-
valid_sentence_ids = set(sentences
|
| 28 |
|
| 29 |
# If no filters, return all sentences
|
| 30 |
if not filter_topics and not filter_creators:
|
|
@@ -38,21 +41,21 @@ def get_filtered_sentence_ids(
|
|
| 38 |
# Using topics.json (topic -> works mapping)
|
| 39 |
# For each selected topic, get all works that have it
|
| 40 |
for topic_id in filter_topics:
|
| 41 |
-
if topic_id in topics:
|
| 42 |
# Add all works that have this topic
|
| 43 |
-
valid_work_ids.update(topics[topic_id])
|
| 44 |
else:
|
| 45 |
# If no topic filter, all works are valid so far
|
| 46 |
-
valid_work_ids = set(works
|
| 47 |
|
| 48 |
# Apply creator filter
|
| 49 |
if filter_creators:
|
| 50 |
# Direct lookup in creators.json (more efficient)
|
| 51 |
creator_work_ids = set()
|
| 52 |
for creator_name in filter_creators:
|
| 53 |
-
if creator_name in creators:
|
| 54 |
# Get all works by this creator directly from creators.json
|
| 55 |
-
creator_work_ids.update(creators[creator_name])
|
| 56 |
|
| 57 |
# Intersect with existing valid_work_ids if topics were filtered
|
| 58 |
if filter_topics:
|
|
|
|
| 5 |
from typing import Any, Dict, List, Set
|
| 6 |
|
| 7 |
# Import data from config (loaded from HF datasets)
|
| 8 |
+
from .config import (
|
| 9 |
+
JSON_INFO_DIR,
|
| 10 |
+
JSON_DATASETS
|
| 11 |
+
)
|
| 12 |
|
| 13 |
# Data is now loaded from Hugging Face datasets in config.py
|
| 14 |
# No need to load from local files anymore
|
|
|
|
| 27 |
Set of sentence IDs that match all filters
|
| 28 |
"""
|
| 29 |
# Start with all sentence IDs
|
| 30 |
+
valid_sentence_ids = set(JSON_DATASETS['sentences']['id'])
|
| 31 |
|
| 32 |
# If no filters, return all sentences
|
| 33 |
if not filter_topics and not filter_creators:
|
|
|
|
| 41 |
# Using topics.json (topic -> works mapping)
|
| 42 |
# For each selected topic, get all works that have it
|
| 43 |
for topic_id in filter_topics:
|
| 44 |
+
if topic_id in JSON_DATASETS['topics']:
|
| 45 |
# Add all works that have this topic
|
| 46 |
+
valid_work_ids.update(JSON_DATASETS['topics'][topic_id])
|
| 47 |
else:
|
| 48 |
# If no topic filter, all works are valid so far
|
| 49 |
+
valid_work_ids = set(JSON_DATASETS['works']['id'])
|
| 50 |
|
| 51 |
# Apply creator filter
|
| 52 |
if filter_creators:
|
| 53 |
# Direct lookup in creators.json (more efficient)
|
| 54 |
creator_work_ids = set()
|
| 55 |
for creator_name in filter_creators:
|
| 56 |
+
if creator_name in JSON_DATASETS['creators']:
|
| 57 |
# Get all works by this creator directly from creators.json
|
| 58 |
+
creator_work_ids.update(JSON_DATASETS['creators'][creator_name])
|
| 59 |
|
| 60 |
# Intersect with existing valid_work_ids if topics were filtered
|
| 61 |
if filter_topics:
|
backend/runner/inference.py
CHANGED
|
@@ -31,14 +31,19 @@ from .filtering import get_filtered_sentence_ids
|
|
| 31 |
# on-demand Grad-ECLIP & region-aware ranking
|
| 32 |
from .heatmap import generate_heatmap
|
| 33 |
from .config import (
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
PAINTINGCLIP_MODEL_DIR,
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
|
|
|
| 42 |
)
|
| 43 |
|
| 44 |
# ─── Configuration ───────────────────────────────────────────────────────────
|
|
@@ -65,8 +70,8 @@ TOP_K = 25 # Number of results to return
|
|
| 65 |
def load_embeddings_from_hf():
|
| 66 |
"""Load embeddings from HF dataset"""
|
| 67 |
try:
|
| 68 |
-
print(f"π Loading embeddings from {
|
| 69 |
-
dataset = load_dataset(
|
| 70 |
|
| 71 |
# Load CLIP embeddings
|
| 72 |
clip_embeddings = dataset["clip_embeddings"]
|
|
@@ -92,6 +97,9 @@ def _load_sentences_metadata() -> Dict[str, Dict[str, Any]]:
|
|
| 92 |
"""
|
| 93 |
Get sentence metadata from global config (loaded from HF datasets).
|
| 94 |
"""
|
|
|
|
|
|
|
|
|
|
| 95 |
return sentences
|
| 96 |
|
| 97 |
@lru_cache(maxsize=1)
|
|
@@ -156,7 +164,7 @@ def _initialize_pipeline():
|
|
| 156 |
try:
|
| 157 |
embeddings_data = load_embeddings_from_hf()
|
| 158 |
if embeddings_data is None:
|
| 159 |
-
raise ValueError(f"Failed to load embeddings from HF dataset: {
|
| 160 |
|
| 161 |
if MODEL_TYPE == "clip":
|
| 162 |
embeddings, sentence_ids = embeddings_data["clip"]
|
|
@@ -489,3 +497,20 @@ def load_embeddings_for_model(model_type: str):
|
|
| 489 |
print(f" - {st_file.name}")
|
| 490 |
print(f" - {ids_file.name}")
|
| 491 |
return None, None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
# on-demand Grad-ECLIP & region-aware ranking
|
| 32 |
from .heatmap import generate_heatmap
|
| 33 |
from .config import (
|
| 34 |
+
JSON_INFO_DIR,
|
| 35 |
+
EMBEDDINGS_DIR,
|
| 36 |
+
JSON_DATASETS,
|
| 37 |
+
EMBEDDINGS_DATASETS,
|
| 38 |
PAINTINGCLIP_MODEL_DIR,
|
| 39 |
+
ARTEFACT_EMBEDDINGS_DATASET,
|
| 40 |
+
sentences, # Add this
|
| 41 |
+
CLIP_EMBEDDINGS_ST, # Add these for backward compatibility
|
| 42 |
+
PAINTINGCLIP_EMBEDDINGS_ST,
|
| 43 |
+
CLIP_SENTENCE_IDS,
|
| 44 |
+
PAINTINGCLIP_SENTENCE_IDS,
|
| 45 |
+
CLIP_EMBEDDINGS_DIR,
|
| 46 |
+
PAINTINGCLIP_EMBEDDINGS_DIR
|
| 47 |
)
|
| 48 |
|
| 49 |
# ─── Configuration ───────────────────────────────────────────────────────────
|
|
|
|
| 70 |
def load_embeddings_from_hf():
|
| 71 |
"""Load embeddings from HF dataset"""
|
| 72 |
try:
|
| 73 |
+
print(f"π Loading embeddings from {ARTEFACT_EMBEDDINGS_DATASET}...")
|
| 74 |
+
dataset = load_dataset(ARTEFACT_EMBEDDINGS_DATASET, split="train")
|
| 75 |
|
| 76 |
# Load CLIP embeddings
|
| 77 |
clip_embeddings = dataset["clip_embeddings"]
|
|
|
|
| 97 |
"""
|
| 98 |
Get sentence metadata from global config (loaded from HF datasets).
|
| 99 |
"""
|
| 100 |
+
if not sentences:
|
| 101 |
+
print("⚠️ No sentence metadata available - check if HF datasets loaded successfully")
|
| 102 |
+
return {}
|
| 103 |
return sentences
|
| 104 |
|
| 105 |
@lru_cache(maxsize=1)
|
|
|
|
| 164 |
try:
|
| 165 |
embeddings_data = load_embeddings_from_hf()
|
| 166 |
if embeddings_data is None:
|
| 167 |
+
raise ValueError(f"Failed to load embeddings from HF dataset: {ARTEFACT_EMBEDDINGS_DATASET}")
|
| 168 |
|
| 169 |
if MODEL_TYPE == "clip":
|
| 170 |
embeddings, sentence_ids = embeddings_data["clip"]
|
|
|
|
| 497 |
print(f" - {st_file.name}")
|
| 498 |
print(f" - {ids_file.name}")
|
| 499 |
return None, None
|
| 500 |
+
|
| 501 |
+
# Add this function for backward compatibility
|
| 502 |
+
def st_load_file(file_path: Path) -> Any:
|
| 503 |
+
"""Load a file using safetensors or other methods"""
|
| 504 |
+
try:
|
| 505 |
+
if file_path.suffix == '.safetensors':
|
| 506 |
+
import safetensors
|
| 507 |
+
return safetensors.safe_open(str(file_path), framework="pt")
|
| 508 |
+
else:
|
| 509 |
+
import torch
|
| 510 |
+
return torch.load(str(file_path))
|
| 511 |
+
except ImportError:
|
| 512 |
+
print(f"⚠️ Required library not available for loading {file_path}")
|
| 513 |
+
return None
|
| 514 |
+
except Exception as e:
|
| 515 |
+
print(f"❌ Error loading {file_path}: {e}")
|
| 516 |
+
return None
|
requirements.txt
CHANGED
|
@@ -6,7 +6,7 @@ flask-cors
|
|
| 6 |
# Hugging Face ecosystem
|
| 7 |
huggingface_hub>=0.20
|
| 8 |
hf_transfer>=0.1.4
|
| 9 |
-
datasets>=2.
|
| 10 |
|
| 11 |
# Core ML libraries
|
| 12 |
torch>=2.0.0
|
|
|
|
| 6 |
# Hugging Face ecosystem
|
| 7 |
huggingface_hub>=0.20
|
| 8 |
hf_transfer>=0.1.4
|
| 9 |
+
datasets>=2.0.0
|
| 10 |
|
| 11 |
# Core ML libraries
|
| 12 |
torch>=2.0.0
|