Andrej Janchevski committed on
Commit
ee3e8fe
·
1 Parent(s): 0f7b533

feat(coins): add optional seed param and label fields to sample-triples

Browse files

- registry.sample_triples now accepts seed= for deterministic sampling
via random.Random(seed); omitted seed preserves prior random behaviour
- Each head/relation/tail gains a dataset-cleaned label alongside id and
name (NELL strips concept:, Freebase strips /m/, WordNet drops POS
suffix) so the frontend can show display-friendly strings without
duplicating the cleanup logic
- OpenAPI, Postman collection, and backend README updated in lockstep
- Enables a day-stable NELL Fact of the Day widget seeded by ISO date

docs/api.yaml CHANGED
@@ -169,6 +169,17 @@ paths:
169
  maximum: 50
170
  default: 10
171
  description: Number of random triples to return
 
 
 
 
 
 
 
 
 
 
 
172
  responses:
173
  "200":
174
  description: Sample triples
@@ -844,6 +855,10 @@ components:
844
  name:
845
  type: string
846
  example: dog's_breakfast.n.01
 
 
 
 
847
 
848
  CoinsRelationsResponse:
849
  type: object
@@ -872,6 +887,10 @@ components:
872
  name:
873
  type: string
874
  example: _hypernym
 
 
 
 
875
 
876
  CoinsSampleTriplesResponse:
877
  type: object
 
169
  maximum: 50
170
  default: 10
171
  description: Number of random triples to return
172
+ - name: seed
173
+ in: query
174
+ required: false
175
+ schema:
176
+ type: string
177
+ description: |
178
+ Optional sampling seed. When provided, sampling is deterministic —
179
+ the same `(dataset_id, count, seed)` always yields the same triples.
180
+ Useful for day-stable "fact of the day" widgets (e.g. seed by the
181
+ ISO date). When omitted, sampling is fully random.
182
+ example: "2026-04-15"
183
  responses:
184
  "200":
185
  description: Sample triples
 
855
  name:
856
  type: string
857
  example: dog's_breakfast.n.01
858
+ label:
859
+ type: string
860
+ description: Dataset-specific short, display-friendly form of `name`. Only present on sample-triples responses.
861
+ example: dog's_breakfast
862
 
863
  CoinsRelationsResponse:
864
  type: object
 
887
  name:
888
  type: string
889
  example: _hypernym
890
+ label:
891
+ type: string
892
+ description: Dataset-specific short, display-friendly form of `name`. Only present on sample-triples responses.
893
+ example: hypernym
894
 
895
  CoinsSampleTriplesResponse:
896
  type: object
docs/postman/collection.json CHANGED
@@ -135,14 +135,15 @@
135
  "method": "GET",
136
  "header": [],
137
  "url": {
138
- "raw": "{{base_url}}/coins/datasets/wordnet/sample-triples?count=5",
139
  "host": ["{{base_url}}"],
140
  "path": ["coins", "datasets", "wordnet", "sample-triples"],
141
  "query": [
142
- { "key": "count", "value": "5" }
 
143
  ]
144
  },
145
- "description": "Random sample triples from the dataset."
146
  }
147
  },
148
  {
 
135
  "method": "GET",
136
  "header": [],
137
  "url": {
138
+ "raw": "{{base_url}}/coins/datasets/wordnet/sample-triples?count=5&seed=2026-04-15",
139
  "host": ["{{base_url}}"],
140
  "path": ["coins", "datasets", "wordnet", "sample-triples"],
141
  "query": [
142
+ { "key": "count", "value": "5" },
143
+ { "key": "seed", "value": "2026-04-15", "description": "Optional. When provided, sampling is deterministic (same seed + count ⇒ same triples). Omit for random." }
144
  ]
145
  },
146
+ "description": "Random sample triples from the dataset. Each triple has head/relation/tail entries with { id, name, label }; `label` is a dataset-specific display-friendly form of `name` (NELL strips `concept:` prefixes, Freebase strips `/m/`, WordNet drops the POS suffix). Pass an optional `seed` (any string) for deterministic sampling — e.g. seed by today's ISO date for a day-stable 'fact of the day' widget."
147
  }
148
  },
149
  {
src/backend/README.md CHANGED
@@ -77,7 +77,7 @@ All endpoints are prefixed with `/api/v1/`.
77
  | `GET` | `/coins/datasets` | List datasets with entity/relation counts |
78
  | `GET` | `/coins/datasets/{id}/entities` | Paginated entity search (`?q=&page=&page_size=`) |
79
  | `GET` | `/coins/datasets/{id}/relations` | Paginated relation search (`?q=&page=&page_size=`) |
80
- | `GET` | `/coins/datasets/{id}/sample-triples` | Random training triples (`?count=10`) |
81
  | `GET` | `/coins/models` | Available algorithms + supported query structures |
82
  | `GET` | `/coins/query-structures` | Query graph templates for frontend rendering |
83
  | `POST` | `/coins/predict` | Run link prediction / query answering |
 
77
  | `GET` | `/coins/datasets` | List datasets with entity/relation counts |
78
  | `GET` | `/coins/datasets/{id}/entities` | Paginated entity search (`?q=&page=&page_size=`) |
79
  | `GET` | `/coins/datasets/{id}/relations` | Paginated relation search (`?q=&page=&page_size=`) |
80
+ | `GET` | `/coins/datasets/{id}/sample-triples` | Random training triples (`?count=10&seed=...`); optional `seed` makes sampling deterministic (same `seed+count` ⇒ same triples, e.g. seed by ISO date for a day-stable widget). Head/relation/tail each carry a dataset-cleaned `label` alongside `id`, `name` |
81
  | `GET` | `/coins/models` | Available algorithms + supported query structures |
82
  | `GET` | `/coins/query-structures` | Query graph templates for frontend rendering |
83
  | `POST` | `/coins/predict` | Run link prediction / query answering |
src/backend/api/services/registry.py CHANGED
@@ -8,6 +8,7 @@ import yaml
8
  from django.conf import settings
9
 
10
  from api.services.constants import COINS_CONFIG_SUFFIX, COINS_DATASET_META
 
11
 
12
  logger = logging.getLogger(__name__)
13
 
@@ -530,8 +531,13 @@ class ModelRegistry:
530
  start = (max(1, page) - 1) * page_size
531
  return items[start:start + page_size], total
532
 
533
- def sample_triples(self, dataset_id, count=10):
534
- """Return random triples with resolved entity/relation names."""
 
 
 
 
 
535
  loader = self.loaders.get(dataset_id)
536
  if loader is None:
537
  return []
@@ -539,16 +545,20 @@ class ModelRegistry:
539
  edge_data = loader.train_edge_data
540
  count = min(count, len(edge_data))
541
 
542
- indices = random.sample(range(len(edge_data)), count)
 
543
 
544
  result = []
545
  for i in indices:
546
  row = edge_data.iloc[i]
547
  h, r, t = int(row.s), int(row.r), int(row.t)
 
 
 
548
  result.append({
549
- "head": {"id": h, "name": str(inv_nodes.get(h, h))},
550
- "relation": {"id": r, "name": str(inv_relations.get(r, r))},
551
- "tail": {"id": t, "name": str(inv_nodes.get(t, t))},
552
  })
553
  return result
554
 
 
8
  from django.conf import settings
9
 
10
  from api.services.constants import COINS_CONFIG_SUFFIX, COINS_DATASET_META
11
+ from api.utils import clean_entity_name, clean_relation_name
12
 
13
  logger = logging.getLogger(__name__)
14
 
 
531
  start = (max(1, page) - 1) * page_size
532
  return items[start:start + page_size], total
533
 
534
+ def sample_triples(self, dataset_id, count=10, seed=None):
535
+ """Return random triples with resolved entity/relation names.
536
+
537
+ When ``seed`` is provided, sampling is deterministic — the same
538
+ ``(dataset_id, count, seed)`` always yields the same triples. When
539
+ ``seed`` is None, uses the global RNG.
540
+ """
541
  loader = self.loaders.get(dataset_id)
542
  if loader is None:
543
  return []
 
545
  edge_data = loader.train_edge_data
546
  count = min(count, len(edge_data))
547
 
548
+ rng = random.Random(seed) if seed is not None else random
549
+ indices = rng.sample(range(len(edge_data)), count)
550
 
551
  result = []
552
  for i in indices:
553
  row = edge_data.iloc[i]
554
  h, r, t = int(row.s), int(row.r), int(row.t)
555
+ h_name = str(inv_nodes.get(h, h))
556
+ r_name = str(inv_relations.get(r, r))
557
+ t_name = str(inv_nodes.get(t, t))
558
  result.append({
559
+ "head": {"id": h, "name": h_name, "label": clean_entity_name(h_name, dataset_id)},
560
+ "relation": {"id": r, "name": r_name, "label": clean_relation_name(r_name, dataset_id)},
561
+ "tail": {"id": t, "name": t_name, "label": clean_entity_name(t_name, dataset_id)},
562
  })
563
  return result
564
 
src/backend/api/views/coins.py CHANGED
@@ -78,9 +78,12 @@ class CoinsSampleTriplesView(APIView):
78
  count = int(request.query_params.get("count", 10))
79
  count = max(1, min(50, count))
80
 
 
 
 
81
  return Response({
82
  "dataset_id": dataset_id,
83
- "triples": registry.sample_triples(dataset_id, count),
84
  })
85
 
86
 
 
78
  count = int(request.query_params.get("count", 10))
79
  count = max(1, min(50, count))
80
 
81
+ seed_raw = request.query_params.get("seed")
82
+ seed = seed_raw if seed_raw not in (None, "") else None
83
+
84
  return Response({
85
  "dataset_id": dataset_id,
86
+ "triples": registry.sample_triples(dataset_id, count, seed=seed),
87
  })
88
 
89