hjulerm committed on
Commit
405d502
Β·
1 Parent(s): 30ae921

Add HDC text-to-pictogram space

Browse files
Files changed (5) hide show
  1. README.md +16 -6
  2. app.py +235 -0
  3. core_pictograms.json +0 -0
  4. hdc_text2picto.py +263 -0
  5. requirements.txt +8 -0
README.md CHANGED
@@ -1,12 +1,22 @@
1
  ---
2
- title: HDC ToPicto Translation
3
- emoji: πŸ‘€
4
- colorFrom: indigo
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 6.9.0
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Speech to ARASAAC Pictograms (HDC)
3
+ emoji: 🧠
4
+ colorFrom: purple
5
+ colorTo: blue
6
  sdk: gradio
 
7
  app_file: app.py
8
  pinned: false
9
+ license: apache-2.0
10
  ---
11
 
12
+ # 🧠 Speech / Text β†’ ARASAAC Pictograms (HDC)
13
+
14
+ Convert spoken or written English into ARASAAC pictograms using **Hyperdimensional Computing**.
15
+
16
+ - **Audio tab**: record via microphone or upload a `.wav` file β†’ transcribed with [Whisper tiny (EN)](https://huggingface.co/openai/whisper-tiny.en) β†’ pictograms
17
+ - **Text tab**: type directly β†’ pictograms
18
+ - Transcription is editable before generating pictograms
19
+ - Pictogram lookup uses an **HDC prototype memory** built from ~855 core vocabulary pictograms
20
+ with WordNet synonym injection β€” no API call per word at inference time
21
+ - Each pictogram card shows a **similarity score badge** (retrieval confidence)
22
+ - Words below the confidence threshold show a `?` placeholder
app.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import json
3
+ import os
4
+ import nltk
5
+ import gradio as gr
6
+ from transformers import pipeline
7
+ from nltk.corpus import wordnet
8
+
9
# Ensure WordNet data is available (no-op if the corpus is already downloaded;
# quiet=True suppresses the progress output in Space logs).
nltk.download("wordnet", quiet=True)
11
+
12
+ # ── HDC imports ───────────────────────────────────────────────────────────────
13
+ from hdc_text2picto import encode_word, PictogramMemory
14
+
15
+ # ── ASR model ─────────────────────────────────────────────────────────────────
16
# Whisper tiny (English-only) ASR pipeline, loaded once at import time.
# NOTE(review): device="cpu" — presumably targeting CPU Space hardware;
# newer transformers accept the string form, confirm against pinned version.
asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny.en",
    device="cpu",
)
21
+
22
+ # ── HDC: build prototype memory from cached core pictograms ───────────────────
23
+ #
24
+ # At startup we load the locally cached ARASAAC core vocabulary JSON and build
25
+ # a PictogramMemory by encoding every keyword (+ WordNet synonyms) for each
26
+ # pictogram. This replaces the per-word ARASAAC API call at inference time:
27
+ # retrieval is entirely local and offline after startup.
28
+
29
# ARASAAC synset IDs end in a WordNet POS suffix (e.g. "01170802-v").
# Map that suffix to the nltk WordNet POS constants ("s" = satellite adjective)…
SYNSET_SUFFIX_TO_WN = {
    "n": wordnet.NOUN, "v": wordnet.VERB,
    "a": wordnet.ADJ, "s": wordnet.ADJ, "r": wordnet.ADV,
}
# …and to the coarse POS labels used by the HDC encoder.
SYNSET_SUFFIX_TO_POS = {"n": "NOUN", "v": "VERB", "a": "ADJ", "s": "ADJ", "r": "ADV"}

CONFIDENCE_THRESHOLD = 0.10  # cosine similarity below this → show ? placeholder
36
+
37
+
38
def pos_from_synsets(synsets: list[str]) -> str:
    """Derive a coarse POS label from a pictogram's first synset suffix."""
    if not synsets:
        return "OTHER"
    suffix = synsets[0].split("-")[-1]
    return SYNSET_SUFFIX_TO_POS.get(suffix, "OTHER")
42
+
43
+
44
def get_synonyms(keyword: str, synsets: list[str]) -> list[str]:
    """Collect lowercase WordNet lemma synonyms of *keyword*, excluding itself.

    When synsets are known, the first synset's POS suffix restricts the
    WordNet lookup so e.g. noun pictograms don't pick up verb lemmas.
    """
    wn_pos = SYNSET_SUFFIX_TO_WN.get(synsets[0].split("-")[-1]) if synsets else None
    found = {
        lemma.name().replace("_", " ").lower()
        for ss in wordnet.synsets(keyword, pos=wn_pos)
        for lemma in ss.lemmas()
    }
    found.discard(keyword.lower())
    return list(found)
53
+
54
+
55
def build_memory(core_pictos: list[dict]) -> PictogramMemory:
    """Encode all core pictogram keywords (+ WordNet synonyms) into prototypes.

    Every distinct keyword of a pictogram, plus its WordNet synonyms, is
    encoded and accumulated into that pictogram's prototype vector.
    """
    memory = PictogramMemory()
    for p in core_pictos:
        pid = p["_id"]
        synsets = p.get("synsets", [])
        keywords = [kw for kw in p.get("keywords", []) if kw.get("keyword")]
        if not keywords:
            continue

        label = keywords[0]["keyword"]

        # Encode using pos="OTHER" and synsets=[] to match inference-time encoding,
        # where POS and synsets are unknown. This ensures training and inference
        # composites are built the same way, so cosine similarity is meaningful.
        # (Fix: the previous `pos = pos_from_synsets(synsets)` local was computed
        # but never used and has been removed.)
        seen = set()
        for kw in keywords:
            word = kw["keyword"]
            if word.lower() in seen:
                continue
            seen.add(word.lower())
            memory.add(pid, encode_word(word, "OTHER", "NONE", []), label)

            # WordNet synonym injection (encoded the same way)
            for syn in get_synonyms(word, synsets):
                if syn not in seen:
                    seen.add(syn)
                    memory.add(pid, encode_word(syn, "OTHER", "NONE", []), label)

    memory.build()
    return memory
86
+
87
+
88
# Build the prototype memory once at import time; every request reuses it,
# so inference never touches the ARASAAC API.
print("Building HDC prototype memory from core vocabulary...")
_cache_path = os.path.join(os.path.dirname(__file__), "core_pictograms.json")
with open(_cache_path) as f:
    _core_pictos = json.load(f)
memory = build_memory(_core_pictos)
print(f"  Ready — {len(memory.protos)} pictogram prototypes loaded.")
94
+
95
+ # ── HDC lookup ────────────────────────────────────────────────────────────────
96
+
97
def hdc_lookup(word: str) -> tuple[int | None, float, str]:
    """
    Encode a word as an HDC composite vector and retrieve the nearest pictogram
    prototype. POS and synsets are unknown at inference time so we use defaults;
    the semantic content from the GloVe embedding carries most of the signal.

    Returns (picto_id, similarity, label) or (None, 0.0, "") if below threshold
    or if the prototype memory is empty.
    """
    query_hv = encode_word(word, pos="OTHER", ner="NONE", synsets=[])
    results = memory.retrieve(query_hv, top_k=1)
    # Fix: guard against an empty memory — results[0] would raise IndexError.
    if not results:
        return None, 0.0, ""
    pid, label, sim = results[0]
    if sim >= CONFIDENCE_THRESHOLD:
        return pid, sim, label
    return None, sim, ""
111
+
112
+ # ── Image URL ─────────────────────────────────────────────────────────────────
113
+
114
def picto_url(picto_id: int, size: int = 500) -> str:
    """Build the static ARASAAC PNG URL for a pictogram at the given pixel size."""
    base = "https://static.arasaac.org/pictograms"
    return f"{base}/{picto_id}/{picto_id}_{size}.png"
116
+
117
+ # ── Tokeniser ─────────────────────────────────────────────────────────────────
118
+
119
_NONWORD_RE = re.compile(r"[^\w'-]")  # strip everything except word chars, ' and -


def tokenize(text: str) -> list[str]:
    """Split *text* on whitespace, strip punctuation, and drop empty tokens.

    Apostrophes and hyphens are kept so contractions ("don't") and compounds
    ("ice-cream") survive as single tokens. Fix: the regex is compiled once
    and applied once per token (the original ran re.sub twice per token).
    """
    return [clean for tok in text.split() if (clean := _NONWORD_RE.sub("", tok))]
121
+
122
+ # ── Render pictograms ─────────────────────────────────────────────────────────
123
+
124
def render_pictos(text: str) -> str:
    """Render an HTML strip of pictogram cards, one per token of *text*.

    Recognised words get their ARASAAC image plus a similarity badge
    (green ≥ 0.15, orange below); words under CONFIDENCE_THRESHOLD get a
    grey ? placeholder.
    """
    if not text or not text.strip():
        return "<p style='color:gray;text-align:center;padding:20px;'>No text to display.</p>"

    cards = []
    for word in tokenize(text):
        picto_id, sim, label = hdc_lookup(word)

        # Fix: explicit None check — a falsy-but-valid id must not be
        # rendered as "no match".
        if picto_id is not None:
            img = (
                f'<img src="{picto_url(picto_id)}" alt="{word}" title="{label} (sim={sim:.2f})" '
                f'style="width:110px;height:110px;object-fit:contain;">'
            )
            # Similarity badge: green if confident, orange if marginal
            badge_color = "#4caf50" if sim >= 0.15 else "#ff9800"
            badge = (
                f'<span style="font-size:0.7rem;background:{badge_color};color:white;'
                f'border-radius:4px;padding:1px 4px;">{sim:.2f}</span>'
            )
            label_style = "font-size:0.85rem;margin-top:4px;word-break:break-word;font-weight:600;"
            label_html = f'<p style="{label_style}">{word}</p>{badge}'
        else:
            img = (
                '<div style="width:110px;height:110px;background:#f0f0f0;border-radius:8px;'
                'display:flex;align-items:center;justify-content:center;font-size:2rem;color:#bbb;">?</div>'
            )
            label_style = "font-size:0.85rem;margin-top:4px;word-break:break-word;color:#aaa;"
            label_html = f'<p style="{label_style}">{word}</p>'

        cards.append(
            f'<div style="display:flex;flex-direction:column;align-items:center;width:130px;'
            f'padding:8px;background:white;border-radius:10px;box-shadow:0 1px 4px rgba(0,0,0,0.1);">'
            f'{img}{label_html}</div>'
        )

    return (
        '<div style="display:flex;flex-wrap:wrap;gap:12px;justify-content:center;'
        'padding:20px;background:#f5f5f5;border-radius:12px;">'
        + "".join(cards) + "</div>"
    )
164
+
165
+ # ── Processing functions ──────────────────────────────────────────────────────
166
+
167
def process_audio(audio_path):
    """Transcribe an audio file with Whisper, then render pictograms for it.

    Returns (transcript, pictogram_html); a placeholder message is shown
    when no audio was provided.
    """
    if audio_path is None:
        return "", "<p style='color:gray;text-align:center;padding:20px;'>No audio provided.</p>"
    transcript = asr(audio_path)["text"].strip()
    return transcript, render_pictos(transcript)
173
+
174
+
175
def process_text(text):
    """Render the pictogram strip for typed or edited text (used by both tabs)."""
    return render_pictos(text)
177
+
178
+ # ── Gradio UI ─────────────────────────────────────────────────────────────────
179
+
180
# Fix: `theme` is a gr.Blocks constructor argument; gr.Blocks(...).launch()
# does not accept it, so the previous `demo.launch(theme=...)` call fails.
with gr.Blocks(
    title="Speech/Text → ARASAAC Pictograms (HDC)",
    theme=gr.themes.Soft(),
) as demo:
    gr.Markdown(
        """
        # 🧠 Speech / Text → ARASAAC Pictograms (HDC)
        Convert spoken or written English into ARASAAC pictograms using
        **Hyperdimensional Computing** for offline, semantic word-to-pictogram retrieval.

        Uses [Whisper tiny](https://huggingface.co/openai/whisper-tiny.en) for speech recognition.
        Pictogram lookup uses HDC prototype memory built from ~855 core vocabulary pictograms
        and WordNet synonym injection — no API call per word at inference time.

        The similarity score badge on each card shows retrieval confidence
        (🟢 ≥ 0.15 · 🟠 < 0.15 · **?** below threshold).
        """
    )

    with gr.Tab("🎤 Audio"):
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="Record or upload audio (.wav)",
        )
        transcribe_btn = gr.Button("Transcribe & Generate Pictograms", variant="primary")
        transcribed_box = gr.Textbox(
            label="Transcribed text (editable — press Enter to regenerate pictograms)",
            lines=2,
            interactive=True,
        )
        audio_picto_output = gr.HTML()

        # Button: ASR → editable transcript + pictogram strip.
        transcribe_btn.click(
            fn=process_audio,
            inputs=audio_input,
            outputs=[transcribed_box, audio_picto_output],
        )
        # Enter in the transcript box regenerates pictograms from edited text.
        transcribed_box.submit(
            fn=process_text,
            inputs=transcribed_box,
            outputs=audio_picto_output,
        )

    with gr.Tab("✍️ Text"):
        text_input = gr.Textbox(
            label="Input text",
            placeholder="e.g. I want to eat an apple",
            lines=2,
        )
        text_btn = gr.Button("Generate Pictograms", variant="primary")
        text_picto_output = gr.HTML()

        text_btn.click(fn=process_text, inputs=text_input, outputs=text_picto_output)
        text_input.submit(fn=process_text, inputs=text_input, outputs=text_picto_output)


if __name__ == "__main__":
    demo.launch()
core_pictograms.json ADDED
The diff for this file is too large to render. See raw diff
 
hdc_text2picto.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Minimal HDC Text-to-Pictogram example.
3
+
4
+ Encoding notation:
5
+ word_hv = project(embedding(word))
6
+ composite = bundle(
7
+ bind(word_hv, hv(NER_class)),
8
+ bind(word_hv, hv(POS_tag)),
9
+ bind(word_hv, hv(WN_synset)),
10
+ word_hv
11
+ )
12
+ """
13
+
14
+ import numpy as np
15
+ import requests
16
+ from sentence_transformers import SentenceTransformer
17
+
18
+ # ---------------------------------------------------------------------------
19
+ # Hyperparameters
20
+ # ---------------------------------------------------------------------------
21
D = 10_000  # hypervector dimension
EMBEDDING_MODEL = "sentence-transformers/average_word_embeddings_glove.6B.300d"  # static word vectors, 300-dim
EMBEDDING_DIM = 300  # must match the embedding model's output size
ARASAAC_API = "https://api.arasaac.org/v1/pictograms/en"
RNG = np.random.default_rng(42)  # fixed seed → reproducible atoms and projection matrix
26
+
27
+ # ---------------------------------------------------------------------------
28
+ # 1. Atom memory β€” fixed random bipolar hypervectors for symbolic features
29
+ #
30
+ # In HDC, atomic concepts (POS tags, NER classes, synset IDs) are each
31
+ # represented by a unique, randomly generated hypervector. These are called
32
+ # "atom vectors". Because D is very large (~10,000), any two randomly drawn
33
+ # vectors are nearly orthogonal with high probability β€” meaning they are
34
+ # maximally dissimilar by default. This quasi-orthogonality is the key
35
+ # property that lets us encode distinct features without interference.
36
+ #
37
+ # Atom vectors are fixed for the lifetime of the model: the same symbol
38
+ # always maps to the same vector, so representations are consistent across
39
+ # all words and queries.
40
+ # ---------------------------------------------------------------------------
41
# Part-of-speech atoms. Binding word_hv with e.g. POS_HVS["VERB"] yields a
# vector meaning "this word used as a verb", distinct from the NOUN binding
# of the same word. (Draw order matters: these consume the seeded RNG first.)
POS_HVS = {
    tag: RNG.choice([-1.0, 1.0], size=D)
    for tag in ("NOUN", "VERB", "ADJ", "ADV", "PROPN", "OTHER")
}

# Named-entity atoms. Lets the model separate e.g. "Jordan"/PERSON from
# "Jordan"/LOC, which may map to different pictograms.
NER_HVS = {
    cls: RNG.choice([-1.0, 1.0], size=D)
    for cls in ("PERSON", "ORG", "LOC", "DATE", "NONE")
}
56
+
57
+ # WordNet synset atoms are created on demand and cached.
58
+ # Synsets provide a language-neutral semantic identifier (e.g. "01170802-v"
59
+ # for the concept of eating). Using synset atoms ties the word representation
60
+ # to an abstract meaning rather than a surface form, enabling cross-lingual
61
+ # matching: a French word and its English equivalent share the same synset
62
+ # atom if they are linked in a multilingual WordNet.
63
# WordNet synset atoms, created lazily and memoised so the same synset ID
# always maps to the same random hypervector for the process lifetime.
_SYNSET_HVS: dict[str, np.ndarray] = {}

def hv(synset_id: str) -> np.ndarray:
    """Return the (cached) atom hypervector for a WordNet synset ID."""
    try:
        return _SYNSET_HVS[synset_id]
    except KeyError:
        # Draw a fresh bipolar atom only on first sight (keeps RNG state
        # identical to the check-then-insert original).
        atom = RNG.choice([-1.0, 1.0], size=D)
        _SYNSET_HVS[synset_id] = atom
        return atom
70
+
71
+ # ---------------------------------------------------------------------------
72
+ # 2. Random projection matrix: embedding_dim β†’ D
73
+ #
74
+ # GloVe produces a 300-dim dense vector for each word. We need to lift
75
+ # this into the HD space (10,000-dim) so that HDC operations can be applied.
76
+ # A random Gaussian matrix W achieves this: by the Johnson-Lindenstrauss
77
+ # lemma, random projections approximately preserve pairwise distances, so
78
+ # words that were semantically similar in embedding space remain similar
79
+ # after projection. Dividing by sqrt(EMBEDDING_DIM) keeps the scale stable.
80
+ # The sign() in project() binarises the result to {-1, +1}, making it a
81
+ # proper bipolar hypervector.
82
+ # ---------------------------------------------------------------------------
83
# Static GloVe word-vector model (300-dim) used as the semantic base.
_model = SentenceTransformer(EMBEDDING_MODEL)
# Fixed Gaussian projection matrix (seeded RNG); dividing by sqrt(dim) keeps
# the projected variance stable before binarisation in project().
W = RNG.standard_normal((D, EMBEDDING_DIM)) / np.sqrt(EMBEDDING_DIM)
85
+
86
+ # ---------------------------------------------------------------------------
87
+ # 3. HDC operations
88
+ # ---------------------------------------------------------------------------
89
def project(embedding: np.ndarray) -> np.ndarray:
    """Lift a dense embedding into HD space via random projection.

    sign(W @ v) maps the continuous embedding to a bipolar vector in
    {-1, +1}^D (an exact zero component maps to 0, which is vanishingly
    rare for continuous inputs) while approximately preserving cosine
    similarity (Johnson-Lindenstrauss).
    """
    lifted = np.matmul(W, embedding)
    return np.sign(lifted)
96
+
97
def bind(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Binding: element-wise (Hadamard) product of two bipolar hypervectors.

    The result is dissimilar to both inputs individually yet encodes their
    *association* — the HDC analogue of a key-value pair, e.g.
    bind(word_hv, POS_HVS["VERB"]) means "this word in its role as a verb".
    For bipolar vectors binding is its own inverse:
    bind(bind(a, b), b) recovers a.
    """
    return np.multiply(a, b)
107
+
108
def bundle(vectors: list[np.ndarray]) -> np.ndarray:
    """Bundling: element-wise majority vote over a list of hypervectors.

    The result stays *similar* to every input simultaneously — the HDC
    analogue of a set ("a and b and c together"). Tied components (zero
    column sums) remain 0 under np.sign.
    """
    totals = np.sum(vectors, axis=0)
    return np.sign(totals)
117
+
118
def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity; the epsilon keeps the division safe for zero norms."""
    denom = np.linalg.norm(a) * np.linalg.norm(b) + 1e-9
    return float(np.dot(a, b) / denom)
120
+
121
+ # ---------------------------------------------------------------------------
122
+ # 4. Word encoder β€” follows the notation exactly
123
+ #
124
+ # Each word is encoded as a composite hypervector that fuses:
125
+ # - its semantic content (via the projected GloVe embedding)
126
+ # - its grammatical role (POS tag atom)
127
+ # - its named-entity class (NER atom)
128
+ # - its abstract meaning (WordNet synset atom(s))
129
+ #
130
+ # The bind operations associate the word's semantic vector with each feature
131
+ # atom. The final bundle merges all of these associations into one vector.
132
+ # Two words with similar embeddings *and* the same POS/NER/synset will
133
+ # produce very similar composites β€” making them likely to activate the same
134
+ # pictogram prototype.
135
+ # ---------------------------------------------------------------------------
136
def encode_word(word: str, pos: str, ner: str, synsets: list[str]) -> np.ndarray:
    """Build the composite hypervector for a word (or multi-word phrase).

    Multi-word expressions (e.g. "swimming costume") are split into tokens,
    each encoded independently and then bundled — sidestepping the OOV
    problem where GloVe has no entry for the full phrase but knows each
    constituent word well.
    """
    parts = word.split()
    if len(parts) > 1:
        return bundle([encode_word(part, pos, ner, synsets) for part in parts])

    # Semantic base: static GloVe vector lifted to HD space.
    # (Swap _model.encode for fasttext.get_word_vector() when cc.en.300.bin
    # is available.)
    word_hv = project(_model.encode(word))

    # Associate the base vector with each symbolic feature atom, then merge:
    # bundle(bind(hv, NER), bind(hv, POS), bind(hv, synset)..., hv)
    ner_atom = NER_HVS.get(ner, NER_HVS["NONE"])
    pos_atom = POS_HVS.get(pos, POS_HVS["OTHER"])
    components = [bind(word_hv, ner_atom), bind(word_hv, pos_atom)]
    components.extend(bind(word_hv, hv(s)) for s in synsets)
    components.append(word_hv)  # base representation keeps raw semantics present
    return bundle(components)
155
+
156
+ # ---------------------------------------------------------------------------
157
+ # 5. Prototype memory
158
+ #
159
+ # A prototype is an aggregated hypervector representing all the words that
160
+ # map to a given pictogram. During training, composite vectors for each
161
+ # training word are summed (accumulated). After all words are processed,
162
+ # a majority vote (sign) finalises each prototype into a bipolar vector.
163
+ #
164
+ # The resulting prototype sits near the "centre" of all its training words
165
+ # in HD space β€” analogous to a centroid classifier in standard ML. At
166
+ # inference, the query composite is compared to every prototype via cosine
167
+ # similarity, and the closest one wins.
168
+ #
169
+ # Key advantage: adding a new pictogram (or a new word sense) requires only
170
+ # bundling one more composite into the relevant accumulator β€” no retraining.
171
+ # ---------------------------------------------------------------------------
172
class PictogramMemory:
    """Centroid-style associative memory mapping hypervectors to pictograms.

    Composites for every training word of a pictogram are accumulated into a
    running sum; build() finalises each sum into a bipolar prototype via the
    sign (majority-vote) operation. Retrieval is a nearest-prototype search
    under cosine similarity. Adding a new pictogram or word sense only
    touches one accumulator — no retraining.
    """

    def __init__(self):
        self._accum: dict[int, np.ndarray] = {}  # running sums before finalisation
        self.protos: dict[int, np.ndarray] = {}  # finalised bipolar prototypes
        self.labels: dict[int, str] = {}         # picto_id -> primary keyword

    def add(self, picto_id: int, composite: np.ndarray, label: str = ""):
        """Accumulate a composite vector into the prototype for picto_id.

        Improvement: the accumulator is sized from the first composite seen
        (np.zeros_like) instead of the module global D, so the memory works
        with any hypervector dimensionality. The first label wins, as before.
        """
        if picto_id not in self._accum:
            self._accum[picto_id] = np.zeros_like(composite, dtype=float)
            self.labels[picto_id] = label
        self._accum[picto_id] += composite

    def build(self):
        """Finalise all prototypes via majority vote (sign of accumulated sum)."""
        self.protos = {pid: np.sign(acc) for pid, acc in self._accum.items()}

    def retrieve(self, query: np.ndarray, top_k: int = 3) -> list[tuple]:
        """Return top-k (picto_id, label, similarity), best similarity first."""
        scores = [
            (pid, self.labels[pid], cosine_sim(query, proto))
            for pid, proto in self.protos.items()
        ]
        scores.sort(key=lambda item: item[2], reverse=True)
        return scores[:top_k]
196
+
197
+ # ---------------------------------------------------------------------------
198
+ # 6. ARASAAC helpers
199
+ # ---------------------------------------------------------------------------
200
def fetch_synsets(picto_id: int) -> tuple[list[str], str]:
    """Fetch WordNet synset IDs and the primary keyword for one pictogram.

    Falls back to the stringified pictogram ID as label when the API
    returns no keywords. Raises requests.HTTPError on a bad status.
    """
    response = requests.get(f"{ARASAAC_API}/{picto_id}", timeout=10)
    response.raise_for_status()
    payload = response.json()
    keywords = payload.get("keywords") or [{}]
    primary = keywords[0].get("keyword", str(picto_id))
    return payload.get("synsets", []), primary
208
+
209
+ # ---------------------------------------------------------------------------
210
+ # 7. Demo
211
+ # ---------------------------------------------------------------------------
212
+ # Training set: (word, POS, NER, picto_id)
213
+ # POS/NER would come from a tagger at runtime; hardcoded here for clarity.
214
# Training set: (word, POS, NER, picto_id).
# POS/NER would come from a tagger at runtime; hardcoded here for clarity.
TRAIN = [
    ("eat", "VERB", "NONE", 6456),
    ("eating", "VERB", "NONE", 6456),
    ("food", "NOUN", "NONE", 6456),
    ("drink", "VERB", "NONE", 2276),
    ("water", "NOUN", "NONE", 2276),
    ("run", "VERB", "NONE", 2719),
    ("running", "VERB", "NONE", 2719),
    ("house", "NOUN", "NONE", 2317),
    ("home", "NOUN", "NONE", 2317),
    ("happy", "ADJ", "NONE", 3245),
    ("sad", "ADJ", "NONE", 2606),
]

# Query set: (word, POS, NER, synsets) — none of these words appear in TRAIN,
# so retrieval relies purely on embedding similarity.
QUERIES = [
    ("consume", "VERB", "NONE", []),
    ("joyful", "ADJ", "NONE", []),
    ("dwelling", "NOUN", "NONE", []),
    ("sprint", "VERB", "NONE", []),
    ("beverage", "NOUN", "NONE", []),
]
236
+
237
if __name__ == "__main__":
    memory = PictogramMemory()
    # Per-pictogram API responses cached so repeated picto_ids hit the
    # network only once.
    synset_cache: dict[int, tuple[list[str], str]] = {}

    # --- Training: encode each word and accumulate into its pictogram prototype ---
    print("=== Training ===")
    for word, pos, ner, picto_id in TRAIN:
        if picto_id not in synset_cache:
            synset_cache[picto_id] = fetch_synsets(picto_id)
        synsets, label = synset_cache[picto_id]

        composite = encode_word(word, pos, ner, synsets)
        memory.add(picto_id, composite, label)
        print(f"  '{word}' ({pos}) + synsets={synsets} → picto {picto_id} [{label}]")

    memory.build()
    print(f"\nBuilt {len(memory.protos)} prototypes.\n")

    # --- Retrieval: encode unseen words and find the nearest prototype ---
    print("=== Retrieval ===")
    for word, pos, ner, synsets in QUERIES:
        query_hv = encode_word(word, pos, ner, synsets)
        results = memory.retrieve(query_hv, top_k=3)
        print(f"  Query: '{word}'")
        for pid, label, sim in results:
            print(f"    → [{pid}] {label:<20s} sim={sim:.4f}")
        print()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio>=6.0.0
2
+ transformers
3
+ torch
4
+ requests
5
+ soundfile
6
+ librosa
7
+ sentence-transformers
8
+ nltk