# Signlink / mapping.py
# gaurannggg7's picture
# Update mapping.py
# 25f2802 verified
import os
import re
import pandas as pd
class ASLDictionary:
"""
Load token->video-filename map from CSV and dynamically resolve paths.
Lookup chain: exact match -> lemmatization -> fingerspelling fallback.
"""
LEMMA_RULES = [
(r'ING$', ''), # CLEANING -> CLEAN
(r'INGS$', ''), # CLEANINGS -> CLEAN
(r'ED$', ''), # DELAYED -> DELAY
(r'LY$', ''), # QUICKLY -> QUICK
(r'ER$', ''), # FASTER -> FAST
(r'EST$', ''), # FASTEST -> FAST
(r'ION$', ''), # ATTENTION -> ATTEND (approximate)
(r'TION$', ''), # ATTENTION -> ATTEN (then try)
(r'LY$', ''), # SLOWLY -> SLOW
(r'S$', ''), # FRIENDS -> FRIEND
]
def __init__(
self,
csv_path: str = "content/asl_app_data/asl_video_index_final_with_path_cleaned.csv",
dictionary_dir: str = "content/asl_app_data/dictionary",
fingerspelling_dir: str = "content/asl_app_data/Letters"
):
self.directory = dictionary_dir
self.fingerspelling_dir = fingerspelling_dir
# 1. Build token -> filename map from CSV
df = pd.read_csv(csv_path)
self.token_to_filename = {}
for _, row in df.iterrows():
filepath = row['path']
fname = os.path.basename(filepath)
for col in ('token', 'phrase', 'word'):
val = row.get(col, "")
if pd.notna(val) and str(val).strip():
key = str(val).upper().strip()
self.token_to_filename[key] = fname
# Sort keys by word-count descending for greedy phrase matching
self.keys = sorted(
self.token_to_filename.keys(),
key=lambda x: len(x.split()),
reverse=True
)
# 2. RAG-style indexer: scan all subfolders for absolute paths
self.actual_file_paths = {}
if os.path.exists(self.directory):
for root, dirs, files in os.walk(self.directory):
for file in files:
if not file.startswith('.'):
self.actual_file_paths[file] = os.path.join(root, file)
# 3. Fingerspelling index: A-Z letter paths
self.letter_paths = {}
if os.path.exists(self.fingerspelling_dir):
for letter in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
path = os.path.join(self.fingerspelling_dir, f"{letter}.mp4")
if os.path.exists(path):
self.letter_paths[letter] = path
print(f"✅ Dictionary loaded: {len(self.token_to_filename)} tokens")
print(f"✅ Fingerspelling loaded: {sorted(self.letter_paths.keys())}")
def _resolve(self, token: str) -> str | None:
"""Look up token in CSV map and resolve to absolute file path."""
fname = self.token_to_filename.get(token)
if fname:
full = self.actual_file_paths.get(fname)
if full and os.path.exists(full):
return full
return None
def _lemmatize(self, token: str) -> str | None:
"""Try stripping suffixes to find a base form in vocabulary."""
for pattern, replacement in self.LEMMA_RULES:
candidate = re.sub(pattern, replacement, token)
if candidate != token and len(candidate) > 2:
if candidate in self.token_to_filename:
return candidate
return None
def _fingerspell(self, token: str) -> list[str]:
"""Break token into individual letter video paths."""
paths = []
for char in token.upper():
if char in self.letter_paths:
paths.append(self.letter_paths[char])
elif char == ' ':
pass
else:
print(f"⚠️ No fingerspelling for: '{char}'")
return paths
def get_paths(self, gloss_tokens: list[str]) -> list[str]:
out_paths = []
tokens = [t.upper() for t in gloss_tokens]
n = len(tokens)
i = 0
while i < n:
# 1. Greedy phrase match (longest first)
match = None
match_len = 0
for key in self.keys:
parts = key.split()
L = len(parts)
if L > 1 and i + L <= n and tokens[i:i + L] == parts:
match, match_len = key, L
break
if match:
full = self._resolve(match)
if full:
out_paths.append(full)
print(f"✅ Phrase match: '{match}'")
else:
print(f"⚠️ Phrase found but file missing: '{match}'")
i += match_len
continue
tok = tokens[i]
# 2. Exact single token match
full = self._resolve(tok)
if full:
out_paths.append(full)
print(f"✅ Exact match: '{tok}'")
i += 1
continue
# 3. Lemmatization fallback
lemma = self._lemmatize(tok)
if lemma:
full = self._resolve(lemma)
if full:
out_paths.append(full)
print(f"✅ Lemma match: '{tok}' -> '{lemma}'")
i += 1
continue
# 4. Fingerspelling fallback
print(f"⚠️ No mapping for '{tok}' — fingerspelling...")
spelled = self._fingerspell(tok)
if spelled:
out_paths.extend(spelled)
else:
print(f"⚠️ Could not fingerspell '{tok}'")
i += 1
return out_paths