Spaces:
Running
Running
| import os | |
| import re | |
| import pandas as pd | |
| class ASLDictionary: | |
| """ | |
| Load token->video-filename map from CSV and dynamically resolve paths. | |
| Lookup chain: exact match -> lemmatization -> fingerspelling fallback. | |
| """ | |
| LEMMA_RULES = [ | |
| (r'ING$', ''), # CLEANING -> CLEAN | |
| (r'INGS$', ''), # CLEANINGS -> CLEAN | |
| (r'ED$', ''), # DELAYED -> DELAY | |
| (r'LY$', ''), # QUICKLY -> QUICK | |
| (r'ER$', ''), # FASTER -> FAST | |
| (r'EST$', ''), # FASTEST -> FAST | |
| (r'ION$', ''), # ATTENTION -> ATTEND (approximate) | |
| (r'TION$', ''), # ATTENTION -> ATTEN (then try) | |
| (r'LY$', ''), # SLOWLY -> SLOW | |
| (r'S$', ''), # FRIENDS -> FRIEND | |
| ] | |
| def __init__( | |
| self, | |
| csv_path: str = "content/asl_app_data/asl_video_index_final_with_path_cleaned.csv", | |
| dictionary_dir: str = "content/asl_app_data/dictionary", | |
| fingerspelling_dir: str = "content/asl_app_data/Letters" | |
| ): | |
| self.directory = dictionary_dir | |
| self.fingerspelling_dir = fingerspelling_dir | |
| # 1. Build token -> filename map from CSV | |
| df = pd.read_csv(csv_path) | |
| self.token_to_filename = {} | |
| for _, row in df.iterrows(): | |
| filepath = row['path'] | |
| fname = os.path.basename(filepath) | |
| for col in ('token', 'phrase', 'word'): | |
| val = row.get(col, "") | |
| if pd.notna(val) and str(val).strip(): | |
| key = str(val).upper().strip() | |
| self.token_to_filename[key] = fname | |
| # Sort keys by word-count descending for greedy phrase matching | |
| self.keys = sorted( | |
| self.token_to_filename.keys(), | |
| key=lambda x: len(x.split()), | |
| reverse=True | |
| ) | |
| # 2. RAG-style indexer: scan all subfolders for absolute paths | |
| self.actual_file_paths = {} | |
| if os.path.exists(self.directory): | |
| for root, dirs, files in os.walk(self.directory): | |
| for file in files: | |
| if not file.startswith('.'): | |
| self.actual_file_paths[file] = os.path.join(root, file) | |
| # 3. Fingerspelling index: A-Z letter paths | |
| self.letter_paths = {} | |
| if os.path.exists(self.fingerspelling_dir): | |
| for letter in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ': | |
| path = os.path.join(self.fingerspelling_dir, f"{letter}.mp4") | |
| if os.path.exists(path): | |
| self.letter_paths[letter] = path | |
| print(f"✅ Dictionary loaded: {len(self.token_to_filename)} tokens") | |
| print(f"✅ Fingerspelling loaded: {sorted(self.letter_paths.keys())}") | |
| def _resolve(self, token: str) -> str | None: | |
| """Look up token in CSV map and resolve to absolute file path.""" | |
| fname = self.token_to_filename.get(token) | |
| if fname: | |
| full = self.actual_file_paths.get(fname) | |
| if full and os.path.exists(full): | |
| return full | |
| return None | |
| def _lemmatize(self, token: str) -> str | None: | |
| """Try stripping suffixes to find a base form in vocabulary.""" | |
| for pattern, replacement in self.LEMMA_RULES: | |
| candidate = re.sub(pattern, replacement, token) | |
| if candidate != token and len(candidate) > 2: | |
| if candidate in self.token_to_filename: | |
| return candidate | |
| return None | |
| def _fingerspell(self, token: str) -> list[str]: | |
| """Break token into individual letter video paths.""" | |
| paths = [] | |
| for char in token.upper(): | |
| if char in self.letter_paths: | |
| paths.append(self.letter_paths[char]) | |
| elif char == ' ': | |
| pass | |
| else: | |
| print(f"⚠️ No fingerspelling for: '{char}'") | |
| return paths | |
| def get_paths(self, gloss_tokens: list[str]) -> list[str]: | |
| out_paths = [] | |
| tokens = [t.upper() for t in gloss_tokens] | |
| n = len(tokens) | |
| i = 0 | |
| while i < n: | |
| # 1. Greedy phrase match (longest first) | |
| match = None | |
| match_len = 0 | |
| for key in self.keys: | |
| parts = key.split() | |
| L = len(parts) | |
| if L > 1 and i + L <= n and tokens[i:i + L] == parts: | |
| match, match_len = key, L | |
| break | |
| if match: | |
| full = self._resolve(match) | |
| if full: | |
| out_paths.append(full) | |
| print(f"✅ Phrase match: '{match}'") | |
| else: | |
| print(f"⚠️ Phrase found but file missing: '{match}'") | |
| i += match_len | |
| continue | |
| tok = tokens[i] | |
| # 2. Exact single token match | |
| full = self._resolve(tok) | |
| if full: | |
| out_paths.append(full) | |
| print(f"✅ Exact match: '{tok}'") | |
| i += 1 | |
| continue | |
| # 3. Lemmatization fallback | |
| lemma = self._lemmatize(tok) | |
| if lemma: | |
| full = self._resolve(lemma) | |
| if full: | |
| out_paths.append(full) | |
| print(f"✅ Lemma match: '{tok}' -> '{lemma}'") | |
| i += 1 | |
| continue | |
| # 4. Fingerspelling fallback | |
| print(f"⚠️ No mapping for '{tok}' — fingerspelling...") | |
| spelled = self._fingerspell(tok) | |
| if spelled: | |
| out_paths.extend(spelled) | |
| else: | |
| print(f"⚠️ Could not fingerspell '{tok}'") | |
| i += 1 | |
| return out_paths |