# Spaces: Sleeping
import math
from collections import Counter
from typing import List, Dict, Optional, Callable
class EntropyRanker:
    """Score and filter text samples by Shannon entropy of their tokens.

    Used to remove low-information or repetitive samples from scientific
    corpora. Entropy is measured in bits over the empirical token
    distribution of each individual sample.
    """

    def __init__(
        self,
        entropy_threshold: float = 3.5,
        tokenizer: Optional[Callable[[str], List[str]]] = None,
    ):
        """Configure the ranker.

        Args:
            entropy_threshold: Minimum entropy (in bits) required to keep a
                sample.
            tokenizer: Function to tokenize text. Defaults to whitespace
                split.
        """
        self.entropy_threshold = entropy_threshold
        if tokenizer is None:
            tokenizer = str.split
        self.tokenizer = tokenizer

    @staticmethod
    def shannon_entropy(tokens: List[str]) -> float:
        """Compute the Shannon entropy (base 2) of a list of tokens.

        Declared as a static method so that it can be called both as
        ``EntropyRanker.shannon_entropy(tokens)`` and through an instance
        (``self.shannon_entropy(tokens)``), which is how ``score_sample``
        invokes it.

        Args:
            tokens: Tokens whose empirical distribution is measured.

        Returns:
            Entropy in bits; ``0.0`` for an empty token list.
        """
        if not tokens:
            return 0.0
        total = len(tokens)
        entropy = 0.0
        for count in Counter(tokens).values():
            p = count / total
            entropy -= p * math.log2(p)
        return entropy

    def score_sample(self, text: str) -> float:
        """Tokenize a text sample and return its token entropy in bits."""
        tokens = self.tokenizer(text)
        return self.shannon_entropy(tokens)

    def is_explanatory(self, text: str) -> bool:
        """Return True if the sample's entropy is at least the threshold."""
        return self.score_sample(text) >= self.entropy_threshold

    def filter_samples(self, samples: List[Dict], text_key: str = "text") -> List[Dict]:
        """Keep only the samples whose text meets the entropy threshold.

        Args:
            samples: Dict samples to filter; input order is preserved.
            text_key: Key under which each sample stores its text. A sample
                without this key is scored as the empty string.

        Returns:
            A new list with the samples that pass ``is_explanatory``.
        """
        return [s for s in samples if self.is_explanatory(s.get(text_key, ""))]

    def rank_samples(
        self,
        samples: List[Dict],
        text_key: str = "text",
        top_k: Optional[int] = None,
    ) -> List[Dict]:
        """Rank samples by entropy, descending, dropping those below threshold.

        Args:
            samples: Dict samples to rank.
            text_key: Key under which each sample stores its text. A sample
                without this key is scored as the empty string.
            top_k: If given, return at most this many of the best samples.

        Returns:
            Samples with entropy >= ``entropy_threshold``, highest entropy
            first. Samples with equal scores keep their input order.
        """
        scored = [(self.score_sample(s.get(text_key, "")), s) for s in samples]
        # Sort on the score only: dict samples are not orderable, and a
        # stable sort keeps ties in their original relative order.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        ranked = [
            sample for score, sample in scored if score >= self.entropy_threshold
        ]
        if top_k is not None:
            ranked = ranked[:top_k]
        return ranked