| """ |
| Core cryptanalysis utilities for Cipher Detective AI. |
| |
| These functions are intentionally dependency-light so they can be tested without |
| loading Gradio, Transformers, or a hosted model. The project combines: |
| |
| 1. transparent, human-readable classical cryptanalysis signals; and |
| 2. an optional Transformer classifier for Hugging Face-native deployment. |
| |
| Educational use only. This does not break modern encryption. |
| """ |
| from __future__ import annotations |
|
|
| import math |
| import re |
| from collections import Counter |
| from dataclasses import dataclass |
|
|
| ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" |
| ENGLISH_IOC = 0.0667 |
| RANDOM_IOC = 1.0 / 26.0 |
| ENGLISH_FREQ = { |
| "A": 8.167, "B": 1.492, "C": 2.782, "D": 4.253, "E": 12.702, "F": 2.228, |
| "G": 2.015, "H": 6.094, "I": 6.966, "J": 0.153, "K": 0.772, "L": 4.025, |
| "M": 2.406, "N": 6.749, "O": 7.507, "P": 1.929, "Q": 0.095, "R": 5.987, |
| "S": 6.327, "T": 9.056, "U": 2.758, "V": 0.978, "W": 2.360, "X": 0.150, |
| "Y": 1.974, "Z": 0.074, |
| } |
| COMMON_WORDS = [ |
| |
| "THAT", "HAVE", "WITH", "FROM", "WILL", "THEY", "BEEN", "WHEN", |
| "WERE", "INTO", "THAN", "THEM", "MORE", "WHAT", "YOUR", "EACH", "SOME", |
| "OVER", "JUST", "ALSO", "BACK", "VERY", "TIME", "COME", "MAKE", "TAKE", |
| "ONLY", "EVEN", "KNOW", "MANY", "MOST", "SUCH", "MUST", "LONG", "UPON", |
| "UNTO", "SEND", "GIVE", "FIND", "WELL", "PART", "HAND", "HERE", "SAID", |
| "LAST", "NEXT", "BOTH", "LIFE", "DEATH", "LOVE", "KING", "LORD", "ARMY", |
| "ENEMY", "SECRET", "CIPHER", "CIPHERS", "MESSAGE", "ATTACK", "LETTER", |
| "LETTERS", "BEFORE", "SHOULD", "FORCE", "ORDER", "PAIRS", "THREE", |
| "HUMAN", "STRONG", "SECURE", "PATTERN", "KEYWORD", "MACHINE", "WITHOUT", |
| "EVIDENCE", "ALPHABET", "ENCODING", "FREQUENCY", "KNOWLEDGE", "ENCRYPTION", |
| "TRANSPOSITION", "SUBSTITUTION", "COINCIDENCE", "POLYALPHABETIC", |
| "CIPHERTEXT", "PLAINTEXT", "POLYBIUS", "VIGENERE", "ENIGMA", "COLUMNAR", |
| "INDEX", "PUBLIC", "SIGNAL", "SQUARE", "ITSELF", "CAREFUL", "BRUTE", |
| "SHOW", "USES", "USED", "WORKS", "CLAIM", "NINETEEN", |
| |
| "THE", "AND", "FOR", "NOT", "YOU", "THIS", "BUT", "ARE", "CAN", "HIS", |
| "HER", "OUR", "OUT", "WHO", "ITS", "WAS", "HAS", "NOW", "HIM", "MAY", |
| "SAY", "ALL", "ONE", "GET", "GOD", "MAN", "WAR", "END", "DAY", "KEY", |
| "USE", "TWO", |
| ] |
| |
| |
| |
| _SHORT_WORDS = frozenset(w for w in COMMON_WORDS if len(w) <= 3) |
| BIGRAMS = ["TH", "HE", "IN", "ER", "AN", "RE", "ON", "AT", "EN", "ND", "TI", "ES", "OR", "TE"] |
| TRIGRAMS = ["THE", "AND", "ING", "ION", "ENT", "HER", "FOR", "THA", "NTH", "INT"] |
|
|
| @dataclass |
| class ModelPrediction: |
| label: str |
| confidence: float |
| scores: dict[str, float] |
| source: str |
|
|
| @dataclass |
| class Evidence: |
| letters: int |
| unique_letters: int |
| index_of_coincidence: float |
| entropy: float |
| chi_squared: float |
| top_letters: list[tuple[str, int, float]] |
| top_bigrams: list[tuple[str, int]] |
| top_trigrams: list[tuple[str, int]] |
| caesar_candidates: list[tuple[int, float, int, str]] |
| atbash_plaintext: str |
| affine_candidates: list[tuple[int, int, float, int, str]] |
| friedman_key_length: float |
| kasiski_key_lengths: list[tuple[int, int]] |
| transposition_signal: float |
| bigram_support: float |
| notes: list[str] |
|
|
|
|
| def clean_letters(text: str) -> str: |
| return re.sub(r"[^A-Z]", "", text.upper()) |
|
|
|
|
| def caesar_encrypt(text: str, shift: int) -> str: |
| out = [] |
| for ch in text.upper(): |
| if ch in ALPHABET: |
| out.append(ALPHABET[(ALPHABET.index(ch) + shift) % 26]) |
| else: |
| out.append(ch) |
| return "".join(out) |
|
|
|
|
| def caesar_shift(text: str, shift: int) -> str: |
| """Decode by shifting letters backward.""" |
| out = [] |
| for ch in text.upper(): |
| if ch in ALPHABET: |
| out.append(ALPHABET[(ALPHABET.index(ch) - shift) % 26]) |
| else: |
| out.append(ch) |
| return "".join(out) |
|
|
|
|
| def atbash(text: str) -> str: |
| return text.upper().translate(str.maketrans(ALPHABET, ALPHABET[::-1])) |
|
|
|
|
| def affine_encrypt(text: str, a: int, b: int) -> str: |
| out = [] |
| for ch in text.upper(): |
| if ch in ALPHABET: |
| x = ALPHABET.index(ch) |
| out.append(ALPHABET[(a * x + b) % 26]) |
| else: |
| out.append(ch) |
| return "".join(out) |
|
|
|
|
| def _mod_inverse(a: int, m: int = 26) -> int | None: |
| for x in range(1, m): |
| if (a * x) % m == 1: |
| return x |
| return None |
|
|
|
|
| def affine_decrypt(text: str, a: int, b: int) -> str: |
| inv = _mod_inverse(a) |
| if inv is None: |
| raise ValueError("a must be coprime with 26") |
| out = [] |
| for ch in text.upper(): |
| if ch in ALPHABET: |
| y = ALPHABET.index(ch) |
| out.append(ALPHABET[(inv * (y - b)) % 26]) |
| else: |
| out.append(ch) |
| return "".join(out) |
|
|
|
|
| def vigenere_encrypt(text: str, key: str) -> str: |
| key = clean_letters(key) |
| if not key: |
| raise ValueError("key must contain at least one A-Z letter") |
| out, j = [], 0 |
| for ch in text.upper(): |
| if ch in ALPHABET: |
| k = ALPHABET.index(key[j % len(key)]) |
| out.append(ALPHABET[(ALPHABET.index(ch) + k) % 26]) |
| j += 1 |
| else: |
| out.append(ch) |
| return "".join(out) |
|
|
|
|
| def vigenere_decrypt(text: str, key: str) -> str: |
| """Inverse of :func:`vigenere_encrypt`.""" |
| key = clean_letters(key) |
| if not key: |
| raise ValueError("key must contain at least one A-Z letter") |
| out, j = [], 0 |
| for ch in text.upper(): |
| if ch in ALPHABET: |
| k = ALPHABET.index(key[j % len(key)]) |
| out.append(ALPHABET[(ALPHABET.index(ch) - k) % 26]) |
| j += 1 |
| else: |
| out.append(ch) |
| return "".join(out) |
|
|
|
|
| def beaufort_decrypt(text: str, key: str) -> str: |
| """Beaufort cipher decryption. Beaufort is reciprocal: c = (k - p) mod 26, |
| so decryption uses the same operation: p = (k - c) mod 26.""" |
| key = clean_letters(key) |
| if not key: |
| raise ValueError("key must contain at least one A-Z letter") |
| out, j = [], 0 |
| for ch in text.upper(): |
| if ch in ALPHABET: |
| k = ALPHABET.index(key[j % len(key)]) |
| c = ALPHABET.index(ch) |
| out.append(ALPHABET[(k - c) % 26]) |
| j += 1 |
| else: |
| out.append(ch) |
| return "".join(out) |
|
|
|
|
| def substitution_encrypt(text: str, mapping: str) -> str: |
| """Monoalphabetic substitution. ``mapping`` is a 26-letter permutation.""" |
| mapping = mapping.upper() |
| if len(mapping) != 26 or set(mapping) != set(ALPHABET): |
| raise ValueError("mapping must be a permutation of A-Z") |
| return text.upper().translate(str.maketrans(ALPHABET, mapping)) |
|
|
|
|
| def rail_fence_encrypt(text: str, rails: int = 3) -> str: |
| chars = clean_letters(text) |
| if rails < 2: |
| return chars |
| fence = ["" for _ in range(rails)] |
| rail, direction = 0, 1 |
| for ch in chars: |
| fence[rail] += ch |
| if rail == 0: |
| direction = 1 |
| elif rail == rails - 1: |
| direction = -1 |
| rail += direction |
| return "".join(fence) |
|
|
|
|
| def columnar_transposition_encrypt(text: str, key: str) -> str: |
| chars = clean_letters(text) |
| key_letters = clean_letters(key) |
| if not key_letters: |
| return chars |
| cols = len(key_letters) |
| padding = (-len(chars)) % cols |
| chars += "X" * padding |
| rows = [chars[i:i+cols] for i in range(0, len(chars), cols)] |
| order = sorted(range(cols), key=lambda i: (key_letters[i], i)) |
| return "".join("".join(row[i] for row in rows) for i in order) |
|
|
|
|
| def chi_squared_for_english(letters: str) -> float: |
| if not letters: |
| return float("inf") |
| counts = Counter(letters) |
| n = len(letters) |
| chi = 0.0 |
| for ch in ALPHABET: |
| observed = counts.get(ch, 0) |
| expected = ENGLISH_FREQ[ch] * n / 100.0 |
| if expected: |
| chi += ((observed - expected) ** 2) / expected |
| return chi |
|
|
|
|
| def index_of_coincidence(letters: str) -> float: |
| n = len(letters) |
| if n < 2: |
| return 0.0 |
| counts = Counter(letters) |
| return sum(v * (v - 1) for v in counts.values()) / (n * (n - 1)) |
|
|
|
|
| def shannon_entropy(letters: str) -> float: |
| if not letters: |
| return 0.0 |
| n = len(letters) |
| return -sum((c/n) * math.log2(c/n) for c in Counter(letters).values()) |
|
|
|
|
| def word_score(text: str) -> int: |
| joined = re.sub(r"[^A-Z ]", " ", text.upper()) |
| tokens = set(joined.split()) |
| |
| score = sum(joined.count(w) for w in COMMON_WORDS if w not in _SHORT_WORDS) |
| score += sum(1 for w in _SHORT_WORDS if w in tokens) |
| return score |
|
|
|
|
| def ngram_counts(letters: str, n: int, top: int = 8) -> list[tuple[str, int]]: |
| if len(letters) < n: |
| return [] |
| c = Counter(letters[i:i+n] for i in range(len(letters) - n + 1)) |
| return c.most_common(top) |
|
|
|
|
| def best_caesar_candidates(text: str, top_n: int = 5) -> list[tuple[int, float, int, str]]: |
| candidates = [] |
| for shift in range(26): |
| decoded = caesar_shift(text, shift) |
| letters = clean_letters(decoded) |
| chi = chi_squared_for_english(letters) |
| score = word_score(decoded) |
| |
| candidates.append((shift, chi, score, decoded)) |
| candidates.sort(key=lambda x: (-x[2], x[1])) |
| return candidates[:top_n] |
|
|
|
|
| def best_affine_candidates(text: str, top_n: int = 5) -> list[tuple[int, int, float, int, str]]: |
| """Brute-force the 312 valid Affine keys and return the most English-looking decryptions.""" |
| candidates: list[tuple[int, int, float, int, str]] = [] |
| for a in (1, 3, 5, 7, 9, 11, 15, 17, 19, 21, 23, 25): |
| for b in range(26): |
| try: |
| decoded = affine_decrypt(text, a, b) |
| except ValueError: |
| continue |
| letters = clean_letters(decoded) |
| chi = chi_squared_for_english(letters) |
| score = word_score(decoded) |
| candidates.append((a, b, chi, score, decoded)) |
| candidates.sort(key=lambda x: (-x[3], x[2])) |
| return candidates[:top_n] |
|
|
|
|
| def vigenere_auto_solve( |
| ciphertext: str, |
| max_key_len: int = 15, |
| top_n: int = 5, |
| ) -> list[tuple[str, str, float, int]]: |
| """Auto-solve a Vigenère cipher using Kasiski / Friedman key-length estimation |
| followed by per-column Caesar brute-force. |
| |
| Returns up to ``top_n`` candidates as ``(key, plaintext, chi_sq, word_count)`` |
| sorted best-first (most English words, then lowest chi-squared). |
| """ |
| letters = clean_letters(ciphertext) |
| if len(letters) < 20: |
| return [] |
|
|
| |
| kasiski = kasiski_key_lengths(letters, top=5) |
| fried = friedman_key_length(letters) |
|
|
| key_len_candidates: set[int] = set() |
| for k, _ in kasiski[:4]: |
| if 2 <= k <= max_key_len: |
| key_len_candidates.add(k) |
| if fried: |
| for f in {int(fried), round(int(fried + 0.5))}: |
| if 2 <= f <= max_key_len: |
| key_len_candidates.add(f) |
| |
| for k in range(2, min(9, max_key_len + 1)): |
| key_len_candidates.add(k) |
|
|
| results: list[tuple[str, str, float, int]] = [] |
| seen_keys: set[str] = set() |
|
|
| for key_len in sorted(key_len_candidates): |
| |
| streams = [letters[i::key_len] for i in range(key_len)] |
| key_letters = [] |
| for stream in streams: |
| |
| best_chi_col = float("inf") |
| best_s = 0 |
| for s in range(26): |
| decoded_col = "".join( |
| ALPHABET[(ALPHABET.index(c) - s) % 26] for c in stream |
| ) |
| chi_col = chi_squared_for_english(decoded_col) |
| if chi_col < best_chi_col: |
| best_chi_col = chi_col |
| best_s = s |
| key_letters.append(ALPHABET[best_s]) |
| key = "".join(key_letters) |
| if key in seen_keys: |
| continue |
| seen_keys.add(key) |
| plaintext = vigenere_decrypt(ciphertext, key) |
| plain_letters = clean_letters(plaintext) |
| chi = chi_squared_for_english(plain_letters) |
| ws = word_score(plaintext) |
| results.append((key, plaintext, chi, ws)) |
|
|
| results.sort(key=lambda x: (-x[3], x[2])) |
| return results[:top_n] |
|
|
|
|
| def playfair_double_score(letters: str) -> float: |
| """Return fraction of consecutive letter-pairs that are identical. |
| |
| Playfair forbids encoding a pair with both letters the same (they would |
| be in the same row/column), so a genuine Playfair ciphertext has very |
| few or no identical consecutive-pair doubles. English plaintext has |
| ~3.9% (1/26), Playfair should be ≈ 0%. |
| """ |
| n = len(letters) |
| if n < 4: |
| return 0.0 |
| pairs = [letters[i:i + 2] for i in range(0, n - 1, 2)] |
| doubles = sum(1 for p in pairs if len(p) == 2 and p[0] == p[1]) |
| return doubles / max(len(pairs), 1) |
|
|
|
|
| def friedman_key_length(letters: str) -> float: |
| """Friedman estimate of Vigenère key length from the index of coincidence. |
| |
| Returns ``0.0`` if the sample is too short or the IoC is too close to random. |
| """ |
| n = len(letters) |
| if n < 40: |
| return 0.0 |
| ioc = index_of_coincidence(letters) |
| denom = (n - 1) * ioc - RANDOM_IOC * n + ENGLISH_IOC |
| if denom <= 1e-6: |
| return 0.0 |
| k = (ENGLISH_IOC - RANDOM_IOC) * n / denom |
| return max(0.0, k) |
|
|
|
|
| def kasiski_key_lengths(letters: str, min_len: int = 3, top: int = 5) -> list[tuple[int, int]]: |
| """Kasiski examination: factor distances between repeated trigrams. |
| |
| Returns ``[(candidate_length, support_count), ...]`` sorted by support. |
| """ |
| if len(letters) < 30: |
| return [] |
| positions: dict[str, list[int]] = {} |
| for i in range(len(letters) - min_len + 1): |
| positions.setdefault(letters[i:i + min_len], []).append(i) |
| factor_support: Counter = Counter() |
| for _gram, idxs in positions.items(): |
| if len(idxs) < 2: |
| continue |
| for a, b in zip(idxs, idxs[1:], strict=False): |
| d = b - a |
| for k in range(2, min(21, d + 1)): |
| if d % k == 0: |
| factor_support[k] += 1 |
| return factor_support.most_common(top) |
|
|
|
|
| def transposition_signal(letters: str) -> tuple[float, float]: |
| """Heuristic signal for transposition ciphers. |
| |
| Transposition keeps English letter frequencies intact (so chi-squared looks |
| English-like) but destroys common bigrams. Returns ``(transposition, bigram_support)`` |
| in ``[0, 1]``. High ``transposition`` and low ``bigram_support`` together |
| suggest rail-fence / columnar. |
| """ |
| n = len(letters) |
| if n < 30: |
| return 0.0, 0.0 |
| chi = chi_squared_for_english(letters) |
| |
| english_likeness = max(0.0, 1.0 - min(chi, 200.0) / 200.0) |
| bigrams = ngram_counts(letters, 2, top=200) |
| total_bigrams = max(1, n - 1) |
| common_hits = sum( |
| c for bg, c in bigrams |
| if bg in {"TH", "HE", "IN", "ER", "AN", "RE", "ON", "AT", "EN", "ND"} |
| ) |
| |
| |
| |
| bigram_support = min(1.0, common_hits / (total_bigrams * 0.12)) |
| transposition = english_likeness * (1.0 - bigram_support) |
| return round(transposition, 4), round(bigram_support, 4) |
|
|
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| _BIGRAM_LOG_PROB: dict[str, float] = { |
| "TH": -2.31, "HE": -2.43, "IN": -2.78, "ER": -2.90, "AN": -3.00, "RE": -3.05, |
| "ON": -3.18, "AT": -3.25, "EN": -3.27, "ND": -3.30, "TI": -3.40, "ES": -3.45, |
| "OR": -3.48, "TE": -3.55, "OF": -3.60, "ED": -3.65, "IS": -3.70, "IT": -3.72, |
| "AL": -3.75, "AR": -3.78, "ST": -3.80, "TO": -3.82, "NT": -3.85, "NG": -3.90, |
| "SE": -3.95, "HA": -3.98, "AS": -4.00, "OU": -4.05, "IO": -4.08, "LE": -4.10, |
| "VE": -4.15, "CO": -4.18, "ME": -4.20, "DE": -4.25, "HI": -4.28, "RI": -4.30, |
| "RO": -4.32, "IC": -4.35, "NE": -4.38, "EA": -4.40, "RA": -4.42, "CE": -4.45, |
| "LI": -4.48, "CH": -4.50, "LL": -4.55, "BE": -4.58, "MA": -4.60, "SI": -4.62, |
| "OM": -4.65, "UR": -4.68, |
| |
| "WH": -4.70, "WI": -4.72, "WA": -4.73, "TR": -4.74, "OW": -4.75, "OL": -4.76, |
| "LO": -4.77, "LA": -4.78, "GH": -4.79, "EL": -4.80, "EE": -4.82, "CA": -4.83, |
| "AC": -4.84, "AB": -4.86, "PR": -4.87, "PL": -4.88, "PE": -4.89, "PA": -4.90, |
| "NO": -4.91, "LY": -4.92, "IT": -3.72, "US": -4.94, "UN": -4.95, "TU": -4.96, |
| "SO": -4.97, "SH": -4.98, "SA": -4.99, "RU": -5.00, "RD": -5.01, "OT": -5.02, |
| "OO": -5.03, "OI": -5.05, "OA": -5.06, "NS": -5.07, "NA": -5.08, "MO": -5.09, |
| "MI": -5.10, "LT": -5.11, "IL": -5.12, "IF": -5.13, "GE": -5.14, "FR": -5.15, |
| "FI": -5.16, "EW": -5.17, "ET": -5.18, "EI": -5.19, "EF": -5.20, "EG": -5.21, |
| "EC": -5.22, "DI": -5.23, "CR": -5.24, "CT": -5.25, "CL": -5.26, "BY": -5.27, |
| "BU": -5.28, "BO": -5.29, "AY": -5.30, "AU": -5.31, "AM": -5.32, "AG": -5.33, |
| } |
|
|
| |
| |
| _TRIGRAM_LOG_PROB: dict[str, float] = { |
| "THE": -2.76, "AND": -3.10, "ING": -3.34, "ION": -3.37, "ENT": -3.42, |
| "TIO": -3.44, "FOR": -3.45, "HER": -3.48, "TER": -3.50, "THA": -3.52, |
| "HIS": -3.55, "ITH": -3.56, "ALL": -3.58, "TOR": -3.59, "INT": -3.60, |
| "ERS": -3.61, "NCE": -3.62, "MEN": -3.63, "ATE": -3.64, "ORT": -3.65, |
| "INE": -3.66, "STR": -3.67, "VER": -3.68, "OTH": -3.69, "ATI": -3.70, |
| "ERE": -3.71, "EDT": -3.72, "TED": -3.73, "ESS": -3.74, "HAT": -3.75, |
| "WIT": -3.76, "ARE": -3.77, "NOT": -3.78, "NDE": -3.79, "COM": -3.80, |
| "ONS": -3.81, "OUN": -3.82, "WAS": -3.83, "PRO": -3.84, "OVE": -3.85, |
| "OUR": -3.86, "HAV": -3.87, "AVE": -3.88, "ONE": -3.89, "HAN": -3.90, |
| "IVE": -3.91, "GET": -3.92, "AIN": -3.93, "EAR": -3.94, "IST": -3.95, |
| "EME": -3.96, "LLY": -3.97, "ALLY": -3.98, "RES": -3.99, "STA": -4.00, |
| "EAL": -4.01, "OFT": -4.02, "NTO": -4.03, "ACE": -4.04, "MAK": -4.05, |
| } |
|
|
|
|
| def english_trigram_score(letters: str) -> float: |
| """Mean log10-probability per trigram. Higher (less negative) = more English.""" |
| if len(letters) < 3: |
| return -8.0 |
| total = 0.0 |
| trips = len(letters) - 2 |
| for i in range(trips): |
| tg = letters[i:i + 3] |
| seen = _TRIGRAM_LOG_PROB.get(tg) |
| if seen is not None: |
| total += seen |
| else: |
| total += ( |
| _UNIGRAM_LOG_PROB.get(tg[0], -3.0) |
| + _UNIGRAM_LOG_PROB.get(tg[1], -3.0) |
| + _UNIGRAM_LOG_PROB.get(tg[2], -3.0) |
| - 1.5 |
| ) |
| return total / trips |
|
|
|
|
| |
| _UNIGRAM_LOG_PROB: dict[str, float] = { |
| ch: math.log10(max(freq, 0.01) / 100.0) for ch, freq in ENGLISH_FREQ.items() |
| } |
| _BIGRAM_FLOOR = -8.0 |
| _BIGRAM_BACKOFF_PENALTY = 1.0 |
|
|
|
|
| def english_bigram_score(letters: str) -> float: |
| """Mean log-probability per bigram. Higher (less negative) = more English-like. |
| |
| Uses a small bigram table for common pairs; falls back to (log p(a) + log p(b)) |
| minus a small penalty for unseen bigrams. The backoff keeps the gradient |
| smooth so hill-climbing doesn't plateau on a floor. |
| """ |
| if len(letters) < 2: |
| return _BIGRAM_FLOOR |
| total = 0.0 |
| pairs = len(letters) - 1 |
| for i in range(pairs): |
| bg = letters[i:i + 2] |
| seen = _BIGRAM_LOG_PROB.get(bg) |
| if seen is not None: |
| total += seen |
| else: |
| total += ( |
| _UNIGRAM_LOG_PROB.get(bg[0], -3.0) |
| + _UNIGRAM_LOG_PROB.get(bg[1], -3.0) |
| - _BIGRAM_BACKOFF_PENALTY |
| ) |
| return total / pairs |
|
|
|
|
| def _apply_key(letters: str, key: str) -> str: |
| """Apply a 26-letter substitution key (cipher A..Z -> plaintext).""" |
| return letters.translate(str.maketrans(ALPHABET, key)) |
|
|
|
|
| def hill_climb_substitution( |
| text: str, |
| iterations: int = 4000, |
| restarts: int = 3, |
| seed: int | None = 42, |
| ) -> tuple[str, str, float]: |
| """Solve monoalphabetic substitution by hill climbing on trigram + bigram log-prob. |
| |
| Returns ``(plaintext_guess, key, score)``. ``key`` is a 26-letter string |
| where ``key[i]`` is the plaintext letter for cipher letter ``ALPHABET[i]``. |
| |
| Educational only — works on a few hundred letters of English, fails on |
| short or non-English samples. That failure mode is part of the lesson. |
| """ |
| import random as _random |
| rng = _random.Random(seed) |
| letters = clean_letters(text) |
| if len(letters) < 30: |
| return text, ALPHABET, _BIGRAM_FLOOR |
|
|
| |
| |
| english_order = "ETAOINSHRDLCUMWFGYPBVKJXQZ" |
| observed = [ch for ch, _ in Counter(letters).most_common()] |
| observed += [c for c in ALPHABET if c not in observed] |
| seed_key = list(ALPHABET) |
| for cipher_letter, plain_letter in zip(observed, english_order, strict=False): |
| seed_key[ALPHABET.index(cipher_letter)] = plain_letter |
|
|
| def _score(k: str) -> float: |
| dec = _apply_key(letters, k) |
| |
| |
| bg = english_bigram_score(dec) |
| if len(dec) >= 30: |
| tg = english_trigram_score(dec) |
| return 0.4 * bg + 0.6 * tg |
| return bg |
|
|
| best_key = "".join(seed_key) |
| best_score = _score(best_key) |
|
|
| for restart in range(max(1, restarts)): |
| current = list(seed_key) if restart == 0 else list(best_key) |
| if restart > 0: |
| |
| for _ in range(2 + restart): |
| a, b = rng.sample(range(26), 2) |
| current[a], current[b] = current[b], current[a] |
| current_score = _score("".join(current)) |
| no_improve = 0 |
| for _ in range(iterations): |
| a, b = rng.sample(range(26), 2) |
| current[a], current[b] = current[b], current[a] |
| score = _score("".join(current)) |
| if score > current_score: |
| current_score = score |
| no_improve = 0 |
| else: |
| current[a], current[b] = current[b], current[a] |
| no_improve += 1 |
| if no_improve > iterations // 4: |
| break |
| if current_score > best_score: |
| best_score = current_score |
| best_key = "".join(current) |
|
|
| plaintext = _apply_key(text.upper(), best_key) |
| return plaintext, best_key, best_score |
|
|
|
|
| def rail_fence_decrypt(text: str, rails: int) -> str: |
| """Reverse rail-fence transposition for a given number of rails.""" |
| letters = clean_letters(text) |
| n = len(letters) |
| if rails < 2 or n == 0: |
| return letters |
| |
| pattern = [] |
| rail, direction = 0, 1 |
| for _ in range(n): |
| pattern.append(rail) |
| if rail == 0: |
| direction = 1 |
| elif rail == rails - 1: |
| direction = -1 |
| rail += direction |
| |
| indices = sorted(range(n), key=lambda i: (pattern[i], i)) |
| result = [""] * n |
| for ct_pos, orig_pos in enumerate(indices): |
| result[orig_pos] = letters[ct_pos] |
| return "".join(result) |
|
|
|
|
| def best_rail_fence_candidates(text: str, max_rails: int = 15) -> list[tuple[int, str, float, int]]: |
| """Brute-force rail counts 2..max_rails, return best decodes sorted by English quality.""" |
| candidates = [] |
| for r in range(2, max_rails + 1): |
| decoded = rail_fence_decrypt(text, r) |
| chi = chi_squared_for_english(decoded) |
| ws = word_score(decoded) |
| candidates.append((r, decoded, chi, ws)) |
| candidates.sort(key=lambda x: (-x[3], x[2])) |
| return candidates[:5] |
|
|
|
|
| def columnar_transposition_decrypt(text: str, key: str) -> str: |
| """Reverse columnar transposition given a known keyword.""" |
| letters = clean_letters(text) |
| key_letters = clean_letters(key) |
| if not key_letters or not letters: |
| return letters |
| cols = len(key_letters) |
| n = len(letters) |
| full_rows, remainder = divmod(n, cols) |
| order = sorted(range(cols), key=lambda i: (key_letters[i], i)) |
| |
| col_lengths = [full_rows + (1 if rank < remainder else 0) |
| for rank in [sorted(order).index(o) for o in order]] |
| col_lengths_by_order = [0] * cols |
| for rank, orig_col in enumerate(order): |
| col_lengths_by_order[orig_col] = full_rows + (1 if rank < remainder else 0) |
| |
| columns: list[str] = [] |
| pos = 0 |
| for orig_col in range(cols): |
| length = col_lengths_by_order[orig_col] |
| columns.append(letters[pos:pos + length]) |
| pos += length |
| |
| result = [] |
| col_ptrs = [0] * cols |
| for _ in range(full_rows + (1 if remainder else 0)): |
| for c in range(cols): |
| if col_ptrs[c] < len(columns[c]): |
| result.append(columns[c][col_ptrs[c]]) |
| col_ptrs[c] += 1 |
| return "".join(result) |
|
|
|
|
| def frequency_table(letters: str) -> list[tuple[str, int, float]]: |
| n = max(len(letters), 1) |
| counts = Counter(letters) |
| return [(ch, count, round(100 * count / n, 2)) for ch, count in counts.most_common()] |
|
|
|
|
|
|
| def analyze_evidence(text: str) -> Evidence: |
| letters = clean_letters(text) |
| top_letters = frequency_table(letters)[:10] |
| ioc = index_of_coincidence(letters) |
| entropy = shannon_entropy(letters) |
| chi = chi_squared_for_english(letters) |
| caesar_candidates = best_caesar_candidates(text, 5) |
| affine_cands = best_affine_candidates(text, 5) if len(letters) >= 30 else [] |
| atbash_plain = atbash(text) |
| friedman = round(friedman_key_length(letters), 2) |
| kasiski = kasiski_key_lengths(letters) |
| transp, bigram_sup = transposition_signal(letters) |
|
|
| notes: list[str] = [] |
| if len(letters) < 40: |
| notes.append("Short samples are hard to classify reliably. Add more ciphertext for better evidence.") |
| if 0.060 <= ioc <= 0.075: |
| notes.append("Index of coincidence is close to natural English (~0.067), which often points to plaintext, monoalphabetic substitution, or transposition.") |
| elif 0.035 <= ioc <= 0.050: |
| notes.append("Index of coincidence is below English, which can suggest a polyalphabetic cipher such as Vigenère.") |
| if caesar_candidates and caesar_candidates[0][2] > 0: |
| notes.append(f"Caesar shift {caesar_candidates[0][0]} produces recognizable English words.") |
| if affine_cands and affine_cands[0][3] > 0: |
| a, b, _, words, _ = affine_cands[0] |
| notes.append(f"Affine key (a={a}, b={b}) produces {words} English-word match(es).") |
| if word_score(atbash_plain) > 0: |
| notes.append("Atbash reversal produces recognizable English words.") |
| if friedman and 2 <= friedman <= 12: |
| notes.append(f"Friedman estimate suggests a Vigenère-like key length near {friedman:.1f}.") |
| if kasiski: |
| top_k = ", ".join(f"{k} (×{n})" for k, n in kasiski[:3]) |
| notes.append(f"Kasiski-style repeated trigrams support key length(s): {top_k}.") |
| if transp >= 0.55 and bigram_sup <= 0.35: |
| notes.append("Letter frequencies look English but common bigrams (TH, HE, IN…) are scarce — classic transposition signature.") |
| if entropy > 4.2: |
| notes.append("Entropy is relatively high for A–Z text; this may indicate stronger mixing or a short/noisy sample.") |
| if len(set(letters)) < 10 and len(letters) > 20: |
| notes.append("Very few unique letters; this sample may be too constrained or not normal prose.") |
|
|
| return Evidence( |
| letters=len(letters), |
| unique_letters=len(set(letters)), |
| index_of_coincidence=round(ioc, 5), |
| entropy=round(entropy, 4), |
| chi_squared=round(chi, 2) if math.isfinite(chi) else float("inf"), |
| top_letters=top_letters, |
| top_bigrams=ngram_counts(letters, 2), |
| top_trigrams=ngram_counts(letters, 3), |
| caesar_candidates=caesar_candidates, |
| atbash_plaintext=atbash_plain, |
| affine_candidates=affine_cands, |
| friedman_key_length=friedman, |
| kasiski_key_lengths=kasiski, |
| transposition_signal=transp, |
| bigram_support=bigram_sup, |
| notes=notes, |
| ) |
|
|
|
|
| |
| |
| |
| _ALL_LABELS: list[str] = [ |
| "adfgvx", "adfgx", "aeneas_tacticus", "affine", "alberti_disk", "argenti", |
| "arnold_andre", "atbash", "autokey", "babington", "bacon_cipher", "bazeries", |
| "beaufort", "bifid", "book_cipher", "caesar", "caesar_rot", "cardano_autokey", |
| "chaocipher", "chinese_telegraph", "columnar", "columnar_transposition", |
| "commercial_code", "confederate_vigenere", "copiale", "culper_ring", "diana", |
| "double_transposition", "enigma", "fialka", "four_square", "fractionated_morse", |
| "geez_monastic", "geheimschreiber", "great_cipher", "gronsfeld", "hill", |
| "homophonic", "jefferson_disk", "jn25", "joseon_yeokhak", "kama_sutra", "kl7", |
| "kryha", "kryptos", "lorenz", "m209", "m94", "monoalphabetic", "morse_code", |
| "navajo_code", "nihilist", "nomenclator", "null_cipher", "one_time_pad", |
| "pigpen", "plaintext", "playfair", "polybius", "porta", "purple", "rail_fence", |
| "red_type_a", "rot13", "running_key", "scytale", "sigaba", "slidex", |
| "solitaire", "stager_route", "straddling_checkerboard", "substitution", |
| "tap_code", "trifid", "trithemius", "two_square", "typex", "venona_pad_reuse", |
| "vernam", "vic", "vigenere", "voynich_render", "wallis_cipher", "wheatstone", |
| "zimmermann", |
| ] |
|
|
| |
| _PRIOR = 1.0 / len(_ALL_LABELS) |
|
|
|
|
| def _build_scores(**overrides: float) -> dict[str, float]: |
| """Return a score dict seeded with the uniform prior and the given boosts.""" |
| s = {lbl: _PRIOR for lbl in _ALL_LABELS} |
| for k, v in overrides.items(): |
| if k in s: |
| s[k] = max(s[k], v) |
| return s |
|
|
|
|
| def _norm_pred(scores: dict[str, float], source: str = "heuristic") -> ModelPrediction: |
| total = sum(scores.values()) or 1.0 |
| norm = {k: round(v / total, 6) for k, v in scores.items()} |
| label = max(norm, key=norm.get) |
| return ModelPrediction(label, norm[label], norm, source) |
|
|
|
|
| def _deterministic(label: str, confidence: float) -> ModelPrediction: |
| """Return a near-certain prediction with one dominant label.""" |
| fill = (1.0 - confidence) / max(len(_ALL_LABELS) - 1, 1) |
| scores = {lbl: fill for lbl in _ALL_LABELS} |
| scores[label] = confidence |
| return ModelPrediction(label, confidence, scores, "heuristic") |
|
|
|
|
| def heuristic_classify(text: str) -> ModelPrediction: |
| """Multi-tier transparent heuristic classifier covering all 81 cipher labels. |
| |
| Tier 1 – Definitive character-set / format rules: non-alphabetic or highly |
| constrained patterns that uniquely identify a cipher. |
| Tier 2 – Statistical alphabet analysis: IoC, chi-squared, transposition |
| signal, Friedman/Kasiski, and brute-force shifts for the ~50 |
| ciphers that produce plain A–Z lettertext. |
| """ |
| stripped = text.strip() |
| no_space = re.sub(r"\s+", "", stripped) |
|
|
| |
| |
| |
|
|
| |
| if re.match(r"^[\s.]+$", stripped) and "." in stripped and "-" not in stripped: |
| return _deterministic("tap_code", 0.93) |
|
|
| |
| if re.match(r"^[.\-/ \t\n]+$", stripped) and "-" in stripped: |
| return _deterministic("morse_code", 0.93) |
|
|
| |
| _PIGPEN = set("¬•⌐┌┐├┬┴┼▽") |
| if any(c in _PIGPEN for c in stripped): |
| return _deterministic("pigpen", 0.91) |
|
|
| |
| if "⟨" in stripped or "⟩" in stripped: |
| return _deterministic("babington", 0.93) |
|
|
| |
| letters = clean_letters(text) |
| if "+" in stripped and letters: |
| return _deterministic("trifid", 0.88) |
|
|
| |
| if "-" in stripped and "/" in stripped and letters and "." not in stripped: |
| if len(set(letters)) > 8: |
| return _deterministic("navajo_code", 0.83) |
|
|
| |
| non_letter_non_space = set(c for c in no_space if not c.isalpha()) |
| |
| |
| |
| if non_letter_non_space and non_letter_non_space.issubset({"2", "3", "4", "5", "6", "7"}) and letters: |
| return _deterministic("venona_pad_reuse", 0.87) |
|
|
| |
| |
| _voynich_words = {"oror", "oxed", "otol", "chedy", "shedy", "dainy", "otaiin", |
| "cheol", "raiin", "aiiin", "chol", "chor", "daral", "shal", |
| "daiin", "chedal", "sheedy", "okedy", "otain", "qokedy", |
| "okain", "shaiin", "sharal", "okal", "okshy"} |
| |
| |
| _basic_english = { |
| "the", "and", "that", "have", "for", "not", "with", "you", "this", "but", |
| "are", "was", "its", "his", "her", "our", "can", "has", "had", "all", |
| "one", "two", "new", "old", "now", "say", "who", "may", "use", "into", |
| "from", "they", "what", "when", "each", "will", "also", "then", "come", |
| "look", "call", "give", "over", "just", "know", "time", "long", "down", |
| "most", "some", "take", "only", "good", "even", "back", "year", "much", |
| "right", "name", "after", "little", "place", "great", "being", "same", |
| "still", "found", "many", "should", "before", "other", "where", "while", |
| "about", "people", "number", "sound", "sentence", |
| } |
| if stripped and stripped[0].islower(): |
| lc_words = set(re.findall(r"[a-z]+", stripped.lower())) |
| if len(lc_words) >= 3 and lc_words & _voynich_words: |
| return _deterministic("voynich_render", 0.85) |
| |
| |
| |
| if (all(c.islower() or c == " " for c in stripped) |
| and len(stripped) > 8 |
| and word_score(stripped) < 2 |
| and not (lc_words & _basic_english)): |
| return _deterministic("voynich_render", 0.72) |
|
|
| |
| |
| |
|
|
| |
| |
| |
| if re.match(r"^(\d+\.\d+\.\d+\s*)+$", stripped): |
| triples = re.findall(r"(\d+)\.(\d+)\.(\d+)", stripped) |
| if triples: |
| max_line = max(int(t[1]) for t in triples) |
| if max_line <= 8: |
| return _deterministic("arnold_andre", 0.88) |
| return _deterministic("book_cipher", 0.85) |
| return _deterministic("arnold_andre", 0.89) |
|
|
| |
| if re.search(r"\d+\.\d+", stripped) and re.search(r"\d", stripped): |
| tokens = stripped.split() |
| if all(re.match(r"^\d+[.\d]*$", t) for t in tokens if t): |
| return _deterministic("book_cipher", 0.85) |
|
|
| |
| digits_only = re.sub(r"\s", "", stripped) |
| if digits_only and digits_only.isdigit() and not letters: |
| tokens = [t for t in stripped.split() if t] |
| if not tokens: |
| pass |
| else: |
| tok_lens = [len(t) for t in tokens if t.isdigit()] |
| avg_len = sum(tok_lens) / len(tok_lens) if tok_lens else 0 |
| nums = [int(t) for t in tokens if t.isdigit()] |
|
|
| |
| if all(len(t) == 2 and t.isdigit() and all(c in "12345" for c in t) |
| for t in tokens): |
| return _deterministic("polybius", 0.91) |
|
|
| |
| if all(len(t) == 4 and t.isdigit() for t in tokens) and len(tokens) >= 3: |
| return _deterministic("chinese_telegraph", 0.89) |
|
|
| |
| |
| if all(len(t) == 5 and t.isdigit() for t in tokens): |
| vals = [int(t) for t in tokens] |
| above_50k = sum(1 for v in vals if v >= 50000) |
| below_50k = sum(1 for v in vals if v < 50000) |
| above_90k = sum(1 for v in vals if v >= 90000) |
| |
| if above_90k / len(vals) >= 0.60: |
| return _deterministic("zimmermann", 0.85) |
| |
| if below_50k / len(vals) >= 0.80 and above_50k == 0: |
| return _deterministic("jn25", 0.83) |
|
|
| |
| if nums and all(len(t) in (4, 5) and t.isdigit() for t in tokens): |
| above_90k_any = sum(1 for n in nums if n >= 90000) |
| if above_90k_any / len(nums) >= 0.50: |
| return _deterministic("zimmermann", 0.78) |
|
|
| |
| |
| if all(len(t) == 3 for t in tokens if t.isdigit()): |
| in_range = sum(1 for n in nums if 800 <= n <= 999) |
| if len(nums) > 0 and in_range / len(nums) >= 0.70: |
| return _deterministic("culper_ring", 0.85) |
|
|
| |
| |
| if " " not in stripped and len(stripped) >= 10: |
| if len(stripped) >= 35: |
| return _deterministic("vic", 0.70) |
| return _deterministic("straddling_checkerboard", 0.72) |
|
|
| |
| if all(len(t) == 2 for t in tokens) and len(tokens) >= 4: |
| low_26 = sum(1 for n in nums if 1 <= n <= 26) |
| high_50 = sum(1 for n in nums if 50 <= n <= 99) |
| |
| |
| if low_26 / len(nums) >= 0.50 and high_50 >= 1: |
| return _deterministic("nomenclator", 0.65) |
| |
| if all(1 <= n <= 26 for n in nums): |
| return _deterministic("aeneas_tacticus", 0.72) |
| |
| |
| |
| |
| _w90 = (nums.count(90) + nums.count(91)) / len(nums) |
| if _w90 >= 0.10: |
| return _deterministic("wallis_cipher", 0.70) |
| |
| |
| |
| below_22 = sum(1 for n in nums if n < 22) |
| in_nihilist_range = sum(1 for n in nums if 22 <= n <= 99) |
| if below_22 == 0 and in_nihilist_range / len(nums) >= 0.80: |
| return _deterministic("nihilist", 0.65) |
| |
| if below_22 / len(nums) >= 0.15: |
| return _deterministic("homophonic", 0.50) |
| return _deterministic("homophonic", 0.45) |
|
|
| |
| |
| if avg_len < 3.0 and len(tokens) >= 5: |
| max_num = max(nums) if nums else 0 |
| |
| _w90 = (nums.count(90) + nums.count(91)) / len(nums) |
| if _w90 >= 0.10: |
| return _deterministic("wallis_cipher", 0.62) |
| if max_num <= 26: |
| return _deterministic("aeneas_tacticus", 0.55) |
| |
| if max_num <= 110: |
| below_22 = sum(1 for n in nums if n < 22) |
| if below_22 == 0: |
| return _deterministic("nihilist", 0.55) |
| |
| if max_num <= 99: |
| return _deterministic("homophonic", 0.50) |
| return _deterministic("homophonic", 0.45) |
|
|
| if avg_len < 3.5: |
| |
| return _deterministic("great_cipher", 0.50) |
|
|
| |
| return _deterministic("argenti", 0.45) |
|
|
| |
| if re.match(r"^([A-Za-z]\d\s+)*[A-Za-z]\d\s*$", stripped): |
| return _deterministic("copiale", 0.91) |
|
|
| |
| |
| |
| letter_set = set(letters) |
|
|
| if letters and len(letters) >= 10: |
| |
| if letter_set <= {"A", "D", "F", "G", "V", "X"}: |
| if "V" in letter_set: |
| return _deterministic("adfgvx", 0.93) |
| return _deterministic("adfgx", 0.91) |
|
|
| |
| if letter_set <= {"A", "D", "F", "G", "X"}: |
| return _deterministic("adfgx", 0.91) |
|
|
| |
| if letter_set <= {"A", "B"} and len(letters) >= 25: |
| return _deterministic("bacon_cipher", 0.93) |
|
|
| |
| word_groups = [w for w in text.upper().split() if re.match(r"^[A-Z]{5}$", w)] |
| if len(word_groups) >= 4 and len(word_groups) / max(len(text.split()), 1) >= 0.55: |
| zx_end = sum(1 for w in word_groups if w[-1] in "ZX") |
| if zx_end / len(word_groups) >= 0.25: |
| return _deterministic("commercial_code", 0.82) |
|
|
| |
| |
| |
| _al_words_raw = [w for w in stripped.upper().split() if re.match(r'^[A-Z]+$', w)] |
| if len(_al_words_raw) >= 6 and len(letters) >= 24: |
| _w4_count = sum(1 for w in _al_words_raw if len(w) == 4) |
| _w4_ratio = _w4_count / len(_al_words_raw) |
| if _w4_ratio >= 0.95: |
| _cnt_early = Counter(letters) |
| _n_early = len(letters) |
| _ioc_early = ( |
| sum(v * (v - 1) for v in _cnt_early.values()) / max(1, _n_early * (_n_early - 1)) |
| ) |
| if _ioc_early < 0.048: |
| if _ioc_early < 0.040: |
| return _deterministic("vernam", 0.52) |
| return _deterministic("running_key", 0.50) |
|
|
| |
| |
| |
| |
| n_letters = len(letters) |
|
|
| ev = analyze_evidence(text) |
| ioc = ev.index_of_coincidence |
| best_shift, best_chi, best_words, _ = ev.caesar_candidates[0] |
| atbash_words = word_score(ev.atbash_plaintext) |
| atbash_chi = chi_squared_for_english(clean_letters(ev.atbash_plaintext)) |
| affine_words = ev.affine_candidates[0][3] if ev.affine_candidates else 0 |
| raw_words = word_score(text) |
| raw_chi = chi_squared_for_english(letters) |
| fried = ev.friedman_key_length |
| kas_support = sum(n for _, n in ev.kasiski_key_lengths[:3]) |
| transp = ev.transposition_signal |
| bgm = ev.bigram_support |
| _uniq = len(set(letters)) |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| if raw_words >= 3 and raw_chi < 150: |
| return _deterministic("null_cipher", min(0.75, 0.15 + 0.06 * raw_words)) |
|
|
| |
| |
| |
| |
| _short = n_letters < 25 |
| _atbash_thr = 1 if _short else 2 |
| _bf_thr = 1 if _short else 2 |
|
|
| |
| |
| |
| if atbash_words >= _atbash_thr or (n_letters >= 25 and atbash_chi < 100): |
| return _deterministic("atbash", min(0.82, 0.25 + 0.07 * max(atbash_words, 1))) |
|
|
| |
| |
| if best_shift == 13 and (best_words >= _bf_thr or (n_letters >= 25 and best_chi < 100)): |
| return _deterministic("rot13", min(0.85, 0.25 + 0.08 * max(best_words, 1))) |
|
|
| |
| if best_words >= _bf_thr and best_shift != 0: |
| conf = min(0.82, 0.22 + 0.08 * best_words) |
| if affine_words >= best_words + 2: |
| return _deterministic("affine", min(0.78, 0.20 + 0.07 * affine_words)) |
| return _deterministic("caesar", conf) |
|
|
| |
| affine_chi = ev.affine_candidates[0][2] if ev.affine_candidates else 999 |
| best_a = ev.affine_candidates[0][0] if ev.affine_candidates else 1 |
| if affine_words >= 3: |
| return _deterministic("affine", min(0.75, 0.18 + 0.07 * affine_words)) |
| |
| |
| |
| if n_letters >= 25 and affine_chi < 50 and best_a != 1: |
| return _deterministic("affine", 0.65) |
|
|
| |
| |
|
|
| |
| |
| |
| |
| |
| if n_letters < 12: |
| return ModelPrediction("too_short", 0.20, {"too_short": 0.20}, "heuristic") |
|
|
| |
| |
| |
| |
| x_freq = letters.count("X") / max(1, n_letters) |
| no_x_letters = letters.replace("X", "") |
| x_chi = chi_squared_for_english(no_x_letters) if len(no_x_letters) >= 15 else 999 |
| if x_freq > 0.05 and x_chi < 80 and ioc >= 0.045: |
| |
| return _deterministic("scytale", 0.48) |
|
|
| |
| |
| |
| |
| _transp_by_chi = (ioc >= 0.055 and raw_chi < 80 and 0.20 <= bgm <= 0.70) |
| _transp_by_signal = (transp >= 0.50 and bgm <= 0.40 and ioc >= 0.055) |
| if _transp_by_chi or _transp_by_signal: |
| |
| |
| |
| |
| |
| |
| |
| _rf_cands = best_rail_fence_candidates(letters, max_rails=10) |
| if _rf_cands: |
| _best_rf_words = max(c[3] for c in _rf_cands) |
| if _best_rf_words >= 2: |
| return _deterministic("rail_fence", 0.52) |
| |
| |
| |
| if ioc >= 0.063: |
| return _deterministic("stager_route", 0.42) |
| if n_letters <= 150: |
| return _deterministic("columnar_transposition", 0.45) |
| else: |
| return _deterministic("double_transposition", 0.40) |
|
|
| |
| if ioc >= 0.058: |
| |
| |
| |
| |
| |
| if raw_chi < 50 and bgm > 0.85 and raw_words < 3: |
| return _deterministic("lorenz", 0.48) |
| |
| |
| |
| if " " in stripped and raw_words < 2 and n_letters >= 20: |
| return _deterministic("kama_sutra", 0.45) |
| if 0.058 <= ioc < 0.068: |
| |
| if transp >= 0.35 and bgm <= 0.50: |
| return _deterministic("columnar_transposition", 0.38) |
| |
| |
| if _uniq <= 20 and n_letters >= 25 and raw_chi > 200 and kas_support == 0: |
| return _deterministic("joseon_yeokhak", 0.30) |
| |
| if 0.058 <= ioc < 0.065 and _uniq <= 22 and n_letters >= 30: |
| return _deterministic("wheatstone", 0.30) |
| return _deterministic("monoalphabetic", 0.38) |
| |
| |
| if raw_chi > 300 and kas_support >= 4 and _uniq <= 18: |
| return _deterministic("geez_monastic", 0.32) |
| if ioc >= 0.070: |
| if transp >= 0.40: |
| return _deterministic("scytale", 0.38) |
| return _deterministic("monoalphabetic", 0.36) |
|
|
| |
| if 0.046 <= ioc < 0.058: |
| |
| |
| if _uniq <= 12 and n_letters >= 30: |
| return _deterministic("bazeries", 0.38) |
| |
| |
| if ioc < 0.050 and raw_chi > 250 and kas_support >= 2: |
| return _deterministic("cardano_autokey", 0.28) |
| |
| |
| |
| if 0.049 <= ioc < 0.058 and raw_chi > 200 and kas_support >= 2: |
| return _deterministic("porta", 0.32) |
| |
| |
| if ioc >= 0.053 and raw_chi > 180 and kas_support >= 3 and _uniq <= 22: |
| return _deterministic("fractionated_morse", 0.30) |
| if 0.049 <= ioc < 0.058: |
| return _deterministic("playfair", 0.32) |
| return _deterministic("playfair", 0.35) |
|
|
| |
| if 0.040 <= ioc < 0.046: |
| |
| |
| if raw_chi > 130: |
| |
| if kas_support >= 3 and raw_chi > 350: |
| return _deterministic("slidex", 0.28) |
| if kas_support == 0: |
| |
| if raw_chi > 450: |
| return _deterministic("jefferson_disk", 0.28) |
| |
| if raw_chi > 280 and ioc >= 0.043: |
| return _deterministic("confederate_vigenere", 0.28) |
| |
| if raw_chi > 250 and ioc < 0.043: |
| return _deterministic("hill", 0.28) |
| |
| if raw_chi > 130 and ioc < 0.043: |
| return _deterministic("bifid", 0.28) |
| if 2 <= fried <= 12: |
| |
| |
| |
| if 2 <= fried <= 6 and kas_support >= 2: |
| return _deterministic("gronsfeld", 0.38) |
| if kas_support >= 3: |
| return _deterministic("vigenere", 0.42) |
| |
| |
| |
| if kas_support <= 1 and n_letters >= 40: |
| if ioc < 0.042: |
| return _deterministic("purple", 0.28) |
| if 0.042 <= ioc < 0.044: |
| return _deterministic("geheimschreiber", 0.28) |
| return _deterministic("diana", 0.28) |
| return _deterministic("vigenere", 0.36) |
| |
| |
| if kas_support == 0: |
| |
| if ioc < 0.042 and n_letters >= 20: |
| if _uniq <= 18: |
| return _deterministic("red_type_a", 0.28) |
| return _deterministic("fialka", 0.28) |
| return _deterministic("beaufort", 0.32) |
| return _deterministic("beaufort", 0.28) |
|
|
| |
| if ioc < 0.040: |
| |
| |
| |
| if n_letters >= 25: |
| _tri_dec = "".join( |
| chr((ord(c) - 65 - i % 26) % 26 + 65) |
| for i, c in enumerate(letters) |
| ) |
| if chi_squared_for_english(_tri_dec) < 50 and word_score(_tri_dec) >= 3: |
| return _deterministic("trithemius", 0.60) |
| |
| |
| |
| |
| if ioc >= 0.036 and kas_support == 0 and fried >= max(15, n_letters * 0.4): |
| return _deterministic("autokey", 0.38) |
| |
| |
| |
| |
| if ioc < 0.026: |
| return _deterministic("chaocipher", 0.28) |
| if ioc < 0.031: |
| return _deterministic("m209", 0.27) |
| |
| |
| if raw_chi > 550 and _uniq >= 24 and kas_support == 0: |
| return _deterministic("one_time_pad", 0.30) |
| return _deterministic("enigma", 0.25) |
|
|
| |
| return _deterministic("vigenere", 0.22) |
|
|
|
|
|
|
| def build_explanation(text: str, pred: ModelPrediction) -> str: |
| ev = analyze_evidence(text) |
| lines = [ |
| f"### Detective conclusion: `{pred.label}`", |
| f"**Confidence:** {pred.confidence:.1%} ", |
| f"**Prediction source:** {pred.source}", |
| "", |
| "### Evidence", |
| f"- Letters analyzed: **{ev.letters}**", |
| f"- Unique A-Z letters: **{ev.unique_letters}**", |
| f"- Index of coincidence: **{ev.index_of_coincidence}**", |
| f"- Shannon entropy: **{ev.entropy}** bits/letter", |
| f"- English chi-squared score: **{ev.chi_squared}**", |
| "", |
| "### Why that matters", |
| ] |
| if ev.notes: |
| lines.extend([f"- {note}" for note in ev.notes]) |
| else: |
| lines.append("- No single signal dominates. Treat this as a hypothesis, not a proof.") |
|
|
| lines += [ |
| "", |
| "### Top Caesar / ROT candidates", |
| "| Shift | Word clues | Chi-squared | Preview |", |
| "|---:|---:|---:|---|", |
| ] |
| for shift, chi, score, decoded in ev.caesar_candidates[:5]: |
| preview = decoded[:110].replace("\n", " ") |
| lines.append(f"| {shift} | {score} | {chi:.2f} | `{preview}` |") |
|
|
| if ev.affine_candidates: |
| lines += [ |
| "", |
| "### Top Affine candidates", |
| "| a | b | Word clues | Chi-squared | Preview |", |
| "|---:|---:|---:|---:|---|", |
| ] |
| for a, b, chi, score, decoded in ev.affine_candidates[:3]: |
| preview = decoded[:90].replace("\n", " ") |
| lines.append(f"| {a} | {b} | {score} | {chi:.2f} | `{preview}` |") |
|
|
| lines += [ |
| "", |
| "### Polyalphabetic indicators", |
| f"- Friedman key-length estimate: **{ev.friedman_key_length or '—'}**", |
| "- Kasiski candidate key lengths: " |
| + (", ".join(f"`{k}` (×{n})" for k, n in ev.kasiski_key_lengths[:5]) or "—"), |
| "", |
| "### Transposition indicators", |
| f"- Transposition signal: **{ev.transposition_signal}** (high = English letters but disrupted bigrams)", |
| f"- English-bigram support: **{ev.bigram_support}**", |
| ] |
|
|
| lines += [ |
| "", |
| "### Top letter frequencies", |
| "| Letter | Count | Percent |", |
| "|---|---:|---:|", |
| ] |
| for ch, count, pct in ev.top_letters[:10]: |
| lines.append(f"| {ch} | {count} | {pct}% |") |
|
|
| lines += [ |
| "", |
| "### Reality check", |
| "This project teaches classical cryptanalysis signals. It does **not** break modern encryption, recover passwords, bypass access controls, or prove anything about real-world cryptographic security.", |
| ] |
| return "\n".join(lines) |
|
|
|
|