""" GraphoLab core — Writer Identification. Provides: - writer_identify() identify the writer of a handwriting sample """ from __future__ import annotations import io import threading from pathlib import Path import matplotlib matplotlib.use("Agg") import matplotlib.pyplot as plt import numpy as np from PIL import Image, ImageDraw, ImageFont from skimage import filters, transform as sk_transform from skimage.feature import hog, local_binary_pattern from sklearn.pipeline import Pipeline from sklearn.preprocessing import LabelEncoder, StandardScaler from sklearn.svm import SVC # ────────────────────────────────────────────────────────────────────────────── # Configuration # ────────────────────────────────────────────────────────────────────────────── WRITER_IMG_SIZE = (128, 256) # (H, W) for feature extraction _WRITER_NAMES = { 0: "Scrittore A", 1: "Scrittore B", 2: "Scrittore C", 3: "Scrittore D", 4: "Scrittore E", } _FONTS_DIR = Path("C:/Windows/Fonts") _WRITER_FONTS = [ ("Inkfree.ttf", 19), ("LHANDW.TTF", 17), ("segoepr.ttf", 18), ("segoesc.ttf", 16), ("comic.ttf", 18), ] _SENTENCES = [ "il gatto dorme sul tetto", "la casa è piccola e bella", "oggi il cielo è molto blu", "scrivere a mano è un'arte", "ogni persona ha uno stile", "il sole tramonta a ovest", "leggo un libro ogni sera", "la penna scorre sul foglio", "le parole raccontano storie", "questo è un campione scritto", ] # ────────────────────────────────────────────────────────────────────────────── # Lazy model state # ────────────────────────────────────────────────────────────────────────────── _writer_clf: Pipeline | None = None _writer_le: LabelEncoder | None = None _writer_X_scaled: np.ndarray | None = None _writer_dist_threshold: float | None = None _writer_lock = threading.Lock() # ────────────────────────────────────────────────────────────────────────────── # Internal helpers # ────────────────────────────────────────────────────────────────────────────── def _make_synthetic_writer(writer_id: int, sample_id: int) -> Image.Image: """Generate a synthetic handwriting sample using system TTF fonts.""" rng = np.random.default_rng(writer_id * 1000 + sample_id) font_name, base_size = _WRITER_FONTS[writer_id % len(_WRITER_FONTS)] font_size = base_size + int(rng.integers(-1, 2)) try: font = ImageFont.truetype(str(_FONTS_DIR / font_name), font_size) except Exception: font = ImageFont.load_default() ink_value = int([25, 15, 35, 20, 30][writer_id % 5] + rng.integers(-5, 6)) lines = [ _SENTENCES[(writer_id * 3 + sample_id + i) % len(_SENTENCES)] for i in range(3) ] w, h = 320, 140 img = Image.new("L", (w, h), 255) draw = ImageDraw.Draw(img) line_gap = font_size + 12 + int(rng.integers(-2, 3)) y = 10 for line in lines: x = 8 + int(rng.integers(-3, 4)) draw.text((x, y), line, fill=ink_value, font=font) y += line_gap angle = float(rng.uniform(-1.5, 1.5)) img = img.rotate(angle, fillcolor=255, expand=False) return img def _preprocess_writer_img(pil_img: Image.Image) -> np.ndarray: """Convert PIL image to normalised grayscale array of WRITER_IMG_SIZE.""" gray = pil_img.convert("L") w, h = gray.size target_ratio = WRITER_IMG_SIZE[1] / WRITER_IMG_SIZE[0] # 2.0 if h > w: crop_h = int(w / target_ratio) top = h // 6 top = min(top, max(0, h - crop_h)) gray = gray.crop((0, top, w, top + crop_h)) arr = np.array(gray, dtype=np.float32) thresh = filters.threshold_otsu(arr) if arr.std() > 1 else 128.0 binary = (arr < thresh).astype(np.float32) resized = sk_transform.resize(binary, WRITER_IMG_SIZE, anti_aliasing=True) return resized.astype(np.float32) def _extract_writer_features(pil_img: Image.Image) -> np.ndarray: """Extract HOG + LBP + run-length features for writer identification.""" arr = _preprocess_writer_img(pil_img) arr8 = (arr * 255).astype(np.uint8) hog_feats = hog( arr, orientations=9, pixels_per_cell=(16, 16), cells_per_block=(2, 2), feature_vector=True, ) lbp = local_binary_pattern(arr8, P=24, R=3, method="uniform") lbp_hist, _ = np.histogram(lbp, bins=26, range=(0, 26), density=True) def _run_stats(binary_row): runs = [] cnt = 0 for v in binary_row: if v > 0.5: cnt += 1 elif cnt > 0: runs.append(cnt) cnt = 0 if cnt > 0: runs.append(cnt) return runs h_runs, v_runs = [], [] for row in arr: h_runs.extend(_run_stats(row)) for col in arr.T: v_runs.extend(_run_stats(col)) h_arr = np.array(h_runs, dtype=np.float32) if h_runs else np.array([0.0]) v_arr = np.array(v_runs, dtype=np.float32) if v_runs else np.array([0.0]) run_feats = np.array([ h_arr.mean(), h_arr.std(), h_arr.max(), v_arr.mean(), v_arr.std(), v_arr.max(), ], dtype=np.float32) return np.concatenate([hog_feats, lbp_hist, run_feats]) def _load_real_writer_samples(samples_dir: Path) -> tuple[list, list] | None: """Load samples from data/samples/writer_XX/sample_YY.png directories.""" writer_dirs = sorted(samples_dir.glob("writer_??")) if len(writer_dirs) < 2: return None X, y = [], [] for wd in writer_dirs: samples = sorted(wd.glob("sample_*.png")) if len(samples) < 3: continue for sp in samples: try: img = Image.open(sp) X.append(_extract_writer_features(img)) y.append(wd.name) except Exception: pass if len(set(y)) < 2: return None return X, y def _get_writer_model(samples_dir: Path): """Return (Pipeline, LabelEncoder), training lazily on first call (thread-safe).""" global _writer_clf, _writer_le, _writer_X_scaled, _writer_dist_threshold if _writer_clf is not None: return _writer_clf, _writer_le with _writer_lock: if _writer_clf is not None: return _writer_clf, _writer_le print("Training writer identification model...") real = _load_real_writer_samples(samples_dir) if real is not None: X_raw, labels = real else: X_raw, labels = [], [] for wid in range(5): for sid in range(10): img = _make_synthetic_writer(wid, sid) X_raw.append(_extract_writer_features(img)) labels.append(_WRITER_NAMES[wid]) le = LabelEncoder() y_enc = le.fit_transform(labels) X = np.array(X_raw) clf = Pipeline([ ("scaler", StandardScaler()), ("svc", SVC(kernel="rbf", C=10, gamma="scale", probability=True)), ]) clf.fit(X, y_enc) X_scaled = clf.named_steps["scaler"].transform(X) max_intra = 0.0 for cls in np.unique(y_enc): Xc = X_scaled[y_enc == cls] if len(Xc) < 2: continue diff = Xc[:, np.newaxis, :] - Xc[np.newaxis, :, :] dists = np.sqrt((diff ** 2).sum(axis=2)) np.fill_diagonal(dists, np.inf) max_intra = max(max_intra, dists.min(axis=1).max()) _writer_X_scaled = X_scaled _writer_dist_threshold = max_intra * 2.0 _writer_clf = clf _writer_le = le print( f"Writer model ready — {len(le.classes_)} writers, {len(X)} samples. " f"Rejection threshold: {_writer_dist_threshold:.3f}" ) return _writer_clf, _writer_le def ensure_writer_examples(examples_dir: Path) -> list[str]: """Pre-generate example images for UI examples.""" examples_dir.mkdir(parents=True, exist_ok=True) paths = [] for wid in range(5): p = examples_dir / f"writer_{wid}_example.png" if not p.exists(): img = _make_synthetic_writer(wid, sample_id=99) img.save(str(p)) paths.append(str(p)) return paths # ────────────────────────────────────────────────────────────────────────────── # Core function # ────────────────────────────────────────────────────────────────────────────── def writer_identify(image: np.ndarray, samples_dir: Path) -> tuple[str, np.ndarray | None]: """Identify the most likely writer of a handwriting sample. Args: image: RGB numpy array of the handwriting sample. samples_dir: Path to data/samples/ directory (for real writer samples). Returns: report_md: Markdown with ranked candidates. chart: Bar chart as numpy array (or None on error). """ if image is None: return "Carica un'immagine di testo manoscritto.", None try: clf, le = _get_writer_model(samples_dir) except Exception as e: return f"Errore nel caricamento del modello: {e}", None pil_img = Image.fromarray(image) try: feat = _extract_writer_features(pil_img) except Exception as e: return f"Errore nell'estrazione delle caratteristiche: {e}", None proba = clf.predict_proba([feat])[0] order = np.argsort(proba)[::-1] names = le.inverse_transform(order) scores = proba[order] is_unknown = False if _writer_X_scaled is not None and _writer_dist_threshold is not None: feat_scaled = clf.named_steps["scaler"].transform([feat])[0] min_dist = np.linalg.norm(_writer_X_scaled - feat_scaled, axis=1).min() is_unknown = min_dist > _writer_dist_threshold rows = "\n".join( f"| {'🥇' if i == 0 else '🥈' if i == 1 else '🥉' if i == 2 else ' '} " f"**{name}** | {score:.1%} |" for i, (name, score) in enumerate(zip(names, scores)) ) if is_unknown: report_md = ( "**⚠️ Scrittore non identificato nel database**\n\n" "La scrittura analizzata non corrisponde a nessuno degli scrittori noti. " "Le probabilità di seguito hanno valore puramente indicativo " "e **non devono essere usate per un'attribuzione**.\n\n" "| Candidato | Probabilità (riferimento) |\n" "|-----------|---------------------------|\n" + rows + "\n\n*La distanza dal campione più simile nel database supera la soglia " "di affidabilità. Aggiungere campioni dello scrittore al database per " "un confronto diretto.*" ) else: report_md = ( "**Identificazione Scrittore — Risultati**\n\n" "| Candidato | Probabilità |\n" "|-----------|-------------|\n" + rows + "\n\n*I risultati si basano su caratteristiche HOG + LBP + statistiche dei tratti.*" ) if _load_real_writer_samples(samples_dir) is None: report_md += ( "\n\n⚠️ *Dati sintetici: il modello è addestrato su scritture generate " "artificialmente. Per risultati forensi reali, popola `data/samples/writer_XX/`.*" ) # Bar chart fig, ax = plt.subplots(figsize=(5, max(2.5, len(names) * 0.55))) if is_unknown: colors = ["#aaaaaa"] * len(names) chart_title = "Scrittore non nel database — solo riferimento" else: colors = [ "#1B3A6B" if i == 0 else "#C8973A" if i == 1 else "#9eb8e0" for i in range(len(names)) ] chart_title = "Probabilità per scrittore" ax.barh(names[::-1], scores[::-1] * 100, color=colors[::-1]) ax.set_xlabel("Probabilità (%)") ax.set_xlim(0, 105) ax.set_title(chart_title) for i, (name, score) in enumerate(zip(names[::-1], scores[::-1])): ax.text(score * 100 + 1, i, f"{score:.1%}", va="center", fontsize=9) plt.tight_layout() buf = io.BytesIO() fig.savefig(buf, format="png", dpi=120) plt.close(fig) buf.seek(0) chart_arr = np.array(Image.open(buf)) return report_md, chart_arr