Spaces:
Runtime error
Runtime error
"""
Pokemon Card OCR Module

Extracts card identity text (name, collector number, HP) from card front images
using pytesseract with targeted region cropping and preprocessing.
"""
| import re | |
| from typing import Dict, Optional | |
| import cv2 | |
| import numpy as np | |
class CardOCR:
    """
    Extracts text from Pokemon card front images using pytesseract.

    The card is first located inside the photo, then specific card regions
    are cropped before OCR to improve accuracy:
    - Top 18% × left 65% of card: card name
    - Bottom 15% × full width: collector number (e.g. "025 / 165")
    - Top-right 25% × 12%: HP value
    """

    # Game suffixes that may follow a Pokemon name.  They are never chosen
    # as the name itself, but are appended when they directly follow it.
    _NAME_SUFFIXES = frozenset({"EX", "GX", "V", "VMAX", "VSTAR", "TAG", "TEAM"})

    # Card-layout words that show up in the name strip but are not names.
    _METADATA_STOPWORDS = frozenset({
        "BASIC", "STAGE", "EVOLVES", "FROM",
        "POKEMON", "TRAINER", "ENERGY", "ITEM",
        "SUPPORTER", "ABILITY", "RETREAT", "WEAKNESS",
        "RESISTANCE", "ATTACK", "DAMAGE",
    })

    def extract(self, image_bgr: np.ndarray) -> Dict[str, Optional[str]]:
        """
        Extract card identity fields from a BGR image.

        Args:
            image_bgr: OpenCV BGR image (H×W×3)

        Returns:
            Dict with keys: name, collector_number, set_total, hp, raw_text.
            All values are str or None if not detected.  Every value is None
            when pytesseract is not installed or the image is None/empty.
        """
        empty_result: Dict[str, Optional[str]] = {
            "name": None,
            "collector_number": None,
            "set_total": None,
            "hp": None,
            "raw_text": None,
        }

        try:
            import pytesseract  # noqa: F401 — verify available at call time
        except ImportError:
            return empty_result

        # Guard against a failed image load (cv2.imread returns None) or an
        # empty array; without this the localiser crashes on `.shape`.
        if image_bgr is None or getattr(image_bgr, "size", 0) == 0:
            return empty_result

        # Locate the card within the photo (handles cards not filling the frame)
        card_img = self._locate_card(image_bgr)

        name_crop = self._crop_name_region(card_img)
        number_crop = self._crop_number_region(card_img)
        hp_crop = self._crop_hp_region(card_img)

        # PSM 11 = sparse text (find as much text as possible, in no
        # particular order); works better than PSM 7 for bold card-name
        # fonts on coloured backgrounds.
        name_text = self._run_ocr(self._preprocess_name(name_crop), psm=11)
        # PSM 6 = single uniform block — the number crop is full-width so it
        # may span multiple short lines (e.g. illustrator above, number below).
        number_text = self._run_ocr(self._preprocess(number_crop), psm=6)
        # PSM 7 = single text line.
        hp_text = self._run_ocr(self._preprocess(hp_crop), psm=7)

        raw_text = f"{name_text} | {number_text} | {hp_text}"
        name = self._parse_name(name_text)

        # Fallback: if the standard name crop returned nothing, try a taller
        # region (top 25%, full width) — handles cards where the name banner
        # sits lower or the card crop is slightly mis-aligned.
        if name is None:
            h_c, w_c = card_img.shape[:2]
            wide_crop = card_img[0 : int(h_c * 0.25), :]
            wide_text = self._run_ocr(self._preprocess_name(wide_crop), psm=11)
            name = self._parse_name(wide_text)
            if name:
                # Record the fallback text only when it actually produced a name.
                raw_text = f"{name_text}[wide:{wide_text}] | {number_text} | {hp_text}"

        collector_number, set_total = self._parse_collector_number(number_text)
        hp = self._parse_hp(hp_text)

        return {
            "name": name,
            "collector_number": collector_number,
            "set_total": set_total,
            "hp": hp,
            "raw_text": raw_text,
        }

    # ------------------------------------------------------------------ #
    # Card localiser                                                       #
    # ------------------------------------------------------------------ #
    def _locate_card(self, img: np.ndarray) -> np.ndarray:
        """
        Locate the Pokemon card within a photo and return a tight crop.

        Two strategies are attempted in order:
        1. Background subtraction — thresholds near-white pixels (> 230)
           as background and takes the bounding box of the largest
           remaining foreground contour.  Accepted only when that contour
           covers more than 5 % and less than 85 % of the image area.
        2. Canny edge contours — walks the 8 largest edge contours (largest
           first) and takes the first card-shaped one covering at least
           5 % of the image area.  Used as a fallback for coloured or
           textured backgrounds.

        Pokemon cards have a fixed aspect ratio of ≈ 0.714 (63 mm / 88 mm).
        Both strategies accept a bounding box whose short/long side ratio
        is in [0.50, 0.90], so portrait and landscape cards both qualify.

        If neither strategy finds a card-shaped region, a centre crop
        (12 % trimmed top and bottom, 5 % trimmed left and right) is used so
        OCR still runs (with lower accuracy) rather than crashing.

        Every returned crop is passed through ``_correct_orientation``.
        """
        img_h, img_w = img.shape[:2]
        img_area = img_h * img_w

        def _card_shaped(w: int, h: int) -> bool:
            """Accept both portrait (≈0.714) and landscape (≈1.40) card aspects."""
            if h == 0:
                return False
            ratio = w / h
            if ratio > 1:
                ratio = 1 / ratio  # normalise landscape → portrait range
            return 0.50 <= ratio <= 0.90

        def _apply_crop(x: int, y: int, w: int, h: int) -> np.ndarray:
            """Crop the bounding box plus a small padding, clamped to the image."""
            pad = max(5, int(min(img_w, img_h) * 0.01))
            x = max(0, x - pad)
            y = max(0, y - pad)
            w = min(img_w - x, w + 2 * pad)
            h = min(img_h - y, h + 2 * pad)
            return img[y : y + h, x : x + w]

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # ── Strategy 1: background subtraction ──────────────────────────
        # Works well when the card is on a near-white (> 230) surface.
        # Skipped if the largest contour fills ≥ 85% of the image — that
        # means the background is dark/coloured and the threshold swept up
        # the whole frame as "foreground" rather than isolating the card.
        _, mask = cv2.threshold(gray, 230, 255, cv2.THRESH_BINARY_INV)
        fg_cnts, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        if fg_cnts:
            largest = max(fg_cnts, key=cv2.contourArea)
            area = cv2.contourArea(largest)
            if img_area * 0.05 < area < img_area * 0.85:
                x, y, w, h = cv2.boundingRect(largest)
                if _card_shaped(w, h):
                    return self._correct_orientation(_apply_crop(x, y, w, h))

        # ── Strategy 2: Canny edge contours ─────────────────────────────
        blurred = cv2.GaussianBlur(gray, (5, 5), 0)
        edges = cv2.Canny(blurred, 30, 100)
        kernel = np.ones((5, 5), np.uint8)
        # Dilation closes small gaps in the card outline so it forms one contour.
        dilated = cv2.dilate(edges, kernel, iterations=2)
        edge_cnts, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        for cnt in sorted(edge_cnts, key=cv2.contourArea, reverse=True)[:8]:
            area = cv2.contourArea(cnt)
            if area < img_area * 0.05:
                break  # sorted by area, so everything after is smaller still
            x, y, w, h = cv2.boundingRect(cnt)
            if _card_shaped(w, h):
                return self._correct_orientation(_apply_crop(x, y, w, h))

        # ── Fallback: centre crop ─────────────────────────────────────────
        # Neither strategy found a card-shaped region.  Most phone photos
        # place the card near the centre; strip the outer margins to remove
        # the status bar, navigation UI, and table background.
        cy, cx = int(img_h * 0.12), int(img_w * 0.05)
        centre = img[cy: img_h - cy, cx: img_w - cx]
        return self._correct_orientation(centre)

    def _correct_orientation(self, card: np.ndarray) -> np.ndarray:
        """
        If the card appears landscape (width > height), rotate it to portrait.

        Uses a deterministic clockwise rotation to avoid extra OCR calls in the
        orientation path, which can be slow or unstable on noisy images.
        Square or portrait crops are returned unchanged.
        """
        h, w = card.shape[:2]
        if w <= h:
            return card  # already portrait
        return cv2.rotate(card, cv2.ROTATE_90_CLOCKWISE)

    def _name_text_score(self, card: np.ndarray) -> int:
        """
        Quick OCR of the name region; returns count of alphabetic characters.

        Intended as an orientation-selection score.  Nothing in this class
        calls it at present (``_correct_orientation`` rotates
        deterministically instead).  Returns 0 on any failure, including
        the 2-second OCR timeout.
        """
        try:
            import pytesseract

            crop = self._crop_name_region(card)
            processed = self._preprocess_name(crop)
            text = pytesseract.image_to_string(
                processed,
                config="--oem 3 --psm 11",
                timeout=2,
            )
            return sum(1 for c in text if c.isalpha())
        except Exception:
            # Best-effort score: a failed OCR simply scores zero.
            return 0

    # ------------------------------------------------------------------ #
    # Region croppers                                                      #
    # ------------------------------------------------------------------ #
    def _crop_name_region(self, img: np.ndarray) -> np.ndarray:
        """Top 18% of the card, left 65% width — contains card name.

        The right 35% holds the HP value and type icons; excluding it
        prevents those elements from confusing the name OCR."""
        h, w = img.shape[:2]
        return img[0 : int(h * 0.18), 0 : int(w * 0.65)]

    def _crop_number_region(self, img: np.ndarray) -> np.ndarray:
        """Bottom 15% × full width — collector number.

        Older cards (XY era) place the number at the bottom-right;
        newer cards (SV era, 2023+) place it at the bottom-centre.
        Using the full width ensures both layouts are covered.
        The collector-number regex is specific enough to ignore the
        weakness/resistance icons and copyright text also in this strip.
        """
        h, w = img.shape[:2]
        return img[int(h * 0.85) : h, :]

    def _crop_hp_region(self, img: np.ndarray) -> np.ndarray:
        """Top-right 25% width × 12% height — HP value."""
        h, w = img.shape[:2]
        return img[0 : int(h * 0.12), int(w * 0.75) : w]

    # ------------------------------------------------------------------ #
    # Preprocessing                                                        #
    # ------------------------------------------------------------------ #
    def _preprocess(self, crop: np.ndarray) -> np.ndarray:
        """
        Grayscale → 3× upscale → adaptive threshold → light Gaussian blur.

        Used for the collector-number and HP regions where the background
        is relatively uniform.  Returns a single-channel image; an empty
        crop is returned unchanged.
        """
        if crop.size == 0:
            return crop
        gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)
        h, w = gray.shape
        scaled = cv2.resize(gray, (w * 3, h * 3), interpolation=cv2.INTER_CUBIC)
        thresh = cv2.adaptiveThreshold(
            scaled, 255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY, 31, 10
        )
        # A small blur softens isolated speckles left by the threshold.
        denoised = cv2.GaussianBlur(thresh, (3, 3), 0)
        return denoised

    def _preprocess_name(self, crop: np.ndarray) -> np.ndarray:
        """
        Grayscale → 3× upscale → Otsu global threshold.

        Otsu outperforms adaptive threshold on the coloured card-name
        banner (fire=red, water=blue, etc.) because the global optimum
        cleanly separates dark text pixels from the vivid background
        without producing the salt-and-pepper noise that adaptive
        thresholding creates on gradient / patterned backgrounds.
        An empty crop is returned unchanged.
        """
        if crop.size == 0:
            return crop
        gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)
        h, w = gray.shape
        scaled = cv2.resize(gray, (w * 3, h * 3), interpolation=cv2.INTER_CUBIC)
        _, thresh = cv2.threshold(
            scaled, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
        )
        return thresh

    # ------------------------------------------------------------------ #
    # OCR runner                                                           #
    # ------------------------------------------------------------------ #
    def _run_ocr(self, crop: np.ndarray, psm: int = 7) -> str:
        """
        Run pytesseract with OEM 3 (default engine mode) and the given PSM.

        PSM 7 = single line; PSM 6 = single block (use for multi-line);
        PSM 11 = sparse text.

        Returns the stripped OCR text, or an empty string when the crop is
        None/empty or on any failure (missing pytesseract, missing tesseract
        binary, 5-second timeout, ...).
        """
        if crop is None or crop.size == 0:
            return ""
        try:
            import pytesseract

            config = f"--oem 3 --psm {psm}"
            text = pytesseract.image_to_string(crop, config=config, timeout=5)
            return text.strip()
        except Exception:
            # Deliberate best-effort: one failed region must not abort the
            # whole extraction, so the field is treated as "not detected".
            return ""

    # ------------------------------------------------------------------ #
    # Parsers                                                              #
    # ------------------------------------------------------------------ #
    def _parse_name(self, text: str) -> Optional[str]:
        """
        Extract the Pokemon name from (potentially noisy) OCR text.

        The name region frequently contains single-letter noise from
        background textures, the "BASIC / STAGE 1" banner, or artwork
        elements.  Rather than returning the whole cleaned string, this
        method picks the *longest* word with at least 3 alphabetic
        characters (falling back to 2), skipping layout stopwords and
        game suffixes.  Ties go to the earliest word.

        A known game suffix (EX, GX, V, VMAX, VSTAR, TAG, TEAM) immediately
        after the best word is appended, e.g. "Charizard EX".

        Returns None when the text is empty or no usable word is found.
        """
        if not text:
            return None

        # Strip non-letter chars (keep accented letters used in some names)
        cleaned = re.sub(r"[^A-Za-z\s'\-éèêëÉÈàâùûü]", " ", text)
        words = cleaned.split()
        if not words:
            return None

        scored = []
        for i, w in enumerate(words):
            # ASCII-only upper-case form, used purely for the set lookups.
            token = re.sub(r"[^A-Za-z]", "", w).upper()
            if not token:
                continue
            if token in self._NAME_SUFFIXES or token in self._METADATA_STOPWORDS:
                continue
            alpha_count = sum(1 for c in w if c.isalpha())
            scored.append((alpha_count, i, w))

        # Prefer words with ≥ 3 alpha chars (covers "Mew", "Ditto", …)
        candidates = [entry for entry in scored if entry[0] >= 3]
        if not candidates:
            # Fallback: accept ≥ 2 alpha chars rather than returning None
            candidates = [entry for entry in scored if entry[0] >= 2]
        if not candidates:
            return None

        # Longest word wins; ties broken by earliest position
        _, best_idx, best_word = min(candidates, key=lambda e: (-e[0], e[1]))

        # Trim stray apostrophes/hyphens from the edges.  After cleaning,
        # these are the only non-letter characters a word can contain, so
        # accented letters at either end (e.g. "Flabébé") are preserved.
        best_word = best_word.strip("'-")
        if not best_word:
            return None

        # Capitalise: keep short all-caps tokens (EX, GX) unchanged
        def _cap(w: str) -> str:
            return w if (len(w) <= 4 and w.isupper()) else w.capitalize()

        name_parts = [_cap(best_word)]

        # Append suffix if immediately following the name
        if best_idx + 1 < len(words):
            nxt = re.sub(r"[^A-Za-z]", "", words[best_idx + 1]).upper()
            if nxt in self._NAME_SUFFIXES:
                name_parts.append(nxt)

        result = " ".join(name_parts)
        return result if re.search(r"[A-Za-z]", result) else None

    def _parse_collector_number(self, text: str) -> tuple[Optional[str], Optional[str]]:
        """
        Parse collector number from the bottom-strip OCR text.

        Matches patterns like: "025/165", "025 / 165", "SV025/165".
        If no slash pattern is present, falls back to the first standalone
        2–3 digit number and reports no set total.

        Returns (collector_number, set_total) or (None, None).
        """
        if not text:
            return None, None

        match = re.search(r"([A-Z0-9]{1,6})\s*/\s*(\d{2,4})", text)
        if match:
            return match.group(1), match.group(2)

        # Fallback: plain number without slash.
        # Use 2-3 digits only to avoid single-digit OCR noise.
        match = re.search(r"\b(\d{2,3})\b", text)
        if match:
            return match.group(1), None

        return None, None

    def _parse_hp(self, text: str) -> Optional[str]:
        """
        Parse HP value from top-right region text.

        Matches patterns like: "HP 120", "120HP", "120".
        A number carrying an explicit HP label is returned as-is.  Without
        a label, the first standalone number inside the realistic HP range
        (10–340) is returned, so out-of-range OCR noise ahead of the real
        value is skipped.

        Returns the numeric HP string or None.
        """
        if not text:
            return None

        # "HP 120" or "120 HP"
        match = re.search(r"\b(\d{1,4})\s*[Hh][Pp]\b|\b[Hh][Pp]\s*(\d{1,4})\b", text)
        if match:
            return match.group(1) or match.group(2)

        # Plain number in HP region (region is small so noise is low)
        for candidate in re.finditer(r"\b(\d{1,4})\b", text):
            val = int(candidate.group(1))
            if 10 <= val <= 340:  # realistic Pokemon HP range
                return str(val)

        return None