| | |
| | """ |
| | Hurricane OCR - License Plate Extraction Module |
| | Extracts structured information from Thai vehicle license plate OCR results |
| | |
| | Output format compatible with standard Thai OCR APIs |
| | """ |
| |
|
| | import re |
| | import time |
| | from typing import Dict, Optional, Any, List |
| | from dataclasses import dataclass, asdict |
| |
|
| |
|
| | |
# Thai-language names of the provinces recognised by the extractor
# (77 entries, Bangkok first). The order matters to callers that return the
# first entry found in a text.
THAI_PROVINCES = [
    "กรุงเทพมหานคร",
    "กระบี่",
    "กาญจนบุรี",
    "กาฬสินธุ์",
    "กำแพงเพชร",
    "ขอนแก่น",
    "จันทบุรี",
    "ฉะเชิงเทรา",
    "ชลบุรี",
    "ชัยนาท",
    "ชัยภูมิ",
    "ชุมพร",
    "เชียงราย",
    "เชียงใหม่",
    "ตรัง",
    "ตราด",
    "ตาก",
    "นครนายก",
    "นครปฐม",
    "นครพนม",
    "นครราชสีมา",
    "นครศรีธรรมราช",
    "นครสวรรค์",
    "นนทบุรี",
    "นราธิวาส",
    "น่าน",
    "บึงกาฬ",
    "บุรีรัมย์",
    "ปทุมธานี",
    "ประจวบคีรีขันธ์",
    "ปราจีนบุรี",
    "ปัตตานี",
    "พระนครศรีอยุธยา",
    "พังงา",
    "พัทลุง",
    "พิจิตร",
    "พิษณุโลก",
    "เพชรบุรี",
    "เพชรบูรณ์",
    "แพร่",
    "พะเยา",
    "ภูเก็ต",
    "มหาสารคาม",
    "มุกดาหาร",
    "แม่ฮ่องสอน",
    "ยโสธร",
    "ยะลา",
    "ร้อยเอ็ด",
    "ระนอง",
    "ระยอง",
    "ราชบุรี",
    "ลพบุรี",
    "ลำปาง",
    "ลำพูน",
    "เลย",
    "ศรีสะเกษ",
    "สกลนคร",
    "สงขลา",
    "สตูล",
    "สมุทรปราการ",
    "สมุทรสงคราม",
    "สมุทรสาคร",
    "สระแก้ว",
    "สระบุรี",
    "สิงห์บุรี",
    "สุโขทัย",
    "สุพรรณบุรี",
    "สุราษฎร์ธานี",
    "สุรินทร์",
    "หนองคาย",
    "หนองบัวลำภู",
    "อ่างทอง",
    "อุดรธานี",
    "อุทัยธานี",
    "อุตรดิตถ์",
    "อุบลราชธานี",
    "อำนาจเจริญ",
]
| |
|
| | |
# Vehicle category (Thai name) -> Thai letters associated with that category.
# NOTE(review): the motorcycle entry holds the single string "ก-ฮ" (a range
# notation), not individual letters like the other entries - confirm with
# whoever consumes this table before relying on membership tests.
VEHICLE_CATEGORIES = {
    "รถยนต์นั่งส่วนบุคคลไม่เกิน 7 คน": [
        "ก", "ข", "ค", "ง", "จ",
        "ฉ", "ช", "ซ", "ฌ", "ญ",
    ],
    "รถยนต์นั่งส่วนบุคคลเกิน 7 คน": [
        "ฎ", "ฏ", "ฐ", "ฑ", "ฒ",
    ],
    "รถยนต์บรรทุกส่วนบุคคล": [
        "ณ", "ด", "ต", "ถ", "ท",
        "ธ", "น", "บ", "ป", "ผ",
        "ฝ", "พ", "ฟ", "ภ", "ม",
        "ย", "ร", "ล", "ว", "ศ",
        "ษ", "ส", "ห", "ฬ", "อ",
    ],
    "รถจักรยานยนต์": ["ก-ฮ"],
    "รถแท็กซี่": ["ท"],
    "รถตู้โดยสาร": ["ฮ"],
}
| |
|
| |
|
@dataclass
class LicensePlateInfo:
    """
    Result record for one license plate extraction.

    Holds the response envelope (status, message, timing, file name) together
    with the fields extracted from the OCR text, and offers three dictionary
    views of that data.
    """

    # Response envelope
    status_code: int = 200
    message: str = "Success"
    inference: str = "0.000"  # elapsed time, formatted as a string
    file_name: str = ""

    # Plate identity
    plate_number: Optional[str] = None
    plate_characters: Optional[str] = None
    plate_digits: Optional[str] = None
    province: Optional[str] = None
    province_en: Optional[str] = None

    # Vehicle / plate description
    vehicle_category: Optional[str] = None
    plate_color: Optional[str] = None
    plate_type: Optional[str] = None

    # OCR text the fields were extracted from
    raw_text: Optional[str] = None

    # Extraction confidence score
    confidence: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Return every field as a plain dictionary, in declaration order."""
        every_field = asdict(self)
        return every_field

    def to_display_dict(self) -> Dict[str, Any]:
        """Return the plate fields keyed by bilingual (Thai / English) labels."""
        labelled_values = (
            ("เลขทะเบียน (Plate Number)", self.plate_number),
            ("ตัวอักษร (Characters)", self.plate_characters),
            ("ตัวเลข (Digits)", self.plate_digits),
            ("จังหวัด (Province)", self.province),
            ("Province (EN)", self.province_en),
            ("ประเภทรถ (Vehicle Category)", self.vehicle_category),
            ("สีป้าย (Plate Color)", self.plate_color),
            ("ประเภทป้าย (Plate Type)", self.plate_type),
        )
        return dict(labelled_values)

    def to_api_response(self) -> Dict[str, Any]:
        """
        Return the record in API response layout.

        Same keys as ``to_dict`` but with ``confidence`` placed before
        ``raw_text``.
        """
        response_keys = (
            "status_code",
            "message",
            "inference",
            "file_name",
            "plate_number",
            "plate_characters",
            "plate_digits",
            "province",
            "province_en",
            "vehicle_category",
            "plate_color",
            "plate_type",
            "confidence",
            "raw_text",
        )
        return {key: getattr(self, key) for key in response_keys}
| |
|
| |
|
| | class ThaiLicensePlateExtractor: |
| | """ |
| | Extracts structured information from Thai license plate OCR text |
| | """ |
| | |
| | |
| | PROVINCE_EN_MAP = { |
| | "กรุงเทพมหานคร": "Bangkok", |
| | "กระบี่": "Krabi", |
| | "กาญจนบุรี": "Kanchanaburi", |
| | "กาฬสินธุ์": "Kalasin", |
| | "กำแพงเพชร": "Kamphaeng Phet", |
| | "ขอนแก่น": "Khon Kaen", |
| | "จันทบุรี": "Chanthaburi", |
| | "ฉะเชิงเทรา": "Chachoengsao", |
| | "ชลบุรี": "Chonburi", |
| | "ชัยนาท": "Chai Nat", |
| | "ชัยภูมิ": "Chaiyaphum", |
| | "ชุมพร": "Chumphon", |
| | "เชียงราย": "Chiang Rai", |
| | "เชียงใหม่": "Chiang Mai", |
| | "ตรัง": "Trang", |
| | "ตราด": "Trat", |
| | "ตาก": "Tak", |
| | "นครนายก": "Nakhon Nayok", |
| | "นครปฐม": "Nakhon Pathom", |
| | "นครพนม": "Nakhon Phanom", |
| | "นครราชสีมา": "Nakhon Ratchasima", |
| | "นครศรีธรรมราช": "Nakhon Si Thammarat", |
| | "นครสวรรค์": "Nakhon Sawan", |
| | "นนทบุรี": "Nonthaburi", |
| | "นราธิวาส": "Narathiwat", |
| | "น่าน": "Nan", |
| | "บึงกาฬ": "Bueng Kan", |
| | "บุรีรัมย์": "Buriram", |
| | "ปทุมธานี": "Pathum Thani", |
| | "ประจวบคีรีขันธ์": "Prachuap Khiri Khan", |
| | "ปราจีนบุรี": "Prachinburi", |
| | "ปัตตานี": "Pattani", |
| | "พระนครศรีอยุธยา": "Phra Nakhon Si Ayutthaya", |
| | "พังงา": "Phang Nga", |
| | "พัทลุง": "Phatthalung", |
| | "พิจิตร": "Phichit", |
| | "พิษณุโลก": "Phitsanulok", |
| | "เพชรบุรี": "Phetchaburi", |
| | "เพชรบูรณ์": "Phetchabun", |
| | "แพร่": "Phrae", |
| | "พะเยา": "Phayao", |
| | "ภูเก็ต": "Phuket", |
| | "มหาสารคาม": "Maha Sarakham", |
| | "มุกดาหาร": "Mukdahan", |
| | "แม่ฮ่องสอน": "Mae Hong Son", |
| | "ยโสธร": "Yasothon", |
| | "ยะลา": "Yala", |
| | "ร้อยเอ็ด": "Roi Et", |
| | "ระนอง": "Ranong", |
| | "ระยอง": "Rayong", |
| | "ราชบุรี": "Ratchaburi", |
| | "ลพบุรี": "Lopburi", |
| | "ลำปาง": "Lampang", |
| | "ลำพูน": "Lamphun", |
| | "เลย": "Loei", |
| | "ศรีสะเกษ": "Sisaket", |
| | "สกลนคร": "Sakon Nakhon", |
| | "สงขลา": "Songkhla", |
| | "สตูล": "Satun", |
| | "สมุทรปราการ": "Samut Prakan", |
| | "สมุทรสงคราม": "Samut Songkhram", |
| | "สมุทรสาคร": "Samut Sakhon", |
| | "สระแก้ว": "Sa Kaeo", |
| | "สระบุรี": "Saraburi", |
| | "สิงห์บุรี": "Sing Buri", |
| | "สุโขทัย": "Sukhothai", |
| | "สุพรรณบุรี": "Suphan Buri", |
| | "สุราษฎร์ธานี": "Surat Thani", |
| | "สุรินทร์": "Surin", |
| | "หนองคาย": "Nong Khai", |
| | "หนองบัวลำภู": "Nong Bua Lamphu", |
| | "อ่างทอง": "Ang Thong", |
| | "อุดรธานี": "Udon Thani", |
| | "อุทัยธานี": "Uthai Thani", |
| | "อุตรดิตถ์": "Uttaradit", |
| | "อุบลราชธานี": "Ubon Ratchathani", |
| | "อำนาจเจริญ": "Amnat Charoen" |
| | } |
| | |
| | def __init__(self): |
| | self.start_time = None |
| | |
| | def _start_timer(self): |
| | self.start_time = time.time() |
| | |
| | def _get_inference_time(self) -> str: |
| | if self.start_time: |
| | return f"{time.time() - self.start_time:.3f}" |
| | return "0.000" |
| | |
| | def extract_plate_number(self, text: str) -> Optional[str]: |
| | """ |
| | Extract Thai license plate number |
| | |
| | Formats: |
| | - กก 1234 (2 Thai characters + 4 digits) |
| | - 1กก 1234 (1 digit + 2 Thai characters + 4 digits) |
| | - กก 123 (2 Thai characters + 3 digits - motorcycle) |
| | - 1234 (just digits for some formats) |
| | """ |
| | patterns = [ |
| | |
| | r'(\d?[\u0E01-\u0E4F]{1,3})\s*(\d{1,4})', |
| | |
| | r'\*\*(?:Plate\s*Number|เลขทะเบียน):\*\*\s*(\d?[\u0E01-\u0E4F]{1,3})\s*(\d{1,4})', |
| | |
| | r'([\u0E01-\u0E4F]{2,3})\s*(\d{2,4})', |
| | |
| | r'(\d[\u0E01-\u0E4F]{2})\s*(\d{1,4})', |
| | ] |
| | |
| | for pattern in patterns: |
| | match = re.search(pattern, text, re.UNICODE) |
| | if match: |
| | chars = match.group(1).strip() |
| | digits = match.group(2).strip() |
| | |
| | if re.search(r'[\u0E01-\u0E4F]', chars) and digits.isdigit(): |
| | return f"{chars} {digits}" |
| | |
| | return None |
| | |
| | def extract_plate_characters(self, text: str) -> Optional[str]: |
| | """Extract the character portion of the plate (e.g., กก, 1กก)""" |
| | plate = self.extract_plate_number(text) |
| | if plate: |
| | |
| | parts = plate.split() |
| | if parts: |
| | return parts[0] |
| | |
| | |
| | patterns = [ |
| | r'\*\*(?:Characters|ตัวอักษร):\*\*\s*(\d?[\u0E01-\u0E4F]{1,3})', |
| | r'ตัวอักษร[:\s]*(\d?[\u0E01-\u0E4F]{1,3})', |
| | ] |
| | |
| | for pattern in patterns: |
| | match = re.search(pattern, text, re.UNICODE) |
| | if match: |
| | return match.group(1).strip() |
| | |
| | return None |
| | |
| | def extract_plate_digits(self, text: str) -> Optional[str]: |
| | """Extract the digit portion of the plate (e.g., 1234)""" |
| | plate = self.extract_plate_number(text) |
| | if plate: |
| | |
| | parts = plate.split() |
| | if len(parts) >= 2: |
| | return parts[1] |
| | |
| | |
| | patterns = [ |
| | r'\*\*(?:Digits|ตัวเลข):\*\*\s*(\d{1,4})', |
| | r'ตัวเลข[:\s]*(\d{1,4})', |
| | ] |
| | |
| | for pattern in patterns: |
| | match = re.search(pattern, text) |
| | if match: |
| | return match.group(1).strip() |
| | |
| | return None |
| | |
| | def extract_province(self, text: str) -> Optional[str]: |
| | """Extract Thai province name""" |
| | |
| | patterns = [ |
| | r'\*\*(?:Province|จังหวัด):\*\*\s*([\u0E01-\u0E4F]+)', |
| | r'จังหวัด[:\s]*([\u0E01-\u0E4F]+)', |
| | ] |
| | |
| | for pattern in patterns: |
| | match = re.search(pattern, text, re.UNICODE) |
| | if match: |
| | province = match.group(1).strip() |
| | |
| | for p in THAI_PROVINCES: |
| | if p in province or province in p: |
| | return p |
| | return province |
| | |
| | |
| | for province in THAI_PROVINCES: |
| | if province in text: |
| | return province |
| | |
| | return None |
| | |
| | def get_province_en(self, province_th: Optional[str]) -> Optional[str]: |
| | """Get English name for Thai province""" |
| | if province_th: |
| | return self.PROVINCE_EN_MAP.get(province_th) |
| | return None |
| | |
| | def extract_vehicle_category(self, text: str) -> Optional[str]: |
| | """Extract vehicle category""" |
| | categories = [ |
| | "รถยนต์นั่งส่วนบุคคล", |
| | "รถยนต์บรรทุกส่วนบุคคล", |
| | "รถจักรยานยนต์", |
| | "รถแท็กซี่", |
| | "รถตู้โดยสาร", |
| | "รถบรรทุก", |
| | "รถกระบะ", |
| | "รถเก๋ง", |
| | "รถตู้", |
| | "รถจักรยานยนต์ส่วนบุคคล", |
| | "รถยนต์สาธารณะ", |
| | ] |
| | |
| | |
| | match = re.search(r'\*\*(?:Vehicle\s*(?:Category|Type)|ประเภทรถ):\*\*\s*([\u0E01-\u0E4F\s]+)', text, re.UNICODE) |
| | if match: |
| | return match.group(1).strip() |
| | |
| | |
| | for cat in categories: |
| | if cat in text: |
| | return cat |
| | |
| | return None |
| | |
| | def extract_plate_color(self, text: str) -> Optional[str]: |
| | """Extract plate color""" |
| | colors = { |
| | "ขาว": "White", |
| | "เขียว": "Green", |
| | "เหลือง": "Yellow", |
| | "แดง": "Red", |
| | "น้ำเงิน": "Blue", |
| | "ดำ": "Black", |
| | } |
| | |
| | |
| | match = re.search(r'\*\*(?:Plate\s*Color|สีป้าย):\*\*\s*([\u0E01-\u0E4F]+)', text, re.UNICODE) |
| | if match: |
| | return match.group(1).strip() |
| | |
| | |
| | for color_th in colors.keys(): |
| | if color_th in text: |
| | return color_th |
| | |
| | return None |
| | |
| | def extract_plate_type(self, text: str) -> Optional[str]: |
| | """Extract plate type""" |
| | types = [ |
| | "ป้ายทะเบียนรถ", |
| | "ป้ายแดง", |
| | "ป้ายขาว", |
| | "ป้ายเขียว", |
| | "ป้ายทะเบียน", |
| | "ป้ายชั่วคราว", |
| | ] |
| | |
| | |
| | match = re.search(r'\*\*(?:Plate\s*Type|ประเภทป้าย):\*\*\s*([\u0E01-\u0E4F\s]+)', text, re.UNICODE) |
| | if match: |
| | return match.group(1).strip() |
| | |
| | |
| | for ptype in types: |
| | if ptype in text: |
| | return ptype |
| | |
| | return None |
| | |
| | def extract_all(self, ocr_text: str, file_name: str = "") -> LicensePlateInfo: |
| | """ |
| | Extract all information from OCR text |
| | |
| | Args: |
| | ocr_text: Raw OCR text result |
| | file_name: Original file name |
| | |
| | Returns: |
| | LicensePlateInfo with all extracted fields |
| | """ |
| | self._start_timer() |
| | |
| | province = self.extract_province(ocr_text) |
| | plate_number = self.extract_plate_number(ocr_text) |
| | |
| | info = LicensePlateInfo( |
| | status_code=200, |
| | message="Success", |
| | file_name=file_name, |
| | |
| | |
| | plate_number=plate_number, |
| | plate_characters=self.extract_plate_characters(ocr_text), |
| | plate_digits=self.extract_plate_digits(ocr_text), |
| | province=province, |
| | province_en=self.get_province_en(province), |
| | |
| | |
| | vehicle_category=self.extract_vehicle_category(ocr_text), |
| | plate_color=self.extract_plate_color(ocr_text), |
| | plate_type=self.extract_plate_type(ocr_text), |
| | |
| | |
| | raw_text=ocr_text, |
| | |
| | |
| | confidence=1.0 if plate_number and province else (0.5 if plate_number or province else 0.0), |
| | ) |
| | |
| | info.inference = self._get_inference_time() |
| | |
| | return info |
| |
|
| |
|
| | |
# Module-level extractor instance reused by the convenience functions in this
# module, so callers do not have to construct their own.
_plate_extractor = ThaiLicensePlateExtractor()
| |
|
| |
|
def extract_license_plate(ocr_text: str, file_name: str = "") -> LicensePlateInfo:
    """
    Run the shared extractor over OCR text.

    Args:
        ocr_text: Raw OCR text
        file_name: Original file name

    Returns:
        LicensePlateInfo populated by the module-level extractor
    """
    plate_info = _plate_extractor.extract_all(ocr_text, file_name)
    return plate_info
| |
|
| |
|
def extract_to_api_response(ocr_text: str, file_name: str = "") -> Dict[str, Any]:
    """
    Run the shared extractor and return the result as an API response.

    Args:
        ocr_text: Raw OCR text
        file_name: Original file name

    Returns:
        The dictionary produced by ``LicensePlateInfo.to_api_response``
    """
    return _plate_extractor.extract_all(ocr_text, file_name).to_api_response()
| |
|
| |
|
| | |
def extract_document_info(ocr_text: str) -> LicensePlateInfo:
    """Legacy alias for ``extract_license_plate``, called without a file name."""
    plate_info = extract_license_plate(ocr_text)
    return plate_info
| |
|