Spaces:
Sleeping
Sleeping
| from paddleocr import PaddleOCR | |
| import re | |
# Shared OCR engine, built once at import time with the angle classifier
# enabled (use_angle_cls=True) and the language set to 'en'.
ocr = PaddleOCR(lang='en', use_angle_cls=True)

# PAN: five capital letters, four digits, one capital letter.
PAN_REGEX = r'\b[A-Z]{5}[0-9]{4}[A-Z]\b'

# Aadhaar: twelve digits in three groups of four; each group boundary may
# carry a single whitespace character or hyphen.
AADHAAR_REGEX = r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b'

# Date-of-birth patterns. extract_dob tries them in this order on each line,
# so the bare-year pattern stays last as the loosest match.
DOB_REGEX = [
    r'\b\d{2}[./-]\d{2}[./-]\d{4}\b',  # 12/04/1980 or 12-04-1980
    r'\b\d{4}-\d{2}-\d{2}\b',  # 1980-04-12
    r'\b\d{2}[./-](JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)[./-]\d{4}\b',  # 12-APR-1980
    r'\b(19|20)\d{2}\b',  # Year only
]

# Gender labels looked for (upper-cased) in the OCR text.
GENDERS = ["MALE", "FEMALE", "TRANSGENDER"]
def extract_kyc_fields(file_path, force_type=None):
    """Run OCR on a document image and extract PAN or Aadhaar KYC fields.

    Args:
        file_path: Image location, passed unchanged to ``ocr.ocr``.
        force_type: Optional card type (e.g. "PAN" or "AADHAAR", any case).
            When truthy, auto-detection is skipped and this value
            (stripped and upper-cased) is used as the card type.

    Returns:
        A dict. On success it always contains ``card_type`` and then:

        * PAN: ``pan_number``, ``dob``, ``name``
        * AADHAAR: ``aadhaar_number``, ``dob``, ``gender``, ``name``
        * anything else: ``error`` explaining the document was not identified

        Fields that cannot be located hold the string "Not found". If any
        exception is raised while processing, a dict with only ``error`` is
        returned instead of propagating the exception.
    """
    try:
        result = ocr.ocr(file_path, cls=True)

        # Flatten the OCR output into whitespace-normalised, non-empty lines.
        lines = []
        for block in result or []:
            # Some PaddleOCR releases report a page with no detected text as
            # None; skip it so a blank image ends up as "not identified"
            # rather than as an OCR failure.
            if not block:
                continue
            for line in block:
                text = re.sub(r'\s+', ' ', line[1][0].strip())
                if text:
                    lines.append(text)
        full_text = "\n".join(lines)

        # Search once; the matches serve both detection and extraction.
        pan_match = re.search(PAN_REGEX, full_text)
        aadhaar_match = re.search(AADHAAR_REGEX, full_text)

        # A PAN number takes precedence over an Aadhaar number when the
        # caller did not force a type.
        if force_type:
            card_type = force_type.strip().upper()
        elif pan_match:
            card_type = "PAN"
        elif aadhaar_match:
            card_type = "AADHAAR"
        else:
            card_type = "UNKNOWN"

        response = {"card_type": card_type}
        if card_type == "PAN":
            response["pan_number"] = pan_match.group(0) if pan_match else "Not found"
            response["dob"] = extract_dob(lines)
            response["name"] = extract_pan_name(lines)
        elif card_type == "AADHAAR":
            response["aadhaar_number"] = (
                aadhaar_match.group(0) if aadhaar_match else "Not found"
            )
            response["dob"] = extract_dob(lines)
            response["gender"] = extract_gender(lines)
            response["name"] = extract_aadhaar_name(lines)
        else:
            response["error"] = "Could not identify document as PAN or Aadhaar."
        return response
    except Exception as e:
        # Boundary handler: callers receive an error dict, never an exception.
        return {"error": f"OCR processing failed: {e}"}
def extract_dob(lines, patterns=None):
    """Return the first date-like substring found in *lines*.

    Lines are scanned in order; on each line the patterns are tried in the
    order given and the first hit is returned. Matching is case-insensitive.

    Args:
        lines: Iterable of text lines.
        patterns: Optional iterable of regex strings to use instead of the
            module-level ``DOB_REGEX``.

    Returns:
        The matched text, or "Not found" when no pattern matches any line.
    """
    if patterns is None:
        patterns = DOB_REGEX
    # Compile once up front rather than re-resolving each pattern per line.
    matchers = [re.compile(pattern, re.IGNORECASE) for pattern in patterns]

    for line in lines:
        for matcher in matchers:
            match = matcher.search(line)
            if match:
                return match.group(0)
    return "Not found"
def extract_gender(lines, genders=None):
    """Return the first gender label that appears in *lines*.

    Each line is upper-cased and searched for every label. A label only
    counts when it is not embedded in a longer run of letters, so "FEMALE"
    is never reported as "MALE" and a name such as "Kamalesh" is not
    mistaken for a gender.

    Args:
        lines: Iterable of text lines.
        genders: Optional iterable of labels to look for instead of the
            module-level ``GENDERS``.

    Returns:
        The matching label exactly as listed, or "Not found".
    """
    if genders is None:
        genders = GENDERS
    # Letter-boundary lookarounds instead of a plain substring test.
    matchers = [
        (label, re.compile(rf'(?<![A-Z]){re.escape(label.upper())}(?![A-Z])'))
        for label in genders
    ]

    for line in lines:
        upper_line = line.upper()
        for label, matcher in matchers:
            if matcher.search(upper_line):
                return label
    return "Not found"
def extract_pan_name(lines):
    """Pick the card holder's name from PAN card OCR lines.

    Heuristic: once a line containing "INCOME TAX DEPARTMENT" (any case) is
    seen, the first later line made up solely of capital letters, whitespace
    and dots is taken as the name, unless it contains one of the header
    words INDIA, GOVT or DEPARTMENT.

    Returns:
        The stripped name line, or "Not found".
    """
    header = "INCOME TAX DEPARTMENT"
    header_words = ("INDIA", "GOVT", "DEPARTMENT")
    name_shape = re.compile(r'^[A-Z\s.]+$')

    for position, text in enumerate(lines):
        if header not in text.upper():
            continue
        for following in lines[position + 1:]:
            candidate = following.strip()
            if not name_shape.match(candidate) or re.search(r'\d', candidate):
                continue
            upper_candidate = candidate.upper()
            # Skip words like INDIA, GOVT, DEPARTMENT
            if any(word in upper_candidate for word in header_words):
                continue
            return candidate
    return "Not found"
# Card boilerplate: rejected wherever it appears inside a candidate line.
_AADHAAR_BOILERPLATE = ("INDIA", "GOVERNMENT")
# Field labels: rejected only as whole words, so names that merely contain
# them (e.g. "Kamalesh", "Dobriyal") are still accepted.
_AADHAAR_LABEL_WORDS = frozenset({"DOB", "MALE", "FEMALE"})


def _is_plausible_aadhaar_name(text):
    """Return True if *text* has at least two words, no digits, and no card boilerplate or field label."""
    if len(text.split()) < 2 or re.search(r'\d', text):
        return False
    upper_text = text.upper()
    if any(phrase in upper_text for phrase in _AADHAAR_BOILERPLATE):
        return False
    words = set(re.findall(r'[A-Z]+', upper_text))
    return words.isdisjoint(_AADHAAR_LABEL_WORDS)


def extract_aadhaar_name(lines, dob_patterns=None):
    """Pick the card holder's name from Aadhaar card OCR lines.

    Heuristic: the name is usually the line directly above the date of
    birth. Every line matching a DOB pattern (case-insensitively, as in
    ``extract_dob``) has its predecessor checked first. If none qualifies,
    the first plausible line anywhere in *lines* is used.

    Args:
        lines: Sequence of text lines in reading order.
        dob_patterns: Optional iterable of regex strings identifying a DOB
            line, used instead of the module-level ``DOB_REGEX``.

    Returns:
        The stripped name line, or "Not found".
    """
    if dob_patterns is None:
        dob_patterns = DOB_REGEX
    dob_matchers = [re.compile(pattern, re.IGNORECASE) for pattern in dob_patterns]

    # Preferred: the line immediately above a DOB line.
    for i, line in enumerate(lines):
        if i > 0 and any(matcher.search(line) for matcher in dob_matchers):
            candidate_name = lines[i - 1].strip()
            if _is_plausible_aadhaar_name(candidate_name):
                return candidate_name

    # Fallback: first line with >=2 words and no digits.
    for line in lines:
        candidate_name = line.strip()
        if _is_plausible_aadhaar_name(candidate_name):
            return candidate_name
    return "Not found"