# Hugging Face Space: mohakapoor/CaptchaOCR (status: Running; file size: 7,817 bytes)

from captcha.image import ImageCaptcha
import random, string, os, csv, io
import pandas as pd
from PIL import Image, ImageDraw, ImageFilter
import numpy as np
import cv2

# ===== dataset generation settings =====
DATASET_DIR = "Dataset/captchas"  # root folder; one sub-folder per split lives beneath it
LABELS = "Dataset/labels.csv"     # master CSV with one row per generated image
NUM_IMAGES = 100_000
# Alphabet the captcha text is sampled from: a-z, A-Z, 0-9.
CHARS = string.ascii_lowercase + string.ascii_uppercase + string.digits
# Inclusive bounds on the number of characters per captcha.
CAPTCHA_LEN_LOWER_LIMIT = 5
CAPTCHA_LEN_UPPER_LIMIT = 7
# [split name, fraction of NUM_IMAGES] pairs, in the order they are filled.
directories = [
    ["train", 0.8],
    ["val", 0.1],
    ["test", 0.1],
]

# Output image geometry; per the original note these mirror config.py
# (W_max, H and the grayscale flag) -- config.py is not visible from here.
IMG_WIDTH = 256
IMG_HEIGHT = 60
GRAYSCALE = True


# ----- minimal augment helpers -----
def rand_color(lo=0, hi=255):
    """Return a random ``(r, g, b)`` tuple.

    Each channel is drawn independently with ``random.randint``, so both
    ``lo`` and ``hi`` are inclusive bounds.
    """
    red = random.randint(lo, hi)
    green = random.randint(lo, hi)
    blue = random.randint(lo, hi)
    return (red, green, blue)

def gradient_bg(w, h):
    """Build a light vertical-gradient background as a PIL image.

    Two random colours with channels in [200, 255] are chosen for the top and
    bottom rows; each row in between is a linear blend of the two, truncated
    to ``uint8``.

    Args:
        w: Image width in pixels.
        h: Image height in pixels.

    Returns:
        A PIL image created from an ``(h, w, 3)`` ``uint8`` array.
    """
    top_rgb = np.array(rand_color(200, 255))
    bottom_rgb = np.array(rand_color(200, 255))
    # Blend weight per row: 0.0 on the first row, 1.0 on the last. The max()
    # guard keeps a one-row image from dividing by zero.
    weights = (np.arange(h) / max(1, h - 1))[:, np.newaxis]
    row_colors = (top_rgb * (1 - weights) + bottom_rgb * weights).astype(np.uint8)
    # Every pixel in a row shares that row's colour.
    pixels = np.repeat(row_colors[:, np.newaxis, :], w, axis=1)
    return Image.fromarray(pixels)

def add_interference(img, line_range=(0, 3), dot_range=(10, 80)):
    """Draw random lines and small dots onto *img* and return it.

    The image is modified in place; the same object is returned for chaining.

    Args:
        img: PIL image to draw on.
        line_range: Inclusive ``(min, max)`` for the number of lines.
        dot_range: Inclusive ``(min, max)`` for the number of dots.

    Returns:
        The same ``img`` object, now carrying the interference marks.
    """
    canvas = ImageDraw.Draw(img)
    width, height = img.size
    max_x, max_y = width - 1, height - 1

    # Straight strokes between two random points, 1-2 px wide, mid-tone colour.
    line_count = random.randint(*line_range)
    for _ in range(line_count):
        start_x = random.randint(0, max_x)
        start_y = random.randint(0, max_y)
        end_x = random.randint(0, max_x)
        end_y = random.randint(0, max_y)
        stroke_color = rand_color(50, 180)
        thickness = random.randint(1, 2)
        canvas.line((start_x, start_y, end_x, end_y), fill=stroke_color, width=thickness)

    # Specks: an ellipse of radius 0 or 1 around a random centre.
    dot_count = random.randint(*dot_range)
    for _ in range(dot_count):
        cx = random.randint(0, max_x)
        cy = random.randint(0, max_y)
        radius = random.choice([0, 1])
        box = (cx - radius, cy - radius, cx + radius, cy + radius)
        canvas.ellipse(box, fill=rand_color(0, 200))

    return img

def perspective_warp(img, max_ratio=0.03):
    """Apply a small random perspective distortion to *img*.

    Each corner of the image is pulled inward by a random integer offset of
    at most ``int(w * max_ratio)`` pixels horizontally and
    ``int(h * max_ratio * 0.7)`` pixels vertically, and the image is warped
    so the original corners land on the jittered ones.

    Args:
        img: PIL image; it is converted to RGB before warping.
        max_ratio: Maximum corner displacement as a fraction of the image
            size. Values ``<= 0`` disable the warp and return *img* as is.

    Returns:
        A new RGB PIL image of the same size, or *img* itself when the warp
        is disabled.
    """
    if max_ratio <= 0:
        return img
    w, h = img.size
    dx = int(w * max_ratio)
    dy = int(h * max_ratio * 0.7)
    # Corner order: top-left, top-right, bottom-right, bottom-left.
    src = np.float32([[0, 0], [w, 0], [w, h], [0, h]])
    dst = np.float32([
        [random.randint(0, dx), random.randint(0, dy)],
        [w - random.randint(0, dx), random.randint(0, dy)],
        [w - random.randint(0, dx), h - random.randint(0, dy)],
        [random.randint(0, dx), h - random.randint(0, dy)],
    ])
    M = cv2.getPerspectiveTransform(src, dst)
    # The warp is a purely geometric remap applied to every channel alike, so
    # the channel order does not matter: the RGB array is passed straight
    # through instead of being flipped to BGR and back (which only produced
    # negatively-strided views of the same data).
    arr = np.array(img.convert("RGB"))
    # BORDER_REPLICATE fills the uncovered edge area from the nearest pixels.
    out = cv2.warpPerspective(arr, M, (w, h), borderMode=cv2.BORDER_REPLICATE)
    return Image.fromarray(out)

def jpeg_recompress(img, qmin=70, qmax=95):
    """Round-trip *img* through an in-memory JPEG to add compression artifacts.

    Args:
        img: PIL image to encode.
        qmin: Lowest JPEG quality that may be picked (inclusive).
        qmax: Highest JPEG quality that may be picked (inclusive).

    Returns:
        The decoded image, converted to RGB.
    """
    quality = random.randint(qmin, qmax)
    with io.BytesIO() as stream:
        img.save(stream, format="JPEG", quality=quality)
        stream.seek(0)
        # convert() yields a fully decoded image, so nothing still depends on
        # the buffer once the with-block closes it.
        return Image.open(stream).convert("RGB")

def add_noise_and_blur(img, noise_sigma=(0.0, 6.0), blur_sigma=(0.0, 0.8), motion_prob=0.1):
    """Add Gaussian pixel noise, then either a directional or a Gaussian blur.

    Args:
        img: PIL image to degrade.
        noise_sigma: ``(low, high)`` range the noise standard deviation is
            drawn from; noise is skipped when the draw is ``<= 0.05``.
        blur_sigma: ``(low, high)`` range the Gaussian blur radius is drawn
            from; the blur is skipped when the draw is ``<= 0.05``.
        motion_prob: Probability of using the directional blur instead of the
            Gaussian blur.

    Returns:
        The degraded PIL image (the input object itself if neither step fired).

    Note:
        The noise samples come from ``np.random``, not from the ``random``
        module used for every other draw here.
    """
    # --- additive Gaussian noise, clipped back into the 8-bit range ---
    noise_std = random.uniform(*noise_sigma)
    if noise_std > 0.05:
        pixels = np.array(img).astype(np.float32)
        pixels += np.random.normal(0, noise_std, pixels.shape).astype(np.float32)
        img = Image.fromarray(np.clip(pixels, 0, 255).astype(np.uint8))

    # --- common case: plain Gaussian blur ---
    if not (random.random() < motion_prob):
        radius = random.uniform(*blur_sigma)
        if radius > 0.05:
            img = img.filter(ImageFilter.GaussianBlur(radius=radius))
        return img

    # --- directional blur: a 1-px line through the kernel centre, rotated ---
    size = random.choice([3, 5])
    middle = size // 2
    stroke = Image.new("L", (size, size), 0)
    ImageDraw.Draw(stroke).line((0, middle, size - 1, middle), fill=255, width=1)
    stroke = stroke.rotate(random.uniform(0, 180), resample=Image.BILINEAR)
    weights = np.array(stroke, dtype=np.float32)
    # Normalise so the kernel sums to 1; the max() guard avoids dividing by a
    # sum smaller than 1.
    weights /= max(1, weights.sum())
    blurred = cv2.filter2D(np.array(img), -1, weights)
    return Image.fromarray(blurred)

def render_with_variation(text, width=IMG_WIDTH, height=IMG_HEIGHT):
    """Render *text* as a captcha image and run it through random degradations.

    Steps, in order: render with ``ImageCaptcha`` at a random font size,
    rotate by up to +/-6 degrees, optionally apply a light perspective warp,
    draw interference lines and dots, add noise and blur, recompress as JPEG,
    optionally compress the tonal range, and finally convert to grayscale
    when ``GRAYSCALE`` is set.

    Args:
        text: Characters to render.
        width: Output width in pixels.
        height: Output height in pixels.

    Returns:
        The finished PIL image (mode ``"L"`` when ``GRAYSCALE`` is true).
    """
    background_kind = random.choice(["solid", "gradient"])
    # NOTE(review): this foreground colour is drawn but never applied to the
    # image; the call is kept so the sequence of random draws is unchanged.
    fg_color = rand_color(0, 80)
    # The background image is only sampled below for the rotation fill colour.
    if background_kind == "solid":
        bg = Image.new("RGB", (width, height), color=rand_color(210, 255))
    else:
        bg = gradient_bg(width, height)

    # Font size is a random 70-85% of the image height.
    font_size = random.choice([int(height * scale) for scale in (0.7, 0.75, 0.8, 0.85)])

    # Default fonts; only the size is varied.
    generator = ImageCaptcha(width=width, height=height, fonts=None, font_sizes=[font_size])
    rendered = generator.generate_image(text)
    base = Image.frombytes('RGB', (width, height), rendered.tobytes())

    # Mild rotation; corners exposed by the rotation take the background's
    # top-left pixel colour.
    tilt = random.uniform(-6, 6)
    corner_fill = bg.getpixel((0, 0))
    base = base.rotate(tilt, resample=Image.BILINEAR, expand=False, fillcolor=corner_fill)

    # Very light perspective warp, applied 60% of the time.
    if random.random() < 0.6:
        base = perspective_warp(base, max_ratio=0.025)

    # Interference strokes go on top of the warped text.
    base = add_interference(base, line_range=(0, 3), dot_range=(10, 60))

    # Sensor-style noise and blur, then JPEG artifacts.
    base = add_noise_and_blur(base, noise_sigma=(0.0, 5.0), blur_sigma=(0.0, 0.7), motion_prob=0.12)
    base = jpeg_recompress(base, qmin=72, qmax=92)

    # 20% of the time, squeeze the tonal range slightly (v -> 0.95*v + 6).
    if random.random() < 0.2:
        base = base.point(lambda value: int(value*0.95 + 6))

    return base.convert('L') if GRAYSCALE else base



# Pull each split's name and fraction out of the `directories` table.
train_name, train_ratio = directories[0][0], directories[0][1]
val_name, val_ratio = directories[1][0], directories[1][1]
test_name, test_ratio = directories[2][0], directories[2][1]

# Index boundaries: images [0, train_end) go to train, [train_end, val_end)
# to val, and everything from val_end on to test.
n = NUM_IMAGES
train_end = int(n * train_ratio)
val_end = int(n * val_ratio) + train_end

# One output folder per split, all beneath DATASET_DIR.
train_dir, val_dir, test_dir = (
    os.path.join(DATASET_DIR, split_name)
    for split_name in (train_name, val_name, test_name)
)

# Create the folders up front; reruns reuse whatever already exists.
os.makedirs(DATASET_DIR, exist_ok=True)
os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Kept for compatibility; nothing below in this file reads it.
image = ImageCaptcha(width=160, height=60)

# Generate every image and log (filename, label) to the master CSV as we go.
with open(LABELS, mode="w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["filename", "label"])

    for i in range(NUM_IMAGES):
        # Progress line roughly every 1% of the run (every image for tiny runs).
        if not i % max(1, NUM_IMAGES // 100):
            print(f"{i} images made")

        # Images are assigned to splits by index: train first, then val, then test.
        OUTPUT_DIR = train_dir if i < train_end else (val_dir if i < val_end else test_dir)

        # Random text of random length within the configured inclusive bounds.
        text = ''.join(
            random.choices(
                CHARS,
                k=random.randint(CAPTCHA_LEN_LOWER_LIMIT, CAPTCHA_LEN_UPPER_LIMIT),
            )
        )
        # The running index keeps filenames unique even if a text repeats.
        filename = f"{text}_{i}.png"
        filepath = os.path.join(OUTPUT_DIR, filename)

        img = render_with_variation(text, width=IMG_WIDTH, height=IMG_HEIGHT)
        img.save(filepath)

        # Only the bare filename is recorded, not the split folder.
        writer.writerow([filename, text])

print("Data Generated!")

# Split the master label file into one labels.csv per split folder, using the
# same index boundaries that decided which folder each image was saved in.
#
# Every column is read as text with default NA detection switched off.
# Otherwise pandas infers column types: a label column that happens to hold
# only digit-like strings is parsed as numbers (so "00731" becomes 731), and
# tokens such as "NaN" or "null" are turned into missing values. Labels must
# round-trip exactly as they were written.
df = pd.read_csv(LABELS, dtype=str, keep_default_na=False)
n = len(df)
train_end = int(n * train_ratio)
val_end = train_end + int(n * val_ratio)

# Rows are in generation order, so positional slices line up with the folders.
df_train = df.iloc[:train_end]
df_val = df.iloc[train_end:val_end]
df_test = df.iloc[val_end:]

df_train.to_csv(os.path.join(DATASET_DIR, f"{train_name}/labels.csv"), index=False)
df_val.to_csv(os.path.join(DATASET_DIR, f"{val_name}/labels.csv"), index=False)
df_test.to_csv(os.path.join(DATASET_DIR, f"{test_name}/labels.csv"), index=False)
print("Labels Generated")