Spaces:

mohakapoor
/

CaptchaOCR

Running

CaptchaOCR / src /data.py

mohakkapoor4

Refactor .gitignore to specify checkpoint file types and exclude all but the best model. Update inference.py to use enhanced CAPTCHA generation and adjust dimensions. Increase training epochs in train.py for better model performance. Update training metrics and data generation logic in data.py for improved dataset handling and augmentation. Update config.py for dataset path consistency.

322be7d 4 months ago

raw

history blame

7.82 kB

	from captcha.image import ImageCaptcha
	import random, string, os, csv, io
	import pandas as pd
	from PIL import Image, ImageDraw, ImageFilter
	import numpy as np
	import cv2

	# ===== dataset generation config =====
	# Root folder that receives one image sub-folder per split.
	DATASET_DIR = "Dataset/captchas"
	# Master CSV (filename,label) covering every generated image.
	LABELS = "Dataset/labels.csv"
	# Total number of images to generate across all splits.
	NUM_IMAGES = 100000
	# Alphabet the CAPTCHA text is sampled from: a-z, A-Z and 0-9.
	CHARS = string.ascii_letters + string.digits
	# Inclusive bounds on the number of characters per CAPTCHA.
	CAPTCHA_LEN_LOWER_LIMIT = 5
	CAPTCHA_LEN_UPPER_LIMIT = 7
	# [split name, fraction of NUM_IMAGES] pairs, in train/val/test order.
	directories = [["train",0.8],["val",0.1],["test",0.1]]

	# Output image geometry and colour mode.
	# NOTE(review): meant to mirror config.py, which is not visible here - confirm they stay in sync.
	IMG_WIDTH = 256 # W_max from config
	IMG_HEIGHT = 60 # H from config
	GRAYSCALE = True # grayscale from config


	# ----- minimal augment helpers -----
	def rand_color(lo=0, hi=255):
	    """Return a random ``(r, g, b)`` tuple with every channel in ``[lo, hi]``.

	    Both bounds are inclusive; the three channels are drawn independently.
	    """
	    red = random.randint(lo, hi)
	    green = random.randint(lo, hi)
	    blue = random.randint(lo, hi)
	    return (red, green, blue)

	def gradient_bg(w, h):
	    """Create a ``w`` x ``h`` RGB image that fades vertically between two light colours.

	    The first row takes one random colour, the last row takes another, and
	    the rows in between are a linear blend of the two. Both colours are drawn
	    with channels in 200-255.
	    """
	    top = rand_color(200, 255)
	    bot = rand_color(200, 255)

	    # Blend weight per row: 0.0 on the first row, 1.0 on the last.
	    # max(1, ...) avoids dividing by zero for a one-row image.
	    weights = np.arange(h) / max(1, h - 1)
	    row_colors = (
	        np.array(top) * (1 - weights)[:, np.newaxis]
	        + np.array(bot) * weights[:, np.newaxis]
	    ).astype(np.uint8)

	    # Every pixel in a row shares that row's colour.
	    canvas = np.empty((h, w, 3), dtype=np.uint8)
	    canvas[...] = row_colors[:, np.newaxis, :]
	    return Image.fromarray(canvas)

	def add_interference(img, line_range=(0, 3), dot_range=(10, 80)):
	    """Draw random clutter lines and speckles directly onto ``img``.

	    ``line_range`` and ``dot_range`` are inclusive ``(min, max)`` bounds on
	    how many lines and dots are drawn. The image is modified in place and
	    the same object is returned.
	    """
	    pen = ImageDraw.Draw(img)
	    width, height = img.size
	    max_x, max_y = width - 1, height - 1

	    # Straight lines between two random points, 1-2 px wide, mid-dark colour.
	    line_count = random.randint(*line_range)
	    for _ in range(line_count):
	        start_x = random.randint(0, max_x)
	        start_y = random.randint(0, max_y)
	        end_x = random.randint(0, max_x)
	        end_y = random.randint(0, max_y)
	        stroke_color = rand_color(50, 180)
	        stroke_width = random.randint(1, 2)
	        pen.line((start_x, start_y, end_x, end_y), fill=stroke_color, width=stroke_width)

	    # Speckles: radius 0 gives a single pixel, radius 1 a 3x3 blob.
	    dot_count = random.randint(*dot_range)
	    for _ in range(dot_count):
	        cx = random.randint(0, max_x)
	        cy = random.randint(0, max_y)
	        radius = random.choice([0, 1])
	        box = (cx - radius, cy - radius, cx + radius, cy + radius)
	        pen.ellipse(box, fill=rand_color(0, 200))

	    return img

	def perspective_warp(img, max_ratio=0.03):
	    """Apply a light random perspective distortion to ``img``.

	    Each corner is pulled inward by a random amount: up to ``max_ratio`` of
	    the width horizontally and ``0.7 * max_ratio`` of the height vertically.
	    Pixels exposed at the edges are filled by replicating the border.
	    A non-positive ``max_ratio`` returns ``img`` unchanged. Otherwise a new
	    RGB image of the same size is returned.
	    """
	    if max_ratio <= 0:
	        return img

	    w, h = img.size
	    max_dx = int(w * max_ratio)
	    max_dy = int(h * max_ratio * 0.7)

	    def shift_x():
	        return random.randint(0, max_dx)

	    def shift_y():
	        return random.randint(0, max_dy)

	    # Corners in clockwise order starting at the top-left.
	    corners = np.float32([[0, 0], [w, 0], [w, h], [0, h]])
	    top_left = [shift_x(), shift_y()]
	    top_right = [w - shift_x(), shift_y()]
	    bottom_right = [w - shift_x(), h - shift_y()]
	    bottom_left = [shift_x(), h - shift_y()]
	    warped_corners = np.float32([top_left, top_right, bottom_right, bottom_left])

	    matrix = cv2.getPerspectiveTransform(corners, warped_corners)
	    bgr = np.array(img.convert("RGB"))[:, :, ::-1]  # RGB -> BGR channel order
	    warped = cv2.warpPerspective(bgr, matrix, (w, h), borderMode=cv2.BORDER_REPLICATE)
	    rgb = warped[:, :, ::-1]  # BGR -> RGB
	    return Image.fromarray(rgb)

	def jpeg_recompress(img, qmin=70, qmax=95):
	    """Round-trip ``img`` through an in-memory JPEG to add compression artifacts.

	    The JPEG quality is drawn uniformly from the inclusive range
	    ``[qmin, qmax]``. The decoded result is returned as an RGB image.
	    """
	    quality = random.randint(qmin, qmax)
	    with io.BytesIO() as stream:
	        img.save(stream, format="JPEG", quality=quality)
	        stream.seek(0)
	        # convert() decodes the pixel data, so the result no longer needs
	        # the buffer once the `with` block closes it.
	        return Image.open(stream).convert("RGB")

	def add_noise_and_blur(img, noise_sigma=(0.0, 6.0), blur_sigma=(0.0, 0.8), motion_prob=0.1):
	    """Add Gaussian pixel noise to ``img`` and then blur it.

	    ``noise_sigma`` and ``blur_sigma`` are ``(low, high)`` ranges sampled
	    uniformly. A sampled value of 0.05 or less skips that step. With
	    probability ``motion_prob`` the Gaussian blur is replaced by a small
	    directional blur at a random angle. Returns the processed image.
	    """
	    # --- additive Gaussian noise ---
	    noise_std = random.uniform(*noise_sigma)
	    if noise_std > 0.05:
	        pixels = np.array(img).astype(np.float32)
	        pixels += np.random.normal(0, noise_std, pixels.shape).astype(np.float32)
	        img = Image.fromarray(np.clip(pixels, 0, 255).astype(np.uint8))

	    # --- blur: usually Gaussian, occasionally directional ---
	    use_motion = random.random() < motion_prob
	    if not use_motion:
	        radius = random.uniform(*blur_sigma)
	        if radius > 0.05:
	            img = img.filter(ImageFilter.GaussianBlur(radius=radius))
	        return img

	    # Kernel: a one-pixel horizontal stroke through the centre of a 3x3 or
	    # 5x5 tile, rotated to a random angle and normalised to sum to 1.
	    size = random.choice([3, 5])
	    stroke = Image.new("L", (size, size), 0)
	    mid = size // 2
	    ImageDraw.Draw(stroke).line((0, mid, size - 1, mid), fill=255, width=1)
	    stroke = stroke.rotate(random.uniform(0, 180), resample=Image.BILINEAR)
	    weights = np.array(stroke, dtype=np.float32)
	    weights /= max(1, weights.sum())  # max() guards against an all-zero kernel
	    blurred = cv2.filter2D(np.array(img), -1, weights)
	    return Image.fromarray(blurred)

	def render_with_variation(text, width=IMG_WIDTH, height=IMG_HEIGHT):
	    """Render ``text`` as a CAPTCHA image and apply light random distortions.

	    Pipeline: ImageCaptcha render at a randomly chosen font size, a small
	    rotation, an optional perspective warp (60% of calls), interference
	    lines and dots, noise and blur, JPEG recompression, and an optional
	    contrast reduction (20% of calls).

	    Args:
	        text: The characters to draw.
	        width: Output width in pixels.
	        height: Output height in pixels.

	    Returns:
	        A PIL image of size ``(width, height)``; mode ``"L"`` when the
	        module-level ``GRAYSCALE`` flag is true, otherwise ``"RGB"``.
	    """
	    # Light backdrop, either a flat colour or a vertical gradient.
	    # NOTE(review): the rendered captcha is never composited onto `bg`; only
	    # its top-left pixel is used, as the fill colour for the rotation below.
	    if random.choice(["solid", "gradient"]) == "solid":
	        bg = Image.new("RGB", (width, height), color=rand_color(210, 255))
	    else:
	        bg = gradient_bg(width, height)

	    # Font size scales with the image height (70-85% of it).
	    font_size = random.choice([int(height * frac) for frac in (0.7, 0.75, 0.8, 0.85)])

	    # Default fonts, single font size for this render.
	    generator = ImageCaptcha(width=width, height=height, fonts=None, font_sizes=[font_size])

	    # convert() returns an independent RGB copy whatever mode the generator
	    # produced, instead of reinterpreting its raw bytes as RGB.
	    base = generator.generate_image(text).convert("RGB")

	    # Mild rotation; corners exposed by it are filled with the backdrop colour.
	    angle = random.uniform(-6, 6)
	    base = base.rotate(angle, resample=Image.BILINEAR, expand=False, fillcolor=bg.getpixel((0, 0)))

	    # Very light perspective warp on 60% of images.
	    if random.random() < 0.6:
	        base = perspective_warp(base, max_ratio=0.025)

	    # Clutter lines and dots drawn over the text.
	    base = add_interference(base, line_range=(0, 3), dot_range=(10, 60))

	    # Light noise, blur and JPEG recompression to add artifacts.
	    base = add_noise_and_blur(base, noise_sigma=(0.0, 5.0), blur_sigma=(0.0, 0.7), motion_prob=0.12)
	    base = jpeg_recompress(base, qmin=72, qmax=92)

	    # 20% chance of lower contrast: the mapping pulls 255 down to 248 and
	    # lifts 0 up to 6.
	    if random.random() < 0.2:
	        base = base.point(lambda p: int(p * 0.95 + 6))

	    if GRAYSCALE:
	        base = base.convert('L')

	    return base



	# Split names and ratios, taken from the first three `directories` entries
	# (train, val, test in that order).
	train_name, val_name, test_name = (entry[0] for entry in directories[:3])
	train_ratio, val_ratio, test_ratio = (entry[1] for entry in directories[:3])

	# Index thresholds: images [0, train_end) go to train, [train_end, val_end)
	# go to val, and everything after that goes to test.
	n = NUM_IMAGES
	train_end = int(n * train_ratio)
	val_end = train_end + int(n * val_ratio)

	# One output folder per split, created up front.
	train_dir, val_dir, test_dir = (
	    os.path.join(DATASET_DIR, split_name)
	    for split_name in (train_name, val_name, test_name)
	)
	for _path in (DATASET_DIR, train_dir, val_dir, test_dir):
	    os.makedirs(_path, exist_ok=True)

	# Not used by the generation loop in this file; kept as a module-level name.
	image = ImageCaptcha(width=160, height=60)

	# Generate every image and record (filename, label) in the master CSV.
	with open(LABELS, mode="w", newline="") as f:
	    writer = csv.writer(f)
	    writer.writerow(["filename", "label"])

	    # Report progress roughly every 1% (at least every image for tiny runs).
	    _progress_every = max(1, (NUM_IMAGES // 100))

	    for i in range(NUM_IMAGES):
	        if i % _progress_every == 0:
	            print(f"{i} images made")

	        # Route the image to its split folder by index threshold.
	        if i >= val_end:
	            OUTPUT_DIR = test_dir
	        elif i >= train_end:
	            OUTPUT_DIR = val_dir
	        else:
	            OUTPUT_DIR = train_dir

	        # Random label; the index suffix keeps filenames unique.
	        _label_len = random.randint(CAPTCHA_LEN_LOWER_LIMIT, CAPTCHA_LEN_UPPER_LIMIT)
	        text = ''.join(random.choices(CHARS, k=_label_len))
	        filename = f"{text}_{i}.png"
	        filepath = os.path.join(OUTPUT_DIR, filename)

	        img = render_with_variation(text, width=IMG_WIDTH, height=IMG_HEIGHT)
	        img.save(filepath)

	        writer.writerow([filename, text])

	print("Data Generated!")

	# Write one labels.csv per split, using the same index thresholds that
	# routed the images into their folders.
	#
	# Labels are arbitrary alphanumeric strings, so read every column as text
	# and switch off NA detection. With pandas' default inference, a
	# digits-only label such as "00731" can be parsed as the integer 731 when
	# the column (or a parser chunk) is inferred as numeric, and tokens such as
	# "null" or "NaN" would be turned into missing values.
	df = pd.read_csv(LABELS, dtype=str, keep_default_na=False)
	n = len(df)
	train_end = int(n * train_ratio)
	val_end = train_end + int(n * val_ratio)

	df_train = df.iloc[:train_end]
	df_val = df.iloc[train_end:val_end]
	df_test = df.iloc[val_end:]

	# Reuse the split folders created earlier instead of rebuilding the paths
	# with a hard-coded "/" separator.
	df_train.to_csv(os.path.join(train_dir, "labels.csv"), index=False)
	df_val.to_csv(os.path.join(val_dir, "labels.csv"), index=False)
	df_test.to_csv(os.path.join(test_dir, "labels.csv"), index=False)
	print("Labels Generated")