Spaces:

mohakapoor
/

CaptchaOCR

Running

App Files Files Community

mohakkapoor4 committed on Aug 20

Commit

322be7d

1 Parent(s): a1eb0d1

Refactor .gitignore to specify checkpoint file types and exclude all but the best model. Update inference.py to use enhanced CAPTCHA generation and adjust dimensions. Increase training epochs in train.py for better model performance. Update training metrics and data generation logic in data.py for improved dataset handling and augmentation. Update config.py for dataset path consistency.

Browse files

Files changed (11) hide show

.gitignore +7 -8
Metrics/loss_comparison.png +2 -2
Metrics/training_losses.png +2 -2
Metrics/training_metrics.txt +2 -2
app.py +72 -0
checkpoints/best_model.pth +3 -0
inference.py +15 -10
src/config.py +1 -1
src/data.py +184 -35
src/generateCaptcha.py +181 -0
train.py +1 -1

.gitignore CHANGED Viewed

@@ -80,14 +80,13 @@ desktop.ini
 !Metrics/*.jpg
 # Models and checkpoints
-checkpoints/
-*.ckpt
-*.onnx
-*.tflite
-*.pth
-*.pt
-*.bin
-*.safetensors
 runs/
 outputs/
 artifacts/

 !Metrics/*.jpg
 # Models and checkpoints
+checkpoints/*.pth
+checkpoints/*.pt
+checkpoints/*.ckpt
+checkpoints/*.onnx
+checkpoints/*.bin
+checkpoints/*.safetensors
+!checkpoints/best_model.pth
 runs/
 outputs/
 artifacts/

Metrics/loss_comparison.png CHANGED Viewed

Git LFS Details

SHA256: 4e3a5a131f815aeff76e358f45fd9af95bef77d001ef8ba538451d0b3779e005
Pointer size: 131 Bytes
Size of remote file: 322 kB

Git LFS Details

SHA256: 222aef648ad7d0110be78f32efa160916039593db240d2a7020406846ae13412
Pointer size: 131 Bytes
Size of remote file: 333 kB

Metrics/training_losses.png CHANGED Viewed

Git LFS Details

SHA256: c74acd1702091eee23712df3b801b9d4c310959a389d2d567b11567c19280db9
Pointer size: 131 Bytes
Size of remote file: 112 kB

Git LFS Details

SHA256: bd9dc01e6f9fb72c032edf1cc52d43e2a77a5267f0448af0487de1948e12c261
Pointer size: 131 Bytes
Size of remote file: 112 kB

Metrics/training_metrics.txt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7fb9b2125cd77da83e51022b8de31388541a080c2f63d11e8b85cb6b34efe534
-size 822

 version https://git-lfs.github.com/spec/v1
+oid sha256:9ed95b7a50649f8393e171a702b5e096adaa4d66712b3faf26c45931854dffb7
+size 842

app.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import os
+import random
+import gradio as gr
+from PIL import Image
+import torch
+# Import your inference module
+import inference as inf
+from src.generateCaptcha import generate_captcha
+from src.config import cfg  # sizes, charset, dirs
+# Device and one-time model load
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+MODEL = inf.load_model("checkpoints/best_model.pth").to(DEVICE).eval()
+# Ensure results dir exists
+os.makedirs(cfg.RESULT_DIR, exist_ok=True)
+def random_text():
+    L = random.randint(cfg.CAPTCHA_LEN_LOWER_LIMIT, cfg.CAPTCHA_LEN_UPPER_LIMIT)
+    return "".join(random.choices(cfg.chars, k=L))
+def ui_generate():
+    text = random_text()
+    filename = f"{text}_{random.randint(1000,9999)}.png"
+    # Use generateCaptcha.py directly
+    img = generate_captcha(text, width=cfg.W_max, height=cfg.H)
+    # Save to results directory
+    filepath = os.path.join(cfg.RESULT_DIR, filename)
+    img.save(filepath)
+    return img, text, filepath
+def ui_solve(img: Image.Image, path_hint: str):
+    # Prefer uploaded image
+    if img is not None:
+        tmp_path = os.path.join(cfg.RESULT_DIR, f"upload_{random.randint(1000,9999)}.png")
+        img.save(tmp_path)
+        tensor = inf.preprocess_image(tmp_path, (cfg.W_max, cfg.H))
+        pred = inf.predict_captcha(MODEL, tensor, DEVICE)
+        return pred
+    # Otherwise, solve the last generated image
+    if path_hint and os.path.exists(path_hint):
+        tensor = inf.preprocess_image(path_hint, (cfg.W_max, cfg.H))
+        pred = inf.predict_captcha(MODEL, tensor, DEVICE)
+        return pred
+    return "No image provided. Generate or upload first."
+with gr.Blocks(title="CAPTCHA OCR (checkpoint)") as demo:
+    gr.Markdown("## CAPTCHA OCR demo")
+    with gr.Row():
+        gen_btn = gr.Button("Generate CAPTCHA", variant="primary")
+        gt_out = gr.Textbox(label="Ground Truth", interactive=False)
+    with gr.Row():
+        img_out = gr.Image(label="Generated CAPTCHA", type="pil")
+        path_box = gr.Textbox(label="Internal Path", interactive=False, visible=False)
+    gen_btn.click(fn=ui_generate, outputs=[img_out, gt_out, path_box])
+    gr.Markdown("### Solve")
+    with gr.Row():
+        img_in = gr.Image(label="Upload CAPTCHA (optional)", type="pil")
+        solve_btn = gr.Button("Solve")
+        pred_out = gr.Textbox(label="Prediction", interactive=False)
+    solve_btn.click(fn=ui_solve, inputs=[img_in, path_box], outputs=[pred_out])
+if __name__ == "__main__":
+    demo.launch()

checkpoints/best_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:88e646907eb2ca7a43d87a5cd251be9c7d2b79f98e30d5ca9e53b8ae93e6045d
+size 48371934

inference.py CHANGED Viewed

@@ -7,7 +7,7 @@ from src.config import cfg
 from src.model_crnn import CRNN
 from src.vocab import ctc_greedy_decode, vocab_size
 from src.plotting import TrainingMetrics
-from captcha.image import ImageCaptcha
 def load_model(checkpoint_path="checkpoints/best_model.pth"):
     """Load the trained model from checkpoint."""
@@ -69,12 +69,17 @@ def predict_captcha(model, image_tensor, device):
         return prediction[0] if prediction else ""
-def generate_test_captcha(text, filename, width=160, height=60):
-    """Generate a test CAPTCHA image."""
-    image = ImageCaptcha(width=width, height=height)
     filepath = os.path.join(cfg.RESULT_DIR, filename)
-    image.write(text, filepath)
-    print(f"Generated test CAPTCHA: {filename}")
     return filepath
 def main():
@@ -92,7 +97,7 @@ def main():
         print("Model loaded successfully!")
         # Generate test CAPTCHAs
-        print("\nGenerating test CAPTCHAs...")
         test_cases = []
         for i in range(4):
@@ -100,8 +105,8 @@ def main():
             text = ''.join(random.choices(cfg.chars, k=random.randint(cfg.CAPTCHA_LEN_LOWER_LIMIT, cfg.CAPTCHA_LEN_UPPER_LIMIT)))
             filename = f"{text}_{i}.png"
-            # Generate image
-            image_path = generate_test_captcha(text, filename)
             test_cases.append((text, image_path, ""))  # Add empty prediction slot
         # Run inference
@@ -151,7 +156,7 @@ def main():
                     correct_chars += 1
         char_accuracy = (correct_chars / total_chars) * 100 if total_chars > 0 else 0
-        print(f"🔤 Character Accuracy: {correct_chars}/{total_chars} ({char_accuracy:.1f}%)")
         if accuracy >= 80:
             print("Excellent performance!")

 from src.model_crnn import CRNN
 from src.vocab import ctc_greedy_decode, vocab_size
 from src.plotting import TrainingMetrics
+from src.generateCaptcha import generate_captcha
 def load_model(checkpoint_path="checkpoints/best_model.pth"):
     """Load the trained model from checkpoint."""
         return prediction[0] if prediction else ""
+def generate_test_captcha(text, filename, width=256, height=60):
+    """Generate a test CAPTCHA image using enhanced generation."""
+    # Use the enhanced CAPTCHA generation from generateCaptcha.py
+    img = generate_captcha(text, width=width, height=height)
+    # Ensure results directory exists
+    os.makedirs(cfg.RESULT_DIR, exist_ok=True)
     filepath = os.path.join(cfg.RESULT_DIR, filename)
+    img.save(filepath)
+    print(f"Generated enhanced test CAPTCHA: {filename}")
     return filepath
 def main():
         print("Model loaded successfully!")
         # Generate test CAPTCHAs
+        print("\nGenerating enhanced test CAPTCHAs...")
         test_cases = []
         for i in range(4):
             text = ''.join(random.choices(cfg.chars, k=random.randint(cfg.CAPTCHA_LEN_LOWER_LIMIT, cfg.CAPTCHA_LEN_UPPER_LIMIT)))
             filename = f"{text}_{i}.png"
+            # Generate enhanced image
+            image_path = generate_test_captcha(text, filename, width=cfg.W_max, height=cfg.H)
             test_cases.append((text, image_path, ""))  # Add empty prediction slot
         # Run inference
                     correct_chars += 1
         char_accuracy = (correct_chars / total_chars) * 100 if total_chars > 0 else 0
+        print(f"Character Accuracy: {correct_chars}/{total_chars} ({char_accuracy:.1f}%)")
         if accuracy >= 80:
             print("Excellent performance!")

src/config.py CHANGED Viewed

@@ -4,7 +4,7 @@ from dataclasses import dataclass
 @dataclass
 class Config:
-    data_root: str = os.getenv("DATA_ROOT","Dataset_test\captchas")
     chars: str = string.ascii_letters + string.digits
     CAPTCHA_LEN_LOWER_LIMIT: int = 5

 @dataclass
 class Config:
+    data_root: str = os.getenv("DATA_ROOT","Dataset\captchas")
     chars: str = string.ascii_letters + string.digits
     CAPTCHA_LEN_LOWER_LIMIT: int = 5

src/data.py CHANGED Viewed

@@ -1,63 +1,212 @@
 from captcha.image import ImageCaptcha
-import random
-import string
-import os
-import csv
 import pandas as pd
-# config
-DATASET_DIR = "Dataset_test/captchas"
-LABELS = "Dataset_test/labels.csv"
-NUM_IMAGES = 10000
 CHARS = string.ascii_letters + string.digits
 CAPTCHA_LEN_LOWER_LIMIT = 5
 CAPTCHA_LEN_UPPER_LIMIT = 7
 directories = [["train",0.8],["val",0.1],["test",0.1]]
 os.makedirs(DATASET_DIR, exist_ok=True)
-image = ImageCaptcha(width=160, height=60)
-with open(LABELS,mode="w",newline="") as f:
     writer = csv.writer(f)
     writer.writerow(["filename","label"])
-    OUTPUT_DIR = os.path.join(DATASET_DIR,directories[0][0])
-    os.makedirs(OUTPUT_DIR,exist_ok=True)
     for i in range(NUM_IMAGES):
-        if i%(NUM_IMAGES/100) ==0:
             print(f"{i} images made")
-        if i>(0.8*NUM_IMAGES-1) and i<(0.9*NUM_IMAGES):
-            OUTPUT_DIR = os.path.join(DATASET_DIR,directories[1][0])
-            os.makedirs(OUTPUT_DIR,exist_ok=True)
-        elif i>(0.9*NUM_IMAGES-1):
-            OUTPUT_DIR = os.path.join(DATASET_DIR,directories[2][0])
-            os.makedirs(OUTPUT_DIR,exist_ok=True)
-        text = ''.join(random.choices(CHARS, k=random.randint(CAPTCHA_LEN_LOWER_LIMIT,CAPTCHA_LEN_UPPER_LIMIT)))
         filename = f"{text}_{i}.png"
         filepath = os.path.join(OUTPUT_DIR, filename)
-        image.write(text, filepath)
-        writer.writerow([filename,text])
-print("Data Generated!")
-df = pd.read_csv(LABELS)
 n = len(df)
-train_end = int(n * directories[0][1])
-val_end = train_end + int(n * directories[2][1])
-# Split datasets
 df_train = df.iloc[:train_end]
 df_val = df.iloc[train_end:val_end]
 df_test = df.iloc[val_end:]
-# Save
-df_train.to_csv(os.path.join(DATASET_DIR,"train/labels.csv"), index=False)
-df_val.to_csv(os.path.join(DATASET_DIR,"val/labels.csv"), index=False)
-df_test.to_csv(os.path.join(DATASET_DIR,"test/labels.csv"), index=False)
 print("Labels Generated")

 from captcha.image import ImageCaptcha
+import random, string, os, csv, io
 import pandas as pd
+from PIL import Image, ImageDraw, ImageFilter
+import numpy as np
+import cv2
+# ===== your original config =====
+DATASET_DIR = "Dataset/captchas"
+LABELS = "Dataset/labels.csv"
+NUM_IMAGES = 100000
 CHARS = string.ascii_letters + string.digits
 CAPTCHA_LEN_LOWER_LIMIT = 5
 CAPTCHA_LEN_UPPER_LIMIT = 7
 directories = [["train",0.8],["val",0.1],["test",0.1]]
+# Match config.py dimensions
+IMG_WIDTH = 256   # W_max from config
+IMG_HEIGHT = 60   # H from config
+GRAYSCALE = True  # grayscale from config
+# ----- minimal augment helpers -----
+def rand_color(lo=0, hi=255):
+    return tuple(random.randint(lo, hi) for _ in range(3))
+def gradient_bg(w, h):
+    top = rand_color(200, 255)
+    bot = rand_color(200, 255)
+    arr = np.zeros((h, w, 3), dtype=np.uint8)
+    for y in range(h):
+        t = y / max(1, h - 1)
+        arr[y, :, :] = (np.array(top) * (1 - t) + np.array(bot) * t).astype(np.uint8)
+    return Image.fromarray(arr)
+def add_interference(img, line_range=(0, 3), dot_range=(10, 80)):
+    draw = ImageDraw.Draw(img)
+    w, h = img.size
+    for _ in range(random.randint(*line_range)):
+        x1, y1 = random.randint(0, w-1), random.randint(0, h-1)
+        x2, y2 = random.randint(0, w-1), random.randint(0, h-1)
+        draw.line((x1, y1, x2, y2), fill=rand_color(50, 180), width=random.randint(1, 2))
+    for _ in range(random.randint(*dot_range)):
+        x, y = random.randint(0, w-1), random.randint(0, h-1)
+        r = random.choice([0, 1])
+        draw.ellipse((x-r, y-r, x+r, y+r), fill=rand_color(0, 200))
+    return img
+def perspective_warp(img, max_ratio=0.03):
+    if max_ratio <= 0:
+        return img
+    w, h = img.size
+    dx = int(w * max_ratio)
+    dy = int(h * max_ratio * 0.7)
+    src = np.float32([[0,0],[w,0],[w,h],[0,h]])
+    dst = np.float32([[random.randint(0,dx), random.randint(0,dy)],
+                      [w-random.randint(0,dx), random.randint(0,dy)],
+                      [w-random.randint(0,dx), h-random.randint(0,dy)],
+                      [random.randint(0,dx), h-random.randint(0,dy)]])
+    M = cv2.getPerspectiveTransform(src, dst)
+    arr = np.array(img.convert("RGB"))[:, :, ::-1]  # to BGR
+    out = cv2.warpPerspective(arr, M, (w, h), borderMode=cv2.BORDER_REPLICATE)
+    return Image.fromarray(out[:, :, ::-1])  # back to RGB
+def jpeg_recompress(img, qmin=70, qmax=95):
+    q = random.randint(qmin, qmax)
+    buf = io.BytesIO()
+    img.save(buf, format="JPEG", quality=q)
+    buf.seek(0)
+    return Image.open(buf).convert("RGB")
+def add_noise_and_blur(img, noise_sigma=(0.0, 6.0), blur_sigma=(0.0, 0.8), motion_prob=0.1):
+    # gaussian noise
+    s = random.uniform(*noise_sigma)
+    if s > 0.05:
+        arr = np.array(img).astype(np.float32)
+        arr += np.random.normal(0, s, arr.shape).astype(np.float32)
+        arr = np.clip(arr, 0, 255).astype(np.uint8)
+        img = Image.fromarray(arr)
+    # blur
+    if random.random() < motion_prob:
+        # simple directional blur
+        ksize = random.choice([3,5])
+        kernel = Image.new("L", (ksize, ksize), 0)
+        draw = ImageDraw.Draw(kernel)
+        draw.line((0, ksize//2, ksize-1, ksize//2), fill=255, width=1)
+        kernel = kernel.rotate(random.uniform(0, 180), resample=Image.BILINEAR)
+        kernel = np.array(kernel, dtype=np.float32)
+        kernel /= max(1, kernel.sum())
+        import cv2
+        arr = np.array(img)
+        arr = cv2.filter2D(arr, -1, kernel)
+        img = Image.fromarray(arr)
+    else:
+        sigma = random.uniform(*blur_sigma)
+        if sigma > 0.05:
+            img = img.filter(ImageFilter.GaussianBlur(radius=sigma))
+    return img
+def render_with_variation(text, width=IMG_WIDTH, height=IMG_HEIGHT):
+    # randomize basic style knobs
+    bg_choice = random.choice(["solid", "gradient"])
+    fg_color = rand_color(0, 80)
+    if bg_choice == "solid":
+        bg_color = rand_color(210, 255)
+        bg = Image.new("RGB", (width, height), color=bg_color)
+    else:
+        bg = gradient_bg(width, height)
+    # Adjust font sizes for larger dimensions
+    font_sizes = [int(height * 0.7), int(height * 0.75), int(height * 0.8), int(height * 0.85)]
+    font_size = random.choice(font_sizes)
+    # ImageCaptcha accepts fonts via fonts arg; here we keep default but jitter spacing
+    image = ImageCaptcha(width=width, height=height, fonts=None, font_sizes=[font_size])
+    # draw base image
+    base = Image.frombytes('RGB', (width, height), image.generate_image(text).tobytes())
+    # quick contrast tweak: recolor foreground by compositing text mask if needed
+    # For minimal change, we stick with base and apply light warps/noise
+    # mild rotation/shear
+    angle = random.uniform(-6, 6)
+    base = base.rotate(angle, resample=Image.BILINEAR, expand=False, fillcolor=bg.getpixel((0,0)))
+    # perspective warp (very light)
+    if random.random() < 0.6:
+        base = perspective_warp(base, max_ratio=0.025)
+    # draw interference over the image
+    base = add_interference(base, line_range=(0, 3), dot_range=(10, 60))
+    # light noise + blur + jpeg recompress to add artifacts
+    base = add_noise_and_blur(base, noise_sigma=(0.0, 5.0), blur_sigma=(0.0, 0.7), motion_prob=0.12)
+    base = jpeg_recompress(base, qmin=72, qmax=92)
+    # optional low contrast: 20% chance to darken bg and lighten fg a bit
+    if random.random() < 0.2:
+        base = base.point(lambda p: int(p*0.95 + 6))
+    # Convert to grayscale if specified
+    if GRAYSCALE:
+        base = base.convert('L')
+    return base
+# Fix: Extract names and thresholds upfront
+train_name, val_name, test_name = directories[0][0], directories[1][0], directories[2][0]
+train_ratio, val_ratio, test_ratio = directories[0][1], directories[1][1], directories[2][1]
+# Calculate split thresholds
+n = NUM_IMAGES
+train_end = int(n * train_ratio)
+val_end = train_end + int(n * val_ratio)
+# Create directories once
+train_dir = os.path.join(DATASET_DIR, train_name)
+val_dir = os.path.join(DATASET_DIR, val_name)
+test_dir = os.path.join(DATASET_DIR, test_name)
 os.makedirs(DATASET_DIR, exist_ok=True)
+os.makedirs(train_dir, exist_ok=True)
+os.makedirs(val_dir, exist_ok=True)
+os.makedirs(test_dir, exist_ok=True)
+image = ImageCaptcha(width=160, height=60)  # kept for compatibility if needed
+with open(LABELS, mode="w", newline="") as f:
     writer = csv.writer(f)
     writer.writerow(["filename","label"])
     for i in range(NUM_IMAGES):
+        if i % max(1, (NUM_IMAGES//100)) == 0:
             print(f"{i} images made")
+        # Pick output directory based on thresholds
+        if i < train_end:
+            OUTPUT_DIR = train_dir
+        elif i < val_end:
+            OUTPUT_DIR = val_dir
+        else:
+            OUTPUT_DIR = test_dir
+        text = ''.join(random.choices(CHARS, k=random.randint(CAPTCHA_LEN_LOWER_LIMIT, CAPTCHA_LEN_UPPER_LIMIT)))
         filename = f"{text}_{i}.png"
         filepath = os.path.join(OUTPUT_DIR, filename)
+        # --- minimal change: replace image.write with our small variation renderer ---
+        img = render_with_variation(text, width=IMG_WIDTH, height=IMG_HEIGHT)
+        img.save(filepath)
+        # -----------------------------------------
+        writer.writerow([filename, text])
+print("Data Generated!")
+# Fixed split logic
+df = pd.read_csv(LABELS)
 n = len(df)
+train_end = int(n * train_ratio)
+val_end = train_end + int(n * val_ratio)
 df_train = df.iloc[:train_end]
 df_val = df.iloc[train_end:val_end]
 df_test = df.iloc[val_end:]
+df_train.to_csv(os.path.join(DATASET_DIR, f"{train_name}/labels.csv"), index=False)
+df_val.to_csv(os.path.join(DATASET_DIR, f"{val_name}/labels.csv"), index=False)
+df_test.to_csv(os.path.join(DATASET_DIR, f"{test_name}/labels.csv"), index=False)
 print("Labels Generated")

src/generateCaptcha.py ADDED Viewed

	@@ -0,0 +1,181 @@

+"""
+Simple CAPTCHA Generation Utility
+Generates individual CAPTCHA images using enhanced rendering
+"""
+import random
+import string
+from PIL import Image, ImageDraw, ImageFilter
+import numpy as np
+import cv2
+import io
+# Configuration - match your training setup
+IMG_WIDTH = 256
+IMG_HEIGHT = 60
+GRAYSCALE = True
+CHARS = string.ascii_letters + string.digits
+CAPTCHA_LEN_LOWER_LIMIT = 5
+CAPTCHA_LEN_UPPER_LIMIT = 7
+def rand_color(lo=0, hi=255):
+    """Generate random RGB color."""
+    return tuple(random.randint(lo, hi) for _ in range(3))
+def gradient_bg(w, h):
+    """Create gradient background."""
+    top = rand_color(200, 255)
+    bot = rand_color(200, 255)
+    arr = np.zeros((h, w, 3), dtype=np.uint8)
+    for y in range(h):
+        t = y / max(1, h - 1)
+        arr[y, :, :] = (np.array(top) * (1 - t) + np.array(bot) * t).astype(np.uint8)
+    return Image.fromarray(arr)
+def add_interference(img, line_range=(0, 3), dot_range=(10, 80)):
+    """Add interference patterns (lines and dots)."""
+    draw = ImageDraw.Draw(img)
+    w, h = img.size
+    for _ in range(random.randint(*line_range)):
+        x1, y1 = random.randint(0, w-1), random.randint(0, h-1)
+        x2, y2 = random.randint(0, w-1), random.randint(0, h-1)
+        draw.line((x1, y1, x2, y2), fill=rand_color(50, 180), width=random.randint(1, 2))
+    for _ in range(random.randint(*dot_range)):
+        x, y = random.randint(0, w-1), random.randint(0, h-1)
+        r = random.choice([0, 1])
+        draw.ellipse((x-r, y-r, x+r, y+r), fill=rand_color(0, 200))
+    return img
+def perspective_warp(img, max_ratio=0.03):
+    """Apply perspective warping."""
+    if max_ratio <= 0:
+        return img
+    w, h = img.size
+    dx = int(w * max_ratio)
+    dy = int(h * max_ratio * 0.7)
+    src = np.float32([[0,0],[w,0],[w,h],[0,h]])
+    dst = np.float32([[random.randint(0,dx), random.randint(0,dy)],
+                      [w-random.randint(0,dx), random.randint(0,dy)],
+                      [w-random.randint(0,dx), h-random.randint(0,dy)],
+                      [random.randint(0,dx), h-random.randint(0,dy)]])
+    M = cv2.getPerspectiveTransform(src, dst)
+    arr = np.array(img.convert("RGB"))[:, :, ::-1]  # to BGR
+    out = cv2.warpPerspective(arr, M, (w, h), borderMode=cv2.BORDER_REPLICATE)
+    return Image.fromarray(out[:, :, ::-1])  # back to RGB
+def jpeg_recompress(img, qmin=70, qmax=95):
+    """Recompress image to simulate JPEG artifacts."""
+    q = random.randint(qmin, qmax)
+    buf = io.BytesIO()
+    img.save(buf, format="JPEG", quality=q)
+    buf.seek(0)
+    return Image.open(buf).convert("RGB")
+def add_noise_and_blur(img, noise_sigma=(0.0, 6.0), blur_sigma=(0.0, 0.8), motion_prob=0.1):
+    """Add noise and blur effects."""
+    # Gaussian noise
+    s = random.uniform(*noise_sigma)
+    if s > 0.05:
+        arr = np.array(img).astype(np.float32)
+        arr += np.random.normal(0, s, arr.shape).astype(np.float32)
+        arr = np.clip(arr, 0, 255).astype(np.uint8)
+        img = Image.fromarray(arr)
+    # Blur
+    if random.random() < motion_prob:
+        # Simple directional blur
+        ksize = random.choice([3,5])
+        kernel = Image.new("L", (ksize, ksize), 0)
+        draw = ImageDraw.Draw(kernel)
+        draw.line((0, ksize//2, ksize-1, ksize//2), fill=255, width=1)
+        kernel = kernel.rotate(random.uniform(0, 180), resample=Image.BILINEAR)
+        kernel = np.array(kernel, dtype=np.float32)
+        kernel /= max(1, kernel.sum())
+        arr = np.array(img)
+        arr = cv2.filter2D(arr, -1, kernel)
+        img = Image.fromarray(arr)
+    else:
+        sigma = random.uniform(*blur_sigma)
+        if sigma > 0.05:
+            img = img.filter(ImageFilter.GaussianBlur(radius=sigma))
+    return img
+def generate_captcha(text=None, width=IMG_WIDTH, height=IMG_HEIGHT, save_path=None):
+    """
+    Generate a single enhanced CAPTCHA image.
+    Args:
+        text (str, optional): Text to render. If None, generates random text.
+        width (int): Image width
+        height (int): Image height
+        save_path (str, optional): Path to save the image. If None, returns PIL Image.
+    Returns:
+        PIL Image if save_path is None, otherwise saves and returns the path
+    """
+    # Generate random text if none provided
+    if text is None:
+        text = ''.join(random.choices(CHARS, k=random.randint(CAPTCHA_LEN_LOWER_LIMIT, CAPTCHA_LEN_UPPER_LIMIT)))
+    # Randomize basic style
+    bg_choice = random.choice(["solid", "gradient"])
+    fg_color = rand_color(0, 80)
+    if bg_choice == "solid":
+        bg_color = rand_color(210, 255)
+        bg = Image.new("RGB", (width, height), color=bg_color)
+    else:
+        bg = gradient_bg(width, height)
+    # Adjust font sizes for larger dimensions
+    font_sizes = [int(height * 0.7), int(height * 0.75), int(height * 0.8), int(height * 0.85)]
+    font_size = random.choice(font_sizes)
+    # Use ImageCaptcha for base text rendering
+    from captcha.image import ImageCaptcha
+    image = ImageCaptcha(width=width, height=height, fonts=None, font_sizes=[font_size])
+    # Draw base image
+    base = Image.frombytes('RGB', (width, height), image.generate_image(text).tobytes())
+    # Apply enhancements
+    angle = random.uniform(-6, 6)
+    base = base.rotate(angle, resample=Image.BILINEAR, expand=False, fillcolor=bg.getpixel((0,0)))
+    # Perspective warp (very light)
+    if random.random() < 0.6:
+        base = perspective_warp(base, max_ratio=0.025)
+    # Add interference
+    base = add_interference(base, line_range=(0, 3), dot_range=(10, 60))
+    # Noise + blur + JPEG recompression
+    base = add_noise_and_blur(base, noise_sigma=(0.0, 5.0), blur_sigma=(0.0, 0.7), motion_prob=0.12)
+    base = jpeg_recompress(base, qmin=72, qmax=92)
+    # Optional low contrast
+    if random.random() < 0.2:
+        base = base.point(lambda p: int(p*0.95 + 6))
+    # Convert to grayscale if specified
+    if GRAYSCALE:
+        base = base.convert('L')
+    # Save or return
+    if save_path:
+        base.save(save_path)
+        return save_path
+    else:
+        return base
+if __name__ == "__main__":
+    # Example usage
+    print("Generating sample CAPTCHAs...")
+    # Generate with specific text
+    img1 = generate_captcha("HELLO", save_path="sample_HELLO.png")
+    print(f"Generated: sample_HELLO.png")
+    print("Done! Check the generated images.")

train.py CHANGED Viewed

@@ -50,7 +50,7 @@ def main():
     scaler = torch.amp.GradScaler('cuda', enabled=False)  # Disable AMP for stability
     # Epoch-based training with scheduler
-    epochs = 20  # Increased for OneCycleLR
     scheduler = torch.optim.lr_scheduler.OneCycleLR(
         optimizer, max_lr=3e-4, steps_per_epoch=len(train_dl), epochs=epochs
     )

     scaler = torch.amp.GradScaler('cuda', enabled=False)  # Disable AMP for stability
     # Epoch-based training with scheduler
+    epochs = 40  # Increased for OneCycleLR
     scheduler = torch.optim.lr_scheduler.OneCycleLR(
         optimizer, max_lr=3e-4, steps_per_epoch=len(train_dl), epochs=epochs
     )