CaptchaOCR / src /data.py
mohakapoor's picture
Initial project setup on Dev branch
ada63c0
raw
history blame
1.91 kB
from captcha.image import ImageCaptcha
import random
import string
import os
import csv
import pandas as pd
# config
DATASET_DIR = "Dataset_test/captchas"
LABELS = "Dataset_test/labels.csv"
NUM_IMAGES = 1000
CHARS = string.ascii_letters + string.digits
CAPTCHA_LEN_LOWER_LIMIT = 5
CAPTCHA_LEN_UPPER_LIMIT = 7
# Each entry is [split name, fraction of NUM_IMAGES]. Images are assigned to
# the splits in this order: the first fraction of indices goes to the first
# entry, the next fraction to the second entry, and the remainder to the last.
directories = [["train", 0.8], ["test", 0.1], ["val", 0.1]]

os.makedirs(DATASET_DIR, exist_ok=True)
# Create every split directory once, up front, so each one exists even when
# its index range turns out to be empty (e.g. a very small NUM_IMAGES).
for split_name, _ratio in directories:
    os.makedirs(os.path.join(DATASET_DIR, split_name), exist_ok=True)

image = ImageCaptcha(width=160, height=60)

# Integer index boundaries derived from the ratios in `directories`, using the
# same int(n * ratio) arithmetic as the label-splitting step of this script,
# instead of separately hard-coded float thresholds.
first_cut = int(NUM_IMAGES * directories[0][1])
second_cut = first_cut + int(NUM_IMAGES * directories[1][1])

# Report progress roughly every 1% of the run; the step is kept an integer
# >= 1 so the modulo below never operates on a fractional step.
progress_step = max(1, NUM_IMAGES // 100)

OUTPUT_DIR = os.path.join(DATASET_DIR, directories[0][0])
with open(LABELS, mode="w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["filename", "label"])
    for i in range(NUM_IMAGES):
        if i % progress_step == 0:
            print(f"{i} images made")

        # Pick the split this index belongs to.
        if i < first_cut:
            split_name = directories[0][0]
        elif i < second_cut:
            split_name = directories[1][0]
        else:
            split_name = directories[2][0]
        OUTPUT_DIR = os.path.join(DATASET_DIR, split_name)

        # Random alphanumeric text of random length within the configured
        # limits (both limits inclusive).
        text = ''.join(
            random.choices(
                CHARS,
                k=random.randint(CAPTCHA_LEN_LOWER_LIMIT, CAPTCHA_LEN_UPPER_LIMIT),
            )
        )
        # The index suffix keeps filenames unique even if a text repeats.
        filename = f"{text}_{i}.png"
        filepath = os.path.join(OUTPUT_DIR, filename)
        image.write(text, filepath)
        writer.writerow([filename, text])
print("Data Generated!")
# Read every column as a plain string and disable NA detection, so a label is
# never reinterpreted as a number or as a missing value.
df = pd.read_csv(LABELS, dtype=str, keep_default_na=False)
n = len(df)

# The generation loop writes images in the order given by `directories`
# (directories[0], then directories[1], then directories[2]). The row ranges
# below follow that same order so each labels.csv describes the images that
# actually live in its directory.
train_end = int(n * directories[0][1])
test_end = train_end + int(n * directories[1][1])
val_end = n

# Split datasets
df_train = df.iloc[:train_end]
df_test = df.iloc[train_end:test_end]
df_val = df.iloc[test_end:val_end]

# Save
for (split_name, _ratio), split_df in zip(directories, (df_train, df_test, df_val)):
    split_dir = os.path.join(DATASET_DIR, split_name)
    # A split with no images may not have had its directory created yet.
    os.makedirs(split_dir, exist_ok=True)
    split_df.to_csv(os.path.join(split_dir, "labels.csv"), index=False)
print("Labels Generated")