Initial commit
Browse files- .gitattributes +1 -0
- PredictWord.py +49 -0
- ValidationChecker.py +93 -0
- cnn.py +196 -0
- dataset/test-00000-of-00001-bc8b28dacaaa708d.parquet +3 -0
- dataset/test.csv +0 -0
- dataset/train-00000-of-00001-92b9aa4d471d61ab.parquet +3 -0
- dataset/train.csv +3 -0
- main.py +212 -0
- prediction.py +48 -0
- requirements.txt +11 -0
- saved_models/best_model.pth +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
dataset/train.csv filter=lfs diff=lfs merge=lfs -text
|
PredictWord.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import cv2
|
| 2 |
+
import pytesseract
|
| 3 |
+
import pyttsx3
|
| 4 |
+
import os
|
| 5 |
+
import subprocess
|
| 6 |
+
import time
|
| 7 |
+
|
| 8 |
+
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
|
| 9 |
+
|
| 10 |
+
class PredictWord:
    """OCR a drawn/handwritten word from an image file and speak it aloud."""

    def __init__(self, image_path):
        # Path of the canvas snapshot to run OCR on.
        self.image_path = image_path

    def predict(self):
        """Run Tesseract OCR on the image and return the recognized text.

        Returns:
            The stripped OCR string, or ``None`` when the image cannot be
            read from disk.
        """
        image = cv2.imread(self.image_path)
        if image is None:
            print(f"Error: Image not found at '{self.image_path}'")
            return None

        # Tesseract performs best on single-channel input.
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # --oem 3: default OCR engine; --psm 6: assume one uniform block of text.
        custom_config = r'--oem 3 --psm 6'
        word = pytesseract.image_to_string(gray, config=custom_config)
        return word.strip()

    @staticmethod
    def save_and_speak_word(word, output_dir='output', filename='output.txt'):
        """Write *word* to a text file, open it in Notepad, and speak it.

        ``word`` may be ``None`` (e.g. when ``predict()`` failed); it is
        treated as an empty string instead of raising TypeError.
        """
        if word is None:
            word = ''

        # Ensure output directory exists
        os.makedirs(output_dir, exist_ok=True)
        file_path = os.path.abspath(os.path.join(output_dir, filename))

        # Write word to file; the blank line keeps successive words separated.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(word + '\n\n')

        # Open file in Notepad (Windows-only, consistent with the
        # hard-coded Tesseract path at module level).
        subprocess.Popen(['notepad.exe', file_path])
        time.sleep(1)  # Give Notepad time to open

        # Speak the word
        engine = pyttsx3.init()
        engine.say(word)
        engine.runAndWait()
|
| 45 |
+
|
| 46 |
+
def clear_notepad_file(output_dir='output', filename='output.txt'):
    """Truncate the shared output file so a new session starts empty.

    Creates *output_dir* first if it does not exist: the original code
    raised FileNotFoundError on a fresh checkout, where main.py calls
    this before anything has created the directory.
    """
    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.abspath(os.path.join(output_dir, filename))
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write('')
|
ValidationChecker.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from torch.utils.data import DataLoader, Dataset
|
| 4 |
+
from torchvision import transforms
|
| 5 |
+
from PIL import Image
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import io
|
| 8 |
+
import ast
|
| 9 |
+
import os
|
| 10 |
+
from sklearn.metrics import confusion_matrix, classification_report
|
| 11 |
+
import timm
|
| 12 |
+
|
| 13 |
+
class EfficientNetB0Alpha(nn.Module):
    """EfficientNet-B0 classifier (via timm) adapted to 1-channel input.

    Must stay structurally identical to the training-time definition in
    cnn.py: saved state-dict keys are prefixed with ``model.``, so the
    ``self.model`` attribute name cannot change without breaking
    checkpoint loading.
    """

    def __init__(self, num_classes=26):
        super().__init__()
        # in_chans=1: grayscale letter images; timm adapts the pretrained
        # stem weights to a single input channel.
        self.model = timm.create_model('efficientnet_b0', pretrained=True, in_chans=1, num_classes=num_classes)

    def forward(self, x):
        return self.model(x)
|
| 19 |
+
|
| 20 |
+
class Dataset(Dataset):
    """CSV-backed image dataset for evaluation.

    Each row stores the image as a dict (or its string repr) with a
    ``'bytes'`` entry holding the encoded image, plus an integer label.

    NOTE: the class name shadows ``torch.utils.data.Dataset``; it is
    kept for compatibility with existing callers in this file.
    """

    def __init__(self, csv_path, transform=None, image_col='image', label_col='label'):
        self.data = pd.read_csv(csv_path)
        self.transform = transform
        self.image_col = image_col
        self.label_col = label_col

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_data = self.data.iloc[idx][self.image_col]
        label = self.data.iloc[idx][self.label_col]
        if isinstance(img_data, str):
            # CSV serialized the dict as its repr; parse it back.
            # Same defensive handling as the training-time Dataset in
            # cnn.py, so bad rows fail with an index-specific error
            # instead of an opaque traceback.
            try:
                img_dict = ast.literal_eval(img_data)
                img_bytes = img_dict['bytes']
            except (ValueError, SyntaxError, KeyError) as e:
                raise ValueError(f"Error parsing image data at index {idx}: {e}")
        else:
            img_bytes = img_data['bytes']
        try:
            img = Image.open(io.BytesIO(img_bytes)).convert('L')
        except Exception as e:
            raise ValueError(f"Error decoding image at index {idx}: {e}")
        if self.transform:
            img = self.transform(img)
        return img, label
|
| 40 |
+
|
| 41 |
+
def load_model(model_path, num_classes, device):
    """Restore the best checkpoint into a fresh model in eval mode.

    Raises:
        FileNotFoundError: when no checkpoint exists at *model_path*.
    """
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found at {model_path}")

    net = EfficientNetB0Alpha(num_classes=num_classes)
    # weights_only=True restricts unpickling to tensors/containers.
    state = torch.load(model_path, map_location=device, weights_only=True)
    net.load_state_dict(state['model_state_dict'])
    net.to(device)
    net.eval()
    return net
|
| 50 |
+
|
| 51 |
+
def evaluate(model, test_loader, device, class_names):
    """Score *model* on *test_loader*.

    Prints overall accuracy, a per-class classification report, and the
    confusion matrix; returns ``(accuracy_percent, confusion_matrix)``.
    """
    model.eval()
    predictions, truths = [], []
    hits, seen = 0, 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            scores = model(images)
            _, batch_pred = torch.max(scores.data, 1)
            seen += labels.size(0)
            hits += (batch_pred == labels).sum().item()
            predictions.extend(batch_pred.cpu().numpy())
            truths.extend(labels.cpu().numpy())

    # Guard against an empty loader (division by zero).
    accuracy = 100 * hits / max(seen, 1)
    print(f"Test Accuracy: {accuracy:.2f}%")
    print("\nClassification Report:")
    print(classification_report(truths, predictions, target_names=class_names, digits=2))
    cm = confusion_matrix(truths, predictions)
    print("\nConfusion Matrix (True Labels: rows, Predicted Labels: columns):")
    print(pd.DataFrame(cm, index=class_names, columns=class_names))
    return accuracy, cm
|
| 72 |
+
|
| 73 |
+
def main():
    """Evaluate the saved best checkpoint on the held-out test CSV."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_classes = 26
    batch_size = 32
    model_path = "saved_models/best_model.pth"
    test_csv = "dataset/test.csv"
    print("Device being used:", device)

    # Same preprocessing as validation at training time (no augmentation).
    test_transform = transforms.Compose([
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5]),
    ])

    test_loader = DataLoader(
        Dataset(test_csv, transform=test_transform),
        batch_size=batch_size,
        shuffle=False,
        num_workers=4,
        pin_memory=True,
    )
    class_names = [chr(65 + i) for i in range(26)]  # 'A' .. 'Z'

    model = load_model(model_path, num_classes, device)
    evaluate(model, test_loader, device, class_names)

if __name__ == "__main__":
    main()
|
cnn.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import torch.optim as optim
|
| 4 |
+
from torch.utils.data import DataLoader, Dataset
|
| 5 |
+
from torchvision import transforms
|
| 6 |
+
from PIL import Image
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import io
|
| 9 |
+
import ast
|
| 10 |
+
import timm
|
| 11 |
+
import os
|
| 12 |
+
|
| 13 |
+
# Model
|
| 14 |
+
class EfficientNetB0Alpha(nn.Module):
    """EfficientNet-B0 classifier (via timm) adapted to 1-channel input.

    The ``self.model`` attribute name determines the ``model.`` prefix
    of every saved state-dict key; ValidationChecker.py and
    prediction.py rely on that layout when loading the checkpoint, so
    it must not be renamed.
    """

    def __init__(self, num_classes=26):
        super().__init__()
        # in_chans=1: grayscale letter images; num_classes=26: A-Z.
        self.model = timm.create_model('efficientnet_b0', pretrained=True, in_chans=1, num_classes=num_classes)

    def forward(self, x):
        return self.model(x)
|
| 21 |
+
|
| 22 |
+
# Dataset
|
| 23 |
+
class Dataset(Dataset):
    """Image-classification dataset backed by a CSV export.

    The image column holds a dict (or its string repr) whose ``'bytes'``
    entry is the encoded image; the label column holds the class index.

    NOTE: the class name shadows ``torch.utils.data.Dataset``; kept for
    compatibility with existing callers in this file.
    """

    def __init__(self, csv_path, transform=None, image_col='image', label_col='label'):
        self.data = pd.read_csv(csv_path)
        self.transform = transform
        self.image_col = image_col
        self.label_col = label_col

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        raw = row[self.image_col]
        label = row[self.label_col]
        if isinstance(raw, str):
            # The CSV serialized the dict as text; parse it back.
            try:
                img_bytes = ast.literal_eval(raw)['bytes']
            except (ValueError, SyntaxError, KeyError) as e:
                raise ValueError(f"Error parsing image data at index {idx}: {e}")
        else:
            img_bytes = raw['bytes']
        try:
            img = Image.open(io.BytesIO(img_bytes)).convert('L')
        except Exception as e:
            raise ValueError(f"Error decoding image at index {idx}: {e}")
        if self.transform:
            img = self.transform(img)
        return img, label
|
| 51 |
+
|
| 52 |
+
# Training function
|
| 53 |
+
def train(model, train_loader, optimizer, criterion, scheduler, device):
    """Run one training epoch.

    Returns:
        (mean_batch_loss, accuracy_percent) over the whole loader.
    """
    model.train()
    running_loss = 0.0
    hits, seen = 0, 0
    for batch, labels in train_loader:
        batch, labels = batch.to(device), labels.to(device)

        optimizer.zero_grad()
        logits = model(batch)
        loss = criterion(logits, labels)
        loss.backward()
        # Clip gradients to stabilize training.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        if scheduler is not None:
            # Per-batch scheduler step (matches a OneCycle-style schedule).
            scheduler.step()

        running_loss += loss.item()
        _, predicted = logits.max(1)
        hits += predicted.eq(labels).sum().item()
        seen += labels.size(0)

    return running_loss / len(train_loader), 100. * hits / seen
|
| 73 |
+
|
| 74 |
+
# Validation function
|
| 75 |
+
def val(model, val_loader, criterion, device):
    """Evaluate without gradient tracking.

    Returns:
        (mean_batch_loss, accuracy_percent) over the whole loader.
    """
    model.eval()
    running_loss = 0.0
    hits, seen = 0, 0
    with torch.no_grad():
        for batch, labels in val_loader:
            batch, labels = batch.to(device), labels.to(device)
            logits = model(batch)
            running_loss += criterion(logits, labels).item()
            _, predicted = logits.max(1)
            hits += predicted.eq(labels).sum().item()
            seen += labels.size(0)
    return running_loss / len(val_loader), 100. * hits / seen
|
| 90 |
+
|
| 91 |
+
# Save function
|
| 92 |
+
def save(model, optimizer, epoch, accuracy, class_names, save_path="saved_models/best_model.pth"):
    """Checkpoint model and optimizer state with training metadata.

    Saved keys: epoch, model_state_dict, optimizer_state_dict, accuracy,
    class_names — the layout expected by ValidationChecker.py and
    prediction.py.
    """
    # Create the target directory on first save.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    payload = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'accuracy': accuracy,
        'class_names': class_names,
    }
    torch.save(payload, save_path)
|
| 101 |
+
|
| 102 |
+
# Main method
|
| 103 |
+
def main():
    """Train EfficientNet-B0 on the letter CSVs with checkpoint resume
    and early stopping on validation accuracy."""
    # Config
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_classes = 26
    batch_size = 32
    learning_rate = 5e-4
    num_epochs = 25
    patience = 10  # epochs without val improvement before stopping
    train_csv = "dataset/train.csv"
    # NOTE(review): the test split doubles as the validation set — there
    # is no separate holdout, so "best val accuracy" is measured on test.
    val_csv = "dataset/test.csv"
    save_path = "saved_models/best_model.pth"

    print("Device being used:", device)

    # Transforms
    train_transform = transforms.Compose([
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize((256, 256)),
        transforms.RandomCrop(224),
        # NOTE(review): horizontal flips can turn some letters into other
        # letters (or non-letters) — confirm this augmentation is intended
        # for character data.
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(degrees=45),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5])
    ])
    val_transform = transforms.Compose([
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5])
    ])

    # Datasets and loaders
    train_dataset = Dataset(train_csv, transform=train_transform)
    val_dataset = Dataset(val_csv, transform=val_transform)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)
    class_names = [chr(65 + i) for i in range(26)]  # ['A', 'B', ..., 'Z']

    # Model, optimizer, criterion
    model = EfficientNetB0Alpha(num_classes=num_classes).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-4)
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

    # Load checkpoint if it exists
    start_epoch = 0
    best_accuracy = 0.0
    if os.path.exists(save_path):
        try:
            checkpoint = torch.load(save_path, map_location=device)
            model.load_state_dict(checkpoint['model_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            start_epoch = checkpoint['epoch'] + 1
            best_accuracy = checkpoint['accuracy']
            print(f"Loaded checkpoint from epoch {checkpoint['epoch']} with accuracy {best_accuracy:.2f}%")
        except Exception as e:
            # Corrupt or incompatible checkpoints fall back to fresh training.
            print(f"Error loading checkpoint: {e}. Starting from scratch.")
    else:
        print(f"No checkpoint found at {save_path}. Starting from scratch.")

    # Scheduler
    # NOTE(review): OneCycleLR is rebuilt from step 0 even when resuming
    # from a checkpoint, so the LR schedule restarts on resume — confirm
    # this is acceptable.
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=learning_rate,
        epochs=num_epochs,
        steps_per_epoch=len(train_loader),
        pct_start=0.3,
        anneal_strategy='cos'
    )

    # Training loop
    early_stopping_counter = 0
    for epoch in range(start_epoch, num_epochs):
        print(f"\nEpoch [{epoch+1}/{num_epochs}]")
        train_loss, train_acc = train(model, train_loader, optimizer, criterion, scheduler, device)
        print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
        val_loss, val_acc = val(model, val_loader, criterion, device)
        print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")

        # Keep only the best-performing checkpoint.
        if val_acc > best_accuracy:
            best_accuracy = val_acc
            save(model, optimizer, epoch, best_accuracy, class_names, save_path)
            print(f"New best model saved with accuracy: {best_accuracy:.2f}%")
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1

        if early_stopping_counter >= patience:
            print(f"Early stopping triggered. Best accuracy: {best_accuracy:.2f}%")
            break

    print(f"Training completed. Best validation accuracy: {best_accuracy:.2f}%")

if __name__ == "__main__":
    main()
|
dataset/test-00000-of-00001-bc8b28dacaaa708d.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:802310c181ea4e8339aca3222edeb7ea4ac56199287564031c3f2604e60e3bb8
|
| 3 |
+
size 743978
|
dataset/test.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
dataset/train-00000-of-00001-92b9aa4d471d61ab.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:41fd2c7ddf580e3f9a68b76f4bc82619409077c59391bba9eb98199c1b6d6e79
|
| 3 |
+
size 7405967
|
dataset/train.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ecd6439d57f530923a243cd132ca4a2a09cb38be320b3cb92c16b814b4c0d19b
|
| 3 |
+
size 66182401
|
main.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import time
|
| 3 |
+
import numpy as np
|
| 4 |
+
import cv2
|
| 5 |
+
import mediapipe as mp
|
| 6 |
+
from prediction import predict_from_image
|
| 7 |
+
from PredictWord import PredictWord, clear_notepad_file
|
| 8 |
+
|
| 9 |
+
# Directory of header/toolbar images drawn across the top of the frame.
Header_path = "Assets/header"
# NOTE(review): executed at import time — os.listdir raises if the
# Assets/header directory is missing.
myList = os.listdir(Header_path)
# Webcam handle opened at import time as a module-level global;
# released in the __main__ block at the bottom of the file.
cam = cv2.VideoCapture(0)
# Capture resolution (width, height) each frame is resized to.
wCam, hCam = 1280, 720
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class HandDetector:
    """Thin wrapper around MediaPipe Hands: landmark detection, pixel
    positions, and a simple fingers-up heuristic."""

    def __init__(self, mode=False, maxHands=2, modelComplexity=1, detectionCon=0.8, trackCon=0.8):
        self.mode = mode
        self.maxHands = maxHands
        self.modelComplexity = modelComplexity
        self.detectionCon = detectionCon
        self.trackCon = trackCon
        self.mpHands = mp.solutions.hands
        self.hands = self.mpHands.Hands(self.mode, self.maxHands, self.modelComplexity, self.detectionCon,
                                        self.trackCon)
        self.mpDraw = mp.solutions.drawing_utils
        # Landmark indices of the five fingertips (thumb .. pinky).
        self.tipIds = [4, 8, 12, 16, 20]
        self.lmList = []

    def findHands(self, img):
        """Detect hands in a BGR frame, drawing landmarks in place."""
        imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        self.results = self.hands.process(imgRGB)
        if self.results.multi_hand_landmarks:
            for handLms in self.results.multi_hand_landmarks:
                self.mpDraw.draw_landmarks(img, handLms, self.mpHands.HAND_CONNECTIONS)
        return img

    def findPosition(self, img, handNo=0):
        """Return ``[[id, x_px, y_px], ...]`` for hand *handNo*, or an
        empty list when no hand was found. Must be called after
        ``findHands()`` (which sets ``self.results``)."""
        self.lmList = []
        if self.results.multi_hand_landmarks:
            myHand = self.results.multi_hand_landmarks[handNo]
            for id, lm in enumerate(myHand.landmark):
                h, w, c = img.shape
                # Landmark coordinates are normalized; convert to pixels.
                cx, cy = int(lm.x * w), int(lm.y * h)
                self.lmList.append([id, cx, cy])
        return self.lmList

    def fingerup(self):
        """Return one 0/1 flag per finger (thumb .. pinky) based on the
        landmarks stored by the last ``findPosition()`` call.

        Returns an empty list when fewer than 21 landmarks are stored —
        the original code raised IndexError in that case.
        """
        if len(self.lmList) < 21:
            return []
        fingers = []
        # Thumb: compare tip vs. adjacent joint x-coordinates
        # (the frame is mirrored by the caller via cv2.flip).
        if self.lmList[self.tipIds[0]][1] < self.lmList[self.tipIds[0] - 1][1]:
            fingers.append(1)
        else:
            fingers.append(0)

        # Other fingers: tip above (smaller y than) the joint two below it.
        for id in range(1, 5):
            if self.lmList[self.tipIds[id]][2] < self.lmList[self.tipIds[id] - 2][2]:
                fingers.append(1)
            else:
                fingers.append(0)
        return fingers
|
| 61 |
+
|
| 62 |
+
def main():
    """Air-drawing loop: track the index finger with MediaPipe, draw on a
    canvas, and hand the canvas to the letter/word predictors."""
    detector = HandDetector()
    cTime = 0  # previous-frame timestamp for the FPS estimate

    overlayList = []
    drawColor = (0, 0, 255)  # BGR: red pen by default
    # Load every header image that can actually be read.
    for impath in myList:
        image = cv2.imread(f'{Header_path}/{impath}')
        if image is not None:
            overlayList.append(image)
    header = None
    if overlayList:
        # Normalize the header strip to the 1280x125 banner area.
        header = cv2.resize(overlayList[0], (1280, 125)) if overlayList[0].shape != (125, 1280, 3) else overlayList[0]

    # NOTE(review): cv2.resize raises if these sidebar images are missing;
    # the `is not None` checks further down never get a chance to run.
    RightBar = cv2.imread('Assets/sidebar/right.png')
    RightBar = cv2.resize(RightBar, (230, 595))
    LeftBar = cv2.imread('Assets/sidebar/left.png')
    LeftBar = cv2.resize(LeftBar, (226, 300))

    mode = "Drawing Mode"
    canvas = np.zeros((720, 1280, 3), np.uint8)  # persistent drawing surface
    submode = "Letter_Prediction"  # NOTE(review): written but never read
    predicted_letter = ""
    # Start each session with an empty shared output file.
    clear_notepad_file(output_dir='output', filename='output.txt')
    xp, yp = 0, 0  # previous pen position; (0, 0) doubles as "pen lifted"
    while True:
        # NOTE(review): `success` is never checked — cv2.resize raises if
        # the camera read fails (e.g. the device is unplugged).
        success, img = cam.read()
        img = cv2.resize(img, (wCam, hCam))
        img = cv2.flip(img, 1)  # mirror so drawing feels natural
        img = detector.findHands(img)
        lmlist = detector.findPosition(img)

        # Only process drawing if hand landmarks are detected
        if len(lmlist) != 0:
            x1, y1 = lmlist[8][1:3]   # index fingertip
            x2, y2 = lmlist[12][1:3]  # middle fingertip
            fingers = []
            if lmlist:
                fingers = detector.fingerup()

            # Selection Mode: both index and middle finger up
            # (assumes fingerup() returned all five flags)
            if fingers[1] == 1 and fingers[2] == 1:
                xp, yp = 0, 0  # lift the pen so no stroke bridges the gap
                if y1 < 125 and len(overlayList) >= 2:
                    # Header hit-test: left region = red pen, right = eraser.
                    if 0 < x1 < 271:
                        drawColor = (0, 0, 255)
                        header = cv2.resize(overlayList[0], (1280, 125))
                    elif 850 < x1 < 1280 and len(overlayList) > 1:
                        drawColor = (0, 0, 0)
                        header = cv2.resize(overlayList[1], (1280, 125))
                cv2.rectangle(img, (x1, y1 - 25), (x2, y2 + 25), drawColor, cv2.FILLED)

                # Rightbar actions (vertical button strip, x > 1050)
                if x1 > 1050:
                    if 125 < y1 < 250:
                        canvas = np.zeros((720, 1280, 3), np.uint8)  # Clear canvas
                    if 260 < y1 < 385:
                        pass  # unassigned button slot
                    if 385 < y1 < 510:
                        mode = "Drawing Mode"
                    if 510 < y1 < 635:
                        mode = "Prediction Mode"

            # Drawing Mode: only index finger up
            if len(fingers) >= 3 and fingers[1] and not fingers[2] and mode == "Drawing Mode":
                if xp == 0 and yp == 0:
                    # First frame of a stroke: start from the current point.
                    xp, yp = x1, y1
                xp, yp = x1, y1
                # NOTE(review): the assignment above makes every line a
                # zero-length segment from (x1, y1) to itself — likely a bug,
                # strokes only render as dots per frame.

                if drawColor == (0, 0, 0):
                    # Eraser: thicker stroke plus a visible cursor circle.
                    cv2.circle(img, (x1, y1), 30, drawColor, cv2.FILLED)
                    cv2.line(canvas, (xp, yp), (x1, y1), drawColor, 75)
                else:
                    cv2.line(canvas, (xp, yp), (x1, y1), drawColor, 15)
                xp, yp = x1, y1


            if mode == "Prediction Mode":
                if LeftBar is not None:
                    img[125:425, 0:226] = LeftBar  # prediction button strip
                # Same draw-with-index-finger behavior as Drawing Mode.
                if len(fingers) >= 3 and fingers[1] and not fingers[2]:
                    if xp == 0 and yp == 0:
                        xp, yp = x1, y1
                    xp, yp = x1, y1

                    if drawColor == (0, 0, 0):
                        cv2.circle(img, (x1, y1), 30, drawColor, cv2.FILLED)
                        cv2.line(canvas, (xp, yp), (x1, y1), drawColor, 75)
                    else:
                        cv2.line(canvas, (xp, yp), (x1, y1), drawColor, 15)
                    xp, yp = x1, y1

                # Leftbar actions
                if x1 < 300:
                    if 150 < y1 < 300:
                        # Single-letter CNN prediction from the canvas snapshot.
                        # NOTE(review): writes to "Output/" (capital O) but the
                        # text file uses "output/" — case-sensitive filesystems
                        # will treat these as different directories.
                        submode = "Letter Prediction"
                        cv2.imwrite("Output/Letter.png", canvas)
                        predicted_letter, confidence = predict_from_image("Output/Letter.png")
                        cv2.putText(img, f'Predicted Letter: {predicted_letter}', (50, 500), cv2.FONT_HERSHEY_TRIPLEX,
                                    1, (255, 0, 255), 2)
                        # NOTE(review): prediction_time/reset_canvas are set but
                        # only used by the commented-out block below.
                        prediction_time = time.time()
                        reset_canvas = True

                    if 315 < y1 < 405:
                        # Whole-word OCR prediction, then speak and persist it.
                        submode = "Word Prediction"
                        cv2.imwrite("Output/Word.png", canvas)
                        predictor = PredictWord("Output/Word.png")
                        result = predictor.predict()
                        print("Detected word:", result)
                        PredictWord.save_and_speak_word(result, output_dir='output', filename='output.txt')
                        canvas = np.zeros((720, 1280, 3), np.uint8)
        #
        # # Place this outside the x1 < 300 block, so it runs every frame
        # if reset_canvas and prediction_time is not None:
        #     if time.time() - prediction_time > 5:
        #         canvas = np.zeros((720, 1280, 3), np.uint8)
        #         reset_canvas = False
        #         prediction_time = None

        # Combine canvas and camera image using bitwise operations:
        # AND with the inverted mask darkens drawn pixels, OR paints them.
        imgGray = cv2.cvtColor(canvas, cv2.COLOR_BGR2GRAY)
        _, imgInv = cv2.threshold(imgGray, 50, 255, cv2.THRESH_BINARY_INV)
        imgInv = cv2.cvtColor(imgInv, cv2.COLOR_GRAY2BGR)
        img = cv2.bitwise_and(img, imgInv)
        img = cv2.bitwise_or(img, canvas)

        # Calculate FPS (frames per second)
        pTime = time.time()
        fps = 1 / (pTime - cTime) if cTime != 0 else 0
        cTime = pTime

        # Overlay header and RightBar only if they are loaded (robustness)
        if header is not None:
            img[0:125, 0:1280] = header
        if RightBar is not None:
            img[125:720, 1050:1280] = RightBar




        cv2.putText(img, f"Mode : {mode}", (1065, 645), cv2.FONT_HERSHEY_TRIPLEX, 0.5, (255, 0, 255), 1)
        cv2.putText(img, f'FPS: {int(fps)}', (1095, 695), cv2.FONT_HERSHEY_TRIPLEX, 1, (255, 0, 255), 1)
        cv2.imshow("Canvas", canvas)
        cv2.imshow("Image", img)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

if __name__ == "__main__":
    main()
    # Release the module-level camera handle opened at import time.
    cam.release()
    cv2.destroyAllWindows()
|
prediction.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from torchvision import transforms
|
| 4 |
+
from PIL import Image
|
| 5 |
+
import timm
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
class EfficientNetB0Alpha(nn.Module):
    """Inference-time copy of the training architecture (cnn.py).

    pretrained=False because weights are restored from the local
    checkpoint below; the ``self.model`` attribute name must match the
    training definition so the ``model.``-prefixed state-dict keys load.
    """

    def __init__(self, num_classes=26):
        super().__init__()
        # in_chans=1: grayscale input, matching the training transforms.
        self.model = timm.create_model('efficientnet_b0', pretrained=False, in_chans=1, num_classes=num_classes)

    def forward(self, x):
        return self.model(x)
|
| 15 |
+
|
| 16 |
+
# Load model and class names once
|
| 17 |
+
# Load model and class names once
# NOTE(review): everything below runs at import time; importing this
# module without the checkpoint on disk raises FileNotFoundError, and the
# model load cost is paid on first import.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
checkpoint_path = 'saved_models/best_model.pth'
num_classes = 26

# Preprocessing for inference — Resize + CenterCrop to the 224x224 input
# the network was trained on, with the same normalization.
transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])
])

model = EfficientNetB0Alpha(num_classes=num_classes).to(device)
if not os.path.exists(checkpoint_path):
    raise FileNotFoundError(f"Checkpoint not found at {checkpoint_path}")
# weights_only=True restricts unpickling to tensors/basic containers.
checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=True)
model.load_state_dict(checkpoint['model_state_dict'])
# Class labels saved alongside the weights at training time (cnn.py save()).
class_names = checkpoint['class_names']
|
| 35 |
+
|
| 36 |
+
def predict_from_image(image_path):
    """Classify the letter drawn in *image_path*.

    Returns:
        (predicted_class, confidence) — the winning class label and its
        softmax probability.
    """
    tensor = transform(Image.open(image_path).convert('L'))
    tensor = tensor.unsqueeze(0).to(device)  # add batch dimension

    model.eval()
    with torch.no_grad():
        logits = model(tensor)
        probabilities = torch.softmax(logits, dim=1)
        confidence, predicted = torch.max(probabilities, 1)
        predicted_class = class_names[predicted.item()]
        confidence = confidence.item()
    return predicted_class, confidence
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch>=1.9.0
|
| 2 |
+
torchvision>=0.10.0
|
| 3 |
+
timm>=0.6.0
|
| 4 |
+
pandas>=1.3.0
|
| 5 |
+
Pillow>=9.0.0
|
| 6 |
+
numpy>=1.21.0
|
| 7 |
+
opencv-python>=4.5.0
|
| 8 |
+
mediapipe>=0.8.9
|
| 9 |
+
pytesseract>=0.3.8
|
| 10 |
+
pyttsx3>=2.90
|
| 11 |
+
scikit-learn>=1.0.0
|
saved_models/best_model.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e25438c897561cd81ac5b1aefd353af47079bd22a57fbc7e80050c884d2419ef
|
| 3 |
+
size 48963941
|