|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
|
|
|
import torch |
|
|
import torchvision.transforms as transforms |
|
|
from PIL import Image |
|
|
import numpy as np |
|
|
import torchvision.models as models |
|
|
import torchvision |
|
|
|
|
|
|
|
|
# --------------------------------------------------------------------------
# Global app chrome.  st.set_page_config must be the first Streamlit call.
# --------------------------------------------------------------------------
st.set_page_config(page_title="SmartVision AI - Intelligent Multi-Class Object Recognition System", layout="wide")

st.sidebar.title("π SmartVision AI")

st.sidebar.markdown("---")

st.title("π€ SmartVision AI - Intelligent Multi-Class Object Recognition System")

st.markdown("---")

# Sidebar navigation: the selected label drives which `if page == ...`
# section below actually renders on this script run.
page = st.sidebar.radio("Go to", ["π Home", "πΌοΈ Image Classification", "π¦ Object Detection", "π Model Performance", "πΈ Live Webcam Detection","βΉοΈ About"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --------------------------------------------------------------- Home page
if page == "π Home":
    st.subheader("π Project Overview")
    st.markdown("""
**SmartVision AI** is an intelligent computer vision system that performs real-time object detection
using a custom-trained **YOLO model**.
The system allows users to upload images and automatically identifies objects by drawing bounding boxes,
class labels, and confidence scores.

The goal of this project is to demonstrate an **end-to-end AI pipeline** β from model training
to optimized inference and visualization.
""")

    st.info("β¨ This project is designed to showcase practical skills in Deep Learning, Computer Vision, and Model Deployment, with a focus on performance optimization and clean output presentation.")
    st.markdown("---")
    st.subheader("π Key Features")
    st.markdown("""
β€ π **Accurate Object Detection** using a trained YOLO model
β€ π¦ **Bounding Boxes & Labels** on detected objects
β€ π **Confidence Scores** for every prediction
β€ π§ **Optional CNN-based verification**
β€ β‘ **Optimized CNNs** (VGG16, ResNet50, MobileNetV2, EfficientNet-B0)
""")

    st.markdown("---")
    st.subheader("π Instructions for Users")
    # st.text renders the instructions verbatim (monospace, no markdown).
    st.text("""
β€ π Navigate to the Detection page
β€ π¦ Upload an image (JPG / PNG format)
β€ π Wait for the model to process the image
β€ π§ View the output image with bounding boxes and labels
β€ β‘ Check confidence scores for each detected object
""")
    st.info("β οΈ For best results, use clear images with good lighting and visible objects.")

    st.markdown("---")
    st.subheader("πΌοΈ Sample Demo Images")
    # Two side-by-side demo screenshots; the image files are expected to
    # live next to this script (app working directory).
    col1, col2 = st.columns(2)
    with col1:
        st.image("img.png", caption="YOLO Detection Example 1")
    with col2:
        st.image("img1.png", caption="YOLO Detection Example 2")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# The 25 object categories the custom CNN classifiers were trained on.
# List order defines the label encoding: model output index i maps to
# Classes[i], so this order must not change independently of the checkpoints.
Classes= ['airplane', 'banana', 'bear', 'bicycle', 'bird', 'bowl', 'bus', 'cake', 'car', 'cat', 'dog', 'elephant', 'horse', 'laptop', 'motorcycle', 'mouse', 'parking meter', 'person', 'potted plant', 'sheep', 'toilet', 'traffic light', 'truck', 'tv', 'wine glass']

# Width of every classifier head's final layer (25).
NUM_CLASSES = len(Classes)
|
|
|
|
|
import torch |
|
|
import torch.nn as nn |
|
|
import torchvision.models as models |
|
|
|
|
|
|
|
|
|
|
|
@st.cache_resource
def load_custom_vgg16():
    """Build the custom VGG16 classifier, load its fine-tuned weights and
    return it in eval mode.

    Cached by Streamlit so the checkpoint is read from disk only once per
    session.  The classifier head rebuilt here must exactly match the
    architecture used during training, otherwise ``load_state_dict`` fails.
    """
    model = models.vgg16(pretrained=False)

    # Custom head: 25088 (VGG's 512*7*7 flattened features) -> 1024 -> 512
    # -> NUM_CLASSES.
    # FIX: the final layer previously hard-coded 25 output units; use
    # NUM_CLASSES for consistency with the other three loaders.  Since
    # len(Classes) == 25, behavior is unchanged.
    model.classifier = nn.Sequential(
        nn.Linear(25088, 1024),
        nn.ReLU(inplace=True),
        nn.Dropout(0.5),

        nn.Linear(1024, 512),
        nn.ReLU(inplace=True),
        nn.Dropout(0.5),

        nn.Linear(512, NUM_CLASSES)
    )

    # Checkpoint may have been saved on GPU; remap to CPU so inference
    # works on machines without CUDA.
    model.load_state_dict(
        torch.load(
            "models/vgg16_smartvision.pth",
            map_location=torch.device("cpu")
        )
    )

    model.eval()
    return model
|
|
|
|
|
|
|
|
@st.cache_resource
def load_custom_restnet50():
    """Build the custom ResNet50 classifier, restore its fine-tuned weights
    from disk and return it in eval mode (cached for the session)."""
    net = models.resnet50(pretrained=False)

    # Replace the stock ImageNet head with the bottleneck head used during
    # fine-tuning; it must match the checkpoint's architecture exactly.
    head = nn.Sequential(
        nn.Linear(net.fc.in_features, 512),
        nn.BatchNorm1d(512),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(512, NUM_CLASSES),
    )
    net.fc = head

    # Checkpoint may have been saved on GPU; remap to CPU for portability.
    checkpoint = torch.load(
        "models/smartvision_resnet50.pth",
        map_location=torch.device("cpu"),
    )
    net.load_state_dict(checkpoint)

    net.eval()
    return net
|
|
|
|
|
|
|
|
@st.cache_resource
def load_custom_mobilenetv2():
    """Build the custom MobileNetV2 classifier, restore its fine-tuned
    weights and return it in eval mode (cached for the session)."""
    net = models.mobilenet_v2(pretrained=False)

    # Swap the stock head for the training-time head; 1280 is
    # MobileNetV2's final feature width.
    head = nn.Sequential(
        nn.Linear(1280, 512),
        nn.ReLU(),
        nn.Dropout(0.4),
        nn.Linear(512, NUM_CLASSES),
    )
    net.classifier = head

    # Checkpoint may have been saved on GPU; remap to CPU for portability.
    checkpoint = torch.load(
        "models/mobilenetv2_smartvision.pth",
        map_location=torch.device("cpu"),
    )
    net.load_state_dict(checkpoint)

    net.eval()
    return net
|
|
|
|
|
|
|
|
|
|
|
@st.cache_resource
def load_custom_EffcientNet():
    """Build the custom EfficientNet-B0 classifier, restore its fine-tuned
    weights and return it in eval mode (cached for the session)."""
    net = models.efficientnet_b0(pretrained=False)

    # Rebuild the training-time head; 1280 is EfficientNet-B0's final
    # feature width.
    head = nn.Sequential(
        nn.Dropout(0.4),
        nn.Linear(1280, 512),
        nn.ReLU(),
        nn.Dropout(0.3),
        nn.Linear(512, NUM_CLASSES),
    )
    net.classifier = head

    # Checkpoint may have been saved on GPU; remap to CPU for portability.
    checkpoint = torch.load(
        "models/EfficientNetB0_smartvision.pth",
        map_location=torch.device("cpu"),
    )
    net.load_state_dict(checkpoint)

    net.eval()
    return net
|
|
|
|
|
|
|
|
|
|
|
# Channel statistics of the ImageNet training set, used by every
# torchvision backbone fine-tuned in this app.
_normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Shared preprocessing for all custom CNN classifiers: resize to the
# 224x224 training resolution, convert to a tensor, then normalize.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    _normalize,
])
|
|
|
|
|
|
|
|
# -------------------------------------------- Image Classification page
if page == "πΌοΈ Image Classification":
    st.subheader("πΌοΈ Image Classification (Custom Trained CNN Models)")
    st.markdown("""
This page performs **single-object image classification** using multiple
**custom-trained CNN models**.
Predictions from each model are shown **side-by-side** for comparison.
""")

    uploaded_file = st.file_uploader(
        "π€ Upload an Image",
        type=["jpg", "jpeg", "png"]
    )

    if uploaded_file:
        # Force RGB so grayscale/RGBA uploads match the 3-channel models.
        image = Image.open(uploaded_file).convert("RGB")

        st.markdown("### π· Uploaded Image")
        st.image(image, width=300)

        # Preprocess once; unsqueeze adds the batch dim -> (1, 3, 224, 224).
        input_tensor = preprocess(image).unsqueeze(0)

        # All four cached classifiers (loaded lazily on first use).
        models_dict = {
            "π§ VGG16": load_custom_vgg16(),
            "π§ ResNet50": load_custom_restnet50(),
            "π§ MobileNetV2": load_custom_mobilenetv2(),
            "π§ EfficientNet-B0": load_custom_EffcientNet()
        }

        st.markdown("---")
        st.markdown("### π Model Predictions (Top-5)")

        # One column per model for side-by-side comparison.
        cols = st.columns(4)

        for col, (model_name, model) in zip(cols, models_dict.items()):
            with col:
                st.markdown(f"#### {model_name}")

                # Inference only: disable autograd for speed and memory.
                with torch.no_grad():
                    outputs = model(input_tensor)
                    probs = torch.nn.functional.softmax(outputs[0], dim=0)

                # Top-5 predictions (or fewer if there are fewer classes).
                top_probs, top_idxs = torch.topk(
                    probs, min(5, len(Classes))
                )

                for i in range(len(top_idxs)):
                    class_name = Classes[top_idxs[i].item()]
                    confidence = top_probs[i].item()

                    st.write(
                        f"**{i+1}. {class_name}** β {confidence*100:.2f}%"
                    )
                    # Progress bar doubles as a confidence visualization.
                    st.progress(float(confidence))

    else:
        st.info("β¬οΈ Upload an image to classify.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import cv2 |
|
|
import numpy as np |
|
|
from ultralytics import YOLO |
|
|
from PIL import Image |
|
|
import streamlit as st |
|
|
|
|
|
|
|
|
@st.cache_resource
def load_yolo_model():
    """Load the custom-trained YOLO detector once and cache it for the session."""
    detector = YOLO("best (1).pt")
    return detector


# Module-level handle used by the Object Detection page below.
yolo_model = load_yolo_model()
|
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------ Object Detection page
if page == "π¦ Object Detection":
    st.subheader("π― Object Detection using YOLO")
    st.markdown("""
Upload an image to detect **multiple objects** using a custom-trained YOLO model.
Bounding boxes, class labels, and confidence scores will be displayed.
""")

    st.markdown("---")

    # Minimum confidence a detection must reach before it is kept.
    conf_threshold = st.slider(
        "π§ Confidence Threshold",
        min_value=0.1,
        max_value=1.0,
        value=0.5,
        step=0.05
    )

    uploaded_file = st.file_uploader(
        "π€ Upload an Image (JPG / PNG)",
        type=["jpg", "jpeg", "png"]
    )

    if uploaded_file:
        image = Image.open(uploaded_file).convert("RGB")
        # YOLO and OpenCV drawing both operate on numpy arrays (RGB here).
        img_array = np.array(image)

        st.markdown("### π· Uploaded Image")
        st.image(image, width=350)

        st.markdown("---")
        st.markdown("### π Detection Results")

        # Run inference in memory; save=False keeps Ultralytics from
        # writing result images to disk.
        results = yolo_model.predict(
            source=img_array,
            conf=conf_threshold,
            save=False
        )

        # Draw on a copy so the original upload stays untouched.
        annotated_img = img_array.copy()

        detections_found = False

        for r in results:
            boxes = r.boxes

            if boxes is not None:
                for box in boxes:
                    detections_found = True

                    # Corner coordinates, confidence and class id of this box.
                    x1, y1, x2, y2 = map(int, box.xyxy[0])
                    conf = float(box.conf[0])
                    cls_id = int(box.cls[0])
                    label = yolo_model.names[cls_id]

                    # Green bounding box, 2 px thick...
                    cv2.rectangle(
                        annotated_img,
                        (x1, y1),
                        (x2, y2),
                        (0, 255, 0),
                        2
                    )

                    # ...with "<label> <confidence>%" drawn just above it.
                    text = f"{label} {conf*100:.2f}%"
                    cv2.putText(
                        annotated_img,
                        text,
                        (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.6,
                        (0, 255, 0),
                        2
                    )

        if detections_found:
            st.image(
                annotated_img,
                caption="YOLO Detection Output",
                use_column_width=True
            )
        else:
            st.warning("β οΈ No objects detected. Try lowering the confidence threshold.")

    else:
        st.info("β¬οΈ Upload an image to start object detection.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import matplotlib.pyplot as plt |
|
|
import pandas as pd |
|
|
import seaborn as sns |
|
|
import streamlit as st |
|
|
|
|
|
|
|
|
# Offline benchmark results for the four custom-trained classifiers.
# Accuracies are fractions in [0, 1]; "Speed" is a relative inference-speed
# score (higher = faster), not an absolute FPS figure.
_MODEL_METRICS = [
    # (model, train acc, val acc, test acc, speed)
    ("VGG16",          0.877,  0.6345, 0.633, 8.9),
    ("ResNet50",       0.6815, 0.6855, 0.593, 0.5),
    ("MobileNetV2",    0.51,   0.54,   0.579, 13.0),
    ("EfficientNetB0", 0.5297, 0.56,   0.543, 12.6),
]

_COLUMNS = ["Model", "Train Accuracy", "Val Accuracy", "Test Accuracy", "Speed"]

# Column-major dict, identical in content to the row table above.
data = {col: [row[i] for row in _MODEL_METRICS] for i, col in enumerate(_COLUMNS)}

df = pd.DataFrame(data)
|
|
|
|
|
|
|
|
# ------------------------------------------------ Model Performance page
if page == "π Model Performance":
    st.subheader("π Model Performance Dashboard")
    st.markdown("""
This section presents a **comparative analysis** of different CNN models used in SmartVision AI.
It highlights **training, validation, and test accuracy**, along with **relative inference speed**.
""")

    st.markdown("---")

    st.markdown("### π Model Comparison Table")
    st.dataframe(df, use_container_width=True)

    st.markdown("---")

    st.markdown("### π Accuracy Comparison (Train / Validation / Test)")

    # Reshape wide -> long so seaborn can group bars by dataset split.
    acc_df = df.melt(
        id_vars="Model",
        value_vars=["Train Accuracy", "Val Accuracy", "Test Accuracy"],
        var_name="Dataset",
        value_name="Accuracy"
    )

    fig1, ax1 = plt.subplots()
    sns.barplot(
        data=acc_df,
        x="Model",
        y="Accuracy",
        hue="Dataset",
        ax=ax1
    )
    # Accuracies are fractions, so pin the axis to [0, 1].
    ax1.set_ylim(0, 1)
    ax1.set_title("Accuracy Comparison Across Models")
    ax1.set_ylabel("Accuracy")
    ax1.set_xlabel("Model")

    st.pyplot(fig1)

    st.markdown("---")

    st.markdown("### β‘ Inference Speed Comparison")

    fig2, ax2 = plt.subplots()
    sns.barplot(
        data=df,
        x="Model",
        y="Speed",
        ax=ax2
    )
    ax2.set_title("Relative Inference Speed (Higher is Faster)")
    ax2.set_ylabel("Speed Score")
    ax2.set_xlabel("Model")

    st.pyplot(fig2)

    st.markdown("---")

    st.markdown("### π§ Key Observations")
    st.markdown("""
- **VGG16** shows strong training accuracy but noticeable generalization gap
- **ResNet50** provides better validation stability
- **MobileNetV2** and **EfficientNetB0** trade accuracy for faster inference
- Lightweight models are suitable for **real-time or edge deployment**
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import cv2 |
|
|
import time |
|
|
import numpy as np |
|
|
import streamlit as st |
|
|
from ultralytics import YOLO |
|
|
|
|
|
|
|
|
|
|
|
@st.cache_resource
def load_pretrained_yolo():
    """Load the stock YOLOv8-nano detector once and cache it for the session."""
    detector = YOLO("yolov8n.pt")
    return detector


# Module-level handle used by the Live Webcam Detection page below.
yolo_model_live = load_pretrained_yolo()
|
|
|
|
|
|
|
|
# -------------------------------------------- Live Webcam Detection page
if page == "πΈ Live Webcam Detection":
    st.subheader("πΈ Live Camera Detection (Lightweight Mode)")

    conf_thres = st.slider("Confidence Threshold", 0.1, 1.0, 0.5, 0.05)
    run = st.checkbox("βΆ Start Camera")

    # Placeholders overwritten in place on every processed frame.
    FRAME_WINDOW = st.image([])
    fps_text = st.empty()

    if run:
        # Default camera (index 0) at a modest resolution to keep CPU load low.
        cap = cv2.VideoCapture(0)
        cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
        cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

        # Only every 3rd captured frame is run through YOLO.
        frame_skip = 3
        frame_count = 0
        prev_time = time.time()

        # NOTE(review): `run` is never re-read inside the loop; the loop
        # effectively ends when Streamlit reruns the script on the next
        # widget interaction (or the camera read fails) — confirm this is
        # the intended stop mechanism.
        while run:
            ret, frame = cap.read()
            if not ret:
                # Camera unavailable or stream ended.
                break

            frame_count += 1

            # Frame skipping: drop frames to reduce inference load.
            if frame_count % frame_skip != 0:
                continue

            # OpenCV captures BGR; YOLO/Streamlit expect RGB.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            results = yolo_model_live.predict(
                frame_rgb,
                conf=conf_thres,
                imgsz=416,      # reduced input size for faster inference
                verbose=False
            )

            # Ultralytics draws boxes/labels and returns the annotated array.
            annotated_frame = results[0].plot()

            # FPS measured between consecutive *processed* frames, so it
            # includes the time spent grabbing the skipped frames.
            curr_time = time.time()
            fps = 1 / (curr_time - prev_time)
            prev_time = curr_time

            fps_text.markdown(f"β‘ FPS: {fps:.1f}")

            FRAME_WINDOW.image(
                annotated_frame,
                channels="RGB",
                use_column_width=True
            )

            # Brief sleep to yield CPU between frames.
            time.sleep(0.03)

        # Always release the camera handle when the loop ends.
        cap.release()
|
|
|
|
|
|
|
|
|
|
|
# ----------------------------------------------------------- About page
# Static informational content only: a sequence of markdown sections
# describing the project, datasets, models, stack and author.
if page == "βΉοΈ About":
    st.subheader("π About SmartVision AI")
    st.markdown("---")

    st.markdown("## π§ Project Overview")
    st.markdown("""
**SmartVision AI** is an end-to-end **computer vision system** designed to perform
**image classification**, **object detection**, and **real-time inference** using
state-of-the-art deep learning models.

The project demonstrates the complete AI lifecycle:
**dataset preparation β model training β optimized inference β deployment using Streamlit**.
""")

    st.markdown("## π Dataset Information")
    st.markdown("""
- **Image Classification Dataset**
- Domain-specific dataset with **25 object classes**
- Preprocessed and augmented for robustness
- Split into **Train / Validation / Test** sets

- **Object Detection Dataset**
- General object detection using **COCO dataset**
- 80 commonly occurring object classes
- Bounding-box annotated images
""")

    st.markdown("## ποΈ Model Architectures Used")
    st.markdown("""
### πΉ Image Classification Models
- **VGG16 (Custom Trained)**
- Modified fully connected layers
- High accuracy on domain-specific data

- **ResNet50**
- Residual connections for deeper learning
- Strong generalization capability

- **MobileNetV2**
- Lightweight architecture
- Optimized for speed and mobile devices

- **EfficientNet-B0**
- Balanced accuracy and efficiency
- Compound scaling technique

### πΉ Object Detection Model
- **YOLOv8 (Pretrained)**
- Real-time object detection
- Single-stage detector
- Optimized for speed and accuracy
""")

    st.markdown("## π οΈ Technical Stack")
    st.markdown("""
**Programming Language**
- Python π

**Deep Learning & Vision**
- PyTorch
- Torchvision
- Ultralytics YOLOv8
- OpenCV

**Data Processing & Visualization**
- NumPy
- Pandas
- Matplotlib
- Seaborn

**Web & Deployment**
- Streamlit
- VS Code
- Git & GitHub
""")

    st.markdown("## β‘ Performance Optimization Techniques")
    st.markdown("""
- Model quantization (where applicable)
- Frame skipping for real-time inference
- Resolution scaling for faster detection
- CPU-optimized inference pipeline
- Streamlit resource caching
""")

    st.markdown("## π¨βπ» Developer Information")
    st.markdown("""
**Developer:** Rahul Kumar
**Degree:** B.Tech in Information Technology
**Institution:** IIEST Shibpur

**Core Interests:**
- Computer Vision
- Deep Learning
- Full Stack Development
- AI Model Deployment

**Project Goal:**
To build scalable, efficient, and production-ready AI systems
with real-world deployment considerations.
""")

    st.markdown("---")
    st.info("π SmartVision AI β Bridging Deep Learning Research with Real-World Applications")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st

# ------------------------------------------------------- Sidebar footer
st.sidebar.markdown("---")

st.sidebar.markdown("### π SmartVision AI")

# Three equal columns inside the sidebar so the contact links sit side by side.
col1, col2, col3 = st.sidebar.columns(3)

# FIX: the links were previously emitted with st.sidebar.markdown inside the
# `with colX:` contexts, which writes to the sidebar root and leaves the
# columns empty (the links stacked vertically).  Using st.markdown inside the
# column context places each link in its own column as intended.
with col1:
    st.markdown(
        "[π GitHub](https://github.com/rahul-tech-kumar/SmartVision-AI---Intelligent-Multi-Class-Object-Recognition-System)",
        unsafe_allow_html=True
    )

with col2:
    st.markdown(
        "[πΌ LinkedIn](https://www.linkedin.com/in/rahul-kumar-173546228/)",
        unsafe_allow_html=True
    )

with col3:
    st.markdown(
        "[βοΈ Email](mailto:rahulkumar11062003@gmail.com)",
        unsafe_allow_html=True
    )

st.sidebar.markdown("---")

# Small centered footer; raw HTML requires unsafe_allow_html=True.
st.sidebar.markdown(
    """
<div style="text-align:center; font-size:12px; color:gray;">
π Built with Streamlit & PyTorch<br>
Β© 2025 SmartVision AI
</div>
""",
    unsafe_allow_html=True
)
|
|
|