"""
AlexNet — Hugging Face Spaces demo

Paper: Krizhevsky, Sutskever, Hinton (NeurIPS 2012)

Key changes:
- The network is laid out exactly like torchvision's AlexNet (groups=1), so
  the pretrained weights load for every Conv and FC layer and the demo
  performs real classification.
- The names of the 1000 ImageNet classes are loaded automatically
  (dogs, cats, apples, and so on; note that the class list has no
  generic "person" class).
"""
import json
import requests
import torch
import torch.nn as nn
import torchvision.models as tv
import torchvision.transforms as T
import gradio as gr
from PIL import Image
# ──────────────────────────────────────────────────────────────
# 1. Model definition
#    Same architecture as torchvision's AlexNet (groups=1, weight-compatible).
#
#    The paper split the network across two GPUs (groups=2) because of memory
#    limits; GPU memory is ample today, so groups=1 is used here.
#    NOTE(review): the original comment says every hyperparameter from the
#    paper (LRN, Dropout, padding, ...) is kept as-is, but the layer stack
#    below contains no LRN layer.
# ──────────────────────────────────────────────────────────────
class AlexNet(nn.Module):
"""
λ
Όλ¬Έ Figure 2 μ¬ν β torchvision κ°μ€μΉ μμ νΈν λ²μ .
torchvision AlexNet ꡬ쑰μ 1:1 λμ:
Conv1: kernel=11, stride=4, padding=2 -> (B, 64, 55, 55) -> pool -> (B, 64, 27, 27)
Conv2: kernel=5, stride=1, padding=2 -> (B,192, 27, 27) -> pool -> (B,192, 13, 13)
Conv3: kernel=3, stride=1, padding=1 -> (B,384, 13, 13)
Conv4: kernel=3, stride=1, padding=1 -> (B,256, 13, 13)
Conv5: kernel=3, stride=1, padding=1 -> (B,256, 13, 13) -> pool -> (B,256, 6, 6)
FC1: 9216 -> 4096 (Dropout 0.5)
FC2: 4096 -> 4096 (Dropout 0.5)
FC3: 4096 -> num_labels
"""
def __init__(self, num_labels: int = 1000, dropout: float = 0.5):
super().__init__()
# features: torchvision Sequentialκ³Ό λμΌν μμΒ·νλΌλ―Έν°
self.features = nn.Sequential(
# Conv1
nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=3, stride=2),
# Conv2
nn.Conv2d(64, 192, kernel_size=5, padding=2),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=3, stride=2),
# Conv3
nn.Conv2d(192, 384, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
# Conv4
nn.Conv2d(384, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
# Conv5
nn.Conv2d(256, 256, kernel_size=3, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=3, stride=2),
)
self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
# classifier: torchvision Sequentialκ³Ό λμΌ
self.classifier = nn.Sequential(
nn.Dropout(p=dropout), # λ
Όλ¬Έ 4.2μ : FC1 μ Dropout
nn.Linear(256 * 6 * 6, 4096),
nn.ReLU(inplace=True),
nn.Dropout(p=dropout), # λ
Όλ¬Έ 4.2μ : FC2 μ Dropout
nn.Linear(4096, 4096),
nn.ReLU(inplace=True),
nn.Linear(4096, num_labels), # FC3: Dropout μμ
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.features(x) # (B, 256, 6, 6)
x = self.avgpool(x) # (B, 256, 6, 6) β ν¬κΈ° 보μ₯
x = x.view(x.size(0), -1) # (B, 9216)
return self.classifier(x) # (B, num_labels)
# ──────────────────────────────────────────────────────────────
# 2. Build the model and load the full set of torchvision pretrained weights
# ──────────────────────────────────────────────────────────────
# Run on the GPU when one is available, otherwise on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AlexNet(num_labels=1000).to(DEVICE)

# Shown in the UI; stays at this "random initialisation" text if loading fails.
WEIGHTS_STATUS = "랜덤 초기화 (예측 의미 없음)"
try:
    # Building the torchvision model with DEFAULT weights may download them,
    # so this is best-effort: on any failure the demo still starts, with
    # random weights.
    pretrained = tv.alexnet(weights=tv.AlexNet_Weights.DEFAULT)
    model.load_state_dict(pretrained.state_dict())  # copies Conv + FC weights
except Exception as e:
    print(f"가중치 로드 실패: {e}")
else:
    WEIGHTS_STATUS = "ImageNet 사전학습 완료 (torchvision)"
    print("가중치 전체 로드 완료")

# Inference only: switch Dropout to evaluation behaviour.
model.eval()
# ──────────────────────────────────────────────────────────────
# 3. Load the names of the 1000 ImageNet classes
#    Dogs (n02085620~), cats (n02123045~), apple (948), no person class*
#    *ImageNet does not include a "person" class
# ──────────────────────────────────────────────────────────────
def _parse_id2label(config) -> dict:
    """Return ``{class index (int): label}`` built from ``config["id2label"]``.

    A config without an ``"id2label"`` entry yields an empty dict.
    """
    return {int(key): name for key, name in config.get("id2label", {}).items()}


# Maps class index -> human-readable label; left empty if every source fails.
ID2LABEL = {}

# Priority 1: config.json in the working directory.
try:
    with open("config.json", encoding="utf-8") as f:
        cfg = json.load(f)
    ID2LABEL = _parse_id2label(cfg)
    if ID2LABEL:
        print(f"config.json: {len(ID2LABEL)}개 클래스")
except FileNotFoundError:
    # No local config is a normal situation; fall through to the download.
    pass
except Exception as e:
    # A present but unreadable/malformed file is worth reporting, but must
    # not stop the app from starting.
    print(f"config.json 로드 실패: {e}")

# Priority 2: the Hugging Face ViT config (same ImageNet-1000 labels).
if not ID2LABEL:
    try:
        resp = requests.get(
            "https://huggingface.co/google/vit-base-patch16-224/raw/main/config.json",
            timeout=15,
        )
        # Surface HTTP errors here instead of failing later while parsing an
        # error page as JSON.
        resp.raise_for_status()
        vit_cfg = resp.json()
        ID2LABEL = _parse_id2label(vit_cfg)
        print(f"허깅페이스: {len(ID2LABEL)}개 클래스 로드")
    except Exception as e:
        # Best-effort: without names, predictions fall back to "class_<index>".
        print(f"클래스 이름 로드 실패: {e}")

# Shown in the UI: number of loaded classes, or a "no class names" notice.
LABEL_STATUS = f"ImageNet {len(ID2LABEL)}개 클래스" if ID2LABEL else "클래스 이름 없음"
# ──────────────────────────────────────────────────────────────
# 4. Preprocessing (same as torchvision AlexNet_Weights.DEFAULT)
# ──────────────────────────────────────────────────────────────
# Preprocessing applied to every input image, in order: resize, center-crop
# to 224, convert to a tensor, then normalize the three channels with the
# mean / std values below.
TRANSFORM = T.Compose(
    [
        T.Resize(256),
        T.CenterCrop(224),
        T.ToTensor(),
        T.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)
# ──────────────────────────────────────────────────────────────
# 5. Inference function
# ──────────────────────────────────────────────────────────────
def predict(image: Image.Image) -> dict:
    """Classify an image and return its top-5 labels with probabilities.

    Args:
        image: PIL image to classify, or ``None`` when no image is supplied.

    Returns:
        A mapping from class label to softmax probability rounded to four
        decimals, with at most five entries; empty when ``image`` is ``None``.
        Class indices missing from ``ID2LABEL`` are reported as
        ``class_<index>``.
    """
    if image is None:
        return {}

    # The normalization step is defined for exactly three channels, so
    # RGBA / grayscale / palette images are converted first.
    batch = TRANSFORM(image.convert("RGB")).unsqueeze(0).to(DEVICE)
    with torch.no_grad():
        logits = model(batch)
        probs = torch.softmax(logits, dim=-1)[0]
    # Never ask for more entries than there are classes.
    top_probs, top_idx = probs.topk(min(5, probs.numel()))

    scores = {}
    for prob, idx in zip(top_probs.tolist(), top_idx.tolist()):
        label = ID2LABEL.get(idx, f"class_{idx}")
        # Different class indices may share one label string; add their
        # probabilities instead of letting the later entry overwrite the
        # earlier one.
        scores[label] = round(scores.get(label, 0.0) + prob, 4)
    return scores
# ──────────────────────────────────────────────────────────────
# 6. Gradio UI
# ──────────────────────────────────────────────────────────────
# Markdown shown at the top of the page: title, paper reference and a status
# table filled in from the weight / label loading results.
_INTRO_MD = f"""
## AlexNet — 논문 완전 재현 데모

**논문**: ImageNet Classification with Deep CNNs (Krizhevsky et al., NeurIPS 2012)

| 항목 | 상태 |
|------|------|
| 가중치 | {WEIGHTS_STATUS} |
| 클래스 | {LABEL_STATUS} |

> ※ ImageNet은 사람(남자/여자) 클래스를 포함하지 않아요.
> 강아지·고양이·사과·자동차 등 1000개 물체 카테고리를 인식합니다.
"""

# Markdown for the "recognisable categories" accordion.
_CATEGORIES_MD = """
**동물**: 개(120종), 고양이(8종), 새(59종), 물고기, 뱀, 곰, 코끼리 등

**음식**: 사과, 레몬, 딸기, 아이스크림, 피자, 버섯 등

**탈것**: 자동차, 버스, 기차, 비행기, 배, 오토바이 등

**사물**: 의자, 시계, 컵, 키보드, 안경, 우산 등

**자연**: 산호초, 화산, 폭포, 빙하 등

> 사람(남자/여자)은 ImageNet 1000 클래스에 포함되지 않습니다.
> 사람 인식이 필요하면 CLIP 또는 COCO 학습 모델이 필요해요.
"""

# Markdown for the "model architecture" accordion.
_ARCHITECTURE_MD = """
| 레이어 | 커널 | 출력 shape | 논문 섹션 |
|--------|------|-----------------|-----------|
| Conv1 | 11×11 stride=4 | (B, 64, 27, 27) | 3.5절 |
| Conv2 | 5×5 | (B, 192, 13, 13) | 3.5절 |
| Conv3 | 3×3 | (B, 384, 13, 13) | 3.5절 |
| Conv4 | 3×3 | (B, 256, 13, 13) | 3.5절 |
| Conv5 | 3×3 | (B, 256, 6, 6) | 3.5절 |
| FC1·2 | — | (B, 4096) | 4.2절 Dropout 0.5 |
| FC3 | — | (B, 1000) | Abstract |
"""

# Page layout: intro, an input column (image + button) beside an output
# column (top-5 label widget), then two collapsed information panels.
with gr.Blocks(title="AlexNet — 논문 재현") as demo:
    gr.Markdown(_INTRO_MD)

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="입력 이미지")
            run_btn = gr.Button("예측하기", variant="primary")
        with gr.Column():
            label_output = gr.Label(num_top_classes=5, label="Top-5 예측")

    with gr.Accordion("인식 가능한 주요 카테고리", open=False):
        gr.Markdown(_CATEGORIES_MD)

    with gr.Accordion("모델 구조 (논문 Figure 2)", open=False):
        gr.Markdown(_ARCHITECTURE_MD)

    # Predict when the button is pressed and whenever the image changes.
    run_btn.click(fn=predict, inputs=image_input, outputs=label_output)
    image_input.change(fn=predict, inputs=image_input, outputs=label_output)
# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()