Spaces:
Sleeping
Sleeping
"""
AlexNet - Hugging Face Spaces demo
Paper: Krizhevsky, Sutskever, Hinton (NeurIPS 2012)

Key changes:
- The network mirrors torchvision's AlexNet exactly (groups=1), so the
  pretrained weights load for the whole model (Conv + FC) and the demo
  performs real classification.
- The names of the 1000 ImageNet classes are loaded automatically
  (dogs, cats, apples, ...; ImageNet has no generic "person" class).
"""
| import json | |
| import requests | |
| import torch | |
| import torch.nn as nn | |
| import torchvision.models as tv | |
| import torchvision.transforms as T | |
| import gradio as gr | |
| from PIL import Image | |
# ------------------------------------------------------------------
# 1. Model definition
#    Same architecture as torchvision's AlexNet (groups=1), so its
#    pretrained weights can be loaded without remapping.
#
#    The paper's two-GPU split (groups=2) was a workaround for limited
#    GPU memory; memory is no longer the constraint, so the single-stream
#    groups=1 layout is used here.
#    Dropout (p=0.5) follows the paper. The paper's LRN layers and its
#    96/256/384/384/256 channel widths are NOT reproduced: the layers
#    below contain no LRN and use 64/192/384/256/256 channels.
# ------------------------------------------------------------------
class AlexNet(nn.Module):
    """AlexNet laid out so that its ``state_dict`` keys line up with torchvision's.

    The attribute names (``features``, ``avgpool``, ``classifier``) and the
    position of every layer inside the two ``nn.Sequential`` containers
    determine the ``state_dict`` keys (``features.0.weight`` ...
    ``classifier.6.bias``). Do not rename, reorder or insert layers, or
    checkpoints saved for this layout will stop loading.

    Shapes for a (B, 3, 224, 224) input:

        Conv1: kernel=11, stride=4, padding=2 -> (B,  64, 55, 55) -> pool -> (B,  64, 27, 27)
        Conv2: kernel=5,  stride=1, padding=2 -> (B, 192, 27, 27) -> pool -> (B, 192, 13, 13)
        Conv3: kernel=3,  stride=1, padding=1 -> (B, 384, 13, 13)
        Conv4: kernel=3,  stride=1, padding=1 -> (B, 256, 13, 13)
        Conv5: kernel=3,  stride=1, padding=1 -> (B, 256, 13, 13) -> pool -> (B, 256, 6, 6)
        FC1:   9216 -> 4096        (preceded by Dropout)
        FC2:   4096 -> 4096        (preceded by Dropout)
        FC3:   4096 -> num_labels  (no Dropout)

    Args:
        num_labels: Size of the final linear layer, i.e. number of classes.
        dropout: Drop probability used by both dropout layers.
    """

    def __init__(self, num_labels: int = 1000, dropout: float = 0.5):
        super().__init__()
        # Convolutional trunk. Indices inside the Sequential are part of the
        # checkpoint format (conv layers sit at 0, 3, 6, 8 and 10).
        self.features = nn.Sequential(
            # Conv1
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            # Conv2
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            # Conv3
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            # Conv4
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            # Conv5
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        # Forces a 6x6 spatial map so the classifier's 9216-wide input holds
        # for input resolutions other than 224x224 as well.
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        # Fully connected head (linear layers sit at indices 1, 4 and 6).
        self.classifier = nn.Sequential(
            nn.Dropout(p=dropout),  # dropout in front of FC1
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=dropout),  # dropout in front of FC2
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_labels),  # FC3: no dropout
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map a batch of images (B, 3, H, W) to logits (B, num_labels)."""
        x = self.features(x)
        x = self.avgpool(x)  # (B, 256, 6, 6) regardless of H and W
        # flatten() instead of view(): view() raises when the activations are
        # not contiguous in NCHW order (e.g. channels_last inputs), whereas
        # flatten() falls back to a copy in that case.
        x = torch.flatten(x, 1)  # (B, 9216)
        return self.classifier(x)
# ------------------------------------------------------------------
# 2. Build the model and load the complete set of torchvision
#    pretrained weights
# ------------------------------------------------------------------
def _load_pretrained_weights(net: nn.Module) -> None:
    """Copy torchvision's default AlexNet checkpoint into ``net``.

    Only the checkpoint's state dict is fetched; no second AlexNet is
    instantiated. The state dict is a local, so it can be released as soon
    as its tensors have been copied into ``net``.

    Raises:
        Exception: whatever the download or ``load_state_dict`` raises
            (network failure, key/shape mismatch, ...).
    """
    state_dict = tv.AlexNet_Weights.DEFAULT.get_state_dict(progress=True)
    net.load_state_dict(state_dict)  # conv + FC layers, strict key match


DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AlexNet(num_labels=1000).to(DEVICE)

# Shown in the UI. Keeps this initial value when loading below fails.
WEIGHTS_STATUS = "λλ€ μ΄κΈ°ν (μμΈ‘ μλ―Έ μμ)"
try:
    _load_pretrained_weights(model)
    WEIGHTS_STATUS = "ImageNet μ¬μ νμ΅ μλ£ (torchvision)"
    print("κ°μ€μΉ μ 체 λ‘λ μλ£")
except Exception as e:
    # Best effort at the top-level boundary: the demo still starts with
    # randomly initialised weights and reports that through WEIGHTS_STATUS.
    print(f"κ°μ€μΉ λ‘λ μ€ν¨: {e}")

# Inference only: switches the dropout layers off.
model.eval()
# ------------------------------------------------------------------
# 3. Load the names of the 1000 ImageNet classes
#    e.g. dogs (n02085620~), cats (n02123045~), apple (948); no person*
#    *ImageNet does not include a "person" class
# ------------------------------------------------------------------
| ID2LABEL = {} | |
| # 1μμ: config.json | |
| try: | |
| with open("config.json") as f: | |
| cfg = json.load(f) | |
| ID2LABEL = {int(k): v for k, v in cfg.get("id2label", {}).items()} | |
| if ID2LABEL: | |
| print(f"config.json: {len(ID2LABEL)}κ° ν΄λμ€") | |
| except Exception: | |
| pass | |
| # 2μμ: νκΉ νμ΄μ€ ViT config (ImageNet 1000 λΌλ²¨ λμΌ) | |
| if not ID2LABEL: | |
| try: | |
| resp = requests.get( | |
| "https://huggingface.co/google/vit-base-patch16-224/raw/main/config.json", | |
| timeout=15, | |
| ) | |
| vit_cfg = resp.json() | |
| ID2LABEL = {int(k): v for k, v in vit_cfg.get("id2label", {}).items()} | |
| print(f"νκΉ νμ΄μ€: {len(ID2LABEL)}κ° ν΄λμ€ λ‘λ") | |
| except Exception as e: | |
| print(f"ν΄λμ€ μ΄λ¦ λ‘λ μ€ν¨: {e}") | |
| LABEL_STATUS = f"ImageNet {len(ID2LABEL)}κ° ν΄λμ€" if ID2LABEL else "ν΄λμ€ μ΄λ¦ μμ" | |
# ------------------------------------------------------------------
# 4. Preprocessing (same as torchvision AlexNet_Weights.DEFAULT)
# ------------------------------------------------------------------
# Per-channel statistics handed to T.Normalize.
_IMAGENET_MEAN = [0.485, 0.456, 0.406]
_IMAGENET_STD = [0.229, 0.224, 0.225]

# Applied in order: resize, centre-crop to 224x224, convert to a tensor,
# then normalise each channel.
_PREPROCESS_STEPS = [
    T.Resize(256),
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
]
TRANSFORM = T.Compose(_PREPROCESS_STEPS)
# ------------------------------------------------------------------
# 5. Inference function
# ------------------------------------------------------------------
| def predict(image: Image.Image) -> dict: | |
| if image is None: | |
| return {} | |
| tensor = TRANSFORM(image).unsqueeze(0).to(DEVICE) | |
| with torch.no_grad(): | |
| logits = model(tensor) | |
| probs = torch.softmax(logits, dim=-1)[0] | |
| top5_probs, top5_idx = probs.topk(5) | |
| return { | |
| ID2LABEL.get(idx.item(), f"class_{idx.item()}"): round(prob.item(), 4) | |
| for prob, idx in zip(top5_probs, top5_idx) | |
| } | |
# ------------------------------------------------------------------
# 6. Gradio UI
# ------------------------------------------------------------------
# Page header; the table reports the weight and label loading status.
_INTRO_MARKDOWN = f"""
## AlexNet β λ Όλ¬Έ μμ μ¬ν λ°λͺ¨
**λ Όλ¬Έ**: ImageNet Classification with Deep CNNs (Krizhevsky et al., NeurIPS 2012)
| νλͺ© | μν |
|------|------|
| κ°μ€μΉ | {WEIGHTS_STATUS} |
| ν΄λμ€ | {LABEL_STATUS} |
> β» ImageNetμ μ¬λ(λ¨μ/μ¬μ) ν΄λμ€λ₯Ό ν¬ν¨νμ§ μμμ.
> κ°μμ§Β·κ³ μμ΄Β·μ¬κ³ΌΒ·μλμ°¨ λ± 1000κ° λ¬Όμ²΄ μΉ΄ν κ³ λ¦¬λ₯Ό μΈμν©λλ€.
"""

# Body of the first accordion.
_CATEGORIES_MARKDOWN = """
**λλ¬Ό**: κ°(120μ’ ), κ³ μμ΄(8μ’ ), μ(59μ’ ), λ¬Όκ³ κΈ°, λ±, κ³°, μ½λΌλ¦¬ λ±
**μμ**: μ¬κ³Ό, λ λͺ¬, λΈκΈ°, μμ΄μ€ν¬λ¦Ό, νΌμ, λ²μ― λ±
**νκ²**: μλμ°¨, λ²μ€, κΈ°μ°¨, λΉνκΈ°, λ°°, μ€ν λ°μ΄ λ±
**μ¬λ¬Ό**: μμ, μκ³, μ»΅, ν€λ³΄λ, μκ²½, μ°μ° λ±
**μμ°**: μ°νΈμ΄, νμ°, νν¬, λΉν λ±
> μ¬λ(λ¨μ/μ¬μ)μ ImageNet 1000 ν΄λμ€μ ν¬ν¨λμ§ μμ΅λλ€.
> μ¬λ μΈμμ΄ νμνλ©΄ CLIP λλ COCO νμ΅ λͺ¨λΈμ΄ νμν΄μ.
"""

# Body of the second accordion: per-layer kernel and output-shape table.
_ARCHITECTURE_MARKDOWN = """
| λ μ΄μ΄ | 컀λ | μΆλ ₯ shape | λ Όλ¬Έ μΉμ |
|--------|------|-----------------|-----------|
| Conv1 | 11Γ11 stride=4 | (B, 64, 27, 27) | 3.5μ |
| Conv2 | 5Γ5 | (B, 192, 13, 13) | 3.5μ |
| Conv3 | 3Γ3 | (B, 384, 13, 13) | 3.5μ |
| Conv4 | 3Γ3 | (B, 256, 13, 13) | 3.5μ |
| Conv5 | 3Γ3 | (B, 256, 6, 6) | 3.5μ |
| FC1Β·2 | β | (B, 4096) | 4.2μ Dropout 0.5 |
| FC3 | β | (B, 1000) | Abstract |
"""

# NOTE(review): the nesting below is reconstructed from statement order;
# confirm against the deployed layout.
with gr.Blocks(title="AlexNet β λ Όλ¬Έ μ¬ν") as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    # Input (image + button) on the left, top-5 result on the right.
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="μ λ ₯ μ΄λ―Έμ§")
            run_btn = gr.Button("μμΈ‘νκΈ°", variant="primary")
        with gr.Column():
            label_output = gr.Label(num_top_classes=5, label="Top-5 μμΈ‘")

    with gr.Accordion("μΈμ κ°λ₯ν μ£Όμ μΉ΄ν κ³ λ¦¬", open=False):
        gr.Markdown(_CATEGORIES_MARKDOWN)

    with gr.Accordion("λͺ¨λΈ ꡬ쑰 (λ Όλ¬Έ Figure 2)", open=False):
        gr.Markdown(_ARCHITECTURE_MARKDOWN)

    # Run inference both on an explicit button click and whenever the image
    # component's value changes.
    for trigger in (run_btn.click, image_input.change):
        trigger(fn=predict, inputs=image_input, outputs=label_output)

if __name__ == "__main__":
    demo.launch()