Commit · 3df481e
1 Parent(s): c8547b4

Commit message: fixed

Files changed:
- Dockerfile +3 -0
- app/app.py +15 -13
- app/caption_model.py +39 -120
- app/model.py +23 -100
- requirements.txt +20 -8
Dockerfile
CHANGED

@@ -9,6 +9,9 @@ RUN useradd -m -u 1000 appuser
 # Install system dependencies
 RUN apt-get update && apt-get install -y \
     curl \
+    gcc \
+    python3-dev \
+    libpython3-dev \
     && rm -rf /var/lib/apt/lists/*
 
 # Set up cache directory for Hugging Face
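The added packages are presumably build dependencies: gcc, python3-dev, and libpython3-dev let pip compile C extensions during the image build for any requirement that ships without a prebuilt wheel for this base image.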
app/app.py
CHANGED

@@ -1,13 +1,20 @@
 from fastapi import FastAPI, UploadFile, File, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
 from starlette.responses import JSONResponse
-from starlette.requests import Request
 from app.model import analyze_image
 from app.utils import read_image
-from app.caption_model import
+from app.caption_model import caption_image  # Fixed import name
 
 app = FastAPI(title="Image Analyzer API", version="1.0.0")
 
-
+# ✅ Add CORS Middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # Change to ["https://your-frontend.com"] for production
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
 
 @app.post("/analyze")
 async def analyze(file: UploadFile = File(...)):

@@ -32,27 +39,22 @@
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
 
 @app.post("/caption")
-async def
+async def generate_image_caption(file: UploadFile = File(...)):
     if not file or not file.filename:
         raise HTTPException(status_code=400, detail="No file uploaded.")
-    try:
-        image = read_image(file)
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=f"Failed to read image: {str(e)}")
 
     if not file.content_type.startswith('image/'):
         raise HTTPException(status_code=400, detail="File must be an image")
 
     try:
-
+        image = read_image(file)
+        result = caption_image(image)  # Fixed function name
         return JSONResponse(content=result)
     except ValueError as e:
         raise HTTPException(status_code=400, detail=str(e))
-    except RuntimeError as e:
-        raise HTTPException(status_code=500, detail=str(e))
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
 
 @app.get("/")
 def read_root():
     return {"message": "Image Analyzer API is running"}
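For reference, the fixed /caption endpoint can be exercised with a minimal client sketch like the one below; the host, port, and file name are assumptions, not part of the commit.

import requests

# Hypothetical local instance of this Space; adjust host/port as needed.
with open("example.jpg", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/caption",
        files={"file": ("example.jpg", f, "image/jpeg")},
    )
resp.raise_for_status()
print(resp.json())  # e.g. {"caption": "A dog on a beach.", "confidence": 1.0}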
app/caption_model.py
CHANGED

@@ -1,124 +1,43 @@
 from transformers import BlipProcessor, BlipForConditionalGeneration
-import torch
 from PIL import Image
-import
-import time
-from typing import Dict, Any, Optional
-import gc
+import torch
 
 MODEL_NAME = "Salesforce/blip-image-captioning-base"
-[… old lines 10–47 (ImageCaptioner class setup and retry-based initialization) not captured in this view …]
-        raise RuntimeError(f"Failed to initialize the image captioning model after {MAX_RETRIES} attempts")
-
-    def validate_image(self, image: Image.Image) -> Optional[str]:
-        """Validate image before processing"""
-        if not isinstance(image, Image.Image):
-            return "Input must be a PIL Image"
-
-        # Check image mode
-        if image.mode not in ('RGB', 'L'):
-            return "Image must be in RGB or grayscale format"
-
-        return None
-
-    def generate_caption(self, image: Image.Image) -> Dict[str, Any]:
-        # Validate input
-        error = self.validate_image(image)
-        if error:
-            raise ValueError(error)
-
-        # Check model initialization
-        if self.model is None or self.processor is None:
-            self._initialize_model()  # Try to reinitialize if models are not loaded
-
-        try:
-            # Clear CUDA cache if using GPU
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-                gc.collect()
-
-            # Prepare inputs
-            inputs = self.processor(image, return_tensors="pt")
-            inputs = {k: v.to(self.device) if hasattr(v, 'to') else v for k, v in inputs.items()}
-
-            # Process with error handling and memory management
-            try:
-                with torch.no_grad():
-                    # Generate caption with parameters for better quality
-                    out = self.model.generate(
-                        **inputs,
-                        max_length=MAX_LENGTH,
-                        num_beams=5,  # Beam search for better quality
-                        temperature=1.0,
-                        top_k=50,
-                        top_p=0.95,
-                        repetition_penalty=1.2,
-                        length_penalty=1.0,
-                        no_repeat_ngram_size=2
-                    )
-                caption = self.processor.decode(out[0], skip_special_tokens=True)
-
-                # Process the caption
-                caption = caption.strip()
-                # Ensure caption starts with capital letter and ends with period
-                caption = caption[0].upper() + caption[1:]
-                if not caption.endswith(('.', '!', '?')):
-                    caption += '.'
-
-                return {
-                    "caption": caption,
-                    "status": "success",
-                    "model_info": {
-                        "device": self.device,
-                        "model_name": MODEL_NAME
-                    }
-                }
-            finally:
-                # Clean up tensors
-                if torch.cuda.is_available():
-                    torch.cuda.empty_cache()
-                    gc.collect()
-
-        except Exception as e:
-            logging.error(f"Error during caption generation: {str(e)}")
-            raise RuntimeError(f"Failed to generate caption: {str(e)}")
-
-# Initialize model
-captioner = ImageCaptioner()
+MAX_LENGTH = 50
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# Load model and processor only once at startup
+processor = BlipProcessor.from_pretrained(MODEL_NAME)
+model = BlipForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
+model.eval()
+
+def caption_image(image: Image.Image):
+    # Validate input
+    if not isinstance(image, Image.Image) or image.mode not in ('RGB', 'L'):
+        raise ValueError("Input must be a valid PIL Image in RGB or grayscale format")
+
+    # Preprocess input
+    inputs = processor(image, return_tensors="pt")
+    inputs = {k: v.to(device) if hasattr(v, 'to') else v for k, v in inputs.items()}
+
+    # Generate caption
+    with torch.no_grad():
+        output_ids = model.generate(
+            **inputs,
+            max_length=MAX_LENGTH,
+            num_beams=5,
+            temperature=1.0,
+            top_k=50,
+            top_p=0.95
+        )
+
+    # Decode caption
+    caption = processor.decode(output_ids[0], skip_special_tokens=True).strip()
+    caption = caption[0].upper() + caption[1:]
+    if not caption.endswith(('.', '!', '?')):
+        caption += '.'
+
+    return {
+        "caption": caption,
+        "confidence": 1.0  # BLIP doesn't return a real score
+    }
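Direct use of the rewritten module is now a single function call; a minimal sketch, assuming a local image file and that the module is importable as app.caption_model:

from PIL import Image
from app.caption_model import caption_image

# caption_image rejects modes other than RGB/L, so convert first.
image = Image.open("example.jpg").convert("RGB")
result = caption_image(image)
print(result["caption"])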
app/model.py
CHANGED

@@ -1,108 +1,31 @@
 from transformers import CLIPProcessor, CLIPModel
-import torch
 from PIL import Image
-import
-import time
-from typing import Dict, Any
-import gc
+import torch
 
 MODEL_NAME = "openai/clip-vit-base-patch16"
-CATEGORIES = ["food", "fitness", "healthcare"]
-MAX_RETRIES = 3
-RETRY_DELAY = 1  # seconds
-
-class ImageAnalyzer:
-    def __init__(self):
-        self.processor = None
-        self.model = None
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        logging.info(f"Using device: {self.device}")
-        self._initialize_model()
-
-    def _initialize_model(self):
-        for attempt in range(MAX_RETRIES):
-            try:
-                # Clear CUDA cache if using GPU
-                if torch.cuda.is_available():
-                    torch.cuda.empty_cache()
-                    gc.collect()
-
-                self.processor = CLIPProcessor.from_pretrained(MODEL_NAME)
-                self.model = CLIPModel.from_pretrained(MODEL_NAME).to(self.device)
-
-                # Verify model loaded correctly
-                if self.model is None or self.processor is None:
-                    raise RuntimeError("Model or processor initialization failed")
-
-                logging.info(f"Model loaded successfully on {self.device} (attempt {attempt + 1})")
-                return
-
-            except Exception as e:
-                logging.error(f"Attempt {attempt + 1} failed to load model: {str(e)}")
-                if attempt < MAX_RETRIES - 1:
-                    time.sleep(RETRY_DELAY)
-                    continue
-        raise RuntimeError(f"Failed to initialize the image analysis model after {MAX_RETRIES} attempts")
 
-[… old lines 47–53 (analysis method signature) not captured in this view …]
-        try:
-            # Clear CUDA cache if using GPU
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-                gc.collect()
-
-            # Prepare inputs for CLIP
-            inputs = self.processor(
-                text=CATEGORIES,
-                images=image,
-                return_tensors="pt",
-                padding=True
-            )
-
-            # Move inputs to the same device as model
-            inputs = {k: v.to(self.device) if hasattr(v, 'to') else v
-                      for k, v in inputs.items()}
-
-            # Process with error handling and memory management
-            try:
-                with torch.no_grad():
-                    outputs = self.model(**inputs)
-                    logits_per_image = outputs.logits_per_image
-                    probs = logits_per_image.softmax(dim=1).cpu().numpy()[0]
-
-                # Get top 2 predictions for more informative results
-                top_indices = probs.argsort()[-2:][::-1]
-                predictions = [
-                    {
-                        "category": CATEGORIES[idx],
-                        "confidence": round(float(probs[idx]), 4)
-                    }
-                    for idx in top_indices
-                ]
-
-                return {
-                    "primary_prediction": predictions[0],
-                    "alternative_prediction": predictions[1],
-                    "status": "success"
-                }
-            finally:
-                # Clean up tensors
-                if torch.cuda.is_available():
-                    torch.cuda.empty_cache()
-                    gc.collect()
-        except Exception as e:
-            logging.error(f"Error during image analysis: {str(e)}")
-            raise RuntimeError(f"Failed to analyze image: {str(e)}")
-
-# Create a single instance to be used by the API
-analyzer = ImageAnalyzer()
 
-# Function to be used by the API
+# Load model and processor only once at startup
+processor = CLIPProcessor.from_pretrained(MODEL_NAME)
+model = CLIPModel.from_pretrained(MODEL_NAME)
+
+# Define the categories to classify into
+CATEGORIES = ["food", "fitness", "healthcare"]
+
 def analyze_image(image: Image.Image):
-[… old line 108 (function body) not captured in this view …]
+    # Preprocess input
+    inputs = processor(
+        text=CATEGORIES,
+        images=image,
+        return_tensors="pt",
+        padding=True
+    )
+
+    with torch.no_grad():
+        outputs = model(**inputs)
+    logits_per_image = outputs.logits_per_image
+    probs = logits_per_image.softmax(dim=1).cpu().numpy()[0]
+    best_idx = probs.argmax()
+    return {
+        "category": CATEGORIES[best_idx],
+        "confidence": round(float(probs[best_idx]), 4)
+    }
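The slimmed-down classifier can likewise be called directly; a minimal sketch, assuming a local image file and the module importable as app.model:

from PIL import Image
from app.model import analyze_image

image = Image.open("example.jpg").convert("RGB")
print(analyze_image(image))
# e.g. {"category": "food", "confidence": 0.9731}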
requirements.txt
CHANGED

@@ -1,8 +1,20 @@
-[… old lines 1–8 (previous requirements) not captured in this view …]
+# Web framework and server
+fastapi
+uvicorn
+
+# Core ML dependencies
+torch
+torchvision
+transformers
+
+# Image processing
+Pillow
+
+# Utilities
+python-multipart
+numpy
+tqdm
+requests
+safetensors
+typing-extensions
+pydantic