Abhishek7356 commited on
Commit
d12790d
·
1 Parent(s): a260940

Creating new project for product categorisation

Browse files
.gitignore ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Virtual environment
7
+ venv/
8
+ env/
9
+ .venv/
10
+ ENV/
11
+ env.bak/
12
+ venv.bak/
13
+
14
+ # VS Code settings
15
+ .vscode/
16
+
17
+ # Distribution / packaging
18
+ build/
19
+ develop-eggs/
20
+ dist/
21
+ downloads/
22
+ eggs/
23
+ .eggs/
24
+ lib/
25
+ lib64/
26
+ parts/
27
+ sdist/
28
+ var/
29
+ *.egg-info/
30
+ .installed.cfg
31
+ *.egg
32
+
33
+ # PyInstaller
34
+ # Usually these files are written by a python script from a template
35
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
36
+ *.manifest
37
+ *.spec
38
+
39
+ # Installer logs
40
+ pip-log.txt
41
+ pip-delete-this-directory.txt
42
+
43
+ # Unit test / coverage reports
44
+ htmlcov/
45
+ .tox/
46
+ .nox/
47
+ .coverage
48
+ .coverage.*
49
+ .cache
50
+ nosetests.xml
51
+ coverage.xml
52
+ *.cover
53
+ *.py,cover
54
+ .hypothesis/
55
+ .pytest_cache/
56
+
57
+ # Jupyter Notebook checkpoints
58
+ .ipynb_checkpoints
59
+
60
+ # mypy
61
+ .mypy_cache/
62
+ .dmypy.json
63
+ dmypy.json
64
+
65
+ # Pyre type checker
66
+ .pyre/
67
+
68
+ # pytype
69
+ .pytype/
70
+
71
+ # Cython debug symbols
72
+ cython_debug/
73
+
74
+ # Logs and local data
75
+ *.log
76
+ *.sqlite3
77
+
78
+ # Environment files
79
+ .env
80
+ .env.*
81
+ *.env
82
+
83
+ # OS-specific
84
+ .DS_Store
85
+ Thumbs.db
Dockerfile ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Use Python 3.10 (Hugging Face supports this)
FROM python:3.10

# Create a non-root user (Hugging Face Docker Spaces run as UID 1000)
RUN useradd -m -u 1000 user
USER user

# As a non-root user, pip falls back to a per-user install, which places
# console scripts such as `uvicorn` in ~/.local/bin. Put that directory on
# PATH so the CMD below can find the executable.
ENV PATH="/home/user/.local/bin:$PATH"

# Set working directory
WORKDIR /app

# Copy dependencies first so the install layer is cached across code changes
COPY --chown=user requirements.txt .

# Install dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code together with the runtime assets it needs:
#   - src/config.py validates that the files under models/ exist at import time
#   - src/api.py renders templates/index.html for the root page
COPY --chown=user ./src ./src
COPY --chown=user ./models ./models
COPY --chown=user ./templates ./templates

# Expose the port (Hugging Face Spaces use 7860)
EXPOSE 7860

# Run the app with uvicorn
CMD ["uvicorn", "src.api:app", "--host", "0.0.0.0", "--port", "7860"]
models/categories_processed.csv ADDED
The diff for this file is too large to render. See raw diff
 
models/category_embeddings_mpnet.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9d5292d260ce14beadb6f8f8a0f75f96e5cf355a384325a3ce24116c9b378b1
3
+ size 102310016
models/category_metadata.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52f6eb174e166b0ddb618bf92ae9f0584366e8c60f97f86af3a8c275a7f2ffdd
3
+ size 10085806
models/config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "sentence-transformers/all-mpnet-base-v2",
3
+ "embedding_dimension": 768,
4
+ "total_categories": 33304,
5
+ "preprocessing_strategy": "rich",
6
+ "thresholds": {
7
+ "auto_approve": 0.75,
8
+ "quick_review": 0.6
9
+ },
10
+ "boost_factor": 0.15,
11
+ "created_date": "2025-01-15"
12
+ }
requirements.txt ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core ML Dependencies
2
+ sentence-transformers==3.0.0
3
+ torch>=2.0.0
4
+ numpy>=1.24.0
5
+ scikit-learn>=1.3.0
6
+
7
+ # API Framework
8
+ fastapi==0.104.1
9
+ uvicorn[standard]==0.24.0
10
+ pydantic==2.5.0
11
+ python-multipart==0.0.6
12
+
13
+ # Data Processing
14
+ pandas>=2.0.0
15
+
16
+ # Optional but Recommended
17
+ python-dotenv==1.0.0
18
+
19
+ # For Production (optional for now)
20
+ # pymongo>=4.5.0 # If using MongoDB
21
+ # redis>=5.0.0 # If using Redis caching
22
+ # gunicorn>=21.2.0 # For production server
src/__init__.py ADDED
File without changes
src/api.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # """
2
+ # FastAPI REST API for Product Classification
3
+ # """
4
+ from fastapi.templating import Jinja2Templates
5
+ from fastapi.responses import HTMLResponse, JSONResponse
6
+ from starlette.requests import Request
7
+
8
+ from fastapi import FastAPI, HTTPException, status
9
+ from fastapi.middleware.cors import CORSMiddleware
10
+ from pydantic import BaseModel, Field
11
+ from typing import List, Optional
12
+ import logging
13
+ import time
14
+
15
+ # from classifier import ProductClassifier
16
+ # from config import API_TITLE, API_VERSION, API_DESCRIPTION, validate_files
17
+ from .classifier import ProductClassifier
18
+ from .config import API_TITLE, API_VERSION, API_DESCRIPTION, validate_files
19
+
20
+ # Set up logging
21
+ logging.basicConfig(
22
+ level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
23
+ )
24
+ logger = logging.getLogger(__name__)
25
+
26
+ # Validate files exist before starting
27
+ try:
28
+ validate_files()
29
+ logger.info("✅ All required model files found")
30
+ except FileNotFoundError as e:
31
+ logger.error(f"❌ Missing files: {e}")
32
+ raise
33
+
# Create FastAPI app
app = FastAPI(title=API_TITLE, version=API_VERSION, description=API_DESCRIPTION)

# Resolve the templates directory from the project root rather than the
# process working directory, so the UI loads no matter where the server is
# started from.
from .config import BASE_DIR

templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))

# Add CORS middleware (allows frontend to access API).
# Credentials are disabled on purpose: combining wildcard origins with
# allow_credentials=True would let any website make credentialed requests.
# This API uses no cookies or auth headers, so nothing is lost. If
# credentials are ever needed, list the allowed origins explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, specify actual origins
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
45
+
46
+ # Initialize classifier (loaded once at startup)
47
+ classifier = None
48
+
49
+
50
+ # Pydantic models for request/response validation
class ProductInput(BaseModel):
    """Input model for single product classification.

    Only ``title`` is required; every other field is optional and falls back
    to an empty value (or ``"unknown"`` for ``id``) when omitted.
    """

    id: Optional[str] = Field(default="unknown", description="Product ID")
    title: str = Field(..., description="Product title", min_length=1)
    product_type: Optional[str] = Field(default="", description="Product type/category")
    vendor: Optional[str] = Field(default="", description="Brand or vendor name")
    # default_factory gives every instance its own list instead of relying on
    # a shared mutable literal.
    tags: Optional[List[str]] = Field(default_factory=list, description="Product tags")
    description: Optional[str] = Field(default="", description="Product description")

    # Pydantic v2 configuration. The class-based ``Config`` form is deprecated
    # in v2 and emits a deprecation warning.
    model_config = {
        "json_schema_extra": {
            "example": {
                "id": "prod_123",
                "title": "Apple iPhone 15 Pro",
                "product_type": "Smartphone",
                "vendor": "Apple Inc",
                "tags": ["electronics", "phone", "mobile"],
                "description": "Latest flagship smartphone",
            }
        }
    }
72
+
73
+
class CategoryResult(BaseModel):
    """One ranked category candidate returned for a product."""

    # Position of this candidate in the ranking
    rank: int
    # Identifier and full path of the matched category
    category_id: str
    category_path: str
    # Final score expressed as a percentage
    confidence_percentage: float
    # Optional score details; absent values default to None
    semantic_score: Optional[float] = None
    boost_applied: Optional[float] = None
83
+
84
+
class ClassificationResponse(BaseModel):
    """Payload returned by the single-product classification endpoint."""

    product_id: str
    # Recommended handling for the product and the explanation for it
    action: str
    reason: str
    # Best match and its confidence
    top_category: str
    top_confidence: float
    # The text that was actually embedded for the product
    product_text: str
    # Runner-up categories
    alternatives: List[CategoryResult]
    # Filled in by the endpoint after timing the classification call
    processing_time_ms: Optional[float] = None
96
+
97
+
class BatchProductInput(BaseModel):
    """Request body for classifying several products in one call."""

    # Products to classify, each validated as a ProductInput
    products: List[ProductInput] = Field(
        ...,
        description="List of products to classify",
    )
    # Number of ranked matches requested per product (1 to 20, default 5)
    top_k: int = Field(
        default=5,
        ge=1,
        le=20,
        description="Number of top matches to return",
    )
107
+
108
+
class HealthResponse(BaseModel):
    """Shape of the health-check response."""

    # Overall service status string
    status: str
    # Name of the embedding model in use
    model: str
    # Number of category embeddings loaded and their vector width
    categories_loaded: int
    embedding_dimension: int
116
+
117
+
118
+ # Startup event - load classifier
@app.on_event("startup")
async def startup_event():
    """Build the module-level classifier once, when the server starts.

    Any failure while constructing the classifier is logged and re-raised,
    which aborts startup.
    """
    global classifier

    for banner in ("🚀 Starting API server...", "Loading Product Classifier..."):
        logger.info(banner)

    try:
        classifier = ProductClassifier()
    except Exception as e:
        logger.error(f"❌ Failed to load classifier: {e}")
        raise
    else:
        logger.info("✅ Classifier loaded successfully!")
132
+
133
+
134
+ # Root endpoint
135
+ # @app.get("/", tags=["General"])
136
+ # async def root():
137
+ # """Root endpoint - API information"""
138
+ # return {
139
+ # "message": "Insurance Product Classification API",
140
+ # "version": API_VERSION,
141
+ # "status": "running",
142
+ # "docs": "/docs",
143
+ # "health": "/health",
144
+ # }
@app.get("/", response_class=HTMLResponse, tags=["General"])
async def root(request: Request):
    """Render the bundled web UI (index.html)."""
    # The template context carries the incoming request under "request".
    context = {"request": request}
    return templates.TemplateResponse("index.html", context)
149
+
150
+
151
+ # Health check endpoint
@app.get("/health", response_model=HealthResponse, tags=["General"])
async def health_check():
    """
    Report service health together with basic model information.

    Responds with 503 while the classifier has not been loaded.
    """
    if classifier is None:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Classifier not initialized",
        )

    category_vectors = classifier.embeddings
    return {
        "status": "healthy",
        "model": "all-mpnet-base-v2",
        "categories_loaded": len(category_vectors),
        "embedding_dimension": category_vectors.shape[1],
    }
170
+
171
+
172
+ # Single product classification
@app.post("/classify", response_model=ClassificationResponse, tags=["Classification"])
async def classify_product(product: ProductInput):
    """
    Classify a single product into insurance categories

    Returns:
    - action: AUTO_APPROVE, QUICK_REVIEW, or MANUAL_CATEGORIZATION
    - top_category: Best matching category
    - confidence: Confidence score (0-100%)
    - alternatives: Top alternative categories

    Raises:
    - 503 if the classifier has not been initialized
    - 500 if classification fails for any reason
    """
    if classifier is None:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Classifier not initialized",
        )

    try:
        # perf_counter is monotonic, so the measured duration cannot go
        # negative or jump if the system clock is adjusted mid-request.
        start_time = time.perf_counter()

        # model_dump() is the Pydantic v2 replacement for the deprecated .dict()
        result = classifier.classify(product.model_dump())

        # Calculate processing time
        processing_time = (time.perf_counter() - start_time) * 1000  # Convert to ms
        result["processing_time_ms"] = round(processing_time, 2)

        logger.info(
            f"Classified product '{product.title}' → "
            f"{result['action']} ({result['top_confidence']}%)"
        )

        return result

    except Exception as e:
        # logger.exception records the traceback alongside the message
        logger.exception(f"Classification error: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Classification failed: {str(e)}",
        ) from e
214
+
215
+
216
+ # Batch product classification
@app.post("/classify-batch", tags=["Classification"])
async def classify_batch(batch: BatchProductInput):
    """
    Classify multiple products at once

    Useful for bulk processing of product catalogs

    Returns a summary with the number of products, the total processing time
    in milliseconds, a count of results per action, and the per-product
    results.

    Raises:
    - 503 if the classifier has not been initialized
    - 500 if the batch call fails as a whole
    """
    if classifier is None:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Classifier not initialized",
        )

    try:
        # Monotonic clock: immune to system clock adjustments
        start_time = time.perf_counter()

        # Convert to list of dicts (model_dump() replaces the deprecated .dict())
        products_data = [p.model_dump() for p in batch.products]

        # Classify batch
        results = classifier.classify_batch(products_data, top_k=batch.top_k)

        # Calculate stats
        processing_time = (time.perf_counter() - start_time) * 1000

        # Count how many products ended up with each action
        action_counts = {}
        for result in results:
            action = result.get("action", "UNKNOWN")
            action_counts[action] = action_counts.get(action, 0) + 1

        logger.info(
            f"Batch classified {len(products_data)} products in {processing_time:.0f}ms"
        )

        return {
            "total_products": len(products_data),
            "processing_time_ms": round(processing_time, 2),
            "action_counts": action_counts,
            "results": results,
        }

    except Exception as e:
        # logger.exception records the traceback alongside the message
        logger.exception(f"Batch classification error: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Batch classification failed: {str(e)}",
        ) from e
265
+
266
+
267
+ # Get statistics
@app.get("/stats", tags=["General"])
async def get_statistics():
    """
    Get system statistics

    The threshold labels are derived from the values in the config module so
    they cannot drift from the thresholds the classifier actually applies.
    Responds with 503 while the classifier has not been loaded.
    """
    if classifier is None:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Classifier not initialized",
        )

    # Imported here because this module's top-level imports do not include
    # the threshold constants.
    from .config import AUTO_APPROVE_THRESHOLD, QUICK_REVIEW_THRESHOLD

    # ":g" drops the trailing ".0", so 0.75 is rendered as "75"
    auto_pct = f"{AUTO_APPROVE_THRESHOLD * 100:g}"
    review_pct = f"{QUICK_REVIEW_THRESHOLD * 100:g}"

    return {
        "total_categories": len(classifier.embeddings),
        "embedding_dimension": classifier.embeddings.shape[1],
        "model_name": "all-mpnet-base-v2",
        "thresholds": {
            "auto_approve": f"≥{auto_pct}%",
            "quick_review": f"{review_pct}-{auto_pct}%",
            "manual": f"<{review_pct}%",
        },
    }
289
+
290
+
291
+ # Error handlers
292
+ from fastapi.responses import JSONResponse
293
+
294
+
@app.exception_handler(404)
async def not_found_handler(request, exc):
    """Return a JSON body pointing at the docs for 404 responses."""
    payload = {
        "error": "Endpoint not found",
        "message": "Check /docs for available endpoints",
    }
    return JSONResponse(status_code=404, content=payload)
305
+
306
+
@app.exception_handler(500)
async def internal_error_handler(request, exc):
    """Log the error and return a generic JSON body for 500 responses."""
    logger.error(f"Internal server error: {exc}")
    payload = {
        "error": "Internal server error",
        "message": "Something went wrong. Check logs for details.",
    }
    return JSONResponse(status_code=500, content=payload)
318
+
319
+
# Run from the project root with: uvicorn src.api:app --reload
# (or: python -m src.api). This module uses package-relative imports, so it
# has to be loaded as "src.api" - a bare "api:app" target cannot be imported.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run("src.api:app", host="0.0.0.0", port=8000, reload=True, log_level="info")
src/classifier.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # # src/classifier.py
2
+ # from sentence_transformers import SentenceTransformer
3
+ # import numpy as np
4
+ # import pickle
5
+
6
+
7
+ # class ProductClassifier:
8
+ # def __init__(self, model_path="./models"):
9
+ # self.model = SentenceTransformer("all-mpnet-base-v2")
10
+ # self.embeddings = np.load(f"{model_path}/category_embeddings_mpnet.npy")
11
+ # with open(f"{model_path}/category_metadata.pkl", "rb") as f:
12
+ # self.metadata = pickle.load(f)
13
+
14
+ # def classify(self, product_data, top_k=5):
15
+ # # Implementation here
16
+ # pass
17
+
18
+
19
+ # """
20
+ # Product Classification Engine
21
+ # Loads pre-trained embeddings and performs similarity-based classification
22
+ # """
23
+ import numpy as np
24
+ import pickle
25
+ from sentence_transformers import SentenceTransformer
26
+ from sklearn.metrics.pairwise import cosine_similarity
27
+ from typing import Dict, List, Optional
28
+ import re
29
+ import logging
30
+
31
+ from .config import (
32
+ MODEL_NAME,
33
+ EMBEDDINGS_FILE,
34
+ METADATA_FILE,
35
+ AUTO_APPROVE_THRESHOLD,
36
+ QUICK_REVIEW_THRESHOLD,
37
+ BOOST_FACTOR,
38
+ MAX_BOOST,
39
+ DEFAULT_TOP_K,
40
+ PRODUCT_KEYWORDS,
41
+ )
42
+
43
+ # Set up logging
44
+ logging.basicConfig(level=logging.INFO)
45
+ logger = logging.getLogger(__name__)
46
+
47
+
class ProductClassifier:
    """
    ML-powered product classifier for insurance categorization

    Embeds a product description with a sentence-transformer model, compares
    it against pre-computed category embeddings using cosine similarity,
    optionally boosts categories whose text shares keywords with the product,
    and maps the best score to a recommended action.
    """

    def __init__(self):
        """Initialize classifier by loading model and embeddings"""
        logger.info("Initializing Product Classifier...")

        # Load the embedding model
        logger.info(f"Loading model: {MODEL_NAME}")
        self.model = SentenceTransformer(MODEL_NAME)
        logger.info(
            f"✅ Model loaded (dimension: {self.model.get_sentence_embedding_dimension()})"
        )

        # Load pre-computed category embeddings
        logger.info(f"Loading category embeddings from {EMBEDDINGS_FILE}")
        self.embeddings = np.load(EMBEDDINGS_FILE)
        logger.info(f"✅ Loaded {self.embeddings.shape[0]:,} category embeddings")

        # Load category metadata.
        # NOTE(security): pickle.load can execute arbitrary code. This is only
        # acceptable because the file is a trusted artifact shipped with the
        # project; never point METADATA_FILE at externally supplied data.
        logger.info(f"Loading metadata from {METADATA_FILE}")
        with open(METADATA_FILE, "rb") as f:
            self.metadata = pickle.load(f)
        logger.info("✅ Metadata loaded")

        # Category texts used for keyword boosting (empty if not in metadata)
        self.embedding_texts = self.metadata.get("embedding_texts", [])

        logger.info("🎉 Classifier ready!")

    def preprocess_product(self, product_data: Dict) -> str:
        """
        Preprocess product data into searchable text

        Args:
            product_data: Dictionary with product fields
                - title (str): Product title
                - product_type (str, optional): Product type/category
                - vendor (str, optional): Brand/vendor name
                - tags (list/str, optional): Product tags
                - description (str, optional): Product description

        Returns:
            Processed text string for embedding; the non-empty fields are
            joined with ". " in the order listed above.
        """
        parts = []

        # Extract fields in priority order
        title = product_data.get("title", "")
        product_type = product_data.get("product_type", "")
        vendor = product_data.get("vendor", "")
        description = product_data.get("description", "")
        tags = product_data.get("tags", [])

        # 1. Title (most important)
        if title:
            parts.append(title)

        # 2. Product type (category hint)
        if product_type:
            parts.append(f"Product type: {product_type}")

        # 3. Brand/Vendor
        if vendor:
            parts.append(f"Brand: {vendor}")

        # 4. Tags (keywords). Sequences are joined with spaces; str() keeps a
        # stray non-string tag from raising inside join().
        if tags:
            if isinstance(tags, (list, tuple)):
                tag_text = " ".join(str(tag) for tag in tags)
            else:
                tag_text = tags
            parts.append(f"Keywords: {tag_text}")

        # 5. Description (limited to 100 chars)
        if description:
            desc_short = description[:100].strip()
            # Skip a description that is only whitespace, so the text does
            # not end with an empty trailing segment.
            if desc_short:
                parts.append(desc_short)

        return ". ".join(parts)

    def extract_keywords(self, text: str) -> List[str]:
        """
        Extract important keywords from product text

        Args:
            text: Product text

        Returns:
            List of detected keywords (every entry of PRODUCT_KEYWORDS that
            occurs in the lowercased text)
        """
        # NOTE(review): this is plain substring matching, so a keyword also
        # matches inside longer words (e.g. "book" inside "macbook"). Confirm
        # whether word-boundary matching is wanted before changing it.
        text_lower = text.lower()
        return [kw for kw in PRODUCT_KEYWORDS if kw in text_lower]

    def classify(
        self, product_data: Dict, top_k: int = DEFAULT_TOP_K, use_boost: bool = True
    ) -> Dict:
        """
        Classify a product into insurance categories

        Args:
            product_data: Product information dictionary
            top_k: Number of top matches to return (must be at least 1)
            use_boost: Whether to apply keyword boosting

        Returns:
            Classification results with confidence scores and recommendations.
            All values are plain Python types, so the result can be serialized
            to JSON directly.

        Raises:
            ValueError: If top_k is smaller than 1.
        """
        # A top_k of 0 would turn the slice below into [-0:], which selects
        # every category instead of none, so reject it up front.
        if top_k < 1:
            raise ValueError(f"top_k must be >= 1, got {top_k}")

        # Preprocess product text
        product_text = self.preprocess_product(product_data)

        # Generate embedding for product
        product_embedding = self.model.encode([product_text], normalize_embeddings=True)

        # Calculate semantic similarities
        semantic_scores = cosine_similarity(product_embedding, self.embeddings)[0]

        # Apply keyword boosting if enabled
        if use_boost:
            product_keywords = self.extract_keywords(product_text)
            boosted_scores = self._apply_keyword_boost(
                semantic_scores, product_keywords
            )
        else:
            boosted_scores = semantic_scores

        # Get top K indices, best score first
        top_indices = boosted_scores.argsort()[-top_k:][::-1]

        # Format results
        results = []
        for rank, idx in enumerate(top_indices, 1):
            # Convert to built-in floats once; numpy scalars are not
            # JSON-serializable.
            semantic = float(semantic_scores[idx])
            final = float(boosted_scores[idx])

            category_data = {
                "rank": rank,
                "category_id": self.metadata["category_ids"][idx],
                "category_path": self.metadata["category_paths"][idx],
                "semantic_score": semantic,
                "final_score": final,
                "confidence_percentage": round(final * 100, 2),
            }

            # Add boost information if used
            if use_boost:
                category_data["boost_applied"] = round((final - semantic) * 100, 2)

            results.append(category_data)

        # Determine action based on top score
        best = results[0]
        top_confidence = best["final_score"]

        if top_confidence >= AUTO_APPROVE_THRESHOLD:
            action = "AUTO_APPROVE"
            reason = f"High confidence ({best['confidence_percentage']}%)"
        elif top_confidence >= QUICK_REVIEW_THRESHOLD:
            action = "QUICK_REVIEW"
            reason = f"Medium confidence ({best['confidence_percentage']}%) - verify category"
        else:
            action = "MANUAL_CATEGORIZATION"
            reason = f"Low confidence ({best['confidence_percentage']}%) - needs expert review"

        result = {
            # "or" also covers an explicit id of None, not just a missing key
            "product_id": product_data.get("id") or "unknown",
            "product_text": product_text,
            "action": action,
            "reason": reason,
            "top_category": best["category_path"],
            "top_confidence": best["confidence_percentage"],
            "alternatives": results[1:3] if len(results) > 1 else [],
            "all_results": results,
        }

        # Metadata values come from a pickle and may be numpy scalars
        # (assumption - depends on how the file was produced), so normalize
        # the whole result to native Python types.
        return self._convert_to_json_serializable(result)

    def _lowercased_category_texts(self) -> List[str]:
        """Return the category texts lowercased, computing them only once.

        The cache is keyed on the identity of ``self.embedding_texts`` so it is
        rebuilt if that attribute is replaced.
        """
        source = self.embedding_texts
        cache = getattr(self, "_embedding_texts_lower", None)
        if cache is None or cache[0] is not source:
            cache = (source, [text.lower() for text in source])
            self._embedding_texts_lower = cache
        return cache[1]

    def _apply_keyword_boost(
        self, scores: np.ndarray, product_keywords: List[str]
    ) -> np.ndarray:
        """
        Apply keyword-based score boosting

        Args:
            scores: Original semantic similarity scores
            product_keywords: List of keywords found in product

        Returns:
            Boosted scores (a copy; the input array is left untouched). Each
            boost is capped at MAX_BOOST and each boosted score at 1.0.
        """
        boosted_scores = scores.copy()

        if not product_keywords:
            return boosted_scores

        # Boost categories that contain product keywords. The lowercased
        # category texts are cached, so they are not recomputed per request.
        for idx, cat_text_lower in enumerate(self._lowercased_category_texts()):
            matches = sum(1 for kw in product_keywords if kw in cat_text_lower)

            if matches > 0:
                # Boost proportional to keyword matches
                boost = min(matches * BOOST_FACTOR, MAX_BOOST)
                boosted_scores[idx] = min(boosted_scores[idx] + boost, 1.0)

        return boosted_scores

    def classify_batch(
        self, products: List[Dict], top_k: int = DEFAULT_TOP_K
    ) -> List[Dict]:
        """
        Classify multiple products at once

        A failure on one product does not abort the batch: that product gets
        an entry with action "ERROR" and the exception text as the reason.

        Args:
            products: List of product data dictionaries
            top_k: Number of top matches per product

        Returns:
            List of classification results, one per input product, in order
        """
        logger.info(f"Classifying batch of {len(products)} products...")

        results = []
        for i, product in enumerate(products, 1):
            try:
                result = self.classify(product, top_k=top_k)

                # Convert all numpy types to Python native types for JSON serialization
                result = self._convert_to_json_serializable(result)

                results.append(result)

                if i % 100 == 0:
                    logger.info(f" Processed {i}/{len(products)} products")

            except Exception as e:
                logger.error(f" Error classifying product {i}: {e}")
                # Keep the same keys as a successful result so consumers can
                # iterate over the batch without special-casing failures.
                results.append(
                    {
                        "product_id": product.get("id", f"product_{i}"),
                        "product_text": "",
                        "action": "ERROR",
                        "reason": str(e),
                        "top_category": None,
                        "top_confidence": 0.0,
                        "alternatives": [],
                        "all_results": [],
                    }
                )

        logger.info("✅ Batch classification complete!")
        return results

    def _convert_to_json_serializable(self, obj):
        """
        Recursively convert numpy types to Python native types

        Dicts are rebuilt with converted values, lists and tuples become
        lists, numpy arrays become nested lists, and any numpy scalar
        (integer, float, bool, ...) becomes its built-in equivalent. Anything
        else is returned unchanged.
        """
        if isinstance(obj, dict):
            return {
                key: self._convert_to_json_serializable(value)
                for key, value in obj.items()
            }
        if isinstance(obj, (list, tuple)):
            return [self._convert_to_json_serializable(item) for item in obj]
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            # np.generic is the base class of every numpy scalar type
            return obj.item()
        return obj
315
+
316
+
317
+ # Test the classifier if run directly
318
+ if __name__ == "__main__":
319
+ print("Testing Product Classifier...")
320
+ print("=" * 80)
321
+
322
+ # Initialize classifier
323
+ classifier = ProductClassifier()
324
+
325
+ # Test product
326
+ test_product = {
327
+ "id": "test_001",
328
+ "title": "Apple iPhone 15 Pro Max",
329
+ "product_type": "Smartphone",
330
+ "vendor": "Apple Inc",
331
+ "tags": ["electronics", "mobile", "phone", "smartphone"],
332
+ "description": "Latest flagship smartphone with titanium design",
333
+ }
334
+
335
+ print("\n📱 Test Product:")
336
+ print(f" {test_product['title']}")
337
+
338
+ # Classify
339
+ result = classifier.classify(test_product)
340
+
341
+ print(f"\n🎯 Classification Result:")
342
+ print(f" Action: {result['action']}")
343
+ print(f" Top Category: {result['top_category']}")
344
+ print(f" Confidence: {result['top_confidence']}%")
345
+ print(f" Reason: {result['reason']}")
346
+
347
+ print("\n📊 Top 3 Alternatives:")
348
+ for alt in result["alternatives"][:3]:
349
+ print(
350
+ f" {alt['rank']}. {alt['category_path']} ({alt['confidence_percentage']}%)"
351
+ )
352
+
353
+ print("\n" + "=" * 80)
354
+ print("✅ Classifier test complete!")
src/config.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # """
2
+ # Configuration settings for the insurance product classifier
3
+ # """
4
+
5
+ import os
6
+ from pathlib import Path
7
+
8
+ # Base directory (project root)
9
+ BASE_DIR = Path(__file__).resolve().parent.parent
10
+
11
+ # Model directory
12
+ MODEL_DIR = BASE_DIR / "models"
13
+
14
+ # Model files
15
+ EMBEDDINGS_FILE = MODEL_DIR / "category_embeddings_mpnet.npy"
16
+ METADATA_FILE = MODEL_DIR / "category_metadata.pkl"
17
+ CONFIG_FILE = MODEL_DIR / "config.json"
18
+
19
+ # Model configuration
20
+ MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
21
+ EMBEDDING_DIMENSION = 768
22
+
23
+ # Classification thresholds
24
+ AUTO_APPROVE_THRESHOLD = 0.75 # 75% confidence
25
+ QUICK_REVIEW_THRESHOLD = 0.60 # 60% confidence
26
+
27
+ # Keyword boosting
28
+ BOOST_FACTOR = 0.15 # 15% boost for keyword matches
29
+ MAX_BOOST = 0.30 # Maximum 30% total boost
30
+
31
+ # API settings
32
+ API_TITLE = "Insurance Product Classification API"
33
+ API_VERSION = "1.0.0"
34
+ API_DESCRIPTION = "ML-powered product categorization for insurance underwriting"
35
+
36
+ # Processing settings
37
+ DEFAULT_TOP_K = 5 # Return top 5 matches
38
+ BATCH_SIZE = 32 # For batch processing
39
+
40
+ # Keywords for boosting
41
+ PRODUCT_KEYWORDS = {
42
+ # Electronics
43
+ "iphone",
44
+ "ipad",
45
+ "macbook",
46
+ "smartphone",
47
+ "laptop",
48
+ "tablet",
49
+ "computer",
50
+ "electronics",
51
+ "phone",
52
+ "mobile",
53
+ "samsung",
54
+ "apple",
55
+ "dell",
56
+ "hp",
57
+ # Appliances
58
+ "refrigerator",
59
+ "dishwasher",
60
+ "washing machine",
61
+ "dryer",
62
+ "oven",
63
+ "microwave",
64
+ "coffee maker",
65
+ "blender",
66
+ "toaster",
67
+ "appliance",
68
+ # Clothing
69
+ "shoes",
70
+ "shirt",
71
+ "pants",
72
+ "dress",
73
+ "jacket",
74
+ "sneakers",
75
+ "boots",
76
+ "clothing",
77
+ "apparel",
78
+ "footwear",
79
+ # Books
80
+ "book",
81
+ "novel",
82
+ "textbook",
83
+ "ebook",
84
+ "reading",
85
+ "literature",
86
+ # Sports
87
+ "sports",
88
+ "fitness",
89
+ "exercise",
90
+ "gym",
91
+ "athletic",
92
+ "running",
93
+ "yoga",
94
+ # Home
95
+ "furniture",
96
+ "decor",
97
+ "bedding",
98
+ "kitchen",
99
+ "home",
100
+ "garden",
101
+ }
102
+
103
+
def validate_files():
    """Check that every required model file is present.

    Returns:
        True when the embeddings, metadata and config files all exist.

    Raises:
        FileNotFoundError: Listing each path that is missing.
    """
    missing_files = [
        str(candidate)
        for candidate in (EMBEDDINGS_FILE, METADATA_FILE, CONFIG_FILE)
        if not candidate.exists()
    ]

    if not missing_files:
        return True

    listing = "\n".join(f" - {f}" for f in missing_files)
    raise FileNotFoundError("Missing required files:\n" + listing)
119
+
120
+
121
+ if __name__ == "__main__":
122
+ print("Configuration Settings:")
123
+ print(f" Model Directory: {MODEL_DIR}")
124
+ print(f" Embeddings File: {EMBEDDINGS_FILE.name}")
125
+ print(f" Metadata File: {METADATA_FILE.name}")
126
+ print(f" Auto-Approve Threshold: {AUTO_APPROVE_THRESHOLD * 100}%")
127
+ print(f" Quick Review Threshold: {QUICK_REVIEW_THRESHOLD * 100}%")
128
+
129
+ try:
130
+ validate_files()
131
+ print("\n✅ All required files found!")
132
+ except FileNotFoundError as e:
133
+ print(f"\n❌ Error: {e}")
templates/index.html ADDED
@@ -0,0 +1,615 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Insurance Product Classification System</title>
7
+ <style>
8
+ * {
9
+ margin: 0;
10
+ padding: 0;
11
+ box-sizing: border-box;
12
+ }
13
+
14
+ body {
15
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
16
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
17
+ min-height: 100vh;
18
+ padding: 20px;
19
+ }
20
+
21
+ .container {
22
+ max-width: 1200px;
23
+ margin: 0 auto;
24
+ }
25
+
26
+ .header {
27
+ background: white;
28
+ border-radius: 15px;
29
+ padding: 30px;
30
+ margin-bottom: 30px;
31
+ box-shadow: 0 10px 30px rgba(0,0,0,0.2);
32
+ text-align: center;
33
+ }
34
+
35
+ .header h1 {
36
+ color: #667eea;
37
+ font-size: 2.5em;
38
+ margin-bottom: 10px;
39
+ }
40
+
41
+ .header p {
42
+ color: #666;
43
+ font-size: 1.1em;
44
+ }
45
+
46
+ .stats-grid {
47
+ display: grid;
48
+ grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
49
+ gap: 20px;
50
+ margin-bottom: 30px;
51
+ }
52
+
53
+ .stat-card {
54
+ background: white;
55
+ border-radius: 15px;
56
+ padding: 25px;
57
+ box-shadow: 0 5px 15px rgba(0,0,0,0.1);
58
+ transition: transform 0.3s;
59
+ }
60
+
61
+ .stat-card:hover {
62
+ transform: translateY(-5px);
63
+ }
64
+
65
+ .stat-card h3 {
66
+ color: #667eea;
67
+ font-size: 2.5em;
68
+ margin-bottom: 10px;
69
+ }
70
+
71
+ .stat-card p {
72
+ color: #666;
73
+ font-size: 1em;
74
+ }
75
+
76
+ .main-content {
77
+ display: grid;
78
+ grid-template-columns: 1fr 1fr;
79
+ gap: 30px;
80
+ }
81
+
82
+ .card {
83
+ background: white;
84
+ border-radius: 15px;
85
+ padding: 30px;
86
+ box-shadow: 0 10px 30px rgba(0,0,0,0.2);
87
+ width: 100%;
88
+ }
89
+
90
+ .card h2 {
91
+ color: #667eea;
92
+ margin-bottom: 20px;
93
+ font-size: 1.8em;
94
+ }
95
+
96
+ .form-group {
97
+ margin-bottom: 20px;
98
+ }
99
+
100
+ label {
101
+ display: block;
102
+ color: #333;
103
+ font-weight: 600;
104
+ margin-bottom: 8px;
105
+ }
106
+
107
+ input, textarea, select {
108
+ width: 100%;
109
+ padding: 12px;
110
+ border: 2px solid #e0e0e0;
111
+ border-radius: 8px;
112
+ font-size: 1em;
113
+ transition: border-color 0.3s;
114
+ }
115
+
116
+ input:focus, textarea:focus, select:focus {
117
+ outline: none;
118
+ border-color: #667eea;
119
+ }
120
+
121
+ textarea {
122
+ resize: vertical;
123
+ min-height: 80px;
124
+ }
125
+
126
+ .btn {
127
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
128
+ color: white;
129
+ padding: 15px 30px;
130
+ border: none;
131
+ border-radius: 8px;
132
+ font-size: 1.1em;
133
+ font-weight: 600;
134
+ cursor: pointer;
135
+ width: 100%;
136
+ transition: transform 0.2s;
137
+ }
138
+
139
+ .btn:hover {
140
+ transform: scale(1.02);
141
+ }
142
+
143
+ .btn:disabled {
144
+ opacity: 0.6;
145
+ cursor: not-allowed;
146
+ }
147
+
148
+ .result {
149
+ display: none;
150
+ margin-top: 20px;
151
+ padding: 20px;
152
+ border-radius: 10px;
153
+ animation: slideIn 0.5s;
154
+ }
155
+
156
+ @keyframes slideIn {
157
+ from {
158
+ opacity: 0;
159
+ transform: translateY(20px);
160
+ }
161
+ to {
162
+ opacity: 1;
163
+ transform: translateY(0);
164
+ }
165
+ }
166
+
167
+ .result.success {
168
+ background: #d4edda;
169
+ border: 2px solid #28a745;
170
+ }
171
+
172
+ .result.warning {
173
+ background: #fff3cd;
174
+ border: 2px solid #ffc107;
175
+ }
176
+
177
+ .result.info {
178
+ background: #d1ecf1;
179
+ border: 2px solid #17a2b8;
180
+ }
181
+
182
+ .result-header {
183
+ display: flex;
184
+ align-items: center;
185
+ margin-bottom: 15px;
186
+ }
187
+
188
+ .result-icon {
189
+ font-size: 2em;
190
+ margin-right: 15px;
191
+ }
192
+
193
+ .result-title {
194
+ font-size: 1.5em;
195
+ font-weight: 700;
196
+ }
197
+
198
+ .result-content {
199
+ margin-top: 15px;
200
+ }
201
+
202
+ .result-item {
203
+ margin-bottom: 10px;
204
+ padding: 10px;
205
+ background: white;
206
+ border-radius: 5px;
207
+ }
208
+
209
+ .confidence-bar {
210
+ height: 25px;
211
+ background: #e0e0e0;
212
+ border-radius: 15px;
213
+ overflow: hidden;
214
+ margin-top: 10px;
215
+ }
216
+
217
+ .confidence-fill {
218
+ height: 100%;
219
+ background: linear-gradient(90deg, #667eea, #764ba2);
220
+ transition: width 1s ease;
221
+ display: flex;
222
+ align-items: center;
223
+ justify-content: center;
224
+ color: white;
225
+ font-weight: 600;
226
+ }
227
+
228
+ .alternatives {
229
+ margin-top: 15px;
230
+ }
231
+
232
+ .alternative-item {
233
+ padding: 10px;
234
+ margin-bottom: 8px;
235
+ background: #f8f9fa;
236
+ border-radius: 5px;
237
+ border-left: 4px solid #667eea;
238
+ }
239
+
240
+ .loading {
241
+ display: none;
242
+ text-align: center;
243
+ margin: 20px 0;
244
+ }
245
+
246
+ .spinner {
247
+ border: 4px solid #f3f3f3;
248
+ border-top: 4px solid #667eea;
249
+ border-radius: 50%;
250
+ width: 40px;
251
+ height: 40px;
252
+ animation: spin 1s linear infinite;
253
+ margin: 0 auto;
254
+ }
255
+
256
+ @keyframes spin {
257
+ 0% { transform: rotate(0deg); }
258
+ 100% { transform: rotate(360deg); }
259
+ }
260
+
261
+ .footer {
262
+ text-align: center;
263
+ color: white;
264
+ margin-top: 30px;
265
+ padding: 20px;
266
+ }
267
+
268
+ @media (max-width: 768px) {
269
+ .main-content {
270
+ grid-template-columns: 1fr;
271
+ }
272
+
273
+ .header h1 {
274
+ font-size: 1.8em;
275
+ }
276
+ }
277
+
278
+ .badge {
279
+ display: inline-block;
280
+ padding: 5px 12px;
281
+ border-radius: 20px;
282
+ font-size: 0.9em;
283
+ font-weight: 600;
284
+ margin-left: 10px;
285
+ }
286
+
287
+ .badge-success {
288
+ background: #28a745;
289
+ color: white;
290
+ }
291
+
292
+ .badge-warning {
293
+ background: #ffc107;
294
+ color: #333;
295
+ }
296
+
297
+ .badge-info {
298
+ background: #17a2b8;
299
+ color: white;
300
+ }
301
+ </style>
302
+ </head>
303
+ <body>
304
+ <div class="container">
305
+ <!-- Header -->
306
+ <div class="header">
307
+ <h1>🏥 Insurance Product Classification System</h1>
308
+ <p>AI-Powered Product Categorization for Insurance Underwriting</p>
309
+ </div>
310
+
311
+ <!-- Statistics -->
312
+ <div class="stats-grid">
313
+ <div class="stat-card">
314
+ <h3 id="totalCategories">-</h3>
315
+ <p>Insurance Categories</p>
316
+ </div>
317
+ <div class="stat-card">
318
+ <h3 id="automationRate">87.5%</h3>
319
+ <p>Automation Rate</p>
320
+ </div>
321
+ <div class="stat-card">
322
+ <h3 id="avgConfidence">86.1%</h3>
323
+ <p>Average Confidence</p>
324
+ </div>
325
+ <div class="stat-card">
326
+ <h3 id="processingSpeed">~100ms</h3>
327
+ <p>Processing Speed</p>
328
+ </div>
329
+ </div>
330
+
331
+ <!-- Main Content -->
332
+ <div class="main-content">
333
+ <!-- Classification Form -->
334
+ <div class="card" style="width: 79vw;">
335
+ <h2>🔍 Classify Product</h2>
336
+ <form id="classifyForm">
337
+ <div class="form-group">
338
+ <label for="productTitle">Product Title *</label>
339
+ <input type="text" id="productTitle" placeholder="e.g., Apple iPhone 15 Pro Max" required>
340
+ </div>
341
+
342
+ <div class="form-group">
343
+ <label for="productType">Product Type</label>
344
+ <input type="text" id="productType" placeholder="e.g., Smartphone">
345
+ </div>
346
+
347
+ <div class="form-group">
348
+ <label for="vendor">Brand/Vendor</label>
349
+ <input type="text" id="vendor" placeholder="e.g., Apple Inc">
350
+ </div>
351
+
352
+ <div class="form-group">
353
+ <label for="tags">Tags (comma-separated)</label>
354
+ <input type="text" id="tags" placeholder="e.g., electronics, phone, mobile">
355
+ </div>
356
+
357
+ <div class="form-group">
358
+ <label for="description">Description</label>
359
+ <textarea id="description" placeholder="Product description..."></textarea>
360
+ </div>
361
+
362
+ <button type="submit" class="btn" id="classifyBtn">
363
+ Classify Product
364
+ </button>
365
+ </form>
366
+
367
+ <div class="loading" id="loading">
368
+ <div class="spinner"></div>
369
+ <p style="margin-top: 10px; color: #667eea;">Analyzing product...</p>
370
+ </div>
371
+
372
+ <div class="result" id="result"></div>
373
+ </div>
374
+
375
+ <!-- Quick Test Examples
376
+ <div class="card">
377
+ <h2>⚡ Quick Test Examples</h2>
378
+ <p style="margin-bottom: 20px; color: #666;">Click to test with pre-filled examples:</p>
379
+
380
+ <div class="alternative-item" style="cursor: pointer; margin-bottom: 15px;" onclick="testProduct('iphone')">
381
+ <strong>📱 Apple iPhone 15 Pro</strong><br>
382
+ <small>Smartphone • Expected: 65-70% confidence</small>
383
+ </div>
384
+
385
+ <div class="alternative-item" style="cursor: pointer; margin-bottom: 15px;" onclick="testProduct('shoes')">
386
+ <strong>👟 Nike Running Shoes</strong><br>
387
+ <small>Athletic Footwear • Expected: 75-80% confidence</small>
388
+ </div>
389
+
390
+ <div class="alternative-item" style="cursor: pointer; margin-bottom: 15px;" onclick="testProduct('coffee')">
391
+ <strong>☕ Coffee Maker</strong><br>
392
+ <small>Kitchen Appliance • Expected: 80-85% confidence</small>
393
+ </div>
394
+
395
+ <div class="alternative-item" style="cursor: pointer; margin-bottom: 15px;" onclick="testProduct('book')">
396
+ <strong>📚 The Great Gatsby</strong><br>
397
+ <small>Book • Expected: 85-90% confidence</small>
398
+ </div>
399
+
400
+ <div class="alternative-item" style="cursor: pointer;" onclick="testProduct('laptop')">
401
+ <strong>💻 Gaming Laptop</strong><br>
402
+ <small>Computer • Expected: 70-75% confidence</small>
403
+ </div>
404
+
405
+ <div style="margin-top: 30px; padding: 15px; background: #f8f9fa; border-radius: 8px;">
406
+ <strong style="color: #667eea;">System Status:</strong>
407
+ <p style="margin-top: 10px; color: #666;">
408
+ <span id="systemStatus">Checking...</span>
409
+ </p>
410
+ </div>
411
+ </div> -->
412
+ </div>
413
+
414
+ <!-- Footer -->
415
+ <div class="footer">
416
+ <p>Powered by Machine Learning • MPNet Model • 768-Dimensional Embeddings</p>
417
+ <p style="margin-top: 10px; opacity: 0.8;">API Documentation: <a href="/docs" style="color: white; text-decoration: underline;">/docs</a></p>
418
+ </div>
419
+ </div>
420
+
421
+ <script>
422
+ // Load statistics on page load
423
/**
 * Fetch `/stats` and show the number of categories in the
 * `#totalCategories` stat card.
 *
 * Any failure (network error, invalid JSON, missing field) is logged to the
 * console and the card keeps its placeholder text.
 */
async function loadStats() {
    try {
        const stats = await (await fetch('/stats')).json();
        const counter = document.getElementById('totalCategories');
        counter.textContent = stats.total_categories.toLocaleString();
    } catch (err) {
        console.error('Error loading stats:', err);
    }
}
432
+
433
+ // Check system health
434
/**
 * Query `/health` and reflect the outcome in the `#systemStatus` element.
 *
 * The status element belongs to an optional panel of the page (that panel is
 * currently commented out in the markup), so it may not exist. Without a
 * place to display the result the check is skipped entirely; previously the
 * missing element caused a TypeError inside the `try` block and a second one
 * inside the `catch` handler, i.e. an unhandled promise rejection on every
 * page load.
 */
async function checkHealth() {
    const statusEl = document.getElementById('systemStatus');
    if (!statusEl) {
        return;
    }

    try {
        const response = await fetch('/health');
        // fetch() only rejects on network failure; treat HTTP errors as offline too.
        if (!response.ok) {
            throw new Error('Health check failed with HTTP ' + response.status);
        }

        const data = await response.json();
        if (data.status === 'healthy') {
            statusEl.innerHTML = '✅ <strong style="color: #28a745;">Online</strong> • ' +
                data.categories_loaded.toLocaleString() + ' categories loaded';
        }
    } catch (error) {
        statusEl.innerHTML = '❌ <strong style="color: #dc3545;">Offline</strong>';
    }
}
446
+
447
+ // Classify product
448
// Submit handler for the classification form: collects the field values,
// POSTs them as JSON to /classify and renders the response.
document.getElementById('classifyForm').addEventListener('submit', async (e) => {
    // Keep the browser from performing a regular form submission / page reload.
    e.preventDefault();

    const title = document.getElementById('productTitle').value;
    const productType = document.getElementById('productType').value;
    const vendor = document.getElementById('vendor').value;
    // "a, b, ,c" -> ["a", "b", "c"]: trim each tag and drop empty entries.
    const tags = document.getElementById('tags').value
        .split(',')
        .map(t => t.trim())
        .filter(t => t);
    const description = document.getElementById('description').value;

    const product = {
        id: 'demo_' + Date.now(),
        title,
        product_type: productType,
        vendor,
        tags,
        description
    };

    const loadingEl = document.getElementById('loading');
    const submitBtn = document.getElementById('classifyBtn');

    // Show the spinner, hide any previous result and block double submits.
    loadingEl.style.display = 'block';
    document.getElementById('result').style.display = 'none';
    submitBtn.disabled = true;

    try {
        const response = await fetch('/classify', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json'
            },
            body: JSON.stringify(product)
        });

        // fetch() resolves for 4xx/5xx responses as well. Without this check an
        // error payload would be handed to displayResult() and surface as an
        // unrelated TypeError instead of the actual HTTP failure.
        if (!response.ok) {
            throw new Error('Classification request failed (HTTP ' + response.status + ')');
        }

        const result = await response.json();
        displayResult(result);
    } catch (error) {
        alert('Error: ' + error.message);
    } finally {
        // Always restore the form, whether the request succeeded or not.
        loadingEl.style.display = 'none';
        submitBtn.disabled = false;
    }
});
489
+
490
+ // Display classification result
491
/**
 * Render a classification response into the `#result` panel.
 *
 * Reads `action`, `reason`, `top_category`, `top_confidence`,
 * `processing_time_ms` and `alternatives` (entries with `category_path` and
 * `confidence_percentage`) from `result`.
 *
 * All response values are HTML-escaped before being written through
 * `innerHTML`, so text coming back from the server cannot inject markup.
 * A missing `alternatives` array is treated as empty, and the numeric fields
 * are coerced with `Number()` so a numeric string does not throw on
 * `toFixed`.
 *
 * @param {Object} result Parsed JSON body returned by the classify endpoint.
 */
function displayResult(result) {
    // Minimal HTML escaping for values interpolated into the markup below.
    const escapeHtml = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    const resultDiv = document.getElementById('result');

    let resultClass;
    let icon;
    let badge;

    if (result.action === 'AUTO_APPROVE') {
        resultClass = 'success';
        icon = '✅';
        badge = '<span class="badge badge-success">AUTO APPROVED</span>';
    } else if (result.action === 'QUICK_REVIEW') {
        resultClass = 'warning';
        icon = '⚠️';
        badge = '<span class="badge badge-warning">NEEDS REVIEW</span>';
    } else {
        // Any other action is presented as requiring manual handling.
        resultClass = 'info';
        icon = '📋';
        badge = '<span class="badge badge-info">MANUAL</span>';
    }

    // Replace every underscore, not just the first one.
    const actionLabel = escapeHtml(result.action).replace(/_/g, ' ');

    const confidence = Number(result.top_confidence);
    // Keep the bar inside its track even for out-of-range values.
    const barWidth = Math.max(0, Math.min(100, confidence)) || 0;
    const processingTime = Number(result.processing_time_ms);

    const alternatives = Array.isArray(result.alternatives) ? result.alternatives : [];
    // The top category is rank 1, so the listed alternatives start at 2.
    const alternativesHtml = alternatives.slice(0, 3).map((alt, i) => `
        <div class="alternative-item">
            <strong>${i + 2}. ${escapeHtml(alt.category_path)}</strong><br>
            <small>Confidence: ${escapeHtml(alt.confidence_percentage)}%</small>
        </div>
    `).join('');

    const html = `
        <div class="result-header">
            <div class="result-icon">${icon}</div>
            <div>
                <div class="result-title">${actionLabel}${badge}</div>
                <small style="color: #666;">${escapeHtml(result.reason)}</small>
            </div>
        </div>

        <div class="result-content">
            <div class="result-item">
                <strong style="color: #667eea;">Top Category:</strong><br>
                ${escapeHtml(result.top_category)}
            </div>

            <div class="result-item">
                <strong style="color: #667eea;">Confidence Score:</strong>
                <div class="confidence-bar">
                    <div class="confidence-fill" style="width: ${barWidth}%">
                        ${confidence.toFixed(2)}%
                    </div>
                </div>
            </div>

            <div class="result-item">
                <strong style="color: #667eea;">Processing Time:</strong> ${processingTime.toFixed(2)}ms
            </div>

            <div class="alternatives">
                <strong style="color: #667eea;">Alternative Categories:</strong>
                ${alternativesHtml}
            </div>
        </div>
    `;

    resultDiv.className = `result ${resultClass}`;
    resultDiv.innerHTML = html;
    resultDiv.style.display = 'block';
}
558
+
559
+ // Pre-fill test products
560
/**
 * Fill the classification form with one of the built-in sample products and
 * scroll the form into view.
 *
 * @param {string} type Key of the sample to load: 'iphone', 'shoes',
 *     'coffee', 'book' or 'laptop'.
 */
function testProduct(type) {
    const samples = {
        iphone: {
            title: 'Apple iPhone 15 Pro Max',
            type: 'Smartphone',
            vendor: 'Apple Inc',
            tags: 'electronics, mobile, phone, smartphone, 5G',
            description: 'Latest flagship smartphone with titanium design and A17 Bionic chip'
        },
        shoes: {
            title: 'Nike Air Zoom Pegasus 40',
            type: 'Running Shoes',
            vendor: 'Nike',
            tags: 'shoes, athletic, running, sports, footwear',
            description: 'Premium running shoes with responsive cushioning'
        },
        coffee: {
            title: 'Cuisinart DCC-3200 Coffee Maker',
            type: 'Coffee Machine',
            vendor: 'Cuisinart',
            tags: 'appliances, kitchen, coffee, brewing',
            description: 'Programmable automatic drip coffee maker with 14-cup carafe'
        },
        book: {
            title: 'The Great Gatsby by F. Scott Fitzgerald',
            type: 'Book',
            vendor: 'Scribner',
            tags: 'books, fiction, literature, classic',
            description: 'Classic American novel set in the Jazz Age'
        },
        laptop: {
            title: 'ASUS ROG Strix Gaming Laptop',
            type: 'Laptop Computer',
            vendor: 'ASUS',
            tags: 'computers, gaming, laptop, electronics',
            description: 'High-performance gaming laptop with RTX 4070 graphics'
        }
    };

    const { title, type: productType, vendor, tags, description } = samples[type];

    // Form field id -> value, applied in the order the fields appear in the form.
    const fieldValues = [
        ['productTitle', title],
        ['productType', productType],
        ['vendor', vendor],
        ['tags', tags],
        ['description', description]
    ];
    for (const [fieldId, fieldValue] of fieldValues) {
        document.getElementById(fieldId).value = fieldValue;
    }

    // Bring the freshly filled form back into view.
    document.getElementById('classifyForm').scrollIntoView({ behavior: 'smooth' });
}
609
+
610
+ // Initialize
611
+ loadStats();
612
+ checkHealth();
613
+ </script>
614
+ </body>
615
+ </html>
tests/test_api.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test script for Product Classification API
3
+ Run this to test your API endpoints
4
+ """
5
+
6
+ import requests
7
+ import json
8
+ from typing import Dict, List
9
+
10
+ # API base URL
11
+ BASE_URL = "http://localhost:8000"
12
+
13
+
14
def test_health():
    """Probe the ``/health`` endpoint and report the outcome.

    On HTTP 200 the status, loaded category count and embedding dimension
    from the JSON body are printed; otherwise the status code is printed.

    Returns:
        bool: ``True`` when the endpoint answered with HTTP 200.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("TEST 1: Health Check")
    print(rule)

    response = requests.get(f"{BASE_URL}/health")
    is_ok = response.status_code == 200

    if not is_ok:
        print(f"❌ Health check failed: {response.status_code}")
        return is_ok

    payload = response.json()
    print("✅ API is healthy!")
    print(f" Status: {payload['status']}")
    print(f" Categories loaded: {payload['categories_loaded']:,}")
    print(f" Embedding dimension: {payload['embedding_dimension']}")
    return is_ok
32
+
33
+
34
def test_single_classification():
    """Classify one sample product through ``POST /classify``.

    Prints the chosen action, top category, confidence and processing time,
    followed by up to three alternative categories.

    Returns:
        bool: ``True`` when the endpoint answered with HTTP 200, else ``False``.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("TEST 2: Single Product Classification")
    print(rule)

    # Sample payload sent to the API.
    product = {
        "id": "test_001",
        "title": "Sony WH-1000XM5 Wireless Headphones",
        "product_type": "Headphones",
        "vendor": "Sony",
        "tags": ["audio", "electronics", "wireless", "bluetooth"],
        "description": "Premium noise-canceling over-ear headphones",
    }

    print(f"\n📱 Test Product: {product['title']}")

    response = requests.post(f"{BASE_URL}/classify", json=product)

    # Bail out early on any non-200 answer.
    if response.status_code != 200:
        print(f"❌ Classification failed: {response.status_code}")
        print(f" Error: {response.text}")
        return False

    outcome = response.json()

    print(f"\n✅ Classification successful!")
    print(f" Action: {outcome['action']}")
    print(f" Top Category: {outcome['top_category']}")
    print(f" Confidence: {outcome['top_confidence']}%")
    print(f" Processing Time: {outcome['processing_time_ms']}ms")

    print(f"\n📊 Top 3 Alternative Categories:")
    top_alternatives = outcome["alternatives"][:3]
    for candidate in top_alternatives:
        print(f" {candidate['rank']}. {candidate['category_path']}")
        print(f" Confidence: {candidate['confidence_percentage']}%")

    return True
73
+
74
+
75
def test_batch_classification():
    """Classify three sample products in one ``POST /classify-batch`` call.

    Prints the totals, the per-action distribution and a short line per
    individual result.

    Returns:
        bool: ``True`` when the endpoint answered with HTTP 200, else ``False``.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("TEST 3: Batch Classification")
    print(rule)

    # Sample products covering three unrelated categories.
    products = [
        {
            "id": "prod_001",
            "title": "Samsung Galaxy S24 Ultra",
            "product_type": "Smartphone",
            "vendor": "Samsung",
            "tags": ["electronics", "phone", "mobile", "android"],
        },
        {
            "id": "prod_002",
            "title": "KitchenAid Stand Mixer",
            "product_type": "Kitchen Appliance",
            "vendor": "KitchenAid",
            "tags": ["appliance", "kitchen", "cooking"],
        },
        {
            "id": "prod_003",
            "title": "Nike Air Zoom Running Shoes",
            "product_type": "Athletic Footwear",
            "vendor": "Nike",
            "tags": ["shoes", "sports", "running", "athletic"],
        },
    ]

    payload = {"products": products, "top_k": 3}

    print(f"\n📦 Testing batch of {len(products)} products...")

    response = requests.post(f"{BASE_URL}/classify-batch", json=payload)

    # Bail out early on any non-200 answer.
    if response.status_code != 200:
        print(f"❌ Batch classification failed: {response.status_code}")
        print(f" Error: {response.text}")
        return False

    result = response.json()

    print(f"\n✅ Batch classification successful!")
    print(f" Total products: {result['total_products']}")
    print(f" Processing time: {result['processing_time_ms']:.2f}ms")
    per_product_ms = result["processing_time_ms"] / result["total_products"]
    print(f" Time per product: {per_product_ms:.2f}ms")

    print(f"\n📊 Action Distribution:")
    for action, count in result["action_counts"].items():
        share = (count / result["total_products"]) * 100
        print(f" {action}: {count} ({share:.1f}%)")

    print(f"\n🎯 Individual Results:")
    for entry in result["results"]:
        print(f"\n • {entry.get('product_id', 'N/A')}")
        print(f" Action: {entry['action']}")
        print(f" Confidence: {entry.get('top_confidence', 0)}%")
        # The category line is printed only when one is present,
        # truncated to 60 characters.
        if entry.get("top_category"):
            print(f" Category: {entry['top_category'][:60]}...")

    return True
140
+
141
+
142
def test_various_products():
    """Classify products from several categories via ``POST /classify``.

    Each sample is sent individually. A per-product line is printed, followed
    by a summary with the average confidence and the auto-approve rate.
    Requests that do not return HTTP 200 are recorded with confidence ``0``
    and action ``"ERROR"`` and therefore lower the printed average.

    Returns:
        bool: ``True`` only when every request returned HTTP 200. The
        function used to return ``True`` unconditionally, so the run was
        reported as passed even if every request failed.
    """
    print("\n" + "=" * 80)
    print("TEST 4: Various Product Types")
    print("=" * 80)

    test_cases = [
        {
            "name": "Electronics",
            "product": {
                "title": "MacBook Pro 16 inch M3",
                "product_type": "Laptop Computer",
                "vendor": "Apple",
                "tags": ["computer", "laptop", "electronics"],
            },
        },
        {
            "name": "Books",
            "product": {
                "title": "The Great Gatsby by F. Scott Fitzgerald",
                "product_type": "Book",
                "vendor": "Scribner",
                "tags": ["books", "fiction", "literature", "classic"],
            },
        },
        {
            "name": "Home Appliances",
            "product": {
                "title": "Dyson V15 Detect Vacuum Cleaner",
                "product_type": "Vacuum Cleaner",
                "vendor": "Dyson",
                "tags": ["appliance", "cleaning", "home", "cordless"],
            },
        },
        {
            "name": "Toys",
            "product": {
                "title": "LEGO Star Wars Millennium Falcon",
                "product_type": "Building Toy",
                "vendor": "LEGO",
                "tags": ["toys", "kids", "lego", "star wars", "building"],
            },
        },
    ]

    results_summary = []
    # Tracked separately from the "action" field so that an API-provided
    # action value can never be mistaken for a transport failure.
    all_succeeded = True

    for test_case in test_cases:
        print(f"\n🧪 Testing: {test_case['name']}")
        print(f" Product: {test_case['product']['title']}")

        response = requests.post(f"{BASE_URL}/classify", json=test_case["product"])

        if response.status_code == 200:
            result = response.json()
            confidence = result["top_confidence"]
            action = result["action"]

            emoji = (
                "✅"
                if action == "AUTO_APPROVE"
                else "⚠️" if action == "QUICK_REVIEW" else "❌"
            )

            print(f" {emoji} {action}: {confidence}%")

            results_summary.append(
                {
                    "category": test_case["name"],
                    "confidence": confidence,
                    "action": action,
                }
            )
        else:
            all_succeeded = False
            print(f" ❌ Failed: {response.status_code}")
            results_summary.append(
                {"category": test_case["name"], "confidence": 0, "action": "ERROR"}
            )

    # Print summary
    print(f"\n📈 SUMMARY:")
    print("-" * 80)

    avg_confidence = sum(r["confidence"] for r in results_summary) / len(
        results_summary
    )
    auto_approve_count = sum(
        1 for r in results_summary if r["action"] == "AUTO_APPROVE"
    )

    print(f"Average Confidence: {avg_confidence:.2f}%")
    print(
        f"Auto-Approve Rate: {auto_approve_count}/{len(results_summary)} ({auto_approve_count/len(results_summary)*100:.1f}%)"
    )

    return all_succeeded
238
+
239
+
240
def run_all_tests():
    """Execute every API test in order and print a pass/fail summary.

    A connection error aborts the whole run immediately, without printing a
    summary, because no later test can succeed without a running server. Any
    other exception marks just the current test as failed.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("🧪 RUNNING ALL API TESTS")
    print(rule)
    print("\nMake sure API is running: uvicorn src.api:app --reload")

    suite = [
        ("Health Check", test_health),
        ("Single Classification", test_single_classification),
        ("Batch Classification", test_batch_classification),
        ("Various Products", test_various_products),
    ]

    outcomes = []

    for label, runner in suite:
        try:
            outcome = runner()
        except requests.exceptions.ConnectionError:
            print(f"\n❌ Connection Error: Is the API running?")
            print(" Start it with: uvicorn src.api:app --reload")
            return
        except Exception as e:
            # Top-level boundary: report the failure and keep running.
            print(f"\n❌ Error in {label}: {e}")
            outcomes.append((label, False))
        else:
            outcomes.append((label, outcome))

    # Final summary
    print("\n" + rule)
    print("📊 TEST RESULTS SUMMARY")
    print(rule)

    for label, outcome in outcomes:
        verdict = "✅ PASS" if outcome else "❌ FAIL"
        print(f"{verdict} - {label}")

    total = len(outcomes)
    passed = len([label for label, outcome in outcomes if outcome])

    print(f"\n🎯 Overall: {passed}/{total} tests passed ({passed/total*100:.1f}%)")

    if passed != total:
        print(f"\n⚠️ Some tests failed. Check the errors above.")
    else:
        print("\n🎉 ALL TESTS PASSED! Your API is working perfectly!")
286
+
287
+
288
+ if __name__ == "__main__":
289
+ run_all_tests()