Update app.py
app.py CHANGED
@@ -5,13 +5,53 @@ from fastapi import FastAPI, Request
 from pydantic import BaseModel
 import pickle
 import logging
+import os
+import re
+
+# Set Hugging Face cache directory
+os.environ["TRANSFORMERS_CACHE"] = "/path/to/writable/cache"  # Replace with a writable path
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+# Text cleaning function
+def clean_description(text):
+    """
+    Clean transaction description by removing prefixes, numeric codes, and separators.
+    Examples:
+        'UPI/DR/12345678/netflix subscription' -> 'netflix subscription'
+        'UPI-DR-12345678-netflix subscription' -> 'netflix subscription'
+        'VISA/123456/uber ride to office' -> 'uber ride to office'
+    """
+    # Convert to lowercase (optional, depending on model training)
+    text = text.lower()
+
+    # Remove common transaction prefixes and codes
+    patterns = [
+        r'^upi/dr/[0-9]+/',     # Matches 'UPI/DR/12345678/'
+        r'^upi-dr-[0-9]+-',     # Matches 'UPI-DR-12345678-'
+        r'^visa/[0-9]+/',       # Matches 'VISA/123456/'
+        r'^[a-zA-Z]+/[0-9]+/',  # Matches other prefixes like 'POS/123456/'
+        r'^[a-zA-Z]+-[0-9]+-',  # Matches other prefixes like 'POS-123456-'
+        r'\b[0-9]{6,}\b',       # Matches standalone numeric codes (6+ digits)
+    ]
+
+    for pattern in patterns:
+        text = re.sub(pattern, '', text, flags=re.IGNORECASE)
+
+    # Replace multiple separators with a single space
+    text = re.sub(r'[-_/]+', ' ', text)
+
+    # Remove extra whitespace
+    text = ' '.join(text.split())
+
+    # Return cleaned text, or original if cleaning results in empty string
+    return text if text else "unknown transaction"
+
 # Load label encoders
 try:
+    # Note: Ensure these were pickled with scikit-learn 1.6.1 to avoid version mismatch
     with open("main_category_encoder_5k.pkl", "rb") as f:
         main_category_encoder = pickle.load(f)
     with open("sub_category_encoder_5k.pkl", "rb") as f:
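The new `clean_description` helper is pure string manipulation, so it can be sanity-checked without the model. A minimal sketch, assuming `app.py` is importable here (importing it also loads the model and encoders) or that the function has been pasted into a scratch script; the expected outputs follow from the regex patterns above rather than from a captured run:

```python
# Standalone check of the cleaning behaviour on the docstring examples plus one
# extra "POS/..." case that exercises the generic prefix pattern.
from app import clean_description  # or paste the function into this script

samples = [
    "UPI/DR/12345678/netflix subscription",
    "UPI-DR-12345678-netflix subscription",
    "VISA/123456/uber ride to office",
    "POS/987654/big bazaar groceries",  # hypothetical input, not from the diff
]

for s in samples:
    print(f"{s!r} -> {clean_description(s)!r}")

# Expected (per the patterns above): prefixes and 6+ digit codes stripped, e.g.
# 'UPI/DR/12345678/netflix subscription' -> 'netflix subscription'
```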
@@ -32,7 +72,7 @@ except Exception as e:
 class BERTFNN(nn.Module):
     def __init__(self, num_main_classes, num_sub_classes):
         super(BERTFNN, self).__init__()
-        self.bert = BertModel.from_pretrained("./bert-model")
+        self.bert = BertModel.from_pretrained("./bert-model")
         self.fc_main = nn.Linear(self.bert.config.hidden_size, num_main_classes)
         self.fc_sub = nn.Linear(self.bert.config.hidden_size + num_main_classes, num_sub_classes)
 
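Only the `self.bert = ...` line is touched in this hunk (the difference is not visible in this view, so it is presumably whitespace). The `__init__` shown here implies a hierarchical head: `fc_sub` takes the BERT sentence representation concatenated with the main-category scores. The `forward` method is outside the diff; the sketch below is one plausible wiring that matches the layer shapes and the later call `model(input_ids, attention_mask)` returning `(main_logits, sub_logits)`. The use of `pooler_output` and of raw logits (rather than softmaxed probabilities) for the concatenation are assumptions.

```python
import torch
import torch.nn as nn
from transformers import BertModel

class BERTFNN(nn.Module):
    def __init__(self, num_main_classes, num_sub_classes):
        super(BERTFNN, self).__init__()
        self.bert = BertModel.from_pretrained("./bert-model")
        self.fc_main = nn.Linear(self.bert.config.hidden_size, num_main_classes)
        self.fc_sub = nn.Linear(self.bert.config.hidden_size + num_main_classes, num_sub_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.pooler_output                       # (batch, hidden_size); assumed representation
        main_logits = self.fc_main(pooled)                   # (batch, num_main_classes)
        sub_input = torch.cat([pooled, main_logits], dim=1)  # condition the sub head on main scores
        sub_logits = self.fc_sub(sub_input)                  # (batch, num_sub_classes)
        return main_logits, sub_logits
```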
@@ -73,19 +113,27 @@ class TransactionInput(BaseModel):
 async def root():
     return {"message": "Welcome to the Expense Categorization API. Use POST /predict to categorize expenses."}
 
-# Define predict endpoint with confidence scores
+# Define predict endpoint with text cleaning and confidence scores
 @app.post("/predict")
 async def predict_category(transaction: TransactionInput, request: Request):
+    logger.info("Starting prediction for request")
     try:
         logger.info(f"Received request: {transaction.dict()}")
-
-
+
+        # Clean the input description
+        cleaned_description = clean_description(transaction.description)
+        logger.info(f"Cleaned description: {cleaned_description}")
+
+        # Tokenize cleaned description
+        tokens = tokenizer(cleaned_description, return_tensors="pt", truncation=True, padding="max_length", max_length=64)
         input_ids = tokens["input_ids"].to(device)
         attention_mask = tokens["attention_mask"].to(device)
+        logger.info("Tokenization completed")
 
         # Get model predictions
         with torch.no_grad():
             main_logits, sub_logits = model(input_ids, attention_mask)
+        logger.info("Model inference completed")
 
         # Compute softmax probabilities for main category
         main_probs = torch.softmax(main_logits, dim=1)
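The lines that turn `main_probs` into `main_category_idx` and `main_confidence` (old lines 92-99) are unchanged and therefore not shown in this diff. For context, the usual pattern is sketched below with a stand-in tensor; this is an assumption about the untouched code, not a copy of it:

```python
import torch

# Stand-in for main_probs as produced above: softmax over 5 hypothetical main classes.
main_probs = torch.softmax(torch.randn(1, 5), dim=1)

# Top class index and its probability (rounded the same way the response does).
main_confidence, main_category_idx = torch.max(main_probs, dim=1)
main_category_idx = int(main_category_idx.item())
main_confidence = round(float(main_confidence.item()), 4)
print(main_category_idx, main_confidence)
```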
@@ -100,10 +148,12 @@ async def predict_category(transaction: TransactionInput, request: Request):
         # Decode category labels
         main_category = main_category_encoder.inverse_transform([main_category_idx])[0]
         sub_category = sub_category_encoder.inverse_transform([sub_category_idx])[0]
+        logger.info("Category decoding completed")
 
         # Prepare response
         response = {
-            "
+            "original_description": transaction.description,
+            "cleaned_description": cleaned_description,
             "main_category": main_category,
             "main_confidence": round(main_confidence, 4),
             "sub_category": sub_category,
@@ -112,5 +162,5 @@ async def predict_category(transaction: TransactionInput, request: Request):
         logger.info(f"Response: {response}")
         return response
     except Exception as e:
-        logger.error(f"Error in prediction: {e}")
+        logger.error(f"Error in prediction: {str(e)}", exc_info=True)
         return {"error": str(e)}, 500
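After this change, `/predict` echoes both the raw and cleaned descriptions alongside the predictions. A minimal client sketch, assuming the app is served locally (e.g. `uvicorn app:app --port 8000`) and that `TransactionInput` exposes a `description` field, as the `transaction.description` accesses above imply; any further request or response fields are not visible in this diff:

```python
import requests

payload = {"description": "UPI/DR/12345678/netflix subscription"}
resp = requests.post("http://localhost:8000/predict", json=payload, timeout=30)

print(resp.status_code)
print(resp.json())
# Expected keys, per the response dict in this diff: original_description,
# cleaned_description, main_category, main_confidence, sub_category, ...
```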