Spaces:

LogicGoInfotechSpaces
/

ExpenseCategorizeNotes

Sleeping

App Files Files Community

HariLogicgo committed 3 days ago

Commit

8cceab7

1 Parent(s): 790aee0

added id returning

Browse files

Files changed (5) hide show

app/api/routes.py +1 -1
app/core/config.py +78 -10
app/dependencies.py +2 -0
app/schemas/categories.py +15 -6
app/services/autocategorizer.py +406 -218

app/api/routes.py CHANGED Viewed

@@ -20,7 +20,7 @@ async def categorize_transaction(
 ) -> CategorizeResponse | JSONResponse:
     started_at = time.monotonic()
     try:
-        result = await service.categorize(payload.notes)
         await api_logger.log_categorization(
             name="Auto Expense Categorization",
             status="success",

 ) -> CategorizeResponse | JSONResponse:
     started_at = time.monotonic()
     try:
+        result = await service.categorize(payload.notes, payload.user_id)
         await api_logger.log_categorization(
             name="Auto Expense Categorization",
             status="success",

app/core/config.py CHANGED Viewed

@@ -1,3 +1,9 @@
 from functools import lru_cache
 from pydantic import Field
@@ -5,24 +11,86 @@ from pydantic_settings import BaseSettings, SettingsConfigDict
 class Settings(BaseSettings):
-    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")
-    mongo_uri: str = Field(..., alias="MONGO_URI")
-    mongo_db: str = Field("expense", alias="MONGO_DB")
-    mongo_collection: str = Field("headcategories", alias="MONGO_COLLECTION")
-    mongo_subcategory_collection: str = Field("categories", alias="MONGO_SUBCATEGORY_COLLECTION")
-    api_logs_collection: str = Field("api_logs", alias="MONGO_API_LOGS_COLLECTION")
-    openai_api_key: str = Field(..., alias="OPENAI_API_KEY")
-    openai_model: str = Field("gpt-4o-mini", alias="OPENAI_MODEL")
-    category_cache_ttl_seconds: int = Field(300, alias="CATEGORY_CACHE_TTL")
-    use_static_categories: bool = Field(True, alias="USE_STATIC_CATEGORIES")
 @lru_cache
 def get_settings() -> Settings:
     return Settings()
 settings = get_settings()

+"""Application configuration settings.
+This module handles all configuration settings loaded from environment variables.
+Settings are validated using Pydantic and cached for performance.
+"""
 from functools import lru_cache
 from pydantic import Field
 class Settings(BaseSettings):
+    """Application settings loaded from environment variables."""
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        env_file_encoding="utf-8",
+        extra="ignore",
+        case_sensitive=False,
+    )
+    # MongoDB Configuration
+    mongo_uri: str = Field(
+        ...,
+        alias="MONGO_URI",
+        description="MongoDB connection URI",
+    )
+    mongo_db: str = Field(
+        "expense",
+        alias="MONGO_DB",
+        description="MongoDB database name",
+    )
+    mongo_collection: str = Field(
+        "headcategories",
+        alias="MONGO_COLLECTION",
+        description="MongoDB collection name for headcategories",
+    )
+    mongo_subcategory_collection: str = Field(
+        "categories",
+        alias="MONGO_SUBCATEGORY_COLLECTION",
+        description="MongoDB collection name for categories",
+    )
+    api_logs_collection: str = Field(
+        "api_logs",
+        alias="MONGO_API_LOGS_COLLECTION",
+        description="MongoDB collection name for API logs",
+    )
+    # OpenAI Configuration
+    openai_api_key: str = Field(
+        ...,
+        alias="OPENAI_API_KEY",
+        description="OpenAI API key for LLM requests",
+    )
+    openai_model: str = Field(
+        "gpt-4o-mini",
+        alias="OPENAI_MODEL",
+        description="OpenAI model to use for categorization",
+    )
+    # Performance & Caching Configuration
+    category_cache_ttl_seconds: int = Field(
+        300,
+        alias="CATEGORY_CACHE_TTL",
+        description="Time-to-live for category cache in seconds (5 minutes default)",
+        ge=60,  # Minimum 1 minute
+    )
+    db_query_timeout_seconds: float = Field(
+        5.0,
+        alias="DB_QUERY_TIMEOUT",
+        description="Timeout for database queries in seconds",
+        ge=1.0,
+        le=30.0,
+    )
+    model_api_timeout_seconds: float = Field(
+        15.0,
+        alias="MODEL_API_TIMEOUT",
+        description="Timeout for OpenAI API calls in seconds",
+        ge=5.0,
+        le=60.0,
+    )
 @lru_cache
 def get_settings() -> Settings:
+    """Get cached settings instance.
+    Returns:
+        Settings: Application settings instance
+    """
     return Settings()
+# Global settings instance
 settings = get_settings()

app/dependencies.py CHANGED Viewed

@@ -15,6 +15,8 @@ def _get_service() -> AutoCategoryService:
         openai_client=openai_client,
         model=settings.openai_model,
         cache_ttl_seconds=settings.category_cache_ttl_seconds,
     )

         openai_client=openai_client,
         model=settings.openai_model,
         cache_ttl_seconds=settings.category_cache_ttl_seconds,
+        db_timeout_seconds=settings.db_query_timeout_seconds,
+        model_timeout_seconds=settings.model_api_timeout_seconds,
     )

app/schemas/categories.py CHANGED Viewed

@@ -2,19 +2,28 @@ from __future__ import annotations
 from typing import Optional
-from pydantic import BaseModel, Field
 class CategorizeRequest(BaseModel):
     notes: str = Field(..., min_length=1, description="Full transaction note.")
-    user_id: Optional[str] = Field(
-        None, description="Optional user identifier associated with the request."
-    )
 class CategoryPrediction(BaseModel):
-    title: str = Field(..., description="High-level category title.")
-    subcategory: str = Field(..., description="Specific subcategory chosen by the model.")
 class CategorizeResponse(BaseModel):

 from typing import Optional
+from bson import ObjectId
+from pydantic import BaseModel, Field, field_validator
 class CategorizeRequest(BaseModel):
     notes: str = Field(..., min_length=1, description="Full transaction note.")
+    user_id: str = Field(..., description="User identifier associated with the request.")
 class CategoryPrediction(BaseModel):
+    headcategory_id: str = Field(..., description="High-level category ObjectId.")
+    headcategory_title: str = Field(..., description="High-level category title.")
+    category_id: str = Field(..., description="Specific subcategory ObjectId chosen by the model.")
+    category_title: str = Field(..., description="Specific subcategory title chosen by the model.")
+    @field_validator('headcategory_id', 'category_id')
+    @classmethod
+    def validate_object_id(cls, v: str) -> str:
+        """Validate that the string is a valid ObjectId."""
+        if not ObjectId.is_valid(v):
+            raise ValueError(f"Invalid ObjectId: {v}")
+        return v
 class CategorizeResponse(BaseModel):

app/services/autocategorizer.py CHANGED Viewed

@@ -4,8 +4,9 @@ import ast
 import asyncio
 import json
 import re
 import time
-from typing import Callable, Dict, List, Optional
 from bson import ObjectId
 from fastapi import HTTPException
@@ -19,127 +20,6 @@ from app.schemas.categories import CategoryPrediction
 class AutoCategoryService:
     """Classifies transaction notes into the closest Mongo-backed category."""
-    # Curated categories requested by the client. When enabled via settings.use_static_categories,
-    # we bypass Mongo reads to avoid noisy data and long scans.
-    _STATIC_CATEGORIES: List[Dict[str, object]] = [
-        {
-            "title": "Food & Drinks",
-            "subcategories": ["Groceries", "Restaurant, Fast - Food", "Bar, Cafe", "Food & Drink"],
-        },
-        {
-            "title": "Investments",
-            "subcategories": [
-                "Investments",
-                "Realty",
-                "Vehicles, Chattels",
-                "Finacial investments",
-                "Savings",
-                "Collections",
-            ],
-        },
-        {
-            "title": "Communication,PC",
-            "subcategories": ["Communication,PC", "Phone", "Internet", "Software, app, games", "Postal services"],
-        },
-        {
-            "title": "Financial Expenses",
-            "subcategories": [
-                "Financial expenses",
-                "Taxes",
-                "Insurances",
-                "Loan, interests",
-                "Fines",
-                "Advisory",
-                "Charges, Fees",
-                "Child Support",
-            ],
-        },
-        {
-            "title": "Life & Entertainment",
-            "subcategories": [
-                "Life & Entertainment",
-                "Health, Care, Doctor",
-                "Wellness, Beauty",
-                "Active sport, Fitness",
-                "Culture, sport events",
-                "Life events",
-                "Hobbies",
-                "Education, Development",
-                "Books, Audio, subscription",
-                "TV, Streaming",
-                "Holiday, Trip, Hotels",
-                "Charity, Gifts",
-                "Alcohol, Tobacco",
-                "Lottery, Gamblings",
-            ],
-        },
-        {
-            "title": "Vehicle",
-            "subcategories": [
-                "Vehicle",
-                "Fuel",
-                "Parking",
-                "Vehicle maintenance",
-                "Rentals",
-                "Vehicle insurance",
-                "Leasing",
-            ],
-        },
-        {
-            "title": "Transportation",
-            "subcategories": ["Transportation", "Public transport", "Taxi", "Long distance", "Business trips"],
-        },
-        {
-            "title": "Housing",
-            "subcategories": [
-                "Housing",
-                "Rent",
-                "Mortgage",
-                "Energy, utilities",
-                "Services",
-                "Maintenance, repairs",
-                "Property insurance",
-            ],
-        },
-        {
-            "title": "Shopping",
-            "subcategories": [
-                "Shopping",
-                "Clothes & shoes",
-                "Jewels & Accessories",
-                "Health & Beauty",
-                "Kids",
-                "Home & Garden",
-                "Pets & Animals",
-                "Electronics",
-                "Gift",
-                "Stationary",
-                "Free time",
-                "Chemist",
-            ],
-        },
-        {
-            "title": "Income",
-            "subcategories": [
-                "Income",
-                "Wage, Invoices",
-                "Sale",
-                "Rental income",
-                "Dues & grants",
-                "Lending, renting",
-                "Checks, coupons",
-                "Lottery, gambling",
-                "Refunds",
-                "Child support",
-                "Gifts",
-                "Account Manage",
-            ],
-        },
-    ]
-    _categories_timeout_seconds = 15.0
-    _model_timeout_seconds = 20.0
     def __init__(
         self,
         collection_getter: Callable[[], AsyncIOMotorCollection],
@@ -147,15 +27,20 @@ class AutoCategoryService:
         openai_client: AsyncOpenAI,
         model: str,
         cache_ttl_seconds: int,
     ) -> None:
         self._collection_getter = collection_getter
         self._subcategory_collection_getter = subcategory_collection_getter
         self._openai_client = openai_client
         self._model = model
         self._cache_ttl_seconds = cache_ttl_seconds
-        self._cached_categories: List[Dict[str, object]] | None = None
-        self._last_loaded: float = 0.0
-        self._lock = asyncio.Lock()
     def _collection(self) -> AsyncIOMotorCollection:
         return self._collection_getter()
@@ -163,72 +48,220 @@ class AutoCategoryService:
     def _subcategory_collection(self) -> AsyncIOMotorCollection:
         return self._subcategory_collection_getter()
-    async def categorize(self, notes: str) -> CategoryPrediction:
         try:
-            categories = await asyncio.wait_for(
-                self._get_categories(), timeout=self._categories_timeout_seconds
             )
         except asyncio.TimeoutError as exc:
             raise HTTPException(status_code=504, detail="Timed out loading categories from database.") from exc
         except Exception as exc:
             raise HTTPException(status_code=502, detail="Failed to load categories from database.") from exc
-        if not categories:
-            raise HTTPException(status_code=500, detail="No categories configured.")
-        formatted_categories = self._format_categories(categories)
-        user_prompt = (
             "Transaction note:\n"
             f"{notes}\n\n"
-            "Available categories and subcategories:\n"
             f"{formatted_categories}\n\n"
-            "Respond with the exact title and subcategory from the list above."
         )
-        request_payload = dict(
             model=self._model,
-            input=[
                 {
                     "role": "system",
                     "content": (
                         "You classify financial transactions into the closest category. "
-                        "Only use the provided title and subcategory options. "
-                        "Output valid JSON with keys 'title' and 'subcategory'."
                     ),
                 },
-                {"role": "user", "content": [{"type": "input_text", "text": user_prompt}]},
             ],
         )
         try:
-            response = await asyncio.wait_for(
-                self._create_model_response(request_payload),
-                timeout=self._model_timeout_seconds,
-            )
-        except TypeError as exc:
-            # Older openai-python clients (pre 1.3x) do not yet support response_format.
-            if "response_format" not in str(exc):
-                raise
-            response = await asyncio.wait_for(
-                self._openai_client.responses.create(**request_payload),
                 timeout=self._model_timeout_seconds,
             )
         except asyncio.TimeoutError as exc:
-            raise HTTPException(status_code=504, detail="Timed out waiting for model response.") from exc
         except Exception as exc:
-            raise HTTPException(status_code=502, detail="Failed to call the model API.") from exc
         try:
-            payload = self._parse_response_payload(response)
         except ValueError as exc:
-            raise HTTPException(status_code=502, detail="Failed to parse model output.") from exc
-        title = payload.get("title")
-        subcategory = payload.get("subcategory")
-        if not isinstance(title, str) or not isinstance(subcategory, str):
-            raise HTTPException(status_code=502, detail="Model response missing category fields.")
-        return CategoryPrediction(title=title.strip(), subcategory=subcategory.strip())
     def _parse_response_payload(self, response) -> Dict[str, object]:
         raw_text = self._extract_response_text(response)
@@ -252,6 +285,14 @@ class AutoCategoryService:
     @staticmethod
     def _extract_response_text(response) -> str:
         text = getattr(response, "output_text", "") or ""
         if isinstance(text, str) and text.strip():
             return text.strip()
@@ -328,74 +369,221 @@ class AutoCategoryService:
         return None
-    async def _get_categories(self) -> List[Dict[str, object]]:
-        async with self._lock:
             now = time.monotonic()
-            if self._cached_categories and (now - self._last_loaded) < self._cache_ttl_seconds:
-                return self._cached_categories
-            if settings.use_static_categories:
-                self._cached_categories = self._STATIC_CATEGORIES
-                self._last_loaded = now
-                return self._cached_categories
-            # Use headcategories + categories to avoid scanning millions of raw transaction titles.
-            head_collection = self._collection()
-            subcategory_collection = self._subcategory_collection()
-            pipeline = [
-                {"$match": {"type": "EXPENSE", "categories": {"$type": "array", "$ne": []}}},
-                {"$group": {"_id": "$title", "category_ids": {"$first": "$categories"}}},
-            ]
-            head_docs = await head_collection.aggregate(pipeline).to_list(length=1000)
-            all_ids: set[ObjectId] = set()
-            for doc in head_docs:
-                for cid in doc.get("category_ids") or []:
-                    if isinstance(cid, ObjectId):
-                        all_ids.add(cid)
-            subcategory_titles: Dict[ObjectId, str] = {}
-            if all_ids:
-                cursor = subcategory_collection.find({"_id": {"$in": list(all_ids)}}, {"title": 1})
-                async for subdoc in cursor:
-                    title = subdoc.get("title")
-                    if isinstance(title, str) and title.strip():
-                        subcategory_titles[subdoc["_id"]] = title.strip()
-            categories: List[Dict[str, object]] = []
-            for doc in head_docs:
-                raw_title = doc.get("_id")
-                if not isinstance(raw_title, str) or not raw_title.strip():
-                    continue
-                ids = [cid for cid in (doc.get("category_ids") or []) if isinstance(cid, ObjectId)]
-                subcategories = sorted({subcategory_titles[cid] for cid in ids if cid in subcategory_titles})
-                if not subcategories:
-                    continue
-                categories.append(
-                    {
-                        "title": raw_title.strip(),
-                        "subcategories": subcategories,
-                    }
-                )
-            self._cached_categories = categories
-            self._last_loaded = now
-            return categories
     async def _create_model_response(self, request_payload: Dict[str, object]):
-        return await self._openai_client.responses.create(
-            response_format={"type": "json_object"},
-            **request_payload,
-        )
     @staticmethod
-    def _format_categories(categories: List[Dict[str, object]]) -> str:
         lines = []
         for category in categories:
             subs = category.get("subcategories") or []
-            subs_text = ", ".join(subs) if subs else "Unspecified"
             lines.append(f"- {category.get('title', 'Unknown')}: {subs_text}")
         return "\n".join(lines)

 import asyncio
 import json
 import re
+import string
 import time
+from typing import Callable, Dict, List, Optional, Tuple
 from bson import ObjectId
 from fastapi import HTTPException
 class AutoCategoryService:
     """Classifies transaction notes into the closest Mongo-backed category."""
     def __init__(
         self,
         collection_getter: Callable[[], AsyncIOMotorCollection],
         openai_client: AsyncOpenAI,
         model: str,
         cache_ttl_seconds: int,
+        db_timeout_seconds: float,
+        model_timeout_seconds: float,
     ) -> None:
         self._collection_getter = collection_getter
         self._subcategory_collection_getter = subcategory_collection_getter
         self._openai_client = openai_client
         self._model = model
         self._cache_ttl_seconds = cache_ttl_seconds
+        self._db_timeout_seconds = db_timeout_seconds
+        self._model_timeout_seconds = model_timeout_seconds
+        # User-specific cache for headcategories: {user_id: (data, timestamp)}
+        self._headcategories_cache: Dict[str, Tuple[Dict[str, object], float]] = {}
+        self._cache_lock = asyncio.Lock()
     def _collection(self) -> AsyncIOMotorCollection:
         return self._collection_getter()
     def _subcategory_collection(self) -> AsyncIOMotorCollection:
         return self._subcategory_collection_getter()
+    async def categorize(self, notes: str, user_id: str) -> CategoryPrediction:
+        """Categorize transaction notes using a two-step approach:
+        1. First match notes to a headcategory title
+        2. Then match notes to a category within that headcategory
+        """
+        # Step 1: Fetch all headcategories for the user (with caching)
         try:
+            headcategories_data = await asyncio.wait_for(
+                self._get_headcategories_cached(user_id), timeout=self._db_timeout_seconds
+            )
+        except asyncio.TimeoutError as exc:
+            raise HTTPException(status_code=504, detail="Timed out loading headcategories from database.") from exc
+        except Exception as exc:
+            raise HTTPException(status_code=502, detail="Failed to load headcategories from database.") from exc
+        if not headcategories_data or not headcategories_data.get("headcategories"):
+            raise HTTPException(status_code=500, detail="No headcategories configured for this user.")
+        # Step 2: Use LLM to match notes to a headcategory title
+        headcategory_titles = [hc.get("title", "") for hc in headcategories_data["headcategories"]]
+        formatted_headcategories = "\n".join([f"- {title}" for title in headcategory_titles if title])
+        headcategory_prompt = (
+            "Transaction note:\n"
+            f"{notes}\n\n"
+            "Available headcategories:\n"
+            f"{formatted_headcategories}\n\n"
+            "Respond with the exact headcategory title from the list above that best matches this transaction."
+        )
+        headcategory_request = dict(
+            model=self._model,
+            messages=[
+                {
+                    "role": "system",
+                    "content": (
+                        "You classify financial transactions into the closest headcategory. "
+                        "Only use the provided headcategory title options. "
+                        "Output valid JSON with key 'title'."
+                    ),
+                },
+                {"role": "user", "content": headcategory_prompt},
+            ],
+        )
+        try:
+            headcategory_response = await asyncio.wait_for(
+                self._create_model_response(headcategory_request),
+                timeout=self._model_timeout_seconds,
+            )
+        except asyncio.TimeoutError as exc:
+            raise HTTPException(status_code=504, detail="Timed out waiting for headcategory model response.") from exc
+        except Exception as exc:
+            error_msg = str(exc)
+            raise HTTPException(
+                status_code=502,
+                detail=f"Failed to call the model API for headcategory: {error_msg}"
+            ) from exc
+        try:
+            headcategory_payload = self._parse_response_payload(headcategory_response)
+        except ValueError as exc:
+            raise HTTPException(status_code=502, detail="Failed to parse headcategory model output.") from exc
+        matched_headcategory_title = headcategory_payload.get("title")
+        if not isinstance(matched_headcategory_title, str):
+            raise HTTPException(status_code=502, detail="Model response missing headcategory title field.")
+        # Step 3: Find the matched headcategory and get its categories (optimized lookup)
+        matched_headcategory = None
+        matched_title_normalized = self._normalize_string(matched_headcategory_title)
+        matched_title_lower = matched_headcategory_title.lower()
+        # Try exact normalized match first (most common case)
+        for hc in headcategories_data["headcategories"]:
+            hc_title = hc.get("title", "")
+            if self._normalize_string(hc_title) == matched_title_normalized:
+                matched_headcategory = hc
+                break
+        # Try partial matching if exact normalized match fails
+        if not matched_headcategory:
+            for hc in headcategories_data["headcategories"]:
+                hc_title = hc.get("title", "").lower()
+                if matched_title_lower in hc_title or hc_title in matched_title_lower:
+                    matched_headcategory = hc
+                    break
+        if not matched_headcategory:
+            available_titles = ", ".join(headcategory_titles[:10])
+            raise HTTPException(
+                status_code=502,
+                detail=(
+                    f"Could not find matching headcategory for title: '{matched_headcategory_title}'. "
+                    f"Available headcategories: {available_titles}"
+                )
+            )
+        headcategory_id = matched_headcategory.get("_id")
+        category_ids = matched_headcategory.get("category_ids", [])
+        if not isinstance(headcategory_id, ObjectId):
+            raise HTTPException(status_code=500, detail="Invalid headcategory ID format.")
+        if not category_ids:
+            raise HTTPException(status_code=500, detail="Selected headcategory has no categories.")
+        # Step 4: Fetch categories from categories collection
+        try:
+            categories_data = await asyncio.wait_for(
+                self._get_categories_by_ids(category_ids), timeout=self._db_timeout_seconds
             )
         except asyncio.TimeoutError as exc:
             raise HTTPException(status_code=504, detail="Timed out loading categories from database.") from exc
         except Exception as exc:
             raise HTTPException(status_code=502, detail="Failed to load categories from database.") from exc
+        if not categories_data or not categories_data.get("categories"):
+            raise HTTPException(status_code=500, detail="No categories found for the selected headcategory.")
+        # Step 5: Use LLM to match notes to a specific category
+        category_titles = [cat.get("title", "") for cat in categories_data["categories"]]
+        formatted_categories = "\n".join([f"- {title}" for title in category_titles if title])
+        category_prompt = (
             "Transaction note:\n"
             f"{notes}\n\n"
+            "Available categories:\n"
             f"{formatted_categories}\n\n"
+            "Respond with the exact category title from the list above that best matches this transaction."
         )
+        category_request = dict(
             model=self._model,
+            messages=[
                 {
                     "role": "system",
                     "content": (
                         "You classify financial transactions into the closest category. "
+                        "Only use the provided category title options. "
+                        "Output valid JSON with key 'title'."
                     ),
                 },
+                {"role": "user", "content": category_prompt},
             ],
         )
         try:
+            category_response = await asyncio.wait_for(
+                self._create_model_response(category_request),
                 timeout=self._model_timeout_seconds,
             )
         except asyncio.TimeoutError as exc:
+            raise HTTPException(status_code=504, detail="Timed out waiting for category model response.") from exc
         except Exception as exc:
+            error_msg = str(exc)
+            raise HTTPException(
+                status_code=502,
+                detail=f"Failed to call the model API for category: {error_msg}"
+            ) from exc
         try:
+            category_payload = self._parse_response_payload(category_response)
         except ValueError as exc:
+            raise HTTPException(status_code=502, detail="Failed to parse category model output.") from exc
+        matched_category_title = category_payload.get("title")
+        if not isinstance(matched_category_title, str):
+            raise HTTPException(status_code=502, detail="Model response missing category title field.")
+        # Step 6: Find the matched category ID (optimized lookup)
+        matched_category = None
+        matched_cat_title_normalized = self._normalize_string(matched_category_title)
+        matched_cat_title_lower = matched_category_title.lower()
+        # Try exact normalized match first (most common case)
+        for cat in categories_data["categories"]:
+            cat_title = cat.get("title", "")
+            if self._normalize_string(cat_title) == matched_cat_title_normalized:
+                matched_category = cat
+                break
+        # Try partial matching if exact normalized match fails
+        if not matched_category:
+            for cat in categories_data["categories"]:
+                cat_title = cat.get("title", "").lower()
+                if matched_cat_title_lower in cat_title or cat_title in matched_cat_title_lower:
+                    matched_category = cat
+                    break
+        if not matched_category:
+            available_titles = ", ".join(category_titles[:10])
+            raise HTTPException(
+                status_code=502,
+                detail=(
+                    f"Could not find matching category for title: '{matched_category_title}'. "
+                    f"Available categories: {available_titles}"
+                )
+            )
+        category_id = matched_category.get("_id")
+        if not isinstance(category_id, ObjectId):
+            raise HTTPException(status_code=500, detail="Invalid category ID format.")
+        # Get titles from matched objects
+        headcategory_title = matched_headcategory.get("title", "")
+        category_title = matched_category.get("title", "")
+        return CategoryPrediction(
+            headcategory_id=str(headcategory_id),
+            headcategory_title=headcategory_title,
+            category_id=str(category_id),
+            category_title=category_title,
+        )
     def _parse_response_payload(self, response) -> Dict[str, object]:
         raw_text = self._extract_response_text(response)
     @staticmethod
     def _extract_response_text(response) -> str:
+        """Extract text from OpenAI API response (supports both Chat Completions and Responses API)."""
+        # Try standard Chat Completions API format first
+        if hasattr(response, "choices") and response.choices:
+            message = response.choices[0].message
+            if hasattr(message, "content") and message.content:
+                return message.content.strip()
+        # Try Responses API format
         text = getattr(response, "output_text", "") or ""
         if isinstance(text, str) and text.strip():
             return text.strip()
         return None
+    async def _get_headcategories_cached(self, user_id: str) -> Dict[str, object]:
+        """Fetch headcategories from MongoDB with user-specific caching."""
+        async with self._cache_lock:
             now = time.monotonic()
+            # Check cache
+            if user_id in self._headcategories_cache:
+                cached_data, cached_time = self._headcategories_cache[user_id]
+                if (now - cached_time) < self._cache_ttl_seconds:
+                    return cached_data
+                # Cache expired, remove it
+                del self._headcategories_cache[user_id]
+        # Fetch from database
+        data = await self._get_headcategories(user_id)
+        # Update cache
+        async with self._cache_lock:
+            self._headcategories_cache[user_id] = (data, time.monotonic())
+        return data
+    async def _get_headcategories(self, user_id: str) -> Dict[str, object]:
+        """Fetch headcategories from MongoDB filtered by user_id."""
+        head_collection = self._collection()
+        # Convert user_id string to ObjectId
+        try:
+            user_object_id = ObjectId(user_id)
+        except Exception as exc:
+            raise HTTPException(status_code=400, detail=f"Invalid user_id format: {user_id}") from exc
+        # Query headcategories filtered by user_id - only fetch needed fields for performance
+        head_docs = await head_collection.find(
+            {"user": user_object_id, "categories": {"$type": "array", "$ne": []}},
+            {"_id": 1, "title": 1, "categories": 1}  # Only fetch needed fields
+        ).to_list(length=1000)
+        if not head_docs:
+            return {"headcategories": []}
+        # Build headcategories structure
+        headcategories: List[Dict[str, object]] = []
+        for head_doc in head_docs:
+            head_id = head_doc.get("_id")
+            if not isinstance(head_id, ObjectId):
+                continue
+            category_ids = [cid for cid in (head_doc.get("categories") or []) if isinstance(cid, ObjectId)]
+            if not category_ids:
+                continue
+            headcategories.append({
+                "_id": head_id,
+                "title": head_doc.get("title", ""),
+                "category_ids": category_ids,
+            })
+        return {"headcategories": headcategories}
+    async def _get_categories_by_ids(self, category_ids: List[ObjectId]) -> Dict[str, object]:
+        """Fetch categories from MongoDB by their ObjectIds."""
+        subcategory_collection = self._subcategory_collection()
+        if not category_ids:
+            return {"categories": []}
+        # Query categories collection with the provided ObjectIds
+        categories: List[Dict[str, object]] = []
+        cursor = subcategory_collection.find(
+            {"_id": {"$in": category_ids}},
+            {"title": 1, "_id": 1}
+        )
+        async for cat_doc in cursor:
+            cat_id = cat_doc.get("_id")
+            if isinstance(cat_id, ObjectId):
+                categories.append({
+                    "_id": cat_id,
+                    "title": cat_doc.get("title", ""),
+                })
+        return {"categories": categories}
     async def _create_model_response(self, request_payload: Dict[str, object]):
+        """Create a model response using OpenAI Chat Completions API."""
+        try:
+            return await self._openai_client.chat.completions.create(
+                response_format={"type": "json_object"},
+                **request_payload,
+            )
+        except TypeError as exc:
+            # Fallback for older openai-python clients or custom API endpoints
+            if "responses" in dir(self._openai_client):
+                return await self._openai_client.responses.create(
+                    response_format={"type": "json_object"},
+                    **request_payload,
+                )
+            raise
     @staticmethod
+    def _format_categories_for_llm(categories: List[Dict[str, object]]) -> str:
+        """Format categories for LLM prompt."""
         lines = []
         for category in categories:
             subs = category.get("subcategories") or []
+            subs_text = ", ".join([sub.get("title", "") for sub in subs if isinstance(sub, dict)]) if subs else "Unspecified"
             lines.append(f"- {category.get('title', 'Unknown')}: {subs_text}")
         return "\n".join(lines)
+    @staticmethod
+    def _normalize_string(s: str) -> str:
+        """Normalize string by removing punctuation and extra spaces for better matching."""
+        # Remove punctuation and convert to lowercase
+        normalized = s.translate(str.maketrans('', '', string.punctuation)).lower().strip()
+        # Replace multiple spaces with single space
+        normalized = ' '.join(normalized.split())
+        return normalized
+    @staticmethod
+    def _find_matching_ids(
+        categories: List[Dict[str, object]],
+        title: str,
+        subcategory: str
+    ) -> tuple[ObjectId | None, ObjectId | None]:
+        """Find matching headcategory_id and category_id based on title and subcategory strings.
+        Uses flexible matching:
+        1. Exact match (case-insensitive)
+        2. Normalized match (removes punctuation)
+        3. Partial match (one contains the other)
+        4. Word-based match (checks if key words match)
+        """
+        title_lower = title.strip().lower()
+        subcategory_lower = subcategory.strip().lower()
+        title_normalized = AutoCategoryService._normalize_string(title)
+        subcategory_normalized = AutoCategoryService._normalize_string(subcategory)
+        # First pass: exact match
+        for category in categories:
+            head_title = category.get("title", "").strip().lower()
+            if head_title != title_lower:
+                continue
+            subcategories = category.get("subcategories", [])
+            for sub in subcategories:
+                if isinstance(sub, dict):
+                    sub_title = sub.get("title", "").strip().lower()
+                    if sub_title == subcategory_lower:
+                        headcategory_id = category.get("headcategory_id")
+                        category_id = sub.get("_id")
+                        if isinstance(headcategory_id, ObjectId) and isinstance(category_id, ObjectId):
+                            return headcategory_id, category_id
+        # Second pass: normalized match (removes punctuation, handles "Wage" vs "Wage, Invoices")
+        for category in categories:
+            head_title = category.get("title", "").strip().lower()
+            head_title_norm = AutoCategoryService._normalize_string(head_title)
+            if head_title_norm != title_normalized and title_normalized not in head_title_norm and head_title_norm not in title_normalized:
+                continue
+            subcategories = category.get("subcategories", [])
+            for sub in subcategories:
+                if isinstance(sub, dict):
+                    sub_title = sub.get("title", "").strip().lower()
+                    sub_title_norm = AutoCategoryService._normalize_string(sub_title)
+                    if (sub_title_norm == subcategory_normalized or
+                        subcategory_normalized in sub_title_norm or
+                        sub_title_norm in subcategory_normalized):
+                        headcategory_id = category.get("headcategory_id")
+                        category_id = sub.get("_id")
+                        if isinstance(headcategory_id, ObjectId) and isinstance(category_id, ObjectId):
+                            return headcategory_id, category_id
+        # Third pass: partial match (one contains the other)
+        for category in categories:
+            head_title = category.get("title", "").strip().lower()
+            # Check if title matches (exact or contains)
+            if title_lower not in head_title and head_title not in title_lower:
+                continue
+            subcategories = category.get("subcategories", [])
+            for sub in subcategories:
+                if isinstance(sub, dict):
+                    sub_title = sub.get("title", "").strip().lower()
+                    # Check if subcategory matches (exact or contains)
+                    if (subcategory_lower in sub_title or sub_title in subcategory_lower or
+                        subcategory_lower.split()[0] in sub_title or sub_title.split()[0] in subcategory_lower):
+                        headcategory_id = category.get("headcategory_id")
+                        category_id = sub.get("_id")
+                        if isinstance(headcategory_id, ObjectId) and isinstance(category_id, ObjectId):
+                            return headcategory_id, category_id
+        # Fourth pass: word-based matching (for cases like "Wage" matching "Wage, Invoices")
+        title_words = set(title_lower.split())
+        subcategory_words = set(subcategory_lower.split())
+        for category in categories:
+            head_title = category.get("title", "").strip().lower()
+            head_title_words = set(head_title.split())
+            # Check if there's significant word overlap for title
+            if not title_words.intersection(head_title_words) and not head_title_words.intersection(title_words):
+                continue
+            subcategories = category.get("subcategories", [])
+            for sub in subcategories:
+                if isinstance(sub, dict):
+                    sub_title = sub.get("title", "").strip().lower()
+                    sub_title_words = set(sub_title.split())
+                    # Check if there's significant word overlap for subcategory
+                    if (subcategory_words.intersection(sub_title_words) or
+                        sub_title_words.intersection(subcategory_words)):
+                        headcategory_id = category.get("headcategory_id")
+                        category_id = sub.get("_id")
+                        if isinstance(headcategory_id, ObjectId) and isinstance(category_id, ObjectId):
+                            return headcategory_id, category_id
+        return None, None