LogicGoInfotechSpaces commited on
Commit
2c85aa8
·
verified ·
1 Parent(s): 123305c

Update app/services/autocategorizer.py

Browse files
Files changed (1) hide show
  1. app/services/autocategorizer.py +194 -8
app/services/autocategorizer.py CHANGED
@@ -7,6 +7,7 @@ import re
7
  import time
8
  from typing import Callable, Dict, List, Optional
9
 
 
10
  from fastapi import HTTPException
11
  from motor.motor_asyncio import AsyncIOMotorCollection
12
  from openai import AsyncOpenAI
@@ -18,14 +19,137 @@ from app.schemas.categories import CategoryPrediction
18
  class AutoCategoryService:
19
  """Classifies transaction notes into the closest Mongo-backed category."""
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  def __init__(
22
  self,
23
  collection_getter: Callable[[], AsyncIOMotorCollection],
 
24
  openai_client: AsyncOpenAI,
25
  model: str,
26
  cache_ttl_seconds: int,
27
  ) -> None:
28
  self._collection_getter = collection_getter
 
29
  self._openai_client = openai_client
30
  self._model = model
31
  self._cache_ttl_seconds = cache_ttl_seconds
@@ -36,8 +160,19 @@ class AutoCategoryService:
36
  def _collection(self) -> AsyncIOMotorCollection:
37
  return self._collection_getter()
38
 
 
 
 
39
  async def categorize(self, notes: str) -> CategoryPrediction:
40
- categories = await self._get_categories()
 
 
 
 
 
 
 
 
41
  if not categories:
42
  raise HTTPException(status_code=500, detail="No categories configured.")
43
 
@@ -66,15 +201,22 @@ class AutoCategoryService:
66
  )
67
 
68
  try:
69
- response = await self._openai_client.responses.create(
70
- response_format={"type": "json_object"},
71
- **request_payload,
72
  )
73
  except TypeError as exc:
74
  # Older openai-python clients (pre 1.3x) do not yet support response_format.
75
  if "response_format" not in str(exc):
76
  raise
77
- response = await self._openai_client.responses.create(**request_payload)
 
 
 
 
 
 
 
78
 
79
  try:
80
  payload = self._parse_response_payload(response)
@@ -192,12 +334,50 @@ class AutoCategoryService:
192
  if self._cached_categories and (now - self._last_loaded) < self._cache_ttl_seconds:
193
  return self._cached_categories
194
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  categories: List[Dict[str, object]] = []
196
- async for doc in self._collection().find({}, {"_id": 0}):
 
 
 
 
 
 
 
 
 
197
  categories.append(
198
  {
199
- "title": doc.get("title", ""),
200
- "subcategories": doc.get("subcategories", []),
201
  }
202
  )
203
 
@@ -205,6 +385,12 @@ class AutoCategoryService:
205
  self._last_loaded = now
206
  return categories
207
 
 
 
 
 
 
 
208
  @staticmethod
209
  def _format_categories(categories: List[Dict[str, object]]) -> str:
210
  lines = []
 
7
  import time
8
  from typing import Callable, Dict, List, Optional
9
 
10
+ from bson import ObjectId
11
  from fastapi import HTTPException
12
  from motor.motor_asyncio import AsyncIOMotorCollection
13
  from openai import AsyncOpenAI
 
19
  class AutoCategoryService:
20
  """Classifies transaction notes into the closest Mongo-backed category."""
21
 
22
+ # Curated categories requested by the client. When enabled via settings.use_static_categories,
23
+ # we bypass Mongo reads to avoid noisy data and long scans.
24
+ _STATIC_CATEGORIES: List[Dict[str, object]] = [
25
+ {
26
+ "title": "Food & Drinks",
27
+ "subcategories": ["Groceries", "Restaurant, Fast - Food", "Bar, Cafe", "Food & Drink"],
28
+ },
29
+ {
30
+ "title": "Investments",
31
+ "subcategories": [
32
+ "Investments",
33
+ "Realty",
34
+ "Vehicles, Chattels",
35
+ "Finacial investments",
36
+ "Savings",
37
+ "Collections",
38
+ ],
39
+ },
40
+ {
41
+ "title": "Communication,PC",
42
+ "subcategories": ["Communication,PC", "Phone", "Internet", "Software, app, games", "Postal services"],
43
+ },
44
+ {
45
+ "title": "Financial Expenses",
46
+ "subcategories": [
47
+ "Financial expenses",
48
+ "Taxes",
49
+ "Insurances",
50
+ "Loan, interests",
51
+ "Fines",
52
+ "Advisory",
53
+ "Charges, Fees",
54
+ "Child Support",
55
+ ],
56
+ },
57
+ {
58
+ "title": "Life & Entertainment",
59
+ "subcategories": [
60
+ "Life & Entertainment",
61
+ "Health, Care, Doctor",
62
+ "Wellness, Beauty",
63
+ "Active sport, Fitness",
64
+ "Culture, sport events",
65
+ "Life events",
66
+ "Hobbies",
67
+ "Education, Development",
68
+ "Books, Audio, subscription",
69
+ "TV, Streaming",
70
+ "Holiday, Trip, Hotels",
71
+ "Charity, Gifts",
72
+ "Alcohol, Tobacco",
73
+ "Lottery, Gamblings",
74
+ ],
75
+ },
76
+ {
77
+ "title": "Vehicle",
78
+ "subcategories": [
79
+ "Vehicle",
80
+ "Fuel",
81
+ "Parking",
82
+ "Vehicle maintenance",
83
+ "Rentals",
84
+ "Vehicle insurance",
85
+ "Leasing",
86
+ ],
87
+ },
88
+ {
89
+ "title": "Transportation",
90
+ "subcategories": ["Transportation", "Public transport", "Taxi", "Long distance", "Business trips"],
91
+ },
92
+ {
93
+ "title": "Housing",
94
+ "subcategories": [
95
+ "Housing",
96
+ "Rent",
97
+ "Mortgage",
98
+ "Energy, utilities",
99
+ "Services",
100
+ "Maintenance, repairs",
101
+ "Property insurance",
102
+ ],
103
+ },
104
+ {
105
+ "title": "Shopping",
106
+ "subcategories": [
107
+ "Shopping",
108
+ "Clothes & shoes",
109
+ "Jewels & Accessories",
110
+ "Health & Beauty",
111
+ "Kids",
112
+ "Home & Garden",
113
+ "Pets & Animals",
114
+ "Electronics",
115
+ "Gift",
116
+ "Stationary",
117
+ "Free time",
118
+ "Chemist",
119
+ ],
120
+ },
121
+ {
122
+ "title": "Income",
123
+ "subcategories": [
124
+ "Income",
125
+ "Wage, Invoices",
126
+ "Sale",
127
+ "Rental income",
128
+ "Dues & grants",
129
+ "Lending, renting",
130
+ "Checks, coupons",
131
+ "Lottery, gambling",
132
+ "Refunds",
133
+ "Child support",
134
+ "Gifts",
135
+ "Account Manage",
136
+ ],
137
+ },
138
+ ]
139
+
140
+ _categories_timeout_seconds = 15.0
141
+ _model_timeout_seconds = 20.0
142
+
143
  def __init__(
144
  self,
145
  collection_getter: Callable[[], AsyncIOMotorCollection],
146
+ subcategory_collection_getter: Callable[[], AsyncIOMotorCollection],
147
  openai_client: AsyncOpenAI,
148
  model: str,
149
  cache_ttl_seconds: int,
150
  ) -> None:
151
  self._collection_getter = collection_getter
152
+ self._subcategory_collection_getter = subcategory_collection_getter
153
  self._openai_client = openai_client
154
  self._model = model
155
  self._cache_ttl_seconds = cache_ttl_seconds
 
160
  def _collection(self) -> AsyncIOMotorCollection:
161
  return self._collection_getter()
162
 
163
+ def _subcategory_collection(self) -> AsyncIOMotorCollection:
164
+ return self._subcategory_collection_getter()
165
+
166
  async def categorize(self, notes: str) -> CategoryPrediction:
167
+ try:
168
+ categories = await asyncio.wait_for(
169
+ self._get_categories(), timeout=self._categories_timeout_seconds
170
+ )
171
+ except asyncio.TimeoutError as exc:
172
+ raise HTTPException(status_code=504, detail="Timed out loading categories from database.") from exc
173
+ except Exception as exc:
174
+ raise HTTPException(status_code=502, detail="Failed to load categories from database.") from exc
175
+
176
  if not categories:
177
  raise HTTPException(status_code=500, detail="No categories configured.")
178
 
 
201
  )
202
 
203
  try:
204
+ response = await asyncio.wait_for(
205
+ self._create_model_response(request_payload),
206
+ timeout=self._model_timeout_seconds,
207
  )
208
  except TypeError as exc:
209
  # Older openai-python clients (pre 1.3x) do not yet support response_format.
210
  if "response_format" not in str(exc):
211
  raise
212
+ response = await asyncio.wait_for(
213
+ self._openai_client.responses.create(**request_payload),
214
+ timeout=self._model_timeout_seconds,
215
+ )
216
+ except asyncio.TimeoutError as exc:
217
+ raise HTTPException(status_code=504, detail="Timed out waiting for model response.") from exc
218
+ except Exception as exc:
219
+ raise HTTPException(status_code=502, detail="Failed to call the model API.") from exc
220
 
221
  try:
222
  payload = self._parse_response_payload(response)
 
334
  if self._cached_categories and (now - self._last_loaded) < self._cache_ttl_seconds:
335
  return self._cached_categories
336
 
337
+ if settings.use_static_categories:
338
+ self._cached_categories = self._STATIC_CATEGORIES
339
+ self._last_loaded = now
340
+ return self._cached_categories
341
+
342
+ # Use headcategories + categories to avoid scanning millions of raw transaction titles.
343
+ head_collection = self._collection()
344
+ subcategory_collection = self._subcategory_collection()
345
+
346
+ pipeline = [
347
+ {"$match": {"type": "EXPENSE", "categories": {"$type": "array", "$ne": []}}},
348
+ {"$group": {"_id": "$title", "category_ids": {"$first": "$categories"}}},
349
+ ]
350
+ head_docs = await head_collection.aggregate(pipeline).to_list(length=1000)
351
+
352
+ all_ids: set[ObjectId] = set()
353
+ for doc in head_docs:
354
+ for cid in doc.get("category_ids") or []:
355
+ if isinstance(cid, ObjectId):
356
+ all_ids.add(cid)
357
+
358
+ subcategory_titles: Dict[ObjectId, str] = {}
359
+ if all_ids:
360
+ cursor = subcategory_collection.find({"_id": {"$in": list(all_ids)}}, {"title": 1})
361
+ async for subdoc in cursor:
362
+ title = subdoc.get("title")
363
+ if isinstance(title, str) and title.strip():
364
+ subcategory_titles[subdoc["_id"]] = title.strip()
365
+
366
  categories: List[Dict[str, object]] = []
367
+ for doc in head_docs:
368
+ raw_title = doc.get("_id")
369
+ if not isinstance(raw_title, str) or not raw_title.strip():
370
+ continue
371
+
372
+ ids = [cid for cid in (doc.get("category_ids") or []) if isinstance(cid, ObjectId)]
373
+ subcategories = sorted({subcategory_titles[cid] for cid in ids if cid in subcategory_titles})
374
+ if not subcategories:
375
+ continue
376
+
377
  categories.append(
378
  {
379
+ "title": raw_title.strip(),
380
+ "subcategories": subcategories,
381
  }
382
  )
383
 
 
385
  self._last_loaded = now
386
  return categories
387
 
388
+ async def _create_model_response(self, request_payload: Dict[str, object]):
389
+ return await self._openai_client.responses.create(
390
+ response_format={"type": "json_object"},
391
+ **request_payload,
392
+ )
393
+
394
  @staticmethod
395
  def _format_categories(categories: List[Dict[str, object]]) -> str:
396
  lines = []