LogicGoInfotechSpaces commited on
Commit
76af018
·
1 Parent(s): b8d9bc1

Improve detector schema + add API tester

Browse files
Files changed (8) hide show
  1. README.md +18 -4
  2. requirements.txt +1 -0
  3. src/api.py +2 -2
  4. src/config.py +18 -1
  5. src/duplicate_detector.py +2 -0
  6. src/models.py +29 -3
  7. src/repositories.py +15 -10
  8. test.py +71 -0
README.md CHANGED
@@ -33,10 +33,10 @@ Quick Start
33
 
34
  2. Copy `.env.example` to `.env` and set the Mongo connection string if you do not want to rely on the baked-in default.
35
 
36
- 3. Run the detector (the default config scans the last 48 h of data and writes suggestions only):
37
 
38
  ```
39
- python3 -m src.main --minutes 12 --lookback-hours 72
40
  ```
41
 
42
  You will see log lines such as:
@@ -63,7 +63,7 @@ Endpoints:
63
  Collections
64
  -----------
65
 
66
- * `expenses`: source data. The detector expects fields such as `_id`, `amount`, `currency`, `merchant`, `expense_time`.
67
  * `merchant_aliases`: optional alias definitions (`name`, `aliases`).
68
  * `merge_suggestions`: the service writes documents shaped as:
69
 
@@ -94,12 +94,26 @@ All tunables live in `src/config.py`. Environment variables take precedence, so
94
  | --- | --- | --- |
95
  | `MONGO_URI` | Mongo connection string | Provided URI |
96
  | `MONGO_DB` | Database name | `expense` |
97
- | `MONGO_EXPENSE_COLLECTION` | Expenses collection | `expenses` |
98
  | `MONGO_ALIAS_COLLECTION` | Merchant alias collection | `merchant_aliases` |
99
  | `MONGO_SUGGESTION_COLLECTION` | Merge-suggestion collection | `merge_suggestions` |
100
  | `AMOUNT_TOLERANCE_PCT` | Amount delta percentage | `1.0` |
101
  | `TIME_TOLERANCE_MINUTES` | Time delta minutes | `10` |
102
  | `DEFAULT_LOOKBACK_HOURS` | How far back to scan | `48` |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
  Next Steps
105
  ----------
 
33
 
34
  2. Copy `.env.example` to `.env` and set the Mongo connection string if you do not want to rely on the baked-in default.
35
 
36
+ 3. Run the detector (the default config scans the last 48 h of data and writes suggestions only). For the historical `transactions` collection you may want to bump the lookback window:
37
 
38
  ```
39
+ python3 -m src.main --minutes 30 --lookback-hours 720
40
  ```
41
 
42
  You will see log lines such as:
 
63
  Collections
64
  -----------
65
 
66
+ * `transactions` (default): source data. The detector automatically maps the `date`/`createdAt` timestamp and `note`/`paymentType` merchant fields so you still get near-duplicate detection without reshaping your documents. Entries are only compared if they belong to the same `user`.
67
  * `merchant_aliases`: optional alias definitions (`name`, `aliases`).
68
  * `merge_suggestions`: the service writes documents shaped as:
69
 
 
94
  | --- | --- | --- |
95
  | `MONGO_URI` | Mongo connection string | Provided URI |
96
  | `MONGO_DB` | Database name | `expense` |
97
+ | `MONGO_EXPENSE_COLLECTION` | Expenses collection | `transactions` |
98
  | `MONGO_ALIAS_COLLECTION` | Merchant alias collection | `merchant_aliases` |
99
  | `MONGO_SUGGESTION_COLLECTION` | Merge-suggestion collection | `merge_suggestions` |
100
  | `AMOUNT_TOLERANCE_PCT` | Amount delta percentage | `1.0` |
101
  | `TIME_TOLERANCE_MINUTES` | Time delta minutes | `10` |
102
  | `DEFAULT_LOOKBACK_HOURS` | How far back to scan | `48` |
103
+ | `TIME_FIELDS` | CSV priority order for timestamps | `date,expense_time,createdAt` |
104
+ | `MERCHANT_FIELDS` | CSV priority order for merchant labels | `merchant,note,paymentType,type,to` |
105
+ | `USER_FIELD` | Source field that stores the user id (inferred automatically) | `user` |
106
+
107
+ Smoke Test
108
+ ----------
109
+
110
+ Use the bundled `test.py` script to hit the running API (locally or on the Hugging Face Space) via the base URL:
111
+
112
+ ```
113
+ python3 test.py --base-url https://LogicGoInfotechSpaces-duplicate-transaction-detection.hf.space --lookback-hours 720 --limit 5000
114
+ ```
115
+
116
+ The script calls `/health`, `/duplicates/detect`, and `/suggestions` in sequence and prints the responses so you can quickly verify the deployment.
117
 
118
  Next Steps
119
  ----------
requirements.txt CHANGED
@@ -2,4 +2,5 @@ pymongo>=4.8.0
2
  python-dotenv>=1.0.1
3
  fastapi>=0.115.0
4
  uvicorn>=0.30.6
 
5
 
 
2
  python-dotenv>=1.0.1
3
  fastapi>=0.115.0
4
  uvicorn>=0.30.6
5
+ requests>=2.32.3
6
 
src/api.py CHANGED
@@ -33,8 +33,8 @@ suggestion_repository = MergeSuggestionRepository.from_client(mongo_client)
33
  class DetectRequest(BaseModel):
34
  lookback_hours: int | None = Field(
35
  default=None,
36
- ge=1,
37
- description="Hours to look back when scanning expenses.",
38
  )
39
  limit: int | None = Field(
40
  default=None,
 
33
  class DetectRequest(BaseModel):
34
  lookback_hours: int | None = Field(
35
  default=None,
36
+ ge=0,
37
+ description="Hours to look back when scanning expenses (0 = full history).",
38
  )
39
  limit: int | None = Field(
40
  default=None,
src/config.py CHANGED
@@ -5,6 +5,7 @@ from __future__ import annotations
5
  import os
6
  from dataclasses import dataclass
7
  from decimal import Decimal
 
8
 
9
  from dotenv import load_dotenv
10
 
@@ -31,11 +32,19 @@ def _get_int(env_key: str, default: str) -> int:
31
  raise ValueError(f"Invalid int for {env_key}: {raw_value}") from exc
32
 
33
 
 
 
 
 
 
 
 
 
34
  @dataclass(frozen=True)
35
  class Settings:
36
  mongo_uri: str = os.getenv("MONGO_URI", DEFAULT_MONGO_URI)
37
  mongo_db: str = os.getenv("MONGO_DB", "expense")
38
- expense_collection: str = os.getenv("MONGO_EXPENSE_COLLECTION", "expenses")
39
  alias_collection: str = os.getenv("MONGO_ALIAS_COLLECTION", "merchant_aliases")
40
  suggestion_collection: str = os.getenv("MONGO_SUGGESTION_COLLECTION", "merge_suggestions")
41
  amount_tolerance_pct: Decimal = _get_decimal("AMOUNT_TOLERANCE_PCT", "1.0")
@@ -43,6 +52,14 @@ class Settings:
43
  default_lookback_hours: int = _get_int("DEFAULT_LOOKBACK_HOURS", "48")
44
  service_name: str = os.getenv("SERVICE_NAME", "duplicate-detector")
45
  max_batch_size: int = _get_int("MAX_BATCH_SIZE", "5000")
 
 
 
 
 
 
 
 
46
 
47
 
48
  settings = Settings()
 
5
  import os
6
  from dataclasses import dataclass
7
  from decimal import Decimal
8
+ from typing import Tuple
9
 
10
  from dotenv import load_dotenv
11
 
 
32
  raise ValueError(f"Invalid int for {env_key}: {raw_value}") from exc
33
 
34
 
35
+ def _get_tuple(env_key: str, default: str) -> Tuple[str, ...]:
36
+ raw_value = os.getenv(env_key, default)
37
+ values = [value.strip() for value in raw_value.split(",") if value.strip()]
38
+ if not values:
39
+ raise ValueError(f"{env_key} must contain at least one value")
40
+ return tuple(values)
41
+
42
+
43
  @dataclass(frozen=True)
44
  class Settings:
45
  mongo_uri: str = os.getenv("MONGO_URI", DEFAULT_MONGO_URI)
46
  mongo_db: str = os.getenv("MONGO_DB", "expense")
47
+ expense_collection: str = os.getenv("MONGO_EXPENSE_COLLECTION", "transactions")
48
  alias_collection: str = os.getenv("MONGO_ALIAS_COLLECTION", "merchant_aliases")
49
  suggestion_collection: str = os.getenv("MONGO_SUGGESTION_COLLECTION", "merge_suggestions")
50
  amount_tolerance_pct: Decimal = _get_decimal("AMOUNT_TOLERANCE_PCT", "1.0")
 
52
  default_lookback_hours: int = _get_int("DEFAULT_LOOKBACK_HOURS", "48")
53
  service_name: str = os.getenv("SERVICE_NAME", "duplicate-detector")
54
  max_batch_size: int = _get_int("MAX_BATCH_SIZE", "5000")
55
+ time_fields: Tuple[str, ...] = _get_tuple(
56
+ "TIME_FIELDS",
57
+ "date,expense_time,createdAt",
58
+ )
59
+ merchant_fields: Tuple[str, ...] = _get_tuple(
60
+ "MERCHANT_FIELDS",
61
+ "merchant,note,paymentType,type,to",
62
+ )
63
 
64
 
65
  settings = Settings()
src/duplicate_detector.py CHANGED
@@ -45,6 +45,8 @@ class DuplicateDetector:
45
  for i, exp_a in enumerate(expenses):
46
  for j in range(i + 1, len(expenses)):
47
  exp_b = expenses[j]
 
 
48
  delta_minutes = _minutes_delta(exp_a.expense_time, exp_b.expense_time)
49
  if delta_minutes > self.time_tolerance_minutes:
50
  break
 
45
  for i, exp_a in enumerate(expenses):
46
  for j in range(i + 1, len(expenses)):
47
  exp_b = expenses[j]
48
+ if exp_a.user_id and exp_b.user_id and exp_a.user_id != exp_b.user_id:
49
+ continue
50
  delta_minutes = _minutes_delta(exp_a.expense_time, exp_b.expense_time)
51
  if delta_minutes > self.time_tolerance_minutes:
52
  break
src/models.py CHANGED
@@ -3,10 +3,32 @@
3
  from __future__ import annotations
4
 
5
  from dataclasses import dataclass, field
6
- from datetime import datetime
7
  from decimal import Decimal
8
  from typing import List, Mapping, Sequence
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  @dataclass(frozen=True)
12
  class Expense:
@@ -15,6 +37,7 @@ class Expense:
15
  currency: str
16
  merchant: str
17
  expense_time: datetime
 
18
  source: str | None = None
19
  metadata: Mapping[str, object] | None = None
20
 
@@ -24,12 +47,15 @@ class Expense:
24
  amount_value = Decimal(str(doc["amount"]))
25
  except KeyError as exc:
26
  raise ValueError("Expense document missing 'amount'") from exc
 
 
27
  return Expense(
28
  expense_id=str(doc.get("_id")),
29
  amount=amount_value,
30
  currency=str(doc.get("currency", "INR")),
31
- merchant=str(doc.get("merchant", "")).strip(),
32
- expense_time=doc["expense_time"],
 
33
  source=doc.get("source"),
34
  metadata=doc.get("metadata") or {},
35
  )
 
3
  from __future__ import annotations
4
 
5
  from dataclasses import dataclass, field
6
+ from datetime import datetime, timezone
7
  from decimal import Decimal
8
  from typing import List, Mapping, Sequence
9
 
10
+ from .config import settings
11
+
12
+
13
+ def _extract_expense_time(doc: Mapping[str, object]) -> datetime:
14
+ for field in settings.time_fields:
15
+ value = doc.get(field)
16
+ if isinstance(value, datetime):
17
+ if value.tzinfo is None:
18
+ value = value.replace(tzinfo=timezone.utc)
19
+ return value
20
+ raise ValueError("Expense document missing a valid timestamp field")
21
+
22
+
23
+ def _extract_merchant(doc: Mapping[str, object]) -> str:
24
+ for field in settings.merchant_fields:
25
+ value = doc.get(field)
26
+ if value:
27
+ text = str(value).strip()
28
+ if text:
29
+ return text
30
+ return ""
31
+
32
 
33
  @dataclass(frozen=True)
34
  class Expense:
 
37
  currency: str
38
  merchant: str
39
  expense_time: datetime
40
+ user_id: str | None = None
41
  source: str | None = None
42
  metadata: Mapping[str, object] | None = None
43
 
 
47
  amount_value = Decimal(str(doc["amount"]))
48
  except KeyError as exc:
49
  raise ValueError("Expense document missing 'amount'") from exc
50
+ expense_time = _extract_expense_time(doc)
51
+ merchant_value = _extract_merchant(doc)
52
  return Expense(
53
  expense_id=str(doc.get("_id")),
54
  amount=amount_value,
55
  currency=str(doc.get("currency", "INR")),
56
+ merchant=merchant_value,
57
+ expense_time=expense_time,
58
+ user_id=str(doc.get("user")) if doc.get("user") else None,
59
  source=doc.get("source"),
60
  metadata=doc.get("metadata") or {},
61
  )
src/repositories.py CHANGED
@@ -28,16 +28,21 @@ class ExpenseRepository:
28
  )
29
 
30
  def fetch_recent(self, lookback_hours: int, limit: int) -> List[Expense]:
31
- since = datetime.now(tz=timezone.utc) - timedelta(hours=lookback_hours)
32
- cursor = (
33
- self._collection.find(
34
- {"expense_time": {"$gte": since}},
35
- sort=[("expense_time", 1)],
36
- limit=limit,
37
- )
38
- or []
39
- )
40
- return [Expense.from_document(doc) for doc in cursor]
 
 
 
 
 
41
 
42
 
43
  class MerchantAliasRepository:
 
28
  )
29
 
30
  def fetch_recent(self, lookback_hours: int, limit: int) -> List[Expense]:
31
+ since: datetime | None = None
32
+ if lookback_hours > 0:
33
+ since = datetime.now(tz=timezone.utc) - timedelta(hours=lookback_hours)
34
+ if since:
35
+ time_filters = [
36
+ {field: {"$gte": since}}
37
+ for field in config.settings.time_fields
38
+ ]
39
+ query = {"$or": time_filters}
40
+ else:
41
+ query = {}
42
+ cursor = self._collection.find(query, limit=limit) or []
43
+ expenses = [Expense.from_document(doc) for doc in cursor]
44
+ expenses.sort(key=lambda expense: expense.expense_time)
45
+ return expenses
46
 
47
 
48
  class MerchantAliasRepository:
test.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Simple CLI to hit the duplicate-detector API."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ import os
8
+ import sys
9
+ from typing import Any, Dict
10
+
11
+ import requests
12
+
13
+ DEFAULT_BASE_URL = os.getenv("API_BASE_URL", "http://127.0.0.1:8000")
14
+
15
+
16
+ def _print_response(label: str, response: requests.Response) -> None:
17
+ print(f"\n== {label} ==")
18
+ print(f"Status: {response.status_code}")
19
+ if response.headers.get("content-type", "").startswith("application/json"):
20
+ try:
21
+ payload = response.json()
22
+ print(json.dumps(payload, indent=2, default=str))
23
+ except ValueError:
24
+ print(response.text)
25
+ else:
26
+ print(response.text)
27
+
28
+
29
+ def main(argv: list[str] | None = None) -> int:
30
+ parser = argparse.ArgumentParser(description="Exercise the duplicate-detector API.")
31
+ parser.add_argument(
32
+ "--base-url",
33
+ default=DEFAULT_BASE_URL,
34
+ help=f"API base URL (default: {DEFAULT_BASE_URL}).",
35
+ )
36
+ parser.add_argument("--lookback-hours", type=int, default=720)
37
+ parser.add_argument("--limit", type=int, default=5000)
38
+ parser.add_argument("--amount-pct", type=float, default=1.0)
39
+ parser.add_argument("--minutes", type=int, default=30)
40
+ args = parser.parse_args(argv)
41
+
42
+ base_url = args.base_url.rstrip("/")
43
+ payload: Dict[str, Any] = {
44
+ "lookback_hours": args.lookback_hours,
45
+ "limit": args.limit,
46
+ "amount_pct": args.amount_pct,
47
+ "minutes": args.minutes,
48
+ }
49
+
50
+ try:
51
+ health = requests.get(f"{base_url}/health", timeout=10)
52
+ _print_response("Health", health)
53
+
54
+ detect = requests.post(
55
+ f"{base_url}/duplicates/detect",
56
+ json=payload,
57
+ timeout=60,
58
+ )
59
+ _print_response("Detect", detect)
60
+
61
+ suggestions = requests.get(f"{base_url}/suggestions", timeout=10)
62
+ _print_response("Suggestions", suggestions)
63
+ return 0
64
+ except requests.RequestException as exc:
65
+ print(f"Request failed: {exc}", file=sys.stderr)
66
+ return 1
67
+
68
+
69
+ if __name__ == "__main__":
70
+ raise SystemExit(main())
71
+