Commit
·
76af018
1
Parent(s):
b8d9bc1
Improve detector schema + add API tester
Browse files- README.md +18 -4
- requirements.txt +1 -0
- src/api.py +2 -2
- src/config.py +18 -1
- src/duplicate_detector.py +2 -0
- src/models.py +29 -3
- src/repositories.py +15 -10
- test.py +71 -0
README.md
CHANGED
|
@@ -33,10 +33,10 @@ Quick Start
|
|
| 33 |
|
| 34 |
2. Copy `.env.example` to `.env` and set the Mongo connection string if you do not want to rely on the baked-in default.
|
| 35 |
|
| 36 |
-
3. Run the detector (the default config scans the last 48 h of data and writes suggestions only):
|
| 37 |
|
| 38 |
```
|
| 39 |
-
python3 -m src.main --minutes
|
| 40 |
```
|
| 41 |
|
| 42 |
You will see log lines such as:
|
|
@@ -63,7 +63,7 @@ Endpoints:
|
|
| 63 |
Collections
|
| 64 |
-----------
|
| 65 |
|
| 66 |
-
* `
|
| 67 |
* `merchant_aliases`: optional alias definitions (`name`, `aliases`).
|
| 68 |
* `merge_suggestions`: the service writes documents shaped as:
|
| 69 |
|
|
@@ -94,12 +94,26 @@ All tunables live in `src/config.py`. Environment variables take precedence, so
|
|
| 94 |
| --- | --- | --- |
|
| 95 |
| `MONGO_URI` | Mongo connection string | Provided URI |
|
| 96 |
| `MONGO_DB` | Database name | `expense` |
|
| 97 |
-
| `MONGO_EXPENSE_COLLECTION` | Expenses collection | `
|
| 98 |
| `MONGO_ALIAS_COLLECTION` | Merchant alias collection | `merchant_aliases` |
|
| 99 |
| `MONGO_SUGGESTION_COLLECTION` | Merge-suggestion collection | `merge_suggestions` |
|
| 100 |
| `AMOUNT_TOLERANCE_PCT` | Amount delta percentage | `1.0` |
|
| 101 |
| `TIME_TOLERANCE_MINUTES` | Time delta minutes | `10` |
|
| 102 |
| `DEFAULT_LOOKBACK_HOURS` | How far back to scan | `48` |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
Next Steps
|
| 105 |
----------
|
|
|
|
| 33 |
|
| 34 |
2. Copy `.env.example` to `.env` and set the Mongo connection string if you do not want to rely on the baked-in default.
|
| 35 |
|
| 36 |
+
3. Run the detector (the default config scans the last 48 h of data and writes suggestions only). For the historical `transactions` collection you may want to bump the lookback window:
|
| 37 |
|
| 38 |
```
|
| 39 |
+
python3 -m src.main --minutes 30 --lookback-hours 720
|
| 40 |
```
|
| 41 |
|
| 42 |
You will see log lines such as:
|
|
|
|
| 63 |
Collections
|
| 64 |
-----------
|
| 65 |
|
| 66 |
+
* `transactions` (default): source data. The detector automatically maps the `date`/`createdAt` timestamp and `note`/`paymentType` merchant fields so you still get near-duplicate detection without reshaping your documents. Entries are only compared if they belong to the same `user`.
|
| 67 |
* `merchant_aliases`: optional alias definitions (`name`, `aliases`).
|
| 68 |
* `merge_suggestions`: the service writes documents shaped as:
|
| 69 |
|
|
|
|
| 94 |
| --- | --- | --- |
|
| 95 |
| `MONGO_URI` | Mongo connection string | Provided URI |
|
| 96 |
| `MONGO_DB` | Database name | `expense` |
|
| 97 |
+
| `MONGO_EXPENSE_COLLECTION` | Expenses collection | `transactions` |
|
| 98 |
| `MONGO_ALIAS_COLLECTION` | Merchant alias collection | `merchant_aliases` |
|
| 99 |
| `MONGO_SUGGESTION_COLLECTION` | Merge-suggestion collection | `merge_suggestions` |
|
| 100 |
| `AMOUNT_TOLERANCE_PCT` | Amount delta percentage | `1.0` |
|
| 101 |
| `TIME_TOLERANCE_MINUTES` | Time delta minutes | `10` |
|
| 102 |
| `DEFAULT_LOOKBACK_HOURS` | How far back to scan | `48` |
|
| 103 |
+
| `TIME_FIELDS` | CSV priority order for timestamps | `date,expense_time,createdAt` |
|
| 104 |
+
| `MERCHANT_FIELDS` | CSV priority order for merchant labels | `merchant,note,paymentType,type,to` |
|
| 105 |
+
| `USER_FIELD` | Source field that stores the user id (inferred automatically) | `user` |
|
| 106 |
+
|
| 107 |
+
Smoke Test
|
| 108 |
+
----------
|
| 109 |
+
|
| 110 |
+
Use the bundled `test.py` script to hit the running API (locally or on the Hugging Face Space) via the base URL:
|
| 111 |
+
|
| 112 |
+
```
|
| 113 |
+
python3 test.py --base-url https://LogicGoInfotechSpaces-duplicate-transaction-detection.hf.space --lookback-hours 720 --limit 5000
|
| 114 |
+
```
|
| 115 |
+
|
| 116 |
+
The script calls `/health`, `/duplicates/detect`, and `/suggestions` in sequence and prints the responses so you can quickly verify the deployment.
|
| 117 |
|
| 118 |
Next Steps
|
| 119 |
----------
|
requirements.txt
CHANGED
|
@@ -2,4 +2,5 @@ pymongo>=4.8.0
|
|
| 2 |
python-dotenv>=1.0.1
|
| 3 |
fastapi>=0.115.0
|
| 4 |
uvicorn>=0.30.6
|
|
|
|
| 5 |
|
|
|
|
| 2 |
python-dotenv>=1.0.1
|
| 3 |
fastapi>=0.115.0
|
| 4 |
uvicorn>=0.30.6
|
| 5 |
+
requests>=2.32.3
|
| 6 |
|
src/api.py
CHANGED
|
@@ -33,8 +33,8 @@ suggestion_repository = MergeSuggestionRepository.from_client(mongo_client)
|
|
| 33 |
class DetectRequest(BaseModel):
|
| 34 |
lookback_hours: int | None = Field(
|
| 35 |
default=None,
|
| 36 |
-
ge=
|
| 37 |
-
description="Hours to look back when scanning expenses.",
|
| 38 |
)
|
| 39 |
limit: int | None = Field(
|
| 40 |
default=None,
|
|
|
|
| 33 |
class DetectRequest(BaseModel):
|
| 34 |
lookback_hours: int | None = Field(
|
| 35 |
default=None,
|
| 36 |
+
ge=0,
|
| 37 |
+
description="Hours to look back when scanning expenses (0 = full history).",
|
| 38 |
)
|
| 39 |
limit: int | None = Field(
|
| 40 |
default=None,
|
src/config.py
CHANGED
|
@@ -5,6 +5,7 @@ from __future__ import annotations
|
|
| 5 |
import os
|
| 6 |
from dataclasses import dataclass
|
| 7 |
from decimal import Decimal
|
|
|
|
| 8 |
|
| 9 |
from dotenv import load_dotenv
|
| 10 |
|
|
@@ -31,11 +32,19 @@ def _get_int(env_key: str, default: str) -> int:
|
|
| 31 |
raise ValueError(f"Invalid int for {env_key}: {raw_value}") from exc
|
| 32 |
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
@dataclass(frozen=True)
|
| 35 |
class Settings:
|
| 36 |
mongo_uri: str = os.getenv("MONGO_URI", DEFAULT_MONGO_URI)
|
| 37 |
mongo_db: str = os.getenv("MONGO_DB", "expense")
|
| 38 |
-
expense_collection: str = os.getenv("MONGO_EXPENSE_COLLECTION", "
|
| 39 |
alias_collection: str = os.getenv("MONGO_ALIAS_COLLECTION", "merchant_aliases")
|
| 40 |
suggestion_collection: str = os.getenv("MONGO_SUGGESTION_COLLECTION", "merge_suggestions")
|
| 41 |
amount_tolerance_pct: Decimal = _get_decimal("AMOUNT_TOLERANCE_PCT", "1.0")
|
|
@@ -43,6 +52,14 @@ class Settings:
|
|
| 43 |
default_lookback_hours: int = _get_int("DEFAULT_LOOKBACK_HOURS", "48")
|
| 44 |
service_name: str = os.getenv("SERVICE_NAME", "duplicate-detector")
|
| 45 |
max_batch_size: int = _get_int("MAX_BATCH_SIZE", "5000")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
|
| 48 |
settings = Settings()
|
|
|
|
| 5 |
import os
|
| 6 |
from dataclasses import dataclass
|
| 7 |
from decimal import Decimal
|
| 8 |
+
from typing import Tuple
|
| 9 |
|
| 10 |
from dotenv import load_dotenv
|
| 11 |
|
|
|
|
| 32 |
raise ValueError(f"Invalid int for {env_key}: {raw_value}") from exc
|
| 33 |
|
| 34 |
|
| 35 |
+
def _get_tuple(env_key: str, default: str) -> Tuple[str, ...]:
|
| 36 |
+
raw_value = os.getenv(env_key, default)
|
| 37 |
+
values = [value.strip() for value in raw_value.split(",") if value.strip()]
|
| 38 |
+
if not values:
|
| 39 |
+
raise ValueError(f"{env_key} must contain at least one value")
|
| 40 |
+
return tuple(values)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
@dataclass(frozen=True)
|
| 44 |
class Settings:
|
| 45 |
mongo_uri: str = os.getenv("MONGO_URI", DEFAULT_MONGO_URI)
|
| 46 |
mongo_db: str = os.getenv("MONGO_DB", "expense")
|
| 47 |
+
expense_collection: str = os.getenv("MONGO_EXPENSE_COLLECTION", "transactions")
|
| 48 |
alias_collection: str = os.getenv("MONGO_ALIAS_COLLECTION", "merchant_aliases")
|
| 49 |
suggestion_collection: str = os.getenv("MONGO_SUGGESTION_COLLECTION", "merge_suggestions")
|
| 50 |
amount_tolerance_pct: Decimal = _get_decimal("AMOUNT_TOLERANCE_PCT", "1.0")
|
|
|
|
| 52 |
default_lookback_hours: int = _get_int("DEFAULT_LOOKBACK_HOURS", "48")
|
| 53 |
service_name: str = os.getenv("SERVICE_NAME", "duplicate-detector")
|
| 54 |
max_batch_size: int = _get_int("MAX_BATCH_SIZE", "5000")
|
| 55 |
+
time_fields: Tuple[str, ...] = _get_tuple(
|
| 56 |
+
"TIME_FIELDS",
|
| 57 |
+
"date,expense_time,createdAt",
|
| 58 |
+
)
|
| 59 |
+
merchant_fields: Tuple[str, ...] = _get_tuple(
|
| 60 |
+
"MERCHANT_FIELDS",
|
| 61 |
+
"merchant,note,paymentType,type,to",
|
| 62 |
+
)
|
| 63 |
|
| 64 |
|
| 65 |
settings = Settings()
|
src/duplicate_detector.py
CHANGED
|
@@ -45,6 +45,8 @@ class DuplicateDetector:
|
|
| 45 |
for i, exp_a in enumerate(expenses):
|
| 46 |
for j in range(i + 1, len(expenses)):
|
| 47 |
exp_b = expenses[j]
|
|
|
|
|
|
|
| 48 |
delta_minutes = _minutes_delta(exp_a.expense_time, exp_b.expense_time)
|
| 49 |
if delta_minutes > self.time_tolerance_minutes:
|
| 50 |
break
|
|
|
|
| 45 |
for i, exp_a in enumerate(expenses):
|
| 46 |
for j in range(i + 1, len(expenses)):
|
| 47 |
exp_b = expenses[j]
|
| 48 |
+
if exp_a.user_id and exp_b.user_id and exp_a.user_id != exp_b.user_id:
|
| 49 |
+
continue
|
| 50 |
delta_minutes = _minutes_delta(exp_a.expense_time, exp_b.expense_time)
|
| 51 |
if delta_minutes > self.time_tolerance_minutes:
|
| 52 |
break
|
src/models.py
CHANGED
|
@@ -3,10 +3,32 @@
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
from dataclasses import dataclass, field
|
| 6 |
-
from datetime import datetime
|
| 7 |
from decimal import Decimal
|
| 8 |
from typing import List, Mapping, Sequence
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
@dataclass(frozen=True)
|
| 12 |
class Expense:
|
|
@@ -15,6 +37,7 @@ class Expense:
|
|
| 15 |
currency: str
|
| 16 |
merchant: str
|
| 17 |
expense_time: datetime
|
|
|
|
| 18 |
source: str | None = None
|
| 19 |
metadata: Mapping[str, object] | None = None
|
| 20 |
|
|
@@ -24,12 +47,15 @@ class Expense:
|
|
| 24 |
amount_value = Decimal(str(doc["amount"]))
|
| 25 |
except KeyError as exc:
|
| 26 |
raise ValueError("Expense document missing 'amount'") from exc
|
|
|
|
|
|
|
| 27 |
return Expense(
|
| 28 |
expense_id=str(doc.get("_id")),
|
| 29 |
amount=amount_value,
|
| 30 |
currency=str(doc.get("currency", "INR")),
|
| 31 |
-
merchant=
|
| 32 |
-
expense_time=
|
|
|
|
| 33 |
source=doc.get("source"),
|
| 34 |
metadata=doc.get("metadata") or {},
|
| 35 |
)
|
|
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
from dataclasses import dataclass, field
|
| 6 |
+
from datetime import datetime, timezone
|
| 7 |
from decimal import Decimal
|
| 8 |
from typing import List, Mapping, Sequence
|
| 9 |
|
| 10 |
+
from .config import settings
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _extract_expense_time(doc: Mapping[str, object]) -> datetime:
|
| 14 |
+
for field in settings.time_fields:
|
| 15 |
+
value = doc.get(field)
|
| 16 |
+
if isinstance(value, datetime):
|
| 17 |
+
if value.tzinfo is None:
|
| 18 |
+
value = value.replace(tzinfo=timezone.utc)
|
| 19 |
+
return value
|
| 20 |
+
raise ValueError("Expense document missing a valid timestamp field")
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _extract_merchant(doc: Mapping[str, object]) -> str:
|
| 24 |
+
for field in settings.merchant_fields:
|
| 25 |
+
value = doc.get(field)
|
| 26 |
+
if value:
|
| 27 |
+
text = str(value).strip()
|
| 28 |
+
if text:
|
| 29 |
+
return text
|
| 30 |
+
return ""
|
| 31 |
+
|
| 32 |
|
| 33 |
@dataclass(frozen=True)
|
| 34 |
class Expense:
|
|
|
|
| 37 |
currency: str
|
| 38 |
merchant: str
|
| 39 |
expense_time: datetime
|
| 40 |
+
user_id: str | None = None
|
| 41 |
source: str | None = None
|
| 42 |
metadata: Mapping[str, object] | None = None
|
| 43 |
|
|
|
|
| 47 |
amount_value = Decimal(str(doc["amount"]))
|
| 48 |
except KeyError as exc:
|
| 49 |
raise ValueError("Expense document missing 'amount'") from exc
|
| 50 |
+
expense_time = _extract_expense_time(doc)
|
| 51 |
+
merchant_value = _extract_merchant(doc)
|
| 52 |
return Expense(
|
| 53 |
expense_id=str(doc.get("_id")),
|
| 54 |
amount=amount_value,
|
| 55 |
currency=str(doc.get("currency", "INR")),
|
| 56 |
+
merchant=merchant_value,
|
| 57 |
+
expense_time=expense_time,
|
| 58 |
+
user_id=str(doc.get("user")) if doc.get("user") else None,
|
| 59 |
source=doc.get("source"),
|
| 60 |
metadata=doc.get("metadata") or {},
|
| 61 |
)
|
src/repositories.py
CHANGED
|
@@ -28,16 +28,21 @@ class ExpenseRepository:
|
|
| 28 |
)
|
| 29 |
|
| 30 |
def fetch_recent(self, lookback_hours: int, limit: int) -> List[Expense]:
|
| 31 |
-
since
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
|
| 43 |
class MerchantAliasRepository:
|
|
|
|
| 28 |
)
|
| 29 |
|
| 30 |
def fetch_recent(self, lookback_hours: int, limit: int) -> List[Expense]:
|
| 31 |
+
since: datetime | None = None
|
| 32 |
+
if lookback_hours > 0:
|
| 33 |
+
since = datetime.now(tz=timezone.utc) - timedelta(hours=lookback_hours)
|
| 34 |
+
if since:
|
| 35 |
+
time_filters = [
|
| 36 |
+
{field: {"$gte": since}}
|
| 37 |
+
for field in config.settings.time_fields
|
| 38 |
+
]
|
| 39 |
+
query = {"$or": time_filters}
|
| 40 |
+
else:
|
| 41 |
+
query = {}
|
| 42 |
+
cursor = self._collection.find(query, limit=limit) or []
|
| 43 |
+
expenses = [Expense.from_document(doc) for doc in cursor]
|
| 44 |
+
expenses.sort(key=lambda expense: expense.expense_time)
|
| 45 |
+
return expenses
|
| 46 |
|
| 47 |
|
| 48 |
class MerchantAliasRepository:
|
test.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Simple CLI to hit the duplicate-detector API."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import argparse
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
from typing import Any, Dict
|
| 10 |
+
|
| 11 |
+
import requests
|
| 12 |
+
|
| 13 |
+
DEFAULT_BASE_URL = os.getenv("API_BASE_URL", "http://127.0.0.1:8000")
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def _print_response(label: str, response: requests.Response) -> None:
|
| 17 |
+
print(f"\n== {label} ==")
|
| 18 |
+
print(f"Status: {response.status_code}")
|
| 19 |
+
if response.headers.get("content-type", "").startswith("application/json"):
|
| 20 |
+
try:
|
| 21 |
+
payload = response.json()
|
| 22 |
+
print(json.dumps(payload, indent=2, default=str))
|
| 23 |
+
except ValueError:
|
| 24 |
+
print(response.text)
|
| 25 |
+
else:
|
| 26 |
+
print(response.text)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def main(argv: list[str] | None = None) -> int:
|
| 30 |
+
parser = argparse.ArgumentParser(description="Exercise the duplicate-detector API.")
|
| 31 |
+
parser.add_argument(
|
| 32 |
+
"--base-url",
|
| 33 |
+
default=DEFAULT_BASE_URL,
|
| 34 |
+
help=f"API base URL (default: {DEFAULT_BASE_URL}).",
|
| 35 |
+
)
|
| 36 |
+
parser.add_argument("--lookback-hours", type=int, default=720)
|
| 37 |
+
parser.add_argument("--limit", type=int, default=5000)
|
| 38 |
+
parser.add_argument("--amount-pct", type=float, default=1.0)
|
| 39 |
+
parser.add_argument("--minutes", type=int, default=30)
|
| 40 |
+
args = parser.parse_args(argv)
|
| 41 |
+
|
| 42 |
+
base_url = args.base_url.rstrip("/")
|
| 43 |
+
payload: Dict[str, Any] = {
|
| 44 |
+
"lookback_hours": args.lookback_hours,
|
| 45 |
+
"limit": args.limit,
|
| 46 |
+
"amount_pct": args.amount_pct,
|
| 47 |
+
"minutes": args.minutes,
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
try:
|
| 51 |
+
health = requests.get(f"{base_url}/health", timeout=10)
|
| 52 |
+
_print_response("Health", health)
|
| 53 |
+
|
| 54 |
+
detect = requests.post(
|
| 55 |
+
f"{base_url}/duplicates/detect",
|
| 56 |
+
json=payload,
|
| 57 |
+
timeout=60,
|
| 58 |
+
)
|
| 59 |
+
_print_response("Detect", detect)
|
| 60 |
+
|
| 61 |
+
suggestions = requests.get(f"{base_url}/suggestions", timeout=10)
|
| 62 |
+
_print_response("Suggestions", suggestions)
|
| 63 |
+
return 0
|
| 64 |
+
except requests.RequestException as exc:
|
| 65 |
+
print(f"Request failed: {exc}", file=sys.stderr)
|
| 66 |
+
return 1
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
if __name__ == "__main__":
|
| 70 |
+
raise SystemExit(main())
|
| 71 |
+
|