Spaces:
Sleeping
Sleeping
Sajil Awale committed on
Commit ·
5ec41fc
1
Parent(s): afa5a12
updated the test notebook
Browse files
- notebooks/1_test_pdf_reader.ipynb +628 -80
- requirements.txt +3 -1
notebooks/1_test_pdf_reader.ipynb
CHANGED
|
@@ -15,6 +15,14 @@
|
|
| 15 |
"execution_count": 2,
|
| 16 |
"metadata": {},
|
| 17 |
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
{
|
| 19 |
"data": {
|
| 20 |
"text/plain": [
|
|
@@ -32,6 +40,22 @@
|
|
| 32 |
"from langchain.chat_models import init_chat_model\n",
|
| 33 |
"from langchain_core.prompts import ChatPromptTemplate\n",
|
| 34 |
"from langchain_core.output_parsers import JsonOutputParser\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
"\n",
|
| 36 |
"from dotenv import load_dotenv\n",
|
| 37 |
"load_dotenv()"
|
|
@@ -39,16 +63,57 @@
|
|
| 39 |
},
|
| 40 |
{
|
| 41 |
"cell_type": "code",
|
| 42 |
-
"execution_count":
|
| 43 |
"metadata": {},
|
| 44 |
"outputs": [],
|
| 45 |
"source": [
|
| 46 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
" df = pd.read_csv(file_path)\n",
|
| 48 |
" headers = df.columns.tolist()\n",
|
| 49 |
-
" # Taking 10 rows gives the LLM a better chance to see both a purchase and a payment\n",
|
| 50 |
" sample_data = df.head(10).to_json() \n",
|
| 51 |
"\n",
|
|
|
|
| 52 |
" prompt = ChatPromptTemplate.from_template(\"\"\"\n",
|
| 53 |
" Act as a financial data parser. Analyze this CSV data:\n",
|
| 54 |
" Filename: {filename}\n",
|
|
@@ -57,7 +122,7 @@
|
|
| 57 |
"\n",
|
| 58 |
" TASK:\n",
|
| 59 |
" 1. Map the CSV columns to standard fields: date, description, amount, and category.\n",
|
| 60 |
-
" 2. Determine the 'sign_convention'
|
| 61 |
" \n",
|
| 62 |
" RULES:\n",
|
| 63 |
" - If the filename suggests 'Discover' credit card, spending are usually POSITIVE.\n",
|
|
@@ -67,7 +132,6 @@
|
|
| 67 |
" - Look at the sample data for known merchants or spending patterns.\n",
|
| 68 |
" - If spending (like a restaurant or store) is NEGATIVE (e.g., -25.00), the convention is 'spending_is_negative'.\n",
|
| 69 |
" - If spending is POSITIVE (e.g., 25.00), the convention is 'spending_is_positive'.\n",
|
| 70 |
-
" \n",
|
| 71 |
"\n",
|
| 72 |
" OUTPUT FORMAT (JSON ONLY):\n",
|
| 73 |
" {{\n",
|
|
@@ -82,53 +146,216 @@
|
|
| 82 |
" chain = prompt | llm | JsonOutputParser()\n",
|
| 83 |
" mapping = chain.invoke({\"headers\": headers, \"sample\": sample_data, \"filename\": file_path})\n",
|
| 84 |
"\n",
|
| 85 |
-
" #
|
| 86 |
" standard_df = pd.DataFrame()\n",
|
|
|
|
| 87 |
" standard_df['transaction_date'] = pd.to_datetime(df[mapping['date_col']])\n",
|
| 88 |
" standard_df['description'] = df[mapping['desc_col']]\n",
|
| 89 |
" \n",
|
| 90 |
-
" #
|
| 91 |
-
" # Goal: All spending (outflow) = POSITIVE, All payments (inflow) = NEGATIVE\n",
|
| 92 |
" raw_amounts = pd.to_numeric(df[mapping['amount_col']])\n",
|
| 93 |
-
" \n",
|
| 94 |
" if mapping['sign_convention'] == \"spending_is_negative\":\n",
|
| 95 |
-
" # If the bank shows spending as -100 and payments as +100, \n",
|
| 96 |
-
" # we flip everything so spending is +100 and payments are -100.\n",
|
| 97 |
" standard_df['amount'] = raw_amounts * -1\n",
|
| 98 |
" else:\n",
|
| 99 |
-
" # If the bank already shows spending as +100 and payments as -100, keep it.\n",
|
| 100 |
" standard_df['amount'] = raw_amounts\n",
|
| 101 |
" \n",
|
| 102 |
" standard_df['category'] = df[mapping['category_col']] if mapping.get('category_col') else 'Uncategorized'\n",
|
| 103 |
" standard_df['source_file'] = file_path.split(\"/\")[-1]\n",
|
| 104 |
"\n",
|
| 105 |
-
" #
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
" conn = sqlite3.connect(db_path)\n",
|
| 107 |
" standard_df.to_sql(\"transactions\", conn, if_exists=\"append\", index=False)\n",
|
| 108 |
" conn.close()\n",
|
| 109 |
" \n",
|
| 110 |
-
" print(f\"✅ Ingested {file_path}. Logic: {mapping['sign_convention']}\")"
|
| 111 |
]
|
| 112 |
},
|
| 113 |
{
|
| 114 |
"cell_type": "code",
|
| 115 |
-
"execution_count":
|
| 116 |
"metadata": {},
|
| 117 |
"outputs": [
|
| 118 |
-
{
|
| 119 |
-
"name": "stderr",
|
| 120 |
-
"output_type": "stream",
|
| 121 |
-
"text": [
|
| 122 |
-
"/Users/sawale/Documents/learning/money_rag/.venv/lib/python3.12/site-packages/google/cloud/aiplatform/models.py:52: FutureWarning: Support for google-cloud-storage < 3.0.0 will be removed in a future version of google-cloud-aiplatform. Please upgrade to google-cloud-storage >= 3.0.0.\n",
|
| 123 |
-
" from google.cloud.aiplatform.utils import gcs_utils\n"
|
| 124 |
-
]
|
| 125 |
-
},
|
| 126 |
{
|
| 127 |
"name": "stdout",
|
| 128 |
"output_type": "stream",
|
| 129 |
"text": [
|
| 130 |
-
"
|
| 131 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
]
|
| 133 |
}
|
| 134 |
],
|
|
@@ -137,21 +364,23 @@
|
|
| 137 |
"path2 = \"/Users/sawale/Documents/learning/money_rag/demo_data/Chase5282_Activity20240110_20260110_20260111.CSV\"\n",
|
| 138 |
"\n",
|
| 139 |
"# Initialize the Gemini model via Vertex AI\n",
|
| 140 |
-
"
|
| 141 |
" \"gemini-2.5-flash\", \n",
|
| 142 |
" model_provider=\"google_vertexai\",\n",
|
| 143 |
" project='gen-lang-client-0311515393',\n",
|
| 144 |
" location='us-central1',\n",
|
| 145 |
")\n",
|
| 146 |
"\n",
|
| 147 |
-
"\n",
|
| 148 |
-
"
|
| 149 |
-
"
|
|
|
|
|
|
|
| 150 |
]
|
| 151 |
},
|
| 152 |
{
|
| 153 |
"cell_type": "code",
|
| 154 |
-
"execution_count":
|
| 155 |
"metadata": {},
|
| 156 |
"outputs": [
|
| 157 |
{
|
|
@@ -175,53 +404,65 @@
|
|
| 175 |
" <thead>\n",
|
| 176 |
" <tr style=\"text-align: right;\">\n",
|
| 177 |
" <th></th>\n",
|
|
|
|
| 178 |
" <th>transaction_date</th>\n",
|
| 179 |
" <th>description</th>\n",
|
| 180 |
" <th>amount</th>\n",
|
| 181 |
" <th>category</th>\n",
|
| 182 |
" <th>source_file</th>\n",
|
|
|
|
| 183 |
" </tr>\n",
|
| 184 |
" </thead>\n",
|
| 185 |
" <tbody>\n",
|
| 186 |
" <tr>\n",
|
| 187 |
" <th>0</th>\n",
|
|
|
|
| 188 |
" <td>2024-10-17 00:00:00</td>\n",
|
| 189 |
" <td>BACK MARKET BROOKLYN NY</td>\n",
|
| 190 |
" <td>231.19</td>\n",
|
| 191 |
" <td>Merchandise</td>\n",
|
| 192 |
" <td>Discover-AllAvailable-20260110.csv</td>\n",
|
|
|
|
| 193 |
" </tr>\n",
|
| 194 |
" <tr>\n",
|
| 195 |
" <th>1</th>\n",
|
|
|
|
| 196 |
" <td>2024-10-18 00:00:00</td>\n",
|
| 197 |
" <td>TEMU.COM 8884958368 DE</td>\n",
|
| 198 |
" <td>16.51</td>\n",
|
| 199 |
" <td>Merchandise</td>\n",
|
| 200 |
" <td>Discover-AllAvailable-20260110.csv</td>\n",
|
|
|
|
| 201 |
" </tr>\n",
|
| 202 |
" <tr>\n",
|
| 203 |
" <th>2</th>\n",
|
|
|
|
| 204 |
" <td>2024-10-18 00:00:00</td>\n",
|
| 205 |
" <td>WALMART STORE 00332 HUNTSVILLE AL</td>\n",
|
| 206 |
" <td>146.73</td>\n",
|
| 207 |
" <td>Merchandise</td>\n",
|
| 208 |
" <td>Discover-AllAvailable-20260110.csv</td>\n",
|
|
|
|
| 209 |
" </tr>\n",
|
| 210 |
" <tr>\n",
|
| 211 |
" <th>3</th>\n",
|
|
|
|
| 212 |
" <td>2024-10-18 00:00:00</td>\n",
|
| 213 |
" <td>$100 STATEMENT CREDIT W 1ST PU</td>\n",
|
| 214 |
" <td>-100.00</td>\n",
|
| 215 |
" <td>Awards and Rebate Credits</td>\n",
|
| 216 |
" <td>Discover-AllAvailable-20260110.csv</td>\n",
|
|
|
|
| 217 |
" </tr>\n",
|
| 218 |
" <tr>\n",
|
| 219 |
" <th>4</th>\n",
|
|
|
|
| 220 |
" <td>2024-11-02 00:00:00</td>\n",
|
| 221 |
" <td>PY *KUNG-FU TEA AL HUNTSVILLE AL</td>\n",
|
| 222 |
" <td>8.09</td>\n",
|
| 223 |
" <td>Restaurants</td>\n",
|
| 224 |
" <td>Discover-AllAvailable-20260110.csv</td>\n",
|
|
|
|
| 225 |
" </tr>\n",
|
| 226 |
" <tr>\n",
|
| 227 |
" <th>...</th>\n",
|
|
@@ -230,102 +471,127 @@
|
|
| 230 |
" <td>...</td>\n",
|
| 231 |
" <td>...</td>\n",
|
| 232 |
" <td>...</td>\n",
|
|
|
|
|
|
|
| 233 |
" </tr>\n",
|
| 234 |
" <tr>\n",
|
| 235 |
" <th>245</th>\n",
|
|
|
|
| 236 |
" <td>2025-06-18 00:00:00</td>\n",
|
| 237 |
" <td>PANDA EXPRESS #2005</td>\n",
|
| 238 |
" <td>52.87</td>\n",
|
| 239 |
" <td>Food & Drink</td>\n",
|
| 240 |
" <td>Chase5282_Activity20240110_20260110_20260111.CSV</td>\n",
|
|
|
|
| 241 |
" </tr>\n",
|
| 242 |
" <tr>\n",
|
| 243 |
" <th>246</th>\n",
|
|
|
|
| 244 |
" <td>2025-06-14 00:00:00</td>\n",
|
| 245 |
" <td>Payment Thank You-Mobile</td>\n",
|
| 246 |
" <td>-62.07</td>\n",
|
| 247 |
" <td>None</td>\n",
|
| 248 |
" <td>Chase5282_Activity20240110_20260110_20260111.CSV</td>\n",
|
|
|
|
| 249 |
" </tr>\n",
|
| 250 |
" <tr>\n",
|
| 251 |
" <th>247</th>\n",
|
|
|
|
| 252 |
" <td>2025-06-12 00:00:00</td>\n",
|
| 253 |
" <td>STARS AND STRIKES - HUNTS</td>\n",
|
| 254 |
" <td>21.80</td>\n",
|
| 255 |
" <td>Entertainment</td>\n",
|
| 256 |
" <td>Chase5282_Activity20240110_20260110_20260111.CSV</td>\n",
|
|
|
|
| 257 |
" </tr>\n",
|
| 258 |
" <tr>\n",
|
| 259 |
" <th>248</th>\n",
|
|
|
|
| 260 |
" <td>2025-06-11 00:00:00</td>\n",
|
| 261 |
" <td>WAL-MART #332</td>\n",
|
| 262 |
" <td>4.47</td>\n",
|
| 263 |
" <td>Groceries</td>\n",
|
| 264 |
" <td>Chase5282_Activity20240110_20260110_20260111.CSV</td>\n",
|
|
|
|
| 265 |
" </tr>\n",
|
| 266 |
" <tr>\n",
|
| 267 |
" <th>249</th>\n",
|
|
|
|
| 268 |
" <td>2025-06-11 00:00:00</td>\n",
|
| 269 |
" <td>WAL-MART #332</td>\n",
|
| 270 |
" <td>57.60</td>\n",
|
| 271 |
" <td>Groceries</td>\n",
|
| 272 |
" <td>Chase5282_Activity20240110_20260110_20260111.CSV</td>\n",
|
|
|
|
| 273 |
" </tr>\n",
|
| 274 |
" </tbody>\n",
|
| 275 |
"</table>\n",
|
| 276 |
-
"<p>250 rows ×
|
| 277 |
"</div>"
|
| 278 |
],
|
| 279 |
"text/plain": [
|
| 280 |
-
"
|
| 281 |
-
"0 2024-10-17 00:00:00
|
| 282 |
-
"1 2024-10-18 00:00:00
|
| 283 |
-
"2 2024-10-18 00:00:00
|
| 284 |
-
"3 2024-10-18 00:00:00
|
| 285 |
-
"4 2024-11-02 00:00:00
|
| 286 |
-
"..
|
| 287 |
-
"245 2025-06-18 00:00:00
|
| 288 |
-
"246 2025-06-14 00:00:00
|
| 289 |
-
"247 2025-06-12 00:00:00
|
| 290 |
-
"248 2025-06-11 00:00:00
|
| 291 |
-
"249 2025-06-11 00:00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
"\n",
|
| 293 |
-
"
|
| 294 |
-
"0
|
| 295 |
-
"1
|
| 296 |
-
"2
|
| 297 |
-
"3
|
| 298 |
-
"4
|
| 299 |
-
"..
|
| 300 |
-
"245
|
| 301 |
-
"246
|
| 302 |
-
"247
|
| 303 |
-
"248
|
| 304 |
-
"249
|
| 305 |
"\n",
|
| 306 |
-
"
|
| 307 |
-
"0
|
| 308 |
-
"1
|
| 309 |
-
"2
|
| 310 |
-
"3
|
| 311 |
-
"4
|
| 312 |
-
"..
|
| 313 |
-
"245
|
| 314 |
-
"246
|
| 315 |
-
"247
|
| 316 |
-
"248
|
| 317 |
-
"249
|
| 318 |
"\n",
|
| 319 |
-
"[250 rows x
|
| 320 |
]
|
| 321 |
},
|
| 322 |
-
"execution_count":
|
| 323 |
"metadata": {},
|
| 324 |
"output_type": "execute_result"
|
| 325 |
}
|
| 326 |
],
|
| 327 |
"source": [
|
| 328 |
-
"
|
| 329 |
"import sqlite3\n",
|
| 330 |
"import pandas as pd\n",
|
| 331 |
"\n",
|
|
@@ -344,24 +610,306 @@
|
|
| 344 |
},
|
| 345 |
{
|
| 346 |
"cell_type": "code",
|
| 347 |
-
"execution_count":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 348 |
"metadata": {},
|
| 349 |
"outputs": [
|
| 350 |
{
|
| 351 |
-
"
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 359 |
}
|
| 360 |
],
|
| 361 |
"source": [
|
| 362 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 363 |
]
|
| 364 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
{
|
| 366 |
"cell_type": "code",
|
| 367 |
"execution_count": null,
|
|
|
|
| 15 |
"execution_count": 2,
|
| 16 |
"metadata": {},
|
| 17 |
"outputs": [
|
| 18 |
+
{
|
| 19 |
+
"name": "stderr",
|
| 20 |
+
"output_type": "stream",
|
| 21 |
+
"text": [
|
| 22 |
+
"/Users/sawale/Documents/learning/money_rag/.venv/lib/python3.12/site-packages/google/cloud/aiplatform/models.py:52: FutureWarning: Support for google-cloud-storage < 3.0.0 will be removed in a future version of google-cloud-aiplatform. Please upgrade to google-cloud-storage >= 3.0.0.\n",
|
| 23 |
+
" from google.cloud.aiplatform.utils import gcs_utils\n"
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
{
|
| 27 |
"data": {
|
| 28 |
"text/plain": [
|
|
|
|
| 40 |
"from langchain.chat_models import init_chat_model\n",
|
| 41 |
"from langchain_core.prompts import ChatPromptTemplate\n",
|
| 42 |
"from langchain_core.output_parsers import JsonOutputParser\n",
|
| 43 |
+
"import uuid\n",
|
| 44 |
+
"\n",
|
| 45 |
+
"from langchain_qdrant import QdrantVectorStore\n",
|
| 46 |
+
"from qdrant_client import QdrantClient\n",
|
| 47 |
+
"from qdrant_client.http.models import Distance, VectorParams\n",
|
| 48 |
+
"from langchain_google_vertexai import VertexAIEmbeddings\n",
|
| 49 |
+
"\n",
|
| 50 |
+
"from typing import TypedDict\n",
|
| 51 |
+
"from dataclasses import dataclass\n",
|
| 52 |
+
"from langgraph.runtime import get_runtime\n",
|
| 53 |
+
"from langgraph.checkpoint.memory import InMemorySaver\n",
|
| 54 |
+
"from langchain_community.utilities import SQLDatabase\n",
|
| 55 |
+
"from langchain_core.tools import tool\n",
|
| 56 |
+
"from langchain.agents import create_agent\n",
|
| 57 |
+
"\n",
|
| 58 |
+
"\n",
|
| 59 |
"\n",
|
| 60 |
"from dotenv import load_dotenv\n",
|
| 61 |
"load_dotenv()"
|
|
|
|
| 63 |
},
|
| 64 |
{
|
| 65 |
"cell_type": "code",
|
| 66 |
+
"execution_count": 13,
|
| 67 |
"metadata": {},
|
| 68 |
"outputs": [],
|
| 69 |
"source": [
|
| 70 |
+
"import os \n",
|
| 71 |
+
"import json\n",
|
| 72 |
+
"import asyncio\n",
|
| 73 |
+
"from langchain_community.tools import DuckDuckGoSearchRun\n",
|
| 74 |
+
"\n",
|
| 75 |
+
"# Initialize search\n",
|
| 76 |
+
"search_tool = DuckDuckGoSearchRun()\n",
|
| 77 |
+
"MERCHANT_CACHE_FILE = \"merchant_cache.json\"\n",
|
| 78 |
+
"\n",
|
| 79 |
+
"# Helper to load/save cache\n",
|
| 80 |
+
"def _load_cache():\n",
|
| 81 |
+
" if os.path.exists(MERCHANT_CACHE_FILE):\n",
|
| 82 |
+
" try:\n",
|
| 83 |
+
" with open(MERCHANT_CACHE_FILE, 'r') as f:\n",
|
| 84 |
+
" return json.load(f)\n",
|
| 85 |
+
" except:\n",
|
| 86 |
+
" return {}\n",
|
| 87 |
+
" return {}\n",
|
| 88 |
+
"\n",
|
| 89 |
+
"def _save_cache(cache):\n",
|
| 90 |
+
" with open(MERCHANT_CACHE_FILE, 'w') as f:\n",
|
| 91 |
+
" json.dump(cache, f)\n",
|
| 92 |
+
"\n",
|
| 93 |
+
"async def get_merchant_category_async(description, cache, sem):\n",
|
| 94 |
+
" \"\"\"Async search with semaphore and shared cache\"\"\"\n",
|
| 95 |
+
" if description in cache:\n",
|
| 96 |
+
" return cache[description]\n",
|
| 97 |
+
" \n",
|
| 98 |
+
" async with sem:\n",
|
| 99 |
+
" try:\n",
|
| 100 |
+
" # Random small sleep to jitter requests slightly\n",
|
| 101 |
+
" await asyncio.sleep(0.05) \n",
|
| 102 |
+
" print(f\" 🔍 Web searching for: {description}...\")\n",
|
| 103 |
+
" # Run the search asynchronously\n",
|
| 104 |
+
" result = await search_tool.ainvoke(f\"What type of business / store is '{description}'? Whats is location\")\n",
|
| 105 |
+
" return result\n",
|
| 106 |
+
" except Exception as e:\n",
|
| 107 |
+
" print(f\" ⚠️ Search failed for {description}: {e}\")\n",
|
| 108 |
+
" return \"Unknown\"\n",
|
| 109 |
+
"\n",
|
| 110 |
+
"async def ingest_csv(file_path, llm, db_path=\"money_rag.db\"):\n",
|
| 111 |
+
" print(f\"📂 Processing {file_path}...\")\n",
|
| 112 |
" df = pd.read_csv(file_path)\n",
|
| 113 |
" headers = df.columns.tolist()\n",
|
|
|
|
| 114 |
" sample_data = df.head(10).to_json() \n",
|
| 115 |
"\n",
|
| 116 |
+
" # 1. LLM Mapping (Sync is fine here)\n",
|
| 117 |
" prompt = ChatPromptTemplate.from_template(\"\"\"\n",
|
| 118 |
" Act as a financial data parser. Analyze this CSV data:\n",
|
| 119 |
" Filename: {filename}\n",
|
|
|
|
| 122 |
"\n",
|
| 123 |
" TASK:\n",
|
| 124 |
" 1. Map the CSV columns to standard fields: date, description, amount, and category.\n",
|
| 125 |
+
" 2. Determine the 'sign_convention' for spending.\n",
|
| 126 |
" \n",
|
| 127 |
" RULES:\n",
|
| 128 |
" - If the filename suggests 'Discover' credit card, spending are usually POSITIVE.\n",
|
|
|
|
| 132 |
" - Look at the sample data for known merchants or spending patterns.\n",
|
| 133 |
" - If spending (like a restaurant or store) is NEGATIVE (e.g., -25.00), the convention is 'spending_is_negative'.\n",
|
| 134 |
" - If spending is POSITIVE (e.g., 25.00), the convention is 'spending_is_positive'.\n",
|
|
|
|
| 135 |
"\n",
|
| 136 |
" OUTPUT FORMAT (JSON ONLY):\n",
|
| 137 |
" {{\n",
|
|
|
|
| 146 |
" chain = prompt | llm | JsonOutputParser()\n",
|
| 147 |
" mapping = chain.invoke({\"headers\": headers, \"sample\": sample_data, \"filename\": file_path})\n",
|
| 148 |
"\n",
|
| 149 |
+
" # 2. Standardization\n",
|
| 150 |
" standard_df = pd.DataFrame()\n",
|
| 151 |
+
" standard_df['id'] = [str(uuid.uuid4()) for _ in range(len(df))]\n",
|
| 152 |
" standard_df['transaction_date'] = pd.to_datetime(df[mapping['date_col']])\n",
|
| 153 |
" standard_df['description'] = df[mapping['desc_col']]\n",
|
| 154 |
" \n",
|
| 155 |
+
" # Normalization Logic\n",
|
|
|
|
| 156 |
" raw_amounts = pd.to_numeric(df[mapping['amount_col']])\n",
|
|
|
|
| 157 |
" if mapping['sign_convention'] == \"spending_is_negative\":\n",
|
|
|
|
|
|
|
| 158 |
" standard_df['amount'] = raw_amounts * -1\n",
|
| 159 |
" else:\n",
|
|
|
|
| 160 |
" standard_df['amount'] = raw_amounts\n",
|
| 161 |
" \n",
|
| 162 |
" standard_df['category'] = df[mapping['category_col']] if mapping.get('category_col') else 'Uncategorized'\n",
|
| 163 |
" standard_df['source_file'] = file_path.split(\"/\")[-1]\n",
|
| 164 |
"\n",
|
| 165 |
+
" # 3. --- Async Enrichment Step ---\n",
|
| 166 |
+
" print(\" ✨ Enriching descriptions (Async)...\")\n",
|
| 167 |
+
" unique_descriptions = standard_df['description'].unique()\n",
|
| 168 |
+
" \n",
|
| 169 |
+
" # Load cache once\n",
|
| 170 |
+
" cache = _load_cache()\n",
|
| 171 |
+
" \n",
|
| 172 |
+
" # Create a semaphore to limit concurrent web searches (e.g. 5)\n",
|
| 173 |
+
" sem = asyncio.Semaphore(5)\n",
|
| 174 |
+
" \n",
|
| 175 |
+
" # Create tasks for all descriptions\n",
|
| 176 |
+
" # Note: We process ALL descriptions now effectively\n",
|
| 177 |
+
" tasks = []\n",
|
| 178 |
+
" for desc in unique_descriptions:\n",
|
| 179 |
+
" tasks.append(get_merchant_category_async(desc, cache, sem))\n",
|
| 180 |
+
" \n",
|
| 181 |
+
" # Run in parallel\n",
|
| 182 |
+
" results = await asyncio.gather(*tasks)\n",
|
| 183 |
+
" \n",
|
| 184 |
+
" # Update cache object with new results\n",
|
| 185 |
+
" for desc, res in zip(unique_descriptions, results):\n",
|
| 186 |
+
" cache[desc] = res\n",
|
| 187 |
+
" \n",
|
| 188 |
+
" # Save cache back to disk\n",
|
| 189 |
+
" _save_cache(cache)\n",
|
| 190 |
+
" \n",
|
| 191 |
+
" # Map back\n",
|
| 192 |
+
" desc_map = dict(zip(unique_descriptions, results))\n",
|
| 193 |
+
" standard_df['enriched_info'] = standard_df['description'].map(desc_map).fillna(\"\")\n",
|
| 194 |
+
"\n",
|
| 195 |
+
" # 4. Save to DB\n",
|
| 196 |
" conn = sqlite3.connect(db_path)\n",
|
| 197 |
" standard_df.to_sql(\"transactions\", conn, if_exists=\"append\", index=False)\n",
|
| 198 |
" conn.close()\n",
|
| 199 |
" \n",
|
| 200 |
+
" print(f\"✅ Ingested {len(standard_df)} rows from {file_path.split('/')[-1]}. Logic: {mapping['sign_convention']}\")"
|
| 201 |
]
|
| 202 |
},
|
| 203 |
{
|
| 204 |
"cell_type": "code",
|
| 205 |
+
"execution_count": 14,
|
| 206 |
"metadata": {},
|
| 207 |
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
{
|
| 209 |
"name": "stdout",
|
| 210 |
"output_type": "stream",
|
| 211 |
"text": [
|
| 212 |
+
"📂 Processing /Users/sawale/Documents/learning/money_rag/demo_data/Discover-AllAvailable-20260110.csv...\n",
|
| 213 |
+
" ✨ Enriching descriptions (Async)...\n",
|
| 214 |
+
" 🔍 Web searching for: BACK MARKET BROOKLYN NY...\n",
|
| 215 |
+
" 🔍 Web searching for: TEMU.COM 8884958368 DE...\n",
|
| 216 |
+
" 🔍 Web searching for: WALMART STORE 00332 HUNTSVILLE AL...\n",
|
| 217 |
+
" 🔍 Web searching for: $100 STATEMENT CREDIT W 1ST PU...\n",
|
| 218 |
+
" 🔍 Web searching for: PY *KUNG-FU TEA AL HUNTSVILLE AL...\n",
|
| 219 |
+
" 🔍 Web searching for: MADISON MONTGOMERY AL...\n",
|
| 220 |
+
" 🔍 Web searching for: INTERNET PAYMENT - THANK YOU...\n",
|
| 221 |
+
" 🔍 Web searching for: GRUBHUB - UNIVERSITY OF HUNTSVILLE AL...\n",
|
| 222 |
+
" 🔍 Web searching for: MINT MOBILE 800-683-7392 CA...\n",
|
| 223 |
+
" 🔍 Web searching for: POPEYES 2577 HUNTSVILLE AL...\n",
|
| 224 |
+
" 🔍 Web searching for: 88 BUFFET HUNTSVILLE AL...\n",
|
| 225 |
+
" 🔍 Web searching for: VIET HUONG VIETNAMESE RE HUNTSVILLE AL...\n",
|
| 226 |
+
" 🔍 Web searching for: CASHBACK BONUS REDEMPTION PYMT/STMT CRDT...\n",
|
| 227 |
+
" 🔍 Web searching for: SPO*THECURRYMODERNINDIAN HUNTSVILLE AL...\n",
|
| 228 |
+
" 🔍 Web searching for: H&M 0273HUNTSVILLE HUNTSVILLE ALUS0273001241222182740...\n",
|
| 229 |
+
" 🔍 Web searching for: INDIAN BAZAAR HUNTSVILLE AL...\n",
|
| 230 |
+
" 🔍 Web searching for: HANDELS HOMEMADE HUNTSVI HUNTSVILLE AL...\n",
|
| 231 |
+
" 🔍 Web searching for: UAH COLLEGE 256-824-6170 AL...\n",
|
| 232 |
+
" 🔍 Web searching for: UAH COLLEGE FSF 800-346-9252 MA...\n",
|
| 233 |
+
" 🔍 Web searching for: CHIPOTLE 1687 NASHVILLE TN...\n",
|
| 234 |
+
" 🔍 Web searching for: TST*PIE TOWN TACOS - F NASHVILLE TN00153526022200965677AA...\n",
|
| 235 |
+
" 🔍 Web searching for: INDIAN BAZAAR HUNTSVILLE ALGOOGLE PAY ENDING IN 8984...\n",
|
| 236 |
+
" 🔍 Web searching for: INDIA MART HUNTSVILLE ALGOOGLE PAY ENDING IN 8984...\n",
|
| 237 |
+
" 🔍 Web searching for: PAYPAL *KEVDUDE1186 KEV 888-221-1161 CA...\n",
|
| 238 |
+
" 🔍 Web searching for: LYFT *RIDE WED 10AM 8552800278 CA...\n",
|
| 239 |
+
" 🔍 Web searching for: SKECHERS USA INC 1069 HUNTSVILLE AL...\n",
|
| 240 |
+
" 🔍 Web searching for: STORE HUNTSVILLE AL...\n",
|
| 241 |
+
" 🔍 Web searching for: LYFT *RIDE WED 3PM 8552800278 CA...\n",
|
| 242 |
+
" 🔍 Web searching for: SQ *TAQUERIA LAS ADELI HUNTSVILLE AL0002305843021411201895...\n",
|
| 243 |
+
" 🔍 Web searching for: UAH HUNTSVILLE DUNKIN HUNTSVILLE AL...\n",
|
| 244 |
+
" 🔍 Web searching for: WALMART.COM 800-925-6278 AR...\n",
|
| 245 |
+
" 🔍 Web searching for: WALMART.COM 8009256278 BENTONVILLE AR...\n",
|
| 246 |
+
" 🔍 Web searching for: TOUS LES JOURS - HUNTSVI HUNTSVILLE AL...\n",
|
| 247 |
+
" 🔍 Web searching for: MARSHALLS #422 HUNTSVILLE AL...\n",
|
| 248 |
+
" 🔍 Web searching for: ROSS STORE #2436 HUNTSVILLE AL...\n",
|
| 249 |
+
" 🔍 Web searching for: SPRINTAX NR TAX 8882038900 NY...\n",
|
| 250 |
+
" 🔍 Web searching for: USPS PO 0142460804 HUNTSVILLE AL...\n",
|
| 251 |
+
" 🔍 Web searching for: CHIPOTLE 1796 HUNTSVILLE ALGOOGLE PAY ENDING IN 8984...\n",
|
| 252 |
+
" 🔍 Web searching for: TST*POURHOUSE HUNTSVILLE AL00031984024314246667AA...\n",
|
| 253 |
+
" 🔍 Web searching for: TST*WOKS UP HUNTSVILLE AL00075396024313993332AA...\n",
|
| 254 |
+
" 🔍 Web searching for: SPIRIT AIRLINES 8014012222 FL...\n",
|
| 255 |
+
" 🔍 Web searching for: CHIPOTLE 1796 HUNTSVILLE AL...\n",
|
| 256 |
+
" 🔍 Web searching for: UAH BURSARS OFFICE HUNTSVILLE AL...\n",
|
| 257 |
+
" 🔍 Web searching for: STARS AND STRIKES - HUNT HUNTSVILLE AL...\n",
|
| 258 |
+
" 🔍 Web searching for: ROSS STORES #620 HUNTSVILLE AL...\n",
|
| 259 |
+
" 🔍 Web searching for: TST*KAMADO RAMEN - MID HUNTSVILLE AL00006963025030352515AA...\n",
|
| 260 |
+
" 🔍 Web searching for: SQ *MOM'SCLAYCO HARVEST AL0002305843022068424398...\n",
|
| 261 |
+
" 🔍 Web searching for: DOLLARTREE HUNTSVILLE AL...\n",
|
| 262 |
+
" 🔍 Web searching for: SLIM & HUSKIES NASHVILLE TN...\n",
|
| 263 |
+
" 🔍 Web searching for: CHIPOTLE 1392 SANTA MONICA CA...\n",
|
| 264 |
+
" 🔍 Web searching for: DOLLAR TREE LAS VEGAS NV...\n",
|
| 265 |
+
" 🔍 Web searching for: LYFT *RIDE TUE 12AM 8552800278 CA...\n",
|
| 266 |
+
" 🔍 Web searching for: SQ *SHIKU GCM LOS ANGELES CA0001152921515467218869...\n",
|
| 267 |
+
" 🔍 Web searching for: SQ *SHIKU GCM LOS ANGELES CA0001152921515467211997...\n",
|
| 268 |
+
" 🔍 Web searching for: WALMART STORE 05686 BURBANK CA...\n",
|
| 269 |
+
" 🔍 Web searching for: CAFE BELLA NEWPORT SAN DIEGO CAGOOGLE PAY ENDING IN 8984...\n",
|
| 270 |
+
" 🔍 Web searching for: CHIPOTLE 2883 NORTH LAS VEGNVGOOGLE PAY ENDING IN 8984...\n",
|
| 271 |
+
" 🔍 Web searching for: SHELL10006319007 HESPERIA CAGOOGLE PAY ENDING IN 8984...\n",
|
| 272 |
+
" 🔍 Web searching for: PANDA EXPRESS #1964 LAS VEGAS NV...\n",
|
| 273 |
+
" 🔍 Web searching for: DENNY'S #0141 QR LAS VEGAS NVGOOGLE PAY ENDING IN 8984...\n",
|
| 274 |
+
" 🔍 Web searching for: LAS VEGAS SOUVENIRS AND LAS VEGAS NV...\n",
|
| 275 |
+
" 🔍 Web searching for: CTLP*FIRST CLASS VENDI BELLGARDENS CA...\n",
|
| 276 |
+
" 🔍 Web searching for: SHELL12874333011 FRANKLIN TN...\n",
|
| 277 |
+
" 🔍 Web searching for: AMARAVATI INDIAN CUISINE BRENTWOOD TNGOOGLE PAY ENDING IN 8984...\n",
|
| 278 |
+
" 🔍 Web searching for: CENTRAL MARKET NASHVILLE TN...\n",
|
| 279 |
+
" 🔍 Web searching for: TST*PRINCES HOT CHICKE NASHVILLE TN00104605025320544723AA...\n",
|
| 280 |
+
" 🔍 Web searching for: TST*PRINCES HOT CHICKE NASHVILLE TN00104605025321087148AA...\n",
|
| 281 |
+
" 🔍 Web searching for: WALMART STORE 05616 NASHVILLE TN...\n",
|
| 282 |
+
" 🔍 Web searching for: PY *KUNG-FU TEA AL HUNTSVILLE ALGOOGLE PAY ENDING IN 8984...\n",
|
| 283 |
+
" 🔍 Web searching for: 2LEVY R&C CHATTANOOGA TNGOOGLE PAY ENDING IN 8984...\n",
|
| 284 |
+
"✅ Ingested 124 rows from Discover-AllAvailable-20260110.csv. Logic: spending_is_positive\n",
|
| 285 |
+
"📂 Processing /Users/sawale/Documents/learning/money_rag/demo_data/Chase5282_Activity20240110_20260110_20260111.CSV...\n",
|
| 286 |
+
" ✨ Enriching descriptions (Async)...\n",
|
| 287 |
+
" 🔍 Web searching for: TOUS LES JOURS - HUNTSVIL...\n",
|
| 288 |
+
" 🔍 Web searching for: Payment Thank You-Mobile...\n",
|
| 289 |
+
" 🔍 Web searching for: INDIAN BAZAAR...\n",
|
| 290 |
+
" 🔍 Web searching for: TST*BLUE OAK BBQ-HUNTSVI...\n",
|
| 291 |
+
" 🔍 Web searching for: AMC 4112 VAL BEND 18...\n",
|
| 292 |
+
" 🔍 Web searching for: HANDELS HOMEMADE JONES V...\n",
|
| 293 |
+
" 🔍 Web searching for: PAYYOURSELFBACK CREDIT...\n",
|
| 294 |
+
" 🔍 Web searching for: TST* HYDERABAD HOUSE...\n",
|
| 295 |
+
" 🔍 Web searching for: PATEL BROTHERS NASHVILLE...\n",
|
| 296 |
+
" 🔍 Web searching for: CITY OF HUNTSVILLE...\n",
|
| 297 |
+
" 🔍 Web searching for: WM SUPERCENTER #332...\n",
|
| 298 |
+
" 🔍 Web searching for: WAL-MART #0332...\n",
|
| 299 |
+
" 🔍 Web searching for: AMAZON MKTPL*OS1RI3LN3...\n",
|
| 300 |
+
" 🔍 Web searching for: TST* HATTIE B'S HUNTSVILL...\n",
|
| 301 |
+
" 🔍 Web searching for: AMAZON MKTPL*BI23Z6JR0...\n",
|
| 302 |
+
" 🔍 Web searching for: AMAZON MKTPL*BI9IW9OS2...\n",
|
| 303 |
+
" 🔍 Web searching for: AMAZON MKTPL*BI0296OJ2...\n",
|
| 304 |
+
" 🔍 Web searching for: AMAZON MKTPL*BB71A2881...\n",
|
| 305 |
+
" 🔍 Web searching for: AMAZON MKTPL*BB3FU2UQ2...\n",
|
| 306 |
+
" 🔍 Web searching for: AMAZON MKTPL*BI03P1OX2...\n",
|
| 307 |
+
" 🔍 Web searching for: AMAZON MKTPL*BB92U9QK2...\n",
|
| 308 |
+
" 🔍 Web searching for: AMAZON MKTPL*BB9TA14Q0...\n",
|
| 309 |
+
" 🔍 Web searching for: 88 BUFFET...\n",
|
| 310 |
+
" 🔍 Web searching for: AMAZON MKTPL*BB0DC71B1...\n",
|
| 311 |
+
" 🔍 Web searching for: AMAZON MKTPL*B20NN4ID0...\n",
|
| 312 |
+
" 🔍 Web searching for: AMAZON MKTPL*B273C1WY2...\n",
|
| 313 |
+
" 🔍 Web searching for: AMAZON MKTPL*B27IN41E1...\n",
|
| 314 |
+
" 🔍 Web searching for: AMAZON MKTPL*B250Z60P1...\n",
|
| 315 |
+
" 🔍 Web searching for: BEST BUY 00005140...\n",
|
| 316 |
+
" 🔍 Web searching for: DAVES HOT CHICKEN 1282...\n",
|
| 317 |
+
" 🔍 Web searching for: SQ *VIETCUISINE LLC...\n",
|
| 318 |
+
" 🔍 Web searching for: CHICK-FIL-A #00579...\n",
|
| 319 |
+
" 🔍 Web searching for: COSTCO WHSE #0356...\n",
|
| 320 |
+
" 🔍 Web searching for: AMAZON MKTPL*NK4AM43Q2...\n",
|
| 321 |
+
" 🔍 Web searching for: HUNTSVILLE FLV...\n",
|
| 322 |
+
" 🔍 Web searching for: AMAZON MKTPL*NM1H055K0...\n",
|
| 323 |
+
" 🔍 Web searching for: MAPCO EXPRESS #3403...\n",
|
| 324 |
+
" 🔍 Web searching for: DUNKIN #346212 Q35...\n",
|
| 325 |
+
" 🔍 Web searching for: CENTRAL MARKET...\n",
|
| 326 |
+
" 🔍 Web searching for: TARA INTERNATIONAL MARKET...\n",
|
| 327 |
+
" 🔍 Web searching for: BOTAN MARKET INC...\n",
|
| 328 |
+
" 🔍 Web searching for: AMARAVATI INDIAN CUISINE...\n",
|
| 329 |
+
" 🔍 Web searching for: GRUBHUB - UNIVERSITY OF A...\n",
|
| 330 |
+
" 🔍 Web searching for: BURGER KING #4959...\n",
|
| 331 |
+
" 🔍 Web searching for: PANDA EXPRESS #3013...\n",
|
| 332 |
+
" 🔍 Web searching for: MCDONALD'S F2431...\n",
|
| 333 |
+
" 🔍 Web searching for: ENDZONE COLLECTIBLES...\n",
|
| 334 |
+
" 🔍 Web searching for: ZIMMAD EVE* ZIMMAD JOI...\n",
|
| 335 |
+
" 🔍 Web searching for: SQ *SPILL COFFEE AND CREA...\n",
|
| 336 |
+
" 🔍 Web searching for: 10267 CAVA WHITESBURG...\n",
|
| 337 |
+
" 🔍 Web searching for: SPO*DRAGONSFORGECAFE...\n",
|
| 338 |
+
" 🔍 Web searching for: UAH BURSARS OFFICE...\n",
|
| 339 |
+
" 🔍 Web searching for: MARATHON PETRO42804...\n",
|
| 340 |
+
" 🔍 Web searching for: TST*NOTHING BUT NOODLES...\n",
|
| 341 |
+
" 🔍 Web searching for: VEDA INDIAN CUISINE...\n",
|
| 342 |
+
" 🔍 Web searching for: DOLLARTREE...\n",
|
| 343 |
+
" 🔍 Web searching for: TARGET 00013466...\n",
|
| 344 |
+
" 🔍 Web searching for: POPEYES 2577...\n",
|
| 345 |
+
" 🔍 Web searching for: DEORALI GROCERY...\n",
|
| 346 |
+
" 🔍 Web searching for: HELLO ATLANTA #33...\n",
|
| 347 |
+
" 🔍 Web searching for: SKY VIEW ATLANTA...\n",
|
| 348 |
+
" 🔍 Web searching for: STARBUCKS 25111...\n",
|
| 349 |
+
" 🔍 Web searching for: BP#8998205AM/PM WADE GRE...\n",
|
| 350 |
+
" 🔍 Web searching for: Waffle House 0857...\n",
|
| 351 |
+
" 🔍 Web searching for: CINEMARK 1131 BOXCON...\n",
|
| 352 |
+
" 🔍 Web searching for: CINEMARK 1131 RSTBAR...\n",
|
| 353 |
+
" 🔍 Web searching for: HOMEGOODS # 0568...\n",
|
| 354 |
+
" 🔍 Web searching for: ASIAN MARKET...\n",
|
| 355 |
+
" 🔍 Web searching for: PANDA EXPRESS #2005...\n",
|
| 356 |
+
" 🔍 Web searching for: STARS AND STRIKES - HUNTS...\n",
|
| 357 |
+
" 🔍 Web searching for: WAL-MART #332...\n",
|
| 358 |
+
"✅ Ingested 126 rows from Chase5282_Activity20240110_20260110_20260111.CSV. Logic: spending_is_negative\n"
|
| 359 |
]
|
| 360 |
}
|
| 361 |
],
|
|
|
|
| 364 |
"path2 = \"/Users/sawale/Documents/learning/money_rag/demo_data/Chase5282_Activity20240110_20260110_20260111.CSV\"\n",
|
| 365 |
"\n",
|
| 366 |
"# Initialize the Gemini model via Vertex AI\n",
|
| 367 |
+
"vertex_llm = init_chat_model(\n",
|
| 368 |
" \"gemini-2.5-flash\", \n",
|
| 369 |
" model_provider=\"google_vertexai\",\n",
|
| 370 |
" project='gen-lang-client-0311515393',\n",
|
| 371 |
" location='us-central1',\n",
|
| 372 |
")\n",
|
| 373 |
"\n",
|
| 374 |
+
"# Run async functions in Jupyter\n",
|
| 375 |
+
"# We run them sequentially here to avoid file lock issues with SQLite if both try to write at once\n",
|
| 376 |
+
"# (Though SQLite handles concurrency, keeping ingestion strictly ordered is safer for the demo)\n",
|
| 377 |
+
"await ingest_csv(path1, vertex_llm)\n",
|
| 378 |
+
"await ingest_csv(path2, vertex_llm)"
|
| 379 |
]
|
| 380 |
},
|
| 381 |
{
|
| 382 |
"cell_type": "code",
|
| 383 |
+
"execution_count": 15,
|
| 384 |
"metadata": {},
|
| 385 |
"outputs": [
|
| 386 |
{
|
|
|
|
| 404 |
" <thead>\n",
|
| 405 |
" <tr style=\"text-align: right;\">\n",
|
| 406 |
" <th></th>\n",
|
| 407 |
+
" <th>id</th>\n",
|
| 408 |
" <th>transaction_date</th>\n",
|
| 409 |
" <th>description</th>\n",
|
| 410 |
" <th>amount</th>\n",
|
| 411 |
" <th>category</th>\n",
|
| 412 |
" <th>source_file</th>\n",
|
| 413 |
+
" <th>enriched_info</th>\n",
|
| 414 |
" </tr>\n",
|
| 415 |
" </thead>\n",
|
| 416 |
" <tbody>\n",
|
| 417 |
" <tr>\n",
|
| 418 |
" <th>0</th>\n",
|
| 419 |
+
" <td>8ea03f61-dd51-45f9-a633-bd363154a424</td>\n",
|
| 420 |
" <td>2024-10-17 00:00:00</td>\n",
|
| 421 |
" <td>BACK MARKET BROOKLYN NY</td>\n",
|
| 422 |
" <td>231.19</td>\n",
|
| 423 |
" <td>Merchandise</td>\n",
|
| 424 |
" <td>Discover-AllAvailable-20260110.csv</td>\n",
|
| 425 |
+
" <td>Online Retailer in New York, NY . See BBB rati...</td>\n",
|
| 426 |
" </tr>\n",
|
| 427 |
" <tr>\n",
|
| 428 |
" <th>1</th>\n",
|
| 429 |
+
" <td>02107b64-e0bd-4a7f-b5cc-674b7767a50f</td>\n",
|
| 430 |
" <td>2024-10-18 00:00:00</td>\n",
|
| 431 |
" <td>TEMU.COM 8884958368 DE</td>\n",
|
| 432 |
" <td>16.51</td>\n",
|
| 433 |
" <td>Merchandise</td>\n",
|
| 434 |
" <td>Discover-AllAvailable-20260110.csv</td>\n",
|
| 435 |
+
" <td>Temu ' s business model has allowed it to beco...</td>\n",
|
| 436 |
" </tr>\n",
|
| 437 |
" <tr>\n",
|
| 438 |
" <th>2</th>\n",
|
| 439 |
+
" <td>a5a7cc9f-46ab-4913-a695-290b75f590a9</td>\n",
|
| 440 |
" <td>2024-10-18 00:00:00</td>\n",
|
| 441 |
" <td>WALMART STORE 00332 HUNTSVILLE AL</td>\n",
|
| 442 |
" <td>146.73</td>\n",
|
| 443 |
" <td>Merchandise</td>\n",
|
| 444 |
" <td>Discover-AllAvailable-20260110.csv</td>\n",
|
| 445 |
+
" <td>Walmart Inc. is an American multinational reta...</td>\n",
|
| 446 |
" </tr>\n",
|
| 447 |
" <tr>\n",
|
| 448 |
" <th>3</th>\n",
|
| 449 |
+
" <td>332c4fac-7760-4c63-992b-fd24fcdd4eee</td>\n",
|
| 450 |
" <td>2024-10-18 00:00:00</td>\n",
|
| 451 |
" <td>$100 STATEMENT CREDIT W 1ST PU</td>\n",
|
| 452 |
" <td>-100.00</td>\n",
|
| 453 |
" <td>Awards and Rebate Credits</td>\n",
|
| 454 |
" <td>Discover-AllAvailable-20260110.csv</td>\n",
|
| 455 |
+
" <td>All U.S. Bank credit cards offer contactless c...</td>\n",
|
| 456 |
" </tr>\n",
|
| 457 |
" <tr>\n",
|
| 458 |
" <th>4</th>\n",
|
| 459 |
+
" <td>ef01b0e5-5a29-4c08-ac16-fff028cc23e6</td>\n",
|
| 460 |
" <td>2024-11-02 00:00:00</td>\n",
|
| 461 |
" <td>PY *KUNG-FU TEA AL HUNTSVILLE AL</td>\n",
|
| 462 |
" <td>8.09</td>\n",
|
| 463 |
" <td>Restaurants</td>\n",
|
| 464 |
" <td>Discover-AllAvailable-20260110.csv</td>\n",
|
| 465 |
+
" <td>Jan 22, 2021 · Best part to me--besides the ro...</td>\n",
|
| 466 |
" </tr>\n",
|
| 467 |
" <tr>\n",
|
| 468 |
" <th>...</th>\n",
|
|
|
|
| 471 |
" <td>...</td>\n",
|
| 472 |
" <td>...</td>\n",
|
| 473 |
" <td>...</td>\n",
|
| 474 |
+
" <td>...</td>\n",
|
| 475 |
+
" <td>...</td>\n",
|
| 476 |
" </tr>\n",
|
| 477 |
" <tr>\n",
|
| 478 |
" <th>245</th>\n",
|
| 479 |
+
" <td>8c5d2425-5ddd-4532-8279-d6b3e733ec01</td>\n",
|
| 480 |
" <td>2025-06-18 00:00:00</td>\n",
|
| 481 |
" <td>PANDA EXPRESS #2005</td>\n",
|
| 482 |
" <td>52.87</td>\n",
|
| 483 |
" <td>Food & Drink</td>\n",
|
| 484 |
" <td>Chase5282_Activity20240110_20260110_20260111.CSV</td>\n",
|
| 485 |
+
" <td>This is a list of notable current and former f...</td>\n",
|
| 486 |
" </tr>\n",
|
| 487 |
" <tr>\n",
|
| 488 |
" <th>246</th>\n",
|
| 489 |
+
" <td>02578b56-f0a1-490d-8cf1-dd061a26ebf2</td>\n",
|
| 490 |
" <td>2025-06-14 00:00:00</td>\n",
|
| 491 |
" <td>Payment Thank You-Mobile</td>\n",
|
| 492 |
" <td>-62.07</td>\n",
|
| 493 |
" <td>None</td>\n",
|
| 494 |
" <td>Chase5282_Activity20240110_20260110_20260111.CSV</td>\n",
|
| 495 |
+
" <td>Locate a mobile phone | GSM locator of mobile ...</td>\n",
|
| 496 |
" </tr>\n",
|
| 497 |
" <tr>\n",
|
| 498 |
" <th>247</th>\n",
|
| 499 |
+
" <td>e0acc481-3baf-458b-9cdf-75bc1524a5f5</td>\n",
|
| 500 |
" <td>2025-06-12 00:00:00</td>\n",
|
| 501 |
" <td>STARS AND STRIKES - HUNTS</td>\n",
|
| 502 |
" <td>21.80</td>\n",
|
| 503 |
" <td>Entertainment</td>\n",
|
| 504 |
" <td>Chase5282_Activity20240110_20260110_20260111.CSV</td>\n",
|
| 505 |
+
" <td>At our Huntsville , AL location , we pride our...</td>\n",
|
| 506 |
" </tr>\n",
|
| 507 |
" <tr>\n",
|
| 508 |
" <th>248</th>\n",
|
| 509 |
+
" <td>dfc97eee-5207-464a-b6e1-a49fd80df48c</td>\n",
|
| 510 |
" <td>2025-06-11 00:00:00</td>\n",
|
| 511 |
" <td>WAL-MART #332</td>\n",
|
| 512 |
" <td>4.47</td>\n",
|
| 513 |
" <td>Groceries</td>\n",
|
| 514 |
" <td>Chase5282_Activity20240110_20260110_20260111.CSV</td>\n",
|
| 515 |
+
" <td>From toys and video games to fashionable cloth...</td>\n",
|
| 516 |
" </tr>\n",
|
| 517 |
" <tr>\n",
|
| 518 |
" <th>249</th>\n",
|
| 519 |
+
" <td>576a6f5a-d090-4e4c-84f1-c1cc3fe8809c</td>\n",
|
| 520 |
" <td>2025-06-11 00:00:00</td>\n",
|
| 521 |
" <td>WAL-MART #332</td>\n",
|
| 522 |
" <td>57.60</td>\n",
|
| 523 |
" <td>Groceries</td>\n",
|
| 524 |
" <td>Chase5282_Activity20240110_20260110_20260111.CSV</td>\n",
|
| 525 |
+
" <td>From toys and video games to fashionable cloth...</td>\n",
|
| 526 |
" </tr>\n",
|
| 527 |
" </tbody>\n",
|
| 528 |
"</table>\n",
|
| 529 |
+
"<p>250 rows × 7 columns</p>\n",
|
| 530 |
"</div>"
|
| 531 |
],
|
| 532 |
"text/plain": [
|
| 533 |
+
" id transaction_date \\\n",
|
| 534 |
+
"0 8ea03f61-dd51-45f9-a633-bd363154a424 2024-10-17 00:00:00 \n",
|
| 535 |
+
"1 02107b64-e0bd-4a7f-b5cc-674b7767a50f 2024-10-18 00:00:00 \n",
|
| 536 |
+
"2 a5a7cc9f-46ab-4913-a695-290b75f590a9 2024-10-18 00:00:00 \n",
|
| 537 |
+
"3 332c4fac-7760-4c63-992b-fd24fcdd4eee 2024-10-18 00:00:00 \n",
|
| 538 |
+
"4 ef01b0e5-5a29-4c08-ac16-fff028cc23e6 2024-11-02 00:00:00 \n",
|
| 539 |
+
".. ... ... \n",
|
| 540 |
+
"245 8c5d2425-5ddd-4532-8279-d6b3e733ec01 2025-06-18 00:00:00 \n",
|
| 541 |
+
"246 02578b56-f0a1-490d-8cf1-dd061a26ebf2 2025-06-14 00:00:00 \n",
|
| 542 |
+
"247 e0acc481-3baf-458b-9cdf-75bc1524a5f5 2025-06-12 00:00:00 \n",
|
| 543 |
+
"248 dfc97eee-5207-464a-b6e1-a49fd80df48c 2025-06-11 00:00:00 \n",
|
| 544 |
+
"249 576a6f5a-d090-4e4c-84f1-c1cc3fe8809c 2025-06-11 00:00:00 \n",
|
| 545 |
+
"\n",
|
| 546 |
+
" description amount category \\\n",
|
| 547 |
+
"0 BACK MARKET BROOKLYN NY 231.19 Merchandise \n",
|
| 548 |
+
"1 TEMU.COM 8884958368 DE 16.51 Merchandise \n",
|
| 549 |
+
"2 WALMART STORE 00332 HUNTSVILLE AL 146.73 Merchandise \n",
|
| 550 |
+
"3 $100 STATEMENT CREDIT W 1ST PU -100.00 Awards and Rebate Credits \n",
|
| 551 |
+
"4 PY *KUNG-FU TEA AL HUNTSVILLE AL 8.09 Restaurants \n",
|
| 552 |
+
".. ... ... ... \n",
|
| 553 |
+
"245 PANDA EXPRESS #2005 52.87 Food & Drink \n",
|
| 554 |
+
"246 Payment Thank You-Mobile -62.07 None \n",
|
| 555 |
+
"247 STARS AND STRIKES - HUNTS 21.80 Entertainment \n",
|
| 556 |
+
"248 WAL-MART #332 4.47 Groceries \n",
|
| 557 |
+
"249 WAL-MART #332 57.60 Groceries \n",
|
| 558 |
"\n",
|
| 559 |
+
" source_file \\\n",
|
| 560 |
+
"0 Discover-AllAvailable-20260110.csv \n",
|
| 561 |
+
"1 Discover-AllAvailable-20260110.csv \n",
|
| 562 |
+
"2 Discover-AllAvailable-20260110.csv \n",
|
| 563 |
+
"3 Discover-AllAvailable-20260110.csv \n",
|
| 564 |
+
"4 Discover-AllAvailable-20260110.csv \n",
|
| 565 |
+
".. ... \n",
|
| 566 |
+
"245 Chase5282_Activity20240110_20260110_20260111.CSV \n",
|
| 567 |
+
"246 Chase5282_Activity20240110_20260110_20260111.CSV \n",
|
| 568 |
+
"247 Chase5282_Activity20240110_20260110_20260111.CSV \n",
|
| 569 |
+
"248 Chase5282_Activity20240110_20260110_20260111.CSV \n",
|
| 570 |
+
"249 Chase5282_Activity20240110_20260110_20260111.CSV \n",
|
| 571 |
"\n",
|
| 572 |
+
" enriched_info \n",
|
| 573 |
+
"0 Online Retailer in New York, NY . See BBB rati... \n",
|
| 574 |
+
"1 Temu ' s business model has allowed it to beco... \n",
|
| 575 |
+
"2 Walmart Inc. is an American multinational reta... \n",
|
| 576 |
+
"3 All U.S. Bank credit cards offer contactless c... \n",
|
| 577 |
+
"4 Jan 22, 2021 · Best part to me--besides the ro... \n",
|
| 578 |
+
".. ... \n",
|
| 579 |
+
"245 This is a list of notable current and former f... \n",
|
| 580 |
+
"246 Locate a mobile phone | GSM locator of mobile ... \n",
|
| 581 |
+
"247 At our Huntsville , AL location , we pride our... \n",
|
| 582 |
+
"248 From toys and video games to fashionable cloth... \n",
|
| 583 |
+
"249 From toys and video games to fashionable cloth... \n",
|
| 584 |
"\n",
|
| 585 |
+
"[250 rows x 7 columns]"
|
| 586 |
]
|
| 587 |
},
|
| 588 |
+
"execution_count": 15,
|
| 589 |
"metadata": {},
|
| 590 |
"output_type": "execute_result"
|
| 591 |
}
|
| 592 |
],
|
| 593 |
"source": [
|
| 594 |
+
"\n",
|
| 595 |
"import sqlite3\n",
|
| 596 |
"import pandas as pd\n",
|
| 597 |
"\n",
|
|
|
|
| 610 |
},
|
| 611 |
{
|
| 612 |
"cell_type": "code",
|
| 613 |
+
"execution_count": 6,
|
| 614 |
+
"metadata": {},
|
| 615 |
+
"outputs": [],
|
| 616 |
+
"source": [
|
| 617 |
+
"# df_view[\"amount\"].sum()"
|
| 618 |
+
]
|
| 619 |
+
},
|
| 620 |
+
{
|
| 621 |
+
"cell_type": "code",
|
| 622 |
+
"execution_count": null,
|
| 623 |
"metadata": {},
|
| 624 |
"outputs": [
|
| 625 |
{
|
| 626 |
+
"name": "stderr",
|
| 627 |
+
"output_type": "stream",
|
| 628 |
+
"text": [
|
| 629 |
+
"/Users/sawale/Documents/learning/money_rag/.venv/lib/python3.12/site-packages/vertexai/_model_garden/_model_garden_models.py:278: UserWarning: This feature is deprecated as of June 24, 2025 and will be removed on June 24, 2026. For details, see https://cloud.google.com/vertex-ai/generative-ai/docs/deprecations/genai-vertexai-sdk.\n",
|
| 630 |
+
" warning_logs.show_deprecation_warning()\n"
|
| 631 |
+
]
|
| 632 |
+
},
|
| 633 |
+
{
|
| 634 |
+
"name": "stdout",
|
| 635 |
+
"output_type": "stream",
|
| 636 |
+
"text": [
|
| 637 |
+
"✅ Synced 250 records to Qdrant.\n"
|
| 638 |
+
]
|
| 639 |
}
|
| 640 |
],
|
| 641 |
"source": [
|
| 642 |
+
"db = SQLDatabase.from_uri(\"sqlite:///money_rag.db\")\n",
|
| 643 |
+
"\n",
|
| 644 |
+
"embeddings = VertexAIEmbeddings(model_name=\"text-embedding-005\")\n",
|
| 645 |
+
"\n",
|
| 646 |
+
"\n",
|
| 647 |
+
"# Initialize Qdrant with disk persistence\n",
|
| 648 |
+
"# \"path\" creates a local directory to store the vectors\n",
|
| 649 |
+
"qdrant_client = QdrantClient(path=\"qdrant_db\") \n",
|
| 650 |
+
"COLLECTION_NAME = \"transactions\"\n",
|
| 651 |
+
"\n",
|
| 652 |
+
"def sync_to_qdrant(db_path: str):\n",
|
| 653 |
+
" \"\"\"Sync the transactions table from SQLite to Qdrant vector store.\"\"\"\n",
|
| 654 |
+
" # Load data from the database\n",
|
| 655 |
+
" conn = sqlite3.connect(db_path)\n",
|
| 656 |
+
" # Ensure we select the new column\n",
|
| 657 |
+
" df = pd.read_sql_query(\"SELECT * FROM transactions\", conn)\n",
|
| 658 |
+
" conn.close()\n",
|
| 659 |
+
"\n",
|
| 660 |
+
" # Recreate collection to ensure clean state on re-ingestion\n",
|
| 661 |
+
" if qdrant_client.collection_exists(COLLECTION_NAME):\n",
|
| 662 |
+
" qdrant_client.delete_collection(COLLECTION_NAME)\n",
|
| 663 |
+
" \n",
|
| 664 |
+
" qdrant_client.create_collection(\n",
|
| 665 |
+
" collection_name=COLLECTION_NAME,\n",
|
| 666 |
+
" vectors_config=VectorParams(size=768, distance=Distance.COSINE),\n",
|
| 667 |
+
" )\n",
|
| 668 |
+
" \n",
|
| 669 |
+
" vector_store = QdrantVectorStore(\n",
|
| 670 |
+
" client=qdrant_client,\n",
|
| 671 |
+
" collection_name=COLLECTION_NAME,\n",
|
| 672 |
+
" embedding=embeddings,\n",
|
| 673 |
+
" )\n",
|
| 674 |
+
"\n",
|
| 675 |
+
" # Use description + enrichment as the main text for embedding\n",
|
| 676 |
+
" texts = []\n",
|
| 677 |
+
" for _, row in df.iterrows():\n",
|
| 678 |
+
" # Combine original description with the permanent enriched info\n",
|
| 679 |
+
" # If enriched_info is present, it looks like: \"MCDONALDS - Fast food chain...\"\n",
|
| 680 |
+
" enriched = row.get('enriched_info', '')\n",
|
| 681 |
+
" if enriched:\n",
|
| 682 |
+
" texts.append(f\"{row['description']} - {enriched}\")\n",
|
| 683 |
+
" else:\n",
|
| 684 |
+
" texts.append(str(row['description']))\n",
|
| 685 |
+
" \n",
|
| 686 |
+
" # Store other fields as metadata for correlation\n",
|
| 687 |
+
" metadatas = df[['id', 'amount', 'category', 'transaction_date']].to_dict('records')\n",
|
| 688 |
+
" # Convert timestamps to string for metadata compatibility\n",
|
| 689 |
+
" for m in metadatas:\n",
|
| 690 |
+
" m['transaction_date'] = str(m['transaction_date'])\n",
|
| 691 |
+
"\n",
|
| 692 |
+
" vector_store.add_texts(texts=texts, metadatas=metadatas)\n",
|
| 693 |
+
" print(f\"✅ Synced {len(texts)} records to Qdrant at 'qdrant_db/'.\")\n",
|
| 694 |
+
" return vector_store\n",
|
| 695 |
+
"\n",
|
| 696 |
+
"# Initialize store from DB\n",
|
| 697 |
+
"vector_store = sync_to_qdrant(\"money_rag.db\")"
|
| 698 |
]
|
| 699 |
},
|
| 700 |
+
{
|
| 701 |
+
"cell_type": "code",
|
| 702 |
+
"execution_count": 34,
|
| 703 |
+
"metadata": {},
|
| 704 |
+
"outputs": [],
|
| 705 |
+
"source": [
|
| 706 |
+
"@dataclass\n",
|
| 707 |
+
"class RuntimeContext:\n",
|
| 708 |
+
" db: SQLDatabase\n",
|
| 709 |
+
" vector_store: QdrantVectorStore\n",
|
| 710 |
+
"\n",
|
| 711 |
+
"@tool\n",
|
| 712 |
+
"def execute_sql(query: str) -> str:\n",
|
| 713 |
+
" \"\"\"Execute a SQLite command and return results.\"\"\"\n",
|
| 714 |
+
" runtime = get_runtime(RuntimeContext)\n",
|
| 715 |
+
" db = runtime.context.db\n",
|
| 716 |
+
" try:\n",
|
| 717 |
+
" return db.run(query)\n",
|
| 718 |
+
" except Exception as e:\n",
|
| 719 |
+
" return f\"Error: {e}\"\n",
|
| 720 |
+
"\n",
|
| 721 |
+
"\n",
|
| 722 |
+
"@tool\n",
|
| 723 |
+
"def semantic_search(query: str, topk: int = 5) -> str:\n",
|
| 724 |
+
" \"\"\"\n",
|
| 725 |
+
" Search for transactions semantically when exact category or description matches are unknown.\n",
|
| 726 |
+
" Use this to find specific merchants or types of spending (e.g., 'streaming services' or 'fast food').\n",
|
| 727 |
+
" Returns a list of matching transactions with their IDs and metadata.\n",
|
| 728 |
+
" \"\"\"\n",
|
| 729 |
+
" runtime = get_runtime(RuntimeContext)\n",
|
| 730 |
+
" vs = runtime.context.vector_store\n",
|
| 731 |
+
" results = vs.similarity_search(query, k=topk)\n",
|
| 732 |
+
" \n",
|
| 733 |
+
" output = []\n",
|
| 734 |
+
" for doc in results:\n",
|
| 735 |
+
" output.append(f\"Result: {doc.page_content} | Metadata: {doc.metadata}\")\n",
|
| 736 |
+
" \n",
|
| 737 |
+
" return \"\\n\".join(output) if output else \"No semantically similar transactions found.\"\n",
|
| 738 |
+
"\n",
|
| 739 |
+
"\n",
|
| 740 |
+
"SYSTEM = f\"\"\"You are a sophisticated financial analyst with access to both a SQLite database and a semantic search tool.\n",
|
| 741 |
+
"\n",
|
| 742 |
+
"Workflow:\n",
|
| 743 |
+
"1. **Identify the Need**: If the user's request uses vague terms (e.g., \"junk food\", \"travel stuff\") or you don't know the exact category/description name, start with `semantic_search`.\n",
|
| 744 |
+
"2. **Semantic Discovery**:\n",
|
| 745 |
+
" - Call `semantic_search(query, topk=...)`. \n",
|
| 746 |
+
" - **Pro Tip**: Use a higher `topk` (e.g., 10 or 20) if you suspect there are many relevant transactions to find.\n",
|
| 747 |
+
" - **Relevance Check**: Not all results in the `topk` list may be relevant. You have permission to select only the few that match the user's intent and discard the rest.\n",
|
| 748 |
+
" - **Reiteration Logic**: \n",
|
| 749 |
+
" - If the initial results seem too narrow, or if you suspect more relevant transactions exist but were cut off, reiterate by calling `semantic_search` again with a higher `topk` (e.g., 20, 50).\n",
|
| 750 |
+
" - **Stop Condition**: If you found a few matching transactions and are confident that's all of them (i.e., the rest of the results are clearly irrelevant), do NOT iterate further.\n",
|
| 751 |
+
"3. **Filter & Extract**:\n",
|
| 752 |
+
" - Review the results from semantic search. Manually filter out any that aren't relevant to the user's specific intent.\n",
|
| 753 |
+
" - Extract the unique `id` values from the metadata of relevant results.\n",
|
| 754 |
+
"4. **Handling No Results**:\n",
|
| 755 |
+
" - If `semantic_search` returns nothing useful, or if the results are clearly not what the user asked for (e.g., user asked for \"coffee\" but results are all \"gas stations\"), STOP.\n",
|
| 756 |
+
" - Return a clear message: \"I couldn't find any transactions related to [topic].\"\n",
|
| 757 |
+
" - **DO NOT GUESS**. It is better to say you found nothing than to sum up unrelated transactions.\n",
|
| 758 |
+
"5. **SQL Execution**:\n",
|
| 759 |
+
" - Use `execute_sql` to perform the final calculation or retrieval.\n",
|
| 760 |
+
" - Correlate results by using the discovered IDs in your query: `SELECT SUM(amount) FROM transactions WHERE id IN ('uuid1', 'uuid2', ...)`.\n",
|
| 761 |
+
" - You can also use discovered merchant names if they share a common pattern.\n",
|
| 762 |
+
"\n",
|
| 763 |
+
"Rules:\n",
|
| 764 |
+
"- **Thinking**: Explain your plan before calling any tools.\n",
|
| 765 |
+
"- **Read-only**: No modifications (INSERT/UPDATE/DELETE/etc.) to the database.\n",
|
| 766 |
+
"- **Spending Logic**: \n",
|
| 767 |
+
" - Spending = POSITIVE values (> 0). \n",
|
| 768 |
+
" - Payments/Refunds = NEGATIVE values (< 0). EXCLUDE negative values when calculating spending.\n",
|
| 769 |
+
"- **SQL Formatting**: Limit results to 5 rows for non-aggregation queries. Use `SUM()` for totals.\n",
|
| 770 |
+
"\"\"\"\n",
|
| 771 |
+
"\n",
|
| 772 |
+
"agent = create_agent(\n",
|
| 773 |
+
" model=vertex_llm,\n",
|
| 774 |
+
" tools=[execute_sql, semantic_search],\n",
|
| 775 |
+
" system_prompt=SYSTEM,\n",
|
| 776 |
+
" context_schema=RuntimeContext,\n",
|
| 777 |
+
" checkpointer=InMemorySaver(),\n",
|
| 778 |
+
")"
|
| 779 |
+
]
|
| 780 |
+
},
|
| 781 |
+
{
|
| 782 |
+
"cell_type": "code",
|
| 783 |
+
"execution_count": 35,
|
| 784 |
+
"metadata": {},
|
| 785 |
+
"outputs": [
|
| 786 |
+
{
|
| 787 |
+
"name": "stdout",
|
| 788 |
+
"output_type": "stream",
|
| 789 |
+
"text": [
|
| 790 |
+
"Match: CASHBACK BONUS REDEMPTION PYMT/STMT CRDT - What is this charge? Cashback bonus redemption pymt / stmt crdt . Cash - back from mogl 858-36958. His IP address, location , Internet provider, what browser he uses and operating system? You can find out all this using \"2IP spy\".It serves two main functions. It provides the network location of the host and identifies the host or network interface. What is the purpose of an IP address? Cashback bonus redemption pymt / stmt crdt .March is the month in which I had most number of transcations. I think this is because I visited the store next to me very frequently for small purchases. Cashback bonus redemption pymt / stmt crdt . Redemption Activity This Period is the total amount of Rewards you redeemed during the statement period and includes Miles partners, gift cards, account credits, electronic deposits and charitable donations. Cashback bonus redemption pymt / stmt crdt .Question2 : What was the frequency(number of occurences) of each category in terms of expenses made over the entire time period?. | Metadata: {'id': '026ac6fc-41ce-45e4-b8d5-c109b73c1e79', 'amount': -11.32, 'category': 'Awards and Rebate Credits', 'transaction_date': '2025-03-18 00:00:00', '_id': '598fb646d8a54e10a725ea61394e176a', '_collection_name': 'transactions'}\n",
|
| 791 |
+
"Match: CASHBACK BONUS REDEMPTION PYMT/STMT CRDT - What is this charge? Cashback bonus redemption pymt / stmt crdt . Cash - back from mogl 858-36958. His IP address, location , Internet provider, what browser he uses and operating system? You can find out all this using \"2IP spy\".It serves two main functions. It provides the network location of the host and identifies the host or network interface. What is the purpose of an IP address? Cashback bonus redemption pymt / stmt crdt .March is the month in which I had most number of transcations. I think this is because I visited the store next to me very frequently for small purchases. Cashback bonus redemption pymt / stmt crdt . Redemption Activity This Period is the total amount of Rewards you redeemed during the statement period and includes Miles partners, gift cards, account credits, electronic deposits and charitable donations. Cashback bonus redemption pymt / stmt crdt .Question2 : What was the frequency(number of occurences) of each category in terms of expenses made over the entire time period?. | Metadata: {'id': '8dc63630-aa67-4bb0-8012-70d244a2ba51', 'amount': -9.0, 'category': 'Awards and Rebate Credits', 'transaction_date': '2025-05-15 00:00:00', '_id': 'ecd0bafc8fee44e491fec958ffe14b51', '_collection_name': 'transactions'}\n",
|
| 792 |
+
"Match: CASHBACK BONUS REDEMPTION PYMT/STMT CRDT - What is this charge? Cashback bonus redemption pymt / stmt crdt . Cash - back from mogl 858-36958. His IP address, location , Internet provider, what browser he uses and operating system? You can find out all this using \"2IP spy\".It serves two main functions. It provides the network location of the host and identifies the host or network interface. What is the purpose of an IP address? Cashback bonus redemption pymt / stmt crdt .March is the month in which I had most number of transcations. I think this is because I visited the store next to me very frequently for small purchases. Cashback bonus redemption pymt / stmt crdt . Redemption Activity This Period is the total amount of Rewards you redeemed during the statement period and includes Miles partners, gift cards, account credits, electronic deposits and charitable donations. Cashback bonus redemption pymt / stmt crdt .Question2 : What was the frequency(number of occurences) of each category in terms of expenses made over the entire time period?. | Metadata: {'id': 'f93f111c-91bc-46f6-8a30-d14f4fb097af', 'amount': -2.55, 'category': 'Awards and Rebate Credits', 'transaction_date': '2025-11-06 00:00:00', '_id': '40f2d5f43c6d46e683de588e0ac3e36e', '_collection_name': 'transactions'}\n"
|
| 793 |
+
]
|
| 794 |
+
}
|
| 795 |
+
],
|
| 796 |
+
"source": [
|
| 797 |
+
"# # Initialize the store once\n",
|
| 798 |
+
"# # vector_store = sync_to_qdrant(\"money_rag.db\")\n",
|
| 799 |
+
"\n",
|
| 800 |
+
"# Test search\n",
|
| 801 |
+
"query = \"Where did I spend money on groceries or food?\"\n",
|
| 802 |
+
"results = vector_store.similarity_search(query, k=3)\n",
|
| 803 |
+
"\n",
|
| 804 |
+
"for doc in results:\n",
|
| 805 |
+
" print(f\"Match: {doc.page_content} | Metadata: {doc.metadata}\")"
|
| 806 |
+
]
|
| 807 |
+
},
|
| 808 |
+
{
|
| 809 |
+
"cell_type": "code",
|
| 810 |
+
"execution_count": 36,
|
| 811 |
+
"metadata": {},
|
| 812 |
+
"outputs": [
|
| 813 |
+
{
|
| 814 |
+
"name": "stdout",
|
| 815 |
+
"output_type": "stream",
|
| 816 |
+
"text": [
|
| 817 |
+
"================================\u001b[1m Human Message \u001b[0m=================================\n",
|
| 818 |
+
"\n",
|
| 819 |
+
"is there anything i need to be worried about in my spending over the last 2 months?\n",
|
| 820 |
+
"==================================\u001b[1m Ai Message \u001b[0m==================================\n",
|
| 821 |
+
"Tool Calls:\n",
|
| 822 |
+
" execute_sql (53fdebc4-4abc-4bda-ac02-e1f2f89b8140)\n",
|
| 823 |
+
" Call ID: 53fdebc4-4abc-4bda-ac02-e1f2f89b8140\n",
|
| 824 |
+
" Args:\n",
|
| 825 |
+
" query: SELECT SUM(amount) FROM transactions WHERE transaction_date >= date('now', '-2 months') AND amount > 0\n",
|
| 826 |
+
"=================================\u001b[1m Tool Message \u001b[0m=================================\n",
|
| 827 |
+
"Name: execute_sql\n",
|
| 828 |
+
"\n",
|
| 829 |
+
"[(2028.51,)]\n",
|
| 830 |
+
"==================================\u001b[1m Ai Message \u001b[0m==================================\n",
|
| 831 |
+
"\n",
|
| 832 |
+
"Your total spending over the last two months is $2028.51.\n",
|
| 833 |
+
"\n",
|
| 834 |
+
"To tell you if there's anything to \"worry about,\" I need a bit more information. Would you like me to:\n",
|
| 835 |
+
"1. Break down your spending by category?\n",
|
| 836 |
+
"2. Identify any unusually large transactions?\n",
|
| 837 |
+
"3. Compare your spending to previous months?\n",
|
| 838 |
+
"4. Look for any recurring subscriptions?\n"
|
| 839 |
+
]
|
| 840 |
+
}
|
| 841 |
+
],
|
| 842 |
+
"source": [
|
| 843 |
+
"question = \"is there anything i need to be worried about in my spending over the last 2 months?\"\n",
|
| 844 |
+
"steps = []\n",
|
| 845 |
+
"\n",
|
| 846 |
+
"for step in agent.stream(\n",
|
| 847 |
+
" {\"messages\": [{\"role\": \"user\", \"content\": question}]},\n",
|
| 848 |
+
" {\"configurable\": {\"thread_id\": \"1\"}},\n",
|
| 849 |
+
" stream_mode=\"values\",\n",
|
| 850 |
+
" context=RuntimeContext(db=db, vector_store=vector_store)\n",
|
| 851 |
+
"):\n",
|
| 852 |
+
" step[\"messages\"][-1].pretty_print()\n",
|
| 853 |
+
" steps.append(step)"
|
| 854 |
+
]
|
| 855 |
+
},
|
| 856 |
+
{
|
| 857 |
+
"cell_type": "code",
|
| 858 |
+
"execution_count": null,
|
| 859 |
+
"metadata": {},
|
| 860 |
+
"outputs": [
|
| 861 |
+
{
|
| 862 |
+
"name": "stdout",
|
| 863 |
+
"output_type": "stream",
|
| 864 |
+
"text": [
|
| 865 |
+
"💬 Chat with your financial data! (Type 'exit' to stop)\n"
|
| 866 |
+
]
|
| 867 |
+
}
|
| 868 |
+
],
|
| 869 |
+
"source": [
|
| 870 |
+
"# ...existing code...\n",
|
| 871 |
+
"# Interactive Chat Loop\n",
|
| 872 |
+
"print(\"💬 Chat with your financial data! (Type 'exit' to stop)\")\n",
|
| 873 |
+
"\n",
|
| 874 |
+
"while True:\n",
|
| 875 |
+
" try:\n",
|
| 876 |
+
" user_input = input(\"User: \")\n",
|
| 877 |
+
" if user_input.lower() in [\"exit\", \"quit\", \"q\"]:\n",
|
| 878 |
+
" print(\"Goodbye!\")\n",
|
| 879 |
+
" break\n",
|
| 880 |
+
" \n",
|
| 881 |
+
" print(\"\\n\" + \"-\"*50)\n",
|
| 882 |
+
" \n",
|
| 883 |
+
" # Stream the agent's response\n",
|
| 884 |
+
" for step in agent.stream(\n",
|
| 885 |
+
" {\"messages\": [{\"role\": \"user\", \"content\": user_input}]},\n",
|
| 886 |
+
" {\"configurable\": {\"thread_id\": \"1\"}}, # Keeps memory of the conversation\n",
|
| 887 |
+
" stream_mode=\"values\",\n",
|
| 888 |
+
" context=RuntimeContext(db=db, vector_store=vector_store)\n",
|
| 889 |
+
" ):\n",
|
| 890 |
+
" step[\"messages\"][-1].pretty_print()\n",
|
| 891 |
+
" \n",
|
| 892 |
+
" print(\"-\" * 50 + \"\\n\")\n",
|
| 893 |
+
" \n",
|
| 894 |
+
" except KeyboardInterrupt:\n",
|
| 895 |
+
" print(\"\\nGoodbye!\")\n",
|
| 896 |
+
" break"
|
| 897 |
+
]
|
| 898 |
+
},
|
| 899 |
+
{
|
| 900 |
+
"cell_type": "code",
|
| 901 |
+
"execution_count": null,
|
| 902 |
+
"metadata": {},
|
| 903 |
+
"outputs": [],
|
| 904 |
+
"source": []
|
| 905 |
+
},
|
| 906 |
+
{
|
| 907 |
+
"cell_type": "code",
|
| 908 |
+
"execution_count": null,
|
| 909 |
+
"metadata": {},
|
| 910 |
+
"outputs": [],
|
| 911 |
+
"source": []
|
| 912 |
+
},
|
| 913 |
{
|
| 914 |
"cell_type": "code",
|
| 915 |
"execution_count": null,
|
requirements.txt
CHANGED
|
@@ -6,4 +6,6 @@ tabulate
|
|
| 6 |
langchain-google-vertexai
|
| 7 |
mcp
|
| 8 |
langchain-mcp
|
| 9 |
-
langchain-community
|
|
|
|
|
|
|
|
|
| 6 |
langchain-google-vertexai
|
| 7 |
mcp
|
| 8 |
langchain-mcp
|
| 9 |
+
langchain-community
|
| 10 |
+
qdrant-client
|
| 11 |
+
langchain-qdrant
|