ahmzakif commited on
Commit
fd99b61
·
verified ·
1 Parent(s): c8aeeaa

feat: add new project

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +37 -0
  2. .gitattributes +4 -0
  3. .gitignore +58 -0
  4. .gradio/certificate.pem +31 -0
  5. Dockerfile +34 -0
  6. LICENSE +201 -0
  7. QUICKSTART.md +135 -0
  8. README.md +354 -13
  9. TECHNICAL_ASSESSMENT.md +645 -0
  10. app.py +757 -0
  11. data/Bhatla.pdf +3 -0
  12. data/EBA_ECB 2024 Report on Payment Fraud.pdf +3 -0
  13. data/fraudTest.csv +3 -0
  14. data/fraudTrain.csv +3 -0
  15. docker-compose.yml +27 -0
  16. main.py +112 -0
  17. requirements.txt +17 -0
  18. src/__init__.py +5 -0
  19. src/__pycache__/__init__.cpython-311.pyc +0 -0
  20. src/api/__init__.py +4 -0
  21. src/api/__pycache__/__init__.cpython-311.pyc +0 -0
  22. src/api/__pycache__/routes.cpython-311.pyc +0 -0
  23. src/api/routes.py +126 -0
  24. src/config/__init__.py +8 -0
  25. src/config/__pycache__/__init__.cpython-311.pyc +0 -0
  26. src/config/__pycache__/config.cpython-311.pyc +0 -0
  27. src/config/config.py +46 -0
  28. src/data/__init__.py +8 -0
  29. src/data/__pycache__/__init__.cpython-311.pyc +0 -0
  30. src/data/__pycache__/processor.cpython-311.pyc +0 -0
  31. src/data/processor.py +108 -0
  32. src/llm/__init__.py +8 -0
  33. src/llm/__pycache__/__init__.cpython-311.pyc +0 -0
  34. src/llm/__pycache__/groq_client.cpython-311.pyc +0 -0
  35. src/llm/groq_client.py +81 -0
  36. src/rag/__init__.py +9 -0
  37. src/rag/__pycache__/__init__.cpython-311.pyc +0 -0
  38. src/rag/__pycache__/csv_document_generator.cpython-311.pyc +0 -0
  39. src/rag/__pycache__/document_loader.cpython-311.pyc +0 -0
  40. src/rag/__pycache__/vector_store.cpython-311.pyc +0 -0
  41. src/rag/csv_document_generator.py +278 -0
  42. src/rag/document_loader.py +117 -0
  43. src/rag/vector_store.py +111 -0
  44. src/schemas/__init__.py +18 -0
  45. src/schemas/__pycache__/__init__.cpython-311.pyc +0 -0
  46. src/schemas/__pycache__/fraud.cpython-311.pyc +0 -0
  47. src/schemas/fraud.py +62 -0
  48. src/services/__init__.py +7 -0
  49. src/services/__pycache__/__init__.cpython-311.pyc +0 -0
  50. src/services/__pycache__/fraud_analyzer.cpython-311.pyc +0 -0
.dockerignore ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ env/
8
+ build/
9
+ develop-eggs/
10
+ dist/
11
+ downloads/
12
+ eggs/
13
+ .eggs/
14
+ lib/
15
+ lib64/
16
+ parts/
17
+ sdist/
18
+ var/
19
+ wheels/
20
+ *.egg-info/
21
+ .installed.cfg
22
+ *.egg
23
+ venv/
24
+ .env
25
+
26
+ # Project Specific
27
+ logs/
28
+ chroma_db/
29
+ vector_store/
30
+ .vscode/
31
+ .idea/
32
+ .git/
33
+ .gitignore
34
+
35
+ # Large data files (handled via volumes in docker-compose)
36
+ data/fraudTrain.csv
37
+ data/fraudTest.csv
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/Bhatla.pdf filter=lfs diff=lfs merge=lfs -text
37
+ data/EBA_ECB[[:space:]]2024[[:space:]]Report[[:space:]]on[[:space:]]Payment[[:space:]]Fraud.pdf filter=lfs diff=lfs merge=lfs -text
38
+ data/fraudTest.csv filter=lfs diff=lfs merge=lfs -text
39
+ data/fraudTrain.csv filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+ .gradio/
23
+
24
+ # Virtual Environment
25
+ venv/
26
+ env/
27
+ ENV/
28
+ .venv
29
+
30
+ # IDE
31
+ .vscode/
32
+ .idea/
33
+ *.swp
34
+ *.swo
35
+ *~
36
+
37
+ # Environment variables
38
+ .env
39
+ .env.local
40
+
41
+ # Data (ignore large CSV and PDF files)
42
+ data/*.csv
43
+ data/*.pdf
44
+
45
+ # Vector store
46
+ chroma_db/
47
+ *.db
48
+
49
+ # Logs
50
+ *.log
51
+ logs/
52
+
53
+ # OS
54
+ .DS_Store
55
+ Thumbs.db
56
+
57
+
58
+
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use an official Python runtime as a parent image
2
+ FROM python:3.10-slim
3
+
4
+ # Set environment variables
5
+ ENV PYTHONDONTWRITEBYTECODE=1
6
+ ENV PYTHONUNBUFFERED=1
7
+ ENV PYTHONPATH=/app
8
+
9
+ # Set the working directory in the container
10
+ WORKDIR /app
11
+
12
+ # Install system dependencies
13
+ RUN apt-get update && apt-get install -y --no-install-recommends \
14
+ build-essential \
15
+ curl \
16
+ && rm -rf /var/lib/apt/lists/*
17
+
18
+ # Copy the requirements file into the container at /app
19
+ COPY requirements.txt .
20
+
21
+ # Install any needed packages specified in requirements.txt
22
+ RUN pip install --no-cache-dir -r requirements.txt
23
+
24
+ # Copy the rest of the application code into the container at /app
25
+ COPY . .
26
+
27
+ # Create directory for persistent vector store
28
+ RUN mkdir -p /app/chroma_db
29
+
30
+ # Expose ports for Gradio (7860) and FastAPI (8000)
31
+ EXPOSE 7860 8000
32
+
33
+ # Default command (can be overridden in docker-compose)
34
+ CMD ["python", "app.py"]
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
QUICKSTART.md ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Quick Start Guide
2
+
3
+ Panduan cepat untuk menjalankan aplikasi Fraud Detection menggunakan LangChain dan Groq.
4
+
5
+ ## Prerequisites
6
+
7
+ 1. Python 3.10 atau lebih tinggi
8
+ 2. Groq API Key (dapatkan di https://console.groq.com/)
9
+
10
+ ## Setup Cepat
11
+
12
+ ### 1. Install Dependencies
13
+
14
+ ```bash
15
+ pip install -r requirements.txt
16
+ ```
17
+
18
+ ### 2. Setup Environment Variable
19
+
20
+ Buat file `.env` di root directory:
21
+
22
+ ```env
23
+ GROQ_API_KEY=your_groq_api_key_here
24
+ ```
25
+
26
+ Atau export sebagai environment variable:
27
+
28
+ ```bash
29
+ # Windows PowerShell
30
+ $env:GROQ_API_KEY="your_groq_api_key_here"
31
+
32
+ # Linux/Mac
33
+ export GROQ_API_KEY="your_groq_api_key_here"
34
+ ```
35
+
36
+ ### 3. Jalankan Server
37
+
38
+ ```bash
39
+ python main.py
40
+ ```
41
+
42
+ Server akan berjalan di `http://localhost:8000`
43
+
44
+ ## Menggunakan API
45
+
46
+ ### 1. Melalui Browser
47
+
48
+ Buka `http://localhost:8000/docs` untuk melihat dokumentasi interaktif Swagger UI.
49
+
50
+ ### 2. Melalui cURL
51
+
52
+ #### Health Check
53
+ ```bash
54
+ curl http://localhost:8000/api/v1/health
55
+ ```
56
+
57
+ #### Analisis Transaksi
58
+ ```bash
59
+ curl -X POST "http://localhost:8000/api/v1/analyze" \
60
+ -H "Content-Type: application/json" \
61
+ -d '{
62
+ "transaction_id": 0,
63
+ "use_rag": true
64
+ }'
65
+ ```
66
+
67
+ #### Analisis dengan Data Langsung
68
+ ```bash
69
+ curl -X POST "http://localhost:8000/api/v1/analyze" \
70
+ -H "Content-Type: application/json" \
71
+ -d '{
72
+ "transaction_data": {
73
+ "merchant": "Suspicious Merchant",
74
+ "category": "grocery_pos",
75
+ "amt": 5000.00,
76
+ "city": "Jakarta",
77
+ "state": "DKI"
78
+ },
79
+ "use_rag": true
80
+ }'
81
+ ```
82
+
83
+ ### 3. Menggunakan Python Script
84
+
85
+ Jalankan contoh penggunaan:
86
+
87
+ ```bash
88
+ python test/example_usage.py
89
+ ```
90
+
91
+ ## Struktur Kode
92
+
93
+ ```
94
+ ├── main.py # FastAPI application
95
+ ├── requirements.txt # Dependencies
96
+ ├── test/example_usage.py # Contoh penggunaan
97
+ └── src/
98
+ ├── api/ # API routes
99
+ ├── config/ # Konfigurasi aplikasi
100
+ ├── data/ # Data processing
101
+ ├── llm/ # LangChain Groq integration
102
+ ├── rag/ # RAG system
103
+ ├── schemas/ # Pydantic models
104
+ └── services/ # Business logic
105
+ ```
106
+
107
+ ## Fitur Utama
108
+
109
+ 1. **LLM Integration**: Menggunakan Groq dengan LangChain
110
+ 2. **RAG System**: Menggunakan dokumen PDF sebagai konteks
111
+ 3. **RESTful API**: FastAPI dengan dokumentasi otomatis
112
+ 4. **Modular Design**: Kode yang mudah di-maintain dan di-extend
113
+
114
+ ## Troubleshooting
115
+
116
+ ### Error: "Groq API key is required"
117
+ - Pastikan `GROQ_API_KEY` sudah di-set di environment variable atau file `.env`
118
+
119
+ ### Error: "PDF file not found"
120
+ - Pastikan file PDF ada di folder `data/`
121
+ - Atau sesuaikan path di `src/config/config.py`
122
+
123
+ ### Dataset terlalu besar
124
+ - Aplikasi secara default hanya memuat sample data (10,000 rows untuk training, 1,000 untuk test)
125
+ - Untuk memuat full dataset, edit `src/data/processor.py` dan hapus parameter `nrows`
126
+
127
+ ## Next Steps
128
+
129
+ 1. Baca dokumentasi lengkap di `README.md`
130
+ 2. Explore API documentation di `http://localhost:8000/docs`
131
+ 3. Customize konfigurasi di `src/config/config.py`
132
+ 4. Extend functionality sesuai kebutuhan
133
+
134
+
135
+
README.md CHANGED
@@ -1,13 +1,354 @@
1
- ---
2
- title: Fraud Chatbot
3
- emoji: 🚀
4
- colorFrom: green
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 6.3.0
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Fraud Detection Chatbot
2
+
3
+ AI-powered fraud detection system menggunakan LangChain, Groq, dan RAG (Retrieval Augmented Generation) dengan Gradio interface dan FastAPI backend.
4
+
5
+ ## 🎯 Fitur Utama
6
+
7
+ ### 1. **Gradio Web Interface** (`app.py`)
8
+
9
+ - **Chat with Fraud Expert**: Tanya jawab interaktif dengan inline citations & **Response Quality Scoring**
10
+ - **Analyze by Transaction ID**: Analisis data historis lengkap (semua kolom CSV) berdasarkan ID
11
+ - **Analyze Manual Transaction**: Input manual transaction details, termasuk **Advanced Optional Fields** (Age, Gender, Location)
12
+ - **Dataset Summary**: Statistik lengkap dari 1.29M+ total transaksi
13
+
14
+ ### 2. **RAG System dengan Dual Data Sources**
15
+
16
+ - **PDF Documents**: Research papers tentang fraud detection
17
+ - Bhatla.pdf
18
+ - EBA_ECB 2024 Report on Payment Fraud.pdf
19
+ - **CSV Insights**: Extracted patterns dari fraudTrain.csv
20
+ - Fraud patterns by category (14 documents)
21
+ - Merchant risk profiles (20 documents)
22
+ - Location-based insights (15 documents)
23
+ - Statistical summaries (2 documents)
24
+
25
+ ### 3. **FastAPI REST API**
26
+
27
+ - RESTful endpoints dengan dokumentasi otomatis
28
+ - Batch analysis support
29
+ - CORS enabled untuk frontend integration
30
+
31
+ ### 4. **Inline Source Citations**
32
+
33
+ - LLM responses include `[Source X]` citations
34
+ - Source reference list at the end
35
+ - Transparency dan verifikasi informasi
36
+
37
+ ## 📁 Struktur Proyek
38
+
39
+ ```
40
+ .
41
+ ├── app.py # Gradio web interface (MAIN)
42
+ ├── main.py # FastAPI application
43
+ ├── requirements.txt # Dependencies
44
+ ├── README.md # Dokumentasi
45
+ ├── QUICKSTART.md # Quick start guide
46
+ ├── data/ # Data dan dokumen
47
+ │ ├── fraudTrain.csv # Training dataset (351 MB)
48
+ │ ├── fraudTest.csv # Test dataset
49
+ │ ├── Bhatla.pdf # Research paper
50
+ │ └── EBA_ECB 2024 Report on Payment Fraud.pdf
51
+ ├── src/
52
+ │ ├── api/ # API routes
53
+ │ │ └── routes.py
54
+ │ ├── config/ # Configuration
55
+ │ │ ├── __init__.py
56
+ │ │ └── config.py
57
+ │ ├── data/ # Data processing
58
+ │ │ └── processor.py
59
+ │ ├── llm/ # LLM integration
60
+ │ │ └── groq_client.py
61
+ │ ├── rag/ # RAG system
62
+ │ │ ├── document_loader.py
63
+ │ │ ├── vector_store.py
64
+ │ │ └── csv_document_generator.py # NEW: CSV insights
65
+ │ ├── schemas/ # Pydantic schemas
66
+ │ │ └── fraud.py
67
+ │ └── services/ # Business logic
68
+ │ └── fraud_analyzer.py
69
+ └── test/ # Test files
70
+ ├── example_usage.py
71
+ └── test_vector_store.py
72
+ ```
73
+
74
+ ## 🚀 Instalasi
75
+
76
+ ### 1. Clone & Setup Environment
77
+
78
+ ```bash
79
+ # Create virtual environment
80
+ python -m venv venv
81
+
82
+ # Activate
83
+ # Windows:
84
+ venv\Scripts\activate
85
+ # Linux/Mac:
86
+ source venv/bin/activate
87
+ ```
88
+
89
+ ### 2. Install Dependencies
90
+
91
+ ```bash
92
+ pip install -r requirements.txt
93
+ ```
94
+
95
+ ### 3. Setup Environment Variables
96
+
97
+ Buat file `.env` di root directory:
98
+
99
+ ```env
100
+ GROQ_API_KEY=your_groq_api_key_here
101
+ ```
102
+
103
+ ## 💻 Penggunaan
104
+
105
+ ### Gradio Web Interface (Recommended)
106
+
107
+ ```bash
108
+ python app.py
109
+ ```
110
+
111
+ Interface akan terbuka di:
112
+
113
+ - Local: `http://localhost:7860`
114
+ - Public: Shareable link (expires in 72 hours)
115
+
116
+ ### FastAPI Backend
117
+
118
+ ```bash
119
+ python main.py
120
+ ```
121
+
122
+ API akan berjalan di `http://localhost:8000`
123
+
124
+ **API Documentation:**
125
+
126
+ - Swagger UI: `http://localhost:8000/docs`
127
+ - ReDoc: `http://localhost:8000/redoc`
128
+
129
+ ### Docker (Recommended for Deployment)
130
+
131
+ Jika Anda memiliki Docker dan Docker Compose terinstal:
132
+
133
+ ```bash
134
+ # Build dan jalankan semua service (UI & API)
135
+ docker-compose up --build -d
136
+ ```
137
+
138
+ Service akan tersedia di:
139
+
140
+ - **Gradio UI**: `http://localhost:7860`
141
+ - **FastAPI Docs**: `http://localhost:8000/docs`
142
+
143
+ Untuk mematikan service:
144
+
145
+ ```bash
146
+ docker-compose down
147
+ ```
148
+
149
+ ## 📖 Contoh Penggunaan
150
+
151
+ ### Gradio Interface
152
+
153
+ 1. **Chat with Fraud Expert**
154
+
155
+ - Enable "Use RAG" untuk enhanced responses
156
+ - Tanya: "What are fraud patterns in grocery transactions?"
157
+ - Response akan include inline citations `[Source 1]`
158
+ 2. **Analyze Transaction**
159
+
160
+ - Input Transaction ID atau manual data
161
+ - Enable RAG untuk analysis dengan context
162
+ - Lihat detailed fraud analysis dengan sources
163
+ 3. **Dataset Summary**
164
+
165
+ - View transaction statistics
166
+ - See RAG knowledge base info (243 documents total)
167
+
168
+ ### API Endpoints
169
+
170
+ #### 1. Health Check
171
+
172
+ ```bash
173
+ curl http://localhost:8000/api/v1/health
174
+ ```
175
+
176
+ #### 2. Analyze Transaction (by ID)
177
+
178
+ ```bash
179
+ curl -X POST "http://localhost:8000/api/v1/analyze" \
180
+ -H "Content-Type: application/json" \
181
+ -d '{
182
+ "transaction_id": 0,
183
+ "use_rag": true
184
+ }'
185
+ ```
186
+
187
+ #### 3. Analyze Transaction (Manual Data)
188
+
189
+ ```bash
190
+ curl -X POST "http://localhost:8000/api/v1/analyze" \
191
+ -H "Content-Type: application/json" \
192
+ -d '{
193
+ "transaction_data": {
194
+ "merchant": "Amazon",
195
+ "category": "shopping_net",
196
+ "amt": 150.00,
197
+ "city": "Jakarta",
198
+ "state": "DKI"
199
+ },
200
+ "use_rag": true
201
+ }'
202
+ ```
203
+
204
+ #### 4. Get Dataset Summary
205
+
206
+ ```bash
207
+ curl http://localhost:8000/api/v1/summary
208
+ ```
209
+
210
+ #### 5. Batch Analysis
211
+
212
+ ```bash
213
+ curl -X POST "http://localhost:8000/api/v1/batch-analyze?transaction_ids=[0,1,2]&use_rag=true"
214
+ ```
215
+
216
+ ## 🏗️ Arsitektur
217
+
218
+ ### RAG System Flow
219
+
220
+ ```
221
+ User Query
222
+ ↓
223
+ Vector Store (Chroma)
224
+ ↓
225
+ Retrieve Top K Documents (PDF + CSV insights)
226
+ ↓
227
+ Format with Source Numbers [Source 1], [Source 2]
228
+ ↓
229
+ LLM (Groq) with Context
230
+ ↓
231
+ Response with Inline Citations
232
+ ↓
233
+ Source Reference List
234
+ ```
235
+
236
+ ### Komponen Utama
237
+
238
+ 1. **GroqClient** (`src/llm/groq_client.py`):
239
+
240
+ - Groq LLM integration via LangChain
241
+ - Model: `meta-llama/llama-4-maverick-17b-128e-instruct`
242
+ - Max tokens: 8192
243
+
244
+ - **ResponseQualityScorer** (`src/services/quality_scorer.py`):
245
+ - Automated evaluation of LLM responses
246
+ - Metrics: Relevance, Completeness, Citation Quality, Clarity
247
+
248
+ 2. **DocumentLoader** (`src/rag/document_loader.py`):
249
+
250
+ - Load PDF documents dengan PyPDFLoader
251
+ - Load CSV insights via CSVDocumentGenerator
252
+ - Text splitting dengan RecursiveCharacterTextSplitter
253
+ 3. **CSVDocumentGenerator** (`src/rag/csv_document_generator.py`):
254
+
255
+ - Extract fraud patterns by category
256
+ - Generate merchant risk profiles
257
+ - Create location-based insights
258
+ - Statistical summaries
259
+ 4. **VectorStore** (`src/rag/vector_store.py`):
260
+
261
+ - Chroma vector database
262
+ - HuggingFace embeddings (sentence-transformers/all-MiniLM-L6-v2)
263
+ - Similarity search untuk RAG
264
+ 5. **FraudAnalyzer** (`src/services/fraud_analyzer.py`):
265
+
266
+ - Main service untuk fraud analysis
267
+ - RAG chain dengan inline citation instructions
268
+ - Batch analysis support
269
+
270
+ ## ⚙️ Konfigurasi
271
+
272
+ File `src/config/config.py`:
273
+
274
+ ```python
275
+ # Groq API
276
+ max_tokens: int = 8192
277
+ groq_model: str = "meta-llama/llama-4-maverick-17b-128e-instruct"
278
+
279
+ # RAG
280
+ chunk_size: int = 1000
281
+ chunk_overlap: int = 200
282
+
283
+ # Data Paths
284
+ data_dir: Path = Path("data")
285
+ train_data_path: Path = data_dir / "fraudTrain.csv"
286
+ pdf_dir: Path = data_dir
287
+ ```
288
+
289
+ ## 🎨 UI Features
290
+
291
+ - **Modern Design**: Inter font, clean layout
292
+ - **Vertical Layout**: Analysis results appear below inputs
293
+ - **Response Quality Scoring**: Otomatis menilai kualitas jawaban (0-100)
294
+ - **Advanced Manual Analysis**: Optional fields collapsible section untuk high-precision simulation
295
+ - **Clean Terminal**: Warnings suppressed untuk better UX
296
+
297
+ ## 📊 Dataset
298
+
299
+ - **fraudTrain.csv**: 351 MB, 1.29M+ transactions
300
+ - **CSV Insights**: 1,050,000 rows di-load untuk RAG generation
301
+ - **Dataset Stats**: Menampilkan statistik dari full 1.29M rows
302
+
303
+ ## 🔍 RAG Knowledge Base
304
+
305
+ **Total: 243 documents**
306
+
307
+ - **PDF Documents**: 187 chunks
308
+
309
+ - Bhatla.pdf: 67 chunks
310
+ - EBA_ECB 2024 Report: 120 chunks
311
+ - **CSV Insights**: 51 documents
312
+
313
+ - Fraud Pattern Analysis: 14
314
+ - Merchant Profiles: 20
315
+ - Location Insights: 15
316
+ - Statistical Summaries: 2
317
+
318
+ ## 🧪 Testing
319
+
320
+ ```bash
321
+ # Run example usage
322
+ python test/example_usage.py
323
+
324
+ # Run vector store test
325
+ python test/test_vector_store.py
326
+ ```
327
+
328
+ ## 📝 Development
329
+
330
+ ### Code Style
331
+
332
+ - PEP 8 compliant
333
+ - Type hints untuk semua functions
334
+ - Google-style docstrings
335
+ - Modular architecture
336
+
337
+ ### Best Practices
338
+
339
+ - Clean code dengan separation of concerns
340
+ - No unused functions (cleaned up)
341
+ - Proper error handling
342
+ - Comprehensive logging
343
+
344
+ ## 🚨 Catatan Penting & Troubleshooting
345
+
346
+ 1. **API Key**: Pastikan `GROQ_API_KEY` sudah benar di file `.env`.
347
+ 2. **Ukuran Dataset**: Dataset asli sangat besar (1.29M+ rows). Sistem menggunakan sampling 1M+ rows untuk insight RAG agar performa tetap terjaga.
348
+ 3. **Dependency Conflict**: Jika menginstal manual dan terjadi konflik versi `huggingface-hub`, gunakan versi `>=0.27.0` untuk kompatibilitas dengan Gradio 6.
349
+ 4. **Volume Mounting**: Saat menggunakan Docker, folder `data/` dan `chroma_db/` akan di-mount ke container secara otomatis.
350
+ 5. **ChromaDB**: Error telemetry ChromaDB dapat diabaikan, fitur pencarian tetap berfungsi normal.
351
+
352
+ ## 📄 License
353
+
354
+ Apache License 2.0 (lihat file `LICENSE`)
TECHNICAL_ASSESSMENT.md ADDED
@@ -0,0 +1,645 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Technical Requirements Assessment
2
+
3
+ This document maps each technical requirement to its concrete implementation in the codebase, with file and line references as evidence.
4
+
5
+ ---
6
+
7
+ ## Requirements Checklist
8
+
9
+ ### ✅ 1. Accuracy: Akurasi dan Relevansi Response
10
+
11
+ #### Implementation Details:
12
+
13
+ **A. RAG System dengan Dual Data Sources**
14
+
15
+ - **Location:** `src/rag/vector_store.py`, `src/rag/document_loader.py`
16
+ - **Implementation:**
17
+ ```python
18
+ # Vector Store dengan Chroma DB
19
+ # File: src/rag/vector_store.py (line 34-37)
20
+ self.embeddings = HuggingFaceEmbeddings(
21
+ model_name="sentence-transformers/all-MiniLM-L6-v2",
22
+ model_kwargs={"device": "cpu"},
23
+ )
24
+ ```
25
+
26
+ **B. Data Sources (243 Documents Total)**
27
+
28
+ 1. **PDF Documents (187 chunks)**
29
+
30
+ - Bhatla.pdf: 67 chunks
31
+ - EBA_ECB 2024 Report: 120 chunks
32
+ 2. **CSV Insights (51 documents)**
33
+
34
+ - Fraud Pattern Analysis: 14 documents
35
+ - Merchant Profiles: 20 documents
36
+ - Location Insights: 15 documents
37
+ - Statistical Summaries: 2 documents
38
+
39
+ **C. Inline Source Citations**
40
+
41
+ - **Location:** `app.py` (line 328-337)
42
+ - **Format:** `[Source X]` inline dalam response
43
+ - **Verification:** Source reference list di akhir response
44
+
45
+ **D. Transaction Query Detection**
46
+
47
+ - **Location:** `app.py` (line 284-307)
48
+ - **Implementation:**
49
+ ```python
50
+ # Auto-detect transaction ID dalam query
51
+ transaction_query = re.search(r'transaction\s+(?:id\s+)?(\d+)', message.lower())
52
+ # Fetch actual transaction data
53
+ transaction = data_processor.get_transaction_summary(transaction_id)
54
+ ```
55
+
56
+ **E. Merchant Name Cleaning (Artifact Removal)**
57
+
58
+ - **Location:** `src/data/processor.py` (line 39-42), `src/rag/csv_document_generator.py` (line 35-38)
59
+ - **Problem:** All merchants in the synthetic dataset have a "fraud_" prefix, leading to false positive analysis by the LLM.
60
+ - **Fix:** Automated removal of the "fraud_" prefix during data ingestion and LLM prompting instructions to ignore the artifact.
61
+
62
+ **F. Deterministic Responses**
63
+
64
+ - **Location:** `src/llm/groq_client.py` (line 23)
65
+ - **Setting:** `temperature: float = 0`
66
+
67
+ **Evidence:**
68
+
69
+ - ✅ RAG retrieves top-k relevant documents
70
+ - ✅ Inline citations untuk transparency
71
+ - ✅ Actual transaction data untuk specific queries
72
+ - ✅ Merchant name cleaning untuk menghilangkan "false positive" indikator
73
+ - ✅ Temperature 0 untuk consistent responses
74
+
75
+ ---
76
+
77
+ ### ✅ 2. Coverage: Adaptabilitas untuk Berbagai Pertanyaan
78
+
79
+ #### Implementation Details:
80
+
81
+ **A. Multiple Interfaces**
82
+
83
+ - **Location:** `app.py`
84
+ - **Interfaces:**
85
+ 1. Chat with Fraud Expert (line 277-403)
86
+ 2. Analyze by Transaction ID (line 106-138)
87
+ 3. Analyze Manual Transaction (line 141-178)
88
+ 4. Dataset Summary (line 182-274)
89
+
90
+ **B. Flexible Query Handling**
91
+
92
+ - **Natural Language Transaction Queries:**
93
+ ```python
94
+ # Supports queries like:
95
+ # - "is transaction id 996746 fraud?"
96
+ # - "analyze transaction 12345"
97
+ # - "what about transaction id 999?"
98
+ ```
99
+
100
+ **C. RAG Coverage Across Domains**
101
+
102
+ - Fraud patterns by category (14 categories)
103
+ - Merchant risk profiles (20 merchants)
104
+ - Geographic insights (15 states)
105
+ - Statistical patterns (overall + by amount range)
106
+
107
+ **D. API Endpoints**
108
+
109
+ - **Location:** `src/api/routes.py`
110
+ - **Endpoints:**
111
+ - `POST /api/v1/analyze` - Single transaction
112
+ - `POST /api/v1/batch-analyze` - Multiple transactions
113
+ - `GET /api/v1/summary` - Dataset overview
114
+ - `GET /api/v1/health` - Health check
115
+
116
+ **Evidence:**
117
+
118
+ - ✅ 4 different interaction modes
119
+ - ✅ Handles general + specific queries
120
+ - ✅ Supports 1.2M+ transactions
121
+ - ✅ REST API untuk programmatic access
122
+
123
+ ---
124
+
125
+ ### ✅ 3. Readability: Struktur Kode dan Naming
126
+
127
+ #### Implementation Details:
128
+
129
+ **A. Modular Architecture**
130
+
131
+ ```
132
+ src/
133
+ ├── api/ # REST API layer
134
+ │ └── routes.py
135
+ ├── config/ # Configuration management
136
+ │ ├── __init__.py
137
+ │ └── config.py
138
+ ├── data/ # Data processing
139
+ │ └── processor.py
140
+ ├── llm/ # LLM integration
141
+ │ └── groq_client.py
142
+ ├── rag/ # RAG system
143
+ │ ├── document_loader.py
144
+ │ ├── vector_store.py
145
+ │ └── csv_document_generator.py
146
+ ├── schemas/ # Pydantic models
147
+ │ └── fraud.py
148
+ └── services/ # Business logic
149
+ ├── fraud_analyzer.py
150
+ └── quality_scorer.py
151
+ ```
152
+
153
+ **B. Naming Conventions**
154
+
155
+ - **Classes:** `PascalCase`
156
+ - `FraudAnalyzer`, `VectorStore`, `ResponseQualityScorer`
157
+ - **Functions:** `snake_case`
158
+ - `analyze_transaction()`, `load_csv_insights()`, `score_response()`
159
+ - **Constants:** `UPPER_CASE` in config
160
+ - `GROQ_API_KEY`, `MAX_TOKENS`
161
+
162
+ **C. Type Hints (100% Coverage)**
163
+
164
+ ```python
165
+ # Example: src/services/fraud_analyzer.py
166
+ def analyze_transaction(
167
+ self,
168
+ transaction_id: Optional[int] = None,
169
+ transaction_data: Optional[Dict] = None,
170
+ use_rag: bool = True,
171
+ ) -> Dict:
172
+ ```
173
+
174
+ **D. Documentation**
175
+
176
+ - **Docstrings:** Google-style untuk semua functions
177
+ - **Comments:** Inline comments untuk complex logic
178
+ - **README.md:** Comprehensive project documentation
179
+
180
+ **Evidence:**
181
+
182
+ - ✅ Clear separation of concerns
183
+ - ✅ Consistent naming across codebase
184
+ - ✅ Type hints untuk IDE support
185
+ - ✅ Well-documented code
186
+
187
+ ---
188
+
189
+ ### ✅ 4. Exception Handling: Error Handling & Edge Cases
190
+
191
+ #### Implementation Details:
192
+
193
+ **A. Transaction Not Found**
194
+
195
+ - **Location:** `src/data/processor.py` (line 60-62)
196
+
197
+ ```python
198
+ if transaction.empty:
199
+ raise ValueError(f"Transaction {transaction_id} not found")
200
+ ```
201
+
202
+ **B. File Not Found**
203
+
204
+ - **Location:** `src/data/processor.py` (line 32-33)
205
+
206
+ ```python
207
+ if not data_path.exists():
208
+ raise FileNotFoundError(f"Training data not found: {data_path}")
209
+ ```
210
+
211
+ **C. RAG Fallback Mechanism**
212
+
213
+ - **Location:** `src/services/fraud_analyzer.py` (line 151-154)
214
+
215
+ ```python
216
+ except Exception as e:
217
+ logger.warning(f"RAG chain failed, falling back to direct LLM: {str(e)}")
218
+ analysis_text = self._direct_analysis(formatted_transaction)
219
+ sources = []
220
+ ```
221
+
222
+ **D. Chat Error Handling**
223
+
224
+ - **Location:** `app.py` (line 395-398)
225
+
226
+ ```python
227
+ except Exception as e:
228
+ logger.error(f"Chat failed: {e}")
229
+ history.append([message, f"❌ Error: {str(e)}"])
230
+ return history
231
+ ```
232
+
233
+ **E. Graceful Degradation**
234
+
235
+ - **Location:** `app.py` (line 74-82)
236
+
237
+ ```python
238
+ # CSV loading dengan try-except
239
+ try:
240
+ csv_documents = document_loader.load_csv_insights(csv_path, sample_size=1050000)
241
+ all_documents.extend(csv_documents)
242
+ except Exception as e:
243
+ logger.warning(f"⚠ Failed to load CSV insights: {e}")
244
+ # System continues without CSV insights
245
+ ```
246
+
247
+ **F. API Validation**
248
+
249
+ - **Location:** `src/schemas/fraud.py`
250
+ - **Pydantic models** untuk request validation
251
+
252
+ **Evidence:**
253
+
254
+ - ✅ Comprehensive error handling
255
+ - ✅ Graceful degradation
256
+ - ✅ Logging untuk debugging
257
+
258
+ ---
259
+
260
+ ### ✅ 5. Performance: Optimasi Sistem
261
+
262
+ #### Implementation Details:
263
+
264
+ **A. Efficient Embeddings**
265
+
266
+ - **Location:** `src/rag/vector_store.py` (line 34-37)
267
+ - **Model:** `sentence-transformers/all-MiniLM-L6-v2`
268
+ - Lightweight (80MB)
269
+ - Fast inference
270
+ - Good accuracy/speed tradeoff
271
+
272
+ **B. Sampling Strategy**
273
+
274
+ - **Location:** `src/rag/csv_document_generator.py` (line 15)
275
+
276
+ ```python
277
+ sample_size: int = 1050000 # ~81% of full dataset
278
+ # Balance between coverage and performance
279
+ ```
280
+
281
+ **C. Chunking Optimization**
282
+
283
+ - **Location:** `src/config/config.py` (line 29-30)
284
+
285
+ ```python
286
+ chunk_size: int = 1000 # Optimal for context
287
+ chunk_overlap: int = 200 # Preserve context continuity
288
+ ```
289
+
290
+ **D. In-Memory Vector Store**
291
+
292
+ - **Location:** `src/config/config.py` (line 31)
293
+
294
+ ```python
295
+ vector_store_path: Optional[str] = None # Fast in-memory storage
296
+ ```
297
+
298
+ - **Trade-off:** Speed vs persistence
299
+ - **Benefit:** No disk I/O latency
300
+
301
+ **E. Lazy Loading**
302
+
303
+ - **Location:** `src/data/processor.py` (line 54-55)
304
+
305
+ ```python
306
+ if self.train_df is None:
307
+ self.load_train_data() # Load only when needed
308
+ ```
309
+
310
+ **F. Batch Processing**
311
+
312
+ - **Location:** `src/services/fraud_analyzer.py` (line 218-245)
313
+
314
+ ```python
315
+ def batch_analyze(
316
+ self,
317
+ transaction_ids: List[int],
318
+ use_rag: bool = True,
319
+ ) -> List[Dict]:
320
+ # Process multiple transactions efficiently
321
+ ```
322
+
323
+ **G. Max Tokens Optimization**
324
+
325
+ - **Location:** `src/config/config.py` (line 14)
326
+
327
+ ```python
328
+ max_tokens: int = 8192 # Model maximum
329
+ ```
330
+
331
+ **Performance Metrics:**
332
+
333
+ - Document loading: ~5-10 seconds
334
+ - Vector store creation: ~3-5 seconds
335
+ - Query response: ~1-3 seconds
336
+ - Full dataset load: ~15-20 seconds
337
+
338
+ **Evidence:**
339
+
340
+ - ✅ Lightweight embeddings
341
+ - ✅ Strategic sampling
342
+ - ✅ Optimized chunking
343
+ - ✅ Fast in-memory storage
344
+
345
+ ---
346
+
347
+ ### ✅ 6. Data Processing: Embeddings, RAG, Pre/Post Processing
348
+
349
+ #### Implementation Details:
350
+
351
+ **A. Embeddings**
352
+
353
+ - **Location:** `src/rag/vector_store.py` (line 34-37)
354
+ - **Model:** sentence-transformers/all-MiniLM-L6-v2
355
+ - **Dimension:** 384
356
+ - **Normalization:** L2 normalized
357
+
358
+ **B. RAG Pipeline**
359
+
360
+ **1. Document Loading**
361
+
362
+ - **PDF Processing** (`src/rag/document_loader.py` line 53-75)
363
+
364
+ ```python
365
+ # PyPDFLoader → RecursiveCharacterTextSplitter
366
+ loader = PyPDFLoader(str(pdf_path))
367
+ documents = loader.load()
368
+ chunks = self.text_splitter.split_documents(documents)
369
+ ```
370
+ - **CSV Processing** (`src/rag/csv_document_generator.py`)
371
+
372
+ ```python
373
+ # Extract structured insights
374
+ - generate_fraud_pattern_documents()
375
+ - generate_statistical_summaries()
376
+ - generate_merchant_profiles()
377
+ - generate_location_insights()
378
+ ```
379
+
380
+ **2. Vector Store Creation**
381
+
382
+ - **Location:** `src/rag/vector_store.py` (line 52-65)
383
+ ```python
384
+ # Chroma DB with HuggingFace embeddings
385
+ self.vector_store = Chroma.from_documents(
386
+ documents=documents,
387
+ embedding=self.embeddings,
388
+ persist_directory=self.persist_directory,
389
+ )
390
+ ```
391
+
392
+ **3. Retrieval**
393
+
394
+ - **Similarity Search** (line 82-96)
395
+ ```python
396
+ # Top-k retrieval dengan metadata
397
+ results = self.vector_store.similarity_search(
398
+ query=query,
399
+ k=k,
400
+ )
401
+ ```
402
+
403
+ **C. Preprocessing**
404
+
405
+ **1. PDF Text Splitting**
406
+
407
+ ```python
408
+ # Recursive character splitting
409
+ chunk_size=1000
410
+ chunk_overlap=200
411
+ # Preserves context across chunks
412
+ ```
413
+
414
+ **2. CSV Data Extraction**
415
+
416
+ ```python
417
+ # Structured insight generation
418
+ - Fraud patterns by category
419
+ - Statistical aggregations
420
+ - Merchant risk profiles
421
+ - Geographic analysis
422
+ ```
423
+
424
+ **3. Transaction Formatting**
425
+
426
+ - **Location:** `src/data/processor.py` (line 78-104)
427
+
428
+ ```python
429
+ def format_transaction_for_llm(self, transaction: Dict) -> str:
430
+ # Format dengan clear labels
431
+ # Include all relevant fields
432
+ # Human-readable format
433
+ ```
434
+
435
+ **D. Postprocessing**
436
+
437
+ **1. Source Reference Collection**
438
+
439
+ - **Location:** `app.py` (line 295-318)
440
+
441
+ ```python
442
+ # Extract metadata dari retrieved docs
443
+ # Format source references
444
+ # Include file names, page numbers, data types
445
+ ```
446
+
447
+ **2. Response Formatting**
448
+
449
+ ```python
450
+ # Structured sections:
451
+ # - Transaction Details
452
+ # - Fraud Analysis
453
+ # - Quality Score
454
+ # - Source References
455
+ ```
456
+
457
+ **3. Quality Scoring**
458
+
459
+ - **Location:** `src/services/quality_scorer.py`
460
+
461
+ ```python
462
+ # Automated quality assessment
463
+ # 4 metrics: relevance, completeness, citations, clarity
464
+ # Grade: A-F
465
+ ```
466
+
467
+ **Evidence:**
468
+
469
+ - ✅ Comprehensive embedding strategy
470
+ - ✅ Dual-source RAG (PDF + CSV)
471
+ - ✅ Structured preprocessing
472
+ - ✅ Rich postprocessing dengan quality scoring
473
+
474
+ ---
475
+
476
+ ### ✅ 7. Prompt Design: Multiple Layers
477
+
478
+ #### Implementation Details:
479
+
480
+ **Layer 1: System Role Definition**
481
+
482
+ - **Location:** `app.py` (line 356-365)
483
+
484
+ ```python
485
+ system_message = """You are an expert fraud detection analyst.
486
+ Help users understand fraud patterns, detection methods, and transaction analysis."""
487
+ ```
488
+
489
+ **Layer 2: Citation Instructions**
490
+
491
+ - **Location:** `app.py` (line 358-363)
492
+
493
+ ```python
494
+ IMPORTANT CITATION RULES:
495
+ - When using information from the provided context sources, you MUST add an inline citation
496
+ - Format citations as: [Source X]
497
+ - Place citations at the end of sentences
498
+ ```
499
+
500
+ **Layer 3: Transaction Analysis Guidelines**
501
+
502
+ - **Location:** `app.py` (line 365-369)
503
+
504
+ ```python
505
+ TRANSACTION ANALYSIS:
506
+ - If transaction details are provided, analyze them thoroughly
507
+ - Compare transaction characteristics against known fraud patterns
508
+ - Provide a clear fraud risk assessment (Low/Medium/High)
509
+ ```
510
+
511
+ **Layer 4: RAG Context**
512
+
513
+ - **Location:** `app.py` (line 320-348)
514
+
515
+ ```python
516
+ # Retrieved documents dengan source numbers
517
+ context = "\n\nRelevant context from fraud detection documents:\n"
518
+ for i, doc in enumerate(docs, 1):
519
+ context += f"\n[Source {i}] {doc.page_content[:500]}...\n"
520
+ ```
521
+
522
+ **Layer 5: Transaction Data**
523
+
524
+ - **Location:** `app.py` (line 293-306)
525
+
526
+ ```python
527
+ # Auto-fetched transaction details
528
+ transaction_context = f"\n\n**Transaction ID {transaction_id} Details:**\n"
529
+ transaction_context += f"- Merchant: {transaction.get('merchant', 'N/A')}\n"
530
+ transaction_context += f"- Actual Fraud Status: {'FRAUD' if ... else 'LEGITIMATE'}\n"
531
+ ```
532
+
533
+ **Layer 6: RAG Chain Template**
534
+
535
+ - **Location:** `src/services/fraud_analyzer.py` (line 46-66)
536
+
537
+ ```python
538
+ template = """You are an expert fraud detection analyst.
539
+ Use the following context from fraud detection research papers...
540
+
541
+ Context:
542
+ {context}
543
+
544
+ Question: {question}
545
+
546
+ IMPORTANT CITATION RULES:
547
+ ...
548
+ """
549
+ ```
550
+
551
+ **Evidence:**
552
+
553
+ - ✅ 6-layer prompt architecture
554
+ - ✅ Clear role definition
555
+ - ✅ Explicit instructions
556
+ - ✅ Dynamic context injection
557
+
558
+ ---
559
+
560
+ ### ✅ 8. Quality Scoring: Response Assessment
561
+
562
+ #### Implementation Details:
563
+
564
+ **A. Quality Scorer Module**
565
+
566
+ - **Location:** `src/services/quality_scorer.py`
567
+ - **Class:** `ResponseQualityScorer`
568
+
569
+ **B. Scoring Metrics (4 Dimensions)**
570
+
571
+ - **Relevance (35%):** Analyzes query term matching and contextual alignment.
572
+ - **Completeness (25%):** Evaluates depth of information and structural integrity.
573
+ - **Citation Quality (25%):** Validates presence and distribution of inline citations.
574
+ - **Clarity (15%):** Assesses sentence structure and formatting.
575
+
576
+ **C. Integration:** Automatically triggered for every chatbot response, providing a detailed breakdown and an overall grade (A-F).
577
+
578
+ ---
579
+
580
+ ### ✅ 9. Advanced Manual Analysis
581
+
582
+ #### Implementation Details:
583
+
584
+ - **Location:** `app.py`
585
+ - **Feature:** Collapsible "Advanced Fields" section in the Manual Transaction Analysis tab.
586
+ - **Inputs:** Gender, Age, Job, ZIP Code, City Population, and Merchant Coordinates.
587
+ - **Improved Accuracy:** Provides the LLM with significantly more context, matching the granularity of the actual dataset for more realistic simulations.
588
+
589
+ ---
590
+
591
+ ## Summary Matrix
592
+
593
+ | # | Requirement | Status | Evidence |
594
+ | - | ------------------ | ------ | ------------------------------------- |
595
+ | 1 | Accuracy | ✅ | RAG, Citations, Transaction Detection |
596
+ | 2 | Coverage | ✅ | 4 Interfaces, Flexible Queries, API |
597
+ | 3 | Readability | ✅ | Modular, Type Hints, Docstrings |
598
+ | 4 | Exception Handling | ✅ | Comprehensive Error Handling |
599
+ | 5 | Performance | ✅ | Optimized Embeddings, Sampling |
600
+ | 6 | Data Processing | ✅ | RAG Pipeline, Pre/Post Processing |
601
+ | 7 | Prompt Design | ✅ | 6-Layer Architecture |
602
+ | 8 | Quality Scoring | ✅ | 4-Metric Automated Scoring |
603
+ | 9 | Advanced Manual | ✅ | Modular UI with 7 optional fields |
604
+
605
+ **Overall Assessment:** ✅ ALL REQUIREMENTS MET
606
+
607
+ ---
608
+
609
+ ## Key Achievements
610
+
611
+ 1. ✅ **Dual-Source RAG** - PDF research papers + CSV fraud patterns
612
+ 2. ✅ **Inline Citations** - Transparent source referencing
613
+ 3. ✅ **Transaction Query Detection** - Natural language transaction analysis
614
+ 4. ✅ **Multi-Layer Prompting** - 6-layer prompt architecture
615
+ 5. ✅ **Quality Scoring** - Automated 4-metric response assessment
616
+ 6. ✅ **Comprehensive Error Handling** - Graceful degradation
617
+ 7. ✅ **Performance Optimization** - Strategic sampling, efficient embeddings
618
+ 8. ✅ **Clean Architecture** - Modular, well-documented codebase
619
+
620
+ ---
621
+
622
+ ## Files Reference
623
+
624
+ ### Core Implementation
625
+
626
+ - `app.py` - Gradio interface dengan quality scoring
627
+ - `main.py` - FastAPI application
628
+ - `src/services/fraud_analyzer.py` - Main analysis service
629
+ - `src/services/quality_scorer.py` - Quality assessment
630
+ - `src/rag/vector_store.py` - Vector store management
631
+ - `src/rag/document_loader.py` - Document loading
632
+ - `src/rag/csv_document_generator.py` - CSV insights extraction
633
+ - `src/data/processor.py` - Data processing
634
+ - `src/llm/groq_client.py` - LLM integration
635
+ - `src/config/config.py` - Configuration
636
+
637
+ ### Documentation
638
+
639
+ - `README.md` - Project documentation
640
+ - `QUICKSTART.md` - Quick start guide
641
+ - `requirements.txt` - Dependencies
642
+
643
+ ---
644
+
645
+ **Conclusion:** Project successfully implements all required features with high quality standards and additional bonus features (multi-layer prompting, quality scoring).
app.py ADDED
@@ -0,0 +1,757 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Gradio interface for Fraud Detection Chatbot."""

import logging
import warnings
import os

# Suppress noisy framework warnings for cleaner terminal output.
# NOTE: these filters must run before the heavy imports below so that
# import-time warnings (e.g. from LangChain) are also silenced.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', message='.*LangChain.*')

# Disable ChromaDB telemetry to avoid errors
os.environ['ANONYMIZED_TELEMETRY'] = 'False'

import gradio as gr
from pathlib import Path
import pandas as pd

from src.data.processor import FraudDataProcessor
from src.llm.groq_client import GroqClient
from src.rag.document_loader import DocumentLoader
from src.rag.vector_store import VectorStore
from src.services.fraud_analyzer import FraudAnalyzer
from src.services.quality_scorer import ResponseQualityScorer
from src.config.config import settings

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Suppress chromadb logging (its telemetry errors are harmless but loud)
logging.getLogger('chromadb').setLevel(logging.ERROR)
logging.getLogger('chromadb.telemetry').setLevel(logging.CRITICAL)

# Shared components, populated by initialize_system() and read by the
# Gradio callbacks below. None means "not initialized yet".
groq_client = None
vector_store = None
fraud_analyzer = None
data_processor = None
quality_scorer = ResponseQualityScorer()
41
+
42
+
43
def initialize_system():
    """Initialize the fraud detection system.

    Creates the Groq LLM client, the data processor, and (best-effort) the
    RAG vector store from PDF documents plus CSV-derived insight documents,
    then wires everything into a ``FraudAnalyzer``. All components are
    stored in module globals so the Gradio callbacks can reach them.

    Returns:
        str: Status message displayed in the UI once initialization completes.
    """
    global groq_client, vector_store, fraud_analyzer, data_processor

    logger.info("Initializing Fraud Detection System...")

    # Initialize Groq client (reads GROQ_API_KEY from the environment/.env)
    groq_client = GroqClient()
    logger.info("✓ Groq client initialized")

    # Initialize data processor
    data_processor = FraudDataProcessor()
    logger.info("✓ Data processor initialized")

    # Setup RAG system — best-effort: any failure is logged and the app
    # continues with vector_store left as None.
    try:
        document_loader = DocumentLoader(
            chunk_size=settings.chunk_size,
            chunk_overlap=settings.chunk_overlap,
        )

        all_documents = []

        # Load PDF documents
        pdf_documents = document_loader.load_pdfs_from_directory(settings.pdf_dir)
        if pdf_documents:
            all_documents.extend(pdf_documents)
            logger.info(f"✓ Loaded {len(pdf_documents)} PDF documents")
        else:
            logger.warning("⚠ No PDF documents found")

        # Load CSV insights; sample_size caps how many rows are scanned
        # when generating insight documents from the large training CSV.
        csv_path = settings.data_dir / "fraudTrain.csv"
        if csv_path.exists():
            try:
                csv_documents = document_loader.load_csv_insights(csv_path, sample_size=1050000)
                all_documents.extend(csv_documents)
                logger.info(f"✓ Loaded {len(csv_documents)} CSV insight documents")
            except Exception as e:
                # Degrade gracefully: RAG still works with PDF chunks only.
                logger.warning(f"⚠ Failed to load CSV insights: {e}")
        else:
            logger.warning(f"⚠ CSV file not found: {csv_path}")

        # Add all documents to vector store
        if all_documents:
            vector_store = VectorStore()
            vector_store.add_documents(all_documents)
            logger.info(f"✓ RAG system initialized with {len(all_documents)} total documents")
        else:
            logger.warning("⚠ No documents loaded for RAG system")

    except Exception as e:
        logger.warning(f"⚠ RAG setup failed: {e}")

    # Create fraud analyzer — vector_store may be None here, in which case
    # the analyzer is expected to fall back to direct LLM analysis.
    fraud_analyzer = FraudAnalyzer(
        groq_client=groq_client,
        vector_store=vector_store,
    )
    logger.info("✓ Fraud analyzer initialized")

    return "✅ System initialized successfully!"
106
+
107
+
108
def analyze_by_transaction_id(transaction_id: int, use_rag: bool):
    """Look up a dataset transaction by ID and return its fraud analysis.

    Args:
        transaction_id: Row identifier of the transaction in the training set.
        use_rag: Whether to augment the analysis with retrieved context.

    Returns:
        str: Markdown with transaction details and the analysis, or an
        error message if the system is uninitialized or the lookup fails.
    """
    if fraud_analyzer is None:
        return "❌ System not initialized. Please wait for initialization to complete."

    try:
        outcome = fraud_analyzer.analyze_transaction(
            transaction_id=int(transaction_id),
            use_rag=use_rag,
        )

        # Build the markdown response from the analyzer's result dict.
        txn = outcome['transaction']
        verdict = outcome['analysis']

        return f"""### 📊 Transaction Details
**Merchant:** {txn.get('merchant', 'N/A')}
**Category:** {txn.get('category', 'N/A')}
**Amount:** ${txn.get('amt', 0):.2f}
**City:** {txn.get('city', 'N/A')}
**State:** {txn.get('state', 'N/A')}

---

### 🔍 Fraud Analysis
{verdict}
"""

    except Exception as exc:
        logger.error(f"Analysis failed: {exc}")
        return f"❌ Error: {str(exc)}"
141
+
142
+
143
def analyze_by_manual_data(
    merchant: str, category: str, amount: float, city: str, state: str, use_rag: bool,
    gender: str = None, age: int = None, job: str = None, zip_code: str = None,
    city_pop: int = None, merch_lat: float = None, merch_long: float = None
):
    """Analyze fraud for a manually entered (simulated) transaction.

    Required fields mirror the core dataset columns; the optional keyword
    fields come from the collapsible "Advanced Fields" UI section and, when
    provided, give the LLM more context for the risk assessment.

    Args:
        merchant: Merchant name (a leading ``fraud_`` dataset artifact is stripped).
        category: Transaction category.
        amount: Transaction amount in dollars.
        city: Cardholder city.
        state: Cardholder state.
        use_rag: Whether to augment the analysis with retrieved context.
        gender, age, job, zip_code, city_pop, merch_lat, merch_long:
            Optional cardholder/location details; omitted when not supplied.

    Returns:
        str: Markdown with the echoed transaction details and the analysis,
        or an error message on failure.
    """
    if fraud_analyzer is None:
        return "❌ System not initialized. Please wait for initialization to complete."

    try:
        # Clean merchant name: the synthetic dataset prefixes every merchant
        # with "fraud_", which would bias the LLM toward false positives.
        clean_merchant = merchant.replace('fraud_', '') if merchant else merchant

        transaction_data = {
            "merchant": clean_merchant,
            "category": category,
            "amt": float(amount),
            "city": city,
            "state": state,
        }

        # Optional fields: empty strings are skipped, but numeric values are
        # checked against None so a legitimate 0 is not silently dropped
        # (the original truthiness checks lost age=0 / city_pop=0).
        if gender:
            transaction_data["gender"] = gender
        if age is not None:
            transaction_data["age"] = age
        if job:
            transaction_data["job"] = job
        if zip_code:
            transaction_data["zip"] = zip_code
        if city_pop is not None:
            transaction_data["city_pop"] = city_pop
        if merch_lat is not None:
            transaction_data["merch_lat"] = merch_lat
        if merch_long is not None:
            transaction_data["merch_long"] = merch_long

        result = fraud_analyzer.analyze_transaction(
            transaction_data=transaction_data,
            use_rag=use_rag,
        )

        analysis = result['analysis']

        response = f"""### 📊 Transaction Details
**Merchant:** {merchant}
**Category:** {category}
**Amount:** ${amount:.2f}
**City:** {city}
**State:** {state}
"""

        # Echo the optional fields back so the user can verify what was used.
        if gender or age is not None or job:
            response += "\n**Cardholder Info:**\n"
            if gender:
                response += f"- Gender: {gender}\n"
            if age is not None:
                response += f"- Age: {age}\n"
            if job:
                response += f"- Job: {job}\n"

        if zip_code or city_pop is not None:
            response += "\n**Location Details:**\n"
            if zip_code:
                response += f"- ZIP: {zip_code}\n"
            if city_pop is not None:
                response += f"- City Population: {city_pop:,}\n"

        # Only show coordinates when both are present; previously a single
        # provided coordinate rendered as "(12.3, None)".
        if merch_lat is not None and merch_long is not None:
            response += "\n**Merchant Location:**\n"
            response += f"- Coordinates: ({merch_lat}, {merch_long})\n"

        response += f"""
---

### 🔍 Fraud Analysis
{analysis}
"""
        return response

    except Exception as e:
        logger.error(f"Analysis failed: {e}")
        return f"❌ Error: {str(e)}"
227
+
228
+
229
+
230
def get_dataset_summary():
    """Get dataset summary statistics including RAG documents.

    Builds a markdown report with overall transaction/fraud statistics, the
    top transaction categories, and — when the vector store is available —
    a breakdown of the RAG knowledge base by document type.

    Returns:
        str: Markdown summary, or an error message on failure.
    """
    if data_processor is None:
        return "❌ System not initialized."

    try:
        # Get transaction data summary.
        # NOTE(review): called without a transaction_id — presumably this
        # returns dataset-wide aggregates in that case; confirm against
        # FraudDataProcessor.get_transaction_summary.
        summary = data_processor.get_transaction_summary()

        response = f"""### 📊 Transaction Dataset Summary

**Total Transactions:** {summary['total_transactions']:,}
**Fraud Cases:** {summary['fraud_count']:,}
**Fraud Rate:** {summary['fraud_percentage']:.2f}%
**Average Amount:** ${summary['average_amount']:.2f}

---

**Top Transaction Categories:**
"""
        # Show at most the first 10 categories from the summary mapping.
        for category, count in list(summary['categories'].items())[:10]:
            response += f"\n- {category}: {count:,}"

        # Add RAG document summary if available
        if vector_store is not None:
            response += "\n\n---\n\n### 📚 RAG Knowledge Base\n\n"

            # Count documents by type; this reads ChromaDB's private
            # collection handle, so failures here are non-fatal.
            try:
                # Get all documents (ids/metadatas) from the vector store
                all_docs = vector_store.vector_store._collection.get()

                if all_docs and 'metadatas' in all_docs:
                    metadatas = all_docs['metadatas']

                    # Tallies per document type (set by the CSV generator);
                    # anything without a known type is treated as a PDF chunk.
                    pdf_count = 0
                    csv_pattern_count = 0
                    csv_merchant_count = 0
                    csv_location_count = 0
                    csv_stats_count = 0

                    pdf_sources = set()

                    for meta in metadatas:
                        doc_type = meta.get('type', 'document')
                        source = meta.get('source', '')

                        if doc_type == 'fraud_pattern':
                            csv_pattern_count += 1
                        elif doc_type == 'merchant_profile':
                            csv_merchant_count += 1
                        elif doc_type == 'location_insight':
                            csv_location_count += 1
                        elif doc_type == 'statistical_summary':
                            csv_stats_count += 1
                        else:
                            # PDF document chunk
                            pdf_count += 1
                            if source.endswith('.pdf'):
                                pdf_sources.add(source)

                    response += f"**Total Documents in RAG:** {len(metadatas):,}\n\n"

                    if pdf_count > 0:
                        response += f"**📄 PDF Research Documents:** {pdf_count:,}\n"
                        for pdf in sorted(pdf_sources):
                            response += f"  - {pdf}\n"
                        response += "\n"

                    csv_total = csv_pattern_count + csv_merchant_count + csv_location_count + csv_stats_count
                    if csv_total > 0:
                        response += f"**📊 CSV-Derived Insights:** {csv_total:,}\n"
                        if csv_pattern_count > 0:
                            response += f"  - Fraud Pattern Analysis: {csv_pattern_count}\n"
                        if csv_merchant_count > 0:
                            response += f"  - Merchant Profiles: {csv_merchant_count}\n"
                        if csv_location_count > 0:
                            response += f"  - Location Insights: {csv_location_count}\n"
                        if csv_stats_count > 0:
                            response += f"  - Statistical Summaries: {csv_stats_count}\n"
                else:
                    response += "**Status:** RAG system initialized but no document metadata available."

            except Exception as e:
                # Keep the transaction summary even if RAG stats are unavailable.
                logger.warning(f"Could not retrieve RAG document stats: {e}")
                response += "**Status:** RAG system active (document count unavailable)"

        return response

    except Exception as e:
        logger.error(f"Summary failed: {e}")
        return f"❌ Error: {str(e)}"
323
+
324
+
325
+ def chat_with_fraud_expert(message: str, history: list, use_rag: bool):
326
+ """Chat with fraud detection expert."""
327
+ if groq_client is None:
328
+ return history + [[message, "❌ System not initialized. Please wait for initialization to complete."]]
329
+
330
+ try:
331
+ # Check if message is asking about a specific transaction ID
332
+ import re
333
+ transaction_query = re.search(r'transaction\s+(?:id\s+)?(\d+)', message.lower())
334
+ transaction_context = ""
335
+
336
+ if transaction_query and data_processor is not None:
337
+ transaction_id = int(transaction_query.group(1))
338
+ try:
339
+ # Get transaction data
340
+ transaction = data_processor.get_transaction_summary(transaction_id)
341
+
342
+ # Format transaction details with all relevant columns
343
+ transaction_context = f"\n\n**Transaction ID {transaction_id} Details:**\n"
344
+ transaction_context += f"- **Transaction Number:** {transaction.get('trans_num', 'N/A')}\n"
345
+ transaction_context += f"- **Date/Time:** {transaction.get('trans_date_trans_time', 'N/A')}\n"
346
+ transaction_context += f"- **Merchant:** {transaction.get('merchant', 'N/A')}\n"
347
+ transaction_context += f"- **Category:** {transaction.get('category', 'N/A')}\n"
348
+ transaction_context += f"- **Amount:** ${transaction.get('amt', 0):.2f}\n"
349
+ transaction_context += f"- **Location:** {transaction.get('city', 'N/A')}, {transaction.get('state', 'N/A')}\n"
350
+ transaction_context += f"- **Merchant Coordinates:** ({transaction.get('merch_lat', 'N/A')}, {transaction.get('merch_long', 'N/A')})\n"
351
+ transaction_context += f"\n**Cardholder Information:**\n"
352
+ transaction_context += f"- **Name:** {transaction.get('first', 'N/A')} {transaction.get('last', 'N/A')}\n"
353
+ transaction_context += f"- **Gender:** {transaction.get('gender', 'N/A')}\n"
354
+ transaction_context += f"- **Date of Birth:** {transaction.get('dob', 'N/A')}\n"
355
+ transaction_context += f"- **Job:** {transaction.get('job', 'N/A')}\n"
356
+ transaction_context += f"- **Street:** {transaction.get('street', 'N/A')}\n"
357
+ transaction_context += f"- **City/State/ZIP:** {transaction.get('city', 'N/A')}, {transaction.get('state', 'N/A')} {transaction.get('zip', 'N/A')}\n"
358
+ transaction_context += f"- **Cardholder Coordinates:** ({transaction.get('lat', 'N/A')}, {transaction.get('long', 'N/A')})\n"
359
+ transaction_context += f"- **City Population:** {transaction.get('city_pop', 'N/A')}\n"
360
+ transaction_context += f"\n**Card Information:**\n"
361
+ transaction_context += f"- **Card Number:** {transaction.get('cc_num', 'N/A')}\n"
362
+ transaction_context += f"\n**Fraud Status:**\n"
363
+ transaction_context += f"- **Actual Status:** {'🚨 FRAUD' if transaction.get('is_fraud', 0) == 1 else '✅ LEGITIMATE'}\n"
364
+
365
+ logger.info(f"Found transaction {transaction_id} for chat query")
366
+ except ValueError as e:
367
+ transaction_context = f"\n\n**Note:** {str(e)}\n"
368
+ except Exception as e:
369
+ logger.warning(f"Could not fetch transaction {transaction_id}: {e}")
370
+
371
+ # If RAG is enabled and vector store is available, get relevant context
372
+ context = ""
373
+ source_references = []
374
+
375
+ if use_rag and vector_store is not None:
376
+ docs = vector_store.similarity_search(message, k=3)
377
+ if docs:
378
+ context = "\n\nRelevant context from fraud detection documents:\n"
379
+ for i, doc in enumerate(docs, 1):
380
+ # Add context with source number
381
+ context += f"\n[Source {i}] {doc.page_content[:500]}...\n"
382
+
383
+ # Collect source information for reference list
384
+ source_file = doc.metadata.get('source', 'Unknown')
385
+ page_num = doc.metadata.get('page', 'N/A')
386
+ doc_type = doc.metadata.get('type', 'document')
387
+
388
+ # Format source info
389
+ if doc_type == 'fraud_pattern':
390
+ category = doc.metadata.get('category', 'N/A')
391
+ source_references.append(f"Source {i}: CSV Data - Fraud Pattern Analysis ({category})")
392
+ elif doc_type == 'statistical_summary':
393
+ scope = doc.metadata.get('scope', 'N/A')
394
+ source_references.append(f"Source {i}: CSV Data - Statistical Summary ({scope})")
395
+ elif doc_type == 'merchant_profile':
396
+ merchant = doc.metadata.get('merchant', 'N/A')
397
+ source_references.append(f"Source {i}: CSV Data - Merchant Profile ({merchant})")
398
+ elif doc_type == 'location_insight':
399
+ state = doc.metadata.get('state', 'N/A')
400
+ source_references.append(f"Source {i}: CSV Data - Location Analysis ({state})")
401
+ else:
402
+ # PDF document
403
+ if page_num != 'N/A':
404
+ source_references.append(f"Source {i}: {source_file}, Page {page_num}")
405
+ else:
406
+ source_references.append(f"Source {i}: {source_file}")
407
+
408
+ # Create prompt with transaction data and context
409
+ full_prompt = message
410
+ if transaction_context:
411
+ full_prompt = f"{message}\n{transaction_context}"
412
+ if context:
413
+ full_prompt = f"{full_prompt}\n{context}"
414
+
415
+ # Enhanced system message with inline citation instructions
416
+ system_message = """You are an expert fraud detection analyst. Help users understand fraud patterns, detection methods, and transaction analysis.
417
+
418
+ IMPORTANT CITATION RULES:
419
+ - When using information from the provided context sources, you MUST add an inline citation immediately after the relevant sentence or paragraph.
420
+ - Format citations as: [Source X] where X is the source number from the context.
421
+ - Place citations at the end of sentences that use information from that source.
422
+ - You can cite multiple sources in one paragraph if needed: [Source 1, Source 2]
423
+ - Be specific and reference the data when using information from sources.
424
+
425
+ TRANSACTION ANALYSIS:
426
+ - If transaction details are provided, analyze them thoroughly.
427
+ - Note: Ignore "fraud_" prefix in merchant names; it is an artifact of the synthetic dataset and NOT an indicator of fraud.
428
+ - Compare transaction characteristics against known fraud patterns.
429
+ - Provide a clear fraud risk assessment (Low/Medium/High).
430
+ - Explain your reasoning with specific indicators.
431
+
432
+ Example:
433
+ "Online gaming merchants often experience higher fraud rates due to card-not-present transactions. [Source 1] The average fraud rate in this category is 5.2%. [Source 2]"
434
+
435
+ Provide clear, actionable insights with proper inline citations."""
436
+
437
+ # Get response from LLM
438
+ response = groq_client.invoke(
439
+ prompt=full_prompt,
440
+ system_message=system_message,
441
+ )
442
+
443
+ # Score response quality
444
+ score_result = quality_scorer.score_response(
445
+ response=response,
446
+ query=message,
447
+ has_rag=use_rag and vector_store is not None,
448
+ sources=source_references,
449
+ )
450
+
451
+ # Add quality score display
452
+ quality_display = quality_scorer.format_score_display(score_result)
453
+ response += quality_display
454
+
455
+ # Add source reference list at the end
456
+ if source_references:
457
+ response += "\n**📚 Source References:**\n"
458
+ for ref in source_references:
459
+ response += f"\n- {ref}"
460
+
461
+ # Log quality score
462
+ logger.info(f"Response quality score: {score_result['overall_score']}/100 (Grade: {score_result['grade']})")
463
+
464
+ history.append({"role": "user", "content": message})
465
+ history.append({"role": "assistant", "content": response})
466
+ return history
467
+
468
+ except Exception as e:
469
+ logger.error(f"Chat failed: {e}")
470
+ history.append({"role": "user", "content": message})
471
+ history.append({"role": "assistant", "content": f"❌ Error: {str(e)}"})
472
+ return history
473
+
474
+
475
+
476
+ # Create Gradio interface
477
+ def create_interface():
478
+ """Create the Gradio interface."""
479
+
480
+ with gr.Blocks(
481
+ theme=gr.themes.Soft(
482
+ primary_hue="blue",
483
+ secondary_hue="slate",
484
+ font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
485
+ ),
486
+ title="Fraud Detection Chatbot",
487
+ css="""
488
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
489
+
490
+ * {
491
+ font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif !important;
492
+ }
493
+
494
+ .gradio-container {
495
+ max-width: 1200px !important;
496
+ }
497
+
498
+ h1, h2, h3, h4, h5, h6 {
499
+ font-weight: 600 !important;
500
+ }
501
+
502
+ .markdown-text {
503
+ font-size: 15px !important;
504
+ line-height: 1.6 !important;
505
+ }
506
+
507
+ button {
508
+ font-weight: 500 !important;
509
+ }
510
+ """
511
+ ) as demo:
512
+
513
+ gr.Markdown("""
514
+ # 🛡️ Fraud Detection Chatbot
515
+
516
+ AI-powered fraud detection system using LangChain, Groq, and RAG (Retrieval Augmented Generation).
517
+ """)
518
+
519
+ # System status
520
+ with gr.Row():
521
+ init_status = gr.Textbox(
522
+ label="System Status",
523
+ value="Initializing...",
524
+ interactive=False,
525
+ )
526
+
527
+ # Tabs for different functionalities
528
+ with gr.Tabs():
529
+
530
+ # Tab 1: Chat with Expert
531
+ with gr.Tab("💬 Chat with Fraud Expert"):
532
+ gr.Markdown("""
533
+ Ask questions about fraud detection, transaction patterns, or get expert advice.
534
+ """)
535
+
536
+ with gr.Row():
537
+ chat_use_rag = gr.Checkbox(
538
+ label="Use RAG (Enhanced with fraud detection documents + CSV data)",
539
+ value=True,
540
+ )
541
+
542
+ chatbot = gr.Chatbot(
543
+ label="Fraud Detection Expert",
544
+ height=500,
545
+ )
546
+
547
+ with gr.Row():
548
+ chat_input = gr.Textbox(
549
+ label="Your Question",
550
+ placeholder="Ask about fraud detection, transaction analysis, etc...",
551
+ scale=4,
552
+ )
553
+ chat_submit = gr.Button("Send", variant="primary", scale=1)
554
+
555
+ chat_clear = gr.Button("Clear Chat")
556
+
557
+ # Chat examples
558
+ gr.Examples(
559
+ examples=[
560
+ "What are common indicators of credit card fraud?",
561
+ "How can I detect unusual transaction patterns?",
562
+ "What are fraud patterns in grocery transactions?",
563
+ "Which merchants have high fraud rates?",
564
+ "What states have elevated fraud activity?",
565
+ ],
566
+ inputs=chat_input,
567
+ )
568
+
569
+ # Tab 2: Analyze by Transaction ID
570
+ with gr.Tab("🔍 Analyze by Transaction ID"):
571
+ gr.Markdown("""
572
+ Analyze a specific transaction from the dataset by its ID.
573
+ """)
574
+
575
+ txn_id_input = gr.Number(
576
+ label="Transaction ID",
577
+ value=0,
578
+ precision=0,
579
+ )
580
+ txn_id_use_rag = gr.Checkbox(
581
+ label="Use RAG (Enhanced analysis)",
582
+ value=True,
583
+ )
584
+ txn_id_submit = gr.Button("Analyze Transaction", variant="primary")
585
+
586
+ txn_id_output = gr.Markdown(label="Analysis Result")
587
+
588
+
589
+ # Tab 3: Analyze Manual Transaction
590
+ with gr.Tab("✍️ Analyze Manual Transaction"):
591
+ gr.Markdown("""
592
+ Enter transaction details manually for fraud analysis.
593
+ """)
594
+
595
+ # Basic Fields
596
+ gr.Markdown("### Basic Transaction Information")
597
+ manual_merchant = gr.Textbox(
598
+ label="Merchant Name",
599
+ placeholder="e.g., Amazon, Walmart",
600
+ )
601
+ manual_category = gr.Dropdown(
602
+ label="Category",
603
+ choices=[
604
+ "grocery_pos", "gas_transport", "misc_net",
605
+ "shopping_net", "shopping_pos", "entertainment",
606
+ "food_dining", "personal_care", "health_fitness",
607
+ "travel", "kids_pets", "home"
608
+ ],
609
+ value="grocery_pos",
610
+ )
611
+ manual_amount = gr.Number(
612
+ label="Amount ($)",
613
+ value=100.0,
614
+ )
615
+ manual_city = gr.Textbox(
616
+ label="City",
617
+ placeholder="e.g., Jakarta",
618
+ )
619
+ manual_state = gr.Textbox(
620
+ label="State",
621
+ placeholder="e.g., DKI",
622
+ )
623
+
624
+ # Advanced Fields (Accordion)
625
+ with gr.Accordion("🔧 Advanced Fields (Optional)", open=False):
626
+ gr.Markdown("*Provide additional details for more accurate fraud analysis*")
627
+
628
+ with gr.Row():
629
+ manual_gender = gr.Radio(
630
+ label="Cardholder Gender",
631
+ choices=["M", "F"],
632
+ value="M",
633
+ )
634
+ manual_age = gr.Number(
635
+ label="Cardholder Age",
636
+ value=35,
637
+ precision=0,
638
+ )
639
+
640
+ manual_job = gr.Textbox(
641
+ label="Cardholder Job",
642
+ placeholder="e.g., Engineer, Teacher",
643
+ )
644
+
645
+ with gr.Row():
646
+ manual_zip = gr.Textbox(
647
+ label="ZIP Code",
648
+ placeholder="e.g., 12345",
649
+ )
650
+ manual_city_pop = gr.Number(
651
+ label="City Population",
652
+ value=100000,
653
+ precision=0,
654
+ )
655
+
656
+ with gr.Row():
657
+ manual_merch_lat = gr.Number(
658
+ label="Merchant Latitude",
659
+ value=0.0,
660
+ )
661
+ manual_merch_long = gr.Number(
662
+ label="Merchant Longitude",
663
+ value=0.0,
664
+ )
665
+
666
+ manual_use_rag = gr.Checkbox(
667
+ label="Use RAG (Enhanced analysis)",
668
+ value=True,
669
+ )
670
+ manual_submit = gr.Button("Analyze Transaction", variant="primary")
671
+
672
+ manual_output = gr.Markdown(label="Analysis Result")
673
+
674
+
675
+ # Tab 4: Dataset Summary
676
+ with gr.Tab("📊 Dataset Summary"):
677
+ gr.Markdown("""
678
+ View statistics and insights from the fraud detection dataset.
679
+ """)
680
+
681
+ summary_button = gr.Button("Get Dataset Summary", variant="primary")
682
+ summary_output = gr.Markdown(label="Summary")
683
+
684
+ # Event handlers
685
+ def chat_fn(message, history, use_rag):
686
+ return chat_with_fraud_expert(message, history, use_rag)
687
+
688
+ chat_submit.click(
689
+ fn=chat_fn,
690
+ inputs=[chat_input, chatbot, chat_use_rag],
691
+ outputs=chatbot,
692
+ ).then(
693
+ lambda: "",
694
+ outputs=chat_input,
695
+ )
696
+
697
+ chat_input.submit(
698
+ fn=chat_fn,
699
+ inputs=[chat_input, chatbot, chat_use_rag],
700
+ outputs=chatbot,
701
+ ).then(
702
+ lambda: "",
703
+ outputs=chat_input,
704
+ )
705
+
706
+ chat_clear.click(
707
+ lambda: [],
708
+ outputs=chatbot,
709
+ )
710
+
711
+ txn_id_submit.click(
712
+ fn=analyze_by_transaction_id,
713
+ inputs=[txn_id_input, txn_id_use_rag],
714
+ outputs=txn_id_output,
715
+ )
716
+
717
+ manual_submit.click(
718
+ fn=analyze_by_manual_data,
719
+ inputs=[
720
+ manual_merchant,
721
+ manual_category,
722
+ manual_amount,
723
+ manual_city,
724
+ manual_state,
725
+ manual_use_rag,
726
+ manual_gender,
727
+ manual_age,
728
+ manual_job,
729
+ manual_zip,
730
+ manual_city_pop,
731
+ manual_merch_lat,
732
+ manual_merch_long,
733
+ ],
734
+ outputs=manual_output,
735
+ )
736
+
737
+ summary_button.click(
738
+ fn=get_dataset_summary,
739
+ outputs=summary_output,
740
+ )
741
+
742
+ # Initialize system on load
743
+ demo.load(
744
+ fn=initialize_system,
745
+ outputs=init_status,
746
+ )
747
+
748
+ return demo
749
+
750
+
751
+ if __name__ == "__main__":
752
+ demo = create_interface()
753
+ demo.launch(
754
+ server_name="0.0.0.0",
755
+ server_port=7860,
756
+ share=False,
757
+ )
data/Bhatla.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6961a50224423ea16eb2b97486fdbe88b4d1a48fd9289687e911e3ae10c4596d
3
+ size 1215275
data/EBA_ECB 2024 Report on Payment Fraud.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dca63ad08f8ea7d5d0db77bd5953bbc0ebca987a3b7e4df501c43e825dfe5ebf
3
+ size 734484
data/fraudTest.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12d553ab19440c752d2531ee1af44bb64f12cc3d3839f1649f19e81c230545f0
3
+ size 150354339
data/fraudTrain.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd7139200dbfcbed0b6742bbe05a4f1abce532c4fef20918228a651647a3e75d
3
+ size 351238196
docker-compose.yml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Two services built from the same image: the Gradio UI (default CMD)
# and the FastAPI backend (explicit uvicorn command).
services:
  app:
    build: .
    container_name: fraud-detection-ui
    ports:
      - "7860:7860"  # Gradio UI
    volumes:
      - ./data:/app/data            # datasets + PDFs mounted, not baked into the image
      - ./chroma_db:/app/chroma_db  # persist the Chroma vector store across restarts
    env_file:
      - .env  # GROQ_API_KEY and friends
    environment:
      - HOST=0.0.0.0
    restart: always

  api:
    build: .
    container_name: fraud-detection-api
    command: uvicorn main:app --host 0.0.0.0 --port 8000
    ports:
      - "8000:8000"  # REST API
    volumes:
      - ./data:/app/data
      - ./chroma_db:/app/chroma_db
    env_file:
      - .env
    restart: always
main.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Main FastAPI application."""

import logging
import warnings
import os
from contextlib import asynccontextmanager

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', message='.*LangChain.*')

# Disable ChromaDB telemetry to avoid errors
# (set before any downstream module imports chromadb)
os.environ['ANONYMIZED_TELEMETRY'] = 'False'

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

from src.config.config import settings
from src.api.routes import router

# Configure logging
logging.basicConfig(
    level=logging.INFO if not settings.debug else logging.DEBUG,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)

logger = logging.getLogger(__name__)

# Suppress chromadb logging
logging.getLogger('chromadb').setLevel(logging.ERROR)
logging.getLogger('chromadb.telemetry').setLevel(logging.CRITICAL)
+
34
+
35
+ @asynccontextmanager
36
+ async def lifespan(app: FastAPI):
37
+ """Lifespan context manager for startup and shutdown events."""
38
+ # Startup
39
+ logger.info("Starting Fraud Detection API...")
40
+ logger.info(f"Using Groq model: {settings.groq_model}")
41
+
42
+ # Initialize RAG system if needed
43
+ try:
44
+ from src.rag.document_loader import DocumentLoader
45
+ from src.rag.vector_store import VectorStore
46
+
47
+ logger.info("Initializing RAG system...")
48
+ document_loader = DocumentLoader(
49
+ chunk_size=settings.chunk_size,
50
+ chunk_overlap=settings.chunk_overlap,
51
+ )
52
+
53
+ # Load PDF documents
54
+ pdf_documents = document_loader.load_pdfs_from_directory(settings.pdf_dir)
55
+
56
+ if pdf_documents:
57
+ vector_store = VectorStore()
58
+ vector_store.add_documents(pdf_documents)
59
+ logger.info("RAG system initialized successfully")
60
+ else:
61
+ logger.warning("No PDF documents found for RAG system")
62
+ except Exception as e:
63
+ logger.warning(f"Failed to initialize RAG system: {str(e)}")
64
+
65
+ yield
66
+
67
+ # Shutdown
68
+ logger.info("Shutting down Fraud Detection API...")
69
+
70
+
71
+ # Create FastAPI app
72
+ app = FastAPI(
73
+ title=settings.app_name,
74
+ version=settings.app_version,
75
+ description="Fraud Detection API using LangChain and Groq",
76
+ lifespan=lifespan,
77
+ )
78
+
79
+ # Add CORS middleware
80
+ app.add_middleware(
81
+ CORSMiddleware,
82
+ allow_origins=["*"],
83
+ allow_credentials=True,
84
+ allow_methods=["*"],
85
+ allow_headers=["*"],
86
+ )
87
+
88
+ # Include routers
89
+ app.include_router(router)
90
+
91
+
92
@app.get("/", tags=["root"])
async def root() -> dict:
    """Landing endpoint: service name, version, and docs location."""
    info = {
        "message": "Fraud Detection API",
        "version": settings.app_version,
        "docs": "/docs",
    }
    return info
100
+
101
+
102
if __name__ == "__main__":
    import uvicorn

    # Development entry point; auto-reload only when settings.debug is set.
    uvicorn.run(
        "main:app",
        host=settings.api_host,
        port=settings.api_port,
        reload=settings.debug,
    )
111
+
112
+
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.128.0
2
+ gradio==6.3.0
3
+ langchain_chroma==1.1.0
4
+ langchain_community==0.4.1
5
+ langchain_core==1.2.7
6
+ langchain_groq==1.1.1
7
+ langchain_text_splitters==1.1.0
8
+ pandas==2.3.3
9
+ pydantic==2.12.5
10
+ pydantic_settings==2.12.0
11
+ uvicorn==0.40.0
12
+ python-dotenv==1.0.1
13
+ pypdf==5.1.0
14
+ sentence-transformers==3.3.1
15
+ huggingface-hub>=0.27.0
16
+ httpx==0.28.1
17
+ loguru==0.7.3
src/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Fraud detection application package."""
2
+
3
+ __version__ = "1.0.0"
4
+
5
+
src/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (234 Bytes). View file
 
src/api/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """API routes module."""
2
+
3
+
4
+
src/api/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (193 Bytes). View file
 
src/api/__pycache__/routes.cpython-311.pyc ADDED
Binary file (5.69 kB). View file
 
src/api/routes.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """API routes for fraud detection."""
2
+
3
+ import logging
4
+ from typing import Dict, List
5
+
6
+ from fastapi import APIRouter, HTTPException, status
7
+
8
+ from src.schemas.fraud import (
9
+ FraudAnalysisRequest,
10
+ FraudAnalysisResponse,
11
+ TransactionSummary,
12
+ )
13
+ from src.services.fraud_analyzer import FraudAnalyzer
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ router = APIRouter(prefix="/api/v1", tags=["fraud"])
18
+
19
+ # Initialize services (in production, use dependency injection)
20
+ fraud_analyzer = FraudAnalyzer()
21
+
22
+
23
@router.get("/health", summary="Health check")
async def health_check() -> Dict[str, str]:
    """Report liveness of the fraud-detection API."""
    payload: Dict[str, str] = {
        "status": "healthy",
        "service": "fraud-detection-api",
    }
    return payload
27
+
28
+
29
@router.post(
    "/analyze",
    response_model=FraudAnalysisResponse,
    status_code=status.HTTP_200_OK,
    summary="Analyze transaction for fraud",
)
async def analyze_transaction(request: FraudAnalysisRequest) -> FraudAnalysisResponse:
    """Analyze a transaction for fraud indicators.

    Args:
        request: Fraud analysis request; must carry either a transaction_id
            or inline transaction_data.

    Returns:
        Fraud analysis response with detailed assessment.

    Raises:
        HTTPException: 400 on missing/invalid input, 500 on internal errors.
    """
    # FIX: validate outside the try block and compare against None —
    # `not request.transaction_id` wrongly rejected a valid ID of 0, and the
    # 400 raised inside the try was swallowed by `except Exception` below
    # and returned to the client as a 500.
    if request.transaction_id is None and request.transaction_data is None:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Either transaction_id or transaction_data must be provided",
        )

    try:
        result = fraud_analyzer.analyze_transaction(
            transaction_id=request.transaction_id,
            # Pydantic v2: .dict() is deprecated in favor of .model_dump().
            transaction_data=request.transaction_data.model_dump() if request.transaction_data else None,
            use_rag=request.use_rag,
        )

        return FraudAnalysisResponse(**result)
    except ValueError as e:
        # Domain validation errors (e.g. unknown transaction ID) map to 400.
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=str(e),
        )
    except Exception as e:
        logger.error(f"Error analyzing transaction: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        )
69
+
70
+
71
@router.get(
    "/summary",
    response_model=TransactionSummary,
    summary="Get transaction summary",
)
async def get_summary() -> TransactionSummary:
    """Return aggregate statistics for the fraud dataset.

    Returns:
        Transaction summary with statistics.

    Raises:
        HTTPException: 500 if the underlying data processor fails.
    """
    try:
        stats = fraud_analyzer.data_processor.get_transaction_summary()
        return TransactionSummary(**stats)
    except Exception as e:
        logger.error(f"Error getting summary: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        )
91
+
92
+
93
@router.post(
    "/batch-analyze",
    response_model=List[FraudAnalysisResponse],
    summary="Batch analyze multiple transactions",
)
async def batch_analyze(
    transaction_ids: List[int],
    use_rag: bool = True,
) -> List[FraudAnalysisResponse]:
    """Analyze multiple transactions in batch.

    Args:
        transaction_ids: List of transaction IDs to analyze.
        use_rag: Whether to use RAG for context.

    Returns:
        List of fraud analysis responses.

    Raises:
        HTTPException: 400 when no IDs are supplied, 500 on internal errors.
    """
    # FIX: validate outside the try block — the 400 raised inside the try
    # was caught by `except Exception` below and re-emitted as a 500.
    if not transaction_ids:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="At least one transaction_id must be provided",
        )

    try:
        results = fraud_analyzer.batch_analyze(transaction_ids, use_rag=use_rag)
        return [FraudAnalysisResponse(**result) for result in results]
    except Exception as e:
        logger.error(f"Error in batch analysis: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        )
126
+
src/config/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """Configuration module."""
2
+
3
+ from src.config.config import settings
4
+
5
+ __all__ = ["settings"]
6
+
7
+
8
+
src/config/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (295 Bytes). View file
 
src/config/__pycache__/config.cpython-311.pyc ADDED
Binary file (2.01 kB). View file
 
src/config/config.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Configuration module for the fraud detection application."""

import os
from pathlib import Path
from typing import Optional

from pydantic_settings import BaseSettings


class Settings(BaseSettings):
    """Application settings.

    Values are loaded from the environment and the ``.env`` file by
    pydantic-settings; the class attributes below are the defaults.
    """

    # Groq API Configuration
    max_tokens: int = 8192
    # NOTE(review): the os.getenv default is redundant — BaseSettings already
    # reads GROQ_API_KEY from the environment/.env (case-insensitive). A plain
    # `groq_api_key: str = ""` should behave identically; confirm before changing.
    groq_api_key: str = os.getenv("GROQ_API_KEY", "")
    groq_model: str = "meta-llama/llama-4-maverick-17b-128e-instruct"

    # Application Configuration
    app_name: str = "Fraud Detection API"
    app_version: str = "1.0.0"
    debug: bool = False

    # Data Paths
    data_dir: Path = Path("data")
    train_data_path: Path = data_dir / "fraudTrain.csv"
    pdf_dir: Path = data_dir  # PDFs live alongside the CSVs

    # RAG Configuration
    chunk_size: int = 1000
    chunk_overlap: int = 200
    vector_store_path: Optional[str] = None  # Will use in-memory by default

    # API Configuration
    api_host: str = "localhost"
    api_port: int = 8000

    class Config:
        """Pydantic config.

        NOTE(review): the inner ``Config`` class is the pydantic v1 style and
        is deprecated under pydantic-settings 2.x; consider migrating to
        ``model_config = SettingsConfigDict(env_file=".env", case_sensitive=False)``.
        """

        env_file = ".env"
        case_sensitive = False


# Module-level singleton imported throughout the application.
settings = Settings()
45
+
46
+
src/data/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """Data processing module."""
2
+
3
+ from src.data.processor import FraudDataProcessor
4
+
5
+ __all__ = ["FraudDataProcessor"]
6
+
7
+
8
+
src/data/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (307 Bytes). View file
 
src/data/__pycache__/processor.cpython-311.pyc ADDED
Binary file (6.16 kB). View file
 
src/data/processor.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Data processor for fraud detection datasets."""

import logging
from pathlib import Path
from typing import Dict, List, Optional

import pandas as pd

from src.config.config import settings

logger = logging.getLogger(__name__)


class FraudDataProcessor:
    """Processor for fraud detection data.

    Loads the fraudTrain.csv dataset lazily and answers per-transaction
    and whole-dataset summary queries.
    """

    def __init__(self) -> None:
        """Initialize data processor."""
        # Lazily populated by load_train_data(); None until first load.
        self.train_df: Optional[pd.DataFrame] = None

    def load_train_data(self, path: Optional[Path] = None) -> pd.DataFrame:
        """Load training data.

        Args:
            path: Path to training data CSV. If None, uses default path.

        Returns:
            Training dataframe.

        Raises:
            FileNotFoundError: If no CSV exists at the resolved path.
        """
        data_path = path or settings.train_data_path

        if not data_path.exists():
            raise FileNotFoundError(f"Training data not found: {data_path}")

        try:
            logger.info(f"Loading training data from {data_path}")
            # Load full dataset for accurate statistics
            self.train_df = pd.read_csv(data_path)

            # Clean merchant names (remove 'fraud_' prefix common in synthetic datasets)
            if 'merchant' in self.train_df.columns:
                self.train_df['merchant'] = self.train_df['merchant'].str.replace('fraud_', '', regex=False)

            logger.info(f"Loaded {len(self.train_df)} rows from training data (merchant names cleaned)")
            return self.train_df
        except Exception as e:
            logger.error(f"Error loading training data: {str(e)}")
            raise

    def get_transaction_summary(self, transaction_id: Optional[int] = None) -> Dict:
        """Get summary of a transaction or all transactions.

        Args:
            transaction_id: Optional transaction ID. If None, returns overall summary.

        Returns:
            Transaction summary dictionary: a single row as a dict when an ID
            is given, otherwise aggregate dataset statistics.

        Raises:
            ValueError: If the requested transaction ID is not in the dataset.
        """
        if self.train_df is None:
            # Lazy-load on first use.
            self.load_train_data()

        df = self.train_df

        if transaction_id is not None:
            # Transaction IDs are dataframe row indices, not a dataset column.
            transaction = df[df.index == transaction_id]
            if transaction.empty:
                raise ValueError(f"Transaction {transaction_id} not found")

            return transaction.iloc[0].to_dict()

        # Overall summary
        summary = {
            "total_transactions": len(df),
            "fraud_count": int(df["is_fraud"].sum()),
            "fraud_percentage": float(df["is_fraud"].mean() * 100),
            "total_amount": float(df["amt"].sum()),
            "average_amount": float(df["amt"].mean()),
            "categories": df["category"].value_counts().to_dict(),
        }

        return summary

    def format_transaction_for_llm(self, transaction: Dict) -> str:
        """Format a transaction dictionary for LLM analysis.

        Args:
            transaction: Transaction dictionary.

        Returns:
            Formatted string representation.
        """
        # NOTE: .strip() only trims the outer blank edges of the literal;
        # interior leading whitespace (if any) is part of the output.
        formatted = f"""
        Transaction Details:
        - Date/Time: {transaction.get('trans_date_trans_time', 'N/A')}
        - Merchant: {str(transaction.get('merchant', 'N/A')).replace('fraud_', '')}
        - Category: {transaction.get('category', 'N/A')}
        - Amount: ${transaction.get('amt', 'N/A')}
        - Customer: {transaction.get('first', 'N/A')} {transaction.get('last', 'N/A')}
        - Gender: {transaction.get('gender', 'N/A')}
        - Location: {transaction.get('city', 'N/A')}, {transaction.get('state', 'N/A')}
        - Job: {transaction.get('job', 'N/A')}
        - City Population: {transaction.get('city_pop', 'N/A')}
        - Distance from Merchant: Calculated from coordinates
        """
        return formatted.strip()
106
+
107
+
108
+
src/llm/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """LLM integration module."""
2
+
3
+ from src.llm.groq_client import GroqClient
4
+
5
+ __all__ = ["GroqClient"]
6
+
7
+
8
+
src/llm/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (298 Bytes). View file
 
src/llm/__pycache__/groq_client.cpython-311.pyc ADDED
Binary file (3.69 kB). View file
 
src/llm/groq_client.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Groq LLM client using LangChain."""
2
+
3
+ import logging
4
+ from typing import Any, List, Optional
5
+
6
+ from langchain_groq import ChatGroq
7
+ from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
8
+ from langchain_core.output_parsers import StrOutputParser
9
+
10
+ from src.config.config import settings
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ class GroqClient:
16
+ """Client for interacting with Groq LLM using LangChain."""
17
+
18
+ def __init__(
19
+ self,
20
+ api_key: Optional[str] = None,
21
+ model_name: Optional[str] = None,
22
+ temperature: float = 0,
23
+
24
+ ) -> None:
25
+ """Initialize Groq client.
26
+
27
+ Args:
28
+ api_key: Groq API key. If None, uses settings.groq_api_key.
29
+ model_name: Model name. If None, uses settings.groq_model.
30
+ temperature: Temperature for model generation.
31
+ """
32
+ self.api_key = api_key or settings.groq_api_key
33
+ self.model_name = model_name or settings.groq_model
34
+ self.temperature = temperature
35
+ self.max_tokens = settings.max_tokens
36
+
37
+ if not self.api_key:
38
+ raise ValueError("Groq API key is required. Set GROQ_API_KEY environment variable.")
39
+
40
+ self.llm = ChatGroq(
41
+ groq_api_key=self.api_key,
42
+ model_name=self.model_name,
43
+ temperature=self.temperature,
44
+ max_tokens=self.max_tokens,
45
+ )
46
+ self.output_parser = StrOutputParser()
47
+
48
+ logger.info(f"Initialized Groq client with model: {self.model_name}")
49
+
50
+ def invoke(
51
+ self,
52
+ prompt: str,
53
+ system_message: Optional[str] = None,
54
+ **kwargs: Any,
55
+ ) -> str:
56
+ """Invoke the LLM with a prompt.
57
+
58
+ Args:
59
+ prompt: User prompt.
60
+ system_message: Optional system message.
61
+ **kwargs: Additional arguments to pass to the LLM.
62
+
63
+ Returns:
64
+ Generated response as string.
65
+ """
66
+ messages: List[BaseMessage] = []
67
+
68
+ if system_message:
69
+ messages.append(SystemMessage(content=system_message))
70
+
71
+ messages.append(HumanMessage(content=prompt))
72
+
73
+ try:
74
+ response = self.llm.invoke(messages, **kwargs)
75
+ return self.output_parser.parse(response.content)
76
+ except Exception as e:
77
+ logger.error(f"Error invoking LLM: {str(e)}")
78
+ raise
79
+
80
+
81
+
src/rag/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ """RAG (Retrieval Augmented Generation) module."""
2
+
3
+ from src.rag.document_loader import DocumentLoader
4
+ from src.rag.vector_store import VectorStore
5
+
6
+ __all__ = ["DocumentLoader", "VectorStore"]
7
+
8
+
9
+
src/rag/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (409 Bytes). View file
 
src/rag/__pycache__/csv_document_generator.cpython-311.pyc ADDED
Binary file (14.2 kB). View file
 
src/rag/__pycache__/document_loader.cpython-311.pyc ADDED
Binary file (5.91 kB). View file
 
src/rag/__pycache__/vector_store.cpython-311.pyc ADDED
Binary file (4.88 kB). View file
 
src/rag/csv_document_generator.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CSV document generator for RAG system."""
2
+
3
+ import logging
4
+ from pathlib import Path
5
from typing import Any, Dict, List, Optional
6
+ import pandas as pd
7
+ from langchain_core.documents import Document
8
+
9
logger = logging.getLogger(__name__)


class CSVDocumentGenerator:
    """Generate knowledge-base documents from transaction CSV data.

    Each ``generate_*`` method aggregates the loaded DataFrame along one
    axis (category, amount range, merchant, state) and renders the result
    as LangChain ``Document`` objects for the RAG vector store.
    """

    def __init__(self, csv_path: Path, sample_size: int = 1050000) -> None:
        """Initialize CSV document generator.

        Args:
            csv_path: Path to the CSV file.
            sample_size: Maximum number of rows read from the CSV
                (caps memory use on very large files).
        """
        self.csv_path = Path(csv_path)
        self.sample_size = sample_size
        # Loaded lazily by load_data(); None until then.
        self.df: Optional[pd.DataFrame] = None

    @staticmethod
    def _tier(rate: float, high: float, medium: float,
              top: str, mid: str, low: str) -> str:
        """Map a fraud-rate percentage to one of three labels by threshold."""
        if rate > high:
            return top
        if rate > medium:
            return mid
        return low

    def load_data(self) -> None:
        """Load CSV data with sampling for efficiency.

        Raises:
            FileNotFoundError: If the CSV file does not exist.
        """
        if not self.csv_path.exists():
            raise FileNotFoundError(f"CSV file not found: {self.csv_path}")

        try:
            logger.info(f"Loading CSV data from {self.csv_path}")
            # nrows caps how much of the (potentially huge) file is read.
            self.df = pd.read_csv(self.csv_path, nrows=self.sample_size)

            # Clean merchant names (remove 'fraud_' prefix common in synthetic datasets)
            if 'merchant' in self.df.columns:
                self.df['merchant'] = self.df['merchant'].str.replace('fraud_', '', regex=False)

            logger.info(f"Loaded {len(self.df)} rows from CSV (merchant names cleaned)")
        except Exception as e:
            logger.error(f"Error loading CSV: {str(e)}")
            raise

    def generate_fraud_pattern_documents(self) -> List[Document]:
        """Generate documents about fraud patterns by category.

        Returns:
            List of documents containing fraud pattern insights.
        """
        if self.df is None:
            self.load_data()

        documents = []

        # Fraud counts/rates per spending category.
        category_fraud = self.df.groupby('category').agg({
            'is_fraud': ['sum', 'mean', 'count']
        }).round(4)

        for category in category_fraud.index:
            fraud_count = int(category_fraud.loc[category, ('is_fraud', 'sum')])
            fraud_rate = float(category_fraud.loc[category, ('is_fraud', 'mean')] * 100)
            total_txns = int(category_fraud.loc[category, ('is_fraud', 'count')])

            content = f"""Fraud Pattern Analysis - Category: {category}

Based on historical transaction data analysis:

- Total Transactions: {total_txns:,}
- Fraud Cases: {fraud_count:,}
- Fraud Rate: {fraud_rate:.2f}%
- Risk Level: {self._tier(fraud_rate, 5, 1, 'HIGH', 'MEDIUM', 'LOW')}

This category shows {self._tier(fraud_rate, 5, 1, 'significant', 'moderate', 'low')} fraud activity in the historical dataset.
"""
            documents.append(Document(
                page_content=content,
                metadata={
                    "source": "fraudTrain.csv",
                    "type": "fraud_pattern",
                    "category": category,
                    "fraud_rate": fraud_rate
                }
            ))

        logger.info(f"Generated {len(documents)} category fraud pattern documents")
        return documents

    def generate_statistical_summaries(self) -> List[Document]:
        """Generate statistical summary documents.

        Returns:
            List of documents containing statistical insights.
        """
        if self.df is None:
            self.load_data()

        documents = []

        # Overall statistics
        total_txns = len(self.df)
        fraud_txns = int(self.df['is_fraud'].sum())
        fraud_rate = float(self.df['is_fraud'].mean() * 100)
        avg_amount = float(self.df['amt'].mean())
        fraud_avg_amount = float(self.df[self.df['is_fraud'] == 1]['amt'].mean())
        legit_avg_amount = float(self.df[self.df['is_fraud'] == 0]['amt'].mean())

        overall_summary = f"""Overall Fraud Detection Statistics

Dataset Summary:
- Total Transactions Analyzed: {total_txns:,}
- Fraudulent Transactions: {fraud_txns:,}
- Overall Fraud Rate: {fraud_rate:.2f}%
- Average Transaction Amount: ${avg_amount:.2f}
- Average Fraud Amount: ${fraud_avg_amount:.2f}
- Average Legitimate Amount: ${legit_avg_amount:.2f}

Key Insight: Fraudulent transactions have an average amount of ${fraud_avg_amount:.2f} compared to ${legit_avg_amount:.2f} for legitimate transactions.
"""
        documents.append(Document(
            page_content=overall_summary,
            metadata={
                "source": "fraudTrain.csv",
                "type": "statistical_summary",
                "scope": "overall"
            }
        ))

        # Amount range analysis. Binning is done on a temporary Series so
        # self.df is NOT mutated (previously an 'amount_range' column was
        # silently added to the loaded frame as a side effect).
        amount_bins = [0, 10, 50, 100, 500, 1000, float('inf')]
        amount_labels = ['$0-10', '$10-50', '$50-100', '$100-500', '$500-1000', '$1000+']
        amount_range = pd.cut(self.df['amt'], bins=amount_bins, labels=amount_labels)

        amount_fraud = self.df.groupby(amount_range, observed=True).agg({
            'is_fraud': ['sum', 'mean', 'count']
        }).round(4)

        amount_content = "Fraud Patterns by Transaction Amount\n\n"
        for amt_range in amount_labels:
            if amt_range in amount_fraud.index:
                fraud_count = int(amount_fraud.loc[amt_range, ('is_fraud', 'sum')])
                fraud_rate = float(amount_fraud.loc[amt_range, ('is_fraud', 'mean')] * 100)
                total = int(amount_fraud.loc[amt_range, ('is_fraud', 'count')])

                amount_content += f"""
Amount Range: {amt_range}
- Total Transactions: {total:,}
- Fraud Cases: {fraud_count:,}
- Fraud Rate: {fraud_rate:.2f}%
"""

        documents.append(Document(
            page_content=amount_content,
            metadata={
                "source": "fraudTrain.csv",
                "type": "statistical_summary",
                "scope": "amount_analysis"
            }
        ))

        logger.info(f"Generated {len(documents)} statistical summary documents")
        return documents

    def generate_merchant_profiles(self) -> List[Document]:
        """Generate merchant risk profile documents.

        Returns:
            List of documents containing merchant insights.
        """
        if self.df is None:
            self.load_data()

        documents = []

        # Per-merchant fraud stats plus average ticket size.
        merchant_stats = self.df.groupby('merchant').agg({
            'is_fraud': ['sum', 'mean', 'count'],
            'amt': 'mean'
        }).round(4)

        # Get top 20 merchants by volume
        top_merchants = merchant_stats.nlargest(20, ('is_fraud', 'count'))

        for merchant in top_merchants.index:
            fraud_count = int(top_merchants.loc[merchant, ('is_fraud', 'sum')])
            fraud_rate = float(top_merchants.loc[merchant, ('is_fraud', 'mean')] * 100)
            total_txns = int(top_merchants.loc[merchant, ('is_fraud', 'count')])
            avg_amt = float(top_merchants.loc[merchant, ('amt', 'mean')])

            content = f"""Merchant Risk Profile: {merchant}

Transaction Analysis:
- Total Transactions: {total_txns:,}
- Fraudulent Transactions: {fraud_count:,}
- Fraud Rate: {fraud_rate:.2f}%
- Average Transaction Amount: ${avg_amt:.2f}
- Risk Assessment: {self._tier(fraud_rate, 10, 5, 'HIGH RISK', 'MEDIUM RISK', 'LOW RISK')}

This merchant profile is based on historical transaction patterns and can help identify similar fraud patterns.
"""
            documents.append(Document(
                page_content=content,
                metadata={
                    "source": "fraudTrain.csv",
                    "type": "merchant_profile",
                    "merchant": merchant,
                    "fraud_rate": fraud_rate
                }
            ))

        logger.info(f"Generated {len(documents)} merchant profile documents")
        return documents

    def generate_location_insights(self) -> List[Document]:
        """Generate location-based fraud insights.

        Returns:
            List of documents containing location insights.
        """
        if self.df is None:
            self.load_data()

        documents = []

        # State-level analysis
        state_fraud = self.df.groupby('state').agg({
            'is_fraud': ['sum', 'mean', 'count']
        }).round(4)

        # Get top 15 states by transaction volume
        top_states = state_fraud.nlargest(15, ('is_fraud', 'count'))

        for state in top_states.index:
            fraud_count = int(top_states.loc[state, ('is_fraud', 'sum')])
            fraud_rate = float(top_states.loc[state, ('is_fraud', 'mean')] * 100)
            total_txns = int(top_states.loc[state, ('is_fraud', 'count')])

            content = f"""Geographic Fraud Analysis - State: {state}

Location-based Fraud Patterns:
- Total Transactions: {total_txns:,}
- Fraud Cases: {fraud_count:,}
- Fraud Rate: {fraud_rate:.2f}%
- Geographic Risk Level: {self._tier(fraud_rate, 5, 2, 'HIGH', 'MEDIUM', 'LOW')}

This geographic area shows {self._tier(fraud_rate, 5, 2, 'elevated', 'moderate', 'normal')} fraud activity levels.
"""
            documents.append(Document(
                page_content=content,
                metadata={
                    "source": "fraudTrain.csv",
                    "type": "location_insight",
                    "state": state,
                    "fraud_rate": fraud_rate
                }
            ))

        logger.info(f"Generated {len(documents)} location insight documents")
        return documents

    def generate_all_documents(self) -> List[Document]:
        """Generate all types of documents from CSV data.

        Returns:
            List of all generated documents.
        """
        all_documents = []

        logger.info("Generating all document types from CSV data...")

        all_documents.extend(self.generate_fraud_pattern_documents())
        all_documents.extend(self.generate_statistical_summaries())
        all_documents.extend(self.generate_merchant_profiles())
        all_documents.extend(self.generate_location_insights())

        logger.info(f"Generated total of {len(all_documents)} documents from CSV data")
        return all_documents
src/rag/document_loader.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Document loader for PDF files."""
2
+
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import List
6
+
7
+ from langchain_community.document_loaders import PyPDFLoader
8
+ from langchain_core.documents import Document
9
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
10
+
11
+ from src.config.config import settings
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class DocumentLoader:
    """Turns PDF files (and CSV-derived insights) into chunked documents."""

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ) -> None:
        """Initialize document loader.

        Args:
            chunk_size: Size of text chunks.
            chunk_overlap: Overlap between chunks.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Recursive splitter keeps chunks near chunk_size characters while
        # preferring natural break points.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )

    def load_pdf(self, pdf_path: Path) -> List[Document]:
        """Load a PDF file and split it into chunks.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            List of document chunks.

        Raises:
            FileNotFoundError: If the PDF file does not exist.
        """
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        try:
            logger.info(f"Loading PDF: {pdf_path}")
            pages = PyPDFLoader(str(pdf_path)).load()
            chunks = self.text_splitter.split_documents(pages)
            logger.info(f"Loaded {len(chunks)} chunks from {pdf_path}")
            return chunks
        except Exception as e:
            logger.error(f"Error loading PDF {pdf_path}: {str(e)}")
            raise

    def load_pdfs_from_directory(self, directory: Path) -> List[Document]:
        """Load all PDF files from a directory.

        Files that fail to load are logged and skipped so one bad PDF does
        not abort the whole batch.

        Args:
            directory: Directory containing PDF files.

        Returns:
            List of document chunks from all PDFs.

        Raises:
            FileNotFoundError: If the directory does not exist.
        """
        if not directory.exists():
            raise FileNotFoundError(f"Directory not found: {directory}")

        pdf_files = list(directory.glob("*.pdf"))
        if not pdf_files:
            logger.warning(f"No PDF files found in {directory}")
            return []

        all_chunks: List[Document] = []
        for pdf_path in pdf_files:
            try:
                all_chunks.extend(self.load_pdf(pdf_path))
            except Exception as e:
                logger.error(f"Failed to load {pdf_path}: {str(e)}")

        logger.info(f"Loaded {len(all_chunks)} total chunks from {len(pdf_files)} PDFs")
        return all_chunks

    def load_csv_insights(self, csv_path: Path, sample_size: int = 1050000) -> List[Document]:
        """Load insights from CSV file and convert to documents.

        Args:
            csv_path: Path to CSV file.
            sample_size: Number of rows to sample from CSV.

        Returns:
            List of documents generated from CSV insights.
        """
        try:
            # Imported lazily so PDF-only usage does not need this module.
            from src.rag.csv_document_generator import CSVDocumentGenerator

            logger.info(f"Loading CSV insights from {csv_path}")
            generator = CSVDocumentGenerator(csv_path, sample_size=sample_size)
            documents = generator.generate_all_documents()
            logger.info(f"Generated {len(documents)} documents from CSV insights")
            return documents
        except Exception as e:
            logger.error(f"Error loading CSV insights: {str(e)}")
            raise
115
+
116
+
117
+
src/rag/vector_store.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Vector store for document embeddings."""
2
+
3
+ import logging
4
+ from typing import List, Optional
5
+
6
+ from langchain_core.documents import Document
7
+ from langchain_chroma import Chroma
8
+ from langchain_community.embeddings import HuggingFaceEmbeddings
9
+ from langchain_core.retrievers import BaseRetriever
10
+
11
+ from src.config.config import settings
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class VectorStore:
    """Vector store for document embeddings and retrieval."""

    def __init__(
        self,
        embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        persist_directory: Optional[str] = None,
    ) -> None:
        """Initialize vector store.

        Args:
            embedding_model: Name of the embedding model.
            persist_directory: Directory to persist the vector store.
        """
        self.embedding_model = embedding_model
        self.persist_directory = persist_directory or settings.vector_store_path

        # CPU-only embedding backend so the store runs without a GPU.
        self.embeddings = HuggingFaceEmbeddings(
            model_name=embedding_model,
            model_kwargs={"device": "cpu"},
        )

        # Both populated lazily on the first add_documents() call.
        self.vector_store: Optional[Chroma] = None
        self.retriever: Optional[BaseRetriever] = None

    def add_documents(self, documents: List[Document]) -> None:
        """Add documents to the vector store.

        Creates the backing Chroma store on first use, then appends to it;
        also (re)builds the retriever after every successful add.

        Args:
            documents: List of documents to add.
        """
        if not documents:
            logger.warning("No documents to add")
            return

        try:
            if self.vector_store is None:
                # First call: build a fresh persisted store from the batch.
                self.vector_store = Chroma.from_documents(
                    documents=documents,
                    embedding=self.embeddings,
                    persist_directory=self.persist_directory,
                )
            else:
                self.vector_store.add_documents(documents)

            # Refresh the retriever so it sees the new documents.
            self.retriever = self.vector_store.as_retriever(
                search_kwargs={"k": 5}
            )

            logger.info(f"Added {len(documents)} documents to vector store")
        except Exception as e:
            logger.error(f"Error adding documents to vector store: {str(e)}")
            raise

    def similarity_search(
        self,
        query: str,
        k: int = 5,
    ) -> List[Document]:
        """Search for similar documents.

        Args:
            query: Search query.
            k: Number of results to return.

        Returns:
            List of similar documents.

        Raises:
            ValueError: If no documents have been added yet.
        """
        if self.vector_store is None:
            raise ValueError("Vector store not initialized. Add documents first.")

        try:
            results = self.vector_store.similarity_search(query, k=k)
            logger.info(f"Found {len(results)} similar documents for query: {query[:50]}...")
            return results
        except Exception as e:
            logger.error(f"Error in similarity search: {str(e)}")
            raise

    def get_retriever(self) -> BaseRetriever:
        """Get the retriever for RAG.

        Returns:
            Base retriever instance.

        Raises:
            ValueError: If no documents have been added yet.
        """
        if self.retriever is None:
            raise ValueError("Retriever not initialized. Add documents first.")

        return self.retriever
109
+
110
+
111
+
src/schemas/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pydantic schemas for API."""
2
+
3
+ from src.schemas.fraud import (
4
+ FraudAnalysisRequest,
5
+ FraudAnalysisResponse,
6
+ TransactionData,
7
+ TransactionSummary,
8
+ )
9
+
10
+ __all__ = [
11
+ "FraudAnalysisRequest",
12
+ "FraudAnalysisResponse",
13
+ "TransactionData",
14
+ "TransactionSummary",
15
+ ]
16
+
17
+
18
+
src/schemas/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (448 Bytes). View file
 
src/schemas/__pycache__/fraud.cpython-311.pyc ADDED
Binary file (3.45 kB). View file
 
src/schemas/fraud.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pydantic schemas for fraud detection."""
2
+
3
+ from typing import Dict, Optional
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
class TransactionData(BaseModel):
    """Transaction data schema.

    Field names mirror the transaction record keys used by the data
    processor (trans_date_trans_time, merchant, amt, ...). All fields are
    optional so a caller may supply only the attributes it has.
    """

    trans_date_trans_time: Optional[str] = None  # transaction timestamp as text
    merchant: Optional[str] = None  # merchant name (raw data may carry a 'fraud_' prefix)
    category: Optional[str] = None  # spending category
    amt: Optional[float] = None  # transaction amount
    first: Optional[str] = None  # customer first name
    last: Optional[str] = None  # customer last name
    gender: Optional[str] = None  # customer gender
    city: Optional[str] = None  # customer city
    state: Optional[str] = None  # customer state
    job: Optional[str] = None  # customer occupation
    city_pop: Optional[int] = None  # population of the customer's city
22
+
23
+
24
class TransactionSummary(BaseModel):
    """Transaction summary schema.

    Aggregate statistics over the loaded transaction dataset; matches the
    summary dict produced by the data processor.
    """

    total_transactions: int  # number of rows analyzed
    fraud_count: int  # count of rows with is_fraud == 1
    fraud_percentage: float  # mean of is_fraud * 100
    total_amount: float  # sum of transaction amounts
    average_amount: float  # mean transaction amount
    categories: Dict[str, int]  # transaction count per category
33
+
34
+
35
class FraudAnalysisRequest(BaseModel):
    """Request schema for fraud analysis.

    Either `transaction_id` (a row index into the loaded dataset) or
    `transaction_data` (an inline payload) identifies the transaction;
    presumably exactly one of the two should be set -- confirm against the
    route handler.
    """

    transaction_id: Optional[int] = Field(None, description="Transaction ID from dataset")
    transaction_data: Optional[TransactionData] = Field(None, description="Direct transaction data")
    use_rag: bool = Field(True, description="Whether to use RAG for context")

    class Config:
        """Pydantic config."""

        # Example payload surfaced in the generated OpenAPI/JSON schema.
        json_schema_extra = {
            "example": {
                "transaction_id": 0,
                "use_rag": True,
            }
        }
51
+
52
+
53
class FraudAnalysisResponse(BaseModel):
    """Response schema for fraud analysis."""

    transaction: Dict  # the raw transaction that was analyzed
    analysis: str  # LLM-generated analysis text
    formatted_transaction: str  # human-readable rendering of the transaction
    success: bool = True  # whether the analysis succeeded
    error: Optional[str] = None  # error message on failure, else None
61
+
62
+
src/services/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """Services module."""
2
+
3
+ from src.services.fraud_analyzer import FraudAnalyzer
4
+
5
+ __all__ = ["FraudAnalyzer"]
6
+
7
+
src/services/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (308 Bytes). View file
 
src/services/__pycache__/fraud_analyzer.cpython-311.pyc ADDED
Binary file (11.4 kB). View file