ahmzakif commited on
Commit
fd99b61
·
verified ·
1 Parent(s): c8aeeaa

feat: add new project

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +37 -0
  2. .gitattributes +4 -0
  3. .gitignore +58 -0
  4. .gradio/certificate.pem +31 -0
  5. Dockerfile +34 -0
  6. LICENSE +201 -0
  7. QUICKSTART.md +135 -0
  8. README.md +354 -13
  9. TECHNICAL_ASSESSMENT.md +645 -0
  10. app.py +757 -0
  11. data/Bhatla.pdf +3 -0
  12. data/EBA_ECB 2024 Report on Payment Fraud.pdf +3 -0
  13. data/fraudTest.csv +3 -0
  14. data/fraudTrain.csv +3 -0
  15. docker-compose.yml +27 -0
  16. main.py +112 -0
  17. requirements.txt +17 -0
  18. src/__init__.py +5 -0
  19. src/__pycache__/__init__.cpython-311.pyc +0 -0
  20. src/api/__init__.py +4 -0
  21. src/api/__pycache__/__init__.cpython-311.pyc +0 -0
  22. src/api/__pycache__/routes.cpython-311.pyc +0 -0
  23. src/api/routes.py +126 -0
  24. src/config/__init__.py +8 -0
  25. src/config/__pycache__/__init__.cpython-311.pyc +0 -0
  26. src/config/__pycache__/config.cpython-311.pyc +0 -0
  27. src/config/config.py +46 -0
  28. src/data/__init__.py +8 -0
  29. src/data/__pycache__/__init__.cpython-311.pyc +0 -0
  30. src/data/__pycache__/processor.cpython-311.pyc +0 -0
  31. src/data/processor.py +108 -0
  32. src/llm/__init__.py +8 -0
  33. src/llm/__pycache__/__init__.cpython-311.pyc +0 -0
  34. src/llm/__pycache__/groq_client.cpython-311.pyc +0 -0
  35. src/llm/groq_client.py +81 -0
  36. src/rag/__init__.py +9 -0
  37. src/rag/__pycache__/__init__.cpython-311.pyc +0 -0
  38. src/rag/__pycache__/csv_document_generator.cpython-311.pyc +0 -0
  39. src/rag/__pycache__/document_loader.cpython-311.pyc +0 -0
  40. src/rag/__pycache__/vector_store.cpython-311.pyc +0 -0
  41. src/rag/csv_document_generator.py +278 -0
  42. src/rag/document_loader.py +117 -0
  43. src/rag/vector_store.py +111 -0
  44. src/schemas/__init__.py +18 -0
  45. src/schemas/__pycache__/__init__.cpython-311.pyc +0 -0
  46. src/schemas/__pycache__/fraud.cpython-311.pyc +0 -0
  47. src/schemas/fraud.py +62 -0
  48. src/services/__init__.py +7 -0
  49. src/services/__pycache__/__init__.cpython-311.pyc +0 -0
  50. src/services/__pycache__/fraud_analyzer.cpython-311.pyc +0 -0
.dockerignore ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ env/
8
+ build/
9
+ develop-eggs/
10
+ dist/
11
+ downloads/
12
+ eggs/
13
+ .eggs/
14
+ lib/
15
+ lib64/
16
+ parts/
17
+ sdist/
18
+ var/
19
+ wheels/
20
+ *.egg-info/
21
+ .installed.cfg
22
+ *.egg
23
+ venv/
24
+ .env
25
+
26
+ # Project Specific
27
+ logs/
28
+ chroma_db/
29
+ vector_store/
30
+ .vscode/
31
+ .idea/
32
+ .git/
33
+ .gitignore
34
+
35
+ # Large data files (handled via volumes in docker-compose)
36
+ data/fraudTrain.csv
37
+ data/fraudTest.csv
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/Bhatla.pdf filter=lfs diff=lfs merge=lfs -text
37
+ data/EBA_ECB[[:space:]]2024[[:space:]]Report[[:space:]]on[[:space:]]Payment[[:space:]]Fraud.pdf filter=lfs diff=lfs merge=lfs -text
38
+ data/fraudTest.csv filter=lfs diff=lfs merge=lfs -text
39
+ data/fraudTrain.csv filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+ .gradio/
23
+
24
+ # Virtual Environment
25
+ venv/
26
+ env/
27
+ ENV/
28
+ .venv
29
+
30
+ # IDE
31
+ .vscode/
32
+ .idea/
33
+ *.swp
34
+ *.swo
35
+ *~
36
+
37
+ # Environment variables
38
+ .env
39
+ .env.local
40
+
41
+ # Data (ignore large CSV and PDF files)
42
+ data/*.csv
43
+ data/*.pdf
44
+
45
+ # Vector store
46
+ chroma_db/
47
+ *.db
48
+
49
+ # Logs
50
+ *.log
51
+ logs/
52
+
53
+ # OS
54
+ .DS_Store
55
+ Thumbs.db
56
+
57
+
58
+
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use an official Python runtime as a parent image
2
+ FROM python:3.10-slim
3
+
4
+ # Set environment variables
5
+ ENV PYTHONDONTWRITEBYTECODE=1
6
+ ENV PYTHONUNBUFFERED=1
7
+ ENV PYTHONPATH=/app
8
+
9
+ # Set the working directory in the container
10
+ WORKDIR /app
11
+
12
+ # Install system dependencies
13
+ RUN apt-get update && apt-get install -y --no-install-recommends \
14
+ build-essential \
15
+ curl \
16
+ && rm -rf /var/lib/apt/lists/*
17
+
18
+ # Copy the requirements file into the container at /app
19
+ COPY requirements.txt .
20
+
21
+ # Install any needed packages specified in requirements.txt
22
+ RUN pip install --no-cache-dir -r requirements.txt
23
+
24
+ # Copy the rest of the application code into the container at /app
25
+ COPY . .
26
+
27
+ # Create directory for persistent vector store
28
+ RUN mkdir -p /app/chroma_db
29
+
30
+ # Expose ports for Gradio (7860) and FastAPI (8000)
31
+ EXPOSE 7860 8000
32
+
33
+ # Default command (can be overridden in docker-compose)
34
+ CMD ["python", "app.py"]
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
QUICKSTART.md ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Quick Start Guide
2
+
3
+ Panduan cepat untuk menjalankan aplikasi Fraud Detection menggunakan LangChain dan Groq.
4
+
5
+ ## Prerequisites
6
+
7
+ 1. Python 3.10 atau lebih tinggi
8
+ 2. Groq API Key (dapatkan di https://console.groq.com/)
9
+
10
+ ## Setup Cepat
11
+
12
+ ### 1. Install Dependencies
13
+
14
+ ```bash
15
+ pip install -r requirements.txt
16
+ ```
17
+
18
+ ### 2. Setup Environment Variable
19
+
20
+ Buat file `.env` di root directory:
21
+
22
+ ```env
23
+ GROQ_API_KEY=your_groq_api_key_here
24
+ ```
25
+
26
+ Atau export sebagai environment variable:
27
+
28
+ ```bash
29
+ # Windows PowerShell
30
+ $env:GROQ_API_KEY="your_groq_api_key_here"
31
+
32
+ # Linux/Mac
33
+ export GROQ_API_KEY="your_groq_api_key_here"
34
+ ```
35
+
36
+ ### 3. Jalankan Server
37
+
38
+ ```bash
39
+ python main.py
40
+ ```
41
+
42
+ Server akan berjalan di `http://localhost:8000`
43
+
44
+ ## Menggunakan API
45
+
46
+ ### 1. Melalui Browser
47
+
48
+ Buka `http://localhost:8000/docs` untuk melihat dokumentasi interaktif Swagger UI.
49
+
50
+ ### 2. Melalui cURL
51
+
52
+ #### Health Check
53
+ ```bash
54
+ curl http://localhost:8000/api/v1/health
55
+ ```
56
+
57
+ #### Analisis Transaksi
58
+ ```bash
59
+ curl -X POST "http://localhost:8000/api/v1/analyze" \
60
+ -H "Content-Type: application/json" \
61
+ -d '{
62
+ "transaction_id": 0,
63
+ "use_rag": true
64
+ }'
65
+ ```
66
+
67
+ #### Analisis dengan Data Langsung
68
+ ```bash
69
+ curl -X POST "http://localhost:8000/api/v1/analyze" \
70
+ -H "Content-Type: application/json" \
71
+ -d '{
72
+ "transaction_data": {
73
+ "merchant": "Suspicious Merchant",
74
+ "category": "grocery_pos",
75
+ "amt": 5000.00,
76
+ "city": "Jakarta",
77
+ "state": "DKI"
78
+ },
79
+ "use_rag": true
80
+ }'
81
+ ```
82
+
83
+ ### 3. Menggunakan Python Script
84
+
85
+ Jalankan contoh penggunaan:
86
+
87
+ ```bash
88
+ python test/example_usage.py
89
+ ```
90
+
91
+ ## Struktur Kode
92
+
93
+ ```
94
+ ├── main.py # FastAPI application
95
+ ├── requirements.txt # Dependencies
96
+ ├── test/example_usage.py # Contoh penggunaan
97
+ └── src/
98
+ ├── api/ # API routes
99
+ ├── config/ # Konfigurasi aplikasi
100
+ ├── data/ # Data processing
101
+ ├── llm/ # LangChain Groq integration
102
+ ├── rag/ # RAG system
103
+ ├── schemas/ # Pydantic models
104
+ └── services/ # Business logic
105
+ ```
106
+
107
+ ## Fitur Utama
108
+
109
+ 1. **LLM Integration**: Menggunakan Groq dengan LangChain
110
+ 2. **RAG System**: Menggunakan dokumen PDF sebagai konteks
111
+ 3. **RESTful API**: FastAPI dengan dokumentasi otomatis
112
+ 4. **Modular Design**: Kode yang mudah di-maintain dan di-extend
113
+
114
+ ## Troubleshooting
115
+
116
+ ### Error: "Groq API key is required"
117
+ - Pastikan `GROQ_API_KEY` sudah di-set di environment variable atau file `.env`
118
+
119
+ ### Error: "PDF file not found"
120
+ - Pastikan file PDF ada di folder `data/`
121
+ - Atau sesuaikan path di `src/config/config.py`
122
+
123
+ ### Dataset terlalu besar
124
+ - Aplikasi secara default hanya memuat sample data (10,000 rows untuk training, 1,000 untuk test)
125
+ - Untuk memuat full dataset, edit `src/data/processor.py` dan hapus parameter `nrows`
126
+
127
+ ## Next Steps
128
+
129
+ 1. Baca dokumentasi lengkap di `README.md`
130
+ 2. Explore API documentation di `http://localhost:8000/docs`
131
+ 3. Customize konfigurasi di `src/config/config.py`
132
+ 4. Extend functionality sesuai kebutuhan
133
+
134
+
135
+
README.md CHANGED
@@ -1,13 +1,354 @@
1
- ---
2
- title: Fraud Chatbot
3
- emoji: 🚀
4
- colorFrom: green
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 6.3.0
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Fraud Detection Chatbot
2
+
3
+ AI-powered fraud detection system menggunakan LangChain, Groq, dan RAG (Retrieval Augmented Generation) dengan Gradio interface dan FastAPI backend.
4
+
5
+ ## 🎯 Fitur Utama
6
+
7
+ ### 1. **Gradio Web Interface** (`app.py`)
8
+
9
+ - **Chat with Fraud Expert**: Tanya jawab interaktif dengan inline citations & **Response Quality Scoring**
10
+ - **Analyze by Transaction ID**: Analisis data historis lengkap (semua kolom CSV) berdasarkan ID
11
+ - **Analyze Manual Transaction**: Input manual transaction details, termasuk **Advanced Optional Fields** (Age, Gender, Location)
12
+ - **Dataset Summary**: Statistik lengkap dari 1.29M+ total transaksi
13
+
14
+ ### 2. **RAG System dengan Dual Data Sources**
15
+
16
+ - **PDF Documents**: Research papers tentang fraud detection
17
+ - Bhatla.pdf
18
+ - EBA_ECB 2024 Report on Payment Fraud.pdf
19
+ - **CSV Insights**: Extracted patterns dari fraudTrain.csv
20
+ - Fraud patterns by category (14 documents)
21
+ - Merchant risk profiles (20 documents)
22
+ - Location-based insights (15 documents)
23
+ - Statistical summaries (2 documents)
24
+
25
+ ### 3. **FastAPI REST API**
26
+
27
+ - RESTful endpoints dengan dokumentasi otomatis
28
+ - Batch analysis support
29
+ - CORS enabled untuk frontend integration
30
+
31
+ ### 4. **Inline Source Citations**
32
+
33
+ - LLM responses include `[Source X]` citations
34
+ - Source reference list at the end
35
+ - Transparency dan verifikasi informasi
36
+
37
+ ## 📁 Struktur Proyek
38
+
39
+ ```
40
+ .
41
+ ├── app.py # Gradio web interface (MAIN)
42
+ ├── main.py # FastAPI application
43
+ ├── requirements.txt # Dependencies
44
+ ├── README.md # Dokumentasi
45
+ ├── QUICKSTART.md # Quick start guide
46
+ ├── data/ # Data dan dokumen
47
+ │ ├── fraudTrain.csv # Training dataset (351 MB)
48
+ │ ├── fraudTest.csv # Test dataset
49
+ │ ├── Bhatla.pdf # Research paper
50
+ │ └── EBA_ECB 2024 Report on Payment Fraud.pdf
51
+ ├── src/
52
+ │ ├── api/ # API routes
53
+ │ │ └── routes.py
54
+ │ ├── config/ # Configuration
55
+ │ │ ├── __init__.py
56
+ │ │ └── config.py
57
+ │ ├── data/ # Data processing
58
+ │ │ └── processor.py
59
+ │ ├── llm/ # LLM integration
60
+ │ │ └── groq_client.py
61
+ │ ├── rag/ # RAG system
62
+ │ │ ├── document_loader.py
63
+ │ │ ├── vector_store.py
64
+ │ │ └── csv_document_generator.py # NEW: CSV insights
65
+ │ ├── schemas/ # Pydantic schemas
66
+ │ │ └── fraud.py
67
+ │ └── services/ # Business logic
68
+ │ └── fraud_analyzer.py
69
+ └── test/ # Test files
70
+ ├── example_usage.py
71
+ └── test_vector_store.py
72
+ ```
73
+
74
+ ## 🚀 Instalasi
75
+
76
+ ### 1. Clone & Setup Environment
77
+
78
+ ```bash
79
+ # Create virtual environment
80
+ python -m venv venv
81
+
82
+ # Activate
83
+ # Windows:
84
+ venv\Scripts\activate
85
+ # Linux/Mac:
86
+ source venv/bin/activate
87
+ ```
88
+
89
+ ### 2. Install Dependencies
90
+
91
+ ```bash
92
+ pip install -r requirements.txt
93
+ ```
94
+
95
+ ### 3. Setup Environment Variables
96
+
97
+ Buat file `.env` di root directory:
98
+
99
+ ```env
100
+ GROQ_API_KEY=your_groq_api_key_here
101
+ ```
102
+
103
+ ## 💻 Penggunaan
104
+
105
+ ### Gradio Web Interface (Recommended)
106
+
107
+ ```bash
108
+ python app.py
109
+ ```
110
+
111
+ Interface akan terbuka di:
112
+
113
+ - Local: `http://localhost:7860`
114
+ - Public: Shareable link (expires in 72 hours)
115
+
116
+ ### FastAPI Backend
117
+
118
+ ```bash
119
+ python main.py
120
+ ```
121
+
122
+ API akan berjalan di `http://localhost:8000`
123
+
124
+ **API Documentation:**
125
+
126
+ - Swagger UI: `http://localhost:8000/docs`
127
+ - ReDoc: `http://localhost:8000/redoc`
128
+
129
+ ### Docker (Recommended for Deployment)
130
+
131
+ Jika Anda memiliki Docker dan Docker Compose terinstal:
132
+
133
+ ```bash
134
+ # Build dan jalankan semua service (UI & API)
135
+ docker-compose up --build -d
136
+ ```
137
+
138
+ Service akan tersedia di:
139
+
140
+ - **Gradio UI**: `http://localhost:7860`
141
+ - **FastAPI Docs**: `http://localhost:8000/docs`
142
+
143
+ Untuk mematikan service:
144
+
145
+ ```bash
146
+ docker-compose down
147
+ ```
148
+
149
+ ## 📖 Contoh Penggunaan
150
+
151
+ ### Gradio Interface
152
+
153
+ 1. **Chat with Fraud Expert**
154
+
155
+ - Enable "Use RAG" untuk enhanced responses
156
+ - Tanya: "What are fraud patterns in grocery transactions?"
157
+ - Response akan include inline citations `[Source 1]`
158
+ 2. **Analyze Transaction**
159
+
160
+ - Input Transaction ID atau manual data
161
+ - Enable RAG untuk analysis dengan context
162
+ - Lihat detailed fraud analysis dengan sources
163
+ 3. **Dataset Summary**
164
+
165
+ - View transaction statistics
166
+ - See RAG knowledge base info (243 documents total)
167
+
168
+ ### API Endpoints
169
+
170
+ #### 1. Health Check
171
+
172
+ ```bash
173
+ curl http://localhost:8000/api/v1/health
174
+ ```
175
+
176
+ #### 2. Analyze Transaction (by ID)
177
+
178
+ ```bash
179
+ curl -X POST "http://localhost:8000/api/v1/analyze" \
180
+ -H "Content-Type: application/json" \
181
+ -d '{
182
+ "transaction_id": 0,
183
+ "use_rag": true
184
+ }'
185
+ ```
186
+
187
+ #### 3. Analyze Transaction (Manual Data)
188
+
189
+ ```bash
190
+ curl -X POST "http://localhost:8000/api/v1/analyze" \
191
+ -H "Content-Type: application/json" \
192
+ -d '{
193
+ "transaction_data": {
194
+ "merchant": "Amazon",
195
+ "category": "shopping_net",
196
+ "amt": 150.00,
197
+ "city": "Jakarta",
198
+ "state": "DKI"
199
+ },
200
+ "use_rag": true
201
+ }'
202
+ ```
203
+
204
+ #### 4. Get Dataset Summary
205
+
206
+ ```bash
207
+ curl http://localhost:8000/api/v1/summary
208
+ ```
209
+
210
+ #### 5. Batch Analysis
211
+
212
+ ```bash
213
+ curl -X POST "http://localhost:8000/api/v1/batch-analyze?transaction_ids=[0,1,2]&use_rag=true"
214
+ ```
215
+
216
+ ## 🏗️ Arsitektur
217
+
218
+ ### RAG System Flow
219
+
220
+ ```
221
+ User Query
222
+ ↓
223
+ Vector Store (Chroma)
224
+ ↓
225
+ Retrieve Top K Documents (PDF + CSV insights)
226
+ ↓
227
+ Format with Source Numbers [Source 1], [Source 2]
228
+ ↓
229
+ LLM (Groq) with Context
230
+ ↓
231
+ Response with Inline Citations
232
+ ↓
233
+ Source Reference List
234
+ ```
235
+
236
+ ### Komponen Utama
237
+
238
+ 1. **GroqClient** (`src/llm/groq_client.py`):
239
+
240
+ - Groq LLM integration via LangChain
241
+ - Model: `meta-llama/llama-4-maverick-17b-128e-instruct`
242
+ - Max tokens: 8192
243
+
244
+ - **ResponseQualityScorer** (`src/services/quality_scorer.py`):
245
+ - Automated evaluation of LLM responses
246
+ - Metrics: Relevance, Completeness, Citation Quality, Clarity
247
+
248
+ 2. **DocumentLoader** (`src/rag/document_loader.py`):
249
+
250
+ - Load PDF documents dengan PyPDFLoader
251
+ - Load CSV insights via CSVDocumentGenerator
252
+ - Text splitting dengan RecursiveCharacterTextSplitter
253
+ 3. **CSVDocumentGenerator** (`src/rag/csv_document_generator.py`):
254
+
255
+ - Extract fraud patterns by category
256
+ - Generate merchant risk profiles
257
+ - Create location-based insights
258
+ - Statistical summaries
259
+ 4. **VectorStore** (`src/rag/vector_store.py`):
260
+
261
+ - Chroma vector database
262
+ - HuggingFace embeddings (sentence-transformers/all-MiniLM-L6-v2)
263
+ - Similarity search untuk RAG
264
+ 5. **FraudAnalyzer** (`src/services/fraud_analyzer.py`):
265
+
266
+ - Main service untuk fraud analysis
267
+ - RAG chain dengan inline citation instructions
268
+ - Batch analysis support
269
+
270
+ ## ⚙️ Konfigurasi
271
+
272
+ File `src/config/config.py`:
273
+
274
+ ```python
275
+ # Groq API
276
+ max_tokens: int = 8192
277
+ groq_model: str = "meta-llama/llama-4-maverick-17b-128e-instruct"
278
+
279
+ # RAG
280
+ chunk_size: int = 1000
281
+ chunk_overlap: int = 200
282
+
283
+ # Data Paths
284
+ data_dir: Path = Path("data")
285
+ train_data_path: Path = data_dir / "fraudTrain.csv"
286
+ pdf_dir: Path = data_dir
287
+ ```
288
+
289
+ ## 🎨 UI Features
290
+
291
+ - **Modern Design**: Inter font, clean layout
292
+ - **Vertical Layout**: Analysis results appear below inputs
293
+ - **Response Quality Scoring**: Otomatis menilai kualitas jawaban (0-100)
294
+ - **Advanced Manual Analysis**: Optional fields collapsible section untuk high-precision simulation
295
+ - **Clean Terminal**: Warnings suppressed untuk better UX
296
+
297
+ ## 📊 Dataset
298
+
299
+ - **fraudTrain.csv**: 351 MB, 1.29M+ transactions
300
+ - **CSV Insights**: 1,050,000 rows di-load untuk RAG generation
301
+ - **Dataset Stats**: Menampilkan statistik dari full 1.29M rows
302
+
303
+ ## 🔍 RAG Knowledge Base
304
+
305
+ **Total: 243 documents**
306
+
307
+ - **PDF Documents**: 187 chunks
308
+
309
+ - Bhatla.pdf: 67 chunks
310
+ - EBA_ECB 2024 Report: 120 chunks
311
+ - **CSV Insights**: 51 documents
312
+
313
+ - Fraud Pattern Analysis: 14
314
+ - Merchant Profiles: 20
315
+ - Location Insights: 15
316
+ - Statistical Summaries: 2
317
+
318
+ ## 🧪 Testing
319
+
320
+ ```bash
321
+ # Run example usage
322
+ python test/example_usage.py
323
+
324
+ # Run vector store test
325
+ python test/test_vector_store.py
326
+ ```
327
+
328
+ ## 📝 Development
329
+
330
+ ### Code Style
331
+
332
+ - PEP 8 compliant
333
+ - Type hints untuk semua functions
334
+ - Google-style docstrings
335
+ - Modular architecture
336
+
337
+ ### Best Practices
338
+
339
+ - Clean code dengan separation of concerns
340
+ - No unused functions (cleaned up)
341
+ - Proper error handling
342
+ - Comprehensive logging
343
+
344
+ ## 🚨 Catatan Penting & Troubleshooting
345
+
346
+ 1. **API Key**: Pastikan `GROQ_API_KEY` sudah benar di file `.env`.
347
+ 2. **Ukuran Dataset**: Dataset asli sangat besar (1.29M+ rows). Sistem menggunakan sampling 1M+ rows untuk insight RAG agar performa tetap terjaga.
348
+ 3. **Dependency Conflict**: Jika menginstal manual dan terjadi konflik versi `huggingface-hub`, gunakan versi `>=0.27.0` untuk kompatibilitas dengan Gradio 6.
349
+ 4. **Volume Mounting**: Saat menggunakan Docker, folder `data/` dan `chroma_db/` akan di-mount ke container secara otomatis.
350
+ 5. **ChromaDB**: Error telemetry ChromaDB dapat diabaikan, fitur pencarian tetap berfungsi normal.
351
+
352
+ ## 📄 License
353
+
354
+ Apache License 2.0 (lihat file `LICENSE`)
TECHNICAL_ASSESSMENT.md ADDED
@@ -0,0 +1,645 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Technical Requirements Assessment
2
+
3
+ This document maps each technical requirement to its concrete implementation in the codebase, with file and line references as evidence.
4
+
5
+ ---
6
+
7
+ ## Requirements Checklist
8
+
9
+ ### ✅ 1. Accuracy: Akurasi dan Relevansi Response
10
+
11
+ #### Implementation Details:
12
+
13
+ **A. RAG System dengan Dual Data Sources**
14
+
15
+ - **Location:** `src/rag/vector_store.py`, `src/rag/document_loader.py`
16
+ - **Implementation:**
17
+ ```python
18
+ # Vector Store dengan Chroma DB
19
+ # File: src/rag/vector_store.py (line 34-37)
20
+ self.embeddings = HuggingFaceEmbeddings(
21
+ model_name="sentence-transformers/all-MiniLM-L6-v2",
22
+ model_kwargs={"device": "cpu"},
23
+ )
24
+ ```
25
+
26
+ **B. Data Sources (243 Documents Total)**
27
+
28
+ 1. **PDF Documents (187 chunks)**
29
+
30
+ - Bhatla.pdf: 67 chunks
31
+ - EBA_ECB 2024 Report: 120 chunks
32
+ 2. **CSV Insights (51 documents)**
33
+
34
+ - Fraud Pattern Analysis: 14 documents
35
+ - Merchant Profiles: 20 documents
36
+ - Location Insights: 15 documents
37
+ - Statistical Summaries: 2 documents
38
+
39
+ **C. Inline Source Citations**
40
+
41
+ - **Location:** `app.py` (line 328-337)
42
+ - **Format:** `[Source X]` inline dalam response
43
+ - **Verification:** Source reference list di akhir response
44
+
45
+ **D. Transaction Query Detection**
46
+
47
+ - **Location:** `app.py` (line 284-307)
48
+ - **Implementation:**
49
+ ```python
50
+ # Auto-detect transaction ID dalam query
51
+ transaction_query = re.search(r'transaction\s+(?:id\s+)?(\d+)', message.lower())
52
+ # Fetch actual transaction data
53
+ transaction = data_processor.get_transaction_summary(transaction_id)
54
+ ```
55
+
56
+ **E. Merchant Name Cleaning (Artifact Removal)**
57
+
58
+ - **Location:** `src/data/processor.py` (line 39-42), `src/rag/csv_document_generator.py` (line 35-38)
59
+ - **Problem:** All merchants in the synthetic dataset have a "fraud_" prefix, leading to false positive analysis by the LLM.
60
+ - **Fix:** Automated removal of the "fraud_" prefix during data ingestion and LLM prompting instructions to ignore the artifact.
61
+
62
+ **F. Deterministic Responses**
63
+
64
+ - **Location:** `src/llm/groq_client.py` (line 23)
65
+ - **Setting:** `temperature: float = 0`
66
+
67
+ **Evidence:**
68
+
69
+ - ✅ RAG retrieves top-k relevant documents
70
+ - ✅ Inline citations untuk transparency
71
+ - ✅ Actual transaction data untuk specific queries
72
+ - ✅ Merchant name cleaning untuk menghilangkan "false positive" indikator
73
+ - ✅ Temperature 0 untuk consistent responses
74
+
75
+ ---
76
+
77
+ ### ✅ 2. Coverage: Adaptabilitas untuk Berbagai Pertanyaan
78
+
79
+ #### Implementation Details:
80
+
81
+ **A. Multiple Interfaces**
82
+
83
+ - **Location:** `app.py`
84
+ - **Interfaces:**
85
+ 1. Chat with Fraud Expert (line 277-403)
86
+ 2. Analyze by Transaction ID (line 106-138)
87
+ 3. Analyze Manual Transaction (line 141-178)
88
+ 4. Dataset Summary (line 182-274)
89
+
90
+ **B. Flexible Query Handling**
91
+
92
+ - **Natural Language Transaction Queries:**
93
+ ```python
94
+ # Supports queries like:
95
+ # - "is transaction id 996746 fraud?"
96
+ # - "analyze transaction 12345"
97
+ # - "what about transaction id 999?"
98
+ ```
99
+
100
+ **C. RAG Coverage Across Domains**
101
+
102
+ - Fraud patterns by category (14 categories)
103
+ - Merchant risk profiles (20 merchants)
104
+ - Geographic insights (15 states)
105
+ - Statistical patterns (overall + by amount range)
106
+
107
+ **D. API Endpoints**
108
+
109
+ - **Location:** `src/api/routes.py`
110
+ - **Endpoints:**
111
+ - `POST /api/v1/analyze` - Single transaction
112
+ - `POST /api/v1/batch-analyze` - Multiple transactions
113
+ - `GET /api/v1/summary` - Dataset overview
114
+ - `GET /api/v1/health` - Health check
115
+
116
+ **Evidence:**
117
+
118
+ - ✅ 4 different interaction modes
119
+ - ✅ Handles general + specific queries
120
+ - ✅ Supports 1.2M+ transactions
121
+ - ✅ REST API untuk programmatic access
122
+
123
+ ---
124
+
125
+ ### ✅ 3. Readability: Struktur Kode dan Naming
126
+
127
+ #### Implementation Details:
128
+
129
+ **A. Modular Architecture**
130
+
131
+ ```
132
+ src/
133
+ ├── api/ # REST API layer
134
+ │ └── routes.py
135
+ ├── config/ # Configuration management
136
+ │ ├── __init__.py
137
+ │ └── config.py
138
+ ├── data/ # Data processing
139
+ │ └── processor.py
140
+ ├── llm/ # LLM integration
141
+ │ └── groq_client.py
142
+ ├── rag/ # RAG system
143
+ │ ├── document_loader.py
144
+ │ ├── vector_store.py
145
+ │ └── csv_document_generator.py
146
+ ├── schemas/ # Pydantic models
147
+ │ └── fraud.py
148
+ └── services/ # Business logic
149
+ ├── fraud_analyzer.py
150
+ └── quality_scorer.py
151
+ ```
152
+
153
+ **B. Naming Conventions**
154
+
155
+ - **Classes:** `PascalCase`
156
+ - `FraudAnalyzer`, `VectorStore`, `ResponseQualityScorer`
157
+ - **Functions:** `snake_case`
158
+ - `analyze_transaction()`, `load_csv_insights()`, `score_response()`
159
+ - **Constants:** `UPPER_CASE` in config
160
+ - `GROQ_API_KEY`, `MAX_TOKENS`
161
+
162
+ **C. Type Hints (100% Coverage)**
163
+
164
+ ```python
165
+ # Example: src/services/fraud_analyzer.py
166
+ def analyze_transaction(
167
+ self,
168
+ transaction_id: Optional[int] = None,
169
+ transaction_data: Optional[Dict] = None,
170
+ use_rag: bool = True,
171
+ ) -> Dict:
172
+ ```
173
+
174
+ **D. Documentation**
175
+
176
+ - **Docstrings:** Google-style untuk semua functions
177
+ - **Comments:** Inline comments untuk complex logic
178
+ - **README.md:** Comprehensive project documentation
179
+
180
+ **Evidence:**
181
+
182
+ - ✅ Clear separation of concerns
183
+ - ✅ Consistent naming across codebase
184
+ - ✅ Type hints untuk IDE support
185
+ - ✅ Well-documented code
186
+
187
+ ---
188
+
189
+ ### ✅ 4. Exception Handling: Error Handling & Edge Cases
190
+
191
+ #### Implementation Details:
192
+
193
+ **A. Transaction Not Found**
194
+
195
+ - **Location:** `src/data/processor.py` (line 60-62)
196
+
197
+ ```python
198
+ if transaction.empty:
199
+ raise ValueError(f"Transaction {transaction_id} not found")
200
+ ```
201
+
202
+ **B. File Not Found**
203
+
204
+ - **Location:** `src/data/processor.py` (line 32-33)
205
+
206
+ ```python
207
+ if not data_path.exists():
208
+ raise FileNotFoundError(f"Training data not found: {data_path}")
209
+ ```
210
+
211
+ **C. RAG Fallback Mechanism**
212
+
213
+ - **Location:** `src/services/fraud_analyzer.py` (line 151-154)
214
+
215
+ ```python
216
+ except Exception as e:
217
+ logger.warning(f"RAG chain failed, falling back to direct LLM: {str(e)}")
218
+ analysis_text = self._direct_analysis(formatted_transaction)
219
+ sources = []
220
+ ```
221
+
222
+ **D. Chat Error Handling**
223
+
224
+ - **Location:** `app.py` (line 395-398)
225
+
226
+ ```python
227
+ except Exception as e:
228
+ logger.error(f"Chat failed: {e}")
229
+ history.append([message, f"❌ Error: {str(e)}"])
230
+ return history
231
+ ```
232
+
233
+ **E. Graceful Degradation**
234
+
235
+ - **Location:** `app.py` (line 74-82)
236
+
237
+ ```python
238
+ # CSV loading dengan try-except
239
+ try:
240
+ csv_documents = document_loader.load_csv_insights(csv_path, sample_size=1050000)
241
+ all_documents.extend(csv_documents)
242
+ except Exception as e:
243
+ logger.warning(f"⚠ Failed to load CSV insights: {e}")
244
+ # System continues without CSV insights
245
+ ```
246
+
247
+ **F. API Validation**
248
+
249
+ - **Location:** `src/schemas/fraud.py`
250
+ - **Pydantic models** untuk request validation
251
+
252
+ **Evidence:**
253
+
254
+ - ✅ Comprehensive error handling
255
+ - ✅ Graceful degradation
256
+ - ✅ Logging untuk debugging
257
+
258
+ ---
259
+
260
+ ### ✅ 5. Performance: Optimasi Sistem
261
+
262
+ #### Implementation Details:
263
+
264
+ **A. Efficient Embeddings**
265
+
266
+ - **Location:** `src/rag/vector_store.py` (line 34-37)
267
+ - **Model:** `sentence-transformers/all-MiniLM-L6-v2`
268
+ - Lightweight (80MB)
269
+ - Fast inference
270
+ - Good accuracy/speed tradeoff
271
+
272
+ **B. Sampling Strategy**
273
+
274
+ - **Location:** `src/rag/csv_document_generator.py` (line 15)
275
+
276
+ ```python
277
+ sample_size: int = 1050000 # ~81% of full dataset
278
+ # Balance between coverage and performance
279
+ ```
280
+
281
+ **C. Chunking Optimization**
282
+
283
+ - **Location:** `src/config/config.py` (line 29-30)
284
+
285
+ ```python
286
+ chunk_size: int = 1000 # Optimal for context
287
+ chunk_overlap: int = 200 # Preserve context continuity
288
+ ```
289
+
290
+ **D. In-Memory Vector Store**
291
+
292
+ - **Location:** `src/config/config.py` (line 31)
293
+
294
+ ```python
295
+ vector_store_path: Optional[str] = None # Fast in-memory storage
296
+ ```
297
+
298
+ - **Trade-off:** Speed vs persistence
299
+ - **Benefit:** No disk I/O latency
300
+
301
+ **E. Lazy Loading**
302
+
303
+ - **Location:** `src/data/processor.py` (line 54-55)
304
+
305
+ ```python
306
+ if self.train_df is None:
307
+ self.load_train_data() # Load only when needed
308
+ ```
309
+
310
+ **F. Batch Processing**
311
+
312
+ - **Location:** `src/services/fraud_analyzer.py` (line 218-245)
313
+
314
+ ```python
315
+ def batch_analyze(
316
+ self,
317
+ transaction_ids: List[int],
318
+ use_rag: bool = True,
319
+ ) -> List[Dict]:
320
+ # Process multiple transactions efficiently
321
+ ```
322
+
323
+ **G. Max Tokens Optimization**
324
+
325
+ - **Location:** `src/config/config.py` (line 14)
326
+
327
+ ```python
328
+ max_tokens: int = 8192 # Model maximum
329
+ ```
330
+
331
+ **Performance Metrics:**
332
+
333
+ - Document loading: ~5-10 seconds
334
+ - Vector store creation: ~3-5 seconds
335
+ - Query response: ~1-3 seconds
336
+ - Full dataset load: ~15-20 seconds
337
+
338
+ **Evidence:**
339
+
340
+ - ✅ Lightweight embeddings
341
+ - ✅ Strategic sampling
342
+ - ✅ Optimized chunking
343
+ - ✅ Fast in-memory storage
344
+
345
+ ---
346
+
347
+ ### ✅ 6. Data Processing: Embeddings, RAG, Pre/Post Processing
348
+
349
+ #### Implementation Details:
350
+
351
+ **A. Embeddings**
352
+
353
+ - **Location:** `src/rag/vector_store.py` (line 34-37)
354
+ - **Model:** sentence-transformers/all-MiniLM-L6-v2
355
+ - **Dimension:** 384
356
+ - **Normalization:** L2 normalized
357
+
358
+ **B. RAG Pipeline**
359
+
360
+ **1. Document Loading**
361
+
362
+ - **PDF Processing** (`src/rag/document_loader.py` line 53-75)
363
+
364
+ ```python
365
+ # PyPDFLoader → RecursiveCharacterTextSplitter
366
+ loader = PyPDFLoader(str(pdf_path))
367
+ documents = loader.load()
368
+ chunks = self.text_splitter.split_documents(documents)
369
+ ```
370
+ - **CSV Processing** (`src/rag/csv_document_generator.py`)
371
+
372
+ ```python
373
+ # Extract structured insights
374
+ - generate_fraud_pattern_documents()
375
+ - generate_statistical_summaries()
376
+ - generate_merchant_profiles()
377
+ - generate_location_insights()
378
+ ```
379
+
380
+ **2. Vector Store Creation**
381
+
382
+ - **Location:** `src/rag/vector_store.py` (line 52-65)
383
+ ```python
384
+ # Chroma DB with HuggingFace embeddings
385
+ self.vector_store = Chroma.from_documents(
386
+ documents=documents,
387
+ embedding=self.embeddings,
388
+ persist_directory=self.persist_directory,
389
+ )
390
+ ```
391
+
392
+ **3. Retrieval**
393
+
394
+ - **Similarity Search** (line 82-96)
395
+ ```python
396
+ # Top-k retrieval dengan metadata
397
+ results = self.vector_store.similarity_search(
398
+ query=query,
399
+ k=k,
400
+ )
401
+ ```
402
+
403
+ **C. Preprocessing**
404
+
405
+ **1. PDF Text Splitting**
406
+
407
+ ```python
408
+ # Recursive character splitting
409
+ chunk_size=1000
410
+ chunk_overlap=200
411
+ # Preserves context across chunks
412
+ ```
413
+
414
+ **2. CSV Data Extraction**
415
+
416
+ ```python
417
+ # Structured insight generation
418
+ - Fraud patterns by category
419
+ - Statistical aggregations
420
+ - Merchant risk profiles
421
+ - Geographic analysis
422
+ ```
423
+
424
+ **3. Transaction Formatting**
425
+
426
+ - **Location:** `src/data/processor.py` (line 78-104)
427
+
428
+ ```python
429
+ def format_transaction_for_llm(self, transaction: Dict) -> str:
430
+ # Format dengan clear labels
431
+ # Include all relevant fields
432
+ # Human-readable format
433
+ ```
434
+
435
+ **D. Postprocessing**
436
+
437
+ **1. Source Reference Collection**
438
+
439
+ - **Location:** `app.py` (line 295-318)
440
+
441
+ ```python
442
+ # Extract metadata dari retrieved docs
443
+ # Format source references
444
+ # Include file names, page numbers, data types
445
+ ```
446
+
447
+ **2. Response Formatting**
448
+
449
+ ```python
450
+ # Structured sections:
451
+ # - Transaction Details
452
+ # - Fraud Analysis
453
+ # - Quality Score
454
+ # - Source References
455
+ ```
456
+
457
+ **3. Quality Scoring**
458
+
459
+ - **Location:** `src/services/quality_scorer.py`
460
+
461
+ ```python
462
+ # Automated quality assessment
463
+ # 4 metrics: relevance, completeness, citations, clarity
464
+ # Grade: A-F
465
+ ```
466
+
467
+ **Evidence:**
468
+
469
+ - ✅ Comprehensive embedding strategy
470
+ - ✅ Dual-source RAG (PDF + CSV)
471
+ - ✅ Structured preprocessing
472
+ - ✅ Rich postprocessing dengan quality scoring
473
+
474
+ ---
475
+
476
+ ### ✅ 7. Prompt Design: Multiple Layers
477
+
478
+ #### Implementation Details:
479
+
480
+ **Layer 1: System Role Definition**
481
+
482
+ - **Location:** `app.py` (line 356-365)
483
+
484
+ ```python
485
+ system_message = """You are an expert fraud detection analyst.
486
+ Help users understand fraud patterns, detection methods, and transaction analysis."""
487
+ ```
488
+
489
+ **Layer 2: Citation Instructions**
490
+
491
+ - **Location:** `app.py` (line 358-363)
492
+
493
+ ```python
494
+ IMPORTANT CITATION RULES:
495
+ - When using information from the provided context sources, you MUST add an inline citation
496
+ - Format citations as: [Source X]
497
+ - Place citations at the end of sentences
498
+ ```
499
+
500
+ **Layer 3: Transaction Analysis Guidelines**
501
+
502
+ - **Location:** `app.py` (line 365-369)
503
+
504
+ ```python
505
+ TRANSACTION ANALYSIS:
506
+ - If transaction details are provided, analyze them thoroughly
507
+ - Compare transaction characteristics against known fraud patterns
508
+ - Provide a clear fraud risk assessment (Low/Medium/High)
509
+ ```
510
+
511
+ **Layer 4: RAG Context**
512
+
513
+ - **Location:** `app.py` (line 320-348)
514
+
515
+ ```python
516
+ # Retrieved documents dengan source numbers
517
+ context = "\n\nRelevant context from fraud detection documents:\n"
518
+ for i, doc in enumerate(docs, 1):
519
+ context += f"\n[Source {i}] {doc.page_content[:500]}...\n"
520
+ ```
521
+
522
+ **Layer 5: Transaction Data**
523
+
524
+ - **Location:** `app.py` (line 293-306)
525
+
526
+ ```python
527
+ # Auto-fetched transaction details
528
+ transaction_context = f"\n\n**Transaction ID {transaction_id} Details:**\n"
529
+ transaction_context += f"- Merchant: {transaction.get('merchant', 'N/A')}\n"
530
+ transaction_context += f"- Actual Fraud Status: {'FRAUD' if ... else 'LEGITIMATE'}\n"
531
+ ```
532
+
533
+ **Layer 6: RAG Chain Template**
534
+
535
+ - **Location:** `src/services/fraud_analyzer.py` (line 46-66)
536
+
537
+ ```python
538
+ template = """You are an expert fraud detection analyst.
539
+ Use the following context from fraud detection research papers...
540
+
541
+ Context:
542
+ {context}
543
+
544
+ Question: {question}
545
+
546
+ IMPORTANT CITATION RULES:
547
+ ...
548
+ """
549
+ ```
550
+
551
+ **Evidence:**
552
+
553
+ - ✅ 6-layer prompt architecture
554
+ - ✅ Clear role definition
555
+ - ✅ Explicit instructions
556
+ - ✅ Dynamic context injection
557
+
558
+ ---
559
+
560
+ ### ✅ 8. Quality Scoring: Response Assessment
561
+
562
+ #### Implementation Details:
563
+
564
+ **A. Quality Scorer Module**
565
+
566
+ - **Location:** `src/services/quality_scorer.py`
567
+ - **Class:** `ResponseQualityScorer`
568
+
569
+ **B. Scoring Metrics (4 Dimensions)**
570
+
571
+ - **Relevance (35%):** Analyzes query term matching and contextual alignment.
572
+ - **Completeness (25%):** Evaluates depth of information and structural integrity.
573
+ - **Citation Quality (25%):** Validates presence and distribution of inline citations.
574
+ - **Clarity (15%):** Assesses sentence structure and formatting.
575
+
576
+ **C. Integration:** Automatically triggered for every chatbot response, providing a detailed breakdown and an overall grade (A-F).
577
+
578
+ ---
579
+
580
+ ### ✅ 9. Advanced Manual Analysis
581
+
582
+ #### Implementation Details:
583
+
584
+ - **Location:** `app.py`
585
+ - **Feature:** Collapsible "Advanced Fields" section in the Manual Transaction Analysis tab.
586
+ - **Inputs:** Gender, Age, Job, ZIP Code, City Population, and Merchant Coordinates.
587
+ - **Improved Accuracy:** Provides the LLM with significantly more context, matching the granularity of the actual dataset for more realistic simulations.
588
+
589
+ ---
590
+
591
+ ## Summary Matrix
592
+
593
+ | # | Requirement | Status | Evidence |
594
+ | - | ------------------ | ------ | ------------------------------------- |
595
+ | 1 | Accuracy | ✅ | RAG, Citations, Transaction Detection |
596
+ | 2 | Coverage | ✅ | 4 Interfaces, Flexible Queries, API |
597
+ | 3 | Readability | ✅ | Modular, Type Hints, Docstrings |
598
+ | 4 | Exception Handling | ✅ | Comprehensive Error Handling |
599
+ | 5 | Performance | ✅ | Optimized Embeddings, Sampling |
600
+ | 6 | Data Processing | ✅ | RAG Pipeline, Pre/Post Processing |
601
+ | 7 | Prompt Design | ✅ | 6-Layer Architecture |
602
+ | 8 | Quality Scoring | ✅ | 4-Metric Automated Scoring |
603
+ | 9 | Advanced Manual | ✅ | Modular UI with 7 optional fields |
604
+
605
+ **Overall Assessment:** ✅ ALL REQUIREMENTS MET
606
+
607
+ ---
608
+
609
+ ## Key Achievements
610
+
611
+ 1. ✅ **Dual-Source RAG** - PDF research papers + CSV fraud patterns
612
+ 2. ✅ **Inline Citations** - Transparent source referencing
613
+ 3. ✅ **Transaction Query Detection** - Natural language transaction analysis
614
+ 4. ✅ **Multi-Layer Prompting** - 6-layer prompt architecture
615
+ 5. ✅ **Quality Scoring** - Automated 4-metric response assessment
616
+ 6. ✅ **Comprehensive Error Handling** - Graceful degradation
617
+ 7. ✅ **Performance Optimization** - Strategic sampling, efficient embeddings
618
+ 8. ✅ **Clean Architecture** - Modular, well-documented codebase
619
+
620
+ ---
621
+
622
+ ## Files Reference
623
+
624
+ ### Core Implementation
625
+
626
+ - `app.py` - Gradio interface dengan quality scoring
627
+ - `main.py` - FastAPI application
628
+ - `src/services/fraud_analyzer.py` - Main analysis service
629
+ - `src/services/quality_scorer.py` - Quality assessment
630
+ - `src/rag/vector_store.py` - Vector store management
631
+ - `src/rag/document_loader.py` - Document loading
632
+ - `src/rag/csv_document_generator.py` - CSV insights extraction
633
+ - `src/data/processor.py` - Data processing
634
+ - `src/llm/groq_client.py` - LLM integration
635
+ - `src/config/config.py` - Configuration
636
+
637
+ ### Documentation
638
+
639
+ - `README.md` - Project documentation
640
+ - `QUICKSTART.md` - Quick start guide
641
+ - `requirements.txt` - Dependencies
642
+
643
+ ---
644
+
645
+ **Conclusion:** Project successfully implements all required features with high quality standards and additional bonus features (multi-layer prompting, quality scoring).
app.py ADDED
@@ -0,0 +1,757 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Gradio interface for Fraud Detection Chatbot."""

import logging
import warnings
import os

# Suppress noisy framework warnings for cleaner terminal output.
# NOTE: these filters must run before the heavy imports below so that
# import-time warnings (e.g. from LangChain) are also silenced.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', message='.*LangChain.*')

# Disable ChromaDB telemetry to avoid errors
os.environ['ANONYMIZED_TELEMETRY'] = 'False'

import gradio as gr
from pathlib import Path
import pandas as pd

from src.data.processor import FraudDataProcessor
from src.llm.groq_client import GroqClient
from src.rag.document_loader import DocumentLoader
from src.rag.vector_store import VectorStore
from src.services.fraud_analyzer import FraudAnalyzer
from src.services.quality_scorer import ResponseQualityScorer
from src.config.config import settings

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Suppress chromadb logging (its telemetry errors are harmless but loud)
logging.getLogger('chromadb').setLevel(logging.ERROR)
logging.getLogger('chromadb.telemetry').setLevel(logging.CRITICAL)

# Shared components, populated by initialize_system() and read by the
# Gradio callbacks below. None means "not initialized yet".
groq_client = None
vector_store = None
fraud_analyzer = None
data_processor = None
quality_scorer = ResponseQualityScorer()
41
+
42
+
43
def initialize_system():
    """Initialize the fraud detection system.

    Creates the Groq LLM client, the data processor, and (best-effort) the
    RAG vector store from PDF documents plus CSV-derived insight documents,
    then wires everything into a ``FraudAnalyzer``. All components are
    stored in module globals so the Gradio callbacks can reach them.

    Returns:
        str: Status message displayed in the UI once initialization completes.
    """
    global groq_client, vector_store, fraud_analyzer, data_processor

    logger.info("Initializing Fraud Detection System...")

    # Initialize Groq client (reads GROQ_API_KEY from the environment/.env)
    groq_client = GroqClient()
    logger.info("✓ Groq client initialized")

    # Initialize data processor
    data_processor = FraudDataProcessor()
    logger.info("✓ Data processor initialized")

    # Setup RAG system — best-effort: any failure is logged and the app
    # continues with vector_store left as None.
    try:
        document_loader = DocumentLoader(
            chunk_size=settings.chunk_size,
            chunk_overlap=settings.chunk_overlap,
        )

        all_documents = []

        # Load PDF documents
        pdf_documents = document_loader.load_pdfs_from_directory(settings.pdf_dir)
        if pdf_documents:
            all_documents.extend(pdf_documents)
            logger.info(f"✓ Loaded {len(pdf_documents)} PDF documents")
        else:
            logger.warning("⚠ No PDF documents found")

        # Load CSV insights; sample_size caps how many rows are scanned
        # when generating insight documents from the large training CSV.
        csv_path = settings.data_dir / "fraudTrain.csv"
        if csv_path.exists():
            try:
                csv_documents = document_loader.load_csv_insights(csv_path, sample_size=1050000)
                all_documents.extend(csv_documents)
                logger.info(f"✓ Loaded {len(csv_documents)} CSV insight documents")
            except Exception as e:
                # Degrade gracefully: RAG still works with PDF chunks only.
                logger.warning(f"⚠ Failed to load CSV insights: {e}")
        else:
            logger.warning(f"⚠ CSV file not found: {csv_path}")

        # Add all documents to vector store
        if all_documents:
            vector_store = VectorStore()
            vector_store.add_documents(all_documents)
            logger.info(f"✓ RAG system initialized with {len(all_documents)} total documents")
        else:
            logger.warning("⚠ No documents loaded for RAG system")

    except Exception as e:
        logger.warning(f"⚠ RAG setup failed: {e}")

    # Create fraud analyzer — vector_store may be None here, in which case
    # the analyzer is expected to fall back to direct LLM analysis.
    fraud_analyzer = FraudAnalyzer(
        groq_client=groq_client,
        vector_store=vector_store,
    )
    logger.info("✓ Fraud analyzer initialized")

    return "✅ System initialized successfully!"
106
+
107
+
108
def analyze_by_transaction_id(transaction_id: int, use_rag: bool):
    """Look up a dataset transaction by ID and return its fraud analysis.

    Args:
        transaction_id: Row identifier of the transaction in the training set.
        use_rag: Whether to augment the analysis with retrieved context.

    Returns:
        str: Markdown with transaction details and the analysis, or an
        error message if the system is uninitialized or the lookup fails.
    """
    if fraud_analyzer is None:
        return "❌ System not initialized. Please wait for initialization to complete."

    try:
        outcome = fraud_analyzer.analyze_transaction(
            transaction_id=int(transaction_id),
            use_rag=use_rag,
        )

        # Build the markdown response from the analyzer's result dict.
        txn = outcome['transaction']
        verdict = outcome['analysis']

        return f"""### 📊 Transaction Details
**Merchant:** {txn.get('merchant', 'N/A')}
**Category:** {txn.get('category', 'N/A')}
**Amount:** ${txn.get('amt', 0):.2f}
**City:** {txn.get('city', 'N/A')}
**State:** {txn.get('state', 'N/A')}

---

### 🔍 Fraud Analysis
{verdict}
"""

    except Exception as exc:
        logger.error(f"Analysis failed: {exc}")
        return f"❌ Error: {str(exc)}"
141
+
142
+
143
def analyze_by_manual_data(
    merchant: str, category: str, amount: float, city: str, state: str, use_rag: bool,
    gender: str = None, age: int = None, job: str = None, zip_code: str = None,
    city_pop: int = None, merch_lat: float = None, merch_long: float = None
):
    """Analyze fraud for a manually entered (simulated) transaction.

    Required fields mirror the core dataset columns; the optional keyword
    fields come from the collapsible "Advanced Fields" UI section and, when
    provided, give the LLM more context for the risk assessment.

    Args:
        merchant: Merchant name (a leading ``fraud_`` dataset artifact is stripped).
        category: Transaction category.
        amount: Transaction amount in dollars.
        city: Cardholder city.
        state: Cardholder state.
        use_rag: Whether to augment the analysis with retrieved context.
        gender, age, job, zip_code, city_pop, merch_lat, merch_long:
            Optional cardholder/location details; omitted when not supplied.

    Returns:
        str: Markdown with the echoed transaction details and the analysis,
        or an error message on failure.
    """
    if fraud_analyzer is None:
        return "❌ System not initialized. Please wait for initialization to complete."

    try:
        # Clean merchant name: the synthetic dataset prefixes every merchant
        # with "fraud_", which would bias the LLM toward false positives.
        clean_merchant = merchant.replace('fraud_', '') if merchant else merchant

        transaction_data = {
            "merchant": clean_merchant,
            "category": category,
            "amt": float(amount),
            "city": city,
            "state": state,
        }

        # Optional fields: empty strings are skipped, but numeric values are
        # checked against None so a legitimate 0 is not silently dropped
        # (the original truthiness checks lost age=0 / city_pop=0).
        if gender:
            transaction_data["gender"] = gender
        if age is not None:
            transaction_data["age"] = age
        if job:
            transaction_data["job"] = job
        if zip_code:
            transaction_data["zip"] = zip_code
        if city_pop is not None:
            transaction_data["city_pop"] = city_pop
        if merch_lat is not None:
            transaction_data["merch_lat"] = merch_lat
        if merch_long is not None:
            transaction_data["merch_long"] = merch_long

        result = fraud_analyzer.analyze_transaction(
            transaction_data=transaction_data,
            use_rag=use_rag,
        )

        analysis = result['analysis']

        response = f"""### 📊 Transaction Details
**Merchant:** {merchant}
**Category:** {category}
**Amount:** ${amount:.2f}
**City:** {city}
**State:** {state}
"""

        # Echo the optional fields back so the user can verify what was used.
        if gender or age is not None or job:
            response += "\n**Cardholder Info:**\n"
            if gender:
                response += f"- Gender: {gender}\n"
            if age is not None:
                response += f"- Age: {age}\n"
            if job:
                response += f"- Job: {job}\n"

        if zip_code or city_pop is not None:
            response += "\n**Location Details:**\n"
            if zip_code:
                response += f"- ZIP: {zip_code}\n"
            if city_pop is not None:
                response += f"- City Population: {city_pop:,}\n"

        # Only show coordinates when both are present; previously a single
        # provided coordinate rendered as "(12.3, None)".
        if merch_lat is not None and merch_long is not None:
            response += "\n**Merchant Location:**\n"
            response += f"- Coordinates: ({merch_lat}, {merch_long})\n"

        response += f"""
---

### 🔍 Fraud Analysis
{analysis}
"""
        return response

    except Exception as e:
        logger.error(f"Analysis failed: {e}")
        return f"❌ Error: {str(e)}"
227
+
228
+
229
+
230
def get_dataset_summary():
    """Get dataset summary statistics including RAG documents.

    Builds a markdown report with overall transaction/fraud statistics, the
    top transaction categories, and — when the vector store is available —
    a breakdown of the RAG knowledge base by document type.

    Returns:
        str: Markdown summary, or an error message on failure.
    """
    if data_processor is None:
        return "❌ System not initialized."

    try:
        # Get transaction data summary.
        # NOTE(review): called without a transaction_id — presumably this
        # returns dataset-wide aggregates in that case; confirm against
        # FraudDataProcessor.get_transaction_summary.
        summary = data_processor.get_transaction_summary()

        response = f"""### 📊 Transaction Dataset Summary

**Total Transactions:** {summary['total_transactions']:,}
**Fraud Cases:** {summary['fraud_count']:,}
**Fraud Rate:** {summary['fraud_percentage']:.2f}%
**Average Amount:** ${summary['average_amount']:.2f}

---

**Top Transaction Categories:**
"""
        # Show at most the first 10 categories from the summary mapping.
        for category, count in list(summary['categories'].items())[:10]:
            response += f"\n- {category}: {count:,}"

        # Add RAG document summary if available
        if vector_store is not None:
            response += "\n\n---\n\n### 📚 RAG Knowledge Base\n\n"

            # Count documents by type; this reads ChromaDB's private
            # collection handle, so failures here are non-fatal.
            try:
                # Get all documents (ids/metadatas) from the vector store
                all_docs = vector_store.vector_store._collection.get()

                if all_docs and 'metadatas' in all_docs:
                    metadatas = all_docs['metadatas']

                    # Tallies per document type (set by the CSV generator);
                    # anything without a known type is treated as a PDF chunk.
                    pdf_count = 0
                    csv_pattern_count = 0
                    csv_merchant_count = 0
                    csv_location_count = 0
                    csv_stats_count = 0

                    pdf_sources = set()

                    for meta in metadatas:
                        doc_type = meta.get('type', 'document')
                        source = meta.get('source', '')

                        if doc_type == 'fraud_pattern':
                            csv_pattern_count += 1
                        elif doc_type == 'merchant_profile':
                            csv_merchant_count += 1
                        elif doc_type == 'location_insight':
                            csv_location_count += 1
                        elif doc_type == 'statistical_summary':
                            csv_stats_count += 1
                        else:
                            # PDF document chunk
                            pdf_count += 1
                            if source.endswith('.pdf'):
                                pdf_sources.add(source)

                    response += f"**Total Documents in RAG:** {len(metadatas):,}\n\n"

                    if pdf_count > 0:
                        response += f"**📄 PDF Research Documents:** {pdf_count:,}\n"
                        for pdf in sorted(pdf_sources):
                            response += f"  - {pdf}\n"
                        response += "\n"

                    csv_total = csv_pattern_count + csv_merchant_count + csv_location_count + csv_stats_count
                    if csv_total > 0:
                        response += f"**📊 CSV-Derived Insights:** {csv_total:,}\n"
                        if csv_pattern_count > 0:
                            response += f"  - Fraud Pattern Analysis: {csv_pattern_count}\n"
                        if csv_merchant_count > 0:
                            response += f"  - Merchant Profiles: {csv_merchant_count}\n"
                        if csv_location_count > 0:
                            response += f"  - Location Insights: {csv_location_count}\n"
                        if csv_stats_count > 0:
                            response += f"  - Statistical Summaries: {csv_stats_count}\n"
                else:
                    response += "**Status:** RAG system initialized but no document metadata available."

            except Exception as e:
                # Keep the transaction summary even if RAG stats are unavailable.
                logger.warning(f"Could not retrieve RAG document stats: {e}")
                response += "**Status:** RAG system active (document count unavailable)"

        return response

    except Exception as e:
        logger.error(f"Summary failed: {e}")
        return f"❌ Error: {str(e)}"
323
+
324
+
325
+ def chat_with_fraud_expert(message: str, history: list, use_rag: bool):
326
+ """Chat with fraud detection expert."""
327
+ if groq_client is None:
328
+ return history + [[message, "❌ System not initialized. Please wait for initialization to complete."]]
329
+
330
+ try:
331
+ # Check if message is asking about a specific transaction ID
332
+ import re
333
+ transaction_query = re.search(r'transaction\s+(?:id\s+)?(\d+)', message.lower())
334
+ transaction_context = ""
335
+
336
+ if transaction_query and data_processor is not None:
337
+ transaction_id = int(transaction_query.group(1))
338
+ try:
339
+ # Get transaction data
340
+ transaction = data_processor.get_transaction_summary(transaction_id)
341
+
342
+ # Format transaction details with all relevant columns
343
+ transaction_context = f"\n\n**Transaction ID {transaction_id} Details:**\n"
344
+ transaction_context += f"- **Transaction Number:** {transaction.get('trans_num', 'N/A')}\n"
345
+ transaction_context += f"- **Date/Time:** {transaction.get('trans_date_trans_time', 'N/A')}\n"
346
+ transaction_context += f"- **Merchant:** {transaction.get('merchant', 'N/A')}\n"
347
+ transaction_context += f"- **Category:** {transaction.get('category', 'N/A')}\n"
348
+ transaction_context += f"- **Amount:** ${transaction.get('amt', 0):.2f}\n"
349
+ transaction_context += f"- **Location:** {transaction.get('city', 'N/A')}, {transaction.get('state', 'N/A')}\n"
350
+ transaction_context += f"- **Merchant Coordinates:** ({transaction.get('merch_lat', 'N/A')}, {transaction.get('merch_long', 'N/A')})\n"
351
+ transaction_context += f"\n**Cardholder Information:**\n"
352
+ transaction_context += f"- **Name:** {transaction.get('first', 'N/A')} {transaction.get('last', 'N/A')}\n"
353
+ transaction_context += f"- **Gender:** {transaction.get('gender', 'N/A')}\n"
354
+ transaction_context += f"- **Date of Birth:** {transaction.get('dob', 'N/A')}\n"
355
+ transaction_context += f"- **Job:** {transaction.get('job', 'N/A')}\n"
356
+ transaction_context += f"- **Street:** {transaction.get('street', 'N/A')}\n"
357
+ transaction_context += f"- **City/State/ZIP:** {transaction.get('city', 'N/A')}, {transaction.get('state', 'N/A')} {transaction.get('zip', 'N/A')}\n"
358
+ transaction_context += f"- **Cardholder Coordinates:** ({transaction.get('lat', 'N/A')}, {transaction.get('long', 'N/A')})\n"
359
+ transaction_context += f"- **City Population:** {transaction.get('city_pop', 'N/A')}\n"
360
+ transaction_context += f"\n**Card Information:**\n"
361
+ transaction_context += f"- **Card Number:** {transaction.get('cc_num', 'N/A')}\n"
362
+ transaction_context += f"\n**Fraud Status:**\n"
363
+ transaction_context += f"- **Actual Status:** {'🚨 FRAUD' if transaction.get('is_fraud', 0) == 1 else '✅ LEGITIMATE'}\n"
364
+
365
+ logger.info(f"Found transaction {transaction_id} for chat query")
366
+ except ValueError as e:
367
+ transaction_context = f"\n\n**Note:** {str(e)}\n"
368
+ except Exception as e:
369
+ logger.warning(f"Could not fetch transaction {transaction_id}: {e}")
370
+
371
+ # If RAG is enabled and vector store is available, get relevant context
372
+ context = ""
373
+ source_references = []
374
+
375
+ if use_rag and vector_store is not None:
376
+ docs = vector_store.similarity_search(message, k=3)
377
+ if docs:
378
+ context = "\n\nRelevant context from fraud detection documents:\n"
379
+ for i, doc in enumerate(docs, 1):
380
+ # Add context with source number
381
+ context += f"\n[Source {i}] {doc.page_content[:500]}...\n"
382
+
383
+ # Collect source information for reference list
384
+ source_file = doc.metadata.get('source', 'Unknown')
385
+ page_num = doc.metadata.get('page', 'N/A')
386
+ doc_type = doc.metadata.get('type', 'document')
387
+
388
+ # Format source info
389
+ if doc_type == 'fraud_pattern':
390
+ category = doc.metadata.get('category', 'N/A')
391
+ source_references.append(f"Source {i}: CSV Data - Fraud Pattern Analysis ({category})")
392
+ elif doc_type == 'statistical_summary':
393
+ scope = doc.metadata.get('scope', 'N/A')
394
+ source_references.append(f"Source {i}: CSV Data - Statistical Summary ({scope})")
395
+ elif doc_type == 'merchant_profile':
396
+ merchant = doc.metadata.get('merchant', 'N/A')
397
+ source_references.append(f"Source {i}: CSV Data - Merchant Profile ({merchant})")
398
+ elif doc_type == 'location_insight':
399
+ state = doc.metadata.get('state', 'N/A')
400
+ source_references.append(f"Source {i}: CSV Data - Location Analysis ({state})")
401
+ else:
402
+ # PDF document
403
+ if page_num != 'N/A':
404
+ source_references.append(f"Source {i}: {source_file}, Page {page_num}")
405
+ else:
406
+ source_references.append(f"Source {i}: {source_file}")
407
+
408
+ # Create prompt with transaction data and context
409
+ full_prompt = message
410
+ if transaction_context:
411
+ full_prompt = f"{message}\n{transaction_context}"
412
+ if context:
413
+ full_prompt = f"{full_prompt}\n{context}"
414
+
415
+ # Enhanced system message with inline citation instructions
416
+ system_message = """You are an expert fraud detection analyst. Help users understand fraud patterns, detection methods, and transaction analysis.
417
+
418
+ IMPORTANT CITATION RULES:
419
+ - When using information from the provided context sources, you MUST add an inline citation immediately after the relevant sentence or paragraph.
420
+ - Format citations as: [Source X] where X is the source number from the context.
421
+ - Place citations at the end of sentences that use information from that source.
422
+ - You can cite multiple sources in one paragraph if needed: [Source 1, Source 2]
423
+ - Be specific and reference the data when using information from sources.
424
+
425
+ TRANSACTION ANALYSIS:
426
+ - If transaction details are provided, analyze them thoroughly.
427
+ - Note: Ignore "fraud_" prefix in merchant names; it is an artifact of the synthetic dataset and NOT an indicator of fraud.
428
+ - Compare transaction characteristics against known fraud patterns.
429
+ - Provide a clear fraud risk assessment (Low/Medium/High).
430
+ - Explain your reasoning with specific indicators.
431
+
432
+ Example:
433
+ "Online gaming merchants often experience higher fraud rates due to card-not-present transactions. [Source 1] The average fraud rate in this category is 5.2%. [Source 2]"
434
+
435
+ Provide clear, actionable insights with proper inline citations."""
436
+
437
+ # Get response from LLM
438
+ response = groq_client.invoke(
439
+ prompt=full_prompt,
440
+ system_message=system_message,
441
+ )
442
+
443
+ # Score response quality
444
+ score_result = quality_scorer.score_response(
445
+ response=response,
446
+ query=message,
447
+ has_rag=use_rag and vector_store is not None,
448
+ sources=source_references,
449
+ )
450
+
451
+ # Add quality score display
452
+ quality_display = quality_scorer.format_score_display(score_result)
453
+ response += quality_display
454
+
455
+ # Add source reference list at the end
456
+ if source_references:
457
+ response += "\n**📚 Source References:**\n"
458
+ for ref in source_references:
459
+ response += f"\n- {ref}"
460
+
461
+ # Log quality score
462
+ logger.info(f"Response quality score: {score_result['overall_score']}/100 (Grade: {score_result['grade']})")
463
+
464
+ history.append({"role": "user", "content": message})
465
+ history.append({"role": "assistant", "content": response})
466
+ return history
467
+
468
+ except Exception as e:
469
+ logger.error(f"Chat failed: {e}")
470
+ history.append({"role": "user", "content": message})
471
+ history.append({"role": "assistant", "content": f"❌ Error: {str(e)}"})
472
+ return history
473
+
474
+
475
+
476
+ # Create Gradio interface
477
+ def create_interface():
478
+ """Create the Gradio interface."""
479
+
480
+ with gr.Blocks(
481
+ theme=gr.themes.Soft(
482
+ primary_hue="blue",
483
+ secondary_hue="slate",
484
+ font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui", "sans-serif"],
485
+ ),
486
+ title="Fraud Detection Chatbot",
487
+ css="""
488
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
489
+
490
+ * {
491
+ font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif !important;
492
+ }
493
+
494
+ .gradio-container {
495
+ max-width: 1200px !important;
496
+ }
497
+
498
+ h1, h2, h3, h4, h5, h6 {
499
+ font-weight: 600 !important;
500
+ }
501
+
502
+ .markdown-text {
503
+ font-size: 15px !important;
504
+ line-height: 1.6 !important;
505
+ }
506
+
507
+ button {
508
+ font-weight: 500 !important;
509
+ }
510
+ """
511
+ ) as demo:
512
+
513
+ gr.Markdown("""
514
+ # 🛡️ Fraud Detection Chatbot
515
+
516
+ AI-powered fraud detection system using LangChain, Groq, and RAG (Retrieval Augmented Generation).
517
+ """)
518
+
519
+ # System status
520
+ with gr.Row():
521
+ init_status = gr.Textbox(
522
+ label="System Status",
523
+ value="Initializing...",
524
+ interactive=False,
525
+ )
526
+
527
+ # Tabs for different functionalities
528
+ with gr.Tabs():
529
+
530
+ # Tab 1: Chat with Expert
531
+ with gr.Tab("💬 Chat with Fraud Expert"):
532
+ gr.Markdown("""
533
+ Ask questions about fraud detection, transaction patterns, or get expert advice.
534
+ """)
535
+
536
+ with gr.Row():
537
+ chat_use_rag = gr.Checkbox(
538
+ label="Use RAG (Enhanced with fraud detection documents + CSV data)",
539
+ value=True,
540
+ )
541
+
542
+ chatbot = gr.Chatbot(
543
+ label="Fraud Detection Expert",
544
+ height=500,
545
+ )
546
+
547
+ with gr.Row():
548
+ chat_input = gr.Textbox(
549
+ label="Your Question",
550
+ placeholder="Ask about fraud detection, transaction analysis, etc...",
551
+ scale=4,
552
+ )
553
+ chat_submit = gr.Button("Send", variant="primary", scale=1)
554
+
555
+ chat_clear = gr.Button("Clear Chat")
556
+
557
+ # Chat examples
558
+ gr.Examples(
559
+ examples=[
560
+ "What are common indicators of credit card fraud?",
561
+ "How can I detect unusual transaction patterns?",
562
+ "What are fraud patterns in grocery transactions?",
563
+ "Which merchants have high fraud rates?",
564
+ "What states have elevated fraud activity?",
565
+ ],
566
+ inputs=chat_input,
567
+ )
568
+
569
+ # Tab 2: Analyze by Transaction ID
570
+ with gr.Tab("🔍 Analyze by Transaction ID"):
571
+ gr.Markdown("""
572
+ Analyze a specific transaction from the dataset by its ID.
573
+ """)
574
+
575
+ txn_id_input = gr.Number(
576
+ label="Transaction ID",
577
+ value=0,
578
+ precision=0,
579
+ )
580
+ txn_id_use_rag = gr.Checkbox(
581
+ label="Use RAG (Enhanced analysis)",
582
+ value=True,
583
+ )
584
+ txn_id_submit = gr.Button("Analyze Transaction", variant="primary")
585
+
586
+ txn_id_output = gr.Markdown(label="Analysis Result")
587
+
588
+
589
+ # Tab 3: Analyze Manual Transaction
590
+ with gr.Tab("✍️ Analyze Manual Transaction"):
591
+ gr.Markdown("""
592
+ Enter transaction details manually for fraud analysis.
593
+ """)
594
+
595
+ # Basic Fields
596
+ gr.Markdown("### Basic Transaction Information")
597
+ manual_merchant = gr.Textbox(
598
+ label="Merchant Name",
599
+ placeholder="e.g., Amazon, Walmart",
600
+ )
601
+ manual_category = gr.Dropdown(
602
+ label="Category",
603
+ choices=[
604
+ "grocery_pos", "gas_transport", "misc_net",
605
+ "shopping_net", "shopping_pos", "entertainment",
606
+ "food_dining", "personal_care", "health_fitness",
607
+ "travel", "kids_pets", "home"
608
+ ],
609
+ value="grocery_pos",
610
+ )
611
+ manual_amount = gr.Number(
612
+ label="Amount ($)",
613
+ value=100.0,
614
+ )
615
+ manual_city = gr.Textbox(
616
+ label="City",
617
+ placeholder="e.g., Jakarta",
618
+ )
619
+ manual_state = gr.Textbox(
620
+ label="State",
621
+ placeholder="e.g., DKI",
622
+ )
623
+
624
+ # Advanced Fields (Accordion)
625
+ with gr.Accordion("🔧 Advanced Fields (Optional)", open=False):
626
+ gr.Markdown("*Provide additional details for more accurate fraud analysis*")
627
+
628
+ with gr.Row():
629
+ manual_gender = gr.Radio(
630
+ label="Cardholder Gender",
631
+ choices=["M", "F"],
632
+ value="M",
633
+ )
634
+ manual_age = gr.Number(
635
+ label="Cardholder Age",
636
+ value=35,
637
+ precision=0,
638
+ )
639
+
640
+ manual_job = gr.Textbox(
641
+ label="Cardholder Job",
642
+ placeholder="e.g., Engineer, Teacher",
643
+ )
644
+
645
+ with gr.Row():
646
+ manual_zip = gr.Textbox(
647
+ label="ZIP Code",
648
+ placeholder="e.g., 12345",
649
+ )
650
+ manual_city_pop = gr.Number(
651
+ label="City Population",
652
+ value=100000,
653
+ precision=0,
654
+ )
655
+
656
+ with gr.Row():
657
+ manual_merch_lat = gr.Number(
658
+ label="Merchant Latitude",
659
+ value=0.0,
660
+ )
661
+ manual_merch_long = gr.Number(
662
+ label="Merchant Longitude",
663
+ value=0.0,
664
+ )
665
+
666
+ manual_use_rag = gr.Checkbox(
667
+ label="Use RAG (Enhanced analysis)",
668
+ value=True,
669
+ )
670
+ manual_submit = gr.Button("Analyze Transaction", variant="primary")
671
+
672
+ manual_output = gr.Markdown(label="Analysis Result")
673
+
674
+
675
+ # Tab 4: Dataset Summary
676
+ with gr.Tab("📊 Dataset Summary"):
677
+ gr.Markdown("""
678
+ View statistics and insights from the fraud detection dataset.
679
+ """)
680
+
681
+ summary_button = gr.Button("Get Dataset Summary", variant="primary")
682
+ summary_output = gr.Markdown(label="Summary")
683
+
684
+ # Event handlers
685
+ def chat_fn(message, history, use_rag):
686
+ return chat_with_fraud_expert(message, history, use_rag)
687
+
688
+ chat_submit.click(
689
+ fn=chat_fn,
690
+ inputs=[chat_input, chatbot, chat_use_rag],
691
+ outputs=chatbot,
692
+ ).then(
693
+ lambda: "",
694
+ outputs=chat_input,
695
+ )
696
+
697
+ chat_input.submit(
698
+ fn=chat_fn,
699
+ inputs=[chat_input, chatbot, chat_use_rag],
700
+ outputs=chatbot,
701
+ ).then(
702
+ lambda: "",
703
+ outputs=chat_input,
704
+ )
705
+
706
+ chat_clear.click(
707
+ lambda: [],
708
+ outputs=chatbot,
709
+ )
710
+
711
+ txn_id_submit.click(
712
+ fn=analyze_by_transaction_id,
713
+ inputs=[txn_id_input, txn_id_use_rag],
714
+ outputs=txn_id_output,
715
+ )
716
+
717
+ manual_submit.click(
718
+ fn=analyze_by_manual_data,
719
+ inputs=[
720
+ manual_merchant,
721
+ manual_category,
722
+ manual_amount,
723
+ manual_city,
724
+ manual_state,
725
+ manual_use_rag,
726
+ manual_gender,
727
+ manual_age,
728
+ manual_job,
729
+ manual_zip,
730
+ manual_city_pop,
731
+ manual_merch_lat,
732
+ manual_merch_long,
733
+ ],
734
+ outputs=manual_output,
735
+ )
736
+
737
+ summary_button.click(
738
+ fn=get_dataset_summary,
739
+ outputs=summary_output,
740
+ )
741
+
742
+ # Initialize system on load
743
+ demo.load(
744
+ fn=initialize_system,
745
+ outputs=init_status,
746
+ )
747
+
748
+ return demo
749
+
750
+
751
+ if __name__ == "__main__":
752
+ demo = create_interface()
753
+ demo.launch(
754
+ server_name="0.0.0.0",
755
+ server_port=7860,
756
+ share=False,
757
+ )
data/Bhatla.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6961a50224423ea16eb2b97486fdbe88b4d1a48fd9289687e911e3ae10c4596d
3
+ size 1215275
data/EBA_ECB 2024 Report on Payment Fraud.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dca63ad08f8ea7d5d0db77bd5953bbc0ebca987a3b7e4df501c43e825dfe5ebf
3
+ size 734484
data/fraudTest.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12d553ab19440c752d2531ee1af44bb64f12cc3d3839f1649f19e81c230545f0
3
+ size 150354339
data/fraudTrain.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd7139200dbfcbed0b6742bbe05a4f1abce532c4fef20918228a651647a3e75d
3
+ size 351238196
docker-compose.yml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Two services built from the same image: the Gradio UI (default CMD)
# and the FastAPI backend (explicit uvicorn command).
services:
  app:
    build: .
    container_name: fraud-detection-ui
    ports:
      - "7860:7860"  # Gradio UI
    volumes:
      - ./data:/app/data            # datasets + PDFs mounted, not baked into the image
      - ./chroma_db:/app/chroma_db  # persist the Chroma vector store across restarts
    env_file:
      - .env  # GROQ_API_KEY and friends
    environment:
      - HOST=0.0.0.0
    restart: always

  api:
    build: .
    container_name: fraud-detection-api
    command: uvicorn main:app --host 0.0.0.0 --port 8000
    ports:
      - "8000:8000"  # REST API
    volumes:
      - ./data:/app/data
      - ./chroma_db:/app/chroma_db
    env_file:
      - .env
    restart: always
main.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Main FastAPI application."""

import logging
import warnings
import os
from contextlib import asynccontextmanager

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', message='.*LangChain.*')

# Disable ChromaDB telemetry to avoid errors
# (set before any downstream module imports chromadb)
os.environ['ANONYMIZED_TELEMETRY'] = 'False'

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

from src.config.config import settings
from src.api.routes import router

# Configure logging
logging.basicConfig(
    level=logging.INFO if not settings.debug else logging.DEBUG,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)

logger = logging.getLogger(__name__)

# Suppress chromadb logging
logging.getLogger('chromadb').setLevel(logging.ERROR)
logging.getLogger('chromadb.telemetry').setLevel(logging.CRITICAL)
+
34
+
35
+ @asynccontextmanager
36
+ async def lifespan(app: FastAPI):
37
+ """Lifespan context manager for startup and shutdown events."""
38
+ # Startup
39
+ logger.info("Starting Fraud Detection API...")
40
+ logger.info(f"Using Groq model: {settings.groq_model}")
41
+
42
+ # Initialize RAG system if needed
43
+ try:
44
+ from src.rag.document_loader import DocumentLoader
45
+ from src.rag.vector_store import VectorStore
46
+
47
+ logger.info("Initializing RAG system...")
48
+ document_loader = DocumentLoader(
49
+ chunk_size=settings.chunk_size,
50
+ chunk_overlap=settings.chunk_overlap,
51
+ )
52
+
53
+ # Load PDF documents
54
+ pdf_documents = document_loader.load_pdfs_from_directory(settings.pdf_dir)
55
+
56
+ if pdf_documents:
57
+ vector_store = VectorStore()
58
+ vector_store.add_documents(pdf_documents)
59
+ logger.info("RAG system initialized successfully")
60
+ else:
61
+ logger.warning("No PDF documents found for RAG system")
62
+ except Exception as e:
63
+ logger.warning(f"Failed to initialize RAG system: {str(e)}")
64
+
65
+ yield
66
+
67
+ # Shutdown
68
+ logger.info("Shutting down Fraud Detection API...")
69
+
70
+
71
+ # Create FastAPI app
72
+ app = FastAPI(
73
+ title=settings.app_name,
74
+ version=settings.app_version,
75
+ description="Fraud Detection API using LangChain and Groq",
76
+ lifespan=lifespan,
77
+ )
78
+
79
+ # Add CORS middleware
80
+ app.add_middleware(
81
+ CORSMiddleware,
82
+ allow_origins=["*"],
83
+ allow_credentials=True,
84
+ allow_methods=["*"],
85
+ allow_headers=["*"],
86
+ )
87
+
88
+ # Include routers
89
+ app.include_router(router)
90
+
91
+
92
@app.get("/", tags=["root"])
async def root() -> dict:
    """Landing endpoint: service name, version, and docs location."""
    info = {
        "message": "Fraud Detection API",
        "version": settings.app_version,
        "docs": "/docs",
    }
    return info
100
+
101
+
102
if __name__ == "__main__":
    import uvicorn

    # Development entry point; auto-reload only when settings.debug is set.
    uvicorn.run(
        "main:app",
        host=settings.api_host,
        port=settings.api_port,
        reload=settings.debug,
    )
111
+
112
+
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.128.0
2
+ gradio==6.3.0
3
+ langchain_chroma==1.1.0
4
+ langchain_community==0.4.1
5
+ langchain_core==1.2.7
6
+ langchain_groq==1.1.1
7
+ langchain_text_splitters==1.1.0
8
+ pandas==2.3.3
9
+ pydantic==2.12.5
10
+ pydantic_settings==2.12.0
11
+ uvicorn==0.40.0
12
+ python-dotenv==1.0.1
13
+ pypdf==5.1.0
14
+ sentence-transformers==3.3.1
15
+ huggingface-hub>=0.27.0
16
+ httpx==0.28.1
17
+ loguru==0.7.3
src/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Fraud detection application package."""
2
+
3
+ __version__ = "1.0.0"
4
+
5
+
src/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (234 Bytes). View file
 
src/api/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """API routes module."""
2
+
3
+
4
+
src/api/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (193 Bytes). View file
 
src/api/__pycache__/routes.cpython-311.pyc ADDED
Binary file (5.69 kB). View file
 
src/api/routes.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """API routes for fraud detection."""
2
+
3
+ import logging
4
+ from typing import Dict, List
5
+
6
+ from fastapi import APIRouter, HTTPException, status
7
+
8
+ from src.schemas.fraud import (
9
+ FraudAnalysisRequest,
10
+ FraudAnalysisResponse,
11
+ TransactionSummary,
12
+ )
13
+ from src.services.fraud_analyzer import FraudAnalyzer
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ router = APIRouter(prefix="/api/v1", tags=["fraud"])
18
+
19
+ # Initialize services (in production, use dependency injection)
20
+ fraud_analyzer = FraudAnalyzer()
21
+
22
+
23
@router.get("/health", summary="Health check")
async def health_check() -> Dict[str, str]:
    """Report liveness of the fraud-detection API."""
    payload: Dict[str, str] = {
        "status": "healthy",
        "service": "fraud-detection-api",
    }
    return payload
27
+
28
+
29
@router.post(
    "/analyze",
    response_model=FraudAnalysisResponse,
    status_code=status.HTTP_200_OK,
    summary="Analyze transaction for fraud",
)
async def analyze_transaction(request: FraudAnalysisRequest) -> FraudAnalysisResponse:
    """Analyze a transaction for fraud indicators.

    Args:
        request: Fraud analysis request; must carry either a transaction_id
            or inline transaction_data.

    Returns:
        Fraud analysis response with detailed assessment.

    Raises:
        HTTPException: 400 on missing/invalid input, 500 on internal errors.
    """
    # FIX: validate outside the try block and compare against None —
    # `not request.transaction_id` wrongly rejected a valid ID of 0, and the
    # 400 raised inside the try was swallowed by `except Exception` below
    # and returned to the client as a 500.
    if request.transaction_id is None and request.transaction_data is None:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Either transaction_id or transaction_data must be provided",
        )

    try:
        result = fraud_analyzer.analyze_transaction(
            transaction_id=request.transaction_id,
            # Pydantic v2: .dict() is deprecated in favor of .model_dump().
            transaction_data=request.transaction_data.model_dump() if request.transaction_data else None,
            use_rag=request.use_rag,
        )

        return FraudAnalysisResponse(**result)
    except ValueError as e:
        # Domain validation errors (e.g. unknown transaction ID) map to 400.
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=str(e),
        )
    except Exception as e:
        logger.error(f"Error analyzing transaction: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        )
69
+
70
+
71
@router.get(
    "/summary",
    response_model=TransactionSummary,
    summary="Get transaction summary",
)
async def get_summary() -> TransactionSummary:
    """Return aggregate statistics for the fraud dataset.

    Returns:
        Transaction summary with statistics.

    Raises:
        HTTPException: 500 if the underlying data processor fails.
    """
    try:
        stats = fraud_analyzer.data_processor.get_transaction_summary()
        return TransactionSummary(**stats)
    except Exception as e:
        logger.error(f"Error getting summary: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        )
91
+
92
+
93
@router.post(
    "/batch-analyze",
    response_model=List[FraudAnalysisResponse],
    summary="Batch analyze multiple transactions",
)
async def batch_analyze(
    transaction_ids: List[int],
    use_rag: bool = True,
) -> List[FraudAnalysisResponse]:
    """Analyze multiple transactions in batch.

    Args:
        transaction_ids: List of transaction IDs to analyze.
        use_rag: Whether to use RAG for context.

    Returns:
        List of fraud analysis responses.

    Raises:
        HTTPException: 400 when no IDs are supplied, 500 on internal errors.
    """
    # FIX: validate outside the try block — the 400 raised inside the try
    # was caught by `except Exception` below and re-emitted as a 500.
    if not transaction_ids:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="At least one transaction_id must be provided",
        )

    try:
        results = fraud_analyzer.batch_analyze(transaction_ids, use_rag=use_rag)
        return [FraudAnalysisResponse(**result) for result in results]
    except Exception as e:
        logger.error(f"Error in batch analysis: {str(e)}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Internal server error: {str(e)}",
        )
126
+
src/config/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """Configuration module."""
2
+
3
+ from src.config.config import settings
4
+
5
+ __all__ = ["settings"]
6
+
7
+
8
+
src/config/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (295 Bytes). View file
 
src/config/__pycache__/config.cpython-311.pyc ADDED
Binary file (2.01 kB). View file
 
src/config/config.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Configuration module for the fraud detection application."""

import os
from pathlib import Path
from typing import Optional

from pydantic_settings import BaseSettings


class Settings(BaseSettings):
    """Application settings.

    Values are loaded from the environment and the ``.env`` file by
    pydantic-settings; the class attributes below are the defaults.
    """

    # Groq API Configuration
    max_tokens: int = 8192
    # NOTE(review): the os.getenv default is redundant — BaseSettings already
    # reads GROQ_API_KEY from the environment/.env (case-insensitive). A plain
    # `groq_api_key: str = ""` should behave identically; confirm before changing.
    groq_api_key: str = os.getenv("GROQ_API_KEY", "")
    groq_model: str = "meta-llama/llama-4-maverick-17b-128e-instruct"

    # Application Configuration
    app_name: str = "Fraud Detection API"
    app_version: str = "1.0.0"
    debug: bool = False

    # Data Paths
    data_dir: Path = Path("data")
    train_data_path: Path = data_dir / "fraudTrain.csv"
    pdf_dir: Path = data_dir  # PDFs live alongside the CSVs

    # RAG Configuration
    chunk_size: int = 1000
    chunk_overlap: int = 200
    vector_store_path: Optional[str] = None  # Will use in-memory by default

    # API Configuration
    api_host: str = "localhost"
    api_port: int = 8000

    class Config:
        """Pydantic config.

        NOTE(review): the inner ``Config`` class is the pydantic v1 style and
        is deprecated under pydantic-settings 2.x; consider migrating to
        ``model_config = SettingsConfigDict(env_file=".env", case_sensitive=False)``.
        """

        env_file = ".env"
        case_sensitive = False


# Module-level singleton imported throughout the application.
settings = Settings()
45
+
46
+
src/data/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """Data processing module."""
2
+
3
+ from src.data.processor import FraudDataProcessor
4
+
5
+ __all__ = ["FraudDataProcessor"]
6
+
7
+
8
+
src/data/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (307 Bytes). View file
 
src/data/__pycache__/processor.cpython-311.pyc ADDED
Binary file (6.16 kB). View file
 
src/data/processor.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Data processor for fraud detection datasets."""

import logging
from pathlib import Path
from typing import Dict, List, Optional

import pandas as pd

from src.config.config import settings

logger = logging.getLogger(__name__)


class FraudDataProcessor:
    """Processor for fraud detection data.

    Loads the fraudTrain.csv dataset lazily and answers per-transaction
    and whole-dataset summary queries.
    """

    def __init__(self) -> None:
        """Initialize data processor."""
        # Lazily populated by load_train_data(); None until first load.
        self.train_df: Optional[pd.DataFrame] = None

    def load_train_data(self, path: Optional[Path] = None) -> pd.DataFrame:
        """Load training data.

        Args:
            path: Path to training data CSV. If None, uses default path.

        Returns:
            Training dataframe.

        Raises:
            FileNotFoundError: If no CSV exists at the resolved path.
        """
        data_path = path or settings.train_data_path

        if not data_path.exists():
            raise FileNotFoundError(f"Training data not found: {data_path}")

        try:
            logger.info(f"Loading training data from {data_path}")
            # Load full dataset for accurate statistics
            self.train_df = pd.read_csv(data_path)

            # Clean merchant names (remove 'fraud_' prefix common in synthetic datasets)
            if 'merchant' in self.train_df.columns:
                self.train_df['merchant'] = self.train_df['merchant'].str.replace('fraud_', '', regex=False)

            logger.info(f"Loaded {len(self.train_df)} rows from training data (merchant names cleaned)")
            return self.train_df
        except Exception as e:
            logger.error(f"Error loading training data: {str(e)}")
            raise

    def get_transaction_summary(self, transaction_id: Optional[int] = None) -> Dict:
        """Get summary of a transaction or all transactions.

        Args:
            transaction_id: Optional transaction ID. If None, returns overall summary.

        Returns:
            Transaction summary dictionary: a single row as a dict when an ID
            is given, otherwise aggregate dataset statistics.

        Raises:
            ValueError: If the requested transaction ID is not in the dataset.
        """
        if self.train_df is None:
            # Lazy-load on first use.
            self.load_train_data()

        df = self.train_df

        if transaction_id is not None:
            # Transaction IDs are dataframe row indices, not a dataset column.
            transaction = df[df.index == transaction_id]
            if transaction.empty:
                raise ValueError(f"Transaction {transaction_id} not found")

            return transaction.iloc[0].to_dict()

        # Overall summary
        summary = {
            "total_transactions": len(df),
            "fraud_count": int(df["is_fraud"].sum()),
            "fraud_percentage": float(df["is_fraud"].mean() * 100),
            "total_amount": float(df["amt"].sum()),
            "average_amount": float(df["amt"].mean()),
            "categories": df["category"].value_counts().to_dict(),
        }

        return summary

    def format_transaction_for_llm(self, transaction: Dict) -> str:
        """Format a transaction dictionary for LLM analysis.

        Args:
            transaction: Transaction dictionary.

        Returns:
            Formatted string representation.
        """
        # NOTE: .strip() only trims the outer blank edges of the literal;
        # interior leading whitespace (if any) is part of the output.
        formatted = f"""
        Transaction Details:
        - Date/Time: {transaction.get('trans_date_trans_time', 'N/A')}
        - Merchant: {str(transaction.get('merchant', 'N/A')).replace('fraud_', '')}
        - Category: {transaction.get('category', 'N/A')}
        - Amount: ${transaction.get('amt', 'N/A')}
        - Customer: {transaction.get('first', 'N/A')} {transaction.get('last', 'N/A')}
        - Gender: {transaction.get('gender', 'N/A')}
        - Location: {transaction.get('city', 'N/A')}, {transaction.get('state', 'N/A')}
        - Job: {transaction.get('job', 'N/A')}
        - City Population: {transaction.get('city_pop', 'N/A')}
        - Distance from Merchant: Calculated from coordinates
        """
        return formatted.strip()
106
+
107
+
108
+
src/llm/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """LLM integration module."""
2
+
3
+ from src.llm.groq_client import GroqClient
4
+
5
+ __all__ = ["GroqClient"]
6
+
7
+
8
+
src/llm/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (298 Bytes). View file
 
src/llm/__pycache__/groq_client.cpython-311.pyc ADDED
Binary file (3.69 kB). View file
 
src/llm/groq_client.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Groq LLM client using LangChain."""
2
+
3
+ import logging
4
+ from typing import Any, List, Optional
5
+
6
+ from langchain_groq import ChatGroq
7
+ from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
8
+ from langchain_core.output_parsers import StrOutputParser
9
+
10
+ from src.config.config import settings
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
+ class GroqClient:
16
+ """Client for interacting with Groq LLM using LangChain."""
17
+
18
+ def __init__(
19
+ self,
20
+ api_key: Optional[str] = None,
21
+ model_name: Optional[str] = None,
22
+ temperature: float = 0,
23
+
24
+ ) -> None:
25
+ """Initialize Groq client.
26
+
27
+ Args:
28
+ api_key: Groq API key. If None, uses settings.groq_api_key.
29
+ model_name: Model name. If None, uses settings.groq_model.
30
+ temperature: Temperature for model generation.
31
+ """
32
+ self.api_key = api_key or settings.groq_api_key
33
+ self.model_name = model_name or settings.groq_model
34
+ self.temperature = temperature
35
+ self.max_tokens = settings.max_tokens
36
+
37
+ if not self.api_key:
38
+ raise ValueError("Groq API key is required. Set GROQ_API_KEY environment variable.")
39
+
40
+ self.llm = ChatGroq(
41
+ groq_api_key=self.api_key,
42
+ model_name=self.model_name,
43
+ temperature=self.temperature,
44
+ max_tokens=self.max_tokens,
45
+ )
46
+ self.output_parser = StrOutputParser()
47
+
48
+ logger.info(f"Initialized Groq client with model: {self.model_name}")
49
+
50
+ def invoke(
51
+ self,
52
+ prompt: str,
53
+ system_message: Optional[str] = None,
54
+ **kwargs: Any,
55
+ ) -> str:
56
+ """Invoke the LLM with a prompt.
57
+
58
+ Args:
59
+ prompt: User prompt.
60
+ system_message: Optional system message.
61
+ **kwargs: Additional arguments to pass to the LLM.
62
+
63
+ Returns:
64
+ Generated response as string.
65
+ """
66
+ messages: List[BaseMessage] = []
67
+
68
+ if system_message:
69
+ messages.append(SystemMessage(content=system_message))
70
+
71
+ messages.append(HumanMessage(content=prompt))
72
+
73
+ try:
74
+ response = self.llm.invoke(messages, **kwargs)
75
+ return self.output_parser.parse(response.content)
76
+ except Exception as e:
77
+ logger.error(f"Error invoking LLM: {str(e)}")
78
+ raise
79
+
80
+
81
+
src/rag/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ """RAG (Retrieval Augmented Generation) module."""
2
+
3
+ from src.rag.document_loader import DocumentLoader
4
+ from src.rag.vector_store import VectorStore
5
+
6
+ __all__ = ["DocumentLoader", "VectorStore"]
7
+
8
+
9
+
src/rag/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (409 Bytes). View file
 
src/rag/__pycache__/csv_document_generator.cpython-311.pyc ADDED
Binary file (14.2 kB). View file
 
src/rag/__pycache__/document_loader.cpython-311.pyc ADDED
Binary file (5.91 kB). View file
 
src/rag/__pycache__/vector_store.cpython-311.pyc ADDED
Binary file (4.88 kB). View file
 
src/rag/csv_document_generator.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CSV document generator for RAG system."""
2
+
3
+ import logging
4
+ from pathlib import Path
5
from typing import Any, Dict, List, Optional
6
+ import pandas as pd
7
+ from langchain_core.documents import Document
8
+
9
logger = logging.getLogger(__name__)


class CSVDocumentGenerator:
    """Generate knowledge-base documents from transaction CSV data.

    Each ``generate_*`` method aggregates the loaded DataFrame along one
    axis (category, amount range, merchant, state) and renders the result
    as LangChain ``Document`` objects for the RAG vector store.
    """

    def __init__(self, csv_path: Path, sample_size: int = 1050000) -> None:
        """Initialize CSV document generator.

        Args:
            csv_path: Path to the CSV file.
            sample_size: Maximum number of rows read from the CSV
                (caps memory use on very large files).
        """
        self.csv_path = Path(csv_path)
        self.sample_size = sample_size
        # Loaded lazily by load_data(); None until then.
        self.df: Optional[pd.DataFrame] = None

    @staticmethod
    def _tier(rate: float, high: float, medium: float,
              top: str, mid: str, low: str) -> str:
        """Map a fraud-rate percentage to one of three labels by threshold."""
        if rate > high:
            return top
        if rate > medium:
            return mid
        return low

    def load_data(self) -> None:
        """Load CSV data with sampling for efficiency.

        Raises:
            FileNotFoundError: If the CSV file does not exist.
        """
        if not self.csv_path.exists():
            raise FileNotFoundError(f"CSV file not found: {self.csv_path}")

        try:
            logger.info(f"Loading CSV data from {self.csv_path}")
            # nrows caps how much of the (potentially huge) file is read.
            self.df = pd.read_csv(self.csv_path, nrows=self.sample_size)

            # Clean merchant names (remove 'fraud_' prefix common in synthetic datasets)
            if 'merchant' in self.df.columns:
                self.df['merchant'] = self.df['merchant'].str.replace('fraud_', '', regex=False)

            logger.info(f"Loaded {len(self.df)} rows from CSV (merchant names cleaned)")
        except Exception as e:
            logger.error(f"Error loading CSV: {str(e)}")
            raise

    def generate_fraud_pattern_documents(self) -> List[Document]:
        """Generate documents about fraud patterns by category.

        Returns:
            List of documents containing fraud pattern insights.
        """
        if self.df is None:
            self.load_data()

        documents = []

        # Fraud counts/rates per spending category.
        category_fraud = self.df.groupby('category').agg({
            'is_fraud': ['sum', 'mean', 'count']
        }).round(4)

        for category in category_fraud.index:
            fraud_count = int(category_fraud.loc[category, ('is_fraud', 'sum')])
            fraud_rate = float(category_fraud.loc[category, ('is_fraud', 'mean')] * 100)
            total_txns = int(category_fraud.loc[category, ('is_fraud', 'count')])

            content = f"""Fraud Pattern Analysis - Category: {category}

Based on historical transaction data analysis:

- Total Transactions: {total_txns:,}
- Fraud Cases: {fraud_count:,}
- Fraud Rate: {fraud_rate:.2f}%
- Risk Level: {self._tier(fraud_rate, 5, 1, 'HIGH', 'MEDIUM', 'LOW')}

This category shows {self._tier(fraud_rate, 5, 1, 'significant', 'moderate', 'low')} fraud activity in the historical dataset.
"""
            documents.append(Document(
                page_content=content,
                metadata={
                    "source": "fraudTrain.csv",
                    "type": "fraud_pattern",
                    "category": category,
                    "fraud_rate": fraud_rate
                }
            ))

        logger.info(f"Generated {len(documents)} category fraud pattern documents")
        return documents

    def generate_statistical_summaries(self) -> List[Document]:
        """Generate statistical summary documents.

        Returns:
            List of documents containing statistical insights.
        """
        if self.df is None:
            self.load_data()

        documents = []

        # Overall statistics
        total_txns = len(self.df)
        fraud_txns = int(self.df['is_fraud'].sum())
        fraud_rate = float(self.df['is_fraud'].mean() * 100)
        avg_amount = float(self.df['amt'].mean())
        fraud_avg_amount = float(self.df[self.df['is_fraud'] == 1]['amt'].mean())
        legit_avg_amount = float(self.df[self.df['is_fraud'] == 0]['amt'].mean())

        overall_summary = f"""Overall Fraud Detection Statistics

Dataset Summary:
- Total Transactions Analyzed: {total_txns:,}
- Fraudulent Transactions: {fraud_txns:,}
- Overall Fraud Rate: {fraud_rate:.2f}%
- Average Transaction Amount: ${avg_amount:.2f}
- Average Fraud Amount: ${fraud_avg_amount:.2f}
- Average Legitimate Amount: ${legit_avg_amount:.2f}

Key Insight: Fraudulent transactions have an average amount of ${fraud_avg_amount:.2f} compared to ${legit_avg_amount:.2f} for legitimate transactions.
"""
        documents.append(Document(
            page_content=overall_summary,
            metadata={
                "source": "fraudTrain.csv",
                "type": "statistical_summary",
                "scope": "overall"
            }
        ))

        # Amount range analysis. Binning is done on a temporary Series so
        # self.df is NOT mutated (previously an 'amount_range' column was
        # silently added to the loaded frame as a side effect).
        amount_bins = [0, 10, 50, 100, 500, 1000, float('inf')]
        amount_labels = ['$0-10', '$10-50', '$50-100', '$100-500', '$500-1000', '$1000+']
        amount_range = pd.cut(self.df['amt'], bins=amount_bins, labels=amount_labels)

        amount_fraud = self.df.groupby(amount_range, observed=True).agg({
            'is_fraud': ['sum', 'mean', 'count']
        }).round(4)

        amount_content = "Fraud Patterns by Transaction Amount\n\n"
        for amt_range in amount_labels:
            if amt_range in amount_fraud.index:
                fraud_count = int(amount_fraud.loc[amt_range, ('is_fraud', 'sum')])
                fraud_rate = float(amount_fraud.loc[amt_range, ('is_fraud', 'mean')] * 100)
                total = int(amount_fraud.loc[amt_range, ('is_fraud', 'count')])

                amount_content += f"""
Amount Range: {amt_range}
- Total Transactions: {total:,}
- Fraud Cases: {fraud_count:,}
- Fraud Rate: {fraud_rate:.2f}%
"""

        documents.append(Document(
            page_content=amount_content,
            metadata={
                "source": "fraudTrain.csv",
                "type": "statistical_summary",
                "scope": "amount_analysis"
            }
        ))

        logger.info(f"Generated {len(documents)} statistical summary documents")
        return documents

    def generate_merchant_profiles(self) -> List[Document]:
        """Generate merchant risk profile documents.

        Returns:
            List of documents containing merchant insights.
        """
        if self.df is None:
            self.load_data()

        documents = []

        # Per-merchant fraud stats plus average ticket size.
        merchant_stats = self.df.groupby('merchant').agg({
            'is_fraud': ['sum', 'mean', 'count'],
            'amt': 'mean'
        }).round(4)

        # Get top 20 merchants by volume
        top_merchants = merchant_stats.nlargest(20, ('is_fraud', 'count'))

        for merchant in top_merchants.index:
            fraud_count = int(top_merchants.loc[merchant, ('is_fraud', 'sum')])
            fraud_rate = float(top_merchants.loc[merchant, ('is_fraud', 'mean')] * 100)
            total_txns = int(top_merchants.loc[merchant, ('is_fraud', 'count')])
            avg_amt = float(top_merchants.loc[merchant, ('amt', 'mean')])

            content = f"""Merchant Risk Profile: {merchant}

Transaction Analysis:
- Total Transactions: {total_txns:,}
- Fraudulent Transactions: {fraud_count:,}
- Fraud Rate: {fraud_rate:.2f}%
- Average Transaction Amount: ${avg_amt:.2f}
- Risk Assessment: {self._tier(fraud_rate, 10, 5, 'HIGH RISK', 'MEDIUM RISK', 'LOW RISK')}

This merchant profile is based on historical transaction patterns and can help identify similar fraud patterns.
"""
            documents.append(Document(
                page_content=content,
                metadata={
                    "source": "fraudTrain.csv",
                    "type": "merchant_profile",
                    "merchant": merchant,
                    "fraud_rate": fraud_rate
                }
            ))

        logger.info(f"Generated {len(documents)} merchant profile documents")
        return documents

    def generate_location_insights(self) -> List[Document]:
        """Generate location-based fraud insights.

        Returns:
            List of documents containing location insights.
        """
        if self.df is None:
            self.load_data()

        documents = []

        # State-level analysis
        state_fraud = self.df.groupby('state').agg({
            'is_fraud': ['sum', 'mean', 'count']
        }).round(4)

        # Get top 15 states by transaction volume
        top_states = state_fraud.nlargest(15, ('is_fraud', 'count'))

        for state in top_states.index:
            fraud_count = int(top_states.loc[state, ('is_fraud', 'sum')])
            fraud_rate = float(top_states.loc[state, ('is_fraud', 'mean')] * 100)
            total_txns = int(top_states.loc[state, ('is_fraud', 'count')])

            content = f"""Geographic Fraud Analysis - State: {state}

Location-based Fraud Patterns:
- Total Transactions: {total_txns:,}
- Fraud Cases: {fraud_count:,}
- Fraud Rate: {fraud_rate:.2f}%
- Geographic Risk Level: {self._tier(fraud_rate, 5, 2, 'HIGH', 'MEDIUM', 'LOW')}

This geographic area shows {self._tier(fraud_rate, 5, 2, 'elevated', 'moderate', 'normal')} fraud activity levels.
"""
            documents.append(Document(
                page_content=content,
                metadata={
                    "source": "fraudTrain.csv",
                    "type": "location_insight",
                    "state": state,
                    "fraud_rate": fraud_rate
                }
            ))

        logger.info(f"Generated {len(documents)} location insight documents")
        return documents

    def generate_all_documents(self) -> List[Document]:
        """Generate all types of documents from CSV data.

        Returns:
            List of all generated documents.
        """
        all_documents = []

        logger.info("Generating all document types from CSV data...")

        all_documents.extend(self.generate_fraud_pattern_documents())
        all_documents.extend(self.generate_statistical_summaries())
        all_documents.extend(self.generate_merchant_profiles())
        all_documents.extend(self.generate_location_insights())

        logger.info(f"Generated total of {len(all_documents)} documents from CSV data")
        return all_documents
src/rag/document_loader.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Document loader for PDF files."""
2
+
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import List
6
+
7
+ from langchain_community.document_loaders import PyPDFLoader
8
+ from langchain_core.documents import Document
9
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
10
+
11
+ from src.config.config import settings
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class DocumentLoader:
    """Turns PDF files (and CSV-derived insights) into chunked documents."""

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ) -> None:
        """Initialize document loader.

        Args:
            chunk_size: Size of text chunks.
            chunk_overlap: Overlap between chunks.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Recursive splitter keeps chunks near chunk_size characters while
        # preferring natural break points.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )

    def load_pdf(self, pdf_path: Path) -> List[Document]:
        """Load a PDF file and split it into chunks.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            List of document chunks.

        Raises:
            FileNotFoundError: If the PDF file does not exist.
        """
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        try:
            logger.info(f"Loading PDF: {pdf_path}")
            pages = PyPDFLoader(str(pdf_path)).load()
            chunks = self.text_splitter.split_documents(pages)
            logger.info(f"Loaded {len(chunks)} chunks from {pdf_path}")
            return chunks
        except Exception as e:
            logger.error(f"Error loading PDF {pdf_path}: {str(e)}")
            raise

    def load_pdfs_from_directory(self, directory: Path) -> List[Document]:
        """Load all PDF files from a directory.

        Files that fail to load are logged and skipped so one bad PDF does
        not abort the whole batch.

        Args:
            directory: Directory containing PDF files.

        Returns:
            List of document chunks from all PDFs.

        Raises:
            FileNotFoundError: If the directory does not exist.
        """
        if not directory.exists():
            raise FileNotFoundError(f"Directory not found: {directory}")

        pdf_files = list(directory.glob("*.pdf"))
        if not pdf_files:
            logger.warning(f"No PDF files found in {directory}")
            return []

        all_chunks: List[Document] = []
        for pdf_path in pdf_files:
            try:
                all_chunks.extend(self.load_pdf(pdf_path))
            except Exception as e:
                logger.error(f"Failed to load {pdf_path}: {str(e)}")

        logger.info(f"Loaded {len(all_chunks)} total chunks from {len(pdf_files)} PDFs")
        return all_chunks

    def load_csv_insights(self, csv_path: Path, sample_size: int = 1050000) -> List[Document]:
        """Load insights from CSV file and convert to documents.

        Args:
            csv_path: Path to CSV file.
            sample_size: Number of rows to sample from CSV.

        Returns:
            List of documents generated from CSV insights.
        """
        try:
            # Imported lazily so PDF-only usage does not need this module.
            from src.rag.csv_document_generator import CSVDocumentGenerator

            logger.info(f"Loading CSV insights from {csv_path}")
            generator = CSVDocumentGenerator(csv_path, sample_size=sample_size)
            documents = generator.generate_all_documents()
            logger.info(f"Generated {len(documents)} documents from CSV insights")
            return documents
        except Exception as e:
            logger.error(f"Error loading CSV insights: {str(e)}")
            raise
115
+
116
+
117
+
src/rag/vector_store.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Vector store for document embeddings."""
2
+
3
+ import logging
4
+ from typing import List, Optional
5
+
6
+ from langchain_core.documents import Document
7
+ from langchain_chroma import Chroma
8
+ from langchain_community.embeddings import HuggingFaceEmbeddings
9
+ from langchain_core.retrievers import BaseRetriever
10
+
11
+ from src.config.config import settings
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class VectorStore:
    """Vector store for document embeddings and retrieval."""

    def __init__(
        self,
        embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        persist_directory: Optional[str] = None,
    ) -> None:
        """Initialize vector store.

        Args:
            embedding_model: Name of the embedding model.
            persist_directory: Directory to persist the vector store.
        """
        self.embedding_model = embedding_model
        self.persist_directory = persist_directory or settings.vector_store_path

        # CPU-only embedding backend so the store runs without a GPU.
        self.embeddings = HuggingFaceEmbeddings(
            model_name=embedding_model,
            model_kwargs={"device": "cpu"},
        )

        # Both populated lazily on the first add_documents() call.
        self.vector_store: Optional[Chroma] = None
        self.retriever: Optional[BaseRetriever] = None

    def add_documents(self, documents: List[Document]) -> None:
        """Add documents to the vector store.

        Creates the backing Chroma store on first use, then appends to it;
        also (re)builds the retriever after every successful add.

        Args:
            documents: List of documents to add.
        """
        if not documents:
            logger.warning("No documents to add")
            return

        try:
            if self.vector_store is None:
                # First call: build a fresh persisted store from the batch.
                self.vector_store = Chroma.from_documents(
                    documents=documents,
                    embedding=self.embeddings,
                    persist_directory=self.persist_directory,
                )
            else:
                self.vector_store.add_documents(documents)

            # Refresh the retriever so it sees the new documents.
            self.retriever = self.vector_store.as_retriever(
                search_kwargs={"k": 5}
            )

            logger.info(f"Added {len(documents)} documents to vector store")
        except Exception as e:
            logger.error(f"Error adding documents to vector store: {str(e)}")
            raise

    def similarity_search(
        self,
        query: str,
        k: int = 5,
    ) -> List[Document]:
        """Search for similar documents.

        Args:
            query: Search query.
            k: Number of results to return.

        Returns:
            List of similar documents.

        Raises:
            ValueError: If no documents have been added yet.
        """
        if self.vector_store is None:
            raise ValueError("Vector store not initialized. Add documents first.")

        try:
            results = self.vector_store.similarity_search(query, k=k)
            logger.info(f"Found {len(results)} similar documents for query: {query[:50]}...")
            return results
        except Exception as e:
            logger.error(f"Error in similarity search: {str(e)}")
            raise

    def get_retriever(self) -> BaseRetriever:
        """Get the retriever for RAG.

        Returns:
            Base retriever instance.

        Raises:
            ValueError: If no documents have been added yet.
        """
        if self.retriever is None:
            raise ValueError("Retriever not initialized. Add documents first.")

        return self.retriever
109
+
110
+
111
+
src/schemas/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pydantic schemas for API."""
2
+
3
+ from src.schemas.fraud import (
4
+ FraudAnalysisRequest,
5
+ FraudAnalysisResponse,
6
+ TransactionData,
7
+ TransactionSummary,
8
+ )
9
+
10
+ __all__ = [
11
+ "FraudAnalysisRequest",
12
+ "FraudAnalysisResponse",
13
+ "TransactionData",
14
+ "TransactionSummary",
15
+ ]
16
+
17
+
18
+
src/schemas/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (448 Bytes). View file
 
src/schemas/__pycache__/fraud.cpython-311.pyc ADDED
Binary file (3.45 kB). View file
 
src/schemas/fraud.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pydantic schemas for fraud detection."""
2
+
3
+ from typing import Dict, Optional
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
class TransactionData(BaseModel):
    """Transaction data schema.

    Field names mirror the transaction record keys used by the data
    processor (trans_date_trans_time, merchant, amt, ...). All fields are
    optional so a caller may supply only the attributes it has.
    """

    trans_date_trans_time: Optional[str] = None  # transaction timestamp as text
    merchant: Optional[str] = None  # merchant name (raw data may carry a 'fraud_' prefix)
    category: Optional[str] = None  # spending category
    amt: Optional[float] = None  # transaction amount
    first: Optional[str] = None  # customer first name
    last: Optional[str] = None  # customer last name
    gender: Optional[str] = None  # customer gender
    city: Optional[str] = None  # customer city
    state: Optional[str] = None  # customer state
    job: Optional[str] = None  # customer occupation
    city_pop: Optional[int] = None  # population of the customer's city
22
+
23
+
24
class TransactionSummary(BaseModel):
    """Transaction summary schema.

    Aggregate statistics over the loaded transaction dataset; matches the
    summary dict produced by the data processor.
    """

    total_transactions: int  # number of rows analyzed
    fraud_count: int  # count of rows with is_fraud == 1
    fraud_percentage: float  # mean of is_fraud * 100
    total_amount: float  # sum of transaction amounts
    average_amount: float  # mean transaction amount
    categories: Dict[str, int]  # transaction count per category
33
+
34
+
35
class FraudAnalysisRequest(BaseModel):
    """Request schema for fraud analysis.

    Either `transaction_id` (a row index into the loaded dataset) or
    `transaction_data` (an inline payload) identifies the transaction;
    presumably exactly one of the two should be set -- confirm against the
    route handler.
    """

    transaction_id: Optional[int] = Field(None, description="Transaction ID from dataset")
    transaction_data: Optional[TransactionData] = Field(None, description="Direct transaction data")
    use_rag: bool = Field(True, description="Whether to use RAG for context")

    class Config:
        """Pydantic config."""

        # Example payload surfaced in the generated OpenAPI/JSON schema.
        json_schema_extra = {
            "example": {
                "transaction_id": 0,
                "use_rag": True,
            }
        }
51
+
52
+
53
class FraudAnalysisResponse(BaseModel):
    """Response schema for fraud analysis."""

    transaction: Dict  # the raw transaction that was analyzed
    analysis: str  # LLM-generated analysis text
    formatted_transaction: str  # human-readable rendering of the transaction
    success: bool = True  # whether the analysis succeeded
    error: Optional[str] = None  # error message on failure, else None
61
+
62
+
src/services/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ """Services module."""
2
+
3
+ from src.services.fraud_analyzer import FraudAnalyzer
4
+
5
+ __all__ = ["FraudAnalyzer"]
6
+
7
+
src/services/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (308 Bytes). View file
 
src/services/__pycache__/fraud_analyzer.cpython-311.pyc ADDED
Binary file (11.4 kB). View file