mrfirdauss commited on
Commit
1cd4158
·
1 Parent(s): 7b71428

init: chatgpt version

Browse files
Files changed (8) hide show
  1. .gitattributes +1 -1
  2. .gitignore +6 -0
  3. Dockerfile +1 -0
  4. README.md +15 -18
  5. requirements.txt +5 -2
  6. src/models.py +8 -0
  7. src/prompt.py +102 -0
  8. src/streamlit_app.py +102 -36
.gitattributes CHANGED
@@ -32,4 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
32
  *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ .env
2
+ *.yaml
3
+ *.ipynb
4
+ /logs/*
5
+ *.pdf
6
+ */__pycache__/*
Dockerfile CHANGED
@@ -10,6 +10,7 @@ RUN apt-get update && apt-get install -y \
10
 
11
  COPY requirements.txt ./
12
  COPY src/ ./src/
 
13
 
14
  RUN pip3 install -r requirements.txt
15
 
 
10
 
11
  COPY requirements.txt ./
12
  COPY src/ ./src/
13
+ COPY fraudTrainData.pkl.xz ./
14
 
15
  RUN pip3 install -r requirements.txt
16
 
README.md CHANGED
@@ -1,19 +1,16 @@
1
- ---
2
- title: Financial RAG
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: red
6
- sdk: docker
7
- app_port: 8501
8
- tags:
9
- - streamlit
10
- pinned: false
11
- short_description: Streamlit template space
12
- ---
13
 
14
- # Welcome to Streamlit!
15
-
16
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
17
-
18
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
19
- forums](https://discuss.streamlit.io).
 
 
 
 
 
 
 
 
 
1
+ # Financial RAG
 
 
 
 
 
 
 
 
 
 
 
2
 
3
+ ## Schema
4
+ ```
5
+ User Input
6
+
7
+ Prompt Refinery
8
+
9
+ Routing
10
+ ├── Function Call → Query & Aggregate DataFrame
11
+ └── Report Retrieval → Chunk Selection & Context Injection
12
+
13
+ Context Assembly
14
+
15
+ Streaming Response to User
16
+ ```
requirements.txt CHANGED
@@ -1,3 +1,6 @@
1
- altair
2
  pandas
3
- streamlit
 
 
 
 
1
+ openai
2
  pandas
3
+ streamlit
4
+ matplotlib
5
+ pydantic
6
+ tabulate
src/models.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
from pydantic import BaseModel, Field


class ResponseState(BaseModel):
    """Structured routing decision produced by the refinery LLM pass.

    The model either answers directly (``response``) or requests extra
    context from the transaction dataframe and/or the PDF vector store.
    """

    # Whether the model needs additional context (dataframe and/or PDF) before answering.
    isNeedContext: bool = Field(default=False, description="Whether the model needs additional context from the dataframe or the pdf")
    # Direct answer when no context is needed; interim answer otherwise.
    response: str = Field(default="", description="The response from the model")
    # One of 'data', 'pdf' or 'both' when isNeedContext is true.
    contextType: str = Field(default="", description="If isNeedContext is true, the type of context needed, e.g., 'data', 'pdf' or 'both'")
    # NOTE(review): name kept as 'retriverKey' (sic) — renaming would change the
    # structured-output schema consumed elsewhere.
    retriverKey: str = Field(default="", description="If isNeedContext is true and contextType is 'pdf' or 'both', the plain-text key used to retrieve context from the pdf vector db")
    # Python snippet to run against the pre-loaded dataframe `df`; it must put
    # its findings into a string variable named `result`.
    code: str = Field(default="", description="The data is already loaded in a dataframe named df; you only need to write code to gain insights from the data. Insert all necessary info inside a string variable named result")
src/prompt.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# System prompt for the first ("refinery") pass: refine the user's question and
# decide whether dataframe analysis and/or PDF retrieval is needed.
# Placeholders: {df_head} and {df_sample} are filled via str.format at call time.
REFINERY_PROMPT = """
### PROMPT:
YOU ARE A FINANCIAL EXPERT. I WANT YOU TO ACT AS A DATA ANALYST AND ASSIST ME IN UNDERSTANDING AND INTERPRETING THE DATA PROVIDED.
BASED ON INPUT, I WANT YOU TO REFINE THE USER'S QUESTION AND DETERMINE IF YOU NEED ADDITIONAL CONTEXT FROM DATA OR PDF DOCUMENTS TO ANSWER THE QUESTION. IF YOU NEED CONTEXT, PLEASE SPECIFY THE TYPE OF CONTEXT NEEDED ('data', 'pdf', OR 'both') AND PROVIDE THE CODE TO EXECUTE TO GAIN INSIGHTS FROM THE DATA. IF YOU DO NOT NEED ADDITIONAL CONTEXT, PLEASE PROVIDE A DIRECT ANSWER TO THE USER'S QUESTION.
PLEASE RESPOND IN JSON FORMAT WITH THE FOLLOWING SCHEMA.

### Context:
#### Dataframe
This is a credit card transaction dataset containing legitimate and fraud transactions from the duration 1st Jan 2019 - 31st Dec 2020. It covers credit cards of 1000 customers doing transactions with a pool of 800 merchants.
The data is located in the USA and the currency used is USD. If another country or area is asked about, please answer that the data only covers the USA.
The latest data might not be available, so please answer based on the data available.
table head, columns, and sample data:
dataframe head:
{df_head}

dataframe columns type:
index - Unique Identifier for each row
cc_num - Credit Card Number of Customer
trans_date_trans_time - Transaction DateTime
merchant - Merchant Name
category - Category of Merchant
amt - Amount of Transaction
first - First Name of Credit Card Holder
last - Last Name of Credit Card Holder
gender - Gender of Credit Card Holder
street - Street Address of Credit Card Holder
city - City of Credit Card Holder
state - State of Credit Card Holder
zip - Zip of Credit Card Holder
lat - Latitude Location of Credit Card Holder
long - Longitude Location of Credit Card Holder
city_pop - Credit Card Holder's City Population
job - Job of Credit Card Holder
dob - Date of Birth of Credit Card Holder
trans_num - Transaction Number
unix_time - UNIX Time of transaction
merch_lat - Latitude Location of Merchant
merch_long - Longitude Location of Merchant
is_fraud - Fraud Flag

dataframe sample data:
{df_sample}

#### PDF Document
File PDF summary:
Understanding Credit Card Frauds - Card Business Review by Tata Consultancy Services:
This paper was released in 2003. The latest data are not available. If required, please answer that you don't have the data.
This paper contains worldwide snapshot data from before 2003 about credit card fraud.
Card fraud is a major global threat, particularly in online “card-not-present” transactions where fraud rates far exceed in-person purchases.
Common techniques include lost/stolen cards, identity theft, counterfeit cards, skimming, and internet schemes such as site cloning and false merchant sites. While cardholders are typically protected by law, merchants bear the highest costs through chargebacks, penalties, and reputational damage, with banks also incurring significant prevention expenses.
Effective management requires a layered approach: verification systems (AVS, CVV, payer authentication), blacklists/whitelists, and advanced methods like risk scoring, neural networks, biometrics, and smart cards. The key challenge is balancing fraud losses with the cost of prevention to minimize the total cost of fraud while maintaining trust in the payment ecosystem.

Table of Contents:
Overview

Introduction
2.1. Purpose of this Paper

Current State of the Industry

How Fraud is Committed Worldwide

Fraud Techniques
5.1. Card-Related Frauds
- Application Fraud
- Lost / Stolen Cards
- Account Takeover
- Fake and Counterfeit Cards
5.2. Merchant-Related Frauds
- Merchant Collusion
- Triangulation
5.3. Internet-Related Frauds

Impact of Credit Card Frauds
6.1. Impact on Cardholders
6.2. Impact on Merchants
6.3. Impact on Banks (Issuer / Acquirer)

Fraud Prevention and Management
7.1. Fraud Prevention Technologies
- Manual Review
- Address Verification System (AVS)
- Card Verification Methods
- Negative and Positive Lists
- Payer Authentication
- Lockout Mechanisms
- Fraudulent Merchants
7.2. Recent Developments in Fraud Management
- Simple Rule Systems
- Risk Scoring Technologies
- Neural Network Technologies
- Biometrics
- Smart Cards

Managing the Total Cost of Fraud
"""
# System prompt for the final answer pass. The original opened the string with
# four quotes (`""""`), which embedded a stray `"` as the first character of the
# prompt — fixed to a plain triple-quoted string.
FINAL_PROMPT = """
You are a financial expert. Use the provided context to answer the user's question.
IF THE CONTEXT IS INSUFFICIENT, ANSWER WITH 'Insufficient context to answer the question.' AND EXPLAIN WHAT CONTEXT IS MISSING — DO NOT MAKE UP AN ANSWER.
"""
src/streamlit_app.py CHANGED
@@ -1,40 +1,106 @@
1
- import altair as alt
 
2
  import numpy as np
 
3
  import pandas as pd
4
  import streamlit as st
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from models import ResponseState
from prompt import REFINERY_PROMPT, FINAL_PROMPT
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import streamlit as st
from openai import OpenAI
import pickle
import logging

# Log to the container path mounted by the Dockerfile's /app workdir.
logging.basicConfig(filename="/app/logs/app.log", level=logging.INFO)

# OpenAI client; reads OPENAI_API_KEY from the environment.
client = OpenAI()

# Load the transaction dataframe once per script run.
# NOTE(review): the Dockerfile copies fraudTrainData.pkl.xz but this opens
# fraudTrainData.pkl — confirm the file is decompressed inside the image.
# Use a context manager so the file handle is closed (the original leaked it).
with open("fraudTrainData.pkl", "rb") as f:
    df = pickle.load(f)

st.title("Financial Agent")

st.session_state["openai_model"] = "gpt-5-mini-2025-08-07"

# Persist the chat history across Streamlit reruns.
if "messages" not in st.session_state:
    st.session_state.messages = []

# Re-render the conversation so far on every rerun.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])
def streamAnswer(instructions, input):
    """Stream the model's answer token-by-token.

    Parameters
    ----------
    instructions : str
        System instructions for the Responses API call.
    input : list[dict]
        Chat messages (``{"role": ..., "content": ...}``).

    Yields
    ------
    str
        Incremental output-text deltas as they arrive.
    """
    # NOTE: `input` shadows the builtin, but callers pass it by keyword, so the
    # name is part of the interface and is kept.
    stream = client.responses.create(
        model=st.session_state["openai_model"],
        instructions=instructions,
        input=input,
        stream=True,
        # TODO(review): per the original inline note, the file_search tool
        # should only be attached when contextType is "pdf" or "both";
        # it is currently always on.
        tools=[{
            "type": "file_search",
            "vector_store_ids": ['vs_68bf713eea2c81919ac08298a05d6704']
        }]
    )

    # The original wrote `for chunk in chunk:`, shadowing the stream object
    # with its own events — use distinct names for clarity.
    for event in stream:
        if event.type == 'response.output_text.delta':
            yield event.delta
+ if prompt := st.chat_input("What is up?"):
45
+ st.session_state.messages.append({"role": "user", "content": prompt})
46
+ with st.chat_message("user"):
47
+ st.markdown(prompt)
48
+
49
+ response = client.responses.parse(
50
+ model=st.session_state["openai_model"],
51
+ instructions=REFINERY_PROMPT.format(
52
+ df_head=df.head().to_markdown(),
53
+ df_columns=df.columns.tolist(),
54
+ df_sample=df.sample(5).to_markdown()
55
+ ),
56
+ input=[
57
+ {"role": m["role"], "content": m["content"]}
58
+ for m in st.session_state.messages
59
+ ],
60
+ stream=False,
61
+ text_format=ResponseState
62
+ )
63
+
64
+ context_prompt = ""
65
+ logging.info(response)
66
+ responseState : ResponseState = response.output_parsed
67
+ if responseState.isNeedContext:
68
+ contextType = responseState.contextType
69
+ st.session_state.messages.append({"role": "assistant", "content": responseState.response})
70
+ # with st.chat_message("assistant"):
71
+ # st.markdown(responseState.response)
72
+ if contextType == "data" or contextType == "both":
73
+ local_scope = {
74
+ "df": df,
75
+ "np": np,
76
+ "pd": pd,
77
+ "plt": plt
78
+ }
79
+ exec(responseState.code, {}, local_scope)
80
+ fig = plt.gcf() # get current active figure
81
+ if fig.get_axes(): # check if plot was created
82
+ with st.chat_message("assistant"):
83
+ st.pyplot(fig)
84
+ plt.close(fig) # prevent duplicate rendering later
85
+ context_prompt = "## CONTEXT DATAFRAME.\n"
86
+ context_prompt += local_scope.get("result", "")
87
+ logging.info("context from data: " + context_prompt)
88
+ # elif format == "pdf" or format == "both":
89
+ # context_prompt = "Provide the relevant information from the PDF documents to answer the user's question."
90
+ # st.session_state.messages.append({"role": "user", "content": context_prompt})
91
+ # with st.chat_message("user"):
92
+ # st.markdown(context_prompt)
93
+ answer = ""
94
+ with st.chat_message("assistant"):
95
+ answer = st.write_stream(streamAnswer(
96
+ instructions=FINAL_PROMPT,
97
+ input=[
98
+ {"role": m["role"], "content": m["content"]}
99
+ for m in st.session_state.messages
100
+ ] + [{"role": "user", "content": context_prompt}]
101
+ ))
102
+ st.session_state.messages.append({"role": "assistant", "content": answer})
103
+ else: #only write the response
104
+ st.session_state.messages.append({"role": "assistant", "content": responseState.response})
105
+ with st.chat_message("assistant"):
106
+ st.markdown(responseState.response)