mrfirdauss committed on
Commit
18a508e
·
1 Parent(s): cb77609

init: init repo

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
37
+ *.faiss filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -8,11 +8,17 @@ RUN apt-get update && apt-get install -y \
8
  git \
9
  && rm -rf /var/lib/apt/lists/*
10
 
 
 
 
 
11
  COPY requirements.txt ./
12
  COPY src/ ./src/
13
 
14
  RUN pip3 install -r requirements.txt
15
 
 
 
16
  EXPOSE 8501
17
 
18
  HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
 
8
  git \
9
  && rm -rf /var/lib/apt/lists/*
10
 
11
+ RUN curl -fsSL https://ollama.com/install.sh | sh
12
+
13
+ RUN ollama serve --detach &
14
+
15
  COPY requirements.txt ./
16
  COPY src/ ./src/
17
 
18
  RUN pip3 install -r requirements.txt
19
 
20
+ RUN ollama pull qwen3:4b
21
+
22
  EXPOSE 8501
23
 
24
  HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
fraudTrainData.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd3363405c1efbd9862c0b6c4ebebdac1fc9aff175063847c4cda60dc8c50f32
3
+ size 254853611
requirements.txt CHANGED
@@ -1,3 +1,9 @@
1
- altair
2
  pandas
3
- streamlit
 
 
 
 
 
 
 
1
+ ollama
2
  pandas
3
+ streamlit
4
+ matplotlib
5
+ langchain-community
6
+ faiss-cpu
7
+ pypdf
8
+ sentence-transformers
9
+ huggingface_hub
src/FinancialAgent.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from abc import ABC, abstractmethod
import io
import pickle

import matplotlib.pyplot as plt  # BUG FIX: `from matplotlib.pyplot import plt` is not a real name
import numpy as np
import pandas as pd
import streamlit as st

from models import ResponseState  # BUG FIX: ResponseState lives in models, not prompt
from prompt import FINAL_PROMPT


class FinancialAgentFactory(ABC):
    """Abstract Factory for Streamlit-rendered financial chat agents.

    Supplies the shared chat plumbing — session state, message rendering,
    dataframe-context handling, and final-answer streaming. Concrete
    subclasses provide the model calls via ``__stream_answer__`` and
    ``process_prompt``.
    """

    def __init__(self, st, model_name="gpt-4o"):
        # `st` is the injected streamlit module, kept on the instance so all
        # methods share one handle (the original mixed `self.st` and module `st`).
        self.st = st
        # NOTE(review): unpickles a repo-local file at construction time;
        # pickle.load is only safe while this file is trusted.
        with open("fraudTrainData.pkl", "rb") as fh:  # BUG FIX: file was never closed
            self.df = pickle.load(fh)
        self.model_name = model_name

        if "messages" not in self.st.session_state:
            self.st.session_state.messages = []
            self.st.session_state["openai_model"] = self.model_name

    # BUG FIX: render_header/render_messages carried @abstractmethod while
    # having concrete bodies, and FinancialAgentOllama never overrides them —
    # that left the subclass abstract and uninstantiable. They are now plain
    # overridable methods with their original default behavior.
    def render_header(self, header="Financial Agent"):
        """Render the page title."""
        self.st.title(header)

    def render_messages(self):
        """Render previous chat messages stored in session state."""
        for message in self.st.session_state.messages:
            with self.st.chat_message(message["role"]):
                self.st.markdown(message["content"])

    @abstractmethod
    def __stream_answer__(self, instructions, input_messages):
        """Yield answer chunks from the model. Subclasses must implement."""

    @abstractmethod
    def process_prompt(self, prompt):
        """Main pipeline for a new user input. Subclasses must implement."""

    def __safe_savefig__(self, *args, **kwargs):
        """Save the current matplotlib figure into an in-memory PNG buffer."""
        # BUG FIX: the original omitted an explicit `self`, silently consuming
        # the instance through *args; extra args are accepted and ignored so
        # model-generated `savefig(...)` calls never crash.
        buf = io.BytesIO()
        plt.savefig(buf, format="png")
        buf.seek(0)
        return buf

    def __handle_context__(self, response_state: ResponseState) -> str:
        """Build the extra context prompt (dataframe insights, optional chart)."""
        context_prompt = ""
        if response_state.contextType in ("data", "both"):
            local_scope = {"df": self.df, "np": np, "pd": pd, "plt": plt,
                           "savefig": self.__safe_savefig__}
            # SECURITY: executes model-generated code against the dataframe.
            # Acceptable only for a trusted local demo — never expose publicly.
            exec(response_state.code, {}, local_scope)

            fig = plt.gcf()
            if fig.get_axes():  # a chart was generated
                with self.st.chat_message("assistant"):
                    self.st.pyplot(fig)
                plt.close(fig)

            context_prompt = "## CONTEXT DATAFRAME.\n"
            context_prompt += str(local_scope.get("result", ""))

        # Placeholder for PDF or other context handling
        # elif response_state.contextType in ("pdf", "both"):
        #     context_prompt = "Provide the relevant information from the PDF documents."

        return context_prompt

    def generate_final_answer(self, context_prompt: str):
        """Generate and stream the final answer with the gathered context."""
        with self.st.chat_message("assistant"):
            answer = self.st.write_stream(
                self.__stream_answer__(
                    instructions=FINAL_PROMPT,
                    input_messages=[
                        {"role": m["role"], "content": m["content"]}
                        for m in self.st.session_state.messages
                    ] + [{"role": "user", "content": context_prompt}],
                )
            )
        self.st.session_state.messages.append({"role": "assistant", "content": answer})

    def display_final_answer(self, answer: str):
        """Display a non-streamed assistant answer and persist it."""
        self.st.session_state.messages.append({"role": "assistant", "content": answer})
        with self.st.chat_message("assistant"):
            self.st.markdown(answer)

    def run(self):
        """Run the app: header, chat history, then process any new input."""
        self.render_header()
        self.render_messages()

        if prompt := self.st.chat_input("What is up?"):
            self.process_prompt(prompt)
src/FinancialAgentOllama.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from ollama import chat
from FinancialAgent import FinancialAgentFactory
from prompt import REFINERY_PROMPT
from models import ResponseState
import numpy as np
import pandas as pd
import streamlit as st
import matplotlib.pyplot as plt
from langchain_community.vectorstores import FAISS


class FinancialAgentOllama(FinancialAgentFactory):
    """Concrete Financial Agent backed by a local Ollama model and a FAISS PDF index."""

    def __init__(self, st, model_name="deepseek-r1:8b", embedding=None):
        super().__init__(st, model_name)
        self.client = chat
        # allow_dangerous_deserialization unpickles the local index files —
        # only safe because the index ships with this repo; never point it at
        # untrusted data.
        self.vector_db = FAISS.load_local(
            "vs_68bf713eea2c81919ac08298a05d6704",
            embedding,
            allow_dangerous_deserialization=True,
        )

    def __stream_answer__(self, instructions, input_messages):
        """Yield streamed content chunks from the Ollama chat endpoint."""
        # BUG FIX: the ollama client takes `messages=`, not `message=`.
        response_stream = self.client(
            messages=input_messages + [{"role": "user", "content": instructions}],
            model=self.model_name,
            stream=True,
        )
        for chunk in response_stream:
            yield chunk.message.content

    def generate_final_answer(self, context_prompt):
        """Stream the final answer, persist it, then rerun to refresh the UI."""
        with self.st.chat_message("assistant"):
            answer = self.st.write_stream(
                self.__stream_answer__(
                    context_prompt,
                    [{"role": m["role"], "content": m["content"]}
                     for m in self.st.session_state.messages],
                )
            )

        self.st.session_state.messages.append({"role": "assistant", "content": answer})
        # BUG FIX: st.experimental_rerun() was removed in modern Streamlit;
        # st.rerun() is the supported replacement.
        self.st.rerun()

    def __handle_context__(self, response_state: ResponseState) -> str:
        """Build context from the dataframe and/or the PDF vector store."""
        context_prompt = ""
        if response_state.contextType in ("data", "both"):
            local_scope = {"df": self.df, "np": np, "pd": pd, "plt": plt,
                           "savefig": self.__safe_savefig__}
            # SECURITY: executes model-generated code — trusted local demo only.
            exec(response_state.code, {}, local_scope)

            fig = plt.gcf()
            if fig.get_axes():  # a chart was generated
                with self.st.chat_message("assistant"):
                    self.st.pyplot(fig)
                plt.close(fig)

            context_prompt = "## CONTEXT DATAFRAME.\n"
            context_prompt += str(local_scope.get("result", ""))

        if response_state.contextType in ("pdf", "both"):
            context_prompt += "## CONTEXT PDF.\n"
            # `retriverKey` spelling matches the ResponseState schema field.
            results = self.vector_db.similarity_search(response_state.retriverKey, k=5)
            for i, doc in enumerate(results, 1):
                context_prompt += f"### Document {i}\n{doc.page_content}\n"
        return context_prompt

    def process_prompt(self, prompt):
        """Main pipeline: echo the user turn, refine it, then answer (with context if needed)."""
        self.st.session_state.messages.append({"role": "user", "content": prompt})
        with self.st.chat_message("user"):
            self.st.markdown(prompt)

        # Step 1: refinery call decides whether extra context is required and
        # returns a structured ResponseState.
        response = self.client(
            # BUG FIX: `messages=`, not `message=`.
            messages=[{"role": m["role"], "content": m["content"]}
                      for m in self.st.session_state.messages] +
                     [{"role": "user", "content": REFINERY_PROMPT.format(
                         response_format=ResponseState.model_json_schema(),
                         df_head=self.df.head().to_markdown(),
                         df_columns=self.df.columns.tolist(),
                         df_sample=self.df.sample(5).to_markdown(),
                     )}],
            model=self.model_name,
            stream=False,
            # BUG FIX: `format` expects a JSON schema dict (structured outputs),
            # not the Pydantic class object itself.
            format=ResponseState.model_json_schema(),
        )

        response_state: ResponseState = ResponseState.model_validate_json(response.message.content)

        # Step 2: answer directly, or gather context first and stream with it.
        if response_state.isNeedContext:
            context_prompt = self.__handle_context__(response_state)
            self.generate_final_answer(context_prompt)
        else:
            self.display_final_answer(response_state.response)
src/models.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
from pydantic import BaseModel, Field


class ResponseState(BaseModel):
    """Structured output of the refinery step.

    Tells the pipeline whether extra context is needed, of what kind, and how
    to obtain it (a retrieval query and/or dataframe code to execute).

    NOTE: field names — including the `retriverKey` misspelling — are part of
    the JSON schema sent to the model and referenced by call sites; do not
    rename them.
    """

    # Whether the model needs additional context from the dataframe or the PDFs.
    isNeedContext: bool = Field(default=False, description="Whether the model needs additional context from the dataframe or the PDF documents")
    # Direct answer, used when no extra context is required.
    response: str = Field(default="", description="The direct answer from the model when no additional context is needed")
    # Kind of context to fetch when isNeedContext is true.
    contextType: str = Field(default="", description="If isNeedContext is true, the type of context needed: 'data', 'pdf', or 'both'")
    # Plain-text similarity-search query for the PDF vector store.
    retriverKey: str = Field(default="", description="If isNeedContext is true and contextType is 'pdf' or 'both', the plain-text query used to retrieve context from the PDF vector database")
    # Python executed against the preloaded dataframe `df`; must leave its
    # findings in a string variable named `result`.
    code: str = Field(default="", description="Python code to run against the already-loaded dataframe named df; insert all necessary info into a string variable named result")
src/prompt.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Refinery-step system prompt. Placeholders filled by process_prompt():
# {response_format} (JSON schema), {df_head}, {df_sample}; df_columns is also
# passed to .format() but intentionally has no placeholder here.
# Typos fixed ("SAFE PLOT", "Busienss", "dont", "might not available", ...)
# since this text is sent verbatim to the model.
REFINERY_PROMPT = """
### PROMPT:
YOU ARE A FINANCIAL EXPERT. I WANT YOU TO ACT AS A DATA ANALYST AND ASSIST ME IN UNDERSTANDING AND INTERPRETING THE DATA PROVIDED.
BASED ON INPUT, I WANT YOU TO REFINE THE USER'S QUESTION AND DETERMINE IF YOU NEED ADDITIONAL CONTEXT FROM DATA OR PDF DOCUMENTS TO ANSWER THE QUESTION.
IF YOU NEED CONTEXT, PLEASE SPECIFY THE TYPE OF CONTEXT NEEDED ('data', 'pdf', OR 'both') AND PROVIDE THE CODE TO EXECUTE TO GAIN INSIGHTS FROM THE DATA.
IF YOU WANT TO SAVE A PLOT, USE THE PROVIDED 'savefig()' FUNCTION TO SAVE THE PLOT AND RETURN THE BUFFER.
IF YOU DO NOT NEED ADDITIONAL CONTEXT, PLEASE PROVIDE A DIRECT ANSWER TO THE USER'S QUESTION.
PLEASE RESPOND IN JSON FORMAT WITH THE FOLLOWING.

### Response Format:
{response_format}

### Context:
#### Dataframe
This is a credit card transaction dataset containing legitimate and fraud transactions from the duration 1st Jan 2019 - 31st Dec 2020. It covers credit cards of 1000 customers doing transactions with a pool of 800 merchants.
The data is located in the USA and the currency used is USD. If another country or area is asked about, please answer that the data only covers the USA.
The latest data might not be available, so please answer based on the data available.
table head, columns, and sample data:
dataframe head:
{df_head}

dataframe columns type:
index - Unique Identifier for each row
cc_num - Credit Card Number of Customer
trans_date_trans_time - Transaction DateTime
merchant - Merchant Name
category - Category of Merchant
amt - Amount of Transaction
first - First Name of Credit Card Holder
last - Last Name of Credit Card Holder
gender - Gender of Credit Card Holder
street - Street Address of Credit Card Holder
city - City of Credit Card Holder
state - State of Credit Card Holder
zip - Zip of Credit Card Holder
lat - Latitude Location of Credit Card Holder
long - Longitude Location of Credit Card Holder
city_pop - Credit Card Holder's City Population
job - Job of Credit Card Holder
dob - Date of Birth of Credit Card Holder
trans_num - Transaction Number
unix_time - UNIX Time of transaction
merch_lat - Latitude Location of Merchant
merch_long - Longitude Location of Merchant
is_fraud - Fraud Flag

dataframe sample data:
{df_sample}

#### PDF Document
File PDF summary:
Understanding Credit Card Frauds, Card Business Review by Tata Consulting Service:
This paper was released in 2003. The latest data are not available. If required, please answer that you don't have the data.
This paper contains worldwide snapshot data from before 2003 about credit card fraud.
Card fraud is a major global threat, particularly in online "card-not-present" transactions where fraud rates far exceed in-person purchases.
Common techniques include lost/stolen cards, identity theft, counterfeit cards, skimming, and internet schemes such as site cloning and false merchant sites. While cardholders are typically protected by law, merchants bear the highest costs through chargebacks, penalties, and reputational damage, with banks also incurring significant prevention expenses.
Effective management requires a layered approach: verification systems (AVS, CVV, payer authentication), blacklists/whitelists, and advanced methods like risk scoring, neural networks, biometrics, and smart cards. The key challenge is balancing fraud losses with the cost of prevention to minimize the total cost of fraud while maintaining trust in the payment ecosystem.

Table of Contents:
Overview

Introduction
2.1. Purpose of this Paper

Current State of the Industry

How Fraud is Committed Worldwide

Fraud Techniques
5.1. Card-Related Frauds
- Application Fraud
- Lost / Stolen Cards
- Account Takeover
- Fake and Counterfeit Cards
5.2. Merchant-Related Frauds
- Merchant Collusion
- Triangulation
5.3. Internet-Related Frauds

Impact of Credit Card Frauds
6.1. Impact on Cardholders
6.2. Impact on Merchants
6.3. Impact on Banks (Issuer / Acquirer)

Fraud Prevention and Management
7.1. Fraud Prevention Technologies
- Manual Review
- Address Verification System (AVS)
- Card Verification Methods
- Negative and Positive Lists
- Payer Authentication
- Lockout Mechanisms
- Fraudulent Merchants
7.2. Recent Developments in Fraud Management
- Simple Rule Systems
- Risk Scoring Technologies
- Neural Network Technologies
- Biometrics
- Smart Cards

Managing the Total Cost of Fraud
"""
103
+
104
+
105
# Final-answer system prompt appended after the chat history and context.
# BUG FIX: the original opened with four quotes (`""""`), which put a stray
# literal `"` at the start of the prompt text; grammar/typos ("CONTEX") fixed.
FINAL_PROMPT = """
You are a financial expert. Use the provided context to answer the user's question.
IF THE CONTEXT IS INSUFFICIENT, ANSWER WITH 'Insufficient context to answer the question.' AND EXPLAIN WHAT CONTEXT IS MISSING — DO NOT MAKE UP AN ANSWER.
"""
src/streamlit_app.py CHANGED
@@ -1,40 +1,14 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
  import streamlit as st
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
 
 
 
8
 
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
 
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
1
import streamlit as st
from FinancialAgentOllama import FinancialAgentOllama
# BUG FIX: `langchain.embeddings` is the deprecated path; the repo's
# requirements install langchain-community, which is the supported location.
from langchain_community.embeddings import HuggingFaceEmbeddings
from huggingface_hub import hf_hub_download

# Fetch the FAISS index shard so it sits next to the committed index.pkl and
# FAISS.load_local("vs_68bf713eea2c81919ac08298a05d6704", ...) finds both files.
# BUG FIX: repo_id must not carry a trailing slash, and without local_dir the
# download lands in the HF cache — a path the agent never reads.
index_path = hf_hub_download(
    repo_id="mrfirdauss/FaissBhatlaBook",
    filename="vs_68bf713eea2c81919ac08298a05d6704/index.faiss",
    local_dir=".",
)

app = FinancialAgentOllama(
    st,
    model_name="qwen3:4b",
    embedding=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
)

app.run()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
vs_68bf713eea2c81919ac08298a05d6704/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ed1be49bcb8d019522a1838992eaad2c3fd5f8ed62c4da9b6c8cee3f17bfc78
3
+ size 69695