mrfirdauss commited on
Commit
1cd4158
·
1 Parent(s): 7b71428

init: chatgpt version

Browse files
Files changed (8) hide show
  1. .gitattributes +1 -1
  2. .gitignore +6 -0
  3. Dockerfile +1 -0
  4. README.md +15 -18
  5. requirements.txt +5 -2
  6. src/models.py +8 -0
  7. src/prompt.py +102 -0
  8. src/streamlit_app.py +102 -36
.gitattributes CHANGED
@@ -32,4 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
32
  *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ .env
2
+ *.yaml
3
+ *.ipynb
4
+ /logs/*
5
+ *.pdf
6
+ */__pycache__/*
Dockerfile CHANGED
@@ -10,6 +10,7 @@ RUN apt-get update && apt-get install -y \
10
 
11
  COPY requirements.txt ./
12
  COPY src/ ./src/
 
13
 
14
  RUN pip3 install -r requirements.txt
15
 
 
10
 
11
  COPY requirements.txt ./
12
  COPY src/ ./src/
13
+ COPY fraudTrainData.pkl.xz ./
14
 
15
  RUN pip3 install -r requirements.txt
16
 
README.md CHANGED
@@ -1,19 +1,16 @@
1
- ---
2
- title: Financial RAG
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: red
6
- sdk: docker
7
- app_port: 8501
8
- tags:
9
- - streamlit
10
- pinned: false
11
- short_description: Streamlit template space
12
- ---
13
 
14
- # Welcome to Streamlit!
15
-
16
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
17
-
18
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
19
- forums](https://discuss.streamlit.io).
 
 
 
 
 
 
 
 
 
1
+ # Financial RAG
 
 
 
 
 
 
 
 
 
 
 
2
 
3
+ ## Schema
4
+ ```
5
+ User Input
6
+
7
+ Prompt Refinery
8
+
9
+ Routing
10
+ ├── Function Call → Query & Aggregate DataFrame
11
+ └── Report Retrieval → Chunk Selection & Context Injection
12
+
13
+ Context Assembly
14
+
15
+ Streaming Response to User
16
+ ```
requirements.txt CHANGED
@@ -1,3 +1,6 @@
1
- altair
2
  pandas
3
- streamlit
 
 
 
 
1
+ openai
2
  pandas
3
+ streamlit
4
+ matplotlib
5
+ pydantic
6
+ tabulate
src/models.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
from pydantic import BaseModel, Field


class ResponseState(BaseModel):
    """Structured routing decision produced by the refinery LLM pass.

    The model either answers directly (``response``) or requests extra
    context from the transaction dataframe and/or the PDF vector store.
    """

    # Whether the model needs additional context (dataframe and/or PDF) before answering.
    isNeedContext: bool = Field(default=False, description="Whether the model needs additional context from the dataframe or the pdf")
    # Direct answer when no context is needed; interim answer otherwise.
    response: str = Field(default="", description="The response from the model")
    # One of 'data', 'pdf' or 'both' when isNeedContext is true.
    contextType: str = Field(default="", description="If isNeedContext is true, the type of context needed, e.g., 'data', 'pdf' or 'both'")
    # NOTE(review): name kept as 'retriverKey' (sic) — renaming would change the
    # structured-output schema consumed elsewhere.
    retriverKey: str = Field(default="", description="If isNeedContext is true and contextType is 'pdf' or 'both', the plain-text key used to retrieve context from the pdf vector db")
    # Python snippet to run against the pre-loaded dataframe `df`; it must put
    # its findings into a string variable named `result`.
    code: str = Field(default="", description="The data is already loaded in a dataframe named df; you only need to write code to gain insights from the data. Insert all necessary info inside a string variable named result")
src/prompt.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# System prompt for the first ("refinery") pass: refine the user's question and
# decide whether dataframe analysis and/or PDF retrieval is needed.
# Placeholders: {df_head} and {df_sample} are filled via str.format at call time.
REFINERY_PROMPT = """
### PROMPT:
YOU ARE A FINANCIAL EXPERT. I WANT YOU TO ACT AS A DATA ANALYST AND ASSIST ME IN UNDERSTANDING AND INTERPRETING THE DATA PROVIDED.
BASED ON INPUT, I WANT YOU TO REFINE THE USER'S QUESTION AND DETERMINE IF YOU NEED ADDITIONAL CONTEXT FROM DATA OR PDF DOCUMENTS TO ANSWER THE QUESTION. IF YOU NEED CONTEXT, PLEASE SPECIFY THE TYPE OF CONTEXT NEEDED ('data', 'pdf', OR 'both') AND PROVIDE THE CODE TO EXECUTE TO GAIN INSIGHTS FROM THE DATA. IF YOU DO NOT NEED ADDITIONAL CONTEXT, PLEASE PROVIDE A DIRECT ANSWER TO THE USER'S QUESTION.
PLEASE RESPOND IN JSON FORMAT WITH THE FOLLOWING SCHEMA.

### Context:
#### Dataframe
This is a credit card transaction dataset containing legitimate and fraud transactions from the duration 1st Jan 2019 - 31st Dec 2020. It covers credit cards of 1000 customers doing transactions with a pool of 800 merchants.
The data is located in the USA and the currency used is USD. If another country or area is asked about, please answer that the data only covers the USA.
The latest data might not be available, so please answer based on the data available.
table head, columns, and sample data:
dataframe head:
{df_head}

dataframe columns type:
index - Unique Identifier for each row
cc_num - Credit Card Number of Customer
trans_date_trans_time - Transaction DateTime
merchant - Merchant Name
category - Category of Merchant
amt - Amount of Transaction
first - First Name of Credit Card Holder
last - Last Name of Credit Card Holder
gender - Gender of Credit Card Holder
street - Street Address of Credit Card Holder
city - City of Credit Card Holder
state - State of Credit Card Holder
zip - Zip of Credit Card Holder
lat - Latitude Location of Credit Card Holder
long - Longitude Location of Credit Card Holder
city_pop - Credit Card Holder's City Population
job - Job of Credit Card Holder
dob - Date of Birth of Credit Card Holder
trans_num - Transaction Number
unix_time - UNIX Time of transaction
merch_lat - Latitude Location of Merchant
merch_long - Longitude Location of Merchant
is_fraud - Fraud Flag

dataframe sample data:
{df_sample}

#### PDF Document
File PDF summary:
Understanding Credit Card Frauds - Card Business Review by Tata Consultancy Services:
This paper was released in 2003. The latest data are not available. If required, please answer that you don't have the data.
This paper contains worldwide snapshot data from before 2003 about credit card fraud.
Card fraud is a major global threat, particularly in online “card-not-present” transactions where fraud rates far exceed in-person purchases.
Common techniques include lost/stolen cards, identity theft, counterfeit cards, skimming, and internet schemes such as site cloning and false merchant sites. While cardholders are typically protected by law, merchants bear the highest costs through chargebacks, penalties, and reputational damage, with banks also incurring significant prevention expenses.
Effective management requires a layered approach: verification systems (AVS, CVV, payer authentication), blacklists/whitelists, and advanced methods like risk scoring, neural networks, biometrics, and smart cards. The key challenge is balancing fraud losses with the cost of prevention to minimize the total cost of fraud while maintaining trust in the payment ecosystem.

Table of Contents:
Overview

Introduction
2.1. Purpose of this Paper

Current State of the Industry

How Fraud is Committed Worldwide

Fraud Techniques
5.1. Card-Related Frauds
- Application Fraud
- Lost / Stolen Cards
- Account Takeover
- Fake and Counterfeit Cards
5.2. Merchant-Related Frauds
- Merchant Collusion
- Triangulation
5.3. Internet-Related Frauds

Impact of Credit Card Frauds
6.1. Impact on Cardholders
6.2. Impact on Merchants
6.3. Impact on Banks (Issuer / Acquirer)

Fraud Prevention and Management
7.1. Fraud Prevention Technologies
- Manual Review
- Address Verification System (AVS)
- Card Verification Methods
- Negative and Positive Lists
- Payer Authentication
- Lockout Mechanisms
- Fraudulent Merchants
7.2. Recent Developments in Fraud Management
- Simple Rule Systems
- Risk Scoring Technologies
- Neural Network Technologies
- Biometrics
- Smart Cards

Managing the Total Cost of Fraud
"""
# System prompt for the final answer pass. The original opened the string with
# four quotes (`""""`), which embedded a stray `"` as the first character of the
# prompt — fixed to a plain triple-quoted string.
FINAL_PROMPT = """
You are a financial expert. Use the provided context to answer the user's question.
IF THE CONTEXT IS INSUFFICIENT, ANSWER WITH 'Insufficient context to answer the question.' AND EXPLAIN WHAT CONTEXT IS MISSING — DO NOT MAKE UP AN ANSWER.
"""
src/streamlit_app.py CHANGED
@@ -1,40 +1,106 @@
1
- import altair as alt
 
2
  import numpy as np
 
3
  import pandas as pd
4
  import streamlit as st
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from models import ResponseState
from prompt import REFINERY_PROMPT, FINAL_PROMPT
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import streamlit as st
from openai import OpenAI
import pickle
import logging

# Log to the container path mounted by the Dockerfile's /app workdir.
logging.basicConfig(filename="/app/logs/app.log", level=logging.INFO)

# OpenAI client; reads OPENAI_API_KEY from the environment.
client = OpenAI()

# Load the transaction dataframe once per script run.
# NOTE(review): the Dockerfile copies fraudTrainData.pkl.xz but this opens
# fraudTrainData.pkl — confirm the file is decompressed inside the image.
# Use a context manager so the file handle is closed (the original leaked it).
with open("fraudTrainData.pkl", "rb") as f:
    df = pickle.load(f)

st.title("Financial Agent")

st.session_state["openai_model"] = "gpt-5-mini-2025-08-07"

# Persist the chat history across Streamlit reruns.
if "messages" not in st.session_state:
    st.session_state.messages = []

# Re-render the conversation so far on every rerun.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])
def streamAnswer(instructions, input):
    """Stream the model's answer token-by-token.

    Parameters
    ----------
    instructions : str
        System instructions for the Responses API call.
    input : list[dict]
        Chat messages (``{"role": ..., "content": ...}``).

    Yields
    ------
    str
        Incremental output-text deltas as they arrive.
    """
    # NOTE: `input` shadows the builtin, but callers pass it by keyword, so the
    # name is part of the interface and is kept.
    stream = client.responses.create(
        model=st.session_state["openai_model"],
        instructions=instructions,
        input=input,
        stream=True,
        # TODO(review): per the original inline note, the file_search tool
        # should only be attached when contextType is "pdf" or "both";
        # it is currently always on.
        tools=[{
            "type": "file_search",
            "vector_store_ids": ['vs_68bf713eea2c81919ac08298a05d6704']
        }]
    )

    # The original wrote `for chunk in chunk:`, shadowing the stream object
    # with its own events — use distinct names for clarity.
    for event in stream:
        if event.type == 'response.output_text.delta':
            yield event.delta
+ if prompt := st.chat_input("What is up?"):
45
+ st.session_state.messages.append({"role": "user", "content": prompt})
46
+ with st.chat_message("user"):
47
+ st.markdown(prompt)
48
+
49
+ response = client.responses.parse(
50
+ model=st.session_state["openai_model"],
51
+ instructions=REFINERY_PROMPT.format(
52
+ df_head=df.head().to_markdown(),
53
+ df_columns=df.columns.tolist(),
54
+ df_sample=df.sample(5).to_markdown()
55
+ ),
56
+ input=[
57
+ {"role": m["role"], "content": m["content"]}
58
+ for m in st.session_state.messages
59
+ ],
60
+ stream=False,
61
+ text_format=ResponseState
62
+ )
63
+
64
+ context_prompt = ""
65
+ logging.info(response)
66
+ responseState : ResponseState = response.output_parsed
67
+ if responseState.isNeedContext:
68
+ contextType = responseState.contextType
69
+ st.session_state.messages.append({"role": "assistant", "content": responseState.response})
70
+ # with st.chat_message("assistant"):
71
+ # st.markdown(responseState.response)
72
+ if contextType == "data" or contextType == "both":
73
+ local_scope = {
74
+ "df": df,
75
+ "np": np,
76
+ "pd": pd,
77
+ "plt": plt
78
+ }
79
+ exec(responseState.code, {}, local_scope)
80
+ fig = plt.gcf() # get current active figure
81
+ if fig.get_axes(): # check if plot was created
82
+ with st.chat_message("assistant"):
83
+ st.pyplot(fig)
84
+ plt.close(fig) # prevent duplicate rendering later
85
+ context_prompt = "## CONTEXT DATAFRAME.\n"
86
+ context_prompt += local_scope.get("result", "")
87
+ logging.info("context from data: " + context_prompt)
88
+ # elif format == "pdf" or format == "both":
89
+ # context_prompt = "Provide the relevant information from the PDF documents to answer the user's question."
90
+ # st.session_state.messages.append({"role": "user", "content": context_prompt})
91
+ # with st.chat_message("user"):
92
+ # st.markdown(context_prompt)
93
+ answer = ""
94
+ with st.chat_message("assistant"):
95
+ answer = st.write_stream(streamAnswer(
96
+ instructions=FINAL_PROMPT,
97
+ input=[
98
+ {"role": m["role"], "content": m["content"]}
99
+ for m in st.session_state.messages
100
+ ] + [{"role": "user", "content": context_prompt}]
101
+ ))
102
+ st.session_state.messages.append({"role": "assistant", "content": answer})
103
+ else: #only write the response
104
+ st.session_state.messages.append({"role": "assistant", "content": responseState.response})
105
+ with st.chat_message("assistant"):
106
+ st.markdown(responseState.response)