File size: 7,138 Bytes
d3a543b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2c0711c
d3a543b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2c0711c
d3a543b
2c0711c
d3a543b
 
2c0711c
d3a543b
2c0711c
 
 
 
d3a543b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2c0711c
d3a543b
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import sqlite3
import pandas as pd
import os
from dotenv import load_dotenv
import google.generativeai as genai

app = FastAPI()

load_dotenv()
genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))

class Query(BaseModel):
    question: str
    data_source: str

def get_gemini_response(question, prompt):
    model = genai.GenerativeModel('gemini-1.5-pro')
    response = model.generate_content([prompt, question])
    return response.text

sql_cols_human = 'REQUESTID', 'DATETIMEINIT', 'SOURCE', 'DESCRIPTION', 'REQCATEGORY', 'STATUS', 'REFERREDTO', 'DATETIMECLOSED', 'City', 'State', 'Ward', 'Postcode'
csv_columns_human = ['REQUESTID', 'DATETIMEINIT', 'SOURCE', 'DESCRIPTION', 'REQCATEGORY', 'STATUS', 'REFERREDTO', 'DATETIMECLOSED', 'City', 'State', 'Ward', 'Postcode']
sql_cols = 'REQUESTID', 'DATETIMEINIT', 'SOURCE', 'DESCRIPTION', 'REQCATEGORY', 'STATUS', 'REFERREDTO', 'DATETIMECLOSED', 'City', 'State', 'Ward', 'Postcode'

def get_csv_columns():
    df = pd.read_csv('wandsworth_callcenter_sampled.csv')
    return df.columns.tolist()

csv_columns = get_csv_columns()

sql_prompt = f"""

You are an expert in converting English questions to SQLite code!

The SQLite database has the name CALLCENTER_REQUESTS and has the following Columns: {', '.join(sql_cols)}



Here are some key details about the dataset:

- `SOURCE`: Phone, Online Form, FixMyStreet, Email, Telephone/Email, Telephone Voicemail, Other, Local Council Office.

- `REQCATEGORY`: Blocked Drains, Council Building Maintenance, Fly-Tipping, Street and Pavement Maintenance, Recycling, Traffic Signage Issues, Parks Maintenance, Graffiti Removal, Tree Maintenance.

- `STATUS`: Resolved, In Progress, Cancelled by Customer, Referred to External Agency, Work Order Created, Under Review.

- `REFERREDTO`: Council Enforcement, Transport for London (TfL), Thames Water, Royal Mail, UK Power Networks.



For example:

- Would you please list all unresolved calls? command: SELECT * FROM CALLCENTER_REQUESTS WHERE STATUS='In Progress';

- Would you please count the total number of calls? command: SELECT COUNT(*) FROM CALLCENTER_REQUESTS;

- List all unique wards please? command: SELECT DISTINCT Ward FROM CALLCENTER_REQUESTS;



Also, the SQL code should not have ''' in the beginning or at the end, and SQL word in output.

Ensure that you only generate valid SQLite database queries, not pandas or Python code.

"""

csv_prompt = f"""

You are an expert in analyzing CSV data and converting English questions to pandas query syntax.

The CSV file is named 'wandsworth_callcenter_sampled.csv' and contains residents' call information in Wandsworth Council.

The available columns in the CSV file are: {', '.join(csv_columns)}



Here are some key details about the dataset:

- `SOURCE`: Phone, Online Form, FixMyStreet, Email, Telephone/Email, Telephone Voicemail, Other, Local Council Office.

- `REQCATEGORY`: Blocked Drains, Council Building Maintenance, Fly-Tipping, Street and Pavement Maintenance, Recycling, Traffic Signage Issues, Parks Maintenance, Graffiti Removal, Tree Maintenance.

- `STATUS`: Resolved, In Progress, Cancelled by Customer, Referred to External Agency, Work Order Created, Under Review.

- `REFERREDTO`: Council Enforcement, Transport for London (TfL), Thames Water, Royal Mail, UK Power Networks.



For example:

- How many calls in total? len(df.REQUESTID)

- What are all the calls referred to external agencies? df[df['REFERREDTO'].notna()]

- Would you please show the top 5 most frequent call categories? df['REQCATEGORY'].value_counts().head(5)



Please ensure:

1. Always reference columns using `df['COLUMN_NAME']`.

2. Do not use Python lists like `['COLUMN_NAME']` to refer to columns.

3. Provide only the pandas query syntax without any additional explanation or markdown formatting.

Make sure to use only the columns that are available in the CSV file.

Ensure that you only generate valid pandas queries. NO SQL or other types of code/syntax.

"""

def execute_sql_query(query):
    conn = sqlite3.connect('wandsworth_callcenter_sampled.db')
    try:
        cursor = conn.cursor()
        cursor.execute(query)
        result = cursor.fetchall()
        return result
    except sqlite3.Error as e:
        sql_error_message = str(e)
        error_prompt = f"""

        You are an expert SQL debugger and an assistant of the director. An error occurred while executing the following query:

        {query}



        The error was: {sql_error_message}

        Please explain the error in simple laymen terms. Do Not explain. 

        Do Not include any programming code, e.g. sql or python syntax, etc.

        And finally politely remind the user there are only information about the following columns{', '.join(sql_cols_human)}.

        Explain this in layman's terms and remind the user that the dataset contains the following columns: {', '.join(sql_cols_human)}.

        """
        explanation = get_gemini_response("", error_prompt)
        raise HTTPException(status_code=400, detail={"error": sql_error_message, "explanation": explanation})
    finally:
        conn.close()

def execute_pandas_query(query):
    df = pd.read_csv('wandsworth_callcenter_sampled.csv')
    df.columns = df.columns.str.upper()
    query = query.replace("```python", "").replace("```", "").strip()
    query_lines = query.split("\n")
    try:
        result = None
        exec_context = {'df': df, 'pd': pd}
        for line in query_lines:
            line = line.strip()
            if line:
                exec(line, exec_context)
        result = eval(query_lines[-1].strip(), exec_context)
        if isinstance(result, pd.DataFrame):
            result = result.replace([float('inf'), -float('inf')], None).fillna(value="N/A")
            return result.to_dict(orient='records')
        elif isinstance(result, pd.Series):
            result = result.replace([float('inf'), -float('inf')], None).fillna(value="N/A")
            return result.to_dict()
        else:
            return result
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Pandas Error: {str(e)}")

@app.post("/query")
async def process_query(query: Query):
    if query.data_source == "SQL Database":
        ai_response = get_gemini_response(query.question, sql_prompt)
        try:
            result = execute_sql_query(ai_response)
            return {"query": ai_response, "result": result}
        except HTTPException as e:
            error_detail = e.detail
            return {"query": ai_response, "error": error_detail["error"], "explanation": error_detail["explanation"]}
    else:
        ai_response = get_gemini_response(query.question, csv_prompt)
        try:
            result = execute_pandas_query(ai_response)
            return {"query": ai_response, "result": result, "columns": csv_columns}
        except HTTPException as e:
            raise HTTPException(status_code=400, detail=f"Error in pandas query: {e.detail}")