DebayanDaw committed on
Commit
5692274
·
1 Parent(s): f9007d0

Upload 6 files

Browse files
app.py CHANGED
@@ -1,29 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
 
 
2
 
3
- import gradio as gr
4
- import random
5
- import time
6
-
7
- with gr.Blocks() as demo:
8
- chatbot = gr.Chatbot()
9
- msg = gr.Textbox()
10
- clear = gr.Button("Clear")
11
-
12
- def user(user_message, history):
13
- return gr.update(value="", interactive=False), history + [[user_message, None]]
14
-
15
- def bot(history):
16
- bot_message = random.choice(["How are you?", "I love you", "I'm very hungry"])
17
- history[-1][1] = ""
18
- for character in bot_message:
19
- history[-1][1] += character
20
- time.sleep(0.05)
21
- yield history
22
-
23
- response = msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
24
- bot, chatbot, chatbot
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  )
26
- response.then(lambda: gr.update(interactive=True), None, [msg], queue=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
- demo.queue()
29
- demo.launch()
 
1
+ import pickle
2
+ import openai
3
+ import re
4
+ import numpy as np
5
+ import tensorflow_hub as hub
6
+ import openai
7
+ import os
8
+ import gradio as gr
9
+ import pandas as pd
10
+ import io
11
+ import fitz
12
+ from sklearn.neighbors import NearestNeighbors
13
+ import warnings
14
+ warnings.filterwarnings(action='ignore')
15
 
16
# Pickled lists of source texts, one list per query category. Each entry is a
# string that is later split into chunks by text_to_chunks().
indi_pkl_file_path = "individual_list.pkl"
busi_pkl_file_path = "business_list.pkl"

# Precomputed embedding arrays that are handed to SemanticSearch as
# `loaded_array`. Presumably row i is the embedding of chunk i of the matching
# text list -- TODO confirm against the script that produced these files.
indi_loaded_array_file_path = "individual_numpy.npy"
busi_loaded_array_file_path = "business_numpy.npy"

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load pickles that ship with the application; never user-supplied ones.
with open(indi_pkl_file_path, "rb") as f:
    indi_texts = pickle.load(f)

with open(busi_pkl_file_path, "rb") as f:
    busi_texts = pickle.load(f)

# Disabled: only needed by the commented-out "data related query" path.
#df = pd.read_excel('ABS_Statewise_Tax_Revenue_data.xlsx')
#ndf= df.copy()

# These are ordinary module-level names; a `global` statement at module scope
# is a no-op, so none is needed here.
indi_loaded_array = np.load(indi_loaded_array_file_path)
busi_loaded_array = np.load(busi_loaded_array_file_path)
36
# SECURITY: API keys must never be committed to source control. The key is read
# from the OPENAI_API_KEY environment variable (e.g. a Hugging Face Space
# secret). Any key that was previously hard-coded here must be treated as
# leaked and revoked. An empty value makes the OpenAI calls fail with an
# authentication error rather than silently using a baked-in credential.
openAI_key = os.environ.get("OPENAI_API_KEY", "")

# Number used to label the first entry of each text list when building chunks.
start_page = 1
40
+
41
+
42
def text_to_chunks(texts, word_length=150, start_page=1):
    """Split page texts into labelled chunks of at most ``word_length`` words.

    Every entry of ``texts`` is treated as one page and tokenised on single
    spaces. When a page ends with a partial chunk (fewer than ``word_length``
    words) and it is not the last page, those words are carried over and
    prepended to the next page instead of being emitted, so only the final
    page can produce a short chunk.

    Args:
        texts: Sequence of strings, one per page. It is not modified.
        word_length: Maximum number of words per chunk; must be >= 1.
        start_page: Number used to label the first page.

    Returns:
        A list of strings shaped like ``[<page>] "<chunk text>"``.

    Raises:
        ValueError: If ``word_length`` is smaller than 1.
    """
    if word_length < 1:
        raise ValueError("word_length must be a positive integer")

    # One token list per page. These are fresh lists, so the carry-over below
    # never touches the caller's `texts`.
    text_toks = [t.split(' ') for t in texts]
    last_page = len(text_toks) - 1
    chunks = []

    for idx, words in enumerate(text_toks):
        for i in range(0, len(words), word_length):
            chunk = words[i:i + word_length]
            # A short slice can only be the tail of the page; push it onto the
            # next page (replaced before enumerate() reaches it).
            if len(chunk) < word_length and idx != last_page:
                text_toks[idx + 1] = chunk + text_toks[idx + 1]
                continue
            body = ' '.join(chunk).strip()
            chunks.append(f'[{idx + start_page}]' + ' ' + '"' + body + '"')
    return chunks
63
+
64
class SemanticSearch:
    """Nearest-neighbour lookup of text entries over precomputed embeddings.

    The query text is embedded with the Universal Sentence Encoder and matched
    against ``loaded_array`` with scikit-learn's ``NearestNeighbors``.
    """

    _USE_MODEL_URL = 'https://tfhub.dev/google/universal-sentence-encoder/4'
    # Encoder handle shared by all instances. Loading the model is expensive
    # and instances may be constructed repeatedly, so it is loaded only once.
    _shared_encoder = None

    def __init__(self, data, loaded_array, batch=1000, n_neighbors=5):
        """Fit the neighbour index.

        Args:
            data: Sequence of entries returned by lookups; indexed by the
                neighbour indices, so it should align with ``loaded_array``.
            loaded_array: Precomputed embeddings to fit the index on.
            batch: Unused; kept so existing callers remain valid.
            n_neighbors: Number of neighbours to return, capped at the number
                of embeddings.
        """
        if SemanticSearch._shared_encoder is None:
            SemanticSearch._shared_encoder = hub.load(SemanticSearch._USE_MODEL_URL)
        self.use = SemanticSearch._shared_encoder
        self.fitted = False
        self.data = data
        self.embeddings = loaded_array
        n_neighbors = min(n_neighbors, len(self.embeddings))
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)
        self.nn.fit(self.embeddings)
        self.fitted = True

    def __call__(self, text, return_data=True):
        """Return the nearest entries (or their indices) for ``text``.

        Args:
            text: Query string to embed.
            return_data: When true, return the matching items of ``data``;
                otherwise return the raw neighbour indices.
        """
        inp_emb = self.use([text])
        neighbors = self.nn.kneighbors(inp_emb, return_distance=False)[0]
        if return_data:
            return [self.data[i] for i in neighbors]
        return neighbors
84
+
85
def generate_text(openAI_key, prompt, engine="text-davinci-003"):
    """Request one completion for ``prompt`` and return its text.

    Sets ``openai.api_key`` to ``openAI_key`` as a side effect, then asks the
    legacy Completion endpoint for a single choice of up to 512 tokens at
    temperature 0.
    """
    openai.api_key = openAI_key
    request = {
        "engine": engine,
        "prompt": prompt,
        "max_tokens": 512,
        "n": 1,
        "stop": None,
        "temperature": 0,
    }
    response = openai.Completion.create(**request)
    return response.choices[0].text
97
+
98
def generate_answer(question, openAI_key):
    """Answer ``question`` from the most relevant chunks of the active index.

    Looks up the nearest chunks through the module-level ``recommender``
    (which must have been set by the caller beforehand), builds a prompt of
    search results + instructions + query, and returns the completion text
    from ``generate_text``.

    Args:
        question: The user's question.
        openAI_key: API key forwarded to ``generate_text``.

    Returns:
        The model's answer as returned by ``generate_text``.
    """
    topn_chunks = recommender(question)

    # Joined with single spaces so consecutive sentences do not run together
    # (implicit literal concatenation previously glued e.g. "etc." to "Keep").
    instructions = " ".join((
        "Instructions: Compose a comprehensive reply to the query using the search results given.",
        "Consider the provided information from Australian Taxation Office (ATO) and answer only when context is clear to you. For example : Tax, Australia etc.",
        "Keep The tone conversational",
        "Provide example from the text to make the answer understandable.",
        "If there are multiple points, use bullets to write those points.",
        "If there is any related hyperlink, include that in the answer.",
        "If the search results mention multiple subjects with the same name, create separate answers for each.",
        "Only include information found in the results and don't add any additional information.",
        "Make sure the answer is correct and don't output false content.",
        "If the text does not relate to the query, simply state 'Are you sure this is a taxation related query? If not kindly ask taxation related queries'.",
        "Ignore outlier search results which has nothing to do with the question. Only answer what is asked.",
        "The answer should be short and concise. Answer step-by-step.",
    ))

    prompt = 'search results:\n\n'
    prompt += ''.join(c + '\n\n' for c in topn_chunks)
    # The query is interpolated exactly once; the earlier prompt also carried
    # a literal, un-formatted "{question}" placeholder before the real query.
    prompt += f"{instructions}\n\nQuery: {question}\nAnswer:"

    return generate_text(openAI_key, prompt, "text-davinci-003")
121
+
122
def classify_chatgpt(question, openAI_key, engine='gpt-3.5-turbo-0301'):
    """Classify a tax question as a business or an individual taxation query.

    Sets ``openai.api_key`` as a side effect and sends ``question`` to the
    chat completion endpoint together with a system prompt describing the two
    categories.

    Args:
        question: The user's question.
        openAI_key: OpenAI API key.
        engine: Chat model name.

    Returns:
        The model's reply with surrounding whitespace removed; expected (but
        not guaranteed) to be one of the two category names.
    """
    openai.api_key = openAI_key
    message = [{'role': 'system', 'content': '''We have a user query related to taxes from Australia and your task is to classify the query into one of the category from the following given two categories only:

1. Business Taxation Query
2. Individual Taxation Query

Context: Individual Taxation Query are those queries which are related to the individual taxation of the residents of a country (here from Australia).\n
Business Taxation Query are those queries which are related from the Business perspective.


Instructions:
1. Understand the query very well.\n
2. Return the result only as a string. \n
3. Do not put any punctuation mark after classifying it. \n
4. Do not write any extra information and just do the Classification from the mentioned two categories.'''},
               {'role': 'user', 'content': question}]

    completions = openai.ChatCompletion.create(
        model=engine,
        messages=message,
        max_tokens=512,
        n=1,
        stop=None,
        # Callers compare the label against fixed strings, so sample greedily
        # (as generate_text does) instead of at temperature 0.7.
        temperature=0,
    )
    # Strip stray whitespace/newlines so label comparisons are not defeated.
    return completions.choices[0].message['content'].strip()
151
+
152
def question_answer(question, openAI_key):
    """Return the answer for ``question``; delegates to ``generate_answer``."""
    answer = generate_answer(question, openAI_key)
    return answer
155
+
156
+
157
# Chunk both text lists up front with the same page numbering offset.
indi_chunks, busi_chunks = (
    text_to_chunks(category_texts, start_page=start_page)
    for category_texts in (indi_texts, busi_texts)
)
159
+
160
+
161
+
162
+ #def struct_answer(openAI_key, engine, message, dataframe, user_question):
163
+ #
164
+ # def generate_text2(openAI_key, engine, message):
165
+ # openai.api_key = openAI_key
166
+ #
167
+ # completions = openai.ChatCompletion.create(
168
+ # model=engine,
169
+ # messages=message,
170
+ # max_tokens=512,
171
+ # n=1,
172
+ # stop=None,
173
+ # temperature=0.7,
174
+ # )
175
+ # message = completions.choices[0].message['content']
176
+ # return message
177
+ #
178
+ # def filter_dataframe(openAI_key,engine, message,df=dataframe):
179
+ # out_state_year = generate_text2(openAI_key, engine, message)
180
+ # out_state_year = eval(out_state_year)
181
+ # state= [out_state_year[1]]
182
+ # year = [out_state_year[0]]
183
+ # tax_type = [out_state_year[2]]
184
+ # print(out_state_year)
185
+ # filter_df = df[(df['Year'].isin(year)) & (df['State'].isin(state)) & (df['SubSubCategory'].isin(tax_type))].reset_index().drop('index',axis=1)
186
+ # feeded_df = filter_df.loc[:,['Country','State','Govt_Type','Year','SubSubCategory',"Tax Amount",'Unit']]
187
+ # feeded_df = feeded_df.rename({'Govt_Type':'Government Type','SubSubCategory':'Type of Tax','Tax Amount':'Tax Amount Collected'},axis=1)
188
+ #
189
+ # return feeded_df
190
+ #
191
+ # feeded_df = filter_dataframe(openAI_key, engine, message)
192
+ # print(feeded_df.shape)
193
+ #
194
+ # def dataframe_to_text(new_dataframe= feeded_df):
195
+ # new_dataframe.to_csv(r'nmdata.txt', index=None, sep='|', mode='a')
196
+ # text_file = open('nmdata.txt', 'r')
197
+ # lines = text_file.readlines()
198
+ # os.remove("nmdata.txt")
199
+ # return lines
200
+ #
201
+ # def prompt_generation(x=dataframe_to_text(), y= user_question):
202
+ # prompt = ""
203
+ # for line in x:
204
+ # prompt+=line +'\n'
205
+ # prompt += '''Take above datapoint(s) as reference and answer the following question: {0}.\n Instructions while answering the question: 1. Do not generate any additional information on your own \n
206
+ # 2. Do not do generate any number on your own. Just the number mentioned in the datapoint(s) \n
207
+ # 3. Take your time to understand the question and than only answer the question \n
208
+ # 4. Write answer in the form of sentence. '''.format(y)
209
+ # return prompt
210
+ #
211
+ # def generate_text(openAI_key,prompt, engine="text-davinci-003"):
212
+ # openai.api_key = openAI_key
213
+ # completions = openai.Completion.create(
214
+ # engine=engine,
215
+ # prompt=prompt,
216
+ # max_tokens=512,
217
+ # n=1,
218
+ # stop=None,
219
+ # temperature=0
220
+ # )
221
+ # message = completions.choices[0].text
222
+ # return message
223
+ #
224
+ # prompt_ = prompt_generation()
225
+ # response_ = generate_text(openAI_key, prompt_)
226
+ # return response_
227
+
228
# Reply used when the classifier's label matches neither category.
_UNCLASSIFIED_REPLY = (
    "Are you sure this is a taxation related query? "
    "If not kindly ask taxation related queries"
)

# Search indexes keyed by category. The chunks and embeddings never change at
# runtime, so each index is built once instead of on every question.
_RECOMMENDER_CACHE = {}


def _get_recommender(category, chunks, embeddings):
    """Return the SemanticSearch index for ``category``, building it on first use."""
    if category not in _RECOMMENDER_CACHE:
        _RECOMMENDER_CACHE[category] = SemanticSearch(data=chunks, loaded_array=embeddings)
    return _RECOMMENDER_CACHE[category]


def output_generation(typed_ques):
    """Classify ``typed_ques``, select the matching index and answer it.

    Side effect: rebinds the module-level ``recommender`` that
    ``generate_answer`` reads.

    Args:
        typed_ques: The user's question.

    Returns:
        A ``(answer, type_of_query)`` tuple. When the classifier's label is
        not recognised, ``answer`` is a fixed fallback message; the function
        never returns ``None``, so callers can always unpack the result.
    """
    global recommender

    type_of_query = classify_chatgpt(typed_ques, openAI_key)
    # Tolerate case, surrounding whitespace, trailing punctuation or list
    # numbering in the model's reply.
    label = (type_of_query or "").casefold()

    if "individual taxation query" in label:
        recommender = _get_recommender("individual", indi_chunks, indi_loaded_array)
    elif "business taxation query" in label:
        recommender = _get_recommender("business", busi_chunks, busi_loaded_array)
    else:
        return _UNCLASSIFIED_REPLY, type_of_query

    out_pu = question_answer(question=typed_ques, openAI_key=openAI_key)
    return out_pu, type_of_query
245
+ #elif type_of_query in ['Data Related Query', 'Data Related Query.']:
246
+ #
247
+ # engine="gpt-3.5-turbo-0301"
248
+ # message = [{'role': 'system', 'content': '''From the provided sentence extract out the year mentioned in the sentence, State and type of Tax for which user is asking the question about.
249
+ #
250
+ # Instructions:
251
+ # 1. Provide your output just a list in which first element is Year and second element is the State for example: [2018, "Queensland", "Land Tax"]
252
+ # 2. If multiple states and year are mentioned than provide both as the result. for example: [[2018,2020], ["New South Wales","Queensland"], ["Land Tax","Gift Tax"]]
253
+ # 3. Extract the exact name of the tax from the mentioned list of type of taxes below. For example, if in query motor tax is present but in the given below categories there is no category as 'motor tax' than classify into its most nearest category which has information about total tax present.
254
+ # 4. Do not explain your answer. Just write the answer as mentioned in the first instruction.
255
+ #
256
+ # There are total 7 states in country Australia and they are as follows:
257
+ # 1. Northern Territory
258
+ # 2. Tasmania
259
+ # 3. South Australia
260
+ # 4. Western Australia
261
+ # 5. Queensland
262
+ # 6. Victoria
263
+ # 7. New South Wales
264
+ #
265
+ # Classify the State from above 7 names but in case, if in the sentence none of the states are mentioned than classify from the below special cases:
266
+ # 1. Give "Australia" as the result if nothing is mentioned at all.
267
+ # 2. Give "All States" as the result if the sentence is talking about all the states of Australia or all the state governments of Australia.
268
+ # 3. Give "Australian Capital Territory" as the result if the sentence is talking about the Australian Capitals territory.
269
+ #
270
+ #
271
+ # Also from the sentence classify the type of tax for which user is talking about from the below list of Tax Type:
272
+ # 1. Taxes on employers payroll and labour force, 2. Land taxes, 3. Municipal rates, 4. Other taxes on property, 5. Total taxes on immovable property, 6. Estate, inheritance and gift taxes,
273
+ # 7. Total taxes on property, 8. Excises and levies, 9. Taxes on government lotteries, 10. Taxes on private lotteries, 11. Taxes on gambling machines, 12. Casino taxes,
274
+ # 13. Race and other sports betting taxes, 14. Other taxes on gambling, 15. Total taxes on gambling, 16. Insurance companies contributions to fire brigades,
275
+ # 17. Third party insurance taxes, 18. Other taxes on insurance, 19. Total taxes on insurance, 20. Government borrowing guarantee levies, 21. Stamp duties on conveyances,
276
+ # 22. Other taxes on financial and capital transactions, 23. Total taxes on financial and capital transactions, 24. Total taxes on the provision of goods and services,
277
+ # 25. Stamp duty on vehicle registration, 26. Other motor vehicle taxes, 27. Total motor vehicle taxes, 28. Franchise taxes, 29. Other taxes on use of goods and performance of activities,
278
+ # 30. Total taxes on use of goods and performance of activities, 31. Total Taxation Northern Territory State and Local Government, 32. Taxes received from public corporations,
279
+ # 33. Taxes received from other levels of government, 34. Total Taxation Tasmania State and Local Government, 35. Total Taxation South Australia State and Local Government,
280
+ # 36. Total Taxation Western Australia State and Local Government, 37. Total Taxation Queensland State and Local Government, 38. Total Taxation Victoria State and Local Government,
281
+ # 39. Total Taxation New South Wales State and Local Government, 40. Total Taxation Australian Capital Territory State Government, 41. Personal income tax, 42. Government health insurance levy,
282
+ # 43. Fringe benefits tax, 44. Other income tax levied on individuals, 45. Total income taxes levied on individuals, 46. Company income tax, 47. Income tax paid by superannuation funds,
283
+ # 48. Other income tax levied on enterprises, 49. Total income taxes levied on enterprises, 50. Dividend withholding tax, 51. Interest withholding tax, 52. Other income taxes levied on non-residents,
284
+ # 53. Total income taxes levied on non-residents, 54. Total taxes on income, 55. General taxes (sales taxes), 56. Goods and services tax (GST),
285
+ # 57. Crude oil and LPG, 58. Other excises, 59. Agricultural production taxes, 60. Levies on statutory corporations, 61. Total excises and levies, 62. Taxes on international trade,
286
+ # 63. Taxes on financial and capital transactions, 64. Taxes on the use of goods and performance of activities, 65. Total Taxation on Commonwealth Government'''},
287
+ # {'role': 'user', 'content': typed_ques}]
288
+ # output_ = struct_answer(openAI_key, engine, message, ndf, typed_ques).lstrip()
289
+ # #print(output_)
290
+ # return output_, type_of_query
291
+
292
+
293
+
294
# Heading rendered at the top of the page.
title = 'CDI Citizen Intelligence 360 Tool: Tax Advisory'

# Introductory text rendered below the banner image.
description = (
    "Welcome to Citizen Intelligence 360 Tool, your intelligent tax companion. "
    "Harnessing the power of advanced GEN AI, the tax advisory tool is here to revolutionize your tax experience. "
    "With its unrivaled expertise and personalized guidance, the tool simplifies complex tax matters, providing accurate answers to your questions instantly. "
    "Say goodbye to confusing endless research and experience the future of tax advisory with Citizen Intelligence 360 Tool - the intelligent solution for your tax needs."
)

# Banner image markup (with its own inline stylesheet) embedded in the page.
img_ = '''<html><head><style>
.image {
width: 500px;
height: 150px;
position: static;
}</style></head><body><img src= https://blog.ipleaders.in/wp-content/uploads/2020/08/HDFC_Life_Filed_Your_Returns_Here%D0%A2s_How_You_Can_Check_to_Be_Sure_Sept19-1.jpg class=image></body></html>'''
303
+
304
+
305
def gradio_chatbox(input, history):
    """Gradio callback: answer the question and append it to the chat history.

    Args:
        input: Text typed by the user. (The name shadows the builtin but is
            kept because it is part of the function's signature.)
        history: List of ``(question, answer)`` tuples from previous turns, or
            ``None`` on the first call.

    Returns:
        ``(history, history)`` -- the same list twice, once for the Chatbot
        component and once for the session state.
    """
    history = history or []

    # Blank submissions are ignored rather than being sent to the paid APIs.
    if not input or not input.strip():
        return history, history

    result = output_generation(input)
    if result is None:
        # Defensive: the question could not be routed to a category.
        answer = ("Are you sure this is a taxation related query? "
                  "If not kindly ask taxation related queries")
    else:
        answer, _query_type = result

    history.append((input, "{0}".format(answer)))
    return history, history
316
+
317
# Page layout: heading, banner, description, chat window, input box and a
# SEND button wired to gradio_chatbox.
with gr.Blocks() as block:
    gr.Markdown(f'<center><h1>{title}</h1></center>')
    gr.Markdown(f'<center><h1>{img_}</h1></center>')
    gr.Markdown(description)

    chatbot = gr.Chatbot(label="Tax GenAI")
    message = gr.Textbox(
        label='Please ask your question',
        placeholder="Welcome! This is Tax GenAI.\nHow can I assist you today?",
    )
    # Holds the conversation history between button clicks.
    state = gr.State()

    submit = gr.Button("SEND")
    submit.style(full_width=None, size='lg')
    submit.click(gradio_chatbox, inputs=[message, state], outputs=[chatbot, state])

block.launch()
 
business_list.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bf7ebb97a4a7e1b3cfec4110a1f379ad03209f66012e7a43b30437209817fe5
3
+ size 3905795
business_numpy.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd12b4bbb6d106c4906ff2ac11fbc7a969ab2684f32b8d338d5b45dcb9ad1ca
3
+ size 8523904
individual_list.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bacfc786a5aa9560c7b0138a9f52706982ac798860eb4a44fffa44a3492eb864
3
+ size 2236261
individual_numpy.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:610bd857cccf1beeaa962415ba0e809622232901cbfd27f32fb3561aa347cbff
3
+ size 5027968
requirements (1).txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ PyMuPDF
2
+ openai
3
+ frontend
4
+ tensorflow==2.9.2
5
+ tensorflow-hub==0.12.0
6
+ scikit-learn==1.0.2