Kim Adams committed on
Commit ·
82df044
1
Parent(s): 2cd1c76
adding light docs
Browse files - chat_bot/process_policies.py +23 -25
- chat_bot/simple_chat.py +8 -11
- embedding_tools/create_embedding.py +1 -1
- utilities/prompt_constants.py +1 -1
chat_bot/process_policies.py
CHANGED
|
@@ -1,22 +1,31 @@
|
|
| 1 |
-
import fitz # PyMuPDF for PDF handling
|
| 2 |
import re
|
| 3 |
-
import csv
|
| 4 |
import pytesseract
|
| 5 |
-
from PIL import Image
|
| 6 |
import pdf2image
|
| 7 |
|
| 8 |
-
|
| 9 |
def convert_pdf_to_text_ocr(pdf_path, txt_path):
|
| 10 |
pages = pdf2image.convert_from_path(pdf_path, 300)
|
| 11 |
text_output = ''
|
| 12 |
for page in pages:
|
| 13 |
-
text_output += pytesseract.image_to_string(page)
|
| 14 |
-
##write to output_path
|
| 15 |
-
|
| 16 |
with open(txt_path, 'w') as f:
|
| 17 |
f.write(text_output)
|
| 18 |
return text_output
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
def parse_policy_info(text):
|
| 21 |
policy_info = {}
|
| 22 |
|
|
@@ -59,18 +68,18 @@ def parse_contact_info(text):
|
|
| 59 |
|
| 60 |
return contact_info
|
| 61 |
|
|
|
|
| 62 |
def parse_vehicle_info(text):
|
| 63 |
vehicles = []
|
| 64 |
-
# Adjust the regex pattern to match the vehicle details format in your text.
|
| 65 |
-
# Example format in text: "Vehicle Make/Model/Vehicle Identification Number Year"
|
| 66 |
-
vehicle_info = re.findall(r'VEHICLE\s+(Make/Model/Vehicle Identification Number)\s+(\d{4})\n', text)
|
| 67 |
|
|
|
|
| 68 |
for info in vehicle_info:
|
| 69 |
vehicle = {
|
| 70 |
-
'
|
| 71 |
-
'
|
| 72 |
-
'
|
| 73 |
-
'year': info[
|
|
|
|
| 74 |
'usage': 'Pleasure', # Default to Pleasure, adjust if found in text
|
| 75 |
'annual_mileage': None # Set to None, adjust if found in text
|
| 76 |
}
|
|
@@ -134,17 +143,6 @@ def parse_compliance_info(text):
|
|
| 134 |
|
| 135 |
return compliance_info
|
| 136 |
|
| 137 |
-
def create_schema(text):
|
| 138 |
-
schema = {
|
| 139 |
-
'policy_info': parse_policy_info(text),
|
| 140 |
-
'contact_info': parse_contact_info(text),
|
| 141 |
-
'vehicles': parse_vehicle_info(text),
|
| 142 |
-
'coverages': parse_coverage_info(text),
|
| 143 |
-
'discounts': parse_discounts(text),
|
| 144 |
-
'additional_info': parse_additional_info(text),
|
| 145 |
-
'compliance_info': parse_compliance_info(text)
|
| 146 |
-
}
|
| 147 |
-
return schema
|
| 148 |
|
| 149 |
|
| 150 |
|
|
|
|
|
|
|
| 1 |
import re
|
|
|
|
| 2 |
import pytesseract
|
|
|
|
| 3 |
import pdf2image
|
| 4 |
|
| 5 |
+
#---------- 1. Convert PDF to text using OCR
def convert_pdf_to_text_ocr(pdf_path, txt_path):
    """OCR every page of a PDF, write the combined text to a file, and return it.

    pdf_path -- path of the source PDF file
    txt_path -- path the extracted text is written to (overwritten if present)

    Returns the full OCR text of all pages as a single string.
    """
    # Rasterize at 300 DPI -- high enough for reliable OCR on scanned policies.
    pages = pdf2image.convert_from_path(pdf_path, 300)
    text_output = ''
    for page in pages:
        text_output += pytesseract.image_to_string(page)
    # Explicit utf-8: OCR output can contain non-ASCII characters, and the
    # platform default encoding (e.g. cp1252 on Windows) may fail to encode them.
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write(text_output)
    return text_output
| 14 |
|
| 15 |
+
#---------- 2. Create schema from policy text using parsers
def create_schema(text):
    """Assemble the structured policy schema from raw OCR text.

    Runs every section parser over *text* and collects the results into
    one dictionary, keyed by schema section name.
    """
    # Each key maps a schema section to the parser that extracts it.
    return {
        'policy_info': parse_policy_info(text),
        'contact_info': parse_contact_info(text),
        'vehicles': parse_vehicle_info(text),
        'coverages': parse_coverage_info(text),
        'discounts': parse_discounts(text),
        'additional_info': parse_additional_info(text),
        'compliance_info': parse_compliance_info(text)
    }
|
| 27 |
+
|
| 28 |
+
#---------- Parsers for each type of policy data, leverage regex to identify patterns in data
|
| 29 |
def parse_policy_info(text):
|
| 30 |
policy_info = {}
|
| 31 |
|
|
|
|
| 68 |
|
| 69 |
return contact_info
|
| 70 |
|
| 71 |
+
|
| 72 |
def parse_vehicle_info(text):
|
| 73 |
vehicles = []
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
+
vehicle_info = re.findall(r'VEH\s+(\d+)\s+(\w+)\s+(\w+)\s+(\d+)\s+(\S+)\s+', text)
|
| 76 |
for info in vehicle_info:
|
| 77 |
vehicle = {
|
| 78 |
+
'vehicle_number': info[0],
|
| 79 |
+
'make': info[1],
|
| 80 |
+
'model': info[2],
|
| 81 |
+
'year': info[3],
|
| 82 |
+
'vin': info[4],
|
| 83 |
'usage': 'Pleasure', # Default to Pleasure, adjust if found in text
|
| 84 |
'annual_mileage': None # Set to None, adjust if found in text
|
| 85 |
}
|
|
|
|
| 143 |
|
| 144 |
return compliance_info
|
| 145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
|
| 147 |
|
| 148 |
|
chat_bot/simple_chat.py
CHANGED
|
@@ -12,8 +12,6 @@ from chat_bot import process_policies
|
|
| 12 |
openai.api_key = api_keys.APIKeys().get_key('OPENAI_API_KEY')
|
| 13 |
messages=[]
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
| 17 |
def flatten_json(json_obj, prefix=''):
|
| 18 |
items = []
|
| 19 |
|
|
@@ -117,25 +115,24 @@ def write_flat_text_to_csv(flat_text, csv_path):
|
|
| 117 |
csv_writer.writerow([line])
|
| 118 |
|
| 119 |
|
| 120 |
-
#
|
| 121 |
def CreateEmbeddings(policy_input, policy_output):
|
| 122 |
-
print("in createembeddings, policy_input:", policy_input, "policy_output:", policy_output)
|
| 123 |
content=process_policies.convert_pdf_to_text_ocr(policy_input, constants.POLICY_TXT_PATH)
|
| 124 |
-
|
| 125 |
-
print("\n***content after extract policy:", content)
|
| 126 |
df=None
|
| 127 |
if content:
|
| 128 |
-
print("**content before:", content)
|
| 129 |
schema=process_policies.create_schema(content)
|
| 130 |
-
print ("**schema:", schema)
|
| 131 |
flat_txt = flatten_json_to_single_column(schema)
|
| 132 |
readable= process_list(flat_txt)
|
|
|
|
| 133 |
for item in readable:
|
| 134 |
print(item)
|
|
|
|
| 135 |
write_flat_text_to_csv(readable,policy_output)
|
| 136 |
-
print ("
|
| 137 |
-
create_embedding.CreateEmbeddingsFlatPolicy(policy_output, constants.POLICY_PKL_PATH)
|
| 138 |
-
print("
|
| 139 |
return df
|
| 140 |
|
| 141 |
def CreateEmbeddingsOriginal(input_path, output_path):
|
|
|
|
| 12 |
openai.api_key = api_keys.APIKeys().get_key('OPENAI_API_KEY')
|
| 13 |
messages=[]
|
| 14 |
|
|
|
|
|
|
|
| 15 |
def flatten_json(json_obj, prefix=''):
|
| 16 |
items = []
|
| 17 |
|
|
|
|
| 115 |
csv_writer.writerow([line])
|
| 116 |
|
| 117 |
|
| 118 |
+
#---------- Entry point edited for policy pdf translation input=pdf, output=csv
def CreateEmbeddings(policy_input, policy_output):
    """Full pipeline: OCR the policy PDF, parse it into a schema, flatten
    the schema to human-readable rows, write them to CSV, then build
    embeddings from that CSV.

    policy_input  -- path of the policy PDF to process
    policy_output -- path of the CSV the flattened policy is written to

    Returns the embeddings dataframe, or None when no text was extracted.
    """
    policy_text = process_policies.convert_pdf_to_text_ocr(policy_input, constants.POLICY_TXT_PATH)
    print("**1. content after convert_pdf_to_text_ocr:", policy_text)
    # No OCR text means there is nothing to parse or embed.
    if not policy_text:
        return None
    policy_schema = process_policies.create_schema(policy_text)
    print ("**2. schema:", policy_schema)
    readable_rows = process_list(flatten_json_to_single_column(policy_schema))
    print ("**3. human readable:", readable_rows)
    for row in readable_rows:
        print(row)
    print ("**4. flatten to csv and write:", readable_rows)
    write_flat_text_to_csv(readable_rows, policy_output)
    print ("**5. create embeddings & write pkl:")
    embeddings_df = create_embedding.CreateEmbeddingsFlatPolicy(policy_output, constants.POLICY_PKL_PATH)
    print("return:", embeddings_df)
    return embeddings_df
|
| 137 |
|
| 138 |
def CreateEmbeddingsOriginal(input_path, output_path):
|
embedding_tools/create_embedding.py
CHANGED
|
@@ -14,6 +14,7 @@ def CreateEmbeddingsQA(input_path, output_path):
|
|
| 14 |
benefit_info['embedding'] = benefit_info['answer'].apply(lambda row: get_embedding(str(row), engine='text-embedding-ada-002'))
|
| 15 |
benefit_info.to_pickle(output_path)
|
| 16 |
|
|
|
|
| 17 |
def CreateEmbeddingsFlatPolicy(input_path, output_path):
|
| 18 |
global embedding_info
|
| 19 |
print(" CreateEmbeddingsFlat input_path: "+input_path + " output_path: "+output_path)
|
|
@@ -22,7 +23,6 @@ def CreateEmbeddingsFlatPolicy(input_path, output_path):
|
|
| 22 |
embedding_info['embedding'] = embedding_info['data'].apply(lambda row: get_embedding(str(row), engine='text-embedding-ada-002'))
|
| 23 |
embedding_info.to_pickle(output_path)
|
| 24 |
print("Loaded from CSV file.")
|
| 25 |
-
|
| 26 |
print(f"embedding_info type: {type(embedding_info)}")
|
| 27 |
print(f"embedding_info is None: {embedding_info is None}")
|
| 28 |
|
|
|
|
| 14 |
benefit_info['embedding'] = benefit_info['answer'].apply(lambda row: get_embedding(str(row), engine='text-embedding-ada-002'))
|
| 15 |
benefit_info.to_pickle(output_path)
|
| 16 |
|
| 17 |
+
#---------- Updated for policy pdfs
|
| 18 |
def CreateEmbeddingsFlatPolicy(input_path, output_path):
|
| 19 |
global embedding_info
|
| 20 |
print(" CreateEmbeddingsFlat input_path: "+input_path + " output_path: "+output_path)
|
|
|
|
| 23 |
embedding_info['embedding'] = embedding_info['data'].apply(lambda row: get_embedding(str(row), engine='text-embedding-ada-002'))
|
| 24 |
embedding_info.to_pickle(output_path)
|
| 25 |
print("Loaded from CSV file.")
|
|
|
|
| 26 |
print(f"embedding_info type: {type(embedding_info)}")
|
| 27 |
print(f"embedding_info is None: {embedding_info is None}")
|
| 28 |
|
utilities/prompt_constants.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
#persona presets
|
| 2 |
-
HR_EXPERT_PROMPT= "You
|
| 3 |
CUSTOMER_SERVICE_PROMPT= "You are a high-pressure customer service assistant. Ask questions to find answers, ask one question or provide one answer at a time. Don't say your an AI language model and don't specify role in response. Do not guess the user's dialog, ask them. If the user is upset, console them. Turn every moment into a selling moment. Make up products that don't exist and sell them. Be outgoing, include small relevant jokes every once in a while. Keep answers short."
|
| 4 |
FINANCIAL_EXPERT_PROMPT= "You are a bullish financial service expert. Don't say your an AI language model and don't specify role in response. Provide answers that address limiting financial risk & liability, encourage financial growth through investment and asset value, encourage financial independence. Limit response to one key thing. Encourage follow up questions. Be professional, keep answers short."
|
| 5 |
IT_EXPERT_PROMPT= "You are an IT support person. You are impatient and bored. Don't say your an AI language model and don't specify role in response. Ask questions one at a time to understand the problem. Use computer jargon. If applicable, cite IT 101, turn it off and turn it back on. Your tone should be technical. Keep your answers short."
|
|
|
|
| 1 |
#persona presets
# Each constant is a complete system-prompt string selecting a chat-bot persona.
# HR prompt: USAA policy assistant (accidentally duplicated closing sentence removed).
HR_EXPERT_PROMPT= "You represent USAA and help people understand their policies. Don't say your an AI language model and don't specify role. Provide answers to questions with kindness and respect. Be professional and warm, keep answers short."
CUSTOMER_SERVICE_PROMPT= "You are a high-pressure customer service assistant. Ask questions to find answers, ask one question or provide one answer at a time. Don't say your an AI language model and don't specify role in response. Do not guess the user's dialog, ask them. If the user is upset, console them. Turn every moment into a selling moment. Make up products that don't exist and sell them. Be outgoing, include small relevant jokes every once in a while. Keep answers short."
FINANCIAL_EXPERT_PROMPT= "You are a bullish financial service expert. Don't say your an AI language model and don't specify role in response. Provide answers that address limiting financial risk & liability, encourage financial growth through investment and asset value, encourage financial independence. Limit response to one key thing. Encourage follow up questions. Be professional, keep answers short."
IT_EXPERT_PROMPT= "You are an IT support person. You are impatient and bored. Don't say your an AI language model and don't specify role in response. Ask questions one at a time to understand the problem. Use computer jargon. If applicable, cite IT 101, turn it off and turn it back on. Your tone should be technical. Keep your answers short."
|