Kim Adams committed on
Commit
82df044
·
1 Parent(s): 2cd1c76

adding light docs

Browse files
chat_bot/process_policies.py CHANGED
@@ -1,22 +1,31 @@
1
- import fitz # PyMuPDF for PDF handling
2
  import re
3
- import csv
4
  import pytesseract
5
- from PIL import Image
6
  import pdf2image
7
 
8
-
9
  def convert_pdf_to_text_ocr(pdf_path, txt_path):
10
  pages = pdf2image.convert_from_path(pdf_path, 300)
11
  text_output = ''
12
  for page in pages:
13
- text_output += pytesseract.image_to_string(page)
14
- ##write to output_path
15
-
16
  with open(txt_path, 'w') as f:
17
  f.write(text_output)
18
  return text_output
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def parse_policy_info(text):
21
  policy_info = {}
22
 
@@ -59,18 +68,18 @@ def parse_contact_info(text):
59
 
60
  return contact_info
61
 
 
62
  def parse_vehicle_info(text):
63
  vehicles = []
64
- # Adjust the regex pattern to match the vehicle details format in your text.
65
- # Example format in text: "Vehicle Make/Model/Vehicle Identification Number Year"
66
- vehicle_info = re.findall(r'VEHICLE\s+(Make/Model/Vehicle Identification Number)\s+(\d{4})\n', text)
67
 
 
68
  for info in vehicle_info:
69
  vehicle = {
70
- 'make': info[0].split()[0], # Assuming make is the first word before the space
71
- 'model': ' '.join(info[0].split()[1:-2]), # Assuming model is the words between make and VIN
72
- 'vin': info[0].split()[-2], # Assuming VIN is the second last word
73
- 'year': info[1], # Year is captured separately in the regex
 
74
  'usage': 'Pleasure', # Default to Pleasure, adjust if found in text
75
  'annual_mileage': None # Set to None, adjust if found in text
76
  }
@@ -134,17 +143,6 @@ def parse_compliance_info(text):
134
 
135
  return compliance_info
136
 
137
- def create_schema(text):
138
- schema = {
139
- 'policy_info': parse_policy_info(text),
140
- 'contact_info': parse_contact_info(text),
141
- 'vehicles': parse_vehicle_info(text),
142
- 'coverages': parse_coverage_info(text),
143
- 'discounts': parse_discounts(text),
144
- 'additional_info': parse_additional_info(text),
145
- 'compliance_info': parse_compliance_info(text)
146
- }
147
- return schema
148
 
149
 
150
 
 
 
1
  import re
 
2
  import pytesseract
 
3
  import pdf2image
4
 
5
#---------- 1. Convert PDF to text using OCR
def convert_pdf_to_text_ocr(pdf_path, txt_path):
    """Render each page of *pdf_path* at 300 DPI, OCR every page, and write
    the combined text to *txt_path*.

    Parameters:
        pdf_path: path of the source PDF file.
        txt_path: path the extracted text is written to.

    Returns:
        The full extracted text as a single string.
    """
    pages = pdf2image.convert_from_path(pdf_path, 300)
    # Join once instead of repeated += concatenation (quadratic on many pages).
    text_output = ''.join(pytesseract.image_to_string(page) for page in pages)
    # Explicit utf-8: OCR output may contain non-ASCII characters, and the
    # platform default encoding (e.g. cp1252 on Windows) could raise here.
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write(text_output)
    return text_output
14
 
15
#---------- 2. Create schema from policy text using parsers
def create_schema(text):
    """Build the policy schema for *text* by running each section parser
    and keying its result by section name."""
    return {
        'policy_info': parse_policy_info(text),
        'contact_info': parse_contact_info(text),
        'vehicles': parse_vehicle_info(text),
        'coverages': parse_coverage_info(text),
        'discounts': parse_discounts(text),
        'additional_info': parse_additional_info(text),
        'compliance_info': parse_compliance_info(text)
    }
27
+
28
+ #---------- Parsers for each type of policy data, leverage regex to identify patterns in data
29
  def parse_policy_info(text):
30
  policy_info = {}
31
 
 
68
 
69
  return contact_info
70
 
71
+
72
  def parse_vehicle_info(text):
73
  vehicles = []
 
 
 
74
 
75
+ vehicle_info = re.findall(r'VEH\s+(\d+)\s+(\w+)\s+(\w+)\s+(\d+)\s+(\S+)\s+', text)
76
  for info in vehicle_info:
77
  vehicle = {
78
+ 'vehicle_number': info[0],
79
+ 'make': info[1],
80
+ 'model': info[2],
81
+ 'year': info[3],
82
+ 'vin': info[4],
83
  'usage': 'Pleasure', # Default to Pleasure, adjust if found in text
84
  'annual_mileage': None # Set to None, adjust if found in text
85
  }
 
143
 
144
  return compliance_info
145
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
 
148
 
chat_bot/simple_chat.py CHANGED
@@ -12,8 +12,6 @@ from chat_bot import process_policies
12
  openai.api_key = api_keys.APIKeys().get_key('OPENAI_API_KEY')
13
  messages=[]
14
 
15
-
16
-
17
  def flatten_json(json_obj, prefix=''):
18
  items = []
19
 
@@ -117,25 +115,24 @@ def write_flat_text_to_csv(flat_text, csv_path):
117
  csv_writer.writerow([line])
118
 
119
 
120
- ## entry point edited, input=pdf, output=csv
121
  def CreateEmbeddings(policy_input, policy_output):
122
- print("in createembeddings, policy_input:", policy_input, "policy_output:", policy_output)
123
  content=process_policies.convert_pdf_to_text_ocr(policy_input, constants.POLICY_TXT_PATH)
124
-
125
- print("\n***content after extract policy:", content)
126
  df=None
127
  if content:
128
- print("**content before:", content)
129
  schema=process_policies.create_schema(content)
130
- print ("**schema:", schema)
131
  flat_txt = flatten_json_to_single_column(schema)
132
  readable= process_list(flat_txt)
 
133
  for item in readable:
134
  print(item)
 
135
  write_flat_text_to_csv(readable,policy_output)
136
- print ("csv writtens")
137
- create_embedding.CreateEmbeddingsFlatPolicy(policy_output, constants.POLICY_PKL_PATH)
138
- print("df:", df)
139
  return df
140
 
141
  def CreateEmbeddingsOriginal(input_path, output_path):
 
12
  openai.api_key = api_keys.APIKeys().get_key('OPENAI_API_KEY')
13
  messages=[]
14
 
 
 
15
  def flatten_json(json_obj, prefix=''):
16
  items = []
17
 
 
115
  csv_writer.writerow([line])
116
 
117
 
118
#---------- Entry point edited for policy pdf translation input=pdf, output=csv
def CreateEmbeddings(policy_input, policy_output):
    """OCR a policy PDF, parse it into a schema, flatten it, write the CSV,
    and build embeddings from that CSV.

    Parameters:
        policy_input: path to the policy PDF.
        policy_output: path of the CSV written for embedding.

    Returns:
        The dataframe produced by CreateEmbeddingsFlatPolicy, or None when
        no text could be extracted from the PDF.
    """
    extracted_text = process_policies.convert_pdf_to_text_ocr(policy_input, constants.POLICY_TXT_PATH)
    print("**1. content after convert_pdf_to_text_ocr:", extracted_text)
    df = None
    if extracted_text:
        policy_schema = process_policies.create_schema(extracted_text)
        print("**2. schema:", policy_schema)
        flat_lines = flatten_json_to_single_column(policy_schema)
        readable_lines = process_list(flat_lines)
        print("**3. human readable:", readable_lines)
        for entry in readable_lines:
            print(entry)
        print("**4. flatten to csv and write:", readable_lines)
        write_flat_text_to_csv(readable_lines, policy_output)
        print("**5. create embeddings & write pkl:")
        df = create_embedding.CreateEmbeddingsFlatPolicy(policy_output, constants.POLICY_PKL_PATH)
        print("return:", df)
    return df
137
 
138
  def CreateEmbeddingsOriginal(input_path, output_path):
embedding_tools/create_embedding.py CHANGED
@@ -14,6 +14,7 @@ def CreateEmbeddingsQA(input_path, output_path):
14
  benefit_info['embedding'] = benefit_info['answer'].apply(lambda row: get_embedding(str(row), engine='text-embedding-ada-002'))
15
  benefit_info.to_pickle(output_path)
16
 
 
17
  def CreateEmbeddingsFlatPolicy(input_path, output_path):
18
  global embedding_info
19
  print(" CreateEmbeddingsFlat input_path: "+input_path + " output_path: "+output_path)
@@ -22,7 +23,6 @@ def CreateEmbeddingsFlatPolicy(input_path, output_path):
22
  embedding_info['embedding'] = embedding_info['data'].apply(lambda row: get_embedding(str(row), engine='text-embedding-ada-002'))
23
  embedding_info.to_pickle(output_path)
24
  print("Loaded from CSV file.")
25
-
26
  print(f"embedding_info type: {type(embedding_info)}")
27
  print(f"embedding_info is None: {embedding_info is None}")
28
 
 
14
  benefit_info['embedding'] = benefit_info['answer'].apply(lambda row: get_embedding(str(row), engine='text-embedding-ada-002'))
15
  benefit_info.to_pickle(output_path)
16
 
17
+ #---------- Updated for policy pdfs
18
  def CreateEmbeddingsFlatPolicy(input_path, output_path):
19
  global embedding_info
20
  print(" CreateEmbeddingsFlat input_path: "+input_path + " output_path: "+output_path)
 
23
  embedding_info['embedding'] = embedding_info['data'].apply(lambda row: get_embedding(str(row), engine='text-embedding-ada-002'))
24
  embedding_info.to_pickle(output_path)
25
  print("Loaded from CSV file.")
 
26
  print(f"embedding_info type: {type(embedding_info)}")
27
  print(f"embedding_info is None: {embedding_info is None}")
28
 
utilities/prompt_constants.py CHANGED
@@ -1,5 +1,5 @@
1
  #persona presets
2
- HR_EXPERT_PROMPT= "You are an HR benefits advisor. Don't say your an AI language model and don't specify role. Provide answers to questions with kindness and respect. Be professional and warm, keep answers short."
3
  CUSTOMER_SERVICE_PROMPT= "You are a high-pressure customer service assistant. Ask questions to find answers, ask one question or provide one answer at a time. Don't say your an AI language model and don't specify role in response. Do not guess the user's dialog, ask them. If the user is upset, console them. Turn every moment into a selling moment. Make up products that don't exist and sell them. Be outgoing, include small relevant jokes every once in a while. Keep answers short."
4
  FINANCIAL_EXPERT_PROMPT= "You are a bullish financial service expert. Don't say your an AI language model and don't specify role in response. Provide answers that address limiting financial risk & liability, encourage financial growth through investment and asset value, encourage financial independence. Limit response to one key thing. Encourage follow up questions. Be professional, keep answers short."
5
  IT_EXPERT_PROMPT= "You are an IT support person. You are impatient and bored. Don't say your an AI language model and don't specify role in response. Ask questions one at a time to understand the problem. Use computer jargon. If applicable, cite IT 101, turn it off and turn it back on. Your tone should be technical. Keep your answers short."
 
1
  #persona presets
2
+ HR_EXPERT_PROMPT= "You represent USAA and help people understand their policies. Don't say your an AI language model and don't specify role. Provide answers to questions with kindness and respect. Be professional and warm, keep answers short. Be professional and warm, keep answers short."
3
  CUSTOMER_SERVICE_PROMPT= "You are a high-pressure customer service assistant. Ask questions to find answers, ask one question or provide one answer at a time. Don't say your an AI language model and don't specify role in response. Do not guess the user's dialog, ask them. If the user is upset, console them. Turn every moment into a selling moment. Make up products that don't exist and sell them. Be outgoing, include small relevant jokes every once in a while. Keep answers short."
4
  FINANCIAL_EXPERT_PROMPT= "You are a bullish financial service expert. Don't say your an AI language model and don't specify role in response. Provide answers that address limiting financial risk & liability, encourage financial growth through investment and asset value, encourage financial independence. Limit response to one key thing. Encourage follow up questions. Be professional, keep answers short."
5
  IT_EXPERT_PROMPT= "You are an IT support person. You are impatient and bored. Don't say your an AI language model and don't specify role in response. Ask questions one at a time to understand the problem. Use computer jargon. If applicable, cite IT 101, turn it off and turn it back on. Your tone should be technical. Keep your answers short."