Kim Adams committed on
Commit ·
82df044
1
Parent(s): 2cd1c76
adding light docs
Browse files - chat_bot/process_policies.py +23 -25
- chat_bot/simple_chat.py +8 -11
- embedding_tools/create_embedding.py +1 -1
- utilities/prompt_constants.py +1 -1
chat_bot/process_policies.py
CHANGED
|
@@ -1,22 +1,31 @@
|
|
| 1 |
-
import fitz # PyMuPDF for PDF handling
|
| 2 |
import re
|
| 3 |
-
import csv
|
| 4 |
import pytesseract
|
| 5 |
-
from PIL import Image
|
| 6 |
import pdf2image
|
| 7 |
|
| 8 |
-
|
| 9 |
def convert_pdf_to_text_ocr(pdf_path, txt_path):
|
| 10 |
pages = pdf2image.convert_from_path(pdf_path, 300)
|
| 11 |
text_output = ''
|
| 12 |
for page in pages:
|
| 13 |
-
text_output += pytesseract.image_to_string(page)
|
| 14 |
-
##write to output_path
|
| 15 |
-
|
| 16 |
with open(txt_path, 'w') as f:
|
| 17 |
f.write(text_output)
|
| 18 |
return text_output
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
def parse_policy_info(text):
|
| 21 |
policy_info = {}
|
| 22 |
|
|
@@ -59,18 +68,18 @@ def parse_contact_info(text):
|
|
| 59 |
|
| 60 |
return contact_info
|
| 61 |
|
|
|
|
| 62 |
def parse_vehicle_info(text):
|
| 63 |
vehicles = []
|
| 64 |
-
# Adjust the regex pattern to match the vehicle details format in your text.
|
| 65 |
-
# Example format in text: "Vehicle Make/Model/Vehicle Identification Number Year"
|
| 66 |
-
vehicle_info = re.findall(r'VEHICLE\s+(Make/Model/Vehicle Identification Number)\s+(\d{4})\n', text)
|
| 67 |
|
|
|
|
| 68 |
for info in vehicle_info:
|
| 69 |
vehicle = {
|
| 70 |
-
'
|
| 71 |
-
'
|
| 72 |
-
'
|
| 73 |
-
'year': info[
|
|
|
|
| 74 |
'usage': 'Pleasure', # Default to Pleasure, adjust if found in text
|
| 75 |
'annual_mileage': None # Set to None, adjust if found in text
|
| 76 |
}
|
|
@@ -134,17 +143,6 @@ def parse_compliance_info(text):
|
|
| 134 |
|
| 135 |
return compliance_info
|
| 136 |
|
| 137 |
-
def create_schema(text):
|
| 138 |
-
schema = {
|
| 139 |
-
'policy_info': parse_policy_info(text),
|
| 140 |
-
'contact_info': parse_contact_info(text),
|
| 141 |
-
'vehicles': parse_vehicle_info(text),
|
| 142 |
-
'coverages': parse_coverage_info(text),
|
| 143 |
-
'discounts': parse_discounts(text),
|
| 144 |
-
'additional_info': parse_additional_info(text),
|
| 145 |
-
'compliance_info': parse_compliance_info(text)
|
| 146 |
-
}
|
| 147 |
-
return schema
|
| 148 |
|
| 149 |
|
| 150 |
|
|
|
|
|
|
|
| 1 |
import re
|
|
|
|
| 2 |
import pytesseract
|
|
|
|
| 3 |
import pdf2image
|
| 4 |
|
| 5 |
+
#---------- 1. Convert PDF to text using OCR
def convert_pdf_to_text_ocr(pdf_path, txt_path):
    """OCR every page of a PDF, write the combined text to a file, and return it.

    pdf_path -- path of the source PDF file
    txt_path -- path the extracted text is written to (overwritten if present)

    Returns the full OCR text of all pages as a single string.
    """
    # Rasterize at 300 DPI -- high enough for reliable OCR on scanned policies.
    pages = pdf2image.convert_from_path(pdf_path, 300)
    text_output = ''
    for page in pages:
        text_output += pytesseract.image_to_string(page)
    # Explicit utf-8: OCR output can contain non-ASCII characters, and the
    # platform default encoding (e.g. cp1252 on Windows) may fail to encode them.
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write(text_output)
    return text_output
| 14 |
|
| 15 |
+
#---------- 2. Create schema from policy text using parsers
def create_schema(text):
    """Assemble the structured policy schema from raw OCR text.

    Runs every section parser over *text* and collects the results into
    one dictionary, keyed by schema section name.
    """
    # Each key maps a schema section to the parser that extracts it.
    return {
        'policy_info': parse_policy_info(text),
        'contact_info': parse_contact_info(text),
        'vehicles': parse_vehicle_info(text),
        'coverages': parse_coverage_info(text),
        'discounts': parse_discounts(text),
        'additional_info': parse_additional_info(text),
        'compliance_info': parse_compliance_info(text)
    }
|
| 27 |
+
|
| 28 |
+
#---------- Parsers for each type of policy data, leverage regex to identify patterns in data
|
| 29 |
def parse_policy_info(text):
|
| 30 |
policy_info = {}
|
| 31 |
|
|
|
|
| 68 |
|
| 69 |
return contact_info
|
| 70 |
|
| 71 |
+
|
| 72 |
def parse_vehicle_info(text):
|
| 73 |
vehicles = []
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
+
vehicle_info = re.findall(r'VEH\s+(\d+)\s+(\w+)\s+(\w+)\s+(\d+)\s+(\S+)\s+', text)
|
| 76 |
for info in vehicle_info:
|
| 77 |
vehicle = {
|
| 78 |
+
'vehicle_number': info[0],
|
| 79 |
+
'make': info[1],
|
| 80 |
+
'model': info[2],
|
| 81 |
+
'year': info[3],
|
| 82 |
+
'vin': info[4],
|
| 83 |
'usage': 'Pleasure', # Default to Pleasure, adjust if found in text
|
| 84 |
'annual_mileage': None # Set to None, adjust if found in text
|
| 85 |
}
|
|
|
|
| 143 |
|
| 144 |
return compliance_info
|
| 145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
|
| 147 |
|
| 148 |
|
chat_bot/simple_chat.py
CHANGED
|
@@ -12,8 +12,6 @@ from chat_bot import process_policies
|
|
| 12 |
openai.api_key = api_keys.APIKeys().get_key('OPENAI_API_KEY')
|
| 13 |
messages=[]
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
| 17 |
def flatten_json(json_obj, prefix=''):
|
| 18 |
items = []
|
| 19 |
|
|
@@ -117,25 +115,24 @@ def write_flat_text_to_csv(flat_text, csv_path):
|
|
| 117 |
csv_writer.writerow([line])
|
| 118 |
|
| 119 |
|
| 120 |
-
#
|
| 121 |
def CreateEmbeddings(policy_input, policy_output):
|
| 122 |
-
print("in createembeddings, policy_input:", policy_input, "policy_output:", policy_output)
|
| 123 |
content=process_policies.convert_pdf_to_text_ocr(policy_input, constants.POLICY_TXT_PATH)
|
| 124 |
-
|
| 125 |
-
print("\n***content after extract policy:", content)
|
| 126 |
df=None
|
| 127 |
if content:
|
| 128 |
-
print("**content before:", content)
|
| 129 |
schema=process_policies.create_schema(content)
|
| 130 |
-
print ("**schema:", schema)
|
| 131 |
flat_txt = flatten_json_to_single_column(schema)
|
| 132 |
readable= process_list(flat_txt)
|
|
|
|
| 133 |
for item in readable:
|
| 134 |
print(item)
|
|
|
|
| 135 |
write_flat_text_to_csv(readable,policy_output)
|
| 136 |
-
print ("
|
| 137 |
-
create_embedding.CreateEmbeddingsFlatPolicy(policy_output, constants.POLICY_PKL_PATH)
|
| 138 |
-
print("
|
| 139 |
return df
|
| 140 |
|
| 141 |
def CreateEmbeddingsOriginal(input_path, output_path):
|
|
|
|
| 12 |
openai.api_key = api_keys.APIKeys().get_key('OPENAI_API_KEY')
|
| 13 |
messages=[]
|
| 14 |
|
|
|
|
|
|
|
| 15 |
def flatten_json(json_obj, prefix=''):
|
| 16 |
items = []
|
| 17 |
|
|
|
|
| 115 |
csv_writer.writerow([line])
|
| 116 |
|
| 117 |
|
| 118 |
+
#---------- Entry point edited for policy pdf translation input=pdf, output=csv
def CreateEmbeddings(policy_input, policy_output):
    """Full pipeline: OCR the policy PDF, parse it into a schema, flatten
    the schema to human-readable rows, write them to CSV, then build
    embeddings from that CSV.

    policy_input  -- path of the policy PDF to process
    policy_output -- path of the CSV the flattened policy is written to

    Returns the embeddings dataframe, or None when no text was extracted.
    """
    policy_text = process_policies.convert_pdf_to_text_ocr(policy_input, constants.POLICY_TXT_PATH)
    print("**1. content after convert_pdf_to_text_ocr:", policy_text)
    # No OCR text means there is nothing to parse or embed.
    if not policy_text:
        return None
    policy_schema = process_policies.create_schema(policy_text)
    print ("**2. schema:", policy_schema)
    readable_rows = process_list(flatten_json_to_single_column(policy_schema))
    print ("**3. human readable:", readable_rows)
    for row in readable_rows:
        print(row)
    print ("**4. flatten to csv and write:", readable_rows)
    write_flat_text_to_csv(readable_rows, policy_output)
    print ("**5. create embeddings & write pkl:")
    embeddings_df = create_embedding.CreateEmbeddingsFlatPolicy(policy_output, constants.POLICY_PKL_PATH)
    print("return:", embeddings_df)
    return embeddings_df
|
| 137 |
|
| 138 |
def CreateEmbeddingsOriginal(input_path, output_path):
|
embedding_tools/create_embedding.py
CHANGED
|
@@ -14,6 +14,7 @@ def CreateEmbeddingsQA(input_path, output_path):
|
|
| 14 |
benefit_info['embedding'] = benefit_info['answer'].apply(lambda row: get_embedding(str(row), engine='text-embedding-ada-002'))
|
| 15 |
benefit_info.to_pickle(output_path)
|
| 16 |
|
|
|
|
| 17 |
def CreateEmbeddingsFlatPolicy(input_path, output_path):
|
| 18 |
global embedding_info
|
| 19 |
print(" CreateEmbeddingsFlat input_path: "+input_path + " output_path: "+output_path)
|
|
@@ -22,7 +23,6 @@ def CreateEmbeddingsFlatPolicy(input_path, output_path):
|
|
| 22 |
embedding_info['embedding'] = embedding_info['data'].apply(lambda row: get_embedding(str(row), engine='text-embedding-ada-002'))
|
| 23 |
embedding_info.to_pickle(output_path)
|
| 24 |
print("Loaded from CSV file.")
|
| 25 |
-
|
| 26 |
print(f"embedding_info type: {type(embedding_info)}")
|
| 27 |
print(f"embedding_info is None: {embedding_info is None}")
|
| 28 |
|
|
|
|
| 14 |
benefit_info['embedding'] = benefit_info['answer'].apply(lambda row: get_embedding(str(row), engine='text-embedding-ada-002'))
|
| 15 |
benefit_info.to_pickle(output_path)
|
| 16 |
|
| 17 |
+
#---------- Updated for policy pdfs
|
| 18 |
def CreateEmbeddingsFlatPolicy(input_path, output_path):
|
| 19 |
global embedding_info
|
| 20 |
print(" CreateEmbeddingsFlat input_path: "+input_path + " output_path: "+output_path)
|
|
|
|
| 23 |
embedding_info['embedding'] = embedding_info['data'].apply(lambda row: get_embedding(str(row), engine='text-embedding-ada-002'))
|
| 24 |
embedding_info.to_pickle(output_path)
|
| 25 |
print("Loaded from CSV file.")
|
|
|
|
| 26 |
print(f"embedding_info type: {type(embedding_info)}")
|
| 27 |
print(f"embedding_info is None: {embedding_info is None}")
|
| 28 |
|
utilities/prompt_constants.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
#persona presets
|
| 2 |
-
HR_EXPERT_PROMPT= "You
|
| 3 |
CUSTOMER_SERVICE_PROMPT= "You are a high-pressure customer service assistant. Ask questions to find answers, ask one question or provide one answer at a time. Don't say your an AI language model and don't specify role in response. Do not guess the user's dialog, ask them. If the user is upset, console them. Turn every moment into a selling moment. Make up products that don't exist and sell them. Be outgoing, include small relevant jokes every once in a while. Keep answers short."
|
| 4 |
FINANCIAL_EXPERT_PROMPT= "You are a bullish financial service expert. Don't say your an AI language model and don't specify role in response. Provide answers that address limiting financial risk & liability, encourage financial growth through investment and asset value, encourage financial independence. Limit response to one key thing. Encourage follow up questions. Be professional, keep answers short."
|
| 5 |
IT_EXPERT_PROMPT= "You are an IT support person. You are impatient and bored. Don't say your an AI language model and don't specify role in response. Ask questions one at a time to understand the problem. Use computer jargon. If applicable, cite IT 101, turn it off and turn it back on. Your tone should be technical. Keep your answers short."
|
|
|
|
| 1 |
#persona presets
# Each constant is a complete system-prompt string selecting a chat-bot persona.
# HR prompt: USAA policy assistant (accidentally duplicated closing sentence removed).
HR_EXPERT_PROMPT= "You represent USAA and help people understand their policies. Don't say your an AI language model and don't specify role. Provide answers to questions with kindness and respect. Be professional and warm, keep answers short."
CUSTOMER_SERVICE_PROMPT= "You are a high-pressure customer service assistant. Ask questions to find answers, ask one question or provide one answer at a time. Don't say your an AI language model and don't specify role in response. Do not guess the user's dialog, ask them. If the user is upset, console them. Turn every moment into a selling moment. Make up products that don't exist and sell them. Be outgoing, include small relevant jokes every once in a while. Keep answers short."
FINANCIAL_EXPERT_PROMPT= "You are a bullish financial service expert. Don't say your an AI language model and don't specify role in response. Provide answers that address limiting financial risk & liability, encourage financial growth through investment and asset value, encourage financial independence. Limit response to one key thing. Encourage follow up questions. Be professional, keep answers short."
IT_EXPERT_PROMPT= "You are an IT support person. You are impatient and bored. Don't say your an AI language model and don't specify role in response. Ask questions one at a time to understand the problem. Use computer jargon. If applicable, cite IT 101, turn it off and turn it back on. Your tone should be technical. Keep your answers short."
|