|
|
import PyPDF2 |
|
|
from docx import Document |
|
|
import io |
|
|
from langchain.embeddings.openai import OpenAIEmbeddings |
|
|
from langchain.text_splitter import CharacterTextSplitter |
|
|
from typing_extensions import Concatenate |
|
|
from typing import List |
|
|
|
|
|
from langchain_community.callbacks import get_openai_callback |
|
|
from langchain.output_parsers import PydanticOutputParser |
|
|
from langchain.prompts import PromptTemplate |
|
|
from langchain_core.pydantic_v1 import BaseModel, Field, validator |
|
|
import os |
|
|
import logging |
|
|
import base64 |
|
|
from langchain_openai import OpenAI |
|
|
import re |
|
|
import json |
|
|
|
|
|
|
|
|
# OpenAI credential looked up in the process environment at import time;
# the value is None when OPENAI_API_KEY is not set.
api_key = os.environ.get('OPENAI_API_KEY')
|
|
|
|
|
# Schema that the LLM's JSON answer for a single receipt is parsed into.
# Every field is declared as ``str``; each description tells the model to
# answer with the literal text 'null' when the value cannot be determined.
# NOTE: documented with comments rather than a class docstring so that the
# generated pydantic schema stays exactly as before.
class Candidate(BaseModel):
    # Primary brand / merchant named on the receipt.
    brand: str = Field(
        description=(
            "Please identify and provide the primary brand name listed on the receipt. "
            "If multiple brand names are present, determine and specify the most prominent or relevant brand associated with the primary transaction on the receipt. "
            "If the brand name is not explicitly mentioned, include any contextual details or indirect indicators that might help in accurately identifying the brand. "
            "Defalut value will be 'null'."
        )
    )

    # Grand total of the order, kept as text.
    total_cost: str = Field(
        description=(
            "Identify and provide the 'Total Order Value' listed on the receipt. "
            "Please specify the exact section where this value is noted, typically labeled as 'Total', 'Total Amount','total' , 'total amount' ,'total cost','Total Cost','Grand total','grand total'. "
            "Include any other labeling variations that might represent the total order value. "
            "If the total order value is not present or cannot be determined, explicitly state 'null' as the response."
            "Rember total cost is always the highest value and it mostly cannot be a single digit value like 2.9 , 5.8 , 5 ,etc."
        )
    )

    # City and state of the purchase (booking origin for travel receipts).
    location: str = Field(
        description=(
            "Please provide the city and state where the purchase was made, as indicated on the receipt. "
            "For travel-related receipts, extract the location from which the booking was initiated, focusing on the booking origin or departure city/state, rather than the destination. "
            "Look for specific details such as the departure airport code, departure city, or the booking location mentioned in the itinerary or booking confirmation section."
            "If no such information is available, or if it remains unclear, clearly mark the response as 'null'"
        )
    )

    # Number of line items in the order, kept as text.
    no_of_items: str = Field(
        description=(
            "Specify the total number of items listed in the order as reflected in the receipt or document. "
            "If the total count of items is not explicitly mentioned or if it cannot be determined from the provided document, please assign and return the value 'null'."
        )
    )

    # One of the coarse purchase categories listed in the description.
    purchase_category: str = Field(
        description=(
            "Identify and specify the purchase category. "
            "Choose from the following predefined categories: fashion, home, travel, food, groceries, hotels, spa, insurance, or others. "
            "If the purchase category is not explicitly stated on the receipt or document, or if it cannot be accurately determined based on the available information, assign and return the value 'null'."
        )
    )

    # One of the five fixed brand-category labels enumerated in the description.
    brand_category: str = Field(description="""Based on the receipt information, use one of the following brand categories strictly:
1. "Fashion, Dress, Personal"
2. "Coffee - Personal"
3. "Food - Personal"
4. "Travel, Roam, Explore"
5. "Shopping, Hunt, Obtain"

If you don't find any brand category then return 'null'.
""")

    # Purchase date as dd-MM-yyyy text (note the capitalised field name).
    Date: str = Field(
        description=(
            "Specify the date of purchase in the format dd-MM-yyyy. "
            "If the date of purchase is not explicitly provided on the receipt or document, or if it cannot be accurately determined, assign the value 'null'. "
            "Ensure the date is formatted correctly as day, month, and year in two digits each."
        )
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LOGGER = logging.getLogger(__name__)

# Instruction text placed before the receipt text in the LLM query.
_RECEIPT_QUERY_HEADER = """
Please structure the extracted data into a JSON format with the following keys:

- Brand - Description = Please identify and provide the primary brand name listed on the receipt. If multiple brand names are present, determine and specify the most prominent or relevant brand associated with the primary transaction on the receipt. If the brand name is not explicitly mentioned, include any contextual details or indirect indicators that might help in accurately identifying the brand.\n
- Total Cost - Description = Identify and provide the 'Total Order Value' listed on the receipt. Please specify the exact section where this value is noted, typically labeled as 'Total', 'Total Amount','total' , 'total amount' ,'total cost','Total Cost','Grand total','grand total'.It is the highest value in the text ,which can be found near the words total , total cost ,etc.\n
- Location - Description = Please provide the city and state where the purchase was made, as indicated on the receipt. For travel-related receipts, extract the location from which the booking was initiated, focusing on the booking origin or departure city/state, rather than the destination. Look for specific details such as the departure airport code, departure city, or the booking location mentioned in the itinerary or booking confirmation section. If no such information is available, or if it remains unclear, clearly mark the response as 'null'\n
- Number of Items - Description = Specify the total number of items listed in the order as reflected in the receipt or document. If the total count of items is not explicitly mentioned or if it cannot be determined from the provided document, please assign and return the value 'null'.\n
- Purchase Category - This will include categories like fashion, home, travel, food, groceries, hotels, spa, insurance, or others. If the purchase category is not explicitly stated on the receipt or document, or if it cannot be accurately determined based on the available information, assign and return the value 'null'.\n
- Brand Category - Description = Based on the receipt information, use one of the following brand categories strictly:
1. "Fashion, Dress, Personal"
2. "Coffee - Personal"
3. "Food - Personal"
4. "Travel, Roam, Explore"
5. "Shopping, Hunt, Obtain"
If you don't find any brand category then return 'null'.

- Date - Description = Specify the date of purchase in the format dd-MM-yyyy. If the date of purchase is not explicitly provided on the receipt or document, or if it cannot be accurately determined, assign the value 'null'. Ensure the date is formatted correctly as day, month, and year in two digits each.\n
Receipt Text:
"""

# Example answer and output constraints placed after the receipt text.
_RECEIPT_QUERY_FOOTER = """
Json Response Example : \n {
"brand": "Burger King",
"total_cost": "145.96",
"location": "Nashik, Maharashtra",
"no_of_items": "2",
"purchase_category": "food",
"brand_category": "Food - Personal",
"Date": "31-12-2023"
} \n

I want to you give me Strictly a only one Json response having above of these keys brand , total_cost , location , no_of_items , purchase_category , brand_category , Date strictly . You should only give a json output Strictly having the keys in the Candidate class Strictly.Don't return null for every value try to analyze the Invoice data throughly and please return values for each key.Only return 1 json response Strictly.Ensure that the response includes only one JSON object representing the extracted data from the receipt text provided.

"""


def _build_receipt_query(raw_text: str) -> str:
    """Sandwich *raw_text* between the extraction instructions and the example."""
    return _RECEIPT_QUERY_HEADER + "\n" + raw_text + "\n" + _RECEIPT_QUERY_FOOTER


def strcuture_document_data(raw_text: str) -> dict:
    """Ask the OpenAI completion model to turn receipt text into structured fields.

    The receipt text is embedded in a fixed instruction prompt, sent to
    ``gpt-3.5-turbo-instruct`` (temperature 0, at most 800 completion tokens)
    and the answer is parsed into a ``Candidate`` model.

    Args:
        raw_text: Plain text extracted from a receipt or invoice.

    Returns:
        A dict with the ``Candidate`` field names as keys (``brand``,
        ``total_cost``, ``location``, ``no_of_items``, ``purchase_category``,
        ``brand_category``, ``Date``). An empty dict is returned when
        *raw_text* is missing/blank or when the model call or the parsing of
        its answer fails; the failure is logged, never raised.
    """
    # Nothing to analyse: skip the API call instead of letting the model
    # invent values for an empty receipt.
    if not isinstance(raw_text, str) or not raw_text.strip():
        _LOGGER.warning("No receipt text supplied; skipping structured extraction")
        return {}

    try:
        model = OpenAI(
            model_name="gpt-3.5-turbo-instruct",
            temperature=0.0,
            max_tokens=800,
        )
        parser = PydanticOutputParser(pydantic_object=Candidate)

        # The query is passed as a template *variable*, so braces inside the
        # receipt text or the JSON example are not treated as placeholders.
        prompt = PromptTemplate(
            template="Answer the user query.\n{query}\n",
            input_variables=["query"],
        )
        prompt_text = prompt.format_prompt(
            query=_build_receipt_query(raw_text)
        ).to_string()

        with get_openai_callback() as usage:
            # ``invoke`` replaces the deprecated direct call ``model(...)``.
            completion = model.invoke(prompt_text)
        _LOGGER.debug(
            "OpenAI usage: %s tokens, cost %s",
            usage.total_tokens,
            usage.total_cost,
        )

        # ``.dict()`` hands back a fresh dict rather than the model instance's
        # own ``__dict__``, so callers cannot mutate the parsed object.
        structured = parser.parse(completion).dict()
        _LOGGER.debug("Structured receipt data: %s", structured)
        return structured
    except Exception as exc:  # boundary: callers expect {} on any failure
        _LOGGER.exception("Error occurred: %s", exc)
        return {}
|
|
|
|
|
|
|
|
def extract_json_from_string(input_string):
    """Collect every JSON object embedded in a piece of free text.

    The text is scanned for ``{`` characters and a JSON decode is attempted at
    each one, so objects that span several lines or contain nested objects are
    recovered whole. Brace-delimited fragments that are not valid JSON are
    skipped instead of aborting the whole extraction.

    Args:
        input_string: Text that may contain one or more JSON objects.

    Returns:
        A list of the decoded objects in order of appearance, or ``None`` when
        the text is empty or contains no decodable JSON object.
    """
    if not input_string:
        return None

    decoder = json.JSONDecoder()
    decoded_objects = []
    start = input_string.find("{")
    while start != -1:
        try:
            obj, end = decoder.raw_decode(input_string, start)
        except json.JSONDecodeError:
            # Not valid JSON from this brace; try the next candidate.
            start = input_string.find("{", start + 1)
            continue
        decoded_objects.append(obj)
        # Resume after the decoded object so nested objects are not
        # reported a second time on their own.
        start = input_string.find("{", end)

    return decoded_objects or None
|
|
|
|
|
def extract_text_from_pdf(pdf_data):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_data: Raw bytes of the PDF document.

    Returns:
        A single string with the extracted text of all pages; no separator is
        inserted between pages.
    """
    with io.BytesIO(pdf_data) as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # ``or ""`` keeps a page that yields no text (e.g. a None result for
        # an image-only page) from breaking the concatenation.
        return "".join(page.extract_text() or "" for page in reader.pages)
|
|
|
|
|
def extract_text_from_docx(docx_data):
    """Return the paragraph text of a .docx file, one paragraph per line.

    Args:
        docx_data: Raw bytes of the .docx document.

    Returns:
        The text of each paragraph in ``doc.paragraphs``, each followed by a
        newline character.
    """
    document = Document(io.BytesIO(docx_data))
    return "".join(f"{paragraph.text}\n" for paragraph in document.paragraphs)
|
|
|
|
|
def extract_text_from_attachment(filename, data):
    """Decode a base64url attachment and extract its text by file type.

    The file type is taken from the filename extension, compared
    case-insensitively so that ``RECEIPT.PDF`` is handled like
    ``receipt.pdf``.

    Args:
        filename: Name of the attachment; only its extension is inspected.
        data: URL-safe base64 encoded content of the attachment.

    Returns:
        The extracted text for ``.pdf`` and ``.docx`` attachments, or the
        string ``"Unsupported document type"`` for anything else (including a
        missing filename).

    Raises:
        binascii.Error: If *data* is not valid base64 for a supported type.
    """
    normalized_name = (filename or "").lower()
    if not normalized_name.endswith((".pdf", ".docx")):
        return "Unsupported document type"

    # Decode once, only after the type is known to be supported.
    payload = base64.urlsafe_b64decode(data)
    if normalized_name.endswith(".pdf"):
        return extract_text_from_pdf(payload)
    return extract_text_from_docx(payload)
|
|
|
|
|
|
|
|
|