Spaces:
Sleeping
Sleeping
Commit ·
317211f
1
Parent(s): 6db4a81
Initial commit
Browse files- .gitattributes +2 -0
- .gitignore +3 -0
- app.py +10 -0
- categories/__init__.py +197 -0
- categories/accomodation/__init__.py +41 -0
- categories/accomodation/model.py +29 -0
- categories/random_/__init__.py +128 -0
- categories/random_/model.py +82 -0
- categories/travel_cab/__init__.py +37 -0
- categories/travel_cab/model.py +19 -0
- categories/travel_flight/__init__.py +23 -0
- categories/travel_flight/model.py +30 -0
- categories/vendor/__init__.py +38 -0
- categories/vendor/model.py +46 -0
- examples/example1.pdf +3 -0
- examples/rotated.jpeg +3 -0
- examples/rotated.pdf +3 -0
- examples/upright.jpeg +3 -0
- examples/upright.pdf +3 -0
- extract.py +67 -0
- main.py +61 -0
- packages.txt +1 -0
- processing.py +171 -0
- requirements.txt +346 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.conda
|
| 2 |
+
temp*
|
| 3 |
+
__pycache__/
|
app.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
|
| 3 |
+
# Page heading for the demo UI.
st.title("Automatic Reimbursement Tool Demo")

# Two-column page layout; only the left ("Input") column is populated here.
page = st.container()
with page:
    col1, col2 = st.columns(2)

    with col1:
        st.header("Input")
        st.file_uploader(
            "Upload a PDF file or an image",
            type=["pdf", "png", "jpg", "jpeg"],
        )
|
categories/__init__.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from enum import Enum
|
| 2 |
+
|
| 3 |
+
from . import random_
|
| 4 |
+
from . import accomodation
|
| 5 |
+
from . import travel_cab
|
| 6 |
+
from . import travel_flight
|
| 7 |
+
|
| 8 |
+
# from . import vendor
|
| 9 |
+
from langchain.chains import LLMChain
|
| 10 |
+
from langchain.chat_models import ChatOpenAI
|
| 11 |
+
from langchain.output_parsers import PydanticOutputParser
|
| 12 |
+
from langchain.output_parsers.enum import EnumOutputParser
|
| 13 |
+
from langchain.prompts import (
|
| 14 |
+
ChatPromptTemplate,
|
| 15 |
+
HumanMessagePromptTemplate,
|
| 16 |
+
SystemMessagePromptTemplate,
|
| 17 |
+
)
|
| 18 |
+
from pydantic import BaseModel
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class Category(Enum):
    """Bill categories a document can be classified into.

    Members are produced by ``category_parser`` (an ``EnumOutputParser`` over
    this enum) and are used as the keys of ``category_modules``.
    """

    ACCOMODATION = "accomodation"
    TRAVEL_FLIGHT = "travel_flight"
    TRAVEL_CAB = "travel_cab"
    # VENDOR = "vendor"  (disabled together with the vendor module import)
    RANDOM = "random"
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# Maps each category to the sub-package that exposes its extraction ``chain``
# and ``output_parser`` (both are read by ``run_category_chain``).
category_modules = {
    Category.ACCOMODATION: accomodation,
    Category.TRAVEL_FLIGHT: travel_flight,
    Category.TRAVEL_CAB: travel_cab,
    # Category.VENDOR: vendor,
    Category.RANDOM: random_,
}

# Chat model used only for classification; temperature 0 keeps the answer
# stable across runs.
model = ChatOpenAI(
    temperature=0,
    n=1,
    # max_tokens=300,
    model_kwargs={
        "stop": None,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    },
)

# Build categorizing chain
# The category names spelled out in the prompt must match the values of
# ``Category`` exactly, because the answer is parsed by ``EnumOutputParser``;
# an answer such as plain "travel" cannot be mapped to any member.
system_message_prompt = SystemMessagePromptTemplate.from_template(
    "You are a classifier that, given a bill's text, states what type of bill "
    "category it belongs to: accomodation (bills regarding stays), travel_cab (bills "
    "concerning cab or other land rides), travel_flight (bills concerning flights), "
    "random (bills concerning deliveries from e-commerce websites like amazon etc) "
    "bills.\n"
    "You may want to see if there are Room Details, Check-in/Check-out Date for "
    "accomodation; Flight Details for travel_flight; Train Details, Bus Details, "
    "Cab Details for travel_cab; anything else (including conference or vendor "
    "invoices) comes under the random category. Your answers must be only the "
    "appropriate choice e.g. 'option' and "
    "not 'The given bill belongs to the option category.'\n"
    "{format_instructions}"
)
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)
# Turns the raw model answer into a ``Category`` member.
category_parser = EnumOutputParser(enum=Category)
categorize_chain = LLMChain(
    llm=model, prompt=chat_prompt, output_parser=category_parser
)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def categorize_text(text: str) -> Category:
    """Categorizes the text into one of the categories defined in Category by
    running the module-level ``categorize_chain``.

    Args:
        text(str): The text to categorize.

    Returns: The category of the text.
    """
    format_instructions = category_parser.get_format_instructions()
    return categorize_chain.run(text=text, format_instructions=format_instructions)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def run_category_chain(category: Category, text: str) -> BaseModel | None:
    """Runs the extraction chain registered for ``category`` on ``text``.

    Args:
        category(Category): The category for which the chain is to be run.
        text(str): The text on which the chain is to be run.

    Returns: The output of the chain, or None when running the chain raised
        (the error is printed, not re-raised).

    Raises:
        KeyError: If ``category`` has no entry in ``category_modules``.
    """
    module = category_modules[category]
    output_parser = module.output_parser
    try:
        result = module.chain.run(
            text=text,
            format_instructions=output_parser.get_format_instructions(),
        )
    except Exception as e:
        # Best-effort: report the failure and let the caller handle None.
        print("Error in running chain for category", category, ":", e)
        return None
    return result
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
if __name__ == "__main__":
|
| 105 |
+
text = """amazonin
|
| 106 |
+
we)
|
| 107 |
+
|
| 108 |
+
Sold By :
|
| 109 |
+
|
| 110 |
+
Spigen India Pvt. Ltd.
|
| 111 |
+
|
| 112 |
+
* Rect/Killa Nos. 38//8/2 min, 192//22/1,196//2/1/1,
|
| 113 |
+
37//15/1, 15/2,, Adjacent to Starex School, Village
|
| 114 |
+
- Binola, National Highway -8, Tehsil - Manesar
|
| 115 |
+
Gurgaon, Haryana, 122413
|
| 116 |
+
|
| 117 |
+
IN
|
| 118 |
+
|
| 119 |
+
PAN No: ABACS5056L
|
| 120 |
+
GST Registration No: O6ABACS5056L12Z5
|
| 121 |
+
|
| 122 |
+
Order Number: 407-5335982-7837125
|
| 123 |
+
Order Date: 30.05.2023
|
| 124 |
+
|
| 125 |
+
Tax Invoice/Bill of Supply/Cash Memo
|
| 126 |
+
(Original for Recipient)
|
| 127 |
+
|
| 128 |
+
Billing Address :
|
| 129 |
+
|
| 130 |
+
Praveen Bohra
|
| 131 |
+
|
| 132 |
+
E-303, ParkView City 2, Sector 49, Sohna Road
|
| 133 |
+
GURGAON, HARYANA, 122018
|
| 134 |
+
|
| 135 |
+
IN
|
| 136 |
+
|
| 137 |
+
State/UT Code: 06
|
| 138 |
+
|
| 139 |
+
Shipping Address :
|
| 140 |
+
|
| 141 |
+
Praveen Bohra
|
| 142 |
+
|
| 143 |
+
Praveen Bohra
|
| 144 |
+
|
| 145 |
+
E-303, ParkView City 2, Sector 49, Sohna Road
|
| 146 |
+
GURGAON, HARYANA, 122018
|
| 147 |
+
|
| 148 |
+
IN
|
| 149 |
+
|
| 150 |
+
State/UT Code: 06
|
| 151 |
+
|
| 152 |
+
Place of supply: HARYANA
|
| 153 |
+
|
| 154 |
+
Place of delivery: HARYANA
|
| 155 |
+
|
| 156 |
+
Invoice Number : DEL5-21033
|
| 157 |
+
Invoice Details : HR-DEL5-918080915-2324
|
| 158 |
+
Invoice Date : 30.05.2023
|
| 159 |
+
|
| 160 |
+
Description at Tax |Tax /|Tax Total
|
| 161 |
+
p y Rate |Type |Amount|Amount
|
| 162 |
+
|
| 163 |
+
Black) | BO8BHLZHBH ( ACS01744INP )
|
| 164 |
+
HSN:39269099
|
| 165 |
+
|
| 166 |
+
1 |Spigen Liquid Air Back Cover Case for iPhone 12 Mini (TPU | Matte
|
| 167 |
+
1846.62] 1 |%846.62| 9% |CGST! %76.19 |%999.00
|
| 168 |
+
9% |SGST| %76.19
|
| 169 |
+
|
| 170 |
+
TOTAL:
|
| 171 |
+
|
| 172 |
+
Amount in Words:
|
| 173 |
+
Nine Hundred Ninety-nine only
|
| 174 |
+
|
| 175 |
+
Whether tax is payable under reverse charge - No
|
| 176 |
+
|
| 177 |
+
For Spigen India Pvt. Ltd.:
|
| 178 |
+
sSoigenrn
|
| 179 |
+
|
| 180 |
+
Authorized Signatory
|
| 181 |
+
|
| 182 |
+
Payment Transaction ID: Date & Time: 30/05/2023, 10:48:43 Invoice Value: Mode of Payment: Credit
|
| 183 |
+
2rs9ZEF8BwU9VmWiCc2Us hrs 999.00 Card
|
| 184 |
+
|
| 185 |
+
*ASSPL-Amazon Seller Services Pvt. Ltd., ARIPL-Amazon Retail India Pvt. Ltd. (only where Amazon Retail India Pvt. Ltd. fulfillment center is co-located)
|
| 186 |
+
|
| 187 |
+
Customers desirous of availing input GST credit are requested to create a Business account and purchase on Amazon.in/business from Business eligible offers
|
| 188 |
+
|
| 189 |
+
Please note that this invoice is not a demand for payment
|
| 190 |
+
|
| 191 |
+
Page 1 of 1"""
|
| 192 |
+
category = categorize_text(text)
|
| 193 |
+
print("Category:", category)
|
| 194 |
+
|
| 195 |
+
print("\n\n")
|
| 196 |
+
result = run_category_chain(category, text)
|
| 197 |
+
print(result)
|
categories/accomodation/__init__.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .model import InformationExtractedFromABillReceipt as PydanticModel
|
| 2 |
+
|
| 3 |
+
from langchain.chains import LLMChain
|
| 4 |
+
from langchain.chat_models import ChatOpenAI
|
| 5 |
+
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
|
| 6 |
+
from langchain.prompts import (
|
| 7 |
+
ChatPromptTemplate,
|
| 8 |
+
HumanMessagePromptTemplate,
|
| 9 |
+
SystemMessagePromptTemplate,
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
# Chat model for the accommodation extraction chain.
# NOTE(review): temperature is 0.6 here while the other category chains in
# this commit use 0 — confirm the extra sampling randomness is intended.
# NOTE(review): max_tokens=300 caps the reply length; verify that a full
# JSON answer for all fields of the model fits.
model = ChatOpenAI(
    temperature=0.6,
    max_tokens=300,
    n=1,
    request_timeout=None,
    model_kwargs={
        'stop': None,
        'top_p': 1,
    }
)

# Build category chain
# System prompt: task description followed by the parser's format instructions.
system_message_prompt = SystemMessagePromptTemplate.from_template(
    "You are tasked with developing an OCR data extraction system for hotel bills in PDF "
    "format given as text. The system should extract important information necessary for "
    "the reimbursement process from a college. Your prompt should fetch the following "
    "essential details from the hotel bill: hotel name, address, bill number/invoice "
    "number, booking ID / confirmation ID / booking number, check-in date and time, "
    "check-out date and time, total amount, booking platform, bill date.\n"
    "Ensure that the system accurately extracts the above information from the OCR text "
    "of the hotel bill.\n"
    "{format_instructions}"
)
# Human message carries the bill text unchanged.
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)
# ``output_parser`` validates against the pydantic model; ``fixing_parser``
# wraps it with the same LLM and is the parser attached to the chain.
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
|
categories/accomodation/model.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
|
| 5 |
+
from pydantic import BaseModel, Field
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class InformationExtractedFromABillReceipt(BaseModel):
    """
    1. Hotel Name: [Hotel Name]
    2. Address: [Hotel Address]
    3. Bill number/Invoice number: [Bill Number]
    4. booking ID / Confirmation ID / Booking #: [Booking ID]
    5. Check-in Date and Time: [Check-in Date Time]
    6. Check-out Date and Time: [Check-out Date Time]
    7. Total Amount: [Total Amount Charged]
    8. Booking platform: [Booking Platform]
    9. Bill date: [Bill Date]
    """

    # All fields are required (``...``); the titles describe each field.
    # NOTE(review): ``hostel_name`` looks like a typo for ``hotel_name`` (its
    # title and the docstring both say "hotel"). Renaming changes the schema
    # key and the attribute name — confirm no consumer relies on it first.
    hostel_name: str = Field(..., title="The name of the hotel")
    address: str = Field(..., title="The address of the hotel")
    bill_number: str = Field(..., title="The bill number/invoice number")
    booking_id: str = Field(..., title="The booking ID/confirmation ID/booking number")
    check_in_date_time: datetime = Field(..., title="The check-in date and time")
    check_out_date_time: datetime = Field(..., title="The check-out date and time")
    total_amount_charged: float = Field(..., title="The total amount charged")
    booking_platform: str = Field(..., title="The booking platform")
    bill_date: datetime = Field(..., title="The bill date")
|
categories/random_/__init__.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .model import InformationExtractedFromABillReceipt as PydanticModel
|
| 2 |
+
|
| 3 |
+
from langchain.chains import LLMChain
|
| 4 |
+
from langchain.chat_models import ChatOpenAI
|
| 5 |
+
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
|
| 6 |
+
from langchain.prompts import (
|
| 7 |
+
ChatPromptTemplate,
|
| 8 |
+
HumanMessagePromptTemplate,
|
| 9 |
+
SystemMessagePromptTemplate,
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
# Chat model for the generic ("random") invoice extraction chain;
# temperature 0 with neutral sampling penalties.
model = ChatOpenAI(
    temperature=0,
    n=1,
    model_kwargs={
        'stop': None,
        'top_p': 1,
        'frequency_penalty': 0,
        'presence_penalty': 0,
    }
)

# Build category chain
# System prompt: extraction instructions followed by the parser's format
# instructions; the human message carries the OCR text unchanged.
system_message_prompt = SystemMessagePromptTemplate.from_template(
    "You are an information extraction engine that outputs details from OCR processed "
    "documents like uids, total, tax, name, currency, date, seller details, summary. You "
    "may use context to make an educated guess about the currency. Use null if you are "
    "unable to find certain details\n"
    "{format_instructions}"
)
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)
# ``output_parser`` validates against the pydantic model; ``fixing_parser``
# wraps it with the same LLM and is the parser attached to the chain.
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
|
| 38 |
+
|
| 39 |
+
if __name__ == "__main__":
|
| 40 |
+
text = """amazonin
|
| 41 |
+
we)
|
| 42 |
+
|
| 43 |
+
Sold By :
|
| 44 |
+
|
| 45 |
+
Spigen India Pvt. Ltd.
|
| 46 |
+
|
| 47 |
+
* Rect/Killa Nos. 38//8/2 min, 192//22/1,196//2/1/1,
|
| 48 |
+
37//15/1, 15/2,, Adjacent to Starex School, Village
|
| 49 |
+
- Binola, National Highway -8, Tehsil - Manesar
|
| 50 |
+
Gurgaon, Haryana, 122413
|
| 51 |
+
|
| 52 |
+
IN
|
| 53 |
+
|
| 54 |
+
PAN No: ABACS5056L
|
| 55 |
+
GST Registration No: O6ABACS5056L12Z5
|
| 56 |
+
|
| 57 |
+
Order Number: 407-5335982-7837125
|
| 58 |
+
Order Date: 30.05.2023
|
| 59 |
+
|
| 60 |
+
Tax Invoice/Bill of Supply/Cash Memo
|
| 61 |
+
(Original for Recipient)
|
| 62 |
+
|
| 63 |
+
Billing Address :
|
| 64 |
+
|
| 65 |
+
Praveen Bohra
|
| 66 |
+
|
| 67 |
+
E-303, ParkView City 2, Sector 49, Sohna Road
|
| 68 |
+
GURGAON, HARYANA, 122018
|
| 69 |
+
|
| 70 |
+
IN
|
| 71 |
+
|
| 72 |
+
State/UT Code: 06
|
| 73 |
+
|
| 74 |
+
Shipping Address :
|
| 75 |
+
|
| 76 |
+
Praveen Bohra
|
| 77 |
+
|
| 78 |
+
Praveen Bohra
|
| 79 |
+
|
| 80 |
+
E-303, ParkView City 2, Sector 49, Sohna Road
|
| 81 |
+
GURGAON, HARYANA, 122018
|
| 82 |
+
|
| 83 |
+
IN
|
| 84 |
+
|
| 85 |
+
State/UT Code: 06
|
| 86 |
+
|
| 87 |
+
Place of supply: HARYANA
|
| 88 |
+
|
| 89 |
+
Place of delivery: HARYANA
|
| 90 |
+
|
| 91 |
+
Invoice Number : DEL5-21033
|
| 92 |
+
Invoice Details : HR-DEL5-918080915-2324
|
| 93 |
+
Invoice Date : 30.05.2023
|
| 94 |
+
|
| 95 |
+
Description at Tax |Tax /|Tax Total
|
| 96 |
+
p y Rate |Type |Amount|Amount
|
| 97 |
+
|
| 98 |
+
Black) | BO8BHLZHBH ( ACS01744INP )
|
| 99 |
+
HSN:39269099
|
| 100 |
+
|
| 101 |
+
1 |Spigen Liquid Air Back Cover Case for iPhone 12 Mini (TPU | Matte
|
| 102 |
+
1846.62] 1 |%846.62| 9% |CGST! %76.19 |%999.00
|
| 103 |
+
9% |SGST| %76.19
|
| 104 |
+
|
| 105 |
+
TOTAL:
|
| 106 |
+
|
| 107 |
+
Amount in Words:
|
| 108 |
+
Nine Hundred Ninety-nine only
|
| 109 |
+
|
| 110 |
+
Whether tax is payable under reverse charge - No
|
| 111 |
+
|
| 112 |
+
For Spigen India Pvt. Ltd.:
|
| 113 |
+
sSoigenrn
|
| 114 |
+
|
| 115 |
+
Authorized Signatory
|
| 116 |
+
|
| 117 |
+
Payment Transaction ID: Date & Time: 30/05/2023, 10:48:43 Invoice Value: Mode of Payment: Credit
|
| 118 |
+
2rs9ZEF8BwU9VmWiCc2Us hrs 999.00 Card
|
| 119 |
+
|
| 120 |
+
*ASSPL-Amazon Seller Services Pvt. Ltd., ARIPL-Amazon Retail India Pvt. Ltd. (only where Amazon Retail India Pvt. Ltd. fulfillment center is co-located)
|
| 121 |
+
|
| 122 |
+
Customers desirous of availing input GST credit are requested to create a Business account and purchase on Amazon.in/business from Business eligible offers
|
| 123 |
+
|
| 124 |
+
Please note that this invoice is not a demand for payment
|
| 125 |
+
|
| 126 |
+
Page 1 of 1"""
|
| 127 |
+
result = chain.run(text=text, format_instructions=fixing_parser.get_format_instructions())
|
| 128 |
+
print(result.json(indent=4))
|
categories/random_/model.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# generated by datamodel-codegen:
|
| 2 |
+
# filename: schema.json
|
| 3 |
+
# timestamp: 2023-07-28T11:36:16+00:00
|
| 4 |
+
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
from datetime import date
|
| 8 |
+
from typing import Dict, Optional, Union
|
| 9 |
+
|
| 10 |
+
import iso4217
|
| 11 |
+
from pydantic import BaseModel, Field, constr, validator, ValidationError
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# GST variant of the ``tax`` union used by InformationExtractedFromABillReceipt.
class TaxItem(BaseModel):
    gst: float = Field(
        ...,
        title="The total GST tax amount (IGST + CGST + SGST + etc) as a single number",
    )
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# VAT variant of the ``tax`` union used by InformationExtractedFromABillReceipt.
class TaxItem1(BaseModel):
    vat: float = Field(..., title="The total VAT present in the invoice")
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# GST-number variant of ``SellerDetails.tax_number``; the value must be at
# least 15 characters long.
class TaxNumberItem(BaseModel):
    gst_number: constr(min_length=15) = Field(
        ..., title="The alphanumeric GSTIN/GST number code"
    )
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# VAT/TIN-number variant of ``SellerDetails.tax_number``.
class TaxNumberItem1(BaseModel):
    vat_number: str = Field(..., title="The VAT/TIN number present in older invoices")
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# UIN variant of ``SellerDetails.tax_number``.
# NOTE(review): the field is named ``ui_number`` while its title says "UIN" —
# confirm whether ``uin_number`` was intended.
class TaxNumberItem2(BaseModel):
    ui_number: str = Field(..., title="The tax UIN issued to foreign entities")
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# Seller block of an invoice: name, address and contact are optional, while
# the tax number (one of three variants) and the PAN are required.
class SellerDetails(BaseModel):
    name: Optional[str] = None
    address: Optional[str] = None
    contact: Optional[str] = None
    tax_number: Union[TaxNumberItem, TaxNumberItem1, TaxNumberItem2] = Field(
        ..., title="Tax information"
    )
    # Exactly 10 characters (min_length == max_length == 10).
    pan_number: constr(min_length=10, max_length=10) = Field(
        ..., title="The 10-character alphanumeric PAN code"
    )
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# Identifiers found on the invoice: the invoice number plus a free-form
# mapping of any other identifying numbers.
class UIDs(BaseModel):
    invoice_number: str = Field(..., title="The invoice number")
    other_uids: Dict[str, str] = Field(
        ...,
        title="Key-value pairs of uniquely identifying numbers (UIDs) like order number, bill number, payment ID, etc but not the invoice number",
    )
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class InformationExtractedFromABillReceipt(BaseModel):
    # Top-level result of the generic ("random") invoice extraction. Every
    # field is required except ``currency``, which defaults to "INR".
    uids: UIDs = Field(..., title="Invoice number and other UIDs")
    total: float = Field(..., title="Total amount or price")
    tax: Union[TaxItem, TaxItem1] = Field(..., title="The total tax amount")
    name: str = Field(
        ...,
        title="Name of the person/entity that the invoice item was charged or delivered to",
    )
    currency: str = Field(
        default="INR",
        title="The ISO 4217 code for the currency in which the prices in the invoice are (inferred from symbols, names, addresses, etc)",
    )
    date: date = Field(
        ..., title="The date the invoice was issued"
    )
    seller_details: SellerDetails = Field(..., title="Information about the seller")
    summary: str = Field(..., title="5-6 words short summary of purchased good(s)")

    @validator("currency")
    @classmethod
    def check_currency(cls, v: str) -> str:
        """Validate ``currency`` against ISO 4217 and normalise it to upper case.

        Args:
            v: The currency code supplied for the field.

        Returns:
            The currency code in upper case.

        Raises:
            ValueError: If the lower-cased code is not a member name of
                ``iso4217.Currency``. Pydantic converts this into a
                ``ValidationError`` for the model.
        """
        # A validator must raise ValueError/TypeError/AssertionError.
        # Constructing pydantic's ValidationError directly with one string
        # fails with a TypeError instead of reporting the bad code.
        if v.lower() not in iso4217.Currency.__members__:
            raise ValueError(f"{v} is not a valid ISO 4217 currency code")
        return v.upper()
|
categories/travel_cab/__init__.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .model import InformationExtractedFromABillReceipt as PydanticModel
|
| 2 |
+
|
| 3 |
+
from langchain.chains import LLMChain
|
| 4 |
+
from langchain.chat_models import ChatOpenAI
|
| 5 |
+
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
|
| 6 |
+
from langchain.prompts import (
|
| 7 |
+
ChatPromptTemplate,
|
| 8 |
+
HumanMessagePromptTemplate,
|
| 9 |
+
SystemMessagePromptTemplate,
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
# Chat model for the cab / land-travel extraction chain; temperature 0 with
# neutral sampling penalties.
model = ChatOpenAI(
    temperature=0,
    n=1,
    model_kwargs= {
        'stop': None,
        'top_p': 1,
        'frequency_penalty': 0,
        'presence_penalty': 0,
    }
)

# Build category chain (extracts journey details; it does not classify)
system_message_prompt = SystemMessagePromptTemplate.from_template(
    "You are an information extraction engine that outputs details from OCR processed "
    "documents such as date/time/place of departure and arrival.\n"
    "{format_instructions}"
)
# Human message carries the OCR text unchanged.
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)
# ``output_parser`` validates against the pydantic model; ``fixing_parser``
# wraps it with the same LLM and is the parser attached to the chain.
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
chain = LLMChain(
    llm=model, prompt=chat_prompt, output_parser=fixing_parser
)
|
categories/travel_cab/model.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from datetime import date, time
|
| 4 |
+
|
| 5 |
+
from pydantic import BaseModel, Field
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class InformationExtractedFromABillReceipt(BaseModel):
    ''''''

    # Journey details extracted from a cab / land-travel bill; all fields are
    # required.
    # The date hints ask for ISO format (YYYY-MM-DD) because the fields are
    # typed ``date``: pydantic does not parse DD/MM/YYYY strings for that
    # type, so a hint requesting DD/MM/YYYY leads to validation failures.
    place_from: str = Field(..., title="place where journey starts")
    date_from: date = Field(
        ..., title="date on which journey starts (YYYY-MM-DD)"
    )
    time_from: time = Field(..., title="time at which journey starts")
    place_to: str = Field(..., title="place where journey end")
    date_to: date = Field(..., title="date on which journey end (YYYY-MM-DD)")
    time_to: time = Field(..., title="time at which journey end")
    amount: float = Field(..., title="cost of journey ticket")
|
categories/travel_flight/__init__.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .model import InformationExtractedFromABillReceipt as PydanticModel
|
| 2 |
+
|
| 3 |
+
from langchain.chains import LLMChain
|
| 4 |
+
from langchain.chat_models import ChatOpenAI
|
| 5 |
+
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
|
| 6 |
+
from langchain.prompts import (
|
| 7 |
+
ChatPromptTemplate,
|
| 8 |
+
HumanMessagePromptTemplate,
|
| 9 |
+
)
|
| 10 |
+
|
| 11 |
+
# Chat model for the flight extraction chain (temperature 0).
model = ChatOpenAI(temperature=0)

# Build category chain (extracts flight details; it does not classify)
# Unlike the sibling category modules, everything is sent as a single human
# message: instructions, format instructions, then the bill text.
human_message_prompt = HumanMessagePromptTemplate.from_template(
    "Parse through and find the following details from the text extracted from a travel "
    "bill\n"
    "{format_instructions}\n"
    "{text}"
)
chat_prompt = ChatPromptTemplate.from_messages([human_message_prompt])
# ``output_parser`` validates against the pydantic model; ``fixing_parser``
# wraps it with the same LLM and is the parser attached to the chain.
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
|
categories/travel_flight/model.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from datetime import date, time
|
| 4 |
+
|
| 5 |
+
from pydantic import BaseModel, Field
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class InformationExtractedFromABillReceipt(BaseModel):
    """
    response_schemas = [
    ResponseSchema(name="place (from)", description="place where flight starts/takes-off"),
    ResponseSchema(name="date (from)", description="date on which flight starts/takes-off (YYYY-MM-DD)"),
    ResponseSchema(name="time (from)", description="time at which flight starts/takes-off"),
    ResponseSchema(name="place (to)", description="place where flight end/lands"),
    ResponseSchema(name="date (to)", description="date on which flight end/lands (YYYY-MM-DD)"),
    ResponseSchema(name="time (to)", description="time at which flight end/lands"),
    ResponseSchema(name="PNR Number", description ="PNR Number of flight"),
    ResponseSchema(name="amount", description="cost of flight ticket")
    ]"""

    # Flight details extracted from a travel bill; all fields are required.
    # The date hints (here and in the docstring above, which pydantic exposes
    # as the schema description) ask for ISO format (YYYY-MM-DD) because the
    # fields are typed ``date``: pydantic does not parse DD/MM/YYYY strings
    # for that type, so a DD/MM/YYYY hint leads to validation failures.
    place_from: str = Field(..., title="place where flight starts/takes-off")
    date_from: date = Field(
        ..., title="date on which flight starts/takes-off (YYYY-MM-DD)"
    )
    time_from: time = Field(..., title="time at which flight starts/takes-off")
    place_to: str = Field(..., title="place where flight end/lands")
    date_to: date = Field(..., title="date on which flight end/lands (YYYY-MM-DD)")
    time_to: time = Field(..., title="time at which flight end/lands")
    pnr_number: str = Field(..., title="PNR Number of flight")
    amount: float = Field(..., title="cost of flight ticket")
|
categories/vendor/__init__.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .model import InformationExtractedFromABillReceipt as PydanticModel
|
| 2 |
+
|
| 3 |
+
from langchain.chains import LLMChain
|
| 4 |
+
from langchain.chat_models import ChatOpenAI
|
| 5 |
+
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
|
| 6 |
+
from langchain.prompts import (
|
| 7 |
+
ChatPromptTemplate,
|
| 8 |
+
HumanMessagePromptTemplate,
|
| 9 |
+
SystemMessagePromptTemplate,
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
# Chat model for the vendor / conference invoice extraction chain;
# temperature 0 with neutral sampling penalties.
model = ChatOpenAI(
    temperature=0,
    n=1,
    model_kwargs={
        "stop": None,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    },
)

# Build category chain
# The instructions end with a newline so that the parser's format
# instructions start on their own line instead of being glued to
# "...registration details." (the sibling category prompts already do this).
system_message_prompt = SystemMessagePromptTemplate.from_template(
    "You are an information extraction engine that outputs details from OCR processed "
    "documents like uids, total, tax, addresses, bank details, invoice details, "
    "participant registration details.\n"
    "{format_instructions}"
)
# Human message carries the OCR text unchanged.
human_message_prompt = HumanMessagePromptTemplate.from_template("{text}")
chat_prompt = ChatPromptTemplate.from_messages(
    [system_message_prompt, human_message_prompt]
)
# ``output_parser`` validates against the pydantic model; ``fixing_parser``
# wraps it with the same LLM and is the parser attached to the chain.
# (A debug print of the format instructions used to run here on every import
# of the package; it has been removed.)
output_parser = PydanticOutputParser(pydantic_object=PydanticModel)
fixing_parser = OutputFixingParser.from_llm(llm=model, parser=output_parser)
chain = LLMChain(llm=model, prompt=chat_prompt, output_parser=fixing_parser)
|
categories/vendor/model.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# generated by datamodel-codegen:
|
| 2 |
+
# filename: schema.json
|
| 3 |
+
# timestamp: 2023-07-28T11:36:16+00:00
|
| 4 |
+
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
|
| 9 |
+
from pydantic import BaseModel, Field, constr, validator, ValidationError
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Nested model holding the bank account details of an invoice; it is used as the
# type of InformationExtractedFromABillReceipt.bank_details.
# NOTE(review): the class docstring is presumably emitted as the schema
# description shown to the LLM, so it is kept verbatim - confirm before rewording.
class BankDetails(BaseModel):
    """account holder name, bank name, account number, branch, ifs code, swift code"""

    # Every field is declared with `...` as its default (required) and is a plain string.
    account_holder_name: str = Field(..., title="The name of the account holder")
    bank_name: str = Field(..., title="The name of the bank")
    account_number: str = Field(..., title="The account number")
    branch: str = Field(..., title="The branch of the bank")
    ifs_code: str = Field(..., title="The IFS code of the bank")
    swift_code: str = Field(..., title="The SWIFT code of the bank")
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# Top-level model for the vendor category: the structured fields extracted from
# a vendor bill/receipt. All fields are declared with `...` as default (required).
# NOTE(review): the class docstring is presumably emitted as the schema
# description shown to the LLM, so it is kept verbatim - confirm before rewording.
class InformationExtractedFromABillReceipt(BaseModel):
    """
    GSTIN, billing address, invoice number, invoice date, due date, total, balance due,
    bank details: (account holder name, bank name, account number, branch, ifs code, swift
    code), recipient, registration id, registration fee, registration date/time
    """

    # Only a minimum length of 15 characters is enforced; no upper bound or pattern.
    gstin: constr(min_length=15) = Field(
        ..., title="The alphanumeric GSTIN/GST number code"
    )
    billing_address: str = Field(..., title="The billing address")
    invoice_number: str = Field(..., title="The invoice number")
    # Dates are typed as full datetimes rather than dates.
    invoice_date: datetime = Field(..., title="The date-time the invoice was issued")
    due_date: datetime = Field(..., title="The date-time the invoice is due")
    total: float = Field(..., title="Total amount or price")
    balance_due: float = Field(..., title="The amount due")
    # Nested BankDetails model (defined earlier in this module).
    bank_details: BankDetails = Field(..., title="Bank details")
    recipient: str = Field(
        ...,
        title="Name of the person/entity that the invoice item was charged or delivered to",
    )
    registration_id: str = Field(..., title="The registration ID")
    registration_fee: float = Field(..., title="The registration fee")
    registration_date_time: datetime = Field(..., title="The registration date-time")
|
examples/example1.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3a0afab196c55afe47c6716d242a0ef1c3352c596eb717759e5c6b40f5240e8b
|
| 3 |
+
size 45782
|
examples/rotated.jpeg
ADDED
|
Git LFS Details
|
examples/rotated.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:13219084901ec494f11495c5a930a35d151a22accac542af4dfaa7690b4f584f
|
| 3 |
+
size 333463
|
examples/upright.jpeg
ADDED
|
Git LFS Details
|
examples/upright.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2d476c2a0bfc9f6fe99e369097dd3c9c75513588231d219ba193dc2e1d792419
|
| 3 |
+
size 325064
|
extract.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Responsible for extracting text from images and PDFs using OCR engines or other modules.
|
| 2 |
+
"""
|
| 3 |
+
from io import BytesIO
|
| 4 |
+
from typing import List
|
| 5 |
+
|
| 6 |
+
import pyocr.tesseract
|
| 7 |
+
import pypdf
|
| 8 |
+
from PIL import Image
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def extract_text_from_pdf_pypdf(bytes_stream: BytesIO) -> str:
    """Reads the embedded text layer of a PDF with pypdf.

    Args:
        bytes_stream (BytesIO): In-memory stream holding the PDF contents.

    Returns: The text of every page in order, each page followed by a blank
        line ("\\n\\n").
    """
    reader = pypdf.PdfReader(bytes_stream)
    # Each page contributes its text plus a two-newline separator.
    return "".join(page.extract_text() + "\n\n" for page in reader.pages)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def extract_text_from_image_pyocr_tesseract(image: Image.Image) -> str:
    """Runs tesseract (through pyocr) on a single image.

    Args:
        image(PIL.Image.Image): The image to run OCR on.

    Returns: The recognized text, using the English language data.

    Raises:
        Exception: If pyocr reports that tesseract is not available.
    """
    if pyocr.tesseract.is_available():
        return pyocr.tesseract.image_to_string(image, lang="eng")
    raise Exception("Tesseract is not available.")
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def extract_text_from_images_pyocr_tesseract(images: List[Image.Image]) -> str:
    """Runs tesseract (through pyocr) on a sequence of images.

    Each image is closed as soon as its text has been extracted.

    Args:
        images(List[PIL.Image.Image]): The images to run OCR on, in order.

    Returns: The recognized text of all images, each followed by a blank
        line ("\\n\\n").
    """
    pieces = []
    for page_image in images:
        pieces.append(extract_text_from_image_pyocr_tesseract(page_image))
        pieces.append("\n\n")
        # The image is no longer needed once its text has been read.
        page_image.close()
    return "".join(pieces)
|
| 55 |
+
|
| 56 |
+
if __name__ == '__main__':
    # Manual smoke test: text layer of a sample PDF, then OCR of a sample image.
    with open('examples/upright.pdf', 'rb') as pdf_file:
        pdf_stream = BytesIO(pdf_file.read())
    print(extract_text_from_pdf_pypdf(pdf_stream))
    print("-"*25)
    sample_image = Image.open('examples/upright.jpeg')
    print(extract_text_from_image_pyocr_tesseract(sample_image))
    sample_image.close()
|
main.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
import categories
|
| 4 |
+
import processing
|
| 5 |
+
import extract
|
| 6 |
+
from PIL import Image
|
| 7 |
+
from pydantic import BaseModel
|
| 8 |
+
from io import BytesIO
|
| 9 |
+
|
| 10 |
+
def categorize_and_parse_text(text: str) -> BaseModel:
    """Picks a category for the text and runs that category's extraction chain.

    Args:
        text(str): The document text to categorize and extract information from.

    Returns: Whatever categories.run_category_chain returns for the chosen
        category and text.
    """
    label = categories.categorize_text(text)
    print("Categorized as category", label)
    return categories.run_category_chain(label, text)
|
| 22 |
+
|
| 23 |
+
def process_pdf(filename: Path) -> BaseModel:
    """Processes the given PDF file and extracts information from it.

    The embedded text layer is tried first. If it contains fewer than 20
    non-whitespace characters, the pages are rendered to images and read
    with OCR instead.

    Args:
        filename(Path): The PDF file to process.

    Returns: The extracted information.
    """
    with open(filename, "rb") as f:
        pdf_bytes = f.read()

    text = extract.extract_text_from_pdf_pypdf(BytesIO(pdf_bytes))
    # If the encoded text is too short, a pdf scanner probably added a watermark.
    # Whitespace is ignored when measuring: the pypdf extractor appends "\n\n"
    # after every page, so a long scanned PDF would otherwise look "long enough"
    # and never reach the OCR fallback.
    meaningful_length = len("".join(text.split()))
    if meaningful_length < 20:
        # Try to extract text from images
        images = processing.convert_pdf_to_image_pdf2image(pdf_bytes)
        text = extract.extract_text_from_images_pyocr_tesseract(images)

    return categorize_and_parse_text(text)
|
| 43 |
+
|
| 44 |
+
def process_image(filename: Path) -> BaseModel:
    """Processes the given image file and extracts information from it.

    Args:
        filename(Path): The image file to process.

    Returns: The extracted information.
    """
    # The context manager closes the image even when OCR raises, so the file
    # handle is not leaked on failure.
    with Image.open(filename) as image:
        text = extract.extract_text_from_image_pyocr_tesseract(image)
    return categorize_and_parse_text(text)
|
| 57 |
+
|
| 58 |
+
if __name__ == "__main__":
    # Manual smoke test: run the whole pipeline on a bundled sample PDF.
    sample_pdf = Path("examples/example1.pdf")
    print(process_pdf(sample_pdf).json(indent=4))
|
packages.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
poppler-utils
|
processing.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Responsible for (pre)processing images and PDFs before they are passed to the OCR
|
| 2 |
+
engine and other miscellaneous actions concerning processing.
|
| 3 |
+
"""
|
| 4 |
+
import os
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import List
|
| 7 |
+
|
| 8 |
+
# import cv2
|
| 9 |
+
# import numpy as np
|
| 10 |
+
import pyocr
|
| 11 |
+
from pdf2image import pdf2image
|
| 12 |
+
from PIL import Image #, ImageOps
|
| 13 |
+
|
| 14 |
+
# Resolution passed to the PDF-to-image converters (pdf2image `dpi`, ImageMagick `-density`).
PDF_CONVERSION_DPI = 300
# correct_orientation only rotates an image when the detected orientation
# confidence is strictly greater than this value.
ROTATION_CONFIDENCE_THRESHOLD = 2.0
|
| 16 |
+
|
| 17 |
+
# def rotate_image(image: Image, angle: float):
|
| 18 |
+
# """Rotates the given image by the given angle.
|
| 19 |
+
|
| 20 |
+
# Args:
|
| 21 |
+
# image(PIL.Image.Image): The image to be rotated.
|
| 22 |
+
# angle(float): The angle to rotate the image by.
|
| 23 |
+
|
| 24 |
+
# Returns: The rotated image.
|
| 25 |
+
# """
|
| 26 |
+
# image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
| 27 |
+
# height, width, _ = image.shape # Get the image height, width, and channels
|
| 28 |
+
# # Compute the rotation matrix
|
| 29 |
+
# rotation_matrix = cv2.getRotationMatrix2D((width / 2, height / 2), angle, 1)
|
| 30 |
+
# # Apply the rotation to the image
|
| 31 |
+
# rotated_image = cv2.warpAffine(image, rotation_matrix, (width, height))
|
| 32 |
+
# rotated_image = Image.fromarray(cv2.cvtColor(rotated_image, cv2.COLOR_BGR2RGB))
|
| 33 |
+
# return rotated_image
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# class PDF_CONVERTER(enum.Enum):
|
| 37 |
+
# PDF2IMAGE = 1
|
| 38 |
+
# IMAGEMAGICK = 2
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def correct_orientation(image: Image.Image) -> Image.Image:
    """Returns an upright version of the image based on tesseract's orientation detection.

    If detection fails, the failure is printed and the image is treated as
    upright (angle 0, confidence 100).

    Args:
        image(PIL.Image.Image): The pillow image to be corrected.

    Returns: A new pillow image: the input rotated by the detected angle when
        the confidence exceeds ROTATION_CONFIDENCE_THRESHOLD, otherwise a plain
        copy. The original image is not closed.

    Raises:
        Exception: If pyocr reports that tesseract is not available.
    """
    if not pyocr.tesseract.is_available():
        raise Exception("Tesseract is not available.")

    # image = ImageOps.exif_transpose(image)  # EXIF rotation is apparent, not actual
    try:
        detected = pyocr.tesseract.detect_orientation(image)
    except pyocr.PyocrException as e:
        print("Orientation detection failed: {}".format(e))
        detected = {}

    angle = detected.get("angle", 0)
    confidence = detected.get("confidence", 100)

    return (
        image.rotate(angle, expand=True)
        if confidence > ROTATION_CONFIDENCE_THRESHOLD
        else image.copy()
    )
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def convert_pdf_to_image_pdf2image(pdf_bytes: bytes) -> List[Image.Image]:
    """Renders the pages of a PDF to pillow images with pdf2image.

    Args:
        pdf_bytes(bytes): The raw contents of the PDF.

    Returns: One pillow image per PDF page, rendered at PDF_CONVERSION_DPI.
    """
    return pdf2image.convert_from_bytes(pdf_bytes, dpi=PDF_CONVERSION_DPI)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def convert_pdf_to_image_ImageMagick(filename: Path, dest_folder: Path) -> Path:
    """Converts a PDF to JPEG page images using the ImageMagick command line tool.

    Args:
        filename(pathlib.Path): The path to the PDF to be converted.
        dest_folder(pathlib.Path): The destination folder for the converted pages.
                                   The output name given to ImageMagick is
                                   page.jpg; for multi-page PDFs ImageMagick
                                   presumably writes numbered variants of that
                                   name (verify against the installed version).

    Returns: dest_folder

    Raises:
        FileNotFoundError: If the ``magick`` executable cannot be found.
        subprocess.CalledProcessError: If ImageMagick exits with a non-zero status.
    """
    # Imported locally so this function does not depend on the module's import block.
    import subprocess

    # Pass the arguments as a list: every option is a separate argument (the old
    # concatenated string had no spaces between them) and no shell is involved,
    # so paths containing spaces or shell metacharacters are handled safely.
    command = [
        "magick",
        "convert",
        "-density",
        str(PDF_CONVERSION_DPI),
        str(filename),
        "-quality",
        "100",
        str(Path(dest_folder) / "page.jpg"),
    ]
    # check=True surfaces a failed conversion instead of silently returning a
    # folder without any page images.
    subprocess.run(command, check=True)
    return dest_folder
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def preprocess_image(image: Image.Image) -> Image.Image:
    """Prepares an image for OCR.

    Currently the only step is orientation correction. The input image is
    closed once the corrected image has been produced.

    Args:
        image(PIL.Image.Image): The image to be preprocessed; closed on success.

    Returns: The preprocessed pillow image.
    """
    upright = correct_orientation(image)
    # The corrected image is a separate object, so the source can be released.
    image.close()
    return upright
|
| 118 |
+
|
| 119 |
+
def preprocess_pdf_pdf2image(pdf_bytes: bytes) -> List[Image.Image]:
    """Prepares a PDF for OCR using pdf2image.

    Steps:
        1. Render every page to an image.
        2. Preprocess (orientation-correct) each page image.

    Args:
        pdf_bytes(bytes): The raw contents of the PDF.

    Returns: The preprocessed pillow images, one per PDF page.
    """
    processed = []
    for page_image in convert_pdf_to_image_pdf2image(pdf_bytes):
        processed.append(preprocess_image(page_image))
        page_image.close()
    return processed
|
| 137 |
+
|
| 138 |
+
def preprocess_pdf_ImageMagick(filename: Path) -> List[Image.Image]:
    """Preprocesses a PDF for future use with OCR.
    The following operations are performed:
        1. PDF to image conversion (ImageMagick, into a temporary folder)
        2. Orientation correction

    Args:
        filename(pathlib.Path): The path to the PDF to be preprocessed.

    Returns: A list of pillow images corresponding to each page from the PDF.
    """
    # Imported locally so this function does not depend on the module's import block.
    import tempfile

    def page_number(page_path: Path) -> int:
        # Order pages by the number embedded in the file name so that page 10
        # sorts after page 9; a name without digits counts as page 0.
        digits = "".join(ch for ch in page_path.stem if ch.isdigit())
        return int(digits) if digits else 0

    result = []
    # The converted pages are only intermediate files, so they are written to
    # a temporary folder that is removed once the images have been loaded.
    with tempfile.TemporaryDirectory() as temp_dir:
        dest_folder = convert_pdf_to_image_ImageMagick(filename, Path(temp_dir))
        for page_path in sorted(dest_folder.glob("*.jpg"), key=page_number):
            # glob yields paths, so each page has to be opened as an image
            # first. preprocess_image returns a separate (rotated or copied)
            # image, which stays usable after the source file is closed and
            # deleted.
            with Image.open(page_path) as page_image:
                result.append(preprocess_image(page_image))
    return result
|
| 156 |
+
|
| 157 |
+
if __name__ == '__main__':
    # Manual smoke test 1: preprocess and display a single sample image.
    source_image = Image.open('examples/upright.jpeg')
    upright_image = preprocess_image(source_image)
    source_image.close()
    upright_image.show()
    upright_image.close()

    # Manual smoke test 2: preprocess and display every page of a rotated PDF.
    with open('examples/rotated.pdf', 'rb') as pdf_file:
        pdf_contents = bytes(pdf_file.read())
    for page_image in preprocess_pdf_pdf2image(pdf_contents):
        page_image.show()
        page_image.close()
|
requirements.txt
ADDED
|
@@ -0,0 +1,346 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
aiobotocore==2.5.0
|
| 2 |
+
aiofiles==22.1.0
|
| 3 |
+
aiohttp==3.8.3
|
| 4 |
+
aioitertools==0.7.1
|
| 5 |
+
aiosignal==1.2.0
|
| 6 |
+
aiosqlite==0.18.0
|
| 7 |
+
alabaster==0.7.12
|
| 8 |
+
anyio==3.5.0
|
| 9 |
+
appdirs==1.4.4
|
| 10 |
+
argon2-cffi==21.3.0
|
| 11 |
+
argon2-cffi-bindings==21.2.0
|
| 12 |
+
arrow==1.2.3
|
| 13 |
+
astroid==2.14.2
|
| 14 |
+
astropy==5.1
|
| 15 |
+
asttokens==2.2.1
|
| 16 |
+
async-timeout==4.0.2
|
| 17 |
+
atomicwrites==1.4.0
|
| 18 |
+
attrs==22.1.0
|
| 19 |
+
Automat==20.2.0
|
| 20 |
+
autopep8==1.6.0
|
| 21 |
+
Babel==2.11.0
|
| 22 |
+
backcall==0.2.0
|
| 23 |
+
bcrypt==3.2.0
|
| 24 |
+
beautifulsoup4==4.12.2
|
| 25 |
+
binaryornot==0.4.4
|
| 26 |
+
black==0.0
|
| 27 |
+
bleach==4.1.0
|
| 28 |
+
bokeh==3.1.1
|
| 29 |
+
botocore==1.29.76
|
| 30 |
+
Bottleneck==1.3.5
|
| 31 |
+
brotlipy==0.7.0
|
| 32 |
+
certifi==2023.7.22
|
| 33 |
+
cffi==1.15.1
|
| 34 |
+
chardet==4.0.0
|
| 35 |
+
charset-normalizer==2.0.4
|
| 36 |
+
click==8.0.4
|
| 37 |
+
cloudpickle==2.2.1
|
| 38 |
+
colorama==0.4.6
|
| 39 |
+
colorcet==3.0.1
|
| 40 |
+
comm==0.1.3
|
| 41 |
+
constantly==15.1.0
|
| 42 |
+
contourpy==1.0.5
|
| 43 |
+
cookiecutter==1.7.3
|
| 44 |
+
cryptography==39.0.1
|
| 45 |
+
cssselect==1.1.0
|
| 46 |
+
cycler==0.11.0
|
| 47 |
+
cytoolz==0.12.0
|
| 48 |
+
daal4py==2023.1.1
|
| 49 |
+
dask==2023.6.0
|
| 50 |
+
dataclasses-json==0.5.13
|
| 51 |
+
datasets==2.12.0
|
| 52 |
+
datashader==0.15.0
|
| 53 |
+
datashape==0.5.4
|
| 54 |
+
debugpy==1.6.7
|
| 55 |
+
decorator==5.1.1
|
| 56 |
+
defusedxml==0.7.1
|
| 57 |
+
diff-match-patch==20200713
|
| 58 |
+
dill==0.3.6
|
| 59 |
+
distributed==2023.6.0
|
| 60 |
+
docstring-to-markdown==0.11
|
| 61 |
+
docutils==0.18.1
|
| 62 |
+
entrypoints==0.4
|
| 63 |
+
et-xmlfile==1.1.0
|
| 64 |
+
exceptiongroup==1.0.4
|
| 65 |
+
executing==1.2.0
|
| 66 |
+
fastjsonschema==2.16.2
|
| 67 |
+
filelock==3.9.0
|
| 68 |
+
flake8==6.0.0
|
| 69 |
+
Flask==2.2.2
|
| 70 |
+
fonttools==4.25.0
|
| 71 |
+
frozenlist==1.3.3
|
| 72 |
+
fsspec==2023.4.0
|
| 73 |
+
gensim==4.3.0
|
| 74 |
+
greenlet==2.0.1
|
| 75 |
+
h5py==3.7.0
|
| 76 |
+
HeapDict==1.0.1
|
| 77 |
+
holoviews==1.16.2
|
| 78 |
+
huggingface-hub==0.15.1
|
| 79 |
+
hvplot==0.8.4
|
| 80 |
+
hyperlink==21.0.0
|
| 81 |
+
idna==3.4
|
| 82 |
+
imagecodecs==2021.8.26
|
| 83 |
+
imageio==2.26.0
|
| 84 |
+
imagesize==1.4.1
|
| 85 |
+
imbalanced-learn==0.10.1
|
| 86 |
+
importlib-metadata==6.0.0
|
| 87 |
+
incremental==21.3.0
|
| 88 |
+
inflection==0.5.1
|
| 89 |
+
iniconfig==1.1.1
|
| 90 |
+
intake==0.6.8
|
| 91 |
+
intervaltree==3.1.0
|
| 92 |
+
ipykernel==6.22.0
|
| 93 |
+
ipython==8.12.0
|
| 94 |
+
ipython-genutils==0.2.0
|
| 95 |
+
ipywidgets==8.0.4
|
| 96 |
+
iso4217==1.9.20220401
|
| 97 |
+
isort==5.9.3
|
| 98 |
+
itemadapter==0.3.0
|
| 99 |
+
itemloaders==1.0.4
|
| 100 |
+
itsdangerous==2.0.1
|
| 101 |
+
jaraco.classes==3.2.1
|
| 102 |
+
jedi==0.18.2
|
| 103 |
+
jellyfish==0.9.0
|
| 104 |
+
Jinja2==3.1.2
|
| 105 |
+
jinja2-time==0.2.0
|
| 106 |
+
jmespath==0.10.0
|
| 107 |
+
joblib==1.2.0
|
| 108 |
+
json5==0.9.6
|
| 109 |
+
jsonschema==4.17.3
|
| 110 |
+
jupyter==1.0.0
|
| 111 |
+
jupyter_client==8.2.0
|
| 112 |
+
jupyter-console==6.6.3
|
| 113 |
+
jupyter_core==5.3.0
|
| 114 |
+
jupyter-events==0.6.3
|
| 115 |
+
jupyter-server==1.23.6
|
| 116 |
+
jupyter_server_fileid==0.9.0
|
| 117 |
+
jupyter_server_terminals==0.4.4
|
| 118 |
+
jupyter_server_ydoc==0.8.0
|
| 119 |
+
jupyter-ydoc==0.2.4
|
| 120 |
+
jupyterlab==3.6.3
|
| 121 |
+
jupyterlab-pygments==0.1.2
|
| 122 |
+
jupyterlab_server==2.22.0
|
| 123 |
+
jupyterlab-widgets==3.0.5
|
| 124 |
+
keyring==23.13.1
|
| 125 |
+
kiwisolver==1.4.4
|
| 126 |
+
langchain==0.0.245
|
| 127 |
+
langsmith==0.0.15
|
| 128 |
+
lazy_loader==0.2
|
| 129 |
+
lazy-object-proxy==1.6.0
|
| 130 |
+
linkify-it-py==2.0.0
|
| 131 |
+
llvmlite==0.40.0
|
| 132 |
+
lmdb==1.4.1
|
| 133 |
+
locket==1.0.0
|
| 134 |
+
lxml==4.9.2
|
| 135 |
+
lz4==4.3.2
|
| 136 |
+
Markdown==3.4.1
|
| 137 |
+
markdown-it-py==2.2.0
|
| 138 |
+
MarkupSafe==2.1.1
|
| 139 |
+
marshmallow==3.20.1
|
| 140 |
+
matplotlib==3.7.1
|
| 141 |
+
matplotlib-inline==0.1.6
|
| 142 |
+
mccabe==0.7.0
|
| 143 |
+
mdit-py-plugins==0.3.0
|
| 144 |
+
mdurl==0.1.0
|
| 145 |
+
menuinst==1.4.19
|
| 146 |
+
mistune==3.0.0
|
| 147 |
+
mkl-fft==1.3.6
|
| 148 |
+
mkl-random==1.2.2
|
| 149 |
+
mkl-service==2.4.0
|
| 150 |
+
more-itertools==8.12.0
|
| 151 |
+
mpmath==1.2.1
|
| 152 |
+
msgpack==1.0.3
|
| 153 |
+
multidict==6.0.2
|
| 154 |
+
multipledispatch==0.6.0
|
| 155 |
+
multiprocess==0.70.14
|
| 156 |
+
munkres==1.1.4
|
| 157 |
+
mypy-extensions==0.4.3
|
| 158 |
+
nbclassic==0.5.5
|
| 159 |
+
nbclient==0.5.13
|
| 160 |
+
nbconvert==7.7.3
|
| 161 |
+
nbformat==5.7.0
|
| 162 |
+
nest-asyncio==1.5.6
|
| 163 |
+
networkx==2.8.4
|
| 164 |
+
nltk==3.7
|
| 165 |
+
notebook==6.5.4
|
| 166 |
+
notebook_shim==0.2.2
|
| 167 |
+
numba==0.57.0
|
| 168 |
+
numexpr==2.8.4
|
| 169 |
+
numpy==1.24.3
|
| 170 |
+
numpydoc==1.5.0
|
| 171 |
+
openai==0.27.8
|
| 172 |
+
openapi-schema-pydantic==1.2.4
|
| 173 |
+
opencv-python-headless==4.8.0.74
|
| 174 |
+
openpyxl==3.0.10
|
| 175 |
+
packaging==23.0
|
| 176 |
+
pandas==1.5.3
|
| 177 |
+
pandocfilters==1.5.0
|
| 178 |
+
panel==1.1.0
|
| 179 |
+
param==1.13.0
|
| 180 |
+
paramiko==2.8.1
|
| 181 |
+
parsel==1.6.0
|
| 182 |
+
parso==0.8.3
|
| 183 |
+
partd==1.2.0
|
| 184 |
+
pathspec==0.10.3
|
| 185 |
+
patsy==0.5.3
|
| 186 |
+
pdf2image==1.16.3
|
| 187 |
+
pep8==1.7.1
|
| 188 |
+
pexpect==4.8.0
|
| 189 |
+
pickleshare==0.7.5
|
| 190 |
+
Pillow==9.4.0
|
| 191 |
+
pip==23.1.2
|
| 192 |
+
platformdirs==3.5.0
|
| 193 |
+
plotly==5.9.0
|
| 194 |
+
pluggy==1.0.0
|
| 195 |
+
ply==3.11
|
| 196 |
+
pooch==1.4.0
|
| 197 |
+
poyo==0.5.0
|
| 198 |
+
prometheus-client==0.14.1
|
| 199 |
+
prompt-toolkit==3.0.38
|
| 200 |
+
Protego==0.1.16
|
| 201 |
+
psutil==5.9.5
|
| 202 |
+
ptyprocess==0.7.0
|
| 203 |
+
pure-eval==0.2.2
|
| 204 |
+
py-cpuinfo==8.0.0
|
| 205 |
+
pyarrow==11.0.0
|
| 206 |
+
pyasn1==0.4.8
|
| 207 |
+
pyasn1-modules==0.2.8
|
| 208 |
+
pycodestyle==2.10.0
|
| 209 |
+
pycparser==2.21
|
| 210 |
+
pyct==0.5.0
|
| 211 |
+
pycurl==7.45.2
|
| 212 |
+
pydantic==1.10.12
|
| 213 |
+
PyDispatcher==2.0.5
|
| 214 |
+
pydocstyle==6.3.0
|
| 215 |
+
pyerfa==2.0.0
|
| 216 |
+
pyflakes==3.0.1
|
| 217 |
+
Pygments==2.15.1
|
| 218 |
+
pylint==2.16.2
|
| 219 |
+
pylint-venv==2.3.0
|
| 220 |
+
pyls-spyder==0.4.0
|
| 221 |
+
PyNaCl==1.5.0
|
| 222 |
+
pyocr==0.8.3
|
| 223 |
+
pyodbc==4.0.34
|
| 224 |
+
pyOpenSSL==23.0.0
|
| 225 |
+
pyparsing==3.0.9
|
| 226 |
+
pypdf==3.13.0
|
| 227 |
+
PyQt5==5.15.7
|
| 228 |
+
PyQt5-sip==12.11.0
|
| 229 |
+
PyQtWebEngine==5.15.4
|
| 230 |
+
pyrsistent==0.18.0
|
| 231 |
+
PySocks==1.7.1
|
| 232 |
+
pytest==7.3.1
|
| 233 |
+
python-dateutil==2.8.2
|
| 234 |
+
python-json-logger==2.0.7
|
| 235 |
+
python-lsp-black==1.2.1
|
| 236 |
+
python-lsp-jsonrpc==1.0.0
|
| 237 |
+
python-lsp-server==1.7.2
|
| 238 |
+
python-slugify==5.0.2
|
| 239 |
+
python-snappy==0.6.1
|
| 240 |
+
pytoolconfig==1.2.5
|
| 241 |
+
pytz==2022.7
|
| 242 |
+
pyviz-comms==2.3.0
|
| 243 |
+
PyWavelets==1.4.1
|
| 244 |
+
pywin32==305.1
|
| 245 |
+
pywin32-ctypes==0.2.0
|
| 246 |
+
pywinpty==2.0.10
|
| 247 |
+
PyYAML==6.0
|
| 248 |
+
pyzmq==25.0.2
|
| 249 |
+
QDarkStyle==3.0.2
|
| 250 |
+
qstylizer==0.2.2
|
| 251 |
+
QtAwesome==1.2.2
|
| 252 |
+
qtconsole==5.4.2
|
| 253 |
+
QtPy==2.2.0
|
| 254 |
+
queuelib==1.5.0
|
| 255 |
+
regex==2022.7.9
|
| 256 |
+
requests==2.29.0
|
| 257 |
+
requests-file==1.5.1
|
| 258 |
+
responses==0.13.3
|
| 259 |
+
rfc3339-validator==0.1.4
|
| 260 |
+
rfc3986-validator==0.1.1
|
| 261 |
+
rope==1.7.0
|
| 262 |
+
Rtree==1.0.1
|
| 263 |
+
s3fs==2023.4.0
|
| 264 |
+
sacremoses==0.0.43
|
| 265 |
+
scikit-image==0.20.0
|
| 266 |
+
scikit-learn==1.2.2
|
| 267 |
+
scikit-learn-intelex==20230426.121158
|
| 268 |
+
scipy==1.10.1
|
| 269 |
+
Scrapy==2.8.0
|
| 270 |
+
seaborn==0.12.2
|
| 271 |
+
Send2Trash==1.8.0
|
| 272 |
+
service-identity==18.1.0
|
| 273 |
+
setuptools==67.8.0
|
| 274 |
+
sip==6.6.2
|
| 275 |
+
six==1.16.0
|
| 276 |
+
smart-open==5.2.1
|
| 277 |
+
sniffio==1.2.0
|
| 278 |
+
snowballstemmer==2.2.0
|
| 279 |
+
sortedcontainers==2.4.0
|
| 280 |
+
soupsieve==2.4
|
| 281 |
+
Sphinx==5.0.2
|
| 282 |
+
sphinxcontrib-applehelp==1.0.2
|
| 283 |
+
sphinxcontrib-devhelp==1.0.2
|
| 284 |
+
sphinxcontrib-htmlhelp==2.0.0
|
| 285 |
+
sphinxcontrib-jsmath==1.0.1
|
| 286 |
+
sphinxcontrib-qthelp==1.0.3
|
| 287 |
+
sphinxcontrib-serializinghtml==1.1.5
|
| 288 |
+
spyder==5.4.3
|
| 289 |
+
spyder-kernels==2.4.3
|
| 290 |
+
SQLAlchemy==1.4.39
|
| 291 |
+
stack-data==0.6.2
|
| 292 |
+
statsmodels==0.13.5
|
| 293 |
+
sympy==1.11.1
|
| 294 |
+
tables==3.8.0
|
| 295 |
+
tabulate==0.8.10
|
| 296 |
+
TBB==0.2
|
| 297 |
+
tblib==1.7.0
|
| 298 |
+
tenacity==8.2.2
|
| 299 |
+
terminado==0.17.1
|
| 300 |
+
text-unidecode==1.3
|
| 301 |
+
textdistance==4.2.1
|
| 302 |
+
threadpoolctl==2.2.0
|
| 303 |
+
three-merge==0.1.1
|
| 304 |
+
tifffile==2021.7.2
|
| 305 |
+
tinycss2==1.2.1
|
| 306 |
+
tldextract==3.2.0
|
| 307 |
+
tokenizers==0.13.2
|
| 308 |
+
toml==0.10.2
|
| 309 |
+
tomli==2.0.1
|
| 310 |
+
tomlkit==0.11.1
|
| 311 |
+
toolz==0.12.0
|
| 312 |
+
torch==2.0.1
|
| 313 |
+
tornado==6.3.1
|
| 314 |
+
tqdm==4.65.0
|
| 315 |
+
traitlets==5.9.0
|
| 316 |
+
transformers==4.29.2
|
| 317 |
+
Twisted==22.10.0
|
| 318 |
+
twisted-iocpsupport==1.0.2
|
| 319 |
+
typing_extensions==4.6.3
|
| 320 |
+
typing-inspect==0.9.0
|
| 321 |
+
uc-micro-py==1.0.1
|
| 322 |
+
ujson==5.4.0
|
| 323 |
+
Unidecode==1.2.0
|
| 324 |
+
urllib3==1.26.16
|
| 325 |
+
w3lib==1.21.0
|
| 326 |
+
watchdog==2.1.6
|
| 327 |
+
wcwidth==0.2.6
|
| 328 |
+
webencodings==0.5.1
|
| 329 |
+
websocket-client==0.58.0
|
| 330 |
+
Werkzeug==2.2.3
|
| 331 |
+
whatthepatch==1.0.2
|
| 332 |
+
wheel==0.38.4
|
| 333 |
+
widgetsnbextension==4.0.5
|
| 334 |
+
win-inet-pton==1.1.0
|
| 335 |
+
wrapt==1.14.1
|
| 336 |
+
xarray==2022.11.0
|
| 337 |
+
xlwings==0.29.1
|
| 338 |
+
xxhash==2.0.2
|
| 339 |
+
xyzservices==2022.9.0
|
| 340 |
+
y-py==0.5.9
|
| 341 |
+
yapf==0.31.0
|
| 342 |
+
yarl==1.8.1
|
| 343 |
+
ypy-websocket==0.8.2
|
| 344 |
+
zict==2.2.0
|
| 345 |
+
zipp==3.11.0
|
| 346 |
+
zope.interface==5.4.0
|