Spaces:

Dekode
/

betterzila_assignment

Sleeping

App Files Files Community

Pratik Dwivedi committed on Mar 21, 2024

Commit

25b98b6

1 Parent(s): 837a786

New App

Browse files

Files changed (11) hide show

app.py +24 -0
extractor.ipynb +464 -0
invoice_convertor.py +84 -0
invoices/invoice1.pdf +0 -0
invoices/invoice2.pdf +0 -0
invoices/invoice3.pdf +0 -0
invoices/invoice4.pdf +0 -0
invoices/invoice5.pdf +0 -0
invoices/invoice7.pdf +0 -0
invoices/invoice8.pdf +0 -0
requirements.txt +4 -0

app.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import streamlit as st
+import os
+from invoice_convertor import InvoiceConvertor
+def main():
+    st.set_page_config(layout="wide")
+    st.title('Amazon Invoice Convertor')
+    st.write('This app converts your Amazon invoice pdfs to a csv file.')
+    convertor = InvoiceConvertor()
+    files = st.file_uploader('Upload your invoice pdfs', type=['pdf'], accept_multiple_files=True)
+    if files:
+        for file in files:
+            with open('data/' + file.name, 'wb') as f:
+                f.write(file.getbuffer())
+        convertor.read_pdfs('data/')
+        result_df = convertor.convert()
+        st.write(result_df)
+        st.download_button('Download csv', data=result_df.to_csv(), file_name='invoice.csv', mime='text/csv')
+    for file in os.listdir('data/'):
+        os.remove('data/' + file)
+    if st.button('Clear csv file') and os.path.exists('invoice.csv'):
+        os.remove('invoice.csv')
+# Run the app only when this file is executed as a script, not when it is imported.
+if __name__ == '__main__':
+    main()

extractor.ipynb ADDED Viewed

	@@ -0,0 +1,464 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import PyPDF2, os\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def read_pdf(path):\n",
+    "    pdf_file = open(path, 'rb')\n",
+    "    pdf_reader = PyPDF2.PdfReader(pdf_file)\n",
+    "    text = ''\n",
+    "    for page_num in range(len(pdf_reader.pages)):\n",
+    "        page = pdf_reader.pages[page_num]\n",
+    "        text += page.extract_text()\n",
+    "    pdf_file.close()\n",
+    "    return text\n",
+    "\n",
+    "invoices = []\n",
+    "path = 'invoices/'\n",
+    "\n",
+    "for file in os.listdir(path):\n",
+    "    if file.startswith('invoice'):\n",
+    "        text = read_pdf(path + file)\n",
+    "        print(text)\n",
+    "        invoices.append(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "def save_as_csv(details, save_as = \"invoice.csv\"):\n",
+    "    # if the csv already exists then concat a new one to it, else create a new one\n",
+    "    if os.path.exists(save_as):\n",
+    "        df = pd.read_csv(save_as)\n",
+    "        df = pd.concat([df, pd.DataFrame(details, index=[0])], ignore_index=True)\n",
+    "    else:  \n",
+    "        df = pd.DataFrame(details, index=[0])\n",
+    "    df.to_csv(save_as, index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "\n",
+    "def extract_invoice_details(text):\n",
+    "    invoice_details = {}\n",
+    "    try:\n",
+    "        invoice_details['Order Number'] = re.search(r'Order Number: (\\S+)', text).group(1)\n",
+    "        invoice_details['Invoice Number'] = re.search(r'Invoice Number : (\\S+)', text).group(1)\n",
+    "        invoice_details['Order Date'] = re.search(r'Order Date: (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n",
+    "        invoice_details['Invoice Details'] = re.search(r'Invoice Details : (\\S+)', text).group(1)\n",
+    "        invoice_details['Invoice Date'] = re.search(r'Invoice Date : (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n",
+    "        invoice_details['Billing Address'] = re.search(r'Billing Address :([\\s\\S]+?)Shipping Address :', text).group(1).strip()\n",
+    "        invoice_details['Shipping Address'] = re.search(r'Shipping Address :([\\s\\S]+?)Place of supply:', text).group(1).strip()\n",
+    "        invoice_details['PAN'] = re.search(r'PAN No:(\\S+)', text).group(1)\n",
+    "    except:\n",
+    "        print('Order Number not found')\n",
+    "    \n",
+    "    item_match = re.search(r'1([\\s\\S]+?)TOTAL:', text, re.DOTALL)\n",
+    "    if item_match:\n",
+    "        item_info = item_match.group(1)\n",
+    "        item_name = re.search(r'\\nAmount\\n1([\\s\\S]+?)₹', item_info).group(1).strip()\n",
+    "        invoice_details['Item'] = item_name\n",
+    "        print(item_name)\n",
+    "    else:\n",
+    "        print(\"No item found in the invoice.\")\n",
+    "    total_mount_match = re.search(r'TOTAL:([\\s\\S]+?)only', text, re.DOTALL)\n",
+    "    if total_mount_match:\n",
+    "        total_mount = total_mount_match.group(1).split('₹')[2].split('\\n')[0]\n",
+    "        invoice_details['Total Amount'] = total_mount\n",
+    "    else:\n",
+    "        print(\"No total amount found in the invoice.\")\n",
+    "    gstin_match = re.search(r'GST Registration No: ([\\s\\S]+?) ', text)\n",
+    "    if gstin_match:\n",
+    "        invoice_details['GSTIN'] = gstin_match.group(1).strip()\n",
+    "    else:\n",
+    "        print(\"No GSTIN found in the invoice.\")\n",
+    "    by_match = re.search(r'By :([\\s\\S]+?)PAN No:', text)\n",
+    "    if by_match:\n",
+    "        invoice_details['Sold By'] = by_match.group(1).strip()\n",
+    "    else:\n",
+    "        print(\"No seller found in the invoice.\")\n",
+    "        \n",
+    "    return invoice_details"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for invoice in invoices:\n",
+    "    # print(invoice)\n",
+    "    details = extract_invoice_details(invoice)\n",
+    "    save_as_csv(details)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv('invoice.csv')\n",
+    "df.head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import PyPDF2, os, re\n",
+    "import pandas as pd\n",
+    "\n",
+    "class InvoiceConvertor:\n",
+    "    \"\"\"\n",
+    "    This class is hardcoded to read all pdf files whose name starts with 'invoice' in the user-given path and convert them to a csv file.\n",
+    "    \n",
+    "    Usage:\n",
+    "    convertor = InvoiceConvertor()\n",
+    "    convertor.read_pdfs('path_to_pdfs')\n",
+    "    result_df = convertor.convert()\n",
+    "\n",
+    "    \"\"\"\n",
+    "    def __init__(self):\n",
+    "        self.invoices = []\n",
+    "        \n",
+    "    def read_pdfs(self,path):\n",
+    "        for file in os.listdir(path):\n",
+    "            if file.startswith('invoice'):\n",
+    "                pdf_file = open(path + file, 'rb')\n",
+    "                pdf_reader = PyPDF2.PdfReader(pdf_file)\n",
+    "                text = ''\n",
+    "                for page_num in range(len(pdf_reader.pages)):\n",
+    "                    page = pdf_reader.pages[page_num]\n",
+    "                    text += page.extract_text()\n",
+    "                pdf_file.close()\n",
+    "                self.invoices.append(text)\n",
+    "        return self.invoices\n",
+    "    \n",
+    "    def save_as_csv(self, details, save_as = \"invoice.csv\"):\n",
+    "        # if the csv already exists then concat a new one to it, else create a new one\n",
+    "        if os.path.exists(save_as):\n",
+    "            df = pd.read_csv(save_as)\n",
+    "            df = pd.concat([df, pd.DataFrame(details, index=[0])], ignore_index=True)\n",
+    "        else:  \n",
+    "            df = pd.DataFrame(details, index=[0])\n",
+    "        df.to_csv(save_as, index=False)\n",
+    "        \n",
+    "    def extract_invoice_details(self, text):\n",
+    "        invoice_details = {}\n",
+    "        try:\n",
+    "            invoice_details['Order Number'] = re.search(r'Order Number: (\\S+)', text).group(1)\n",
+    "            invoice_details['Invoice Number'] = re.search(r'Invoice Number : (\\S+)', text).group(1)\n",
+    "            invoice_details['Order Date'] = re.search(r'Order Date: (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n",
+    "            invoice_details['Invoice Details'] = re.search(r'Invoice Details : (\\S+)', text).group(1)\n",
+    "            invoice_details['Invoice Date'] = re.search(r'Invoice Date : (\\d{2}\\.\\d{2}\\.\\d{4})', text).group(1)\n",
+    "            invoice_details['Billing Address'] = re.search(r'Billing Address :([\\s\\S]+?)Shipping Address :', text).group(1).strip()\n",
+    "            invoice_details['Shipping Address'] = re.search(r'Shipping Address :([\\s\\S]+?)Place of supply:', text).group(1).strip()\n",
+    "            invoice_details['PAN'] = re.search(r'PAN No:(\\S+)', text).group(1)\n",
+    "        except:\n",
+    "            print('Order Number not found')\n",
+    "\n",
+    "        item_match = re.search(r'1([\\s\\S]+?)TOTAL:', text, re.DOTALL)\n",
+    "        if item_match:\n",
+    "            item_info = item_match.group(1)\n",
+    "            item_name = re.search(r'\\nAmount\\n1([\\s\\S]+?)₹', item_info).group(1).strip()\n",
+    "            invoice_details['Item'] = item_name\n",
+    "            # print(item_name)\n",
+    "        else:\n",
+    "            print(\"No item found in the invoice.\")\n",
+    "        total_mount_match = re.search(r'TOTAL:([\\s\\S]+?)only', text, re.DOTALL)\n",
+    "        if total_mount_match:\n",
+    "            total_mount = total_mount_match.group(1).split('₹')[2].split('\\n')[0]\n",
+    "            invoice_details['Total Amount'] = total_mount\n",
+    "        else:\n",
+    "            print(\"No total amount found in the invoice.\")\n",
+    "        gstin_match = re.search(r'GST Registration No: ([\\s\\S]+?) ', text)\n",
+    "        if gstin_match:\n",
+    "            invoice_details['GSTIN'] = gstin_match.group(1).strip()\n",
+    "        else:\n",
+    "            print(\"No GSTIN found in the invoice.\")\n",
+    "        by_match = re.search(r'By :([\\s\\S]+?)PAN No:', text)\n",
+    "        if by_match:\n",
+    "            invoice_details['Sold By'] = by_match.group(1).strip()\n",
+    "        else:\n",
+    "            print(\"No seller found in the invoice.\")\n",
+    "        return invoice_details\n",
+    "    \n",
+    "    def convert(self):\n",
+    "        for invoice in self.invoices:\n",
+    "            details = self.extract_invoice_details(invoice)\n",
+    "            self.save_as_csv(details)\n",
+    "        return pd.read_csv('invoice.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Order Number not found\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Order Number</th>\n",
+       "      <th>Invoice Number</th>\n",
+       "      <th>Order Date</th>\n",
+       "      <th>Invoice Details</th>\n",
+       "      <th>Invoice Date</th>\n",
+       "      <th>Billing Address</th>\n",
+       "      <th>Shipping Address</th>\n",
+       "      <th>PAN</th>\n",
+       "      <th>Item</th>\n",
+       "      <th>Total Amount</th>\n",
+       "      <th>GSTIN</th>\n",
+       "      <th>Sold By</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>402-7035529-3886722</td>\n",
+       "      <td>NAG1-192347</td>\n",
+       "      <td>17.08.2023</td>\n",
+       "      <td>MH-NAG1-1034-2324</td>\n",
+       "      <td>17.08.2023</td>\n",
+       "      <td>Pratik Dwivedi \\nBennett University, Plot Nos ...</td>\n",
+       "      <td>Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ...</td>\n",
+       "      <td>AALCA0171E</td>\n",
+       "      <td>Cosmic Byte CB-EP-05 Wired Gaming in Ear Earph...</td>\n",
+       "      <td>458.0</td>\n",
+       "      <td>27AALCA0171E1ZZ</td>\n",
+       "      <td>Appario Retail Private Ltd \\n*TCI Supply Chain...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>402-7035529-3886722</td>\n",
+       "      <td>BOM5-1379800</td>\n",
+       "      <td>17.08.2023</td>\n",
+       "      <td>MH-BOM5-1034-2324</td>\n",
+       "      <td>17.08.2023</td>\n",
+       "      <td>Pratik Dwivedi \\nBennett University, Plot Nos ...</td>\n",
+       "      <td>Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ...</td>\n",
+       "      <td>AALCA0171E</td>\n",
+       "      <td>LG Ultragear IPS Gaming Monitor 60 cm (24\\nInc...</td>\n",
+       "      <td>13,099.00</td>\n",
+       "      <td>27AALCA0171E1ZZ</td>\n",
+       "      <td>Appario Retail Private Ltd \\n*Renaissance indu...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>405-4419941-9848328</td>\n",
+       "      <td>DEX3-4683</td>\n",
+       "      <td>23.07.2023</td>\n",
+       "      <td>DL-DEX3-157533501-2324</td>\n",
+       "      <td>23.07.2023</td>\n",
+       "      <td>Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...</td>\n",
+       "      <td>Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...</td>\n",
+       "      <td>ABEPW6057C</td>\n",
+       "      <td>Amozo Easy Fit Tempered Glass Screen Protector...</td>\n",
+       "      <td>474.00</td>\n",
+       "      <td>07ABEPW6057C1ZK</td>\n",
+       "      <td>RADHIKA WALIA \\n*Plot no 28, Block A, Mohan Co...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>405-4419941-9848328</td>\n",
+       "      <td>HYD8-29019</td>\n",
+       "      <td>23.07.2023</td>\n",
+       "      <td>TG-HYD8-817549015-2324</td>\n",
+       "      <td>23.07.2023</td>\n",
+       "      <td>Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...</td>\n",
+       "      <td>Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...</td>\n",
+       "      <td>AACCN8253B</td>\n",
+       "      <td>ESR for iPhone 13/14 Cover, Shockproof Drop Pr...</td>\n",
+       "      <td>399.00</td>\n",
+       "      <td>36AACCN8253B1ZN</td>\n",
+       "      <td>TIGER PUG COMMERCE PRIVATE LIMITED \\n*GMR Airp...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>405-0015964-5687515</td>\n",
+       "      <td>IN-5040</td>\n",
+       "      <td>23.07.2023</td>\n",
+       "      <td>DL-1922955505-2324</td>\n",
+       "      <td>23.07.2023</td>\n",
+       "      <td>Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...</td>\n",
+       "      <td>Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...</td>\n",
+       "      <td>JISPS4412R</td>\n",
+       "      <td>imluckies Camera Lens Protector Compatible wit...</td>\n",
+       "      <td>149.00</td>\n",
+       "      <td>07JISPS4412R1Z4</td>\n",
+       "      <td>M.A.ENTERPRISES \\n*D2/235 GALI NO 6, 3rd PUSTA...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>408-4974466-7793143</td>\n",
+       "      <td>JPX2-223775</td>\n",
+       "      <td>02.01.2024</td>\n",
+       "      <td>RJ-JPX2-1317922175-2324</td>\n",
+       "      <td>02.01.2024</td>\n",
+       "      <td>Devpal \\n514/3, Ganesh vihar \\nROORKEE, UTTARA...</td>\n",
+       "      <td>Devpal \\nDevpal \\n514/3, Ganesh vihar \\nROORKE...</td>\n",
+       "      <td>AADCV4254H</td>\n",
+       "      <td>Amazon Basics Sleek Rechargeable LED Table Lam...</td>\n",
+       "      <td>569.00</td>\n",
+       "      <td>08AADCV4254H1Z8</td>\n",
+       "      <td>ETRADE MARKETING PRIVATE LIMITED \\n*Kh No 554 ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Saregama Carvaan Telugu - Portable Music Playe...</td>\n",
+       "      <td>6,320.00</td>\n",
+       "      <td>36AARCA3925C1ZQBilling</td>\n",
+       "      <td>AATS Connect Private Limited \\n* GMR Airport C...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "          Order Number Invoice Number  Order Date          Invoice Details  \\\n",
+       "0  402-7035529-3886722    NAG1-192347  17.08.2023        MH-NAG1-1034-2324   \n",
+       "1  402-7035529-3886722   BOM5-1379800  17.08.2023        MH-BOM5-1034-2324   \n",
+       "2  405-4419941-9848328      DEX3-4683  23.07.2023   DL-DEX3-157533501-2324   \n",
+       "3  405-4419941-9848328     HYD8-29019  23.07.2023   TG-HYD8-817549015-2324   \n",
+       "4  405-0015964-5687515        IN-5040  23.07.2023       DL-1922955505-2324   \n",
+       "5  408-4974466-7793143    JPX2-223775  02.01.2024  RJ-JPX2-1317922175-2324   \n",
+       "6                  NaN            NaN         NaN                      NaN   \n",
+       "\n",
+       "  Invoice Date                                    Billing Address  \\\n",
+       "0   17.08.2023  Pratik Dwivedi \\nBennett University, Plot Nos ...   \n",
+       "1   17.08.2023  Pratik Dwivedi \\nBennett University, Plot Nos ...   \n",
+       "2   23.07.2023  Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...   \n",
+       "3   23.07.2023  Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...   \n",
+       "4   23.07.2023  Pratik Dwivedi \\nC- 123 Sector 26, Sector 26 N...   \n",
+       "5   02.01.2024  Devpal \\n514/3, Ganesh vihar \\nROORKEE, UTTARA...   \n",
+       "6          NaN                                                NaN   \n",
+       "\n",
+       "                                    Shipping Address         PAN  \\\n",
+       "0  Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ...  AALCA0171E   \n",
+       "1  Pratik Dwivedi \\nPratik Dwivedi \\nBennett Univ...  AALCA0171E   \n",
+       "2  Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...  ABEPW6057C   \n",
+       "3  Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...  AACCN8253B   \n",
+       "4  Pratik Dwivedi \\nPratik Dwivedi \\nC- 123 Secto...  JISPS4412R   \n",
+       "5  Devpal \\nDevpal \\n514/3, Ganesh vihar \\nROORKE...  AADCV4254H   \n",
+       "6                                                NaN         NaN   \n",
+       "\n",
+       "                                                Item Total Amount  \\\n",
+       "0  Cosmic Byte CB-EP-05 Wired Gaming in Ear Earph...        458.0   \n",
+       "1  LG Ultragear IPS Gaming Monitor 60 cm (24\\nInc...    13,099.00   \n",
+       "2  Amozo Easy Fit Tempered Glass Screen Protector...       474.00   \n",
+       "3  ESR for iPhone 13/14 Cover, Shockproof Drop Pr...       399.00   \n",
+       "4  imluckies Camera Lens Protector Compatible wit...       149.00   \n",
+       "5  Amazon Basics Sleek Rechargeable LED Table Lam...       569.00   \n",
+       "6  Saregama Carvaan Telugu - Portable Music Playe...     6,320.00   \n",
+       "\n",
+       "                    GSTIN                                            Sold By  \n",
+       "0         27AALCA0171E1ZZ  Appario Retail Private Ltd \\n*TCI Supply Chain...  \n",
+       "1         27AALCA0171E1ZZ  Appario Retail Private Ltd \\n*Renaissance indu...  \n",
+       "2         07ABEPW6057C1ZK  RADHIKA WALIA \\n*Plot no 28, Block A, Mohan Co...  \n",
+       "3         36AACCN8253B1ZN  TIGER PUG COMMERCE PRIVATE LIMITED \\n*GMR Airp...  \n",
+       "4         07JISPS4412R1Z4  M.A.ENTERPRISES \\n*D2/235 GALI NO 6, 3rd PUSTA...  \n",
+       "5         08AADCV4254H1Z8  ETRADE MARKETING PRIVATE LIMITED \\n*Kh No 554 ...  \n",
+       "6  36AARCA3925C1ZQBilling  AATS Connect Private Limited \\n* GMR Airport C...  "
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "invoice_convertor = InvoiceConvertor()\n",
+    "invoice_convertor.read_pdfs('invoices/')\n",
+    "res = invoice_convertor.convert()\n",
+    "res.head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "resparser",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

invoice_convertor.py ADDED Viewed

	@@ -0,0 +1,84 @@

+import PyPDF2, os, re
+import pandas as pd
+class InvoiceConvertor():
+    """
+    This class is hardcoded to read all pdf files that start with 'invoice' in the given user given path and convert them to a csv file.
+    Usage:
+    convertor = InvoiceConvertor()
+    convertor.read_pdfs('path_to_pdfs')
+    result_df = convertor.convert()
+    """
+    def __init__(self):
+        self.invoices = []
+    def read_pdfs(self,path):
+        for file in os.listdir(path):
+            if file.startswith('invoice'):
+                pdf_file = open(path + file, 'rb')
+                pdf_reader = PyPDF2.PdfReader(pdf_file)
+                text = ''
+                for page_num in range(len(pdf_reader.pages)):
+                    page = pdf_reader.pages[page_num]
+                    text += page.extract_text()
+                pdf_file.close()
+                self.invoices.append(text)
+        return self.invoices
+    def save_as_csv(self, details, save_as = "invoice.csv"):
+        # if the csv already exists then concat a new one to it, else create a new one
+        if os.path.exists(save_as):
+            df = pd.read_csv(save_as)
+            df = pd.concat([df, pd.DataFrame(details, index=[0])], ignore_index=True)
+        else:
+            df = pd.DataFrame(details, index=[0])
+        df.to_csv(save_as, index=False)
+    def extract_invoice_details(self, text):
+        invoice_details = {}
+        try:
+            invoice_details['Order Number'] = re.search(r'Order Number: (\S+)', text).group(1)
+            invoice_details['Invoice Number'] = re.search(r'Invoice Number : (\S+)', text).group(1)
+            invoice_details['Order Date'] = re.search(r'Order Date: (\d{2}\.\d{2}\.\d{4})', text).group(1)
+            invoice_details['Invoice Details'] = re.search(r'Invoice Details : (\S+)', text).group(1)
+            invoice_details['Invoice Date'] = re.search(r'Invoice Date : (\d{2}\.\d{2}\.\d{4})', text).group(1)
+            invoice_details['Billing Address'] = re.search(r'Billing Address :([\s\S]+?)Shipping Address :', text).group(1).strip()
+            invoice_details['Shipping Address'] = re.search(r'Shipping Address :([\s\S]+?)Place of supply:', text).group(1).strip()
+            invoice_details['PAN'] = re.search(r'PAN No:(\S+)', text).group(1)
+        except:
+            print('Order Number not found')
+        item_match = re.search(r'1([\s\S]+?)TOTAL:', text, re.DOTALL)
+        if item_match:
+            item_info = item_match.group(1)
+            item_name = re.search(r'\nAmount\n1([\s\S]+?)₹', item_info).group(1).strip()
+            invoice_details['Item'] = item_name
+            # print(item_name)
+        else:
+            print("No item found in the invoice.")
+        total_mount_match = re.search(r'TOTAL:([\s\S]+?)only', text, re.DOTALL)
+        if total_mount_match:
+            total_mount = total_mount_match.group(1).split('₹')[2].split('\n')[0]
+            invoice_details['Total Amount'] = total_mount
+        else:
+            print("No total amount found in the invoice.")
+        gstin_match = re.search(r'GST Registration No: ([\s\S]+?) ', text)
+        if gstin_match:
+            invoice_details['GSTIN'] = gstin_match.group(1).strip()
+        else:
+            print("No GSTIN found in the invoice.")
+        by_match = re.search(r'By :([\s\S]+?)PAN No:', text)
+        if by_match:
+            invoice_details['Sold By'] = by_match.group(1).strip()
+        else:
+            print("No seller found in the invoice.")
+        return invoice_details
+    def convert(self):
+        for invoice in self.invoices:
+            details = self.extract_invoice_details(invoice)
+            self.save_as_csv(details)
+        return pd.read_csv('invoice.csv')

invoices/invoice1.pdf ADDED Viewed

Binary file (48.3 kB). View file

invoices/invoice2.pdf ADDED Viewed

Binary file (48.4 kB). View file

invoices/invoice3.pdf ADDED Viewed

Binary file (54.2 kB). View file

invoices/invoice4.pdf ADDED Viewed

Binary file (103 kB). View file

invoices/invoice5.pdf ADDED Viewed

Binary file (48 kB). View file

invoices/invoice7.pdf ADDED Viewed

Binary file (50.2 kB). View file

invoices/invoice8.pdf ADDED Viewed

Binary file (43.9 kB). View file

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+streamlit==1.32.2
+pyPDF2==3.0.1
+pandas==1.3.5
+regex==2023.12.25