Documents-Manager

Sleeping

App Files Files Community

rairo committed on Jan 30, 2025

Commit

4d6b816

verified ·

1 Parent(s): b182938

Update app.py

Browse files

Files changed (1) hide show

app.py +78 -135

app.py CHANGED Viewed

@@ -1,155 +1,98 @@
 import re
 import pandas as pd
 import streamlit as st
-from datetime import datetime
 import pypdf
 def read_pdf(file_path):
     text_content = []
     with open(file_path, 'rb') as file:
         pdf_reader = pypdf.PdfReader(file)
-        for page_num in range(len(pdf_reader.pages)):
-            page = pdf_reader.pages[page_num]
             text = page.extract_text()
             if text:
                 text_content.append(text)
-    return text_content
-def preprocess_text(text_pages):
-    full_text = '\n'.join(text_pages)
-    return full_text
-def parse_amount(amount_str):
-    if not amount_str:
-        return 0.0
-    amount_str = amount_str.replace(' ', '').replace(',', '.')
-    if '-' in amount_str:
-        amount_str = amount_str.replace('-', '')
-        return -float(amount_str)
-    return float(amount_str)
-def extract_transactions(text):
-    lines = text.split('\n')
-    transactions = []
-    current_transaction = None
-    for line in lines:
-        date_match = re.match(r'^(\d{1,2}/\d{2}/\d{4})', line)
-        if date_match:
-            if current_transaction:
-                transactions.append(current_transaction)
-            date_str = date_match.group(1)
-            remaining_line = line[len(date_str):].strip()
-            parts = remaining_line.split()
-            charge_code = None
-            debit = None
-            credit = None
-            balance = None
-            description_parts = []
-            i = 0
-            while i < len(parts):
-                part = parts[i]
-                if part in ('A', 'C', 'M', 'S', 'T', 'V'):
-                    charge_code = part
-                    i += 1
-                    break
-                if re.match(r'^[\d\.,-]+$', part):
-                    break
-                description_parts.append(part)
-                i += 1
-            description = ' '.join(description_parts).strip()
-            amount_parts = parts[i:]
-            if amount_parts:
-                try:
-                    balance = parse_amount(amount_parts[-1])
-                    amount_parts = amount_parts[:-1]
-                except:
-                    balance = None
-                for amt in amount_parts:
-                    if ' ' in amt or ',' in amt or '.' in amt:
-                        if debit is None:
-                            debit = parse_amount(amt)
-                        else:
-                            credit = parse_amount(amt)
-            current_transaction = {
-                'Date': date_str,
-                'Description': description,
-                'Charge Code': charge_code,
-                'Debit': debit if debit != 0 else None,
-                'Credit': credit if credit != 0 else None,
-                'Balance': balance
             }
-        else:
-            if current_transaction:
-                current_transaction['Description'] += ' ' + line.strip()
-    if current_transaction:
-        transactions.append(current_transaction)
-    data = []
-    for t in transactions:
-        date = datetime.strptime(t['Date'], '%d/%m/%Y').strftime('%d/%m/%Y')
-        desc = t['Description']
-        charge_code = t['Charge Code']
-        debit = t['Debit']
-        credit = t['Credit']
-        if charge_code:
-            if debit is not None:
-                data.append({
-                    'Date': date,
-                    'Description': desc,
-                    'Amount': -abs(debit),
-                    'Type': 'bank charge'
-                })
-        else:
-            if debit is not None and debit < 0:
-                data.append({
-                    'Date': date,
-                    'Description': desc,
-                    'Amount': debit,
-                    'Type': 'debit amount'
-                })
-            elif debit is not None and debit > 0:
-                data.append({
-                    'Date': date,
-                    'Description': desc,
-                    'Amount': -debit,
-                    'Type': 'debit amount'
-                })
-            if credit is not None and credit > 0:
-                data.append({
-                    'Date': date,
-                    'Description': desc,
-                    'Amount': credit,
-                    'Type': 'credit amount'
-                })
-    df = pd.DataFrame(data)
-    return df
 def main():
-    st.title("Bank Statement Parser")
     uploaded_file = st.file_uploader("Upload a PDF bank statement", type="pdf")
-    if uploaded_file is not None:
-        with open("temp.pdf", "wb") as f:
-            f.write(uploaded_file.getbuffer())
-        text_content = read_pdf("temp.pdf")
-        processed_text = preprocess_text(text_content)
-        transactions_df = extract_transactions(processed_text)
-        if not transactions_df.empty:
             st.write("### Extracted Transactions")
-            st.dataframe(transactions_df)
-        else:
-            st.write("No transactions found.")
 if __name__ == "__main__":
     main()

 import re
 import pandas as pd
 import streamlit as st
+import google.generativeai as genai
 import pypdf
+import json
+from datetime import datetime
+api_key = os.environ['Gemini']
+# Configure Gemini
+def configure_gemini(api_key):
+    genai.configure(api_key=api_key)
+    return genai.GenerativeModel('gemini-2.0-flash-exp')
+# Read PDF content
 def read_pdf(file_path):
     text_content = []
     with open(file_path, 'rb') as file:
         pdf_reader = pypdf.PdfReader(file)
+        for page in pdf_reader.pages:
             text = page.extract_text()
             if text:
                 text_content.append(text)
+    return "\n".join(text_content)
+# Process text with Gemini
+def process_with_gemini(model, text):
+    prompt = """Analyze this bank statement and extract transactions in JSON format with these fields:
+    - Date (format DD/MM/YYYY)
+    - Description
+    - Amount (positive for credits, negative for debits)
+    - Type (either 'debit amount', 'credit amount', or 'bank charge')
+    Return ONLY valid JSON with this structure:
+    {
+        "transactions": [
+            {
+                "Date": "string",
+                "Description": "string",
+                "Amount": number,
+                "Type": "string"
             }
+        ]
+    }"""
+    response = model.generate_content([prompt, text])
+    return response.text
+# Main Streamlit app
 def main():
+    st.title("Bank Statement Parser with Gemini AI")
+    # API key input
+    api_key = st.text_input("Enter your Gemini API key:", type="password")
     uploaded_file = st.file_uploader("Upload a PDF bank statement", type="pdf")
+    if uploaded_file and api_key:
+        try:
+            # Configure Gemini
+            model = configure_gemini(api_key)
+            # Save and read PDF
+            with open("temp.pdf", "wb") as f:
+                f.write(uploaded_file.getbuffer())
+            pdf_text = read_pdf("temp.pdf")
+            # Process with Gemini
+            with st.spinner("Analyzing statement with Gemini AI..."):
+                json_response = process_with_gemini(model, pdf_text)
+                # Clean JSON response
+                json_str = json_response[json_response.find('{'):json_response.rfind('}')+1]
+                json_str = json_str.replace('```json', '').replace('```', '')
+                data = json.loads(json_str)
+                transactions = data.get('transactions', [])
+                # Create DataFrame
+                df = pd.DataFrame(transactions)
+                # Format amounts
+                if not df.empty:
+                    df['Amount'] = df['Amount'].apply(lambda x: f"R {x:,.2f}" if x >= 0 else f"R ({abs(x):,.2f})")
+                    df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y').dt.strftime('%d/%m/%Y')
+            st.success("Analysis complete!")
             st.write("### Extracted Transactions")
+            st.dataframe(df)
+        except Exception as e:
+            st.error(f"Error processing document: {str(e)}")
+            st.error("Please ensure you're using a valid bank statement PDF and API key")
 if __name__ == "__main__":
     main()