Spaces:

Bhaskar2611
/

BankStatement_Parser

Sleeping

App Files Files Community

Bhaskar2611 committed on Jun 8, 2025

Commit

ff610ff

verified ·

1 Parent(s): 52ebfdc

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -33

app.py CHANGED Viewed

@@ -3,42 +3,40 @@ import gradio as gr
 import pdfplumber
 import pytesseract
 from PIL import Image
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import pandas as pd
-import torch
-# Load Hugging Face token from environment
-hf_token = os.getenv("HF_TOKEN")  # Set this in Space Secrets
-# Load Mistral-7B-Instruct with authentication and fast tokenizer
-model_name = "mistralai/Mistral-7B-Instruct-v0.3"
-try:
-    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token, use_fast=True)
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        torch_dtype=torch.float16,
-        token=hf_token
-    )
-    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=500)
-except Exception as e:
-    raise RuntimeError("Failed to load model. Ensure you have access to the gated repository and a valid HF_TOKEN.") from e
-# Text extraction from PDF
 def extract_text_from_pdf(pdf_path, is_scanned=False):
-    text = ""
-    if is_scanned:
-        images = convert_from_path(pdf_path)  # Requires pdf2image
-        for image in images:
-            text += pytesseract.image_to_string(image)
-    else:
         with pdfplumber.open(pdf_path) as pdf:
             for page in pdf.pages:
-                text += page.extract_text()
-    return text
 # Prompt engineering for structured extraction
 def parse_bank_statement(text):
     prompt = f"""
     Extract the following details from the bank statement text:
     - Transaction Date
@@ -66,16 +64,56 @@ def parse_bank_statement(text):
     }}
     Bank Statement Text:
-    {text}
     """
-    response = pipe(prompt)[0]["generated_text"]
-    return response  # In production, parse JSON programmatically
 # Main function
 def process_file(file, is_scanned):
     file_path = file.name
-    text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
     parsed_data = parse_bank_statement(text)
     df = pd.DataFrame(parsed_data["transactions"])
     return df
@@ -88,7 +126,9 @@ interface = gr.Interface(
     ],
     outputs=gr.Dataframe(label="Extracted Transactions"),
     title="Bank Statement Parser",
-    description="Convert PDF/Excel bank statements into structured data using Mistral-7B."
 )
-interface.launch()

 import pdfplumber
 import pytesseract
 from PIL import Image
+from pdf2image import convert_from_path
 import pandas as pd
+import numpy as np
+import re
+# For Excel files
def extract_excel_data(file_path):
    """Load an Excel workbook and return its contents rendered as text.

    Args:
        file_path: Path to the workbook; it is read with the ``openpyxl``
            engine.

    Returns:
        The string produced by ``DataFrame.to_string()`` for the frame that
        ``pandas.read_excel`` loads from ``file_path``.
    """
    workbook_frame = pd.read_excel(file_path, engine='openpyxl')
    rendered = workbook_frame.to_string()
    return rendered
+# For PDF files with fallback OCR
 def extract_text_from_pdf(pdf_path, is_scanned=False):
     """Extract text from a PDF, falling back to OCR when needed.

     Native extraction with pdfplumber is tried first unless ``is_scanned``
     is true. OCR (pdf2image + pytesseract) is used when the caller marks
     the file as scanned, when native extraction raises, or when native
     extraction produces no text at all.

     Args:
         pdf_path: Path to the PDF file.
         is_scanned: When true, skip native extraction and OCR the rendered
             pages directly.

     Returns:
         The text of every page, each followed by a newline, concatenated
         in page order.
     """
     if not is_scanned:
         try:
             # First try native PDF extraction
             with pdfplumber.open(pdf_path) as pdf:
                 # NOTE(review): extract_text() appears to return None for
                 # pages without a text layer in some pdfplumber releases;
                 # normalise to "" so the concatenation cannot raise.
                 page_texts = [page.extract_text() or "" for page in pdf.pages]
         except Exception as e:
             # Fallback to OCR if PDF is invalid
             print(f"Native PDF extraction failed: {str(e)}")
             print("Trying OCR fallback...")
         else:
             if any(chunk.strip() for chunk in page_texts):
                 return "".join(chunk + "\n" for chunk in page_texts)
             # Nothing extractable natively (likely image-only pages).
             print("Trying OCR fallback...")

     images = convert_from_path(pdf_path, dpi=200)
     return "".join(pytesseract.image_to_string(image) + "\n" for image in images)
 # Prompt engineering for structured extraction
 def parse_bank_statement(text):
+    # Clean up text from PDF/OCR artifacts
+    cleaned_text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
     prompt = f"""
     Extract the following details from the bank statement text:
     - Transaction Date
     }}
     Bank Statement Text:
+    {cleaned_text}
     """
+    # Simulate LLM response with deterministic parsing for demo
+    # Replace this with actual LLM inference in production
+    return simulate_llm_parsing(cleaned_text)
def simulate_llm_parsing(text):
    """Mock LLM response for demo purposes.

    Finds the first line whose stripped content is ``Date`` and treats the
    lines after it as consecutive 7-line records. Each complete record
    becomes one transaction dict; a trailing partial record is dropped.

    Args:
        text: Statement text with one field per line.

    Returns:
        ``{"transactions": [...]}`` where each entry has the keys ``date``,
        ``description``, ``amount``, ``debit_credit``, ``closing_balance``
        and ``expense_type`` (all stripped strings). The list is empty when
        no ``Date`` line exists or no complete record follows it.
    """
    record_len = 7  # lines consumed per transaction
    lines = text.split('\n')

    # Compare stripped lines so padding or a trailing '\r' (CRLF input) does
    # not hide the header; a missing header yields no transactions instead
    # of raising ValueError.
    header_at = next(
        (idx for idx, line in enumerate(lines) if line.strip() == 'Date'),
        None,
    )
    if header_at is None:
        return {"transactions": []}

    data_lines = lines[header_at + 1:]
    transactions = []
    # Only start positions with a full record available are visited.
    for start in range(0, len(data_lines) - record_len + 1, record_len):
        record = [field.strip() for field in data_lines[start:start + record_len]]
        # NOTE(review): the fifth line of each record (offset 4) is never
        # read; presumably a column the demo ignores -- confirm.
        transactions.append({
            "date": record[0],
            "description": record[1],
            "amount": record[2],
            "debit_credit": record[3],
            "closing_balance": record[5],
            "expense_type": record[6],
        })
    return {"transactions": transactions}
 # Main function
 def process_file(file, is_scanned):
     """Parse an uploaded statement into a DataFrame of transactions.

     Args:
         file: Uploaded file object; its ``name`` attribute is used as the
             path on disk. ``None`` is treated as "nothing uploaded".
         is_scanned: Forwarded to ``extract_text_from_pdf`` for PDF inputs.

     Returns:
         A ``pandas.DataFrame`` built from the parsed ``"transactions"``
         list.

     Raises:
         gr.Error: If no file was provided or the extension is neither
             ``.xlsx`` nor ``.pdf``.
     """
     if file is None:
         raise gr.Error("Unsupported file format. Please upload PDF or Excel.")

     file_path = file.name
     file_ext = os.path.splitext(file_path)[1].lower()
     if file_ext == '.xlsx':
         text = extract_excel_data(file_path)
     elif file_ext == '.pdf':
         text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
     else:
         # The output component is a Dataframe; a returned str would be
         # treated as a CSV path rather than shown as a message, so surface
         # the problem as a UI error instead.
         raise gr.Error("Unsupported file format. Please upload PDF or Excel.")

     parsed_data = parse_bank_statement(text)
     # Convert to DataFrame for display
     return pd.DataFrame(parsed_data["transactions"])
     ],
     outputs=gr.Dataframe(label="Extracted Transactions"),
     title="Bank Statement Parser",
+    description="Convert PDF/Excel bank statements into structured data using hybrid parsing techniques.",
+    allow_flagging="never"
 )
+if __name__ == "__main__":
+    interface.launch()