Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -81,10 +81,10 @@ def rule_based_parser(text):
|
|
| 81 |
"""Fallback parser for structured tables with pipe delimiters"""
|
| 82 |
lines = [line.strip() for line in text.split('\n') if line.strip()]
|
| 83 |
|
| 84 |
-
# Find header line containing 'Date'
|
| 85 |
header_index = None
|
| 86 |
for i, line in enumerate(lines):
|
| 87 |
-
if re.search(r'\
|
| 88 |
header_index = i
|
| 89 |
break
|
| 90 |
|
|
@@ -120,7 +120,10 @@ def rule_based_parser(text):
|
|
| 120 |
def process_file(file, is_scanned):
|
| 121 |
"""Main processing function"""
|
| 122 |
if not file:
|
| 123 |
-
return
|
|
|
|
|
|
|
|
|
|
| 124 |
|
| 125 |
file_path = file.name
|
| 126 |
file_ext = os.path.splitext(file_path)[1].lower()
|
|
@@ -131,13 +134,32 @@ def process_file(file, is_scanned):
|
|
| 131 |
elif file_ext == '.pdf':
|
| 132 |
text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
|
| 133 |
else:
|
| 134 |
-
return
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
parsed_data = parse_bank_statement(text)
|
| 137 |
df = pd.DataFrame(parsed_data["transactions"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
return df
|
|
|
|
| 139 |
except Exception as e:
|
| 140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
# Gradio Interface
|
| 143 |
interface = gr.Interface(
|
|
|
|
| 81 |
"""Fallback parser for structured tables with pipe delimiters"""
|
| 82 |
lines = [line.strip() for line in text.split('\n') if line.strip()]
|
| 83 |
|
| 84 |
+
# Find header line containing '|Date' (a pipe immediately followed by "Date")
|
| 85 |
header_index = None
|
| 86 |
for i, line in enumerate(lines):
|
| 87 |
+
if re.search(r'\|Date', line): # Improved pattern to match "|Date"
|
| 88 |
header_index = i
|
| 89 |
break
|
| 90 |
|
|
|
|
| 120 |
def process_file(file, is_scanned):
|
| 121 |
"""Main processing function"""
|
| 122 |
if not file:
|
| 123 |
+
return pd.DataFrame(columns=[
|
| 124 |
+
"Date", "Description", "Amount", "Debit",
|
| 125 |
+
"Credit", "Closing Balance", "Category"
|
| 126 |
+
])
|
| 127 |
|
| 128 |
file_path = file.name
|
| 129 |
file_ext = os.path.splitext(file_path)[1].lower()
|
|
|
|
| 134 |
elif file_ext == '.pdf':
|
| 135 |
text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
|
| 136 |
else:
|
| 137 |
+
return pd.DataFrame(columns=[
|
| 138 |
+
"Date", "Description", "Amount", "Debit",
|
| 139 |
+
"Credit", "Closing Balance", "Category"
|
| 140 |
+
])
|
| 141 |
|
| 142 |
parsed_data = parse_bank_statement(text)
|
| 143 |
df = pd.DataFrame(parsed_data["transactions"])
|
| 144 |
+
|
| 145 |
+
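# Ensure all required columns exist (missing ones are appended as empty strings).
# NOTE(review): the positional rename below assumes df's columns are exactly the
# required ones, in required_cols order - confirm, or select df[required_cols] first.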
|
| 146 |
+
required_cols = ["date", "description", "amount", "debit",
|
| 147 |
+
"credit", "closing_balance", "category"]
|
| 148 |
+
for col in required_cols:
|
| 149 |
+
if col not in df.columns:
|
| 150 |
+
df[col] = ""
|
| 151 |
+
|
| 152 |
+
df.columns = ["Date", "Description", "Amount", "Debit",
|
| 153 |
+
"Credit", "Closing Balance", "Category"]
|
| 154 |
return df
|
| 155 |
+
|
| 156 |
except Exception as e:
|
| 157 |
+
print(f"Processing error: {str(e)}")
|
| 158 |
+
# Return empty DataFrame with correct columns on error
|
| 159 |
+
return pd.DataFrame(columns=[
|
| 160 |
+
"Date", "Description", "Amount", "Debit",
|
| 161 |
+
"Credit", "Closing Balance", "Category"
|
| 162 |
+
])
|
| 163 |
|
| 164 |
# Gradio Interface
|
| 165 |
interface = gr.Interface(
|