Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,9 +8,9 @@ import pytesseract
|
|
| 8 |
from pdf2image import convert_from_path
|
| 9 |
from huggingface_hub import InferenceClient
|
| 10 |
|
| 11 |
-
# Initialize Hugging Face Inference Client
|
| 12 |
hf_token = os.getenv("HF_TOKEN")
|
| 13 |
-
client = InferenceClient(model="
|
| 14 |
|
| 15 |
def extract_excel_data(file_path):
|
| 16 |
"""Extract text from Excel file"""
|
|
@@ -86,11 +86,23 @@ Rules:
|
|
| 86 |
1. Ensure numeric fields have valid numbers (e.g., "0.00" instead of "-")
|
| 87 |
2. Convert negative balances to standard format (e.g., "-2421.72")
|
| 88 |
3. Map category names consistently (e.g., "Groceries", "Medical", "Utilities")
|
|
|
|
| 89 |
"""
|
| 90 |
|
| 91 |
try:
|
| 92 |
# Call LLM via Hugging Face Inference API
|
| 93 |
-
response = client.text_generation(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
return json.loads(response)
|
| 95 |
except Exception as e:
|
| 96 |
print(f"LLM Error: {str(e)}")
|
|
@@ -104,7 +116,7 @@ def rule_based_parser(text):
|
|
| 104 |
# Find header line containing '| Date'
|
| 105 |
header_index = None
|
| 106 |
for i, line in enumerate(lines):
|
| 107 |
-
if re.search(r'\|Date', line):
|
| 108 |
header_index = i
|
| 109 |
break
|
| 110 |
|
|
@@ -115,7 +127,7 @@ def rule_based_parser(text):
|
|
| 115 |
transactions = []
|
| 116 |
|
| 117 |
for line in data_lines:
|
| 118 |
-
if not
|
| 119 |
continue
|
| 120 |
|
| 121 |
parts = [p.strip() for p in line.split('|') if p.strip()]
|
|
@@ -123,13 +135,14 @@ def rule_based_parser(text):
|
|
| 123 |
continue
|
| 124 |
|
| 125 |
try:
|
|
|
|
| 126 |
transactions.append({
|
| 127 |
"date": parts[0],
|
| 128 |
"description": parts[1],
|
| 129 |
-
"amount": parts[2],
|
| 130 |
-
"debit": parts[3],
|
| 131 |
-
"credit": parts[4],
|
| 132 |
-
"closing_balance": parts[5],
|
| 133 |
"category": parts[6]
|
| 134 |
})
|
| 135 |
except Exception as e:
|
|
@@ -137,6 +150,13 @@ def rule_based_parser(text):
|
|
| 137 |
|
| 138 |
return {"transactions": transactions}
|
| 139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
def process_file(file, is_scanned):
|
| 141 |
"""Main processing function"""
|
| 142 |
if not file:
|
|
|
|
| 8 |
from pdf2image import convert_from_path
|
| 9 |
from huggingface_hub import InferenceClient
|
| 10 |
|
| 11 |
+
# Initialize Hugging Face Inference Client with a free model
|
| 12 |
hf_token = os.getenv("HF_TOKEN")
|
| 13 |
+
client = InferenceClient(model="HuggingFaceH4/zephyr-7b-beta", token=hf_token)
|
| 14 |
|
| 15 |
def extract_excel_data(file_path):
|
| 16 |
"""Extract text from Excel file"""
|
|
|
|
| 86 |
1. Ensure numeric fields have valid numbers (e.g., "0.00" instead of "-")
|
| 87 |
2. Convert negative balances to standard format (e.g., "-2421.72")
|
| 88 |
3. Map category names consistently (e.g., "Groceries", "Medical", "Utilities")
|
| 89 |
+
4. Only return valid JSON with no additional text
|
| 90 |
"""
|
| 91 |
|
| 92 |
try:
|
| 93 |
# Call LLM via Hugging Face Inference API
|
| 94 |
+
response = client.text_generation(
|
| 95 |
+
prompt,
|
| 96 |
+
max_new_tokens=2000,
|
| 97 |
+
temperature=0.1,
|
| 98 |
+
stop_sequences=["</s>"]
|
| 99 |
+
)
|
| 100 |
+
print(f"LLM Response: {response}")
|
| 101 |
+
|
| 102 |
+
# Extract JSON from response (remove non-JSON prefixes/suffixes)
|
| 103 |
+
json_match = re.search(r'\{.*\}', response, re.DOTALL)
|
| 104 |
+
if json_match:
|
| 105 |
+
return json.loads(json_match.group())
|
| 106 |
return json.loads(response)
|
| 107 |
except Exception as e:
|
| 108 |
print(f"LLM Error: {str(e)}")
|
|
|
|
| 116 |
# Find header line containing '| Date'
|
| 117 |
header_index = None
|
| 118 |
for i, line in enumerate(lines):
|
| 119 |
+
if re.search(r'\|Date|Date\|', line, re.IGNORECASE):
|
| 120 |
header_index = i
|
| 121 |
break
|
| 122 |
|
|
|
|
| 127 |
transactions = []
|
| 128 |
|
| 129 |
for line in data_lines:
|
| 130 |
+
if not '|' in line:
|
| 131 |
continue
|
| 132 |
|
| 133 |
parts = [p.strip() for p in line.split('|') if p.strip()]
|
|
|
|
| 135 |
continue
|
| 136 |
|
| 137 |
try:
|
| 138 |
+
# Handle numeric values consistently
|
| 139 |
transactions.append({
|
| 140 |
"date": parts[0],
|
| 141 |
"description": parts[1],
|
| 142 |
+
"amount": format_number(parts[2]),
|
| 143 |
+
"debit": format_number(parts[3]),
|
| 144 |
+
"credit": format_number(parts[4]),
|
| 145 |
+
"closing_balance": format_number(parts[5]),
|
| 146 |
"category": parts[6]
|
| 147 |
})
|
| 148 |
except Exception as e:
|
|
|
|
| 150 |
|
| 151 |
return {"transactions": transactions}
|
| 152 |
|
| 153 |
+
def format_number(value):
    """Normalize a numeric string to two decimal places.

    Thousands separators (commas) are ignored when deciding whether the
    text is numeric, so "1,234.5" becomes "1234.50".

    Args:
        value: Cell text taken from a parsed table row.

    Returns:
        The number formatted with exactly two decimals when ``value`` is a
        plain (optionally negative) decimal number; otherwise ``value``
        exactly as it was passed in.
    """
    # Strip separators into a separate name so the caller's text is not
    # altered when it turns out not to be a number (e.g. "Rent, utilities").
    candidate = value.replace(',', '')
    if re.match(r'^-?\d+(\.\d+)?$', candidate):
        return f"{float(candidate):.2f}"
    return value
|
| 159 |
+
|
| 160 |
def process_file(file, is_scanned):
|
| 161 |
"""Main processing function"""
|
| 162 |
if not file:
|