alfiinyang committed on
Commit
fb845b9
·
verified ·
1 Parent(s): 32a8ccd

Upload 4 files

Browse files

files to create a python package with.

Files changed (4) hide show
  1. requirements.txt +4 -0
  2. setup.py +18 -0
  3. txt2xl/__init__.py +6 -0
  4. txt2xl/txt2xl.py +84 -0
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ transformers==4.44.2
2
+ torch==2.4.1+cu121
3
+ pandas==2.1.4
4
+ requests==2.32.3
setup.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from setuptools import setup, find_packages
2
+
3
# Build/installation metadata for the txt2xl distribution.
setup(
    name="txt2xl",
    version="0.1.2",
    description="Text classification Python functions for txt2xl",
    author="alfiinyang",
    author_email="alfiinyang@gmail.com",
    # The importable package is the ``txt2xl/`` directory that sits next to
    # this file, so discover it from the project root. (The previous
    # ``classifier_code`` include/package_dir pointed at a directory that is
    # not part of this project, which results in no package being installed.)
    packages=find_packages(include=["txt2xl", "txt2xl.*"]),
    install_requires=[
        "transformers==4.44.2",
        # Pin the public version only: a local build tag such as ``+cu121``
        # is not published on PyPI, while ``==2.4.1`` is still satisfied by
        # an already-installed CUDA build (2.4.1+cu121).
        "torch==2.4.1",
        "pandas==2.1.4",
        "requests==2.32.3",
    ],
    # pandas 2.1.x requires Python 3.9 or newer, so advertise the same floor.
    python_requires=">=3.9",
)
txt2xl/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
# txt2xl/__init__.py
"""Public entry points of the txt2xl package."""

__version__ = "0.1.2"
__author__ = "alfiinyang"

# Re-export the two public helpers so callers can write
# ``from txt2xl import txt2xl, classify_``.
# NOTE: importing ``.txt2xl`` loads the classifier model and downloads the
# label map at import time, so this import performs network I/O.
from .txt2xl import classify_, txt2xl

# Names exported by ``from txt2xl import *``.
__all__ = ["classify_", "txt2xl"]
txt2xl/txt2xl.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import libraries
2
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
3
+ import torch
4
+ import json
5
+ import requests
6
+ import pandas as pd
7
+ import re
8
+
9
# Load the model and its tokenizer from the Hugging Face Hub.
# NOTE: this runs at import time, so importing this module performs I/O.
tokenizer = AutoTokenizer.from_pretrained('alfiinyang/txt2xl_classifier_model')
model = AutoModelForSequenceClassification.from_pretrained('alfiinyang/txt2xl_classifier_model')

# Download the label map; classify_ looks labels up by the predicted class
# index converted to a string.
# NOTE(review): this URL points at the 'txt2xl_classifier' repo while the
# model above is loaded from 'txt2xl_classifier_model' - confirm the
# difference is intentional.
url = "https://huggingface.co/alfiinyang/txt2xl_classifier/resolve/main/label_map.json"
# The timeout stops a stalled connection from hanging the import forever.
response = requests.get(url, timeout=30)
# Fail with a clear HTTP error instead of an opaque JSON decode failure when
# the server answers with an error page.
response.raise_for_status()
label_map = json.loads(response.text)
16
+
17
# Classify a new description with the loaded model
def classify_(description):
    """Return the label that the model predicts for ``description``.

    The text is tokenized (padded/truncated to 45 tokens), passed through
    the module-level ``model``, and the index of the highest logit is
    looked up in the module-level ``label_map``.
    """
    # Tokenize into PyTorch tensors of fixed length.
    encoded = tokenizer.encode_plus(
        description,
        add_special_tokens=True,
        max_length=45,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        logits = model(
            encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
        ).logits

    # Position of the largest logit along the class dimension.
    best_class = torch.max(logits, dim=1).indices

    # label_map is keyed by the class index as a string.
    return label_map[str(best_class.item())]
37
+
38
def _parse_entry(entry):
    """Split one entry line into a ``(description, cost)`` pair of strings."""
    # Lines shaped like "<description> - <amounts>" carry their own cost text.
    priced = re.search(r'([\w\s,()]+) - ([\d, ]+)', entry)
    if priced:
        return priced.group(1), priced.group(2)

    # Otherwise the line is a bare description; fall back to the raw text
    # when it contains nothing the description pattern can match.
    plain = re.search(r'([\w\s,()]+)', entry)
    desc = plain.group(1) if plain else entry.strip()

    if desc.lower().strip().endswith('cash out'):
        # The first number on the line gets three zeros appended; a line
        # without any number falls back to a cost of '0' instead of crashing.
        digits = re.search(r'\d+', entry)
        cost = digits.group() + '000' if digits else '0'
        return 'POS cash out', cost

    return desc, '0'


def txt2xl(input_text):
    """Convert plain-text transaction notes into a classified DataFrame.

    ``input_text`` is split into blocks on blank lines (``'\\n\\n'``). Each
    block must contain a ``DD/MM/YYYY`` date followed by one entry per line,
    either ``"<description> - <amounts>"`` or a bare description. Blank
    blocks and blank entry lines are skipped.

    Returns a ``pandas.DataFrame`` with the columns ``DATE``, ``COMMENT``,
    ``CREDIT``, ``DEBIT``, ``SOURCE`` and ``CATEGORY``. The cost is stored as
    an ``'=SUM(...)'`` string in ``CREDIT`` for ``'POS cash out'`` rows and
    in ``DEBIT`` for every other row. ``SOURCE`` is left empty and
    ``CATEGORY`` is filled by calling ``classify_`` on each comment.

    Raises:
        ValueError: if a non-blank block contains no ``DD/MM/YYYY`` date.
    """
    date_pattern = r'\d{2}/\d{2}/\d{4}'
    data = []

    for block in input_text.split('\n\n'):
        # Extra blank lines or an empty input produce empty blocks.
        if not block.strip():
            continue

        found = re.search(date_pattern, block)
        if found is None:
            raise ValueError(f"No DD/MM/YYYY date found in block: {block!r}")
        date = found.group()

        # Entries start after the date and the single character following
        # it. Slicing from the match end (rather than assuming the date sits
        # at offset 0) keeps this correct when text precedes the date.
        body = block[found.end() + 1:].strip()

        for entry in body.split('\n'):
            if not entry.strip():
                continue
            item, cost = _parse_entry(entry)
            total_cost = '=SUM(' + cost + ')'
            if item == 'POS cash out':
                data.append([date, item, total_cost, '', '', ''])
            else:
                data.append([date, item, '', total_cost, '', ''])

    # Store entries in a DataFrame
    new_df = pd.DataFrame(
        data,
        columns=['DATE', 'COMMENT', 'CREDIT', 'DEBIT', 'SOURCE', 'CATEGORY'],
    )
    # Dates are written day-first; keep only the calendar date.
    new_df['DATE'] = pd.to_datetime(new_df.DATE, dayfirst=True)
    new_df['DATE'] = new_df.DATE.dt.date

    # Classify transactions
    new_df['CATEGORY'] = new_df.COMMENT.map(classify_)

    return new_df