Upload 4 files

Browse files

Files for creating a Python package.

Files changed (4) hide show

requirements.txt +4 -0
setup.py +18 -0
txt2xl/__init__.py +6 -0
txt2xl/txt2xl.py +84 -0

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

# torch is pinned to a CUDA 12.1 build. Wheels carrying the "+cu121" local
# version label are served from the PyTorch index rather than PyPI, so pip
# needs the extra index below to resolve the pin.
--extra-index-url https://download.pytorch.org/whl/cu121
transformers==4.44.2
torch==2.4.1+cu121
pandas==2.1.4
requests==2.32.3

setup.py ADDED Viewed

	@@ -0,0 +1,18 @@

"""Packaging script for the ``txt2xl`` distribution."""
from setuptools import find_packages, setup

setup(
    name="txt2xl",
    version="0.1.2",
    description="Text classification Python functions for txt2xl",
    author="alfiinyang",
    author_email="alfiinyang@gmail.com",
    # The importable package is the ``txt2xl/`` directory next to this file.
    # Filtering on (or remapping the root to) a ``classifier_code`` directory
    # that does not exist would build a distribution with no packages in it.
    packages=find_packages(include=["txt2xl", "txt2xl.*"]),
    install_requires=[
        "transformers==4.44.2",
        # No "+cu121" local version label here: PyPI does not serve local
        # versions, and a plain "==2.4.1" pin still accepts a CUDA build
        # (local labels are ignored when the specifier has none).
        "torch==2.4.1",
        "pandas==2.1.4",
        "requests==2.32.3",
    ],
    # pandas 2.1.4 requires Python 3.9 or newer, so older interpreters could
    # never satisfy the pinned dependencies.
    python_requires=">=3.9",
)

txt2xl/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

# txt2xl/__init__.py
"""Public interface of the ``txt2xl`` package.

Re-exports ``classify_`` and ``txt2xl`` from the ``txt2xl.txt2xl`` module.
"""
__version__ = "0.1.2"
__author__ = "alfiinyang"

from .txt2xl import classify_, txt2xl

# Names exported by ``from txt2xl import *``.
__all__ = ["classify_", "txt2xl"]

txt2xl/txt2xl.py ADDED Viewed

	@@ -0,0 +1,84 @@

+# import libraries
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import torch
+import json
+import requests
+import pandas as pd
+import re
# Load the model and its tokenizer from the Hugging Face Hub.
# NOTE: this runs when the module is imported.
_MODEL_ID = 'alfiinyang/txt2xl_classifier_model'
tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(_MODEL_ID)

# Label map used to turn a predicted class index into a category name.
# NOTE(review): this is fetched from the 'txt2xl_classifier' repo, not the
# 'txt2xl_classifier_model' repo used above -- confirm that is intended.
url = "https://huggingface.co/alfiinyang/txt2xl_classifier/resolve/main/label_map.json"
# The timeout keeps a stalled connection from hanging the import forever, and
# raise_for_status() reports an HTTP error as such instead of letting an error
# page fail later inside the JSON parser.
response = requests.get(url, timeout=30)
response.raise_for_status()
label_map = json.loads(response.text)
+# Define a function to classify a new description
def classify_(description):
    """Return the category label the model predicts for ``description``.

    The text is tokenized to a fixed length of 45 tokens (padded or
    truncated), passed through the module-level ``model`` with gradient
    tracking disabled, and the index of the largest logit is looked up in
    the module-level ``label_map``.

    Args:
        description: The description text to classify.

    Returns:
        The ``label_map`` entry stored under the predicted class index
        (the index is converted to a string before the lookup).
    """
    encoding = tokenizer.encode_plus(
        description,
        add_special_tokens=True,
        max_length=45,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(
            encoding['input_ids'],
            attention_mask=encoding['attention_mask'],
        )
    prediction = torch.argmax(outputs.logits, dim=1)
    return label_map[str(prediction.item())]
def txt2xl(input_text):
    """Parse free-text expense notes into a categorised DataFrame.

    ``input_text`` is split into blocks on blank lines. Each block is
    expected to contain a ``dd/mm/yyyy`` date followed by one entry per
    line. An entry is read in one of three ways:

    * ``<description> - <amounts>``: kept with the amounts as written.
    * A line without `` - <amounts>`` whose text ends in ``cash out``:
      recorded as ``POS cash out``; the first run of digits on the line
      gets ``000`` appended and becomes the amount (``0`` if there are no
      digits).
    * Any other line: kept with an amount of ``0``.

    Blocks without a date and lines without any description text are
    skipped.

    Args:
        input_text: The raw notes.

    Returns:
        A DataFrame with the columns DATE, COMMENT, CREDIT, DEBIT, SOURCE
        and CATEGORY. Amounts are stored as ``=SUM(...)`` formula strings,
        in CREDIT for ``POS cash out`` rows and in DEBIT for all other
        rows. SOURCE is left empty and CATEGORY is filled by applying
        ``classify_`` to COMMENT. When nothing can be parsed, an empty
        DataFrame with the same columns is returned and the classifier is
        not called.
    """
    columns = ['DATE', 'COMMENT', 'CREDIT', 'DEBIT', 'SOURCE', 'CATEGORY']
    # Compile once per call instead of re-resolving the patterns per line.
    date_pattern = re.compile(r'\d{2}/\d{2}/\d{4}')
    entry_pattern1 = re.compile(r'([\w\s,()]+)')
    entry_pattern2 = re.compile(r'([\w\s,()]+) - ([\d, ]+)')

    data = []
    for block in input_text.split('\n\n'):
        date_match = date_pattern.search(block)
        if date_match is None:
            # Empty input, trailing blank lines or an undated block.
            continue
        date = date_match.group()

        # Entries start after the date and the one separator character that
        # follows it, wherever in the block the date was found.
        body = block[date_match.end() + 1:].strip()
        for entry in body.split('\n'):
            priced = entry_pattern2.findall(entry)
            if priced:
                desc, cost = priced[0]
            else:
                plain = entry_pattern1.findall(entry)
                if not plain:
                    # Blank line or a line with no description characters.
                    continue
                desc = plain[0]
                cost = '0'
                if desc.lower().strip().endswith('cash out'):
                    desc = 'POS cash out'
                    digits = re.findall(r'\d+', entry)
                    if digits:
                        cost = digits[0] + '000'

            total_cost = '=SUM(' + cost + ')'
            if desc == 'POS cash out':
                data.append([date, desc, total_cost, '', '', ''])
            else:
                data.append([date, desc, '', total_cost, '', ''])

    new_df = pd.DataFrame(data, columns=columns)
    if new_df.empty:
        return new_df

    new_df['DATE'] = pd.to_datetime(new_df['DATE'], dayfirst=True).dt.date
    # Classify transactions
    new_df['CATEGORY'] = new_df['COMMENT'].map(classify_)
    return new_df