texh: datapreprocessing

Files changed (7) hide show

.gitignore +1 -1
Dataset/transaction_data.csv +0 -0
datapreprocessing.py +80 -0
main.py +0 -6
requirements.txt +0 -0
requirenments.txt +8 -0
setup.md +12 -5

.gitignore CHANGED Viewed

	@@ -1 +1 @@
1	- ~~venv~~


1	+ transactify_venv

Dataset/transaction_data.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

datapreprocessing.py ADDED Viewed

	@@ -0,0 +1,80 @@

+# Import required libraries:
+import re
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import tensorflow
+import keras
+import torch
+from transformers import BertTokenizer
+from sklearn.preprocessing import LabelEncoder
+# Read the data.
+def read_data(path):
+    try:
+        df=pd.read_csv(path)
+        return df
+    except FileNotFoundError:
+        print("File not found")
+data=read_data(r"E:\transactify\Dataset\transaction_data.csv")
+if data is not None:
+    print(data.head(15))
+# cleaning the text...
+def clean_text(text):
+    text=text.lower()                   # converting uppercase to lowercase
+    text=re.sub(r"\d+"," ",text)        # Removing digits in the text
+    text=re.sub(r"[^\w\s]"," ",text)    # Removing punctuations
+    text=text.strip()                   # Remove extra spaces
+    return text
+def preprocessing_data(df,max_length=20):
+    tokenizer=BertTokenizer.from_pretrained("bert-base-uncased")
+    input_ids=[]
+    attention_masks=[]
+    for description in df["Transaction Description"]:
+        cleaned_text = clean_text(description)
+        # Debugging print statements
+        print(f"Original Description: {description}")
+        print(f"Cleaned Text: {cleaned_text}")
+        # Only tokenize if the cleaned text is not empty
+        if cleaned_text:
+            encoded_dict = tokenizer.encode_plus(
+                cleaned_text,
+                add_special_tokens=True,  # Correct argument
+                max_length=max_length,
+                pad_to_max_length=True,
+                return_attention_mask=True,  # Correct argument
+                return_tensors="pt",
+                truncation=True
+            )
+            input_ids.append(encoded_dict['input_ids'])  # Append input IDs
+            attention_masks.append(encoded_dict['attention_mask'])  # Append attention masks
+        else:
+            print("Cleaned text is empty, skipping...")
+    # Debugging output to check sizes
+    print(f"Total input_ids collected: {len(input_ids)}")
+    print(f"Total attention_masks collected: {len(attention_masks)}")
+    if not input_ids:
+        raise ValueError("No input_ids were collected. Check the cleaning process.")
+    input_ids = torch.cat(input_ids, dim=0)
+    attention_masks = torch.cat(attention_masks, dim=0)
+    labelencoder = LabelEncoder()
+    labels = labelencoder.fit_transform(df["Category"])
+    labels = torch.tensor(labels)
+    return input_ids, attention_masks, labels, labelencoder
+input_ids, attention_masks, labels, labelencoder = preprocessing_data(data)

main.py DELETED Viewed

@@ -1,6 +0,0 @@
-# hello_world.py
-def hello_world():
-    """Print the greeting "Hello World" to standard output."""
-    print("Hello World")
-# Run the greeting only when this file is executed as a script.
-if __name__ == "__main__":
-    hello_world()

requirements.txt DELETED Viewed

File without changes

requirenments.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+numpy
+pandas
+tensorflow
+transformers
+scikit-learn
+torch
+torchvision
+torchaudio

setup.md CHANGED Viewed

@@ -43,10 +43,17 @@ Choose Write Tab (3rd one) / go here https://huggingface.co/settings/tokens/new?
 ## Create Virtual Environment
 ```
-python3 -m venv venv
-source venv/bin/activate
-```
-## FYI
-> We initially started with `pip install transformers datasets`

 ## Create Virtual Environment
 ```
+Create a virtual environment for the Transactify project:
+python -m venv transactify_venv
+To activate the environment (Windows):
+open cmd and type
+     >> cd transactify_venv
+     >> cd Scripts
+     >> activate
+```
+## Installing Required Libraries
+To install the required libraries,
+open cmd and
+type >> pip install -r requirenments.txt