John Best committed on
Commit 37a267f · 1 Parent(s): 65db162

First Draft

Files changed (8)
  1. About.MD +0 -0
  2. app.py +23 -0
  3. data.csv +0 -0
  4. finetune.py +42 -0
  5. requirements.txt +4 -0
  6. run_all.py +10 -0
  7. train.py +15 -0
  8. training_data.txt +0 -0
About.MD ADDED
File without changes
app.py ADDED
@@ -0,0 +1,23 @@
+ import streamlit as st
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ # Load the fine-tuned model and tokenizer
+ model_name = "your_fine_tuned_model_directory"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForCausalLM.from_pretrained(model_name)
+
+ def generate_response(prompt):
+     inputs = tokenizer.encode(prompt, return_tensors="pt")
+     outputs = model.generate(inputs, max_length=150, num_return_sequences=1)
+     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     return response
+
+ st.title("Fine-Tuned Personal Finance Assistant")
+
+ prompt = st.text_input("Ask a question:")
+ response = ""
+
+ if st.button("Generate"):
+     response = generate_response(prompt)
+
+ st.write(response)
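Note on wiring: app.py loads from the placeholder path "your_fine_tuned_model_directory", while finetune.py (below) saves the model to "trained_model" and the tokenizer to "trained_tokenizer". A minimal sketch of pointing the app at those outputs, assuming the default relative paths produced by finetune.py (nothing else here is part of this commit):

    from transformers import AutoTokenizer, AutoModelForCausalLM

    # Assumed paths: these match the save_pretrained() calls at the end of finetune.py
    tokenizer = AutoTokenizer.from_pretrained("trained_tokenizer")
    model = AutoModelForCausalLM.from_pretrained("trained_model")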
data.csv ADDED
The diff for this file is too large to render. See raw diff
 
finetune.py ADDED
@@ -0,0 +1,42 @@
+ import torch
+ from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
+
+ # Load the pre-trained model and its configuration
+ config = AutoConfig.from_pretrained("togethercomputer/GPT-JT-6B-v1", output_hidden_states=True)
+ model = AutoModelForCausalLM.from_pretrained("togethercomputer/GPT-JT-6B-v1", config=config)
+ tokenizer = AutoTokenizer.from_pretrained("togethercomputer/GPT-JT-6B-v1")
+
+ # Prepare the dataset
+ train_dataset = TextDataset(
+     tokenizer=tokenizer,
+     file_path="training_data.txt",
+     block_size=128
+ )
+
+ data_collator = DataCollatorForLanguageModeling(
+     tokenizer=tokenizer, mlm=False,
+ )
+
+ # Configure the training arguments
+ training_args = TrainingArguments(
+     output_dir="models",
+     overwrite_output_dir=True,
+     num_train_epochs=3,
+     per_device_train_batch_size=4,
+     save_steps=10_000,
+     save_total_limit=2,
+ )
+
+ # Fine-tune the model
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     data_collator=data_collator,
+     train_dataset=train_dataset,
+ )
+
+ trainer.train()
+
+ # Save the fine-tuned model
+ model.save_pretrained("trained_model")
+ tokenizer.save_pretrained("trained_tokenizer")
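TextDataset and DataCollatorForLanguageModeling still work with the transformers 4.27.1 pin in requirements.txt, although TextDataset is deprecated in that release. A rough sketch of the same data pipeline using the Hugging Face datasets library instead (an assumption on my part: datasets is not listed in requirements.txt, and the tokenizer below is the one loaded above):

    from datasets import load_dataset

    # GPT-J-style tokenizers ship without a pad token; reuse EOS so the collator can pad
    tokenizer.pad_token = tokenizer.eos_token

    # Load training_data.txt as a line-based text dataset
    raw = load_dataset("text", data_files={"train": "training_data.txt"})

    def tokenize(batch):
        # Tokenize each line, capping at the same block size used above
        return tokenizer(batch["text"], truncation=True, max_length=128)

    # The tokenized split can then be passed to Trainer as train_dataset,
    # with DataCollatorForLanguageModeling(mlm=False) handling padding and labels
    tokenized = raw["train"].map(tokenize, batched=True, remove_columns=["text"])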
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ streamlit==1.20.0
+ pandas==1.5.3
+ transformers==4.27.1
+ torch==2.0.0
run_all.py ADDED
@@ -0,0 +1,10 @@
+ import os
+
+ # Execute train.py to build training_data.txt from data.csv
+ os.system("python train.py")
+
+ # Execute finetune.py to fine-tune the model on training_data.txt
+ os.system("python finetune.py")
+
+ # Run the Streamlit app using app.py
+ os.system("streamlit run app.py")
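os.system runs each step in sequence but ignores non-zero exit codes, so a failed train.py or finetune.py would not stop the Streamlit app from launching. A small alternative sketch (not part of this commit) that aborts on the first failure:

    import subprocess

    # check=True raises CalledProcessError if a step exits non-zero,
    # so later steps never run against stale or missing outputs
    subprocess.run(["python", "train.py"], check=True)
    subprocess.run(["python", "finetune.py"], check=True)
    subprocess.run(["streamlit", "run", "app.py"], check=True)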
train.py ADDED
@@ -0,0 +1,15 @@
+ import pandas as pd
+
+ def preprocess_data(file_path):
+     # Flatten each row of the bank-statement CSV into a plain-text record
+     df = pd.read_csv(file_path)
+     text_data = []
+
+     for index, row in df.iterrows():
+         text_data.append(f"Date: {row['Date']}\nDescription: {row['Description']}\nCheck Number: {row['Check Number']}\nAmount: {row['Amount']}\nBalance: {row['Balance']}\n\n")
+
+     with open("training_data.txt", "w") as f:
+         f.writelines(text_data)
+
+ if __name__ == "__main__":
+     preprocess_data("data.csv")
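Each CSV row becomes one plain-text record in training_data.txt, following the f-string above and ending with a blank line before the next record. For a hypothetical row (the values below are illustrative only, not taken from data.csv), a record would look like:

    Date: 2023-01-05
    Description: GROCERY STORE PURCHASE
    Check Number: 1042
    Amount: -54.20
    Balance: 1200.35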
training_data.txt ADDED
The diff for this file is too large to render. See raw diff