Reyad-Ahmmed committed on
Commit
6577dc7
·
verified ·
1 Parent(s): 931c772

Upload handler.py

Browse files
Files changed (1) hide show
  1. handler.py +200 -0
handler.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Usage examples (mode 1 = train, then serve inference):
#   python getvars-generic.py 1 --train_data seq_train_truck_distance.txt --model_output ./json_extraction_truck_distance
#   python getvars-generic.py 1 --train_data seq_train_data_add_point.txt --model_output ./json_extraction_add_point
#   python getvars-generic.py 1 --train_data seq_train_data_add_point_with_alerts.txt --model_output ./json_extraction_add_point_with_alerts
#   python getvars-generic.py 1 --train_data seq_train_point_activity.txt --model_output ./json_extraction_point_activity

import torch
import argparse
import json
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import Dataset
from transformers import TrainingArguments, Trainer
import random
from huggingface_hub import HfApi, login, upload_folder, create_repo
import os
import gradio as gr

from transformers import BitsAndBytesConfig

# Runtime configuration comes from config.json rather than argparse
# (the argparse path is retired -- see __main__ below).
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

num_args = len(config)  # NOTE(review): not used elsewhere in this file -- confirm before removing

# BUG FIX: config values may be stored as strings (the default here was '1'),
# but the __main__ dispatch compares mode against the integers 1 and 2, so the
# script always fell through to "Invalid mode". Coerce to int once, up front.
mode = int(config.get('mode', '1'))
train_data_path = config.get('train_data', 'seq_train_point_activity.txt')
model_output_path = config.get('model_output', 'json_extraction_point_activity')

print(f"current mode: {mode}")
print(f"train data path: {train_data_path}")
print(f"model output path: {model_output_path}")
def train_model(train_data_path, model_output_path):
    """Fine-tune t5-large on input->JSON example pairs and push the result to the Hub.

    Args:
        train_data_path: Path to a JSON file holding a list of
            {"input": ..., "output": ...} examples.
        model_output_path: Local directory the fine-tuned model/tokenizer are
            saved to; also used as the subfolder name inside the Hub repo.

    Raises:
        ValueError: If the `hf_token` environment variable is not set.
    """
    # Read JSON training examples from the provided file
    with open(train_data_path, "r", encoding="utf-8") as file:
        train_data = json.load(file)

    # Shuffle so the train/eval split below is random
    random.shuffle(train_data)

    # Define split ratio (95% train, 5% eval)
    split_ratio = 0.95
    split_index = int(len(train_data) * split_ratio)

    # Split into training and evaluation sets
    train_set = train_data[:split_index]
    eval_set = train_data[split_index:]

    # Create Hugging Face Dataset objects
    train_dataset = Dataset.from_dict({
        "input_text": [x["input"] for x in train_set],
        "target_text": [x["output"] for x in train_set]
    })
    eval_dataset = Dataset.from_dict({
        "input_text": [x["input"] for x in eval_set],
        "target_text": [x["output"] for x in eval_set]
    })

    tokenizer = T5Tokenizer.from_pretrained("t5-large")
    model = T5ForConditionalGeneration.from_pretrained("t5-large")

    def tokenize_function(examples):
        # Tokenize inputs and targets to a fixed 128-token length;
        # target token ids become the seq2seq labels.
        model_inputs = tokenizer(examples["input_text"], padding="max_length", truncation=True, max_length=128)
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(examples["target_text"], padding="max_length", truncation=True, max_length=128)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    train_dataset = train_dataset.map(tokenize_function, batched=True)
    eval_dataset = eval_dataset.map(tokenize_function, batched=True)

    training_args = TrainingArguments(
        output_dir=model_output_path,
        logging_dir="./logs",
        logging_steps=10,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=35,
        weight_decay=0.01
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )

    print("Starting training...")
    trainer.train()

    model.save_pretrained(model_output_path)
    tokenizer.save_pretrained(model_output_path)

    # Push the trained artifacts to the Hub
    repo_name = "Reyad-Ahmmed/hf-data-timeframe"

    api_token = os.getenv("hf_token")  # Retrieve the API token from environment variable

    if not api_token:
        # BUG FIX: the message previously told the user to set HF_API_TOKEN,
        # but the code actually reads the `hf_token` environment variable.
        raise ValueError("API token not found. Please set the hf_token environment variable.")

    # Create repository (if not already created)
    api = HfApi()
    create_repo(repo_id=repo_name, token=api_token, exist_ok=True)

    # Model and tokenizer were both saved into model_output_path above, so a
    # single upload pushes both. (The original code called upload_folder twice
    # with identical arguments, producing a redundant second commit.)
    upload_folder(
        folder_path=f"{model_output_path}",
        path_in_repo=f"{model_output_path}",
        repo_id=repo_name,
        token=api_token,
        commit_message="Push getvar generic t5 model",
    )

    print(f"Model training complete and saved to {model_output_path}.")
def inference_loop(model_output_path):
    """Load the fine-tuned model from the Hub (4-bit quantized) and serve a Gradio UI.

    Args:
        model_output_path: Subfolder inside the Hub repo that holds the
            fine-tuned model and tokenizer for this task.
    """
    model_name = "Reyad-Ahmmed/hf-data-timeframe"

    tokenizer = T5Tokenizer.from_pretrained(model_name, subfolder=model_output_path)

    # Enable 4-bit quantization to cut inference memory usage
    quantization_config = BitsAndBytesConfig(
        llm_int8_threshold=8.0,
        load_in_4bit=True
    )

    # Load the quantized model directly. (The original code first loaded a
    # full-precision t5-large that was immediately overwritten by this call --
    # a large, wasted allocation.)
    model = T5ForConditionalGeneration.from_pretrained(
        model_name, subfolder=model_output_path,
        quantization_config=quantization_config,
        device_map="auto"  # automatically uses GPU if available
    )

    def generate_json(input_text):
        """Generate model output for a prompt; pretty-print it when it is valid JSON."""
        input_text = "Extract structured JSON for: " + input_text
        input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)

        # NOTE(review): temperature has no effect without do_sample=True, so
        # decoding is greedy here -- confirm whether sampling was intended.
        output_ids = model.generate(input_ids, max_length=100, temperature=0.3)
        json_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        try:
            return json.dumps(json.loads(json_output), indent=2)  # Pretty-print JSON
        except json.JSONDecodeError:
            return json_output  # Return as string if not valid JSON

    # Launch Gradio Interface
    iface = gr.Interface(fn=generate_json, inputs="text", outputs="text", title="JSON Extractor")
    iface.launch(share=True)
if __name__ == "__main__":
    # Mode and paths come from config.json, read at module import above
    # (the old argparse path is retired). BUG FIX: the original compared the
    # config value -- which may be a string like '1' -- against the integers
    # 1 and 2, so every run printed "Invalid mode". Normalizing through
    # str() works whether config.json stores the mode as an int or a string.
    if str(mode) == "1":
        if not train_data_path:
            print("Training mode requires --train_data argument.")
        else:
            train_model(train_data_path, model_output_path)
            inference_loop(model_output_path)
    elif str(mode) == "2":
        inference_loop(model_output_path)
    else:
        print("Invalid mode. Use 1 for training and 2 for inference.")