Sidharthkr committed on
Commit
781f7f0
·
verified ·
1 Parent(s): 8530ec1

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +45 -189
README.md CHANGED
@@ -16,6 +16,16 @@ pipeline_tag: summarization
16
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
  should probably proofread and complete it, then remove this comment. -->
18
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  ## Model description
21
 
@@ -23,210 +33,56 @@ More information needed
23
 
24
  ## Intended uses & limitations
25
 
26
- Runs on a consumer-grade GPU
27
 
28
- ## GPU
29
- Tesla M60 16GB VRAM
30
 
 
31
 
 
32
 
33
  ### Training hyperparameters
34
 
35
  The following hyperparameters were used during training:
36
- per_device_train_batch_size: int = 2
37
- per_device_eval_batch_size: int = 2
38
- gradient_accumulation_steps: int = 16
39
- learning_rate: float = 2e-4
40
- weight_decay: float = 0.01
41
- warmup_ratio: float = 0.03
42
- logging_steps: int = 1
43
- save_steps: int = 200
44
- eval_steps: int = 200
45
- max_seq_length: int = 1024
46
- num_train_epochs: int = 1
47
- max_grad_norm: float = 0.3
48
- num_epochs: 5
49
 
50
  ### Training results
51
 
52
- Step Training Loss Validation Loss Entropy Num Tokens Mean Token Accuracy
53
- 50 1.399500 1.379406 1.427977 166273.000000 0.684534
54
- 100 1.350000 1.272701 1.238351 331643.000000 0.698206
55
- 150 1.391500 1.252361 1.240065 497339.000000 0.701490
56
- 200 1.175000 1.243332 1.248332 664364.000000 0.701699
57
- 250 1.357100 1.235908 1.209817 830792.000000 0.703880
58
- 300 1.341700 1.226673 1.196961 995955.000000 0.705412
59
- 350 1.211000 1.223105 1.219540 1161755.000000 0.705137
60
- 400 1.414100 1.219148 1.218188 1330892.000000 0.706035
61
- 450 1.088200 1.214209 1.244467 1494009.000000 0.707179
62
- 500 1.302800 1.210984 1.203838 1659876.000000 0.707986
63
- 550 1.192800 1.208378 1.201593 1828355.000000 0.708459
64
- 600 1.302300 1.206382 1.212914 1989352.000000 0.708516
65
- 650 1.177800 1.205050 1.245975 2155580.000000 0.708198
66
- 700 1.156600 1.201754 1.201212 2323534.000000 0.709032
67
- 750 1.271000 1.201216 1.218800 2488415.000000 0.708988
68
- 800 1.264100 1.198175 1.182730 2655756.000000 0.710219
69
- 850 1.324600 1.196617 1.189218 2822068.000000 0.710231
70
- 900 1.159400 1.198235 1.207774 2988438.000000 0.708831
71
- 950 1.294200 1.194295 1.211270 3153113.000000 0.709955
72
- 1000 1.370000 1.192295 1.215226 3321550.000000 0.710322
73
- 1050 1.157300 1.190316 1.214881 3485313.000000 0.710768
74
- 1100 1.124000 1.189019 1.210650 3651712.000000 0.711739
75
- 1150 1.139700 1.188874 1.209716 3815535.000000 0.711151
76
- 1200 1.293600 1.187840 1.198137 3980373.000000 0.710808
77
- 1250 1.199800 1.186739 1.226214 4146077.000000 0.711442
78
- ...
79
- XXXX 7700 steps XXXX
80
 
81
  ### How to use
82
 
83
  Here is how to use this model with the [pipeline API](https://huggingface.co/transformers/main_classes/pipelines.html):
84
 
85
  ```python
86
- import torch
87
- from transformers import AutoTokenizer, AutoModelForCausalLM
88
- from peft import PeftModel
89
- import time
90
- import os
91
-
92
- BASE_MODEL = "Qwen/Qwen3-0.6B" # change this for Qwen / Phi / MPT
93
- ADAPTER_PATH = "Sidharthkr/Qwen3_0.6B_alpaca_lora" # your LoRA output_dir
94
- DEVICE_MAP = "auto" # or "cuda" if single-GPU
95
- DTYPE = torch.float32 # Tesla M60 has no bf16 support; fp32 used here (fp16 also possible)
96
-
97
- def create_alpaca_prompt(instruction: str, inp: str = "") -> str:
98
- """Format prompt in Alpaca style."""
99
- if inp.strip():
100
- prompt = (
101
- "Below is an instruction that describes a task, paired with an input that provides further context. "
102
- "Write a response that appropriately completes the request.\n\n"
103
- f"### Instruction:\n{instruction.strip()}\n\n"
104
- f"### Input:\n{inp.strip()}\n\n"
105
- "### Response:\n"
106
- )
107
- else:
108
- prompt = (
109
- "Below is an instruction that describes a task. "
110
- "Write a response that appropriately completes the request.\n\n"
111
- f"### Instruction:\n{instruction.strip()}\n\n"
112
- "### Response:\n"
113
- )
114
- return prompt
115
-
116
-
117
- def load_model_and_tokenizer():
118
- print(f"Loading base model: {BASE_MODEL}")
119
- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
120
- if tokenizer.pad_token is None:
121
- tokenizer.pad_token = tokenizer.eos_token
122
- tokenizer.padding_side = "left"
123
- base_model = AutoModelForCausalLM.from_pretrained(
124
- BASE_MODEL,
125
- torch_dtype=DTYPE,
126
- device_map=DEVICE_MAP,
127
- )
128
- print(f"Loading LoRA adapter from {ADAPTER_PATH}")
129
- model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
130
- print("Merging LoRA weights into base model for speed...")
131
- model = model.merge_and_unload()
132
- model.eval()
133
- # For safety with older GPUs
134
- torch.backends.cuda.matmul.allow_tf32 = False
135
- #torch.backends.cudnn.allow_tf32 = False
136
- return model, tokenizer
137
-
138
-
139
- @torch.no_grad()
140
- def generate_single(
141
- model,
142
- tokenizer,
143
- instruction: str,
144
- inp: str = "",
145
- max_new_tokens: int = 256,
146
- temperature: float = 0.7,
147
- top_p: float = 0.9,
148
- ):
149
- prompt = create_alpaca_prompt(instruction, inp)
150
- inputs = tokenizer(
151
- prompt,
152
- return_tensors="pt",
153
- ).to(model.device)
154
- output_ids = model.generate(
155
- **inputs,
156
- max_new_tokens=max_new_tokens,
157
- do_sample=False, # ✅ no sampling → no multinomial
158
- temperature=None, # ignored when do_sample=False
159
- top_p=None,
160
- pad_token_id=tokenizer.eos_token_id,
161
- use_cache=True,
162
- )
163
-
164
- full_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
165
- # Strip the prompt part to keep only the response
166
- if "### Response:" in full_text:
167
- response = full_text.split("### Response:")[-1].strip()
168
- else:
169
- response = full_text.strip()
170
- return response
171
-
172
- @torch.no_grad()
173
- def generate_batch(
174
- model,
175
- tokenizer,
176
- instructions,
177
- inputs=None,
178
- max_new_tokens: int = 256,
179
- temperature: float = 0.7,
180
- top_p: float = 0.9,
181
- ):
182
- if inputs is None:
183
- inputs = [""] * len(instructions)
184
- prompts = [
185
- create_alpaca_prompt(inst, inp)
186
- for inst, inp in zip(instructions, inputs)
187
- ]
188
- tokenized = tokenizer(
189
- prompts,
190
- return_tensors="pt",
191
- #padding=True,
192
- #truncation=True,
193
- ).to(model.device)
194
-
195
- output_ids = model.generate(
196
- **tokenized,
197
- max_new_tokens=max_new_tokens,
198
- do_sample=False, # ✅ no sampling → no multinomial
199
- temperature=None, # ignored when do_sample=False
200
- top_p=None,
201
- # do_sample=True,
202
- # temperature=temperature,
203
- # top_p=top_p,
204
- pad_token_id=tokenizer.eos_token_id,
205
- )
206
- outputs = []
207
- for i in range(len(prompts)):
208
- full_text = tokenizer.decode(output_ids[i], skip_special_tokens=True)
209
- if "### Response:" in full_text:
210
- response = full_text.split("### Response:")[-1].strip()
211
- else:
212
- response = full_text.strip()
213
- outputs.append(response)
214
- return outputs
215
-
216
- model, tokenizer = load_model_and_tokenizer()
217
-
218
- t1 = time.time() # ⏱ start
219
- # ---------- Example: single prediction ----------
220
- instruction = "Explain what a GPU is to a 15 year old."
221
- inp = ""
222
- response = generate_single(model, tokenizer, instruction, inp, max_new_tokens=512)
223
- t2 = time.time()
224
- print(f"Total time: {t2 - t1:.2f} seconds")
225
- print("=== Single prediction ===")
226
- print(response)
227
-
228
- >>> Total time: 4.42 seconds
229
- === Single prediction ===
230
- A GPU (Graphics Processing Unit) is a type of computer processor used to generate images and videos. It is used in computers and other devices to create visual content, such as games and movies. It is much faster than a CPU (Central Processing Unit) and can process more data in less time.
231
  ```
232
 
 
 
 
 
 
 
 
16
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
  should probably proofread and complete it, then remove this comment. -->
18
 
19
+ # InstructTweetSummarizer
20
+
21
+ This model is a fine-tuned version of [facebook/bart-large-cnn](https://huggingface.co/facebook/bart-large-cnn) on an unknown dataset.
22
+ It achieves the following results on the evaluation set:
23
+ - Loss: 0.3548
24
+ - Rouge1: 47.5134
25
+ - Rouge2: 24.7121
26
+ - Rougel: 35.7366
27
+ - Rougelsum: 35.6499
28
+ - Gen Len: 111.96
29
 
30
  ## Model description
31
 
 
33
 
34
  ## Intended uses & limitations
35
 
36
+ More information needed
37
 
38
+ ## Training and evaluation data
 
39
 
40
+ More information needed
41
 
42
+ ## Training procedure
43
 
44
  ### Training hyperparameters
45
 
46
  The following hyperparameters were used during training:
47
+ - learning_rate: 2e-05
48
+ - train_batch_size: 6
49
+ - eval_batch_size: 4
50
+ - seed: 42
51
+ - gradient_accumulation_steps: 2
52
+ - total_train_batch_size: 12
53
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
54
+ - lr_scheduler_type: linear
55
+ - num_epochs: 3
 
 
 
 
56
 
57
  ### Training results
58
 
59
+ | Training Loss | Epoch | Step | Validation Loss | Rouge1 | Rouge2 | Rougel | Rougelsum | Gen Len |
60
+ |:-------------:|:-----:|:----:|:---------------:|:-------:|:-------:|:-------:|:---------:|:-------:|
61
+ | No log | 1.0 | 417 | 0.3468 | 44.9326 | 22.3736 | 33.008 | 32.9247 | 116.43 |
62
+ | 0.5244 | 2.0 | 834 | 0.3440 | 46.9139 | 24.683 | 35.3699 | 35.333 | 119.65 |
63
+ | 0.2061 | 3.0 | 1251 | 0.3548 | 47.5134 | 24.7121 | 35.7366 | 35.6499 | 111.96 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
  ### How to use
66
 
67
  Here is how to use this model with the [pipeline API](https://huggingface.co/transformers/main_classes/pipelines.html):
68
 
69
  ```python
70
+ from transformers import pipeline
71
+ summarizer = pipeline("summarization", model="Sidharthkr/InstructTweetSummarizer")
72
+ def summarymaker(instruction = "", tweets = ""):
73
+ ARTICLE = f"""[INST] {instruction} [/INST] \\n [TWEETS] {tweets} [/TWEETS]"""
74
+ out = summarizer(ARTICLE, max_length=130, min_length=10, do_sample=False)
75
+ out = out[0]['summary_text'].split("[SUMMARY]")[-1].split("[/")[0].split("[via")[0].strip()
76
+ return out
77
+
78
+ summarymaker(instruction = "Summarize the tweets for Stellantis in 100 words",
79
+ tweets = """Stellantis - arch critic of Chinese EVs coming to Europe - is in talks with CATL to build a European plant. \n\nIt has concluded that cutting the price of EVs by using Chinese LFP batteries is more important.\n\n@FT story: \nhttps://t.co/l7nGggRFxH. State-of-the-art North America Battery Technology Centre begins to take shape at Stellantis' Automotive Research and Development Centre (ARDC) in Windsor, Ontario.\n\nhttps://t.co/04RO7CL1O5. RT @UAW: 🧵After the historic Stand Up Strike, UAW members at Ford, General Motors and Stellantis have voted to ratify their new contracts,…. RT @atorsoli: Stellantis and CATL are set to supply lower-cost EV batteries together for Europe, signaling automaker's efforts to tighten t…. RT @atorsoli: Stellantis and CATL are set to supply lower-cost EV batteries together for Europe, signaling automaker's efforts to tighten""")
80
+ >>> 'Stellantis is in talks with CATL to build a European plant, with a focus on cutting the price of EVs by using Chinese LFP batteries. The company is also developing a state-of-the-art North America Battery Technology Centre in Windsor, Ontario, and has ratified its new contracts with the UAW.'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  ```
82
 
83
+ ### Framework versions
84
+
85
+ - Transformers 4.34.1
86
+ - Pytorch 2.1.0
87
+ - Datasets 2.14.7
88
+ - Tokenizers 0.14.1