Spaces:
Running
Running
| from transformers import BertTokenizer, BertForSequenceClassification, AdamW | |
| import torch | |
| # Load a pre-trained BERT model and tokenizer | |
| tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") | |
| model = BertForSequenceClassification.from_pretrained("bert-base-uncased") | |
| # Prepare your dataset (replace with your dataset loading code) | |
| train_texts = ["Text of issue 1", "Text of issue 2", ...] | |
| labels = [0, 1, ...] # 0 for non-relevant, 1 for relevant | |
| # Tokenize and convert your dataset to tensors | |
| input_ids = tokenizer(train_texts, truncation=True, padding=True, return_tensors="pt") | |
| labels = torch.tensor(labels) | |
| # Set up data loaders | |
| dataset = torch.utils.data.TensorDataset(input_ids["input_ids"], input_ids["attention_mask"], labels) | |
| train_loader = torch.utils.data.DataLoader(dataset, batch_size=32) | |
| # Define optimizer and loss | |
| optimizer = AdamW(model.parameters(), lr=1e-5) | |
| loss_fn = torch.nn.CrossEntropyLoss() | |
| # Fine-tune the model | |
| model.train() | |
| for epoch in range(3): # Replace with desired number of epochs | |
| for batch in train_loader: | |
| input_ids, attention_mask, labels = batch | |
| optimizer.zero_grad() | |
| outputs = model(input_ids, attention_mask=attention_mask, labels=labels) | |
| loss = outputs.loss | |
| loss.backward() | |
| optimizer.step() | |
| # Save the fine-tuned model | |
| model.save_pretrained("/path/to/save/model") | |
| # You can now use this model for semantic search. | |