import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM
from transformers import PreTrainedModel, PretrainedConfig

class PatentClassifierConfig(PretrainedConfig):
    model_type = "patent_classifier"

    def __init__(self,
                 model_name="Qwen/Qwen3-0.6B",
                 hidden_dims=[512, 256],
                 output_dim=9,
                 dropout_rate=0.1,
                 max_length=256,
                 **kwargs):
        super().__init__(**kwargs)
        self.model_name = model_name
        self.hidden_dims = hidden_dims
        self.output_dim = output_dim
        self.dropout_rate = dropout_rate
        self.max_length = max_length
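

# A minimal sketch of how the config is meant to be used (standard
# PretrainedConfig behavior; shown commented out so importing this module has
# no side effects, and the directory name is only illustrative): the
# hyperparameters round-trip through save/load, so they travel with a checkpoint.
#
#   config = PatentClassifierConfig(hidden_dims=[768, 256], output_dim=9)
#   config.save_pretrained("patent_classifier_ckpt")   # writes config.json
#   config = PatentClassifierConfig.from_pretrained("patent_classifier_ckpt")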


class PatentClassifier(PreTrainedModel):
    config_class = PatentClassifierConfig

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        # Qwen checkpoints ship as causal LMs; other backbones load as plain encoders.
        if "qwen" in config.model_name.lower():
            self.base_llm_model = AutoModelForCausalLM.from_pretrained(
                config.model_name,
                trust_remote_code=True
            )
        else:
            self.base_llm_model = AutoModel.from_pretrained(config.model_name)

        # Freeze the backbone so only the classification head is trained.
        for param in self.base_llm_model.parameters():
            param.requires_grad = False

        # MLP classification head: hidden_size -> hidden_dims[...] -> output_dim.
        self.hidden_size = self.base_llm_model.config.hidden_size
        layers = []
        input_dim = self.hidden_size
        for dim in config.hidden_dims:
            layers.append(nn.Linear(input_dim, dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(config.dropout_rate))
            input_dim = dim
        layers.append(nn.Linear(input_dim, config.output_dim))
        self.classifier = nn.Sequential(*layers)

        self.tokenizer = AutoTokenizer.from_pretrained(config.model_name)

    def forward(self, input_ids, attention_mask):
        # Run the frozen backbone without tracking gradients.
        with torch.no_grad():
            outputs = self.base_llm_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                output_hidden_states=True
            )
        last_hidden_state = outputs.hidden_states[-1]
        # Mean-pool over non-padding tokens; the mask is cast to the hidden
        # state's dtype so the clamp and division stay in floating point.
        mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
        masked_hidden = last_hidden_state * mask
        pooled_embedding = masked_hidden.sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
        return self.classifier(pooled_embedding)

    def tokenize(self, texts, max_length=None):
        max_length = max_length or self.config.max_length
        return self.tokenizer(
            texts,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
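

# Usage sketch, not part of the original listing: the example texts are
# hypothetical and the "Qwen/Qwen3-0.6B" checkpoint must be available locally
# or downloadable. It shows the intended flow: build the config, instantiate
# the model (frozen backbone + trainable head), tokenize, and read off logits.
if __name__ == "__main__":
    config = PatentClassifierConfig()
    model = PatentClassifier(config)
    model.eval()

    texts = [
        "A battery electrode comprising a lithium-rich layered oxide.",
        "A method for scheduling packets in a wireless mesh network.",
    ]
    batch = model.tokenize(texts)
    with torch.no_grad():
        logits = model(batch["input_ids"], batch["attention_mask"])
    print(logits.argmax(dim=-1))  # predicted class indices in [0, output_dim)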