# arxiv_category_classifier / model_utils.py
# Author: saitsharipov — commit b1863d7 (verified): "Update model_utils.py"
import re
import json
import torch
import unicodedata
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from huggingface_hub import hf_hub_download
def load_model_and_tokenizer(model_path: str):
    """Load a fine-tuned sequence-classification model, its tokenizer, and label maps.

    Also downloads ``id2label.json`` from the same Hub repo so class indices
    can be translated to human-readable category labels.

    Args:
        model_path: Hugging Face Hub repo id (or local path) of the model.

    Returns:
        Tuple ``(model, tokenizer, id2label, label2id)`` where ``id2label``
        maps string class ids to label names (as loaded from JSON) and
        ``label2id`` maps label names back to integer class ids.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    # Fix: dropped the redundant function-local re-import of hf_hub_download;
    # it is already imported at module level.
    id2label_path = hf_hub_download(repo_id=model_path, filename="id2label.json")
    with open(id2label_path, encoding="utf-8") as f:
        id2label = json.load(f)
    # JSON object keys are always strings; cast back to int for label2id.
    label2id = {v: int(k) for k, v in id2label.items()}
    return model, tokenizer, id2label, label2id
def preprocess_text(text: str) -> str:
    """Normalize raw paper text for the classifier.

    Pipeline: NFKC-normalize, replace URLs with the token ``url`` and inline
    ``$...$`` math with ``math``, drop characters outside an allow-list,
    collapse whitespace, lowercase, and truncate to 1500 characters.

    Args:
        text: Arbitrary input; non-string values yield ``''``.

    Returns:
        The cleaned, lowercased string (at most 1500 characters).
    """
    if not isinstance(text, str):
        return ''
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r'https?://\S+|www\.\S+', 'url', text)
    text = re.sub(r'\$.*?\$', 'math', text)
    # Fix: the original class contained an unescaped '-' in '+-=', which the
    # regex engine read as the accidental range U+002B..U+003D rather than the
    # three literal characters. Escaping '-' makes '+', '-', '=' literal.
    # (Behavior-neutral here: every char in that range was already allowed
    # via \w or the explicit punctuation list.)
    text = re.sub(r'[^\w\s.,;:!?()\[\]{}\\/\'\"+\-=*&^%$#@<>|~`]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.lower()
    # Truncate to keep inputs bounded before tokenization.
    return text[:1500]
def preprocess_for_inference(title: str, abstract: str) -> str:
    """Combine a paper's title and abstract into one cleaned string.

    Missing values (NaN/None) are treated as empty strings, the two fields
    are joined into the "Title: ... Abstract: ..." template, and the result
    is run through ``preprocess_text``.
    """
    safe_title = str(title) if not pd.isna(title) else ''
    safe_abstract = str(abstract) if not pd.isna(abstract) else ''
    combined = f"Title: {safe_title} Abstract: {safe_abstract}"
    return preprocess_text(combined)
def predict_category(title: str, abstract: str, tokenizer, model, id2label):
    """Predict the arXiv category label for a title/abstract pair.

    Args:
        title: Paper title (may be NaN/None).
        abstract: Paper abstract (may be NaN/None).
        tokenizer: Tokenizer matching the classification model.
        model: Sequence-classification model producing ``logits``.
        id2label: Mapping of string class ids to label names.

    Returns:
        The label name of the highest-scoring class.
    """
    text = preprocess_for_inference(title, abstract)
    inputs = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    # Move every input tensor onto the same device as the model.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    class_id = outputs.logits.argmax(dim=1).item()
    # id2label was loaded from JSON, so its keys are strings.
    return id2label[str(class_id)]