Spaces:

Yuichiroh
/

ACL2Vec

Sleeping

ACL2Vec / _arxiv.py

4kasha

initial commit

bcdda34 about 3 years ago

1.1 kB

	from __future__ import annotations

	import logging
	from typing import Optional
	import string
	import nltk
	import arxiv

	# Module-level logger, named after this module via __name__.
	logger = logging.getLogger(__name__)

	def extract_title_abst(arxiv_id: str) -> Optional[str]:
	    """Fetch an arXiv paper and return its title and abstract as one string.

	    The lookup is best-effort: if anything goes wrong while querying arXiv
	    or reading the result (including the search yielding no entry, which
	    makes ``next`` raise ``StopIteration``), the failure is logged and
	    ``None`` is returned instead of raising.

	    Args:
	        arxiv_id: Identifier passed to ``arxiv.Search`` via ``id_list``.

	    Returns:
	        ``"<title> <summary>"`` of the first search result, or ``None``
	        when the paper could not be retrieved.
	    """
	    try:
	        paper = next(arxiv.Search(id_list=[arxiv_id]).results())
	        # Concatenation stays inside the try so that an unexpected
	        # non-string title/summary also degrades to None.
	        return paper.title + ' ' + paper.summary
	    except Exception:
	        # ``Exception`` rather than a bare ``except`` so that
	        # KeyboardInterrupt / SystemExit still propagate to the caller.
	        logger.warning(
	            "Could not fetch title/abstract for arXiv id %r",
	            arxiv_id,
	            exc_info=True,
	        )
	        return None

	def doc_to_ids(
	    doc: Optional[str],
	    word_to_id_: dict[str, int],
	    stemming: bool,
	    lower: bool = True,
	) -> list[int]:
	    """Convert a document into the vocabulary ids of the words it contains.

	    The text is optionally lower-cased, stripped of ASCII punctuation,
	    tokenized with ``nltk.word_tokenize`` and optionally Porter-stemmed.
	    Each resulting word that is a key of ``word_to_id_`` is mapped to its
	    id; words missing from the vocabulary are skipped.

	    Args:
	        doc: Text to convert. ``None`` or an empty string yields ``[]``.
	        word_to_id_: Mapping from (preprocessed) word to its id.
	        stemming: If true, apply ``PorterStemmer`` to every token.
	        lower: If true (default), lower-case the text before tokenizing.

	    Returns:
	        Ids in token order. Repeated words produce repeated ids. The list
	        is empty when there is no document or no token is in the vocabulary.
	    """
	    # Missing or empty document: nothing to look up.
	    if not doc:
	        return []

	    if lower:
	        doc = doc.lower()

	    # Delete every character of string.punctuation in a single pass.
	    # Characters are removed, not replaced by a space, so "a-b" becomes "ab".
	    doc = doc.translate(str.maketrans("", "", string.punctuation))
	    words = nltk.word_tokenize(doc)

	    if stemming:
	        # Imported lazily: the stemmer is only needed on this branch.
	        from nltk.stem.porter import PorterStemmer

	        porter = PorterStemmer()
	        words = [porter.stem(word) for word in words]

	    # Out-of-vocabulary words are dropped, so the result may be empty
	    # (no matched results). Duplicates are deliberately kept.
	    return [word_to_id_[word] for word in words if word in word_to_id_]