# holcombzv's picture
# Updated scraping behavior.
# 838d7c7
from functions import *
class Paragraph:
    """A single paragraph of text belonging to an article.

    Attributes are plain data set by the constructor:
      text        -- the paragraph's text.
      article_id  -- identifier of the article this paragraph came from.
      index       -- position of the paragraph within that article.
    """

    def __init__(self, text: str, article_id: int, index: int):
        self.text = text
        self.article_id = article_id
        self.index = index

    def __str__(self) -> str:
        """Return the paragraph text itself."""
        return self.text

    def __repr__(self) -> str:
        """Return an unambiguous representation for logs and debugging."""
        return (
            f'{type(self).__name__}(text={self.text!r}, '
            f'article_id={self.article_id!r}, index={self.index!r})'
        )

    def evaluate_paragraph(self, model, tokenizer):
        """Score this paragraph's text.

        Delegates to ``evaluate_text(self.text, model, tokenizer)`` and
        returns its result unchanged.
        NOTE(review): the type/range of that result is defined by
        ``evaluate_text`` (not visible here) -- confirm against it.
        """
        return evaluate_text(self.text, model, tokenizer)
class Article:
    """An article split into ``Paragraph`` objects, with a word-weighted score.

    Attributes set by the constructor:
      article_id      -- identifier passed in by the caller.
      html            -- the raw HTML passed in by the caller.
      paragraphs      -- list of ``Paragraph`` objects, in document order
                         (empty when no text could be extracted).
      article_length  -- total word count over all paragraphs, where words
                         are counted by splitting on single spaces.
    """

    def __init__(self, article_id: int, html: str):
        self.article_id = article_id
        self.html = html

        # get_article_text may yield nothing; normalise that to an empty
        # list so the rest of the constructor can treat both cases alike.
        raw_paragraphs = get_article_text(self.html) or []
        logger.info(f'\nParagraphs read: {len(raw_paragraphs)}')

        # Build a fresh list instead of overwriting the helper's return
        # value element by element.
        self.paragraphs = [
            Paragraph(text, self.article_id, i)
            for i, text in enumerate(raw_paragraphs)
        ]
        self.article_length = sum(
            self._word_count(paragraph.text) for paragraph in self.paragraphs
        )

        # Only log the first/last paragraph when there is one; indexing an
        # empty collection would raise IndexError.
        if self.paragraphs:
            logger.info(f'First paragraph: {self.paragraphs[0].text}')
            logger.info(f'Last paragraph: {self.paragraphs[-1].text}')

    def __str__(self) -> str:
        """Return the article id as a string."""
        return str(self.article_id)

    @staticmethod
    def _word_count(text: str) -> int:
        """Count words by splitting on single spaces.

        Kept as ``split(' ')`` (not ``split()``) so the weights used in
        ``evaluate_article`` match the totals accumulated in ``__init__``.
        """
        return len(text.split(' '))

    def evaluate_article(self, model, tokenizer):
        """Return the word-count-weighted average of the paragraph scores.

        Each paragraph is scored with ``evaluate_paragraph(model, tokenizer)``
        and weighted by its word count; the sum is divided by
        ``article_length``. Returns ``0.0`` when ``article_length`` is zero.
        """
        # Guard before dividing: an article with no words has no average.
        if self.article_length == 0:
            return 0.0

        weighted_total = sum(
            paragraph.evaluate_paragraph(model, tokenizer)
            * self._word_count(paragraph.text)
            for paragraph in self.paragraphs
        )
        return weighted_total / self.article_length