File size: 1,524 Bytes
4e20c0f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
609d9fc
8306a27
4e20c0f
88ba6a4
cf4c2dc
88ba6a4
cf4c2dc
4e20c0f
8306a27
 
 
4e20c0f
 
 
 
 
 
 
 
 
 
838d7c7
cf4c2dc
838d7c7
cf4c2dc
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from functions import *

class Paragraph:
    """A single paragraph of text belonging to an article.

    Stores the paragraph text together with the id of the owning article
    and the paragraph's position within that article.
    """

    def __init__(self, text: str, article_id: int, index: int):
        """Record the paragraph text, its article id and its position."""
        self.text, self.article_id, self.index = text, article_id, index

    def __str__(self):
        """Return the paragraph text itself."""
        return self.text

    def evaluate_paragraph(self, model, tokenizer):
        """Score this paragraph's text with ``evaluate_text``.

        ``model`` and ``tokenizer`` are forwarded unchanged; whatever
        ``evaluate_text`` returns is returned to the caller.
        """
        score = evaluate_text(self.text, model, tokenizer)
        return score
    
class Article:
    """An article parsed from HTML into an ordered list of ``Paragraph`` objects.

    Attributes set by ``__init__``:
        article_id: identifier supplied by the caller.
        html: the HTML string supplied by the caller.
        paragraphs: list of ``Paragraph`` objects, one per text chunk returned
            by ``get_article_text``; an empty list when nothing was extracted.
        article_length: total word count of the article, where each
            paragraph is counted by splitting its text on single spaces.
    """

    def __init__(self, article_id: int, html: str):
        """Extract the paragraphs from ``html`` and compute the word count.

        NOTE(review): assumes ``get_article_text`` returns a sequence of
        strings (or a falsy value when nothing is found) - confirm against
        its definition.
        """
        self.article_id = article_id
        self.html = html

        # Fall back to an empty list (not an empty string) so the attribute
        # always has the same type, even when no text was extracted.
        raw_paragraphs = get_article_text(self.html) or []
        logger.info(f'\nParagraphs read: {len(raw_paragraphs)}')

        # Build a fresh list rather than overwriting the returned sequence
        # in place while iterating over it.
        self.paragraphs = [
            Paragraph(text, self.article_id, i)
            for i, text in enumerate(raw_paragraphs)
        ]
        self.article_length = sum(
            len(paragraph.text.split(' ')) for paragraph in self.paragraphs
        )

        # Indexing [0] / [-1] on an empty article would raise IndexError.
        if self.paragraphs:
            logger.info(f'First paragraph: {self.paragraphs[0].text}')
            logger.info(f'Last paragraph: {self.paragraphs[-1].text}')

    def __str__(self):
        """Return the article id as a string."""
        return str(self.article_id)

    def evaluate_article(self, model, tokenizer):
        """Return the word-count-weighted average of the paragraph scores.

        Each paragraph is scored with ``Paragraph.evaluate_paragraph`` and
        weighted by its word count (text split on single spaces). The sum
        of weighted scores is divided by ``self.article_length``.

        Returns:
            ``0.0`` when ``self.article_length`` is zero; otherwise the
            weighted average.
        """
        # Guard before dividing: an empty article has no score to average.
        if self.article_length == 0:
            return 0.0

        weighted_total = sum(
            paragraph.evaluate_paragraph(model, tokenizer)
            * len(paragraph.text.split(' '))
            for paragraph in self.paragraphs
        )
        return weighted_total / self.article_length