khaiphan29 committed
Commit 257a3a4 · 1 Parent(s): c26c6e5

Delete https:

https://huggingface.co/spaces/khaiphan29/fact-checking-api/.DS_Store DELETED
Binary file (6.15 kB)
 
https://huggingface.co/spaces/khaiphan29/fact-checking-api/__init__.py DELETED
File without changes
https://huggingface.co/spaces/khaiphan29/fact-checking-api/crawler.py DELETED
@@ -1,256 +0,0 @@
- import requests
- from bs4 import BeautifulSoup
- import re
- import time
-
- from .utils import timer_func
-
- def remove_emoji(string):
-     emoji_pattern = re.compile("["
-         u"\U0001F300-\U0001FAD6" # emoticons
-         u"\U0001F300-\U0001F5FF" # symbols & pictographs
-         u"\U0001F680-\U0001F6FF" # transport & map symbols
-         u"\U0001F1E0-\U0001F1FF" # flags (iOS)
-         u"\U00002702-\U000027B0"
-         u"\U000024C2-\U0001F251"
-         "]+", flags=re.UNICODE)
-     return emoji_pattern.sub(r'', string)
-
- def preprocess(texts):
-     texts = [text.replace("_", " ") for text in texts]
-     texts = [i.lower() for i in texts]
-     texts = [remove_emoji(i) for i in texts]
-
-     texts = [re.sub('[^\w\d\s]', '', i) for i in texts]
-
-     texts = [re.sub('\s+|\n', ' ', i) for i in texts]
-     texts = [re.sub('^\s|\s$', '', i) for i in texts]
-
-     # texts = [ViTokenizer.tokenize(i) for i in texts]
-
-     return texts
-
-
- class MyCrawler:
-     headers = {
-         "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36",
-         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-         'Accept-Language': 'en-US,en;q=0.5',
-         'Accept-Encoding': 'gzip, deflate',
-         'DNT': '1',
-         'Connection': 'keep-alive',
-         'Upgrade-Insecure-Requests': '1'
-     }
-
-     # headers = {
-     #     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0',
-     #     # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-     #     # 'Accept-Language': 'en-US,en;q=0.5',
-     #     # 'Accept-Encoding': 'gzip, deflate',
-     #     # 'DNT': '1',
-     #     # 'Connection': 'keep-alive',
-     #     # 'Upgrade-Insecure-Requests': '1'
-     # }
-
-     def getSoup(self, url: str):
-         req = requests.get(url,headers=self.headers)
-         return BeautifulSoup(req.text, 'html.parser')
-
-     def crawl_byContainer(self, url: str, article_container: str, body_class: str):
-         soup = self.getSoup(url)
-
-         paragraphs = soup.find(article_container,{"class": body_class})
-         if paragraphs:
-             #Crawl all paragraphs
-             contents = []
-             numOfParagraphs = 0
-             for p in paragraphs.find_all("p"):
-                 contents.append(p.get_text())
-                 numOfParagraphs += 1
-                 # if numOfParagraphs > 10:
-                 #     break
-
-             if contents:
-                 result = "\n".join(contents)
-                 if (url.split("/")[2] == "vnexpress.net"):
-                     result = self.crawl_byElement(soup, "p", "description") + "\n" + result
-
-                 return result
-         return ""
-
-     def crawl_byElement(self, soup, element: str, ele_class: str):
-         print("by Elements...")
-
-         paragraph = soup.find(element,{"class": ele_class})
-         if paragraph:
-             print(paragraph.get_text())
-             return paragraph.get_text()
-         return ""
-
-     def crawl_webcontent(self, url: str):
-
-         provider = url.split("/")[2]
-         content = ""
-
-         if provider == "thanhnien.vn" or provider == "tuoitre.vn":
-             content = self.crawl_byContainer(url, "div", "afcbc-body")
-         elif provider == "vietnamnet.vn":
-             content = self.crawl_byContainer(url, "div", "maincontent")
-         elif provider == "vnexpress.net":
-             content = self.crawl_byContainer(url, "article", "fck_detail")
-         elif provider == "www.24h.com.vn":
-             content = self.crawl_byContainer(url, "article", "cate-24h-foot-arti-deta-info")
-         elif provider == "vov.vn":
-             content = self.crawl_byContainer(url, "div", "article-content")
-         elif provider == "vtv.vn":
-             content = self.crawl_byContainer(url, "div", "ta-justify")
-         elif provider == "vi.wikipedia.org":
-             content = self.crawl_byContainer(url, "div", "mw-content-ltr")
-         elif provider == "www.vinmec.com":
-             content = self.crawl_byContainer(url, "div", "block-content")
-
-         elif provider == "vietstock.vn":
-             content = self.crawl_byContainer(url, "div", "single_post_heading")
-         elif provider == "vneconomy.vn":
-             content = self.crawl_byContainer(url, "article", "detail-wrap")
-
-         elif provider == "dantri.com.vn":
-             content = self.crawl_byContainer(url, "article", "singular-container")
-
-         # elif provider == "plo.vn":
-         #     content = self.crawl_byContainer(url, "div", "article__body")
-
-         return provider, url, content
-
-     #def crawl_redir(url):
-
-     @timer_func
-     def search(self, claim: str, count: int = 1):
-         processed_claim = preprocess([claim])[0]
-
-         num_words = 100
-         ls_word = processed_claim.split(" ")
-         claim_short = " ".join(ls_word[:num_words])
-
-         print(claim_short)
-         query = claim_short
-         # query = '+'.join(claim_short.split(" "))
-
-         try:
-
-             # print(soup.prettify())
-
-             #get all URLs
-             attemp_time = 0
-             urls = []
-             while len(urls) == 0 and attemp_time < 3:
-                 req=requests.get("https://www.bing.com/search?", headers=self.headers, params={
-                     "q": query,
-                     "responseFilter":"-images",
-                     "responseFilter":"-videos"
-                 })
-                 print("Query URL: " + req.url)
-
-                 print("Crawling Attempt " + str(attemp_time))
-                 soup = BeautifulSoup(req.text, 'html.parser')
-
-                 completeData = soup.find_all("li",{"class":"b_algo"})
-                 for data in completeData:
-                     urls.append(data.find("a", href=True)["href"])
-                 attemp_time += 1
-                 time.sleep(1)
-
-             print("Got " + str(len(urls)) + " urls")
-
-             result = []
-
-             for url in urls:
-                 print("Crawling... " + url)
-                 provider, url, content = self.crawl_webcontent(url)
-
-                 if content:
-                     result.append({
-                         "provider": provider,
-                         "url": url,
-                         "content": content
-                     })
-                     count -= 1
-                     if count == 0:
-                         break
-
-             return result
-
-         except Exception as e:
-             print(e)
-             return []
-
-     @timer_func
-     def searchGoogle(self, claim: str, count: int = 1):
-         processed_claim = preprocess([claim])[0]
-
-         num_words = 100
-         ls_word = processed_claim.split(" ")
-         claim_short = " ".join(ls_word[:num_words])
-
-         print(claim_short)
-         query = claim_short
-         # query = '+'.join(claim_short.split(" "))
-
-         try:
-
-             # print(soup.prettify())
-
-             #get all URLs
-             attemp_time = 0
-             urls = []
-             while len(urls) == 0 and attemp_time < 3:
-                 req=requests.get("https://www.google.com/search?", headers=self.headers, params={
-                     "q": query
-                 })
-                 print("Query URL: " + req.url)
-
-                 print("Crawling Attempt " + str(attemp_time))
-                 soup = BeautifulSoup(req.text, 'html.parser')
-
-                 completeData = soup.find_all("a",{"jsname":"UWckNb"})
-                 for data in completeData:
-                     urls.append(data["href"])
-                 attemp_time += 1
-                 time.sleep(1)
-
-             print("Got " + str(len(urls)) + " urls")
-
-             result = []
-
-             for url in urls:
-                 print("Crawling... " + url)
-                 provider, url, content = self.crawl_webcontent(url)
-
-                 if content:
-                     result.append({
-                         "provider": provider,
-                         "url": url,
-                         "content": content
-                     })
-                     count -= 1
-                     if count == 0:
-                         break
-
-             return result
-
-         except Exception as e:
-             print(e)
-             return []
-
-     @timer_func
-     def scraping(self, url: str):
-         try:
-             provider, url, content = self.crawl_webcontent(url)
-
-             if content:
-                 return True
-             return False
-
-         except Exception as e:
-             print(e)
-             return False
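For context on the module removed above, a minimal, illustrative usage sketch of the deleted MyCrawler class (not part of the commit; it assumes the file is importable as src.crawler, that network access is available, and that the Bing/Google result markup and news-site HTML classes it targets have not changed):

# Hypothetical usage of the deleted crawler (the claim string is borrowed from nli_v3.py below).
from src.crawler import MyCrawler

crawler = MyCrawler()

# search() scrapes Bing results, searchGoogle() scrapes Google results; both return up to
# `count` dicts of the form {"provider": ..., "url": ..., "content": ...}.
evidences = crawler.searchGoogle("Filip Nguyễn đủ điều kiện dự Asian Cup 2024", count=1)
for ev in evidences:
    print(ev["provider"], ev["url"], ev["content"][:200])

# scraping() only reports whether a single URL yields extractable article text.
print(crawler.scraping("https://vnexpress.net/example-article.html"))  # hypothetical URL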
 
https://huggingface.co/spaces/khaiphan29/fact-checking-api/mDeBERTa (ft) V6/.DS_Store DELETED
Binary file (6.15 kB)
 
https://huggingface.co/spaces/khaiphan29/fact-checking-api/mDeBERTa (ft) V6/cls.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:f1c3c8eae44569fd01a746b220091611125f9eb04e09af2d60a6d80befcdb769
- size 11064
 
https://huggingface.co/spaces/khaiphan29/fact-checking-api/mDeBERTa (ft) V6/cls_log.txt DELETED
@@ -1,76 +0,0 @@
-
- Step 0 -- Accuracy: 0.3039772727272727 -- macro_f1: 0.20810584530698015 -- loss: 1.0453389883041382
-
- Step 100 -- Accuracy: 0.859375 -- macro_f1: 0.8598470398571504 -- loss: 0.11795929819345474
-
- Step 200 -- Accuracy: 0.8747159090909091 -- macro_f1: 0.8755251824421424 -- loss: 0.22730453312397003
-
- Step 300 -- Accuracy: 0.8536931818181818 -- macro_f1: 0.8533303214529117 -- loss: 0.18725647032260895
-
- Step 400 -- Accuracy: 0.8690340909090909 -- macro_f1: 0.8687299763460793 -- loss: 0.28860458731651306
-
- Step 500 -- Accuracy: 0.8798295454545455 -- macro_f1: 0.8802316356122608 -- loss: 0.6372634172439575
-
- Step 600 -- Accuracy: 0.8610795454545455 -- macro_f1: 0.8612099869711884 -- loss: 0.41530805826187134
-
- Step 700 -- Accuracy: 0.8491477272727272 -- macro_f1: 0.849751664990205 -- loss: 0.5970628261566162
-
- Step 800 -- Accuracy: 0.8764204545454546 -- macro_f1: 0.8766266441048876 -- loss: 0.2515469491481781
-
- Step 900 -- Accuracy: 0.8710227272727272 -- macro_f1: 0.8712350728851791 -- loss: 0.619756817817688
-
- Step 1000 -- Accuracy: 0.8744318181818181 -- macro_f1: 0.8746062203201398 -- loss: 0.5634986758232117
-
- Step 1100 -- Accuracy: 0.8735795454545454 -- macro_f1: 0.8735921715063891 -- loss: 0.2514641284942627
-
- Step 1200 -- Accuracy: 0.8375 -- macro_f1: 0.8368621880475362 -- loss: 0.44521981477737427
-
- Step 1300 -- Accuracy: 0.8551136363636364 -- macro_f1: 0.8555806721970362 -- loss: 0.048632219433784485
-
- Step 1400 -- Accuracy: 0.8508522727272727 -- macro_f1: 0.8506097642423027 -- loss: 0.24613773822784424
-
- Step 1500 -- Accuracy: 0.8673295454545454 -- macro_f1: 0.8671847303392856 -- loss: 0.1494443565607071
-
- Step 1600 -- Accuracy: 0.834375 -- macro_f1: 0.8342641066244109 -- loss: 0.17161081731319427
-
- Step 1700 -- Accuracy: 0.865625 -- macro_f1: 0.8651594643017528 -- loss: 0.154042050242424
-
- Step 1800 -- Accuracy: 0.865909090909091 -- macro_f1: 0.8657615265484808 -- loss: 0.1435176134109497
-
- Step 1900 -- Accuracy: 0.8176136363636364 -- macro_f1: 0.8171586288909666 -- loss: 0.09292535483837128
-
- Step 2000 -- Accuracy: 0.8440340909090909 -- macro_f1: 0.843042759250924 -- loss: 0.34320467710494995
-
- Step 2100 -- Accuracy: 0.8428977272727273 -- macro_f1: 0.8428498174495328 -- loss: 0.5764151811599731
-
- Step 2200 -- Accuracy: 0.8417613636363637 -- macro_f1: 0.8418818479059557 -- loss: 0.28757143020629883
-
- Step 2300 -- Accuracy: 0.840625 -- macro_f1: 0.8406394626850148 -- loss: 0.8960273861885071
-
- Step 2400 -- Accuracy: 0.8142045454545455 -- macro_f1: 0.8140964442024906 -- loss: 0.8550783395767212
-
- Step 2500 -- Accuracy: 0.8144886363636363 -- macro_f1: 0.8147455224461172 -- loss: 0.39625313878059387
-
- Step 2600 -- Accuracy: 0.8053977272727273 -- macro_f1: 0.8021211300036969 -- loss: 0.3774358034133911
-
- Step 2700 -- Accuracy: 0.8292613636363636 -- macro_f1: 0.8292382309283113 -- loss: 0.16644884645938873
-
- Step 2800 -- Accuracy: 0.8150568181818182 -- macro_f1: 0.814290740222007 -- loss: 0.237399160861969
-
- Step 2900 -- Accuracy: 0.8107954545454545 -- macro_f1: 0.8111709474507229 -- loss: 0.5621077418327332
-
- Step 3000 -- Accuracy: 0.7926136363636364 -- macro_f1: 0.7930916669737708 -- loss: 0.4253169298171997
-
- Step 3100 -- Accuracy: 0.8099431818181818 -- macro_f1: 0.8102288703246834 -- loss: 0.43165838718414307
-
- Step 3200 -- Accuracy: 0.772159090909091 -- macro_f1: 0.7717788019596861 -- loss: 0.673878014087677
-
- Step 3300 -- Accuracy: 0.7897727272727273 -- macro_f1: 0.7895567869064662 -- loss: 0.1990412026643753
-
- Step 3400 -- Accuracy: 0.8008522727272728 -- macro_f1: 0.7997998535844976 -- loss: 0.4523601531982422
-
- Step 3500 -- Accuracy: 0.7798295454545454 -- macro_f1: 0.7780260696858295 -- loss: 0.8848648071289062
-
- Step 3600 -- Accuracy: 0.7775568181818182 -- macro_f1: 0.7779453966289696 -- loss: 0.5041539669036865
-
- Step 3700 -- Accuracy: 0.709659090909091 -- macro_f1: 0.7069128111001839 -- loss: 0.6758942604064941
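The same deleted folder also held plot.png; a small, illustrative sketch of how log lines in the format above can be parsed back into (step, accuracy) pairs (not part of the commit; the file path is the one shown in this diff):

# Hypothetical re-parsing of the training log shown above.
import re

steps, accs = [], []
with open("mDeBERTa (ft) V6/cls_log.txt", encoding="utf-8") as f:
    for line in f:
        m = re.match(r"Step (\d+) -- Accuracy: ([\d.]+)", line.strip())
        if m:
            steps.append(int(m.group(1)))
            accs.append(float(m.group(2)))

print(steps[:3], accs[:3])  # [0, 100, 200] [0.3039772727272727, 0.859375, 0.8747159090909091]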
 
https://huggingface.co/spaces/khaiphan29/fact-checking-api/mDeBERTa (ft) V6/mDeBERTa-v3-base-mnli-xnli-mean/config.json DELETED
@@ -1,45 +0,0 @@
- {
-   "_name_or_path": "/content/checkpoint",
-   "architectures": [
-     "DebertaV2Model"
-   ],
-   "attention_probs_dropout_prob": 0.1,
-   "hidden_act": "gelu",
-   "hidden_dropout_prob": 0.1,
-   "hidden_size": 768,
-   "id2label": {
-     "0": "entailment",
-     "1": "neutral",
-     "2": "contradiction"
-   },
-   "initializer_range": 0.02,
-   "intermediate_size": 3072,
-   "label2id": {
-     "contradiction": 2,
-     "entailment": 0,
-     "neutral": 1
-   },
-   "layer_norm_eps": 1e-07,
-   "max_position_embeddings": 512,
-   "max_relative_positions": -1,
-   "model_type": "deberta-v2",
-   "norm_rel_ebd": "layer_norm",
-   "num_attention_heads": 12,
-   "num_hidden_layers": 12,
-   "pad_token_id": 0,
-   "pooler_dropout": 0,
-   "pooler_hidden_act": "gelu",
-   "pooler_hidden_size": 768,
-   "pos_att_type": [
-     "p2c",
-     "c2p"
-   ],
-   "position_biased_input": false,
-   "position_buckets": 256,
-   "relative_attention": true,
-   "share_att_key": true,
-   "torch_dtype": "float32",
-   "transformers_version": "4.35.0",
-   "type_vocab_size": 0,
-   "vocab_size": 251000
- }
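For reference, a brief illustrative sketch of loading the encoder this config.json describes (not part of the commit; the local directory path is the one used by the deleted myNLI.py and nli_v3.py and is assumed to exist):

# Hypothetical loading of the fine-tuned mDeBERTa encoder described by the config above.
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("MoritzLaurer/mDeBERTa-v3-base-mnli-xnli")
encoder = AutoModel.from_pretrained("src/mDeBERTa (ft) V6/mDeBERTa-v3-base-mnli-xnli-mean")

print(encoder.config.hidden_size)  # 768, the input_dims used by the NLI_model classifier head
print(encoder.config.id2label)     # {0: 'entailment', 1: 'neutral', 2: 'contradiction'}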
 
https://huggingface.co/spaces/khaiphan29/fact-checking-api/mDeBERTa (ft) V6/mDeBERTa-v3-base-mnli-xnli-mean/model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:1c7e80e8237ad2969b1c989d71f97fa7b950fd239bfa8b3329f0535a0b8a2aca
- size 1112897768
 
https://huggingface.co/spaces/khaiphan29/fact-checking-api/mDeBERTa (ft) V6/mean.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:7f963dfcdad5469498af3b396c5af0e27365e59a01498c51896b9e6547851cd4
- size 11071
 
https://huggingface.co/spaces/khaiphan29/fact-checking-api/mDeBERTa (ft) V6/mean_log.txt DELETED
@@ -1,76 +0,0 @@
-
- Step 0 -- Accuracy: 0.275 -- macro_f1: 0.24245894645844043 -- loss: 1.1975505352020264
-
- Step 100 -- Accuracy: 0.8230113636363636 -- macro_f1: 0.8247917227891541 -- loss: 0.5072745084762573
-
- Step 200 -- Accuracy: 0.8585227272727273 -- macro_f1: 0.8596474113005192 -- loss: 0.3576969504356384
-
- Step 300 -- Accuracy: 0.8616477272727273 -- macro_f1: 0.8619445917534628 -- loss: 0.22678352892398834
-
- Step 400 -- Accuracy: 0.8710227272727272 -- macro_f1: 0.8713149438253084 -- loss: 0.3302939534187317
-
- Step 500 -- Accuracy: 0.8491477272727272 -- macro_f1: 0.8497535984618637 -- loss: 0.8534196615219116
-
- Step 600 -- Accuracy: 0.8627840909090909 -- macro_f1: 0.8630171351987245 -- loss: 0.27207863330841064
-
- Step 700 -- Accuracy: 0.8676136363636363 -- macro_f1: 0.8681189318753203 -- loss: 0.5472040772438049
-
- Step 800 -- Accuracy: 0.8480113636363636 -- macro_f1: 0.8474828960740969 -- loss: 0.20389704406261444
-
- Step 900 -- Accuracy: 0.8625 -- macro_f1: 0.8627369387200629 -- loss: 0.7003616094589233
-
- Step 1000 -- Accuracy: 0.8471590909090909 -- macro_f1: 0.8474576933366409 -- loss: 0.39897170662879944
-
- Step 1100 -- Accuracy: 0.8647727272727272 -- macro_f1: 0.8648449015557045 -- loss: 0.30028393864631653
-
- Step 1200 -- Accuracy: 0.8355113636363637 -- macro_f1: 0.8357176579844655 -- loss: 0.5329824090003967
-
- Step 1300 -- Accuracy: 0.8318181818181818 -- macro_f1: 0.832158484567787 -- loss: 0.04946904629468918
-
- Step 1400 -- Accuracy: 0.8275568181818181 -- macro_f1: 0.8270568913757921 -- loss: 0.290753036737442
-
- Step 1500 -- Accuracy: 0.8619318181818182 -- macro_f1: 0.8620216901652552 -- loss: 0.17760200798511505
-
- Step 1600 -- Accuracy: 0.8366477272727273 -- macro_f1: 0.8372501215741125 -- loss: 0.18745465576648712
-
- Step 1700 -- Accuracy: 0.8556818181818182 -- macro_f1: 0.8555692365839257 -- loss: 0.09077112376689911
-
- Step 1800 -- Accuracy: 0.8571022727272727 -- macro_f1: 0.8569408344903815 -- loss: 0.24079212546348572
-
- Step 1900 -- Accuracy: 0.8122159090909091 -- macro_f1: 0.8117034674801616 -- loss: 0.3681311309337616
-
- Step 2000 -- Accuracy: 0.8318181818181818 -- macro_f1: 0.8319676688379705 -- loss: 0.2374744713306427
-
- Step 2100 -- Accuracy: 0.8443181818181819 -- macro_f1: 0.8442918629955193 -- loss: 0.4600515365600586
-
- Step 2200 -- Accuracy: 0.8278409090909091 -- macro_f1: 0.8269904995679983 -- loss: 0.3283902704715729
-
- Step 2300 -- Accuracy: 0.8298295454545455 -- macro_f1: 0.8299882032010862 -- loss: 1.0965081453323364
-
- Step 2400 -- Accuracy: 0.8159090909090909 -- macro_f1: 0.8159808860940237 -- loss: 0.7295159697532654
-
- Step 2500 -- Accuracy: 0.8159090909090909 -- macro_f1: 0.8142475187664063 -- loss: 0.3925968408584595
-
- Step 2600 -- Accuracy: 0.8204545454545454 -- macro_f1: 0.820545798600696 -- loss: 0.3808274567127228
-
- Step 2700 -- Accuracy: 0.8198863636363637 -- macro_f1: 0.8199413434559383 -- loss: 0.26008090376853943
-
- Step 2800 -- Accuracy: 0.8056818181818182 -- macro_f1: 0.8051566431375038 -- loss: 0.20567485690116882
-
- Step 2900 -- Accuracy: 0.784375 -- macro_f1: 0.7848921849530183 -- loss: 0.5506788492202759
-
- Step 3000 -- Accuracy: 0.8153409090909091 -- macro_f1: 0.8150634367874668 -- loss: 0.4250873923301697
-
- Step 3100 -- Accuracy: 0.7991477272727273 -- macro_f1: 0.8000715520252392 -- loss: 0.4798588752746582
-
- Step 3200 -- Accuracy: 0.7840909090909091 -- macro_f1: 0.7836356305606565 -- loss: 0.5604580640792847
-
- Step 3300 -- Accuracy: 0.7977272727272727 -- macro_f1: 0.7965403402362528 -- loss: 0.26682722568511963
-
- Step 3400 -- Accuracy: 0.809375 -- macro_f1: 0.8087947373143304 -- loss: 0.3252097964286804
-
- Step 3500 -- Accuracy: 0.7568181818181818 -- macro_f1: 0.7548780108676749 -- loss: 0.9467527866363525
-
- Step 3600 -- Accuracy: 0.7889204545454546 -- macro_f1: 0.7892382882596812 -- loss: 0.29441171884536743
-
- Step 3700 -- Accuracy: 0.7227272727272728 -- macro_f1: 0.7227876418017654 -- loss: 0.8389160633087158
 
https://huggingface.co/spaces/khaiphan29/fact-checking-api/mDeBERTa (ft) V6/plot.png DELETED
Binary file (153 kB)
 
https://huggingface.co/spaces/khaiphan29/fact-checking-api/mDeBERTa (ft) V6/public_train_v4.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:56c03b9bb2cab8ffbe138badea76b6275ebad727e99f5040d2a8c21f2dcfaff2
- size 227113690
 
https://huggingface.co/spaces/khaiphan29/fact-checking-api/myNLI.py DELETED
@@ -1,190 +0,0 @@
- import torch
- from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
- from sentence_transformers import SentenceTransformer, util
- import nltk
-
- # import datasets
- from datasets import Dataset, DatasetDict
-
- from typing import List
-
- from .utils import timer_func
- from .nli_v3 import NLI_model
- from .crawler import MyCrawler
-
- int2label = {0:'SUPPORTED', 1:'NEI', 2:'REFUTED'}
-
- class FactChecker:
-
-     @timer_func
-     def __init__(self):
-         self.INPUT_TYPE = "mean"
-         self.load_model()
-
-     @timer_func
-     def load_model(self):
-         self.envir = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
-
-         # Load LLM
-         self.tokenizer = AutoTokenizer.from_pretrained("MoritzLaurer/mDeBERTa-v3-base-mnli-xnli") # LOAD mDEBERTa TOKENIZER
-         self.mDeBertaModel = AutoModel.from_pretrained(f"src/mDeBERTa (ft) V6/mDeBERTa-v3-base-mnli-xnli-{self.INPUT_TYPE}") # LOAD FINETUNED MODEL
-         # Load classifier model
-         self.checkpoints = torch.load(f"src/mDeBERTa (ft) V6/{self.INPUT_TYPE}.pt", map_location=self.envir)
-
-         self.classifierModel = NLI_model(768, torch.tensor([0., 0., 0.])).to(self.envir)
-         self.classifierModel.load_state_dict(self.checkpoints['model_state_dict'])
-
-         #Load model for predict similarity
-         self.model_sbert = SentenceTransformer('keepitreal/vietnamese-sbert')
-
-     @timer_func
-     def get_similarity_v2(self, src_sents, dst_sents, threshold = 0.4):
-         corpus_embeddings = self.model_sbert.encode(dst_sents, convert_to_tensor=True)
-         top_k = min(5, len(dst_sents))
-         ls_top_results = []
-         for query in src_sents:
-             query_embedding = self.model_sbert.encode(query, convert_to_tensor=True)
-             # We use cosine-similarity and torch.topk to find the highest 5 scores
-             cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
-             top_results = torch.topk(cos_scores, k=top_k)
-
-             # print("\n\n======================\n\n")
-             # print("Query:", src_sents)
-             # print("\nTop 5 most similar sentences in corpus:")
-             ls_top_results.append({
-                 "top_k": top_k,
-                 "claim": query,
-                 "sim_score": top_results,
-                 "evidences": [dst_sents[idx] for _, idx in zip(top_results[0], top_results[1])],
-             })
-
-             # for score, idx in zip(top_results[0], top_results[1]):
-             #     print(dst_sents[idx], "(Score: {:.4f})".format(score))
-         return None,ls_top_results
-
-     @timer_func
-     def inferSample(self, evidence, claim):
-
-         @timer_func
-         def mDeBERTa_tokenize(data): # mDeBERTa model: Taking input_ids
-             premises = [premise for premise, _ in data['sample']]
-             hypothesis = [hypothesis for _, hypothesis in data['sample']]
-
-             with torch.no_grad():
-                 input_token = (self.tokenizer(premises, hypothesis, truncation=True, return_tensors="pt", padding = True)['input_ids']).to(self.envir)
-                 embedding = self.mDeBertaModel(input_token).last_hidden_state
-
-             mean_embedding = torch.mean(embedding[:, 1:, :], dim = 1)
-             cls_embedding = embedding[:, 0, :]
-
-             return {'mean':mean_embedding, 'cls':cls_embedding}
-
-         @timer_func
-         def predict_mapping(batch):
-             with torch.no_grad():
-                 predict_label, predict_prob = self.classifierModel.predict_step((batch[self.INPUT_TYPE].to(self.envir), None))
-                 return {'label':predict_label, 'prob':-predict_prob}
-
-         # Mapping the predict label into corresponding string labels
-         @timer_func
-         def output_predictedDataset(predict_dataset):
-             for record in predict_dataset:
-                 labels = int2label[ record['label'].item() ]
-                 confidence = record['prob'].item()
-
-                 return {'labels':labels, 'confidence':confidence}
-
-         dataset = {'sample':[(evidence, claim)], 'key': [0]}
-         output_dataset = DatasetDict({
-             'infer': Dataset.from_dict(dataset)
-         })
-
-         @timer_func
-         def tokenize_dataset():
-
-             tokenized_dataset = output_dataset.map(mDeBERTa_tokenize, batched=True, batch_size=1)
-             return tokenized_dataset
-
-         tokenized_dataset = tokenize_dataset()
-         tokenized_dataset = tokenized_dataset.with_format("torch", [self.INPUT_TYPE, 'key'])
-         # Running inference step
-         predicted_dataset = tokenized_dataset.map(predict_mapping, batched=True, batch_size=tokenized_dataset['infer'].num_rows)
-         return output_predictedDataset(predicted_dataset['infer'])
-
-     @timer_func
-     def predict_vt(self, claim: str) -> List:
-         # import pdb; pdb.set_trace()
-         # step 1: crawl evidences from bing search
-         crawler = MyCrawler()
-         evidences = crawler.searchGoogle(claim)
-
-         # evidences = crawler.get_evidences(claim)
-         # step 2: use emebdding setences to search most related setences
-         if len(evidences) == 0:
-             return None
-
-         for evidence in evidences:
-             print(evidence['url'])
-             top_evidence = evidence["content"]
-
-             post_message = nltk.tokenize.sent_tokenize(claim)
-             evidences = nltk.tokenize.sent_tokenize(top_evidence)
-             _, top_rst = self.get_similarity_v2(post_message, evidences)
-
-             print(top_rst)
-
-             ls_evidence, final_verdict = self.get_result_nli_v2(top_rst)
-
-             print("FINAL: " + final_verdict)
-             # _, top_rst = self.get_similarity_v1(post_message, evidences)
-             # ls_evidence, final_verdict = self.get_result_nli_v1(post_message, top_rst, evidences)
-             return ls_evidence, final_verdict
-
-
-     @timer_func
-     def predict(self, claim):
-         crawler = MyCrawler()
-         evidences = crawler.searchGoogle(claim)
-
-         if evidences:
-             tokenized_claim = nltk.tokenize.sent_tokenize(claim)
-             evidence = evidences[0]
-             tokenized_evidence = nltk.tokenize.sent_tokenize(evidence["content"])
-             # print("TOKENIZED EVIDENCES")
-             # print(tokenized_evidence)
-             _, top_rst = self.get_similarity_v2(tokenized_claim, tokenized_evidence)
-
-             processed_evidence = "\n".join(top_rst[0]["evidences"])
-             print(processed_evidence)
-
-             nli_result = self.inferSample(processed_evidence, claim)
-             return {
-                 "claim": claim,
-                 "label": nli_result["labels"],
-                 "confidence": nli_result['confidence'],
-                 "evidence": processed_evidence if nli_result["labels"] != "NEI" else "",
-                 "provider": evidence['provider'],
-                 "url": evidence['url']
-             }
-
-
-
-     @timer_func
-     def predict_nofilter(self, claim):
-         crawler = MyCrawler()
-         evidences = crawler.searchGoogle(claim)
-         tokenized_claim = nltk.tokenize.sent_tokenize(claim)
-
-         evidence = evidences[0]
-
-         processed_evidence = evidence['content']
-
-         nli_result = self.inferSample(processed_evidence, claim)
-         return {
-             "claim": claim,
-             "label": nli_result["labels"],
-             "confidence": nli_result['confidence'],
-             "evidence": processed_evidence if nli_result["labels"] != "NEI" else "",
-             "provider": evidence['provider'],
-             "url": evidence['url']
-         }
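For context, an illustrative end-to-end sketch of the FactChecker class deleted above (not part of the commit; it assumes the package layout implied by the relative imports, e.g. src/myNLI.py, and that the fine-tuned weights under "src/mDeBERTa (ft) V6/" are available locally):

# Hypothetical usage of the deleted FactChecker.
from src.myNLI import FactChecker

checker = FactChecker()  # loads the tokenizer, mDeBERTa encoder, classifier head and SBERT model at init
verdict = checker.predict("Filip Nguyễn đủ điều kiện dự Asian Cup 2024")
if verdict:  # predict() returns None when no evidence could be crawled
    print(verdict["label"], verdict["confidence"])  # SUPPORTED / NEI / REFUTED plus a confidence score
    print(verdict["provider"], verdict["url"])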
 
https://huggingface.co/spaces/khaiphan29/fact-checking-api/nli_v3.py DELETED
@@ -1,115 +0,0 @@
- import torch
- from torch import nn as nn
- import pandas as pd
-
- from transformers import AutoModel, AutoTokenizer
-
- # import datasets
- from datasets import Dataset, DatasetDict
-
- from sklearn.metrics import classification_report
- from sklearn.metrics._classification import _check_targets
-
- envir = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
-
- int2label = {0:'SUPPORTED', 1:'NEI', 2:'REFUTED'}
-
- class NLI_model(nn.Module):
-     def __init__(self, input_dims, class_weights=torch.tensor([0., 0., 0.])):
-         super(NLI_model, self).__init__()
-
-         self.classification = nn.Sequential(
-             nn.Linear(input_dims, 3)
-         )
-
-         self.criterion = nn.CrossEntropyLoss(class_weights)
-
-     def forward(self, input):
-         output_linear = self.classification(input)
-         return output_linear
-
-     def training_step(self, train_batch, batch_idx=0):
-         input_data, targets = train_batch
-         outputs = self.forward(input_data)
-         loss = self.criterion(outputs, targets)
-         return loss
-
-     def predict_step(self, batch, batch_idx=0):
-         input_data, _ = batch
-         outputs = self.forward(input_data)
-         prob = outputs.softmax(dim = -1)
-         sort_prob, sort_indices = torch.sort(-prob, 1)
-         return sort_indices[:,0], sort_prob[:,0]
-
-     def validation_step(self, val_batch, batch_idx=0):
-         _, targets = val_batch
-         sort_indices, _ = self.predict_step(val_batch, batch_idx)
-         report = classification_report(list(targets.to('cpu').numpy()), list(sort_indices.to('cpu').numpy()), output_dict=True, zero_division = 1)
-         return report
-
-     def test_step(self, batch, dict_form, batch_idx=0):
-         _, targets = batch
-         sort_indices, _ = self.predict_step(batch, batch_idx)
-         report = classification_report(targets.to('cpu').numpy(), sort_indices.to('cpu').numpy(), output_dict=dict_form, zero_division = 1)
-         return report
-
-     def configure_optimizers(self):
-         return torch.optim.Adam(self.parameters(), lr = 1e-5)
-
-
- def inferSample(evidence, claim, tokenizer, mDeBertaModel, classifierModel, input_type):
-
-     def mDeBERTa_tokenize(data): # mDeBERTa model: Taking input_ids
-         premises = [premise for premise, _ in data['sample']]
-         hypothesis = [hypothesis for _, hypothesis in data['sample']]
-
-         with torch.no_grad():
-             input_token = (tokenizer(premises, hypothesis, truncation=True, return_tensors="pt", padding = True)['input_ids']).to(envir)
-             embedding = mDeBertaModel(input_token).last_hidden_state
-
-         mean_embedding = torch.mean(embedding[:, 1:, :], dim = 1)
-         cls_embedding = embedding[:, 0, :]
-
-         return {'mean':mean_embedding, 'cls':cls_embedding}
-
-     def predict_mapping(batch):
-         with torch.no_grad():
-             predict_label, predict_prob = classifierModel.predict_step((batch[input_type].to(envir), None))
-             return {'label':predict_label, 'prob':-predict_prob}
-
-     # Mapping the predict label into corresponding string labels
-     def output_predictedDataset(predict_dataset):
-         for record in predict_dataset:
-             labels = int2label[ record['label'].item() ]
-             confidence = record['prob'].item()
-
-             return {'labels':labels, 'confidence':confidence}
-
-     dataset = {'sample':[(evidence, claim)], 'key': [0]}
-
-     output_dataset = DatasetDict({
-         'infer': Dataset.from_dict(dataset)
-     })
-
-     tokenized_dataset = output_dataset.map(mDeBERTa_tokenize, batched=True, batch_size=1)
-     tokenized_dataset = tokenized_dataset.with_format("torch", [input_type, 'key'])
-
-     # Running inference step
-     predicted_dataset = tokenized_dataset.map(predict_mapping, batched=True, batch_size=tokenized_dataset['infer'].num_rows)
-     return output_predictedDataset(predicted_dataset['infer'])
-
- if __name__ == '__main__':
-     # CHANGE 'INPUT_TYPE' TO CHANGE MODEL
-     INPUT_TYPE = 'mean' # USE "MEAN" OR "CLS" LAST HIDDEN STATE
-
-     # Load LLM
-     tokenizer = AutoTokenizer.from_pretrained("MoritzLaurer/mDeBERTa-v3-base-mnli-xnli") # LOAD mDEBERTa TOKENIZER
-     mDeBertaModel = AutoModel.from_pretrained(f"src/mDeBERTa (ft) V6/mDeBERTa-v3-base-mnli-xnli-{INPUT_TYPE}") # LOAD FINETUNED MODEL
-     # Load classifier model
-     checkpoints = torch.load(f"src/mDeBERTa (ft) V6/{INPUT_TYPE}.pt", map_location=envir)
-     classifierModel = NLI_model(768, torch.tensor([0., 0., 0.])).to(envir)
-     classifierModel.load_state_dict(checkpoints['model_state_dict'])
-
-     evidence = "Sau khi thẩm định, Liên đoàn Bóng đá châu Á AFC xác nhận thủ thành mới nhập quốc tịch của Việt Nam Filip Nguyễn đủ điều kiện thi đấu ở Asian Cup 2024."
-     claim = "Filip Nguyễn đủ điều kiện dự Asian Cup 2024"
-     print(inferSample(evidence, claim, tokenizer, mDeBertaModel, classifierModel, INPUT_TYPE))
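A short illustrative sketch of the classifier head defined above, used in isolation (not part of the commit; the random tensor stands in for the mean/CLS embeddings that mDeBERTa would produce):

# Hypothetical use of NLI_model on its own.
import torch
from nli_v3 import NLI_model  # assumes nli_v3.py is importable from the working directory

head = NLI_model(768)                   # one linear layer mapping a 768-dim embedding to 3 classes
fake_embeddings = torch.randn(2, 768)   # stand-in for embeddings of two (premise, hypothesis) pairs
labels, neg_probs = head.predict_step((fake_embeddings, None))
print(labels, -neg_probs)               # predicted class indices (0/1/2) and their softmax probabilities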
 
https://huggingface.co/spaces/khaiphan29/fact-checking-api/utils.py DELETED
@@ -1,12 +0,0 @@
- from time import time
-
- def timer_func(func):
-     # This function shows the execution time of
-     # the function object passed
-     def wrap_func(*args, **kwargs):
-         t1 = time()
-         result = func(*args, **kwargs)
-         t2 = time()
-         print(f'Function {func.__name__!r} executed in {(t2-t1):.4f}s')
-         return result
-     return wrap_func
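Finally, an illustrative sketch of the timer_func decorator defined above, which the deleted crawler.py and myNLI.py wrap around their methods (not part of the commit; the toy function is hypothetical):

# Hypothetical usage of the deleted timing decorator.
from time import sleep
from src.utils import timer_func  # assumes the same "src" package layout as the relative imports above

@timer_func
def slow_add(a, b):
    sleep(0.5)  # stand-in for real work such as an HTTP request or a model forward pass
    return a + b

slow_add(1, 2)  # prints something like: Function 'slow_add' executed in 0.5003s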