File size: 3,773 Bytes
1b539bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93

from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import CharacterTextSplitter

from langchain_text_splitters import SentenceTransformersTokenTextSplitter
from transformers import DistilBertTokenizer
from langchain_huggingface import HuggingFaceEmbeddings
#import bs4
from constants import *
from keybert import KeyBERT
import pandas as pd
import re
import json
#from langchain_chroma import Chroma



# Shared, module-level models: built once at import time and reused by every
# webprofiler() call.

# DistilBERT word-piece tokenizer, handed to the splitter below.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Text splitter built from the tokenizer above.
# NOTE(review): chunk_size / chunk_overlap are presumably counted in tokens of
# that tokenizer rather than characters - confirm against the langchain docs.
splitter = CharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer,
    chunk_size=600,
    chunk_overlap=50,
)

# Keyword extractor constructed with KeyBERT's default settings.
kw_model = KeyBERT()


         #'globalpartnerships':['capacity-development','finance','multi-stakeholder-partnerships','science','trade','national-sustainable-development-strategies']}

def preprocess(text):
    """Normalise a raw text fragment into lowercase, letters-only words.

    Processing steps:
      1. Strip anything that looks like an HTML/XML tag (``<...>``).
      2. Collapse every run of characters that are not ASCII letters
         (digits, punctuation, hyphens, whitespace, non-ASCII) into a single
         space and lowercase the result.
      3. Discard the fragment entirely if it contains the phrase
         ``what can i do``.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned string (it may keep one leading and/or trailing space),
        or ``""`` when the fragment contains ``what can i do``.
    """
    text = re.sub(r'<[^>]*>', '', text)
    # After this substitution only ASCII letters and single spaces remain, so
    # no further non-word / hyphen clean-up is required.
    text = re.sub(r'[^A-Za-z]+', ' ', text).lower()
    # The previous implementation called re.findall(text, 'what can i do'),
    # i.e. it used the cleaned text as the regex pattern. That never matched
    # real sentences containing the phrase and wrongly blanked short fragments
    # that happen to be substrings of it. A substring test is what is meant.
    if 'what can i do' in text:
        return ""
    return text
def webprofiler(base='https://www.un.org/sustainabledevelopment/', subpage='',
                GoalNumber=1, GoalLabel=""):
    """Scrape one web page into a labelled sentence table plus its keywords.

    The page at ``base + subpage`` is loaded with ``WebBaseLoader`` and split
    into chunks with the module-level ``splitter``. Chunks that contain
    ``"HomeOverview"`` or are shorter than 800 characters are skipped, and
    scanning stops at the first chunk whose final sentence fragment is exactly
    ``"Links"``. Each remaining chunk is split on ``.``, ``!`` and ``?``;
    fragments with fewer than 7 words or containing
    ``"please visit this link"`` are dropped, the rest are lowercased.

    For every kept sentence the top KeyBERT keyword (1-2 word phrases) is
    recorded. Sentences are then cleaned with ``preprocess`` and rows whose
    cleaned text is 14 characters or shorter are removed.

    Args:
        base: URL prefix of the site to scrape.
        subpage: Path appended to ``base`` to form the full URL.
        GoalNumber: Integer goal id, stored in the ``label`` column and used
            in the ``category`` text.
        GoalLabel: Goal name used in the ``category`` text.

    Returns:
        A tuple ``(df_filtered, keyword_set)`` where ``df_filtered`` is a
        DataFrame with the columns ``text_data``, ``source``,
        ``best_key_word``, ``category`` and ``label``, and ``keyword_set`` is
        the set of best keywords of all extracted sentences (including those
        whose rows were filtered out for being too short).
    """
    full_path = base + subpage
    docs = WebBaseLoader(web_paths=[full_path]).load()
    doc_chunks = splitter.split_documents(docs)

    corpus = []
    for chunk in doc_chunks:
        content = chunk.page_content
        if "HomeOverview" in content:
            continue
        if len(content) < 800:  # skip ahead to the body of the text
            continue
        sentences = re.split(r'[.!?]+', content)
        if sentences[-1] == "Links":  # end of page
            break
        # The fragments are already split on '.', so no second split is needed.
        for sentence in sentences:
            if 'please visit this link' in sentence:
                continue
            if len(sentence.split()) < 7:  # ignore fragments with few words
                continue
            corpus.append(sentence.lower())

    # One keyword per sentence, kept aligned with `corpus` so it can become a
    # DataFrame column. KeyBERT can return an empty list when it finds no
    # candidate phrase; store "" instead of raising IndexError.
    keyword_list = []
    for sentence in corpus:
        keywords = kw_model.extract_keywords(sentence, keyphrase_ngram_range=(1, 2))
        keyword_list.append(keywords[0][0] if keywords else "")

    df = pd.DataFrame(data={'text_data': corpus, 'source': full_path})
    df['text_data'] = df.text_data.apply(preprocess)
    df['best_key_word'] = keyword_list

    mask = df['text_data'].str.len() > 14  # ignore fragments with very few chars
    # Take an explicit copy so the column assignments below act on an
    # independent frame rather than on a slice of `df`.
    df_filtered = df[mask].copy()
    df_filtered['category'] = 'Goal %d: %s' % (GoalNumber, GoalLabel)
    df_filtered['label'] = GoalNumber

    # Drop the "" placeholders used for sentences without any keyword.
    unique_keywords = {keyword for keyword in keyword_list if keyword}
    return df_filtered, unique_keywords

if __name__ == '__main__':
    # Script entry point: for every goal in `sdg_goals`, scrape the UN goal
    # page plus each related topic page listed in `sdg_topics`, write one
    # feature CSV per goal and finally one JSON file with all keywords.
    import os

    # Neither DataFrame.to_csv nor open() creates missing directories; make
    # sure the output folder exists before the (slow) scraping starts.
    os.makedirs('training_data', exist_ok=True)

    keyword_dict = {}
    # Goals are numbered from 1 in the iteration order of `sdg_goals`.
    for goal_count, (i, v) in enumerate(sdg_goals.items(), start=1):
        print("source: https://www.un.org/sustainabledevelopment/%s" %i)
        print("Goal %d: %s" %(goal_count,v))
        df_un, un_keyword = webprofiler(subpage=i, GoalNumber=goal_count, GoalLabel=v)
        features_list = [df_un]
        for subpage in sdg_topics[i]:
            df_unsdg, new_keywords = webprofiler(
                base='https://sdgs.un.org/topics/',
                subpage=subpage,
                GoalNumber=goal_count,
                GoalLabel=v,
            )
            un_keyword = un_keyword.union(new_keywords)
            features_list.append(df_unsdg)
        df_all = pd.concat(features_list)
        # Sets have no stable order; sort so the JSON payload is reproducible.
        keyword_dict[i] = sorted(un_keyword)
        df_all.to_csv("training_data/FeatureSet_%d.csv" %goal_count)
    with open('training_data/KeywordPayload.json','w') as fp:
        json.dump(keyword_dict, fp)