Spaces:
Sleeping
Sleeping
| from langchain_community.document_loaders import WebBaseLoader | |
| from langchain_text_splitters import CharacterTextSplitter | |
| from langchain_text_splitters import SentenceTransformersTokenTextSplitter | |
| from transformers import DistilBertTokenizer | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| #import bs4 | |
| from constants import * | |
| from keybert import KeyBERT | |
| import pandas as pd | |
| import re | |
| import json | |
| #from langchain_chroma import Chroma | |
# Module-level, shared by every webprofiler() call so the models are
# loaded only once per process.

# Tokenizer handed to the text splitter below.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Splits loaded documents into chunks of size 600 with an overlap of 50,
# with lengths measured through the tokenizer above.
splitter = CharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer,
    chunk_size=600,
    chunk_overlap=50,
)

# Keyword extractor built with KeyBERT's default settings.
kw_model = KeyBERT()
| #'globalpartnerships':['capacity-development','finance','multi-stakeholder-partnerships','science','trade','national-sustainable-development-strategies']} | |
def preprocess(text):
    """Normalise a raw text fragment before it goes into the feature table.

    Processing steps, in order:
      1. Remove anything that looks like an HTML/XML tag (``<...>``).
      2. Replace every run of characters other than ASCII letters with a
         single space.
      3. Lower-case the result.
      4. Blank the fragment out entirely if it contains the phrase
         "what can i do".

    Args:
        text: The raw fragment as a string.

    Returns:
        A string made only of ``a-z`` and single spaces (leading/trailing
        spaces are not stripped), or ``""`` if the cleaned fragment contains
        "what can i do".
    """
    text = re.sub(r'<[^>]*>', '', text)
    # After this substitution only ASCII letters and single spaces remain,
    # so no further punctuation/hyphen clean-up is required.
    text = re.sub(r'[^A-Za-z]+', ' ', text).lower()
    # The earlier version called re.findall(text, 'what can i do') with the
    # pattern and the string swapped, so matching fragments were never
    # blanked. A plain substring test expresses the intended check.
    if 'what can i do' in text:
        return ""
    return text
def webprofiler(base='https://www.un.org/sustainabledevelopment/', subpage='',
                GoalNumber=1, GoalLabel=""):
    """Scrape one web page into a labelled table of cleaned sentences.

    The page at ``base + subpage`` is loaded with ``WebBaseLoader`` and split
    into chunks by the module-level ``splitter``. Chunks that contain
    "HomeOverview" or are shorter than 800 characters are skipped, and
    processing stops at the first remaining chunk whose last sentence piece
    is exactly "Links". Each kept chunk is split on ``.``, ``!`` and ``?``;
    pieces containing 'please visit this link' or having fewer than seven
    whitespace-separated words are dropped. The top KeyBERT keyword
    (1-2 word n-grams) is extracted for every surviving sentence.

    Args:
        base: URL prefix of the site to scrape.
        subpage: Path appended to ``base`` to form the full URL.
        GoalNumber: Numeric label written to the ``label`` column and used in
            the ``category`` text.
        GoalLabel: Goal name used in the ``category`` text.

    Returns:
        A tuple ``(df_filtered, keyword_set)``:

        * ``df_filtered`` - DataFrame with columns ``text_data`` (output of
          ``preprocess``), ``source`` (the full URL), ``best_key_word``,
          ``category`` ("Goal <n>: <label>") and ``label``. Only rows whose
          cleaned text is longer than 14 characters are kept.
        * ``keyword_set`` - set of the non-empty top keywords of all
          extracted sentences, including sentences that were later removed
          by the length filter.
    """
    full_path = base + subpage
    docs = WebBaseLoader(web_paths=[full_path]).load()
    doc_chunks = splitter.split_documents(docs)

    corpus = []
    for d in doc_chunks:
        content = d.page_content
        if "HomeOverview" in content:
            continue
        if len(content) < 800:  # Skips to body of text
            continue
        sents = re.split(r'[.!?]+', content)
        if sents[-1] == "Links":  # End of page
            break
        # The pieces no longer contain '.', '!' or '?', so a second split
        # on '.' would be a no-op and is not needed.
        for s in sents:
            if 'please visit this link' in s:
                continue
            if len(s.split()) < 7:  # ignore sentence fragments with few words
                continue
            corpus.append(s.lower())

    # One keyword per sentence, kept aligned with `corpus` so it can become
    # a DataFrame column. An empty result from KeyBERT is recorded as ""
    # instead of raising IndexError.
    best_keywords = []
    for t in corpus:
        keywords = kw_model.extract_keywords(t, keyphrase_ngram_range=(1, 2))
        best_keywords.append(keywords[0][0] if keywords else "")

    df = pd.DataFrame(data={'text_data': corpus, 'source': full_path})
    df['text_data'] = df.text_data.apply(preprocess)
    df['best_key_word'] = best_keywords

    mask = df['text_data'].str.len() > 14  # ignore fragments with very few char
    # Explicit copy: the columns added below must not be written onto a
    # slice of `df` (avoids pandas' SettingWithCopyWarning).
    df_filtered = df[mask].copy()
    df_filtered['category'] = 'Goal %d: %s' % (GoalNumber, GoalLabel)
    df_filtered['label'] = GoalNumber

    keyword_set = {k for k in best_keywords if k}
    return df_filtered, keyword_set
if __name__ == '__main__':
    # Script entry point: for every goal in `sdg_goals`, scrape the goal page
    # plus its topic pages from `sdg_topics`, write one feature CSV per goal
    # and finally one JSON file with the collected keywords per goal.
    import os

    # Make sure the output folder exists before any scraping starts, so a
    # missing directory cannot abort the run at the first write.
    os.makedirs('training_data', exist_ok=True)

    keyword_dict = {}
    # Goals are numbered 1..N in the iteration order of `sdg_goals`.
    for goal_count, (i, v) in enumerate(sdg_goals.items(), start=1):
        print("source: https://www.un.org/sustainabledevelopment/%s" % i)
        print("Goal %d: %s" % (goal_count, v))

        df_un, un_keyword = webprofiler(subpage=i, GoalNumber=goal_count, GoalLabel=v)
        features_list = [df_un]
        for subpage in sdg_topics[i]:
            df_unsdg, new_keywords = webprofiler(
                base='https://sdgs.un.org/topics/',
                subpage=subpage,
                GoalNumber=goal_count,
                GoalLabel=v,
            )
            un_keyword = un_keyword.union(new_keywords)
            features_list.append(df_unsdg)

        df_all = pd.concat(features_list)
        # Sorted so the JSON payload is identical between runs; a set has
        # no stable order.
        keyword_dict[i] = sorted(un_keyword)
        df_all.to_csv("training_data/FeatureSet_%d.csv" % goal_count)

    with open('training_data/KeywordPayload.json', 'w') as fp:
        json.dump(keyword_dict, fp)