# Agents_DeepSearch/src/train_bert/PreProcess_TrainingData.py
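"""Scrape UN Sustainable Development Goal web pages, split them into
sentences, extract a best keyword per sentence with KeyBERT, and write
per-goal training CSVs plus a keyword JSON payload."""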
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import CharacterTextSplitter
from transformers import DistilBertTokenizer
from constants import *
from keybert import KeyBERT
import pandas as pd
import re
import json
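# NOTE (assumption, inferred from the __main__ block below): `constants` is
# expected to define `sdg_goals` (dict mapping a un.org subpage slug to its
# goal label) and `sdg_topics` (dict mapping the same slug to a list of
# sdgs.un.org topic slugs).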
# Chunk pages by DistilBERT token counts so chunks align with the downstream model.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
splitter = CharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer, chunk_size=600, chunk_overlap=50)
kw_model = KeyBERT()
def preprocess(text):
    """Strip HTML tags and non-letter characters, lowercase the text, and
    blank out boilerplate sentences such as the 'what can i do' call-to-action."""
    text = re.sub(r'<[^>]*>', '', text)               # drop leftover HTML tags
    text = re.sub(r'[^A-Za-z ]+', ' ', text).lower()  # keep letters and spaces only
    text = re.sub(r'\W+', ' ', text)                  # collapse runs of whitespace
    if re.search(r'what can i do', text):             # site-wide call-to-action, not goal content
        text = ""
    return text
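# Illustrative example (not from the original source; note the trailing space):
#   preprocess("<p>End poverty in all its forms!</p>")
#   -> 'end poverty in all its forms '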
def webprofiler(base='https://www.un.org/sustainabledevelopment/', subpage='',
                GoalNumber=1, GoalLabel=""):
    """Load one SDG web page, split it into sentences, and return a labelled
    DataFrame of cleaned sentences plus the set of extracted keywords."""
    full_path = base + subpage
    loader = WebBaseLoader(web_paths=[full_path])
    docs = loader.load()
    doc_chunks = splitter.split_documents(docs)
    corpus = []
    keyword_list = []
    for d in doc_chunks:
        if "HomeOverview" in d.page_content: continue  # navigation chunk
        if len(d.page_content) < 800: continue  # skip short header chunks; body text is long
        sents = re.split(r'[.!?]+', d.page_content)
        if sents[-1] == "Links": break  # end-of-page link section reached
        for s in sents:
            if 'please visit this link' in s: continue
            if len(s.split()) < 7: continue  # ignore sentence fragments with few words
            corpus.append(s.lower())
    for t in corpus:
        keywords = kw_model.extract_keywords(t, keyphrase_ngram_range=(1, 2))
        # Guard against sentences where KeyBERT returns no candidates.
        keyword_list.append(keywords[0][0] if keywords else '')
    df = pd.DataFrame(data={'text_data': corpus, 'source': full_path})
    df['text_data'] = df.text_data.apply(preprocess)
    df['best_key_word'] = keyword_list
    mask = df['text_data'].str.len() > 14  # ignore fragments with very few characters
    df_filtered = df[mask].copy()  # .copy() avoids SettingWithCopyWarning on the assignments below
    df_filtered['category'] = 'Goal %d: %s' % (GoalNumber, GoalLabel)
    df_filtered['label'] = GoalNumber
    return df_filtered, {k for k in keyword_list if k}  # drop the empty placeholder keywords
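# Illustrative usage (the 'poverty' slug is assumed from the un.org site layout):
#   df_goal1, kws = webprofiler(subpage='poverty', GoalNumber=1, GoalLabel='No Poverty')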
if __name__ == '__main__':
    goal_count = 1
    keyword_dict = {}
    for i, v in sdg_goals.items():
        print("source: https://www.un.org/sustainabledevelopment/%s" % i)
        print("Goal %d: %s" % (goal_count, v))
        df_un, un_keyword = webprofiler(subpage=i, GoalNumber=goal_count, GoalLabel=v)
        features_list = [df_un]
        # Augment each goal with its related sdgs.un.org topic pages.
        for subpage in sdg_topics[i]:
            df_unsdg, new_keywords = webprofiler(base='https://sdgs.un.org/topics/',
                                                 subpage=subpage,
                                                 GoalNumber=goal_count, GoalLabel=v)
            un_keyword = un_keyword.union(new_keywords)
            features_list.append(df_unsdg)
        df_all = pd.concat(features_list)
        keyword_dict[i] = list(un_keyword)
        df_all.to_csv("training_data/FeatureSet_%d.csv" % goal_count)
        goal_count += 1
    with open('training_data/KeywordPayload.json', 'w') as fp:
        json.dump(keyword_dict, fp)