Spaces:
Paused
Paused
| # -*- coding: utf-8 -*- | |
| """ | |
| Created on Sat Dec 17 11:27:28 2022 | |
| @author: anas | |
| """ | |
| import pickle | |
| import numpy as np | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| import regex as re | |
| # import spacy | |
| # from spacy import displacy | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.metrics import confusion_matrix, classification_report, accuracy_score,f1_score, precision_score, recall_score | |
| #from googletrans import Translator | |
| import requests | |
class ModelPums:
    """Plain container for the artifacts that TEC.loadmodel() unpickles."""

    def __init__(self):
        # Every slot starts empty; TEC.loadmodel() fills them from the pickle.
        self.obData = None    # raw training texts used to refit the vectorizer
        self.obVec = None     # TfidfVectorizer fitted over obData
        self.obSVML = None    # linear SVM (used as the default predictor)
        self.obSVMK = None    # second SVM variant — presumably LinearSVC; confirm
        self.oblogict = None  # logistic-regression model
        self.X = None         # feature matrix
        self.Y = None         # label vector
        self.Classes = None   # index -> class-name lookup
class TEC:
    """Technique classifier: wraps the pickled sklearn models in ModelPums."""

    # Template for the per-model result collections; copied, never mutated.
    __Outs = {'svmL': None, 'svmK': None, 'logstick': None, 'target': None}

    @staticmethod
    def createOuts():
        """Return a fresh {'svmL': [], 'svmK': [], 'logstick': [], 'target': []}.

        Marked @staticmethod (it takes no self); the original bare def only
        worked when called as TEC.createOuts() and broke on instances.
        """
        return {key: [] for key in TEC.__Outs}
| def __init__(self,typemodel="svmL",model_setting=None,spt=0.3,nlp=None): | |
| self.model_setting=model_setting | |
| self.isLoad=False | |
| self.nlp=nlp | |
| def toclean(self,txt): | |
| try: | |
| url="https://ansaltwyl256.pythonanywhere.com/api/nlp/"+txt | |
| response = requests.get(url) | |
| return response.json()['description'] | |
| except: | |
| return "$" | |
    def loadmodel(self):
        """Load the pickled model bundle referenced by self.model_setting.

        Side effects: sets self.DES (class name -> description text, used by
        predictAPI), self.obMP (ModelPums), self.Qwords, self.Model,
        self.name, self.pipeline, and marks self.isLoad = True.
        """
        mm=ModelPums()
        # NOTE(review): pickle.load fully trusts these files — never point
        # model_setting at untrusted data.
        matrck=pickle.load(open(self.model_setting.path_model,'rb'))
        self.DES=pickle.load(open(self.model_setting.path_Qwords,'rb'))
        mm.obData=matrck['obData']
        # The vectorizer is re-fitted from the stored raw texts instead of
        # unpickling the stored vectorizer (see the disabled line below).
        mm.obVec=TfidfVectorizer(norm='l2')
        mm.obVec.fit(mm.obData)
        # mm.obVec=matrck['obVec']
        mm.obSVML=matrck['obSVML']
        mm.obSVMK=matrck['obSVMK']
        mm.oblogict=matrck['oblogict']
        mm.X=matrck['X']
        mm.Y=matrck['Y']
        mm.Classes=matrck['Classes']
        self.obMP=mm
        # NOTE(review): both branches assign {} — the Qwords pickle.load is
        # disabled, so Qwords is always empty regardless of path_Qwords.
        if self.model_setting.path_Qwords!="":
            self.Qwords={}#pickle.load(open(model_setting.path_Qwords,'rb'))
        else:
            self.Qwords={}
        #self.detector = Translator()
        # self.Splits(spt)
        # Default active model is the linear SVM; ChangeModel() can swap it.
        self.Model=self.obMP.obSVML
        self.name=self.model_setting.name
        self.pipeline= None
        self.isLoad=True
| def getLables(self): | |
| return self.obMP.Classes | |
| def is_found(self,words,ob): | |
| sms=[] | |
| ob=ob.lower().strip() | |
| for w in words: | |
| if w==ob: | |
| return 1,w | |
| sms.append(self.similarity(w,ob)) | |
| index=np.argmax(sms) | |
| if sms[index]>0.7: | |
| return 2, list(words)[index] | |
| return 0,'' | |
| def is_found_K(self,words,ob): | |
| ob=ob.lower().strip() | |
| for w in words: | |
| if w==ob: | |
| return 1,w | |
| return -1,'' | |
| def Training(self): | |
| self.obMP.obSVMK.fit(self.obMP.X,self.obMP.Y) | |
    def clean_dataT(self,data,typw='',is_input=False):
        """Clean a batch of texts and return the cleaned list.

        Currently short-circuits to the remote-API cleaner; everything after
        the first return is the legacy spaCy-based path and is unreachable.
        typw and is_input only mattered to that legacy path.
        """
        d,_=self.clean_dataAPI(data)
        return d
        # ---- dead code below: kept from the pre-API implementation ----
        datac=data
        dock=[]
        # fuzzy lookup for user input, exact-only for training data
        is_found= self.is_found if is_input==True else self.is_found_K
        # strips digits, @mentions, punctuation, URLs and a leading "rt"
        reg=r"([0-9])|(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
        for datac in data:
            strr=str(datac)
            strr=re.sub(reg, "", strr)
            doc=self.nlp(strr)
            disc=""
            for token in doc:
                if not token.is_punct and not token.is_stop and not token.is_space and token.is_alpha:
                    if token.pos_=='ADJ' or token.pos_=='NOUN' or token.pos_=='VERB':#token.pos_== typw:
                        qk,key=is_found(self.Qwords,token.lemma_)
                        if qk==1:
                            disc=disc+self.Qwords[token.lemma_]+" "
                        elif qk==2:
                            disc=disc+self.Qwords[key]+" "
                        elif qk==-1:
                            disc=disc+token.lemma_+" "
            disc=disc.lower().strip()
            if len(disc)>0:
                dock.append(disc)
        return dock
| def similarity(self,ob1,ob2): | |
| ob1=ob1 | |
| ob2=ob2 | |
| nob1=self.nlp(ob1) | |
| return nob1.similarity(self.nlp(ob2)) | |
| #-----------------# | |
    def clean_data(self,data,typw='',is_input=False):
        """Clean labelled (text, label) pairs.

        Currently delegates straight to clean_dataAPI (which ignores the
        labels and returns ([text], 6)); the spaCy/keyword path below is
        unreachable.
        """
        return self.clean_dataAPI(data)
        # ---- dead code below: legacy spaCy/keyword-based cleaning ----
        datac=data
        dock=[]
        labels=[]
        # fuzzy lookup for user input, exact-only for training data
        is_found= self.is_found if is_input==True else self.is_found_K
        # strips digits, @mentions, punctuation, URLs and a leading "rt"
        reg=r"([0-9])|(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
        for (datac,label) in data:
            strr=str(datac)
            strr=re.sub(reg, "", strr)
            doc=self.nlp(strr)
            disc=""
            for token in doc:
                if not token.is_punct and not token.is_stop and not token.is_space and token.is_alpha:
                    if True: # token.pos_=='ADJ' or token.pos_=='NOUN' or token.pos_=='VERB' :
                        # disc=disc+token.lemma_+" "
                        qk,key=is_found(self.Qwords,token.lemma_)
                        # print(qk,key)
                        if qk==1:
                            disc=disc+self.Qwords[key]+" "
                        elif qk==2:
                            disc=disc+self.Qwords[key]+" "
                        elif qk==-1:
                            disc=disc+token.lemma_+" "
            if len(disc)>2:
                dock.append(disc.strip())
                labels.append(label)
        return dock,labels
| #-----------------# | |
| def clean_dataAPI(self,data,typw='',is_input=False): | |
| txt=self.toclean(data) | |
| if txt !="$": | |
| return [txt],6 | |
| reg=r"([0-9])|(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?" | |
| strr=str(data) | |
| strr=re.sub(reg, "", strr) | |
| return [strr],6 | |
| def setPipeline(self,model=None): | |
| self.pipeline=model | |
| def to_tran(self,text,dest='en'): | |
| ff=True | |
| c=0 | |
| while ff: | |
| try: | |
| t=self.detector.translate(str(text),dest=dest) | |
| ff=False | |
| text=t.text | |
| except: | |
| c+=1 | |
| if c==20: | |
| ff=False | |
| print(' no connenet Tr') | |
| return text | |
| def Predict_ALL(self,description,istrans=False): | |
| if istrans: | |
| description=self.to_tran(description) | |
| if self.pipeline is not None: | |
| text_output,_,outs=self.pipeline.Predict_ALL(description) | |
| for key in outs: | |
| text_output=text_output+'--'+ outs[key] | |
| else: | |
| text_output='' | |
| mx,outs=self.get_ptedict_proba(description+' '+text_output) | |
| return text_output,mx,outs | |
| def predictAPI(self,description,is_input=False,istrans=False): | |
| if istrans: | |
| description=self.to_tran(description) | |
| clean_description,_=self.clean_dataAPI(description,'',is_input=is_input) | |
| try: | |
| features=self.obMP.obVec.transform(clean_description) | |
| yp=self.Model.predict(features) | |
| txttec=self.obMP.Classes[yp[0]] | |
| dis=self.DES[txttec] | |
| except: | |
| txttec='No' | |
| dis=" ....." | |
| return txttec,dis | |
| def predict(self,description,is_input=False,istrans=False): | |
| if istrans: | |
| description=self.to_tran(description) | |
| clean_description,_=self.clean_data([(description,' ')],'',is_input=is_input) | |
| try: | |
| features=self.obMP.obVec.transform(clean_description) | |
| yp=self.Model.predict(features) | |
| txttec=self.obMP.Classes[yp[0]] | |
| except: | |
| txttec='No' | |
| return txttec | |
| def predict_ids(self,description,is_input=False,istrans=False): | |
| if istrans: | |
| description=self.to_tran(description) | |
| # clean_description,_=self.clean_data([(description,' ')],'',is_input=is_input) | |
| try: | |
| features=self.obMP.obVec.transform(description) | |
| yp=self.Model.predict(features) | |
| # txttec=self.obMP.Classes[yp[0]] | |
| except: | |
| yp='No' | |
| return yp | |
| def ptedict_proba(self,description,mx=[],is_input=False,istrans=False): | |
| if istrans: | |
| description=self.to_tran(description) | |
| clean_description,_=self.clean_data([(description,' ')],'',is_input=is_input) | |
| try: | |
| features=self.obMP.obVec.transform(clean_description) | |
| mx=self.Model.predict_proba(features) | |
| print(np.int16(mx*100)) | |
| yp=np.argmax(mx) | |
| print(yp) | |
| txttec='Technique : '+ self.obMP.Classes[yp] | |
| except: | |
| txttec='No Found technique ...! (^_^)' | |
| return txttec | |
| def get_ptedict_proba(self,description,mx=[]): | |
| clean_description,_=self.clean_data([(description,' ')],'') | |
| try: | |
| features=self.obMP.obVec.transform(clean_description) | |
| mx=self.obMP.obSVML.predict_proba(features) | |
| yk=self.obMP.obSVMK.predict(features) | |
| # yp=np.argmax(mx) | |
| outputs={'svmK':self.obMP.Classes[yk[0]]} | |
| except: | |
| txttec='No Found technique ...! (^_^)' | |
| outputs={} | |
| return mx,outputs | |
| def get_ptedict_threemodel(self,description): | |
| clean_description,_=self.clean_data([(description,' ')],'') | |
| try: | |
| features=self.obMP.obVec.transform(clean_description) | |
| yl=self.obMP.obSVML.predict(features) | |
| yk=self.obMP.obSVMK.predict(features) | |
| ym=self.obMP.oblogict.predict(features) | |
| outputs={'svmL':self.obMP.Classes[yl[0]], | |
| 'svmK':self.obMP.Classes[yk[0]], | |
| 'logstick':self.obMP.Classes[ym[0]] | |
| } | |
| except: | |
| print('No Found technique ...! (^_^)' ) | |
| outputs={} | |
| return outputs | |
| def verification(self,inputs=[],outputs=[]): | |
| out_prodect=TEC.createOuts() | |
| unprocess=0 | |
| meta={"tf":TEC.createOuts(),"num":TEC.createOuts()} | |
| names=list(self.obMP.Classes) | |
| for i in range(len(outputs)): | |
| try: | |
| outs=self.get_ptedict_threemodel(inputs[i]) | |
| target=outputs[i].strip() | |
| names.index(target) | |
| outs['target']=target | |
| for key in outs: | |
| out_prodect[key].append(outs[key]) | |
| meta['tf'][key].append(int(outs[key]==target)) | |
| meta['num'][key].append(names.index(outs[key])) | |
| except : | |
| unprocess+=1 | |
| scores={} | |
| for key in meta['num']: | |
| if key!='target': | |
| scores[key]=self.valmodel(meta['num']['target'],meta['num'][key],' model '+key) | |
| return out_prodect,meta,scores | |
| def valmodel(self,y,yp,titel=" "): | |
| print('---------------'+titel+'------------------------' ) | |
| cr=classification_report(y, yp) | |
| print(cr) | |
| scores={} | |
| scores['accuracy']=accuracy_score(y, yp) | |
| scores['f1_score']=f1_score(y, yp, average="macro") | |
| scores['precision']=precision_score(y, yp, average="macro") | |
| scores['recall']=recall_score(y, yp, average="macro") | |
| return {'smmray':cr,'scores':scores} | |
| #---------------------- | |
| def ChangeModel(self,ob=None): | |
| if ob==None:return | |
| if type(ob) is not str: | |
| self.Model=ob | |
| else : | |
| if ob=='svmL': | |
| self.Model=self.obMP.obSVML | |
| elif ob=='svmK': | |
| self.Model=self.obMP.obSVMK | |
| else: | |
| self.Model=self.obMP.oblogict | |
| #------------------------------ | |
| def Search(self): | |
| txt=input('Enter any text :') | |
| print('Technique:'+ self.predict(txt)) | |
| def Info_Models(self): | |
| print('Number Data is ',self.obMP.X.shape) | |
| print('Number of classes :',len(self.obMP.Classes)) | |
| print ('---------simples -------------------') | |
| n=len(self.obMP.Classes) | |
| f=n%2 | |
| n=n-f | |
| for i in range(0,n,2): | |
| print( (self.obMP.Classes[i], np.sum(np.int16(self.obMP.Y==i))),'------------------',(self.obMP.Classes[i+1], np.sum(np.int16(self.obMP.Y==i+1)))) | |
| if f==1: print( (self.obMP.Classes[i+2], np.sum(np.int16(self.obMP.Y==i+2)))) | |
| def DlistModel(self): | |
| print ('SVC(kernel=\'linear\')-> svmL') | |
| print('LinearSVC(C=1.0) -> svmK ') | |
| print('LogisticRegression() -> logic') |