from typing import Union, NamedTuple
from collections.abc import Callable
import io,os,re,sys,math,time,uuid,ctypes,pickle,random,shutil,string,urllib,decimal,datetime,itertools,traceback,collections,statistics
import numpy as np, pandas as pd
import plotly.express as px
import huggingface_hub
import sklearn #, statsmodels
from sklearn import svm, neighbors, naive_bayes, neural_network, tree, ensemble, linear_model, discriminant_analysis, gaussian_process, manifold, cluster
#from statsmodels.tsa import seasonal
os.makedirs(".temp", exist_ok=True) # for temporary local files
""" remove decoration and popup menu button at top """
STYLE_CORRECTION = " ".join([
"<style>",
"header[data-testid='stHeader'] { display:none }",
"div[data-testid='stSidebarHeader'] { display:none }",
"div[data-testid='stAppViewBlockContainer'] { padding:1em }",
"div[data-testid='collapsedControl'] { background-color:#EEE }",
"a[href='https://streamlit.io/cloud'] { display:none }"
"</style>"
])
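# Hedged usage sketch (not in the original file): STYLE_CORRECTION is meant to be injected
# into a Streamlit page as raw HTML; the helper name below is hypothetical and streamlit is
# assumed to be available in the Space.
def _apply_style_correction():
    import streamlit as st  # local import so this module does not require streamlit at import time
    st.markdown(STYLE_CORRECTION, unsafe_allow_html=True)  # render the <style> block so the CSS takes effect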
###
def pandas_info(df: pd.DataFrame) -> Union[pd.DataFrame, str]:
    """ return the output of df.info() as a DataFrame (falls back to the raw text if parsing fails) """
    buffer = io.StringIO()
    df.info(buf=buffer)
    str_info = buffer.getvalue()
    try:
        lines = str_info.splitlines()
        # the per-column block starts after the header lines and ends before the dtype/memory summary;
        # splitting on whitespace breaks the "Non-Null Count" header into two tokens, hence the drop/rename
        df = (pd.DataFrame([x.split() for x in lines[5:-2]], columns=lines[3].split())
              .drop('Count', axis=1)
              .rename(columns={'Non-Null': 'Non-Null Count'}))
        return df
    except Exception as ex:
        print(ex)
        return str_info
def pandas_random_dataframe(n_cols: int = 15, n_rows: int = 100) -> pd.DataFrame:
    """ create a dataframe of random numbers, e.g. for debugging """
    df = pd.DataFrame(np.random.randn(n_rows, n_cols), columns=(f"col {i}" for i in range(n_cols)))
    return df
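# Hedged usage sketch (not part of the original module): build a throwaway random frame and
# print its df.info() summary; the function name below is hypothetical.
def _demo_pandas_helpers() -> None:
    df_random = pandas_random_dataframe(n_cols=4, n_rows=10)  # 10 rows x 4 columns of N(0, 1) noise
    df_random.columns = [f"col{i}" for i in range(4)]         # single-token names keep the info parser happy
    print(pandas_info(df_random))                             # per-column summary, or raw df.info() text on parse errors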
########################################################################################################################################
class HfRepo(NamedTuple):
    """ target Hugging Face repo plus a callable that returns an access token """
    repo_id: str
    repo_type: str
    get_token: Callable[[], str]
class HF_tools:
    """ Hugging Face Hub helpers """
    @staticmethod
    def list_models_spaces(get_token: Callable[[], str], author='f64k'):
        """ list models and datasets of the given author (spaces listing is currently commented out) """
        api = huggingface_hub.HfApi(token=get_token())
        #spaces = api.list_spaces(author=author)
        models = api.list_models(author=author)
        datasets = api.list_datasets(author=author)
        lstResult = list(datasets) + list(models)
        lstResult = [{"id": i.id, "type": type(i).__name__, "private": i.private, "tags": i.tags} for i in lstResult]
        return lstResult
    @staticmethod
    def save_dataframe_to_hf(repo: HfRepo, dfToSave: pd.DataFrame, new_filename: str, remote_subdir: str) -> Union[huggingface_hub.CommitInfo, Exception]:
        """ save dataframe to hf repo """
        try:
            local_filename = os.path.join(".temp", new_filename)
            #df.to_csv('compressed_data.zip', index=False, compression={'method': 'zip', 'archive_name': 'data.csv'})
            dfToSave.to_csv(local_filename, index=False, sep=";", encoding="utf-8")  # , compression="zip"
            apiHF = huggingface_hub.HfApi(token=repo.get_token())  # os.getenv(repo.env_token)
            path_in_repo = os.path.basename(local_filename)
            if remote_subdir:
                path_in_repo = f"{remote_subdir}/{path_in_repo}"
            commit_info = apiHF.upload_file(path_or_fileobj=local_filename, path_in_repo=path_in_repo, repo_id=repo.repo_id, repo_type=repo.repo_type)
            return commit_info
        except Exception as exSave:
            return exSave
    @staticmethod
    def load_dataframes_from_hf(repo: HfRepo, lstCsvFiles: list[str] = []) -> dict[str, pd.DataFrame]:
        """ load dataframes from hf; returns {filename: dataframe} for the files that could be downloaded """
        #https://huggingface.co/datasets/f64k/gaziev/blob/main/TestData3_2204_noAB_gaziev.zip
        dict_res = {}
        for fl_name in lstCsvFiles:
            try:
                file_loaded = huggingface_hub.hf_hub_download(filename=fl_name, repo_id=repo.repo_id, repo_type=repo.repo_type, token=repo.get_token())
            except Exception:
                file_loaded = ""
            if os.path.exists(file_loaded):
                compress = "zip" if file_loaded.lower().endswith("zip") else None
                df_loaded = pd.read_csv(file_loaded, sep=";", encoding="utf-8", compression=compress)
                dict_res[fl_name] = df_loaded  # df_Vproc = df_process_v_column(df_loaded)
        return dict_res
    @staticmethod
    def list_files_hf(repo: HfRepo) -> list[str]:
        """ list CSV and ZIP files in the HF repo (including nested paths) """
        ### https://huggingface.co/docs/huggingface_hub/en/guides/hf_file_system
        fs = huggingface_hub.HfFileSystem(token=repo.get_token(), use_listings_cache=False)  # , skip_instance_cache=True
        path_hf = f"{repo.repo_type}s/{repo.repo_id}/"
        #lst = fs.ls(path_hf, detail=False)
        lstGlob = fs.glob(path_hf + "**")  # map(os.path.basename, lstGlob)
        lstNames = [fname.replace(path_hf, "") for fname in lstGlob if fname.lower().endswith(".csv") or fname.lower().endswith(".zip")]
        #print(f"read in list_files_hf() : {lstNames=}")
        return lstNames
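# Hedged usage sketch (not in the original file): wire HfRepo and HF_tools together.
# The repo id, env variable name and function name below are hypothetical placeholders.
def _demo_hf_roundtrip() -> None:
    repo = HfRepo(repo_id="f64k/some-dataset", repo_type="dataset", get_token=lambda: os.getenv("HF_TOKEN", ""))
    print(HF_tools.list_files_hf(repo))                                    # CSV/ZIP files currently in the repo
    df = pandas_random_dataframe(n_cols=3, n_rows=5)
    print(HF_tools.save_dataframe_to_hf(repo, df, "demo.csv", "debug"))    # CommitInfo on success, Exception object on failure
    loaded = HF_tools.load_dataframes_from_hf(repo, ["debug/demo.csv"])    # {filename: DataFrame}
    print({k: v.shape for k, v in loaded.items()})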
########################################################################################################################################
RANDOM_STATE = 11
class XYZV_tools:
    """ XYZV tools - helpers for data in the special XYZV format """
    @staticmethod
    def df_process_v_column(df: pd.DataFrame) -> pd.DataFrame:
        """ derive convenience columns from V and add a time-like column T """
        df = df.reset_index()
        df.rename(columns={"index": "T"}, inplace=True)                                                       # T: row index used as a time axis
        df["Vis"] = df.V.map(lambda v: 0 if str(v) == "nan" else 1).astype(int)                               # 1 where V is present
        df["Vfloat"] = df.V.map(lambda v: 0 if str(v) == "nan" else str(v).replace(',', '.')).astype(float)   # V as float, handling comma decimal separators
        df["Vsign"] = df.Vfloat.map(lambda v: -1 if v < 0 else 1 if v > 0 else 0).astype(int)                 # sign of V
        df["Vposneg"] = df.Vfloat.map(lambda v: "n" if v < 0 else "p" if v > 0 else "o").astype(str)          # "p"/"n"/"o" label for plotting
        return df
    @staticmethod
    def CreateDictClassifiers_BestForXYZ():
        dictFastTree = {
            #"RandomForestClassifier": ensemble.RandomForestClassifier(random_state=RANDOM_STATE),  # very poor scores
            #"ExtraTreeClassifier": tree.ExtraTreeClassifier(random_state=RANDOM_STATE),
            "DecisionTreeClassifier": tree.DecisionTreeClassifier(random_state=RANDOM_STATE),  # best by the latest scores
        }
        #return {**dictFast}
        #return {**dict_Test_MLPClassifier}
        #return {**dictFast, **dictLongTrain}
        return {**dictFastTree}
    # lstRepoZipFiles = ["TrainData_1504_AB_gaziev.zip","TestData_1504_AB_gaziev.zip","TestData3_2204_noAB_gaziev.zip"]
    ### returns (classifier_object, df_train_with_predict, time_elapsed)
    @staticmethod
    def GetClassifier(lstDfOriginal, nHystorySteps):
        #lstDfOriginal = [df_9125_Train, df_12010_Test, df_9051_Test3]
        nShift = nHystorySteps
        nCurrShift = nHystorySteps
        classifierName = "DecisionTreeClassifier"
        colsVectorInp = ["X", "Y", "Z"]
        fieldY = "Vis"
        lstDataFrames = XYZV_tools.MakeHystoryColumns(lstDfOriginal, nShift)
        df_train = pd.concat(lstDataFrames)
        lstColsShift = [f"{c}-{i}" for i in range(1, nCurrShift+1) for c in colsVectorInp]  # empty list when nCurrShift=0
        colsVectorInpAll = colsVectorInp + lstColsShift
        y_train = df_train[fieldY]
        x_train_vect = df_train[colsVectorInpAll]
        dictClassifiers = XYZV_tools.CreateDictClassifiers_BestForXYZ()
        classifierObject = dictClassifiers[classifierName]
        start2 = time.time()
        classifierObject.fit(x_train_vect, y_train)  # training step
        time_elapsed = time.time() - start2
        y_pred = classifierObject.predict(x_train_vect.values)  # .values[:,::-1] would put the XYZ and history fields in reverse order
        df_train[f"predict_{fieldY}"] = y_pred
        return (classifierObject, df_train, time_elapsed)
    #
    @staticmethod
    def MakeHystoryColumns(lstDfOriginal, nShift):
        """ add lag columns C-1 .. C-nShift for each of X, Y, Z to every dataframe """
        lstDataframesShifted = [df.copy() for df in lstDfOriginal]
        lstColsShift = []
        for i in range(1, nShift+1):
            #cols = ["X","Y","Z"]+["A","B"]
            cols = ["X", "Y", "Z"]
            #cols = ["A","B"]
            for c in cols:
                for dfShift in lstDataframesShifted:
                    dfShift[f'{c}-{i}'] = dfShift[c].shift(i).fillna(0)  # value i steps back, 0 where no history exists yet
                lstColsShift.append(lstDataframesShifted[0].columns[-1])
        print(lstColsShift)
        return lstDataframesShifted
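    # Hedged illustration (not in the original): for nShift=2 and a column X = [x0, x1, x2, ...],
    # MakeHystoryColumns adds X-1 = [0, x0, x1, ...] and X-2 = [0, 0, x0, ...], so each row
    # carries its own short history as extra input features.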
    ###
    @staticmethod
    def plotly_xyzv_scatter_gray(df3D):
        """ 3D plot """
        color_discrete_map = dict(o='rgb(230,230,230)', p='rgb(90,1,1)', n='rgb(1,1,90)')
        fig = px.scatter_3d(df3D, x='X', y='Y', z='Z', color="Vposneg", opacity=0.4, height=800, color_discrete_map=color_discrete_map)
        fig.update_scenes(
            xaxis={"gridcolor": "rgba(30, 0, 0, 0.2)", "color": "rgb(100, 0, 0)", "showbackground": False},
            yaxis={"gridcolor": "rgba(0, 30, 0, 0.2)", "color": "rgb(0, 100, 0)", "showbackground": False},
            zaxis={"gridcolor": "rgba(0, 0, 30, 0.2)", "color": "rgb(0, 0, 100)", "showbackground": False})
        fig.update_traces(marker_size=3)
        return fig
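# Hedged end-to-end sketch (not in the original file): push synthetic XYZV data through the whole
# pipeline - derive the V helper columns, fit the decision tree with 2 history steps, plot.
# The function name and the synthetic data are illustrative only.
def _demo_xyzv_pipeline():
    rng = np.random.default_rng(RANDOM_STATE)
    n = 200
    df_raw = pd.DataFrame({
        "X": rng.normal(size=n), "Y": rng.normal(size=n), "Z": rng.normal(size=n),
        "V": [rng.normal() if rng.random() < 0.3 else np.nan for _ in range(n)],   # V present ~30% of the time
    })
    df_xyzv = XYZV_tools.df_process_v_column(df_raw)                               # adds T, Vis, Vfloat, Vsign, Vposneg
    clf, df_train, seconds = XYZV_tools.GetClassifier([df_xyzv], nHystorySteps=2)  # fit DecisionTreeClassifier on X,Y,Z + 2 lag steps
    print(f"train accuracy={(df_train.Vis == df_train.predict_Vis).mean():.3f} in {seconds:.2f}s")
    fig = XYZV_tools.plotly_xyzv_scatter_gray(df_xyzv)                             # 3D scatter colored by Vposneg
    fig.show()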
########################################################################################################################################
#import joblib
#REPO_ID = "YOUR_REPO_ID"
#FILENAME = "sklearn_model.joblib"
#model = joblib.load(hf_hub_download(repo_id=REPO_ID, filename=FILENAME))
if False:
    if False:
        # https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html
        scaler = sklearn.preprocessing.StandardScaler()
        #scaler = sklearn.preprocessing.PowerTransformer()
        #scaler = sklearn.preprocessing.RobustScaler()
        #scaler = sklearn.preprocessing.MinMaxScaler()  # https://scikit-learn.org/1.1/modules/generated/sklearn.preprocessing.MinMaxScaler.html#sklearn.preprocessing.MinMaxScaler
        #scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(-1,1))
        #scaler = sklearn.preprocessing.QuantileTransformer()
        #scaler = sklearn.preprocessing.QuantileTransformer(output_distribution="normal")
        #scaler = sklearn.preprocessing.Normalizer()  # projects everything onto a sphere - scales every vector to unit length
        scale_columns = ["X", "Y", "Z"]
        scaledData = scaler.fit_transform(df3D[scale_columns])
        if False:
            scaler2 = sklearn.preprocessing.Normalizer()
            scaledData = scaler2.fit_transform(scaledData)
        df3D_Scaled = pd.DataFrame(data=scaledData, columns=scale_columns)
        df3D_Scaled["Vposneg"] = df3D["Vposneg"]
        df3D = df3D_Scaled