You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
44 KiB
44 KiB
패키지 로드¶
In [1]:
# Print the interpreter version and the versions of the listed packages
# so the environment this notebook ran in is recorded next to the results.
%reload_ext watermark
%watermark -v -p numpy,matplotlib,pandas,sklearn,tqdm,tensorflow,rpy2,watermark,feature_engine
CPython 3.6.9 IPython 7.16.2 numpy 1.19.5 matplotlib 3.3.4 pandas 1.1.5 sklearn 0.24.2 tqdm 4.62.3 tensorflow 2.6.2 rpy2 3.4.5 watermark 2.0.2 feature_engine 1.2.0
In [2]:
# rpy2 bridge to R: activate the pandas converter and import R's `utils` package.
from rpy2.robjects import pandas2ri
from rpy2.robjects import r
from rpy2.robjects.packages import importr

pandas2ri.activate()
utils = importr('utils')
# NOTE(review): ('ranger') is a plain str, not a 1-tuple (there is no trailing comma);
# confirm which form the cells that consume `package_names` expect.
package_names = ('ranger')
# Select the CRAN mirror with index 1 for any later package installation.
utils.chooseCRANmirror(ind=1)
#utils.install_packages("ranger") # install the ranger package
Out[2]:
<rpy2.rinterface_lib.sexp.NULLType object at 0x7f27c9638308> [RTYPES.NILSXP]
In [3]:
import numpy as np
import pandas as pd
import os, re, cv2
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from feature_engine import transformation as vt
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Expose only CUDA device 0, with devices enumerated by PCI bus id.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# Root of the competition data: train/CASExx/..., test/image and sample_submission.csv
# are all read relative to this directory by the cells below.
main_dir="/root/data/dacon/open"
함수 생성¶
In [4]:
# Contrast enhancement
def img_Contrast(img, clipLimit=3.0, tileGridSize=(8, 8)):
    """Return `img` with CLAHE applied to its lightness channel.

    The image is converted BGR -> LAB, CLAHE (with the given `clipLimit` and
    `tileGridSize`) is applied to the L channel only, and the result is
    converted back LAB -> BGR.
    """
    lab = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
    l, a, b = cv2.split(lab)
    clahe = cv2.createCLAHE(clipLimit=clipLimit, tileGridSize=tileGridSize)
    cl = clahe.apply(l)
    limg = cv2.merge((cl, a, b))
    return cv2.cvtColor(limg, cv2.COLOR_LAB2BGR)


# Extract a colour range
def img_extract(return_img, img, lower=(0, 0, 0), upper=(110, 255, 200)):
    """Mask `return_img` with the pixels of `img` whose HSV value is in range.

    `img` is converted BGR -> HSV and thresholded with `cv2.inRange(lower, upper)`;
    the resulting mask is applied to `return_img`, which is returned with every
    pixel outside the mask set to zero.
    """
    img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    img_mask = cv2.inRange(img_hsv, lower, upper)
    return cv2.bitwise_and(return_img, return_img, mask=img_mask)


# File listing
def file_list(directory):
    """Return the sorted paths of the png/jpg files found in `directory`.

    `directory` is either one directory path (str) or a list/tuple of
    directory paths. Any other type yields an empty list. Paths are built as
    "<directory>/<file name>"; sub-directories are not searched.
    """
    pattern = re.compile('png$|jpg$')

    def find_files(folder):
        return [f"{folder}/{name}" for name in os.listdir(folder) if pattern.search(name)]

    out = []
    if isinstance(directory, str):
        out = find_files(directory)
    elif isinstance(directory, (list, tuple)):
        for folder in directory:
            out.extend(find_files(folder))
    return sorted(out)


# Per-channel image statistics
def rgb_stat(img):
    """Return per-channel mean and standard deviation of an H x W x 3 array.

    The six values are (mean0, mean1, mean2, std0, std1, std2) in the order of
    the array's last axis.
    NOTE(review): the callers pass images loaded with cv2.imread; if those are
    BGR, the value named `r_m` is the blue channel -- confirm before
    interpreting the columns by name.
    """
    r_m, g_m, b_m = np.mean(img, axis=(0, 1))
    r_sd, g_sd, b_sd = np.std(img, axis=(0, 1))
    return r_m, g_m, b_m, r_sd, g_sd, b_sd


# Weight proxy
def img_to_weight(img, n=15000):
    """Return the number of non-zero pixels of the grayscale image divided by `n`."""
    return (cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) != 0).sum() / n


# Image feature table
def img_feature(dirs):
    """Build one row of colour-mask features per image path in `dirs`.

    Returns a DataFrame with the column `img_dirs` followed by
    del_leaf, pred_leaf1, pred_leaf2, pred_leaf_mean and the six `rgb_stat`
    values (computed on the last extracted mask).

    Raises:
        IOError: if an image cannot be read.
    """
    feature_cols = ['del_leaf', 'pred_leaf1', 'pred_leaf2', 'pred_leaf_mean',
                    "r_m", "g_m", "b_m", "r_sd", "g_sd", "b_sd"]
    rows = []
    for path in tqdm(dirs):
        img = cv2.imread(path)
        if img is None:
            raise IOError(f"could not read image: {path}")

        contrast_img = img_Contrast(img, 3, tileGridSize=(4, 3))
        # Withered-leaf mask
        extract_img = img_extract(img, contrast_img, lower=(10, 0, 0), upper=(30, 255, 255))
        del_leaf = img_to_weight(extract_img, 16900)

        # Bok choy mask (pixels are taken from the original image)
        extract_img = img_extract(img, contrast_img, upper=(90, 255, 130))
        pred_leaf1 = img_to_weight(extract_img, 16900)

        contrast_img = img_Contrast(img, 3, tileGridSize=(5, 5))
        extract_img = img_extract(img, contrast_img, upper=(77, 255, 130))
        pred_leaf2 = img_to_weight(extract_img, 16900)

        pred_leaf_mean = (pred_leaf1 + pred_leaf2) / 2

        # Channel statistics of the last mask
        rows.append((del_leaf, pred_leaf1, pred_leaf2, pred_leaf_mean) + tuple(rgb_stat(extract_img)))

    # Build the frame once instead of growing it cell by cell inside the loop.
    features = pd.DataFrame(rows, columns=feature_cols)
    df = pd.concat([pd.DataFrame({'img_dirs': dirs}), features], axis=1)
    return df
자료생성¶
In [5]:
# Image paths: every train case folder (CASE01..CASE75) and the test folder.
tr_directory = [f"{main_dir}/train/CASE{i:02d}/image" for i in range(1, 76)]
tr_img_dirs = file_list(tr_directory)
# This single training image is excluded from the feature table.
tr_img_dirs.remove(f'{main_dir}/train/CASE45/image/CASE45_17.png')
te_img_dirs = file_list(f"{main_dir}/test/image")

# Labels: one label.csv per case, plus a `date` taken from each image's meta CSV.
label_dfs = list()
for file in tqdm([f"{main_dir}/train/CASE{i:02d}/label.csv" for i in range(1, 76)]):
    temp_df = pd.read_csv(file)
    dates = []
    for img_file in temp_df.img_name:
        time_df = pd.read_csv(
            f"{main_dir}/train/CASE{img_file[4:6]}/meta/{img_file.replace('jpg','png').replace('png','csv')}")
        time_df = time_df.sort_values('시간')
        # After sorting, take the first row by position. `.loc[0, ...]` would
        # look up index label 0 (the first row of the file) and ignore the sort.
        time = time_df['시간'].iloc[0]
        dates.append(pd.to_datetime(time).date())
    temp_df['date'] = dates
    label_dfs.append(temp_df)

label_df = pd.concat(label_dfs)
label_df['case'] = [i[:6] for i in label_df['img_name']]

# now_weight: the leaf_weight recorded for the same case one day earlier.
# The copy's dates are shifted forward by one day, then joined on (case, date).
merge_df = label_df.rename(columns={'leaf_weight': 'now_weight'})
merge_df = merge_df.drop("img_name", axis=1)
merge_df['date'] = merge_df.date + pd.to_timedelta(1, unit='day')
label_df = pd.merge(label_df, merge_df, how='left', on=['case', 'date'])
del merge_df
0%| | 0/75 [00:00<?, ?it/s]
In [6]:
# Image features are expensive to compute, so they are cached as CSV.
# Both cache files must exist before either one is trusted.
tr_cache = '/root/jupyter/데이콘/청경채/input/tr_df.csv'
te_cache = '/root/jupyter/데이콘/청경채/input/te_df.csv'
if os.path.exists(tr_cache) and os.path.exists(te_cache):
    tr_df = pd.read_csv(tr_cache)
    te_df = pd.read_csv(te_cache)
else:
    tr_df = img_feature(tr_img_dirs)
    te_df = img_feature(te_img_dirs)
    tr_df.to_csv(tr_cache, index=False)
    te_df.to_csv(te_cache, index=False)

# Labels: attach leaf_weight / date / now_weight to each training image by file name.
label_cols = ['leaf_weight', 'date', 'now_weight']
# Keep the first label row per image name, as the original per-row lookup did.
label_lookup = label_df.drop_duplicates('img_name').set_index('img_name')
img_names = tr_df['img_dirs'].str.split('/').str[-1]
unlabeled = img_names[~img_names.isin(label_lookup.index)]
if len(unlabeled):
    # Fail loudly with the offending names instead of an opaque IndexError.
    raise LookupError(f"no label row for images: {unlabeled.tolist()[:10]}")
for col in label_cols:
    tr_df[col] = label_lookup[col].reindex(img_names).to_numpy()
0%| | 0/1591 [00:00<?, ?it/s]
삭제할 자료¶
CASE 2_10, 2_11, 34_01, 40_01, 40_02, 44_01, 52_01, 56_01, 60_20~34, 63_01, 64_01 : 환경자료 결측
CASE 8, 9, 22, 23, 26, 30, 31, 49, 59, 71, 72, 73 : 환경자료 결측
CASE 35_01, 41_01, 44_02, 45_01, 52_02, 53_01, 56_02, 57_01, 63_02 : 부분결측(제거)
CASE 34, 35, 48 : EC 결측
CASE 32_15, 51_11 : CO2 이상
In [7]:
# Images / cases to flag, following the "data to delete" list in the markdown above.
# Single images with missing environment data.
env_na_p = [f"CASE{i}" for i in
            [f"60_{i}" for i in range(20, 34 + 1)]
            + ['02_10', '02_11', '34_01', '40_01', '40_02', '44_01', '52_01', '56_01', '63_01', '64_01']]
# Whole cases with missing environment data.
env_na_a = [f"CASE{i}" for i in ['08', '09', '22', '23', '26', '30', '31', '49', '59', '71', '72', '73']]
# Partially missing images. 56_02 is in the markdown list and was missing from the code.
partial_na = [f"CASE{i}" for i in
              ["35_01", "41_01", "44_02", "45_01", "52_02", "53_01", "56_02", "57_01", "63_02"]]
# Whole cases with missing EC.
ec_na = [f"CASE{i}" for i in ["34", "35", "48"]]
# NOTE(review): the CO2-anomaly images listed in the markdown (CASE32_15, CASE51_11)
# are not flagged here -- confirm whether that is intentional.

tr_df['na_label'] = False
for i in (env_na_p + env_na_a + partial_na + ec_na):
    # Plain substring match on the path; the tokens are literals, not regular expressions.
    tr_df.loc[tr_df['img_dirs'].str.contains(i, regex=False), "na_label"] = True
In [8]:
# Per-image environment features for the training set, read from each image's meta CSV.
for i, filename in tqdm(enumerate(tr_df.img_dirs), total=len(tr_df)):
    temp_df = pd.read_csv(filename.replace('image','meta').replace('jpg','png').replace('png','csv'))
    temp_df['시간'] = pd.to_datetime(temp_df['시간'])
    # Sort this file's own rows. The original sorted `time_df`, a stale frame
    # left over from an earlier cell.
    temp_df = temp_df.sort_values('시간')

    # Fill only the missing blue-light values. Assigning the NaN-filtered series
    # back to the column would overwrite the existing values with NaN.
    # NOTE(review): the estimate is total - white + red; confirm the sign of the red term.
    blue_estimate = temp_df['총추정광량'] - temp_df['백색광추정광량'] + temp_df['적색광추정광량']
    temp_df['청색광추정광량'] = temp_df['청색광추정광량'].fillna(blue_estimate)

    hour = temp_df['시간'].dt.hour
    aftn_co2 = temp_df.loc[hour.isin(list(range(9,19))), 'CO2관측치'].quantile(.5)
    night_co2 = temp_df.loc[hour.isin(list(range(19,23)) + list(range(0,5))), 'CO2관측치'].quantile(.5)
    co2_ratio = aftn_co2 / night_co2  # below 1: growth stage, above 1: germination stage
    zero_ec_cnt = sum(temp_df['EC관측치'] == 0)

    aftn_ec = temp_df.loc[hour.isin(list(range(10,15))), 'EC관측치'].quantile(.5)
    night_ec1 = temp_df.loc[hour.isin(list(range(22,23))), 'EC관측치'].quantile(.5)
    night_ec2 = temp_df.loc[hour.isin(list(range(3,5))), 'EC관측치'].quantile(.5)

    m_temp = temp_df.loc[hour.isin(list(range(10,18))), '내부온도관측치'].mean(skipna=True)
    m_humidity = temp_df.loc[hour.isin(list(range(10,15))), '내부습도관측치'].mean(skipna=True)

    # Handle missing values
    if np.isnan(aftn_ec):
        # NOTE(review): night_ec1 may itself be NaN here (it is zero-filled only below),
        # in which case aftn_ec stays NaN -- confirm this ordering is intended.
        if tr_df.loc[i, 'del_leaf'] > 10:
            aftn_ec = night_ec1 * 1.5
        else:
            aftn_ec = night_ec2
    if np.isnan(m_temp):
        m_temp = temp_df['내부온도관측치'].mean(skipna=True)
    if np.isnan(m_humidity):
        m_humidity = temp_df['내부습도관측치'].mean(skipna=True)
    if np.isnan(night_ec1):
        night_ec1 = 0
    if np.isnan(co2_ratio):
        if tr_df.loc[i, 'pred_leaf_mean'] > 50:
            co2_ratio = 0.5
        else:
            co2_ratio = 1.5
    # Computed once, after co2_ratio has its final value.
    disease_signal = co2_ratio * zero_ec_cnt

    tr_df.loc[i, ['co2_ratio','zero_ec_cnt','disease_signal',
                  'aftn_ec','night_ec1','night_ec2','m_temp','m_humidity']] = \
        co2_ratio, zero_ec_cnt, disease_signal, aftn_ec, night_ec1, night_ec2, m_temp, m_humidity
0it [00:00, ?it/s]
In [9]:
# Per-image environment features for the test set, read from each image's meta CSV.
for i, filename in tqdm(enumerate(te_df.img_dirs), total=len(te_df)):
    temp_df = pd.read_csv(filename.replace('image','meta').replace('jpg','png').replace('png','csv'))
    temp_df['시간'] = pd.to_datetime(temp_df['시간'])
    # Sort this file's own rows. The original sorted `time_df`, a stale frame
    # left over from an earlier cell.
    temp_df = temp_df.sort_values('시간')

    # Fill only the missing blue-light values. Assigning the NaN-filtered series
    # back to the column would overwrite the existing values with NaN.
    # NOTE(review): the estimate is total - white + red; confirm the sign of the red term.
    blue_estimate = temp_df['총추정광량'] - temp_df['백색광추정광량'] + temp_df['적색광추정광량']
    temp_df['청색광추정광량'] = temp_df['청색광추정광량'].fillna(blue_estimate)

    hour = temp_df['시간'].dt.hour
    aftn_co2 = temp_df.loc[hour.isin(list(range(9,19))), 'CO2관측치'].quantile(.5)
    night_co2 = temp_df.loc[hour.isin(list(range(19,23)) + list(range(0,5))), 'CO2관측치'].quantile(.5)
    co2_ratio = aftn_co2 / night_co2  # below 1: growth stage, above 1: germination stage
    zero_ec_cnt = sum(temp_df['EC관측치'] == 0)

    aftn_ec = temp_df.loc[hour.isin(list(range(10,15))), 'EC관측치'].quantile(.5)
    night_ec1 = temp_df.loc[hour.isin(list(range(22,23))), 'EC관측치'].quantile(.5)
    night_ec2 = temp_df.loc[hour.isin(list(range(3,5))), 'EC관측치'].quantile(.5)

    m_temp = temp_df.loc[hour.isin(list(range(10,18))), '내부온도관측치'].mean(skipna=True)
    m_humidity = temp_df.loc[hour.isin(list(range(10,15))), '내부습도관측치'].mean(skipna=True)

    # Handle missing values
    if np.isnan(aftn_ec):
        # NOTE(review): night_ec1 may itself be NaN here (it is zero-filled only below),
        # in which case aftn_ec stays NaN -- confirm this ordering is intended.
        if te_df.loc[i, 'del_leaf'] > 10:
            aftn_ec = night_ec1 * 1.5
        else:
            aftn_ec = night_ec2
    if np.isnan(m_temp):
        m_temp = temp_df['내부온도관측치'].mean(skipna=True)
    if np.isnan(m_humidity):
        m_humidity = temp_df['내부습도관측치'].mean(skipna=True)
    if np.isnan(night_ec1):
        night_ec1 = 0
    if np.isnan(co2_ratio):
        if te_df.loc[i, 'pred_leaf_mean'] > 50:
            co2_ratio = 0.5
        else:
            co2_ratio = 1.5
    disease_signal = co2_ratio * zero_ec_cnt

    te_df.loc[i, ['co2_ratio','zero_ec_cnt','disease_signal',
                  'aftn_ec','night_ec1','night_ec2','m_temp','m_humidity']] = \
        co2_ratio, zero_ec_cnt, disease_signal, aftn_ec, night_ec1, night_ec2, m_temp, m_humidity
0it [00:00, ?it/s]
청경채 무게 추정¶
In [10]:
# Keep only rows that have both an image path and a label, then hold out
# 33% of them for validation with a fixed seed.
df = (
    tr_df.loc[:, ['img_dirs', 'leaf_weight']]
    .dropna()
    .reset_index()
)
train, valid = train_test_split(df, random_state=42, test_size=0.33)
In [11]:
# Count missing values per column in the modelling frame.
df[['img_dirs','leaf_weight']].isna().sum()
Out[11]:
img_dirs 0 leaf_weight 0 dtype: int64
CNN만으로 예측¶
In [12]:
def _load_model_image(path):
    """Read the image at `path` and return the masked image fed to the CNN.

    The image is resized, contrast-enhanced, and two colour masks
    (withered leaf and bok choy) are extracted and summed with
    `cv2.addWeighted(..., 1, ..., 1, 1)`.
    """
    img = cv2.imread(path)
    # NOTE(review): cv2.resize takes dsize as (width, height); shape[0]/4 (rows) is
    # passed as the width here, so the two axes are swapped as well as scaled.
    # Kept unchanged -- confirm against the model's input shape before touching it.
    img = cv2.resize(img, dsize=(int(img.shape[0]/4), int(img.shape[1]/4)),
                     interpolation=cv2.INTER_CUBIC)
    contrast_img = img_Contrast(img, 3, tileGridSize=(4,3))
    extract_img1 = img_extract(img, contrast_img, lower=(10,0,0), upper=(30,255,255))
    extract_img2 = img_extract(img, contrast_img, upper=(90,255,130))  # pixels taken from the original
    return cv2.addWeighted(extract_img1, 1, extract_img2, 1, 1)


def _image_label_pairs(frame, labeled=True):
    """Yield (image, label) for every row of `frame`, in row order.

    With `labeled=True`, rows missing `img_dirs` or `leaf_weight` are dropped and
    the label is `leaf_weight`; otherwise every row is kept and the label is NaN.
    """
    if labeled:
        rows = frame[['img_dirs','leaf_weight']].dropna()
        labels = rows.leaf_weight
    else:
        rows = frame[['img_dirs']]
        labels = [np.nan] * len(rows)
    for path, label in zip(rows.img_dirs, labels):
        yield (_load_model_image(path), label)


def tr_gen():
    """Yield (image, leaf_weight) for the training split (`train`)."""
    yield from _image_label_pairs(train)


def va_gen():
    """Yield (image, leaf_weight) for the validation split (`valid`)."""
    yield from _image_label_pairs(valid)


def check_gen():
    """Yield (image, leaf_weight) for every labeled row of `tr_df`."""
    yield from _image_label_pairs(tr_df)


def te_gen():
    """Yield (image, NaN) for every row of `te_df`."""
    yield from _image_label_pairs(te_df, labeled=False)


def NMAE(true, pred):
    """Return mean(|true - pred|) / mean(|true|).

    Both inputs are flattened first, so a (n,) target and a (n, 1) prediction
    are compared element by element instead of being broadcast to (n, n).
    """
    true = np.asarray(true).ravel()
    pred = np.asarray(pred).ravel()
    mae = np.mean(np.abs(true - pred))
    score = mae / np.mean(np.abs(true))
    return score


def nmae_keras(y_true, y_pred):
    """Wrap `NMAE` with `tf.py_function` and return its float32 result."""
    score = tf.py_function(func=NMAE, inp=[y_true, y_pred], Tout=tf.float32, name='name')
    return score
In [13]:
def _batched_dataset(generator):
    """Wrap a (image, label) generator as a cached, batched, prefetched float32 dataset."""
    dataset = tf.data.Dataset.from_generator(generator, (tf.float32, tf.float32))
    return dataset.cache().batch(24).prefetch(buffer_size=10)


# One pipeline per generator; all four share the same batch and prefetch settings.
tr_data = _batched_dataset(tr_gen)
va_data = _batched_dataset(va_gen)
te_data = _batched_dataset(te_gen)
ch_data = _batched_dataset(check_gen)
In [14]:
# Show the label tensor (element 1 of the (image, label) pair) of the first training batch.
next(iter(tr_data))[1]
Out[14]:
<tf.Tensor: shape=(24,), dtype=float32, numpy= array([6.32770e+01, 6.21000e-01, 1.57960e+01, 6.40700e+00, 3.56220e+01, 1.78650e+01, 2.03625e+02, 1.12030e+01, 2.34540e+01, 2.91000e-01, 1.36669e+02, 5.90100e+01, 3.69356e+02, 1.98366e+02, 1.60211e+02, 1.30000e-01, 5.25400e+00, 1.27300e+00, 2.86059e+02, 1.12600e+01, 5.22000e-01, 2.86100e+00, 2.20446e+02, 8.98200e+00], dtype=float32)>
In [15]:
# Train the CNN once. If the final model file already exists it is loaded instead.
if not('forecast_weight_best_model_v1.h5' in os.listdir("/root/jupyter/데이콘/청경채/output/")):
    tf.random.set_seed(42)

    # Four (1x1 convolution -> average pooling) stages followed by a small dense head.
    inp = tf.keras.Input(shape=(820, 616, 3), dtype=tf.float32)
    x = inp
    for filters in (16, 64, 32, 8):
        x = tf.keras.layers.Conv2D(filters, kernel_size=1, activation='LeakyReLU')(x)
        x = tf.keras.layers.AveragePooling2D()(x)
    flat = tf.keras.layers.Flatten()(x)
    dense_1 = tf.keras.layers.Dense(64, activation='ReLU')(flat)
    dense_2 = tf.keras.layers.Dense(32, activation='LeakyReLU')(dense_1)
    out = tf.keras.layers.Dense(1, activation='LeakyReLU')(dense_2)
    model = tf.keras.Model(inp, out)

    # Stage 1: train every layer.
    early = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', mode="min", patience=10)
    lr_reduce = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', patience=3, verbose=1, min_delta=0.001)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
                  loss='mse', metrics=['mae'])
    model.fit(tr_data, verbose=1, callbacks=[early, lr_reduce],
              epochs=50, validation_data=va_data)

    # Stage 2 (fine tuning): freeze everything except the last four layers.
    for layer in model.layers[0:-4]:
        layer.trainable = False
    # A change to `trainable` only takes effect after compile() is called again.
    # Recompile with a fresh Adam that starts from the learning rate stage 1 ended on.
    current_lr = float(tf.keras.backend.get_value(model.optimizer.learning_rate))
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=current_lr),
                  loss='mse', metrics=['mae'])

    cp_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath='/root/jupyter/데이콘/청경채/model/now_weight_{val_loss:.2f}.h5',
        monitor='val_loss', mode='min', verbose=1)
    early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode="min", patience=50)
    lr_reduce = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=10, verbose=1)
    tf.random.set_seed(42)
    model.fit(tr_data, verbose=1, callbacks=[early, lr_reduce, cp_callback],
              epochs=300, validation_data=va_data)

    # Pick the checkpoint with the numerically smallest val_loss in its file name.
    # A plain min() over the strings is lexicographic ("1000.00" < "99.50").
    # Files that do not match the checkpoint pattern are ignored.
    # NOTE(review): checkpoints left by earlier runs in this folder also take part
    # in the selection -- clear the folder first if that is not wanted.
    ckpt_pattern = re.compile(r'^now_weight_(\d+(?:\.\d+)?)\.h5$')
    ckpt_losses = [m.group(1)
                   for m in map(ckpt_pattern.match, os.listdir('/root/jupyter/데이콘/청경채/model/'))
                   if m]
    file_loss = min(ckpt_losses, key=float)

    model = tf.keras.models.load_model(f'/root/jupyter/데이콘/청경채/model/now_weight_{file_loss}.h5')
    tf.keras.models.save_model(model, '/root/jupyter/데이콘/청경채/output/forecast_weight_best_model_v1.h5')
else:
    model = tf.keras.models.load_model('/root/jupyter/데이콘/청경채/output/forecast_weight_best_model_v1.h5')
In [16]:
# check_gen yields only rows with both an image path and a leaf_weight, so the
# predictions are written back to exactly those rows. Unlabeled rows stay NaN
# instead of causing a length mismatch.
labeled_rows = tr_df[['img_dirs', 'leaf_weight']].notna().all(axis=1)
# model.predict returns shape (n, 1); flatten it to one value per row.
tr_df.loc[labeled_rows, 'cnn_now_weight'] = model.predict(ch_data).ravel()
te_df['cnn_now_weight'] = model.predict(te_data).ravel()
In [17]:
# Predicted weight against the label for every training image.
fig, ax = plt.subplots()
ax.scatter(tr_df.cnn_now_weight, tr_df.leaf_weight)
ax.set(xlabel='cnn_now_weight', ylabel='leaf_weight',
       title='CNN prediction vs. leaf_weight label (training images)')
plt.show()
Out[17]:
<matplotlib.collections.PathCollection at 0x7f26cc768cf8>
In [18]:
# Fill the sample submission with the CNN predictions for the test images and save it.
# NOTE(review): the predictions are aligned to the submission rows by index --
# confirm sample_submission.csv lists the images in the same order as te_df.
submission_template = pd.read_csv(f"{main_dir}/sample_submission.csv")
submit = submission_template.assign(leaf_weight=te_df['cnn_now_weight'])
submit.to_csv("/root/jupyter/데이콘/청경채/output/submit_5.csv", index=False)