Machine Learning
train test split 하는 커스텀 python 스크립트를 만들어봤습니다
jinmc
2021. 12. 23. 18:29
반응형
한번 더 업그레이드 한 버전을 만들었습니다.
수정된 버전은 좀 더 interactive 하게 만들었고,
train/test/val 분할뿐만 아니라 train/test 분할도 지원하며, 분할 비율(ratio)도 실행 중에 터미널에서 직접 입력할 수 있도록 만들었습니다.
import os
import random
from shutil import copyfile
# You should make train, test, val directory in target directory before executing this script
# train test or train test val
# also, images, labels directory should be made for all directories
def _split_by_ratio(names, ratios):
    """Cut *names* into consecutive chunks whose sizes follow *ratios*.

    Boundaries are computed as ceil(len(names) * cumulative_ratio / total)
    using integer arithmetic only, so an exact boundary (e.g. 4 items at
    3:1) is never shifted by one item or by floating-point rounding.
    """
    total = sum(ratios)
    count = len(names)
    chunks = []
    start = 0
    cumulative = 0
    for ratio in ratios:
        cumulative += ratio
        # -(-a // b) is ceil(a / b) for non-negative a and positive b.
        end = -(-count * cumulative // total)
        chunks.append(names[start:end])
        start = end
    return chunks


def _copy_or_report(src, dst, failure_message):
    """Copy *src* to *dst*; on an OS-level failure print why and carry on."""
    try:
        copyfile(src, dst)
    except OSError as err:
        # Best effort: one missing image/label must not abort the whole split.
        print(f"{failure_message}: {err}")


def main():
    """Interactively split a YOLO-style dataset into train/test(/val) folders.

    Reads three lines from standard input: the source directory (must
    contain ``images/`` and ``labels/``), the target directory, and two or
    three space-separated integer ratios. Two ratios mean train/test; three
    mean train/test/val, in that order. Every ``.jpg`` in ``source/images``
    is shuffled and copied, together with its same-named ``.txt`` label, to
    ``target/<split>/images`` and ``target/<split>/labels``. Missing split
    directories are created. Invalid ratios print an error and return
    without copying anything.
    """
    ldir = [entry for entry in os.listdir() if os.path.isdir(entry)]
    print("select directory to split ", ldir)
    source = input()
    print("select target", ldir)
    target = input()
    # The second ratio goes to "test" and the third to "val", so the prompt
    # lists them in that order.
    print("select ratio by train and test or train, test, val split by spacebar. ex) 3 1, or 3 1 1, 20 3 3")
    try:
        ratios = [int(r) for r in input().split()]
    except ValueError:
        print("Error! ratios must be integers")
        return

    if len(ratios) == 2:
        split_names = ("train", "test")
    elif len(ratios) == 3:
        split_names = ("train", "test", "val")
    else:
        print("Error! only input 2 or 3 ratios")
        return
    if min(ratios) < 0 or sum(ratios) == 0:
        print("Error! ratios must be non-negative with a positive sum")
        return

    # Keep the real file names and only accept JPEG images; the label name is
    # derived from the stem so a ".jpg" inside the name is never mangled.
    image_names = [
        name
        for name in os.listdir(os.path.join(source, "images"))
        if name.lower().endswith(".jpg")
    ]
    random.shuffle(image_names)  # shuffle file names

    chunks = _split_by_ratio(image_names, ratios)
    if len(ratios) == 2:
        print("train test split!")
    else:
        print("traint test val split!")
        test_idx = len(chunks[0])
        val_idx = test_idx + len(chunks[1])
        print(f"test idx : {test_idx} val idx : {val_idx}")

    for split, chunk in zip(split_names, chunks):
        image_dir = os.path.join(target, split, "images")
        label_dir = os.path.join(target, split, "labels")
        os.makedirs(image_dir, exist_ok=True)
        os.makedirs(label_dir, exist_ok=True)
        for img_name in chunk:
            lbl_name = os.path.splitext(img_name)[0] + ".txt"
            _copy_or_report(
                os.path.join(source, "images", img_name),
                os.path.join(image_dir, img_name),
                "copy image fail",
            )
            _copy_or_report(
                os.path.join(source, "labels", lbl_name),
                os.path.join(label_dir, lbl_name),
                "copy label fail",
            )


if __name__ == "__main__":
    main()
아래는 이전에 만든 버전의 스크립트입니다.
딥러닝을 하다 보면 데이터를 학습(Train) Set뿐만 아니라
Validation Set, Test Set으로도 나눠야 하는 경우가 생깁니다.
scikit-learn에 train_test_split이 있긴 하지만, 제가 원하는 방식의 split이 아니기 때문에,
새로 python script를 만들기로 했습니다.
일단 조건은, YOLO 형식의 이미지 파일과 레이블 파일이 한 폴더 안에 같이 들어 있는 경우를
기준으로 하도록 하겠습니다.
import os
import random
from shutil import copyfile
# Split a flat YOLO-style folder (images and labels side by side) into
# "<dir_name>_train_test/train" and "<dir_name>_train_test/val", each with its
# own "images" and "labels" sub-folder.
dir_name = "target_dir"
test_ratio = 0.2  # fraction of the images that goes to the "val" split

lstdir = os.listdir(f"./{dir_name}/")
# Only real ".jpg" files count as images. A substring test would also accept
# names such as "a.jpg.bak" and then strip ".jpg" from the middle of them.
lst = [os.path.splitext(entry)[0] for entry in lstdir if entry.endswith(".jpg")]
random.shuffle(lst)
print(lst)

# Items with index below this go to train; the rest go to val.
train_last_idx = int(len(lst) * (1 - test_ratio))
print(train_last_idx)

new_dir_name = f"./{dir_name}_train_test"
# Deliberately fails if the output folder already exists: copying a fresh
# shuffle on top of an old one would mix the two splits.
os.mkdir(new_dir_name)
train_dir_name = new_dir_name + "/train"
val_dir_name = new_dir_name + "/val"
train_image_dir_name = train_dir_name + "/images"
train_label_dir_name = train_dir_name + "/labels"
val_image_dir_name = val_dir_name + "/images"
val_label_dir_name = val_dir_name + "/labels"
for leaf_dir in (
    train_image_dir_name,
    train_label_dir_name,
    val_image_dir_name,
    val_label_dir_name,
):
    os.makedirs(leaf_dir)

for i, stem in enumerate(lst):
    img_name = stem + ".jpg"
    lbl_name = stem + ".txt"
    if i < train_last_idx:
        split = "train"
        image_dir, label_dir = train_image_dir_name, train_label_dir_name
    else:
        split = "val"
        image_dir, label_dir = val_image_dir_name, val_label_dir_name
    # Best effort per file: report the failure (e.g. an image without a label)
    # and continue with the next one.
    try:
        copyfile(f"./{dir_name}/{img_name}", f"{image_dir}/{img_name}")
    except OSError as err:
        print(f"copy image fail for {split}: {err}")
    try:
        copyfile(f"./{dir_name}/{lbl_name}", f"{label_dir}/{lbl_name}")
    except OSError as err:
        print(f"copy label fail for {split}: {err}")
print("complete!")
반응형