Machine Learning

train test split 하는 커스텀 python 스크립트를 만들어봤습니다

jinmc 2021. 12. 23. 18:29
반응형

한번 더 업그레이드 한 버전을 만들었습니다.

수정된 버전은 좀 더 interactive 하게 만들었고, 

train/test/val 이외에 train/test 로도 나눌 수 있고, ratio도 shell에서 직접 입력(input)할 수 있도록 만들었습니다.

 

import os
import random
from shutil import copyfile

# Interactive train/test(/val) splitter for a YOLO-style dataset.
# Expected source layout: <source>/images/*.jpg and <source>/labels/*.txt
# <target>/<split>/images and <target>/<split>/labels are created on demand,
# so they do not have to exist before executing this script.

# Split directory names, keyed by how many ratios the user typed.
_SPLIT_NAMES = {2: ("train", "test"), 3: ("train", "test", "val")}


def _assign_splits(count, ratios):
    """Return, for every index in range(count), the position of its split.

    Index ``i`` moves on to the next split as soon as ``i`` reaches the
    cumulative share of the preceding ratios. The comparison is done on
    integers (``i * total >= count * cumulative``) so an index that lands
    exactly on a boundary is not lost to float rounding or to a strict ``>``.
    """
    total = sum(ratios)
    cutoffs = []
    running = 0
    for share in ratios[:-1]:
        running += share
        cutoffs.append(count * running)
    return [sum(1 for cutoff in cutoffs if i * total >= cutoff) for i in range(count)]


def _copy_or_report(src, dst, failure_message):
    """Copy ``src`` to ``dst``; on an OS error print a message and return False."""
    try:
        copyfile(src, dst)
    except OSError as err:
        print(f"{failure_message}: {src} ({err})")
        return False
    return True


def main():
    """Interactively split ``<source>/images`` + ``<source>/labels`` into splits.

    Reads three lines from standard input: the source directory, the target
    directory, and 2 or 3 space-separated integer ratios. Two ratios mean
    train/test; three ratios mean train/test/val (in that order). The ``.jpg``
    files found in ``<source>/images`` are shuffled and copied, together with
    the ``.txt`` file of the same stem from ``<source>/labels``, into
    ``<target>/<split>/images`` and ``<target>/<split>/labels``.

    Invalid ratio input prints an error and returns without copying anything.
    A failed image copy is reported per file; label files that could not be
    copied are only counted and summarised once at the end.
    """
    ldir = [entry for entry in os.listdir() if os.path.isdir(entry)]
    print("select directory to split ", ldir)
    source = input()

    print("select target", ldir)
    target = input()

    # The prompt lists the splits in the order the ratios are actually applied
    # (2nd ratio -> test, 3rd ratio -> val).
    print("select ratio by train and test or train, test, val split by spacebar. ex) 3 1, or 3 1 1, 20 3 3")
    try:
        ratios = [int(token) for token in input().split()]
    except ValueError:
        print("Error! ratios must be integers")
        return

    # Only real .jpg files are considered; the stem is shared with the label.
    stems = [
        os.path.splitext(name)[0]
        for name in os.listdir(f"{source}/images")
        if name.endswith(".jpg")
    ]
    random.shuffle(stems)  # shuffle file names

    split_names = _SPLIT_NAMES.get(len(ratios))
    if split_names is None:
        print("Error! only input 2 or 3 ratios")
        return
    if min(ratios) < 0 or sum(ratios) == 0:
        print("Error! ratios must be non-negative and not all zero")
        return

    print(" ".join(split_names) + " split!")
    for split in split_names:
        os.makedirs(f"{target}/{split}/images", exist_ok=True)
        os.makedirs(f"{target}/{split}/labels", exist_ok=True)

    uncopied_labels = 0
    for stem, position in zip(stems, _assign_splits(len(stems), ratios)):
        split = split_names[position]
        img_name = stem + ".jpg"
        lbl_name = stem + ".txt"

        _copy_or_report(
            f"{source}/images/{img_name}",
            f"{target}/{split}/images/{img_name}",
            "copy image fail",
        )
        # An image without a label can be legitimate (e.g. a background
        # image), so label failures are tallied instead of printed per file.
        try:
            copyfile(f"{source}/labels/{lbl_name}", f"{target}/{split}/labels/{lbl_name}")
        except OSError:
            uncopied_labels += 1

    if uncopied_labels:
        print(f"copy label fail for {uncopied_labels} file(s)")



if __name__ == "__main__":
    main()

 

아래는 이전에 만든 버전입니다.


 

딥러닝을 하다 보면 데이터를 학습 데이터뿐만 아니라 

Validation Set, Test Set으로도 나눠야 되는 경우가 생깁니다.

SkLearn에서의 train_test_split이 있긴 하지만, 제가 원하는 split이 아니기 때문에, 

새로 python script를 만들기로 하겠습니다.

 

일단 조건은, Yolo 형식의 이미지와 레이블이 한 폴더 안에 같이 있을 때를 

기준으로 하도록 하겠습니다.

 

import os
import random
from shutil import copyfile

# Split a flat YOLO folder (images and labels side by side in ./<dir_name>/)
# into ./<dir_name>_train_test/{train,val}/{images,labels}.
dir_name = "target_dir"
test_ratio = 0.2  # fraction of the images that goes to the val split
lstdir = os.listdir(f"./{dir_name}/")

# Keep the stem of every real .jpg file; a suffix test avoids picking up
# names that merely contain ".jpg" somewhere in the middle.
lst = [os.path.splitext(name)[0] for name in lstdir if name.endswith(".jpg")]

random.shuffle(lst)

print(lst)

# Indices below this value go to train, the rest go to val.
train_last_idx = int(len(lst) * (1-test_ratio))
print(train_last_idx)

# os.mkdir (not makedirs with exist_ok) on purpose: running the script again
# on an existing output folder fails instead of mixing two different shuffles.
os.mkdir(f"./{dir_name}_train_test")
new_dir_name = f"./{dir_name}_train_test"

train_dir_name = new_dir_name + "/train"
val_dir_name = new_dir_name + "/val"

os.mkdir(train_dir_name)
os.mkdir(val_dir_name)

train_image_dir_name = train_dir_name + "/images"
train_label_dir_name = train_dir_name + "/labels"

os.mkdir(train_image_dir_name)
os.mkdir(train_label_dir_name)

val_image_dir_name = val_dir_name + "/images"
val_label_dir_name = val_dir_name + "/labels"

os.mkdir(val_image_dir_name)
os.mkdir(val_label_dir_name)


for i, stem in enumerate(lst):
    img_name = stem + ".jpg"
    lbl_name = stem + ".txt"

    # Pick the destination once so both copies share the same code path.
    if i < train_last_idx:
        split_name = "train"
        image_dst_dir = train_image_dir_name
        label_dst_dir = train_label_dir_name
    else:
        split_name = "val"
        image_dst_dir = val_image_dir_name
        label_dst_dir = val_label_dir_name

    # Best effort: a failed copy is reported and the loop carries on. Only
    # OS-level copy errors are caught, so Ctrl-C and real bugs still surface.
    try:
        copyfile(f"./{dir_name}/{img_name}", f"{image_dst_dir}/{img_name}")
    except OSError:
        print(f"copy image fail for {split_name}")
    try:
        copyfile(f"./{dir_name}/{lbl_name}", f"{label_dst_dir}/{lbl_name}")
    except OSError:
        print(f"copy label fail for {split_name}")

print("complete!")

 

 

 

 

반응형