[DL] RNN을 활용한 주가 예측
카테고리: DL
태그: Deep Learning, NLP, RNN
💡 교내 학회 NLP 분반에서 학습한 내용을 정리한 포스팅입니다.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
1. finance-datareader로 주가 데이터 갖고 오기
finance-datareader는 한국, 미국 주식 가격, 지수, 환율 등 금융 데이터를 수집해놓은 라이브러리이다.
DataReader(종목코드, 시작날짜, 종료날짜)
로 주가 데이터를 갖고올 수 있다.
!pip install finance-datareader
import FinanceDataReader as fdr
# Kakao daily price data (ticker '035720') from 2016 through 2022.
data = fdr.DataReader('035720', start='20160101', end='20221231')
data.head()
Open | High | Low | Close | Volume | Change | |
---|---|---|---|---|---|---|
Date | ||||||
2016-01-04 | 23583 | 23583 | 23142 | 23142 | 297687 | -0.004345 |
2016-01-05 | 22900 | 23583 | 22840 | 23503 | 318036 | 0.015599 |
2016-01-06 | 23684 | 24306 | 23543 | 23905 | 544137 | 0.017104 |
2016-01-07 | 23764 | 24186 | 23403 | 23544 | 342194 | -0.015101 |
2016-01-08 | 23142 | 23262 | 22820 | 23122 | 400046 | -0.017924 |
data.tail()
Open | High | Low | Close | Volume | Change | |
---|---|---|---|---|---|---|
Date | ||||||
2022-12-23 | 54000 | 54300 | 53100 | 53400 | 1339673 | -0.030853 |
2022-12-26 | 53400 | 53800 | 52700 | 53600 | 988777 | 0.003745 |
2022-12-27 | 53900 | 54700 | 53600 | 54400 | 1226474 | 0.014925 |
2022-12-28 | 53900 | 54700 | 52900 | 53600 | 1268005 | -0.014706 |
2022-12-29 | 53500 | 55300 | 52900 | 53100 | 1319611 | -0.009328 |
index로 지정되어있는 Date를 컬럼으로 바꿔준다.
data.reset_index(drop=False, inplace=True)
data
Date | Open | High | Low | Close | Volume | Change | |
---|---|---|---|---|---|---|---|
0 | 2016-01-04 | 23583 | 23583 | 23142 | 23142 | 297687 | -0.004345 |
1 | 2016-01-05 | 22900 | 23583 | 22840 | 23503 | 318036 | 0.015599 |
2 | 2016-01-06 | 23684 | 24306 | 23543 | 23905 | 544137 | 0.017104 |
3 | 2016-01-07 | 23764 | 24186 | 23403 | 23544 | 342194 | -0.015101 |
4 | 2016-01-08 | 23142 | 23262 | 22820 | 23122 | 400046 | -0.017924 |
... | ... | ... | ... | ... | ... | ... | ... |
1716 | 2022-12-23 | 54000 | 54300 | 53100 | 53400 | 1339673 | -0.030853 |
1717 | 2022-12-26 | 53400 | 53800 | 52700 | 53600 | 988777 | 0.003745 |
1718 | 2022-12-27 | 53900 | 54700 | 53600 | 54400 | 1226474 | 0.014925 |
1719 | 2022-12-28 | 53900 | 54700 | 52900 | 53600 | 1268005 | -0.014706 |
1720 | 2022-12-29 | 53500 | 55300 | 52900 | 53100 | 1319611 | -0.009328 |
1721 rows × 7 columns
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1721 entries, 0 to 1720
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Date 1721 non-null datetime64[ns]
1 Open 1721 non-null int64
2 High 1721 non-null int64
3 Low 1721 non-null int64
4 Close 1721 non-null int64
5 Volume 1721 non-null int64
6 Change 1721 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(5)
memory usage: 94.2 KB
2. train, val set 나누기
평가에 사용할 validation set은 전체 데이터의 30%를 사용할 것이다.
# Chronological 70 / 30 split: the first 70% of rows are used for training,
# the remaining rows for validation.
split_ratio = 0.7
length_data = len(data)  # total number of rows
length_train = round(length_data * split_ratio)
length_validation = length_data - length_train

print(f"Data length : {length_data}")
print(f"Train data length : {length_train}")
print(f"Validation data lenth : {length_validation}")
Data length : 1721
Train data length : 1205
Validation data lenth : 516
train_data = data[:length_train]
train_data
Date | Open | High | Low | Close | Volume | Change | |
---|---|---|---|---|---|---|---|
0 | 2016-01-04 | 23583 | 23583 | 23142 | 23142 | 297687 | -0.004345 |
1 | 2016-01-05 | 22900 | 23583 | 22840 | 23503 | 318036 | 0.015599 |
2 | 2016-01-06 | 23684 | 24306 | 23543 | 23905 | 544137 | 0.017104 |
3 | 2016-01-07 | 23764 | 24186 | 23403 | 23544 | 342194 | -0.015101 |
4 | 2016-01-08 | 23142 | 23262 | 22820 | 23122 | 400046 | -0.017924 |
... | ... | ... | ... | ... | ... | ... | ... |
1200 | 2020-11-23 | 73964 | 74064 | 73261 | 73663 | 363196 | 0.004103 |
1201 | 2020-11-24 | 73562 | 75670 | 73562 | 74867 | 649755 | 0.016345 |
1202 | 2020-11-25 | 75770 | 75770 | 73462 | 73663 | 703279 | -0.016082 |
1203 | 2020-11-26 | 73763 | 75168 | 73562 | 75068 | 539570 | 0.019073 |
1204 | 2020-11-27 | 75369 | 75770 | 74465 | 74867 | 335836 | -0.002678 |
1205 rows × 7 columns
validation_data = data[length_train:]
validation_data
Date | Open | High | Low | Close | Volume | Change | |
---|---|---|---|---|---|---|---|
1205 | 2020-11-30 | 74865 | 75066 | 73862 | 73863 | 544990 | -0.013410 |
1206 | 2020-12-01 | 74164 | 75268 | 74064 | 75168 | 452723 | 0.017668 |
1207 | 2020-12-02 | 75369 | 75369 | 74465 | 74867 | 440500 | -0.004004 |
1208 | 2020-12-03 | 74867 | 75068 | 74064 | 75068 | 464371 | 0.002685 |
1209 | 2020-12-04 | 75068 | 78781 | 74465 | 78179 | 1561745 | 0.041442 |
... | ... | ... | ... | ... | ... | ... | ... |
1716 | 2022-12-23 | 54000 | 54300 | 53100 | 53400 | 1339673 | -0.030853 |
1717 | 2022-12-26 | 53400 | 53800 | 52700 | 53600 | 988777 | 0.003745 |
1718 | 2022-12-27 | 53900 | 54700 | 53600 | 54400 | 1226474 | 0.014925 |
1719 | 2022-12-28 | 53900 | 54700 | 52900 | 53600 | 1268005 | -0.014706 |
1720 | 2022-12-29 | 53500 | 55300 | 52900 | 53100 | 1319611 | -0.009328 |
516 rows × 7 columns
분석에 쓰일 것은 시가이므로 Open 컬럼의 값들만 가져온다.
dataset_train = train_data.Open.values # open 컬럼에 있는 변수만 가져오기
dataset_train.shape
(1205,)
# Convert the 1-D array into a 2-D column vector.
# Changing shape from (1205,) to (1205,1)
dataset_train = np.reshape(dataset_train, (-1,1))
dataset_train.shape
(1205, 1)
validation set도 마찬가지의 과정을 거쳐준다.
dataset_validation = validation_data.Open.values
dataset_validation = np.reshape(dataset_validation, (-1,1))
3. Scaling
데이터의 전체적인 분포를 생각했을 때 정규화를 시켜줘야 한다. MinMaxScaler()
를 사용하면 feature의 값이 0~1의 범위를 갖도록 스케일링된다.
data.describe()
Open | High | Low | Close | Volume | Change | |
---|---|---|---|---|---|---|
count | 1721.000000 | 1721.000000 | 1721.000000 | 1721.000000 | 1.721000e+03 | 1721.000000 |
mean | 49057.984311 | 49821.862289 | 48313.374201 | 49232.969204 | 1.150001e+06 | 0.000746 |
std | 37993.093979 | 38595.746965 | 37402.362602 | 38001.443308 | 1.598395e+06 | 0.023144 |
min | 0.000000 | 0.000000 | 0.000000 | 14311.000000 | 0.000000e+00 | -0.100649 |
25% | 21075.000000 | 21474.000000 | 20773.000000 | 21075.000000 | 3.417420e+05 | -0.012584 |
50% | 27698.000000 | 28199.000000 | 27296.000000 | 27799.000000 | 6.295670e+05 | 0.000000 |
75% | 74164.000000 | 75068.000000 | 73200.000000 | 74064.000000 | 1.380887e+06 | 0.012292 |
max | 172000.000000 | 173000.000000 | 161000.000000 | 169500.000000 | 1.889515e+07 | 0.155512 |
시가(Open)가 0인 경우가 있어 살펴보니 21년 4월 12~14일이다. 기사를 검색해보니 21년 4월 15일 카카오의 액면분할이 있어 12~14일 매매 정지가 있었다고 한다.
data[data['Open']==0]
Date | Open | High | Low | Close | Volume | Change | |
---|---|---|---|---|---|---|---|
1294 | 2021-04-12 | 0 | 0 | 0 | 112000 | 0 | 0.0 |
1295 | 2021-04-13 | 0 | 0 | 0 | 112000 | 0 | 0.0 |
1296 | 2021-04-14 | 0 | 0 | 0 | 112000 | 0 | 0.0 |
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling to the [0, 1] range. The scaler is fitted on the training
# split only and then applied unchanged to the validation split, so validation
# values outside the training range map outside [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(dataset_train)
dataset_train_scaled = scaler.transform(dataset_train)
scaled_dataset_validation = scaler.transform(dataset_validation)

# Visual check of the scaled training series.
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(dataset_train_scaled)
ax.set_xlabel("Days")
ax.set_ylabel("Open Price")
plt.show()
4. time step 50으로 X_train, y_train 생성
RNN 모델을 만들 때 예측에 쓰이는 관측치를 만들어줘야 한다. 만약 1-50일째의 데이터를 가지고 51일째를 예측하고 싶다면, 이 50일간의 데이터가 관측치이고 time_step
은 50이 된다. 그리고 2-51일째의 데이터를 가지고 52일째를 예측하게 된다. 즉, 한 칸(하루)씩 이동하면서 예측을 반복하는 것이다.
# Each sample is a window of `time_step` consecutive scaled Open values and
# its label is the value that immediately follows the window.
time_step = 50  # 50 observations are used to predict the 51st

# Training windows: slide one day at a time over the scaled training series.
train_starts = range(length_train - time_step)
X_train = np.array(
    [dataset_train_scaled[start:start + time_step, 0] for start in train_starts]
)
y_train = np.array(
    [dataset_train_scaled[start + time_step, 0] for start in train_starts]
)
# Add the trailing feature axis: (samples, time_step) -> (samples, time_step, 1).
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
y_train = np.reshape(y_train, (y_train.shape[0], 1))

# Validation windows, built the same way from the scaled validation series.
test_starts = range(length_validation - time_step)
X_test = np.array(
    [scaled_dataset_validation[start:start + time_step, 0] for start in test_starts]
)
y_test = np.array(
    [scaled_dataset_validation[start + time_step, 0] for start in test_starts]
)
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
y_test = np.reshape(y_test, (-1, 1))
5. RNN 모델 생성
# importing libraries
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import SimpleRNN
from keras.layers import Dropout
# Four stacked SimpleRNN layers (50 units each), each followed by 20% dropout,
# and a single-unit Dense output layer. The first three recurrent layers return
# the full sequence so the next recurrent layer receives one vector per step;
# the last one returns only its final state.
regressor = Sequential([
    SimpleRNN(
        units=50,
        activation="tanh",
        return_sequences=True,
        input_shape=(X_train.shape[1], 1),
    ),
    Dropout(0.2),
    SimpleRNN(units=50, activation="tanh", return_sequences=True),
    Dropout(0.2),
    SimpleRNN(units=50, activation="tanh", return_sequences=True),
    Dropout(0.2),
    SimpleRNN(units=50),
    Dropout(0.2),
    Dense(units=1),
])

# NOTE(review): "accuracy" is a classification metric and carries no useful
# information for this MSE regression; it is kept because its history entry is
# plotted after training.
regressor.compile(
    optimizer="adam",
    loss="mean_squared_error",
    metrics=["accuracy"],
)

# Train for 50 epochs with mini-batches of 32 samples.
history = regressor.fit(X_train, y_train, epochs=50, batch_size=32)
Epoch 1/50
37/37 [==============================] - 5s 42ms/step - loss: 0.3659 - accuracy: 0.0017
Epoch 2/50
37/37 [==============================] - 2s 42ms/step - loss: 0.2305 - accuracy: 8.6580e-04
Epoch 3/50
37/37 [==============================] - 2s 42ms/step - loss: 0.1752 - accuracy: 0.0017
Epoch 4/50
37/37 [==============================] - 2s 43ms/step - loss: 0.1480 - accuracy: 0.0017
Epoch 5/50
37/37 [==============================] - 2s 42ms/step - loss: 0.1032 - accuracy: 0.0017
Epoch 6/50
37/37 [==============================] - 2s 44ms/step - loss: 0.0864 - accuracy: 8.6580e-04
Epoch 7/50
37/37 [==============================] - 2s 42ms/step - loss: 0.0635 - accuracy: 8.6580e-04
Epoch 8/50
37/37 [==============================] - 2s 43ms/step - loss: 0.0447 - accuracy: 8.6580e-04
Epoch 9/50
37/37 [==============================] - 2s 42ms/step - loss: 0.0330 - accuracy: 0.0017
Epoch 10/50
37/37 [==============================] - 2s 42ms/step - loss: 0.0309 - accuracy: 8.6580e-04
Epoch 11/50
37/37 [==============================] - 2s 42ms/step - loss: 0.0273 - accuracy: 0.0017
Epoch 12/50
37/37 [==============================] - 2s 43ms/step - loss: 0.0207 - accuracy: 0.0017
Epoch 13/50
37/37 [==============================] - 2s 42ms/step - loss: 0.0179 - accuracy: 0.0017
Epoch 14/50
37/37 [==============================] - 2s 43ms/step - loss: 0.0174 - accuracy: 0.0017
Epoch 15/50
37/37 [==============================] - 2s 43ms/step - loss: 0.0158 - accuracy: 0.0017
Epoch 16/50
37/37 [==============================] - 2s 43ms/step - loss: 0.0138 - accuracy: 0.0017
Epoch 17/50
37/37 [==============================] - 2s 43ms/step - loss: 0.0132 - accuracy: 0.0017
Epoch 18/50
37/37 [==============================] - 2s 42ms/step - loss: 0.0123 - accuracy: 0.0017
Epoch 19/50
37/37 [==============================] - 2s 44ms/step - loss: 0.0093 - accuracy: 0.0017
Epoch 20/50
37/37 [==============================] - 2s 43ms/step - loss: 0.0102 - accuracy: 0.0017
Epoch 21/50
37/37 [==============================] - 2s 62ms/step - loss: 0.0085 - accuracy: 0.0017
Epoch 22/50
37/37 [==============================] - 3s 74ms/step - loss: 0.0071 - accuracy: 0.0017
Epoch 23/50
37/37 [==============================] - 2s 43ms/step - loss: 0.0078 - accuracy: 0.0017
Epoch 24/50
37/37 [==============================] - 2s 44ms/step - loss: 0.0084 - accuracy: 0.0017
Epoch 25/50
37/37 [==============================] - 2s 43ms/step - loss: 0.0068 - accuracy: 0.0017
Epoch 26/50
37/37 [==============================] - 2s 43ms/step - loss: 0.0070 - accuracy: 0.0017
Epoch 27/50
37/37 [==============================] - 2s 43ms/step - loss: 0.0056 - accuracy: 0.0017
Epoch 28/50
37/37 [==============================] - 2s 42ms/step - loss: 0.0062 - accuracy: 0.0017
Epoch 29/50
37/37 [==============================] - 2s 43ms/step - loss: 0.0055 - accuracy: 0.0017
Epoch 30/50
37/37 [==============================] - 2s 43ms/step - loss: 0.0058 - accuracy: 0.0017
Epoch 31/50
37/37 [==============================] - 2s 43ms/step - loss: 0.0053 - accuracy: 0.0017
Epoch 32/50
37/37 [==============================] - 2s 43ms/step - loss: 0.0044 - accuracy: 0.0017
Epoch 33/50
37/37 [==============================] - 2s 43ms/step - loss: 0.0048 - accuracy: 0.0017
Epoch 34/50
37/37 [==============================] - 2s 42ms/step - loss: 0.0045 - accuracy: 0.0017
Epoch 35/50
37/37 [==============================] - 2s 42ms/step - loss: 0.0045 - accuracy: 0.0017
Epoch 36/50
37/37 [==============================] - 2s 42ms/step - loss: 0.0048 - accuracy: 0.0017
Epoch 37/50
37/37 [==============================] - 2s 43ms/step - loss: 0.0033 - accuracy: 0.0017
Epoch 38/50
37/37 [==============================] - 2s 43ms/step - loss: 0.0044 - accuracy: 0.0017
Epoch 39/50
37/37 [==============================] - 2s 42ms/step - loss: 0.0039 - accuracy: 0.0017
Epoch 40/50
37/37 [==============================] - 2s 42ms/step - loss: 0.0042 - accuracy: 0.0017
Epoch 41/50
37/37 [==============================] - 2s 42ms/step - loss: 0.0037 - accuracy: 0.0017
Epoch 42/50
37/37 [==============================] - 2s 43ms/step - loss: 0.0031 - accuracy: 0.0017
Epoch 43/50
37/37 [==============================] - 2s 42ms/step - loss: 0.0037 - accuracy: 0.0017
Epoch 44/50
37/37 [==============================] - 2s 42ms/step - loss: 0.0036 - accuracy: 0.0017
Epoch 45/50
37/37 [==============================] - 2s 43ms/step - loss: 0.0035 - accuracy: 0.0017
Epoch 46/50
37/37 [==============================] - 2s 42ms/step - loss: 0.0031 - accuracy: 0.0017
Epoch 47/50
37/37 [==============================] - 2s 42ms/step - loss: 0.0033 - accuracy: 0.0017
Epoch 48/50
37/37 [==============================] - 2s 42ms/step - loss: 0.0032 - accuracy: 0.0017
Epoch 49/50
37/37 [==============================] - 2s 43ms/step - loss: 0.0028 - accuracy: 0.0017
Epoch 50/50
37/37 [==============================] - 2s 42ms/step - loss: 0.0029 - accuracy: 0.0017
# One figure per recorded training curve: (history key, y-axis label, figure size).
for history_key, y_label, fig_size in (
    ("loss", "Losses", (10, 7)),
    ("accuracy", "Accuracies", (10, 5)),
):
    plt.figure(figsize=fig_size)
    plt.plot(history.history[history_key])
    plt.xlabel("Epochs")
    plt.ylabel(y_label)
    plt.title("Simple RNN model")
    plt.show()
6. train set 예측
y_pred = regressor.predict(X_train) # predictions on the training windows (still in scaled units)
y_pred = scaler.inverse_transform(y_pred) # map the predictions from the scaled range back to original prices
y_pred.shape
37/37 [==============================] - 0s 12ms/step
(1155, 1)
# NOTE: this rebinds y_train to unscaled prices. Running this cell a second
# time would apply the inverse transform twice, and re-fitting the model after
# this point would train against unscaled targets.
y_train = scaler.inverse_transform(y_train) # scaling back from 0-1 to original
y_train.shape
(1155, 1)
훈련한 모델을 가지고 예측한 y_pred와 실제값인 y_train를 비교한다.
plt.figure(figsize = (30,10))
plt.plot(y_pred, color = "b", label = "y_pred" )
plt.plot(y_train, color = "g", label = "y_train")
plt.xlabel("Days")
plt.ylabel("Open price")
plt.title("Simple RNN model, Predictions with input X_train vs y_train")
plt.legend()
plt.show()
7. test set 예측
# predictions with X_test data
y_pred_of_test = regressor.predict(X_test)
# scaling back from 0-1 to original
y_pred_of_test = scaler.inverse_transform(y_pred_of_test)
print("Shape of y_pred_of_test :",y_pred_of_test.shape)
15/15 [==============================] - 1s 12ms/step
Shape of y_pred_of_test : (466, 1)
# Compare predicted and actual values on the validation windows.
plt.figure(figsize = (30,10))
plt.plot(y_pred_of_test, label = "y_pred_of_test", c = "orange")
plt.plot(scaler.inverse_transform(y_test), label = "y_test", c = "g")
plt.xlabel("Days")
plt.ylabel("Open price")
plt.title("Simple RNN model, Prediction with input X_test vs y_test") # predicted vs. actual values
plt.legend()
plt.show()
앞서 예측한 train set의 시각화와 test set의 시각화를 이어붙여준다. 역시나 모델이 학습된 것은 train set 기반이었기에, train set을 예측한 앞부분은 오차가 적어보이는 반면, test set을 예측한 뒷부분은 오차가 꽤 있어보인다.
곤두박질 친 부분은 앞서 언급한 액면분할로 인한 거래중지의 data이다.
# Overlay the raw Open series of both splits with the model predictions on a
# shared date axis.
plt.subplots(figsize =(30,12))
plt.plot(train_data.Date, train_data.Open, label = "train_data", color = "b")
plt.plot(validation_data.Date, validation_data.Open, label = "validation_data", color = "g")
# The first `time_step` rows of each split only serve as input context and have
# no prediction, so the date axis for the predictions starts at row `time_step`.
plt.plot(train_data.Date.iloc[time_step:], y_pred, label = "y_pred", color = "r")
plt.plot(validation_data.Date.iloc[time_step:], y_pred_of_test, label = "y_pred_of_test", color = "orange")
plt.xlabel("Days")
plt.ylabel("Open price")
plt.title("Simple RNN model, Train-Validation-Prediction")
plt.legend()
plt.show()
8. Pytorch
이번엔 케라스가 아닌 파이토치로 모델을 구현해본다. 그리고 이전에는 과거 Open 데이터만으로 미래 Open을 예측한 반면, 이번에는 Open, High, Low, Volume, Close로 미래 Open을 예측한다.
import torch
import torch.nn as nn
import torch.optim as optim
# Kakao daily price data (ticker '035720') from 2016 through 2022, reloaded for
# the PyTorch experiment.
data_pytorch = fdr.DataReader('035720', start='20160101', end='20221231')
data_pytorch.reset_index(drop=False, inplace=True)
# NOTE(review): the scaler is fitted on the whole series before any train/test
# split, so statistics of the later (test) period leak into the scaled training
# data. This also rebinds `scaler`; any earlier scaler bound to that name is no
# longer reachable after this cell.
scaler = MinMaxScaler()
data_pytorch[['Open','High','Low','Close','Volume']] = scaler.fit_transform(data_pytorch[['Open','High','Low','Close','Volume']])
data_pytorch.head()
Date | Open | High | Low | Close | Volume | Change | |
---|---|---|---|---|---|---|---|
0 | 2016-01-04 | 0.137110 | 0.136318 | 0.143739 | 0.056905 | 0.015755 | -0.004345 |
1 | 2016-01-05 | 0.133140 | 0.136318 | 0.141863 | 0.059231 | 0.016832 | 0.015599 |
2 | 2016-01-06 | 0.137698 | 0.140497 | 0.146230 | 0.061821 | 0.028798 | 0.017104 |
3 | 2016-01-07 | 0.138163 | 0.139803 | 0.145360 | 0.059495 | 0.018110 | -0.015101 |
4 | 2016-01-08 | 0.134547 | 0.134462 | 0.141739 | 0.056776 | 0.021172 | -0.017924 |
data_pytorch.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1721 entries, 0 to 1720
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Date 1721 non-null datetime64[ns]
1 Open 1721 non-null float64
2 High 1721 non-null float64
3 Low 1721 non-null float64
4 Close 1721 non-null float64
5 Volume 1721 non-null float64
6 Change 1721 non-null float64
dtypes: datetime64[ns](1), float64(6)
memory usage: 94.2 KB
데이터셋을 feature와 target으로 나눈다.
X = data_pytorch[['Open','High','Low','Volume', 'Close']].values
y = data_pytorch['Open'].values
앞서 했던 것처럼 time step을 50으로 잡고 데이터를 생성해준다.
def seq_data(x, y, time_step):
    """Build sliding-window inputs and next-step targets from a time series.

    Window ``i`` is ``x[i : i + time_step]`` and its target is
    ``y[i + time_step]``, for ``i`` in ``range(len(x) - time_step)``.

    Args:
        x: Array-like of features, indexed by time along the first axis.
        y: Array-like of targets, aligned with ``x`` along the first axis.
        time_step: Number of consecutive rows in each input window.

    Returns:
        A tuple ``(x_seq, y_seq)`` of float32 tensors. ``x_seq`` has one
        window per row (``len(x) - time_step`` windows) and ``y_seq`` holds
        the matching targets. Both are empty tensors when ``x`` has no more
        than ``time_step`` rows.

    Raises:
        IndexError: If ``y`` is shorter than ``x`` (a target would be missing).
    """
    x = np.asarray(x, dtype=np.float32)
    y = np.asarray(y, dtype=np.float32)

    n_windows = len(x) - time_step
    if n_windows <= 0:
        # Not enough rows for a single window.
        return torch.FloatTensor([]), torch.FloatTensor([])
    if len(y) < len(x):
        raise IndexError("y must provide a target for every window of x")

    # Stack into one contiguous ndarray first: building a tensor directly from
    # a Python list of ndarrays is very slow and triggers a PyTorch warning.
    x_seq = np.stack([x[i:i + time_step] for i in range(n_windows)])
    # Copy so the returned tensor never shares memory with the caller's array.
    y_seq = y[time_step:time_step + n_windows].copy()
    return torch.from_numpy(x_seq), torch.from_numpy(y_seq)
time_step = 50
x_seq, y_seq = seq_data(X, y, time_step)
train과 test set을 7:3 비율로 split 해준다.
# Chronological 70 / 30 split of the windowed samples (no shuffling).
split_ratio = 0.7
length_data = x_seq.shape[0]  # number of windows
length_train = round(length_data * split_ratio)

x_train_seq, x_test_seq = x_seq[:length_train], x_seq[length_train:]
y_train_seq, y_test_seq = y_seq[:length_train], y_seq[length_train:]

print(x_train_seq.size(), y_train_seq.size())
print(x_test_seq.size(), y_test_seq.size())
torch.Size([1170, 50, 5]) torch.Size([1170])
torch.Size([501, 50, 5]) torch.Size([501])
이제 mini batch 형태로 쪼개질 수 있도록 DataLoader()
를 사용하여 데이터를 iterable하게 만든다.
# Wrap the (window, target) tensors as datasets and iterate them in
# mini-batches of `batch_size`.
train = torch.utils.data.TensorDataset(x_train_seq, y_train_seq)
test = torch.utils.data.TensorDataset(x_test_seq, y_test_seq)
batch_size = 32
# shuffle=False keeps the samples in chronological order, so predictions
# collected batch by batch line up with the original time axis.
train_loader = torch.utils.data.DataLoader(dataset=train, batch_size=batch_size, shuffle=False)
test_loader = torch.utils.data.DataLoader(dataset=test, batch_size=batch_size, shuffle=False)
input_dim
는 feature의 개수이고, num_layers
는 쌓고자 하는 층의 개수이다.
input_dim = x_seq.size(2)
num_layers = 4
hidden_dim = 2
9. Pytorch RNN class 구성
class My_RNN(nn.Module):
    """Stacked vanilla RNN with a linear + sigmoid head over all time steps.

    The RNN outputs of every time step are flattened and fed to a single
    linear layer, so the model is tied to a fixed window length
    (``time_step``). Because the head ends in a sigmoid, predictions lie in
    (0, 1); targets are expected to be scaled to that range.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the RNN hidden state.
        time_step: Length of every input window.
        num_layers: Number of stacked RNN layers.
    """

    def __init__(self, input_dim, hidden_dim, time_step, num_layers):
        super(My_RNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        # batch_first=True: inputs and outputs are (batch, time_step, features).
        self.rnn = nn.RNN(input_dim, hidden_dim, num_layers, dropout=0.2, batch_first=True)
        self.fc = nn.Sequential(nn.Linear(hidden_dim * time_step, 1), nn.Sigmoid())

    def forward(self, x):
        """Return one prediction per sample, shaped ``(batch, 1)``.

        Args:
            x: Input tensor of shape ``(batch, time_step, input_dim)``.
        """
        # Zero initial hidden state, created on the same device and with the
        # same dtype as the input so the model also works off-CPU or in a
        # non-default precision.
        h0 = torch.zeros(
            self.num_layers,
            x.size(0),
            self.hidden_dim,
            device=x.device,
            dtype=x.dtype,
        )
        # `out` holds the last layer's output at every time step; the final
        # hidden state is not needed here.
        out, _ = self.rnn(x, h0)
        # Flatten all time steps into a single feature vector per sample.
        out = out.reshape(out.shape[0], -1)
        return self.fc(out)
model = My_RNN(input_dim = input_dim,
hidden_dim = hidden_dim,
time_step = time_step,
num_layers = num_layers)
# Training setup: MSE loss, Adam optimizer, 50 epochs.
criterion = nn.MSELoss()
lr = 1e-3
num_epochs = 50
loss_function = torch.nn.MSELoss()  # not used by the loop below; kept so the name stays defined
optimizer = optim.Adam(model.parameters(), lr=lr)

loss_graph = []  # mean training loss per epoch, for plotting
n = len(train_loader)  # number of mini-batches per epoch

# Make sure dropout is active even if the model was switched to eval mode earlier.
model.train()

for epoch in range(num_epochs):
    running_loss = 0.0

    # Unpack directly into (seq, target) so the loop does not rebind the
    # module-level name `data`.
    for seq, target in train_loader:
        out = model(seq)  # forward pass, shape (batch, 1)
        # The target arrives as (batch,). Without reshaping, MSELoss would
        # broadcast (batch, 1) against (batch,) into a (batch, batch) error
        # matrix and optimise the wrong objective.
        loss = criterion(out, target.reshape(out.shape))

        optimizer.zero_grad()  # clear gradients from the previous step
        loss.backward()  # back-propagate
        optimizer.step()  # update the weights
        running_loss += loss.item()  # accumulate this mini-batch's loss

    loss_graph.append(running_loss / n)  # mean mini-batch loss of the epoch
    if epoch % 10 == 0:
        print('[epoch: %d] loss: %.4f'%(epoch, running_loss/n))
[epoch: 0] loss: 0.0702
[epoch: 10] loss: 0.0060
[epoch: 20] loss: 0.0008
[epoch: 30] loss: 0.0006
[epoch: 40] loss: 0.0007
plt.figure(figsize =(10,7))
plt.plot(loss_graph)
plt.xlabel("Epochs")
plt.ylabel("Losses")
plt.title("Simple RNN model")
plt.show()
10. Pytorch 예측
이제 학습한 model로 train set과 test set을 예측한다. 아까처럼 두 그래프를 이어붙여서 시각화할 것이다.
결과를 보면 test set 부분이 많이 아쉽다. feature로 무엇을 쓰고 layer를 어떻게 쌓느냐에 따라 결과가 많이 바뀔 것 같다.
def plotting(train_loader, test_loader, actual):
    """Plot model predictions for the train and test loaders against `actual`.

    Predictions are collected batch by batch from ``train_loader`` and then
    ``test_loader`` using the module-level ``model``, concatenated in that
    order, and drawn together with ``actual``. A dashed vertical line marks
    where the training predictions end.

    Args:
        train_loader: Iterable yielding ``(seq, target)`` training batches.
        test_loader: Iterable yielding ``(seq, target)`` test batches.
        actual: Sequence of true values aligned with the concatenated
            predictions (a pandas Series is accepted; its index is ignored).
    """
    # Switch to eval mode so dropout is disabled while predicting, then
    # restore whatever mode the model was in before.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            train_pred = []
            test_pred = []

            for seq, _ in train_loader:
                train_pred += model(seq).cpu().numpy().tolist()

            for seq, _ in test_loader:
                test_pred += model(seq).cpu().numpy().tolist()
    finally:
        model.train(was_training)

    total = train_pred + test_pred

    # Plot by position rather than by the Series index: a slice such as
    # series[time_step:] keeps index labels starting at time_step and would
    # otherwise be drawn shifted relative to the predictions.
    actual_values = np.asarray(actual)

    plt.figure(figsize=(20,10))
    plt.plot(np.ones(100)*len(train_pred), np.linspace(0,1,100), '--', linewidth=0.6)
    plt.plot(actual_values, '--')
    plt.plot(total, 'b', linewidth=0.6)

    plt.legend(['train boundary', 'actual', 'prediction'])
    plt.show()
plotting(train_loader, test_loader, data_pytorch['Open'][time_step:])