HW2 Deep Learning Paper Review and Research Plan 3
HW2 Deep Learning Paper Review and Research Plan 2
HW2 Deep Learning Paper Review and Research Plan
Week 12 Lecture Materials
Week 11 Class Materials
Word2vec Example
# https://github.com/adventuresinML/adventures-in-ml-code
import gensim
from gensim.models import word2vec
import logging
from keras.layers import Input, Embedding, merge
from keras.models import Model
import tensorflow as tf
import numpy as np
import urllib.request
import os
import zipfile
vector_dim = 300
#root_path = "C:\\Users\Andy\PycharmProjects\\adventures-in-ml-code\\"
root_path = "C:\\MyData\\pywork\\"
def maybe_download(filename, url, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename
# convert the input data into a list of integer indexes aligning with the wv indexes
# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words."""
    with zipfile.ZipFile(filename) as f:
        # decode the raw bytes so the words match gensim's string vocabulary
        data = f.read(f.namelist()[0]).decode('utf-8').split()
    return data
def convert_data_to_index(string_data, wv):
    index_data = []
    for word in string_data:
        if word in wv:
            index_data.append(wv.vocab[word].index)
    return index_data
def gensim_demo():
    # url = 'http://mattmahoney.net/dc/'
    # filename = maybe_download('text8.zip', url, 31344016)
    filename = 'text8.zip'
    if not os.path.exists((root_path + filename).strip('.zip')):
        zipfile.ZipFile(root_path + filename).extractall()
    sentences = word2vec.Text8Corpus((root_path + filename).strip('.zip'))
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = word2vec.Word2Vec(sentences, iter=10, min_count=10, size=300, workers=4)
    # get the word vector of "the"
    print(model.wv['the'])
    # get the most common words
    print(model.wv.index2word[0], model.wv.index2word[1], model.wv.index2word[2])
    # get the least common words
    vocab_size = len(model.wv.vocab)
    print(model.wv.index2word[vocab_size - 1], model.wv.index2word[vocab_size - 2], model.wv.index2word[vocab_size - 3])
    # find the index of the 2nd most common word ("of")
    print('Index of "of" is: {}'.format(model.wv.vocab['of'].index))
    # some similarity fun
    print(model.wv.similarity('woman', 'man'), model.wv.similarity('man', 'elephant'))
    # what doesn't fit?
    print(model.wv.doesnt_match("green blue red zebra".split()))
    str_data = read_data(root_path + filename)
    index_data = convert_data_to_index(str_data, model.wv)
    print(str_data[:4], index_data[:4])
    # save and reload the model
    model.save(root_path + "mymodel")
def create_embedding_matrix(model):
    # convert the wv word vectors into a numpy matrix that is suitable for insertion
    # into our TensorFlow and Keras models
    embedding_matrix = np.zeros((len(model.wv.vocab), vector_dim))
    for i in range(len(model.wv.vocab)):
        embedding_vector = model.wv[model.wv.index2word[i]]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
def tf_model(embedding_matrix, wv):
    valid_size = 16  # Random set of words to evaluate similarity on.
    valid_window = 100  # Only pick dev samples in the head of the distribution.
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    # embedding layer weights are frozen to avoid updating embeddings while training
    saved_embeddings = tf.constant(embedding_matrix)
    embedding = tf.Variable(initial_value=saved_embeddings, trainable=False)
    # create the cosine similarity operations
    norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keep_dims=True))
    normalized_embeddings = embedding / norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
    # Add variable initializer.
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        # call our similarity operation
        sim = similarity.eval()
        # run through each valid example, finding closest words
        for i in range(valid_size):
            valid_word = wv.index2word[valid_examples[i]]
            top_k = 8  # number of nearest neighbors
            nearest = (-sim[i, :]).argsort()[1:top_k + 1]
            log_str = 'Nearest to %s:' % valid_word
            for k in range(top_k):
                close_word = wv.index2word[nearest[k]]
                log_str = '%s %s,' % (log_str, close_word)
            print(log_str)
def keras_model(embedding_matrix, wv):
    valid_size = 16  # Random set of words to evaluate similarity on.
    valid_window = 100  # Only pick dev samples in the head of the distribution.
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)
    # input words - in this case we do sample by sample evaluations of the similarity
    valid_word = Input((1,), dtype='int32')
    other_word = Input((1,), dtype='int32')
    # setup the embedding layer
    embeddings = Embedding(input_dim=embedding_matrix.shape[0], output_dim=embedding_matrix.shape[1],
                           weights=[embedding_matrix])
    embedded_a = embeddings(valid_word)
    embedded_b = embeddings(other_word)
    similarity = merge([embedded_a, embedded_b], mode='cos', dot_axes=2)
    # create the Keras model
    k_model = Model(input=[valid_word, other_word], output=similarity)

    def get_sim(valid_word_idx, vocab_size):
        sim = np.zeros((vocab_size,))
        in_arr1 = np.zeros((1,))
        in_arr2 = np.zeros((1,))
        in_arr1[0,] = valid_word_idx
        for i in range(vocab_size):
            in_arr2[0,] = i
            out = k_model.predict_on_batch([in_arr1, in_arr2])
            sim[i] = out
        return sim

    # now run the model and get the closest words to the valid examples
    for i in range(valid_size):
        valid_word = wv.index2word[valid_examples[i]]
        top_k = 8  # number of nearest neighbors
        sim = get_sim(valid_examples[i], len(wv.vocab))
        nearest = (-sim).argsort()[1:top_k + 1]
        log_str = 'Nearest to %s:' % valid_word
        for k in range(top_k):
            close_word = wv.index2word[nearest[k]]
            log_str = '%s %s,' % (log_str, close_word)
        print(log_str)
if __name__ == "__main__":
    run_opt = 1
    if run_opt == 1:
        gensim_demo()
        print('gensim demo finished..')
    elif run_opt == 2:
        model = gensim.models.Word2Vec.load(root_path + "mymodel")
        embedding_matrix = create_embedding_matrix(model)
        tf_model(embedding_matrix, model.wv)
    elif run_opt == 3:
        model = gensim.models.Word2Vec.load(root_path + "mymodel")
        embedding_matrix = create_embedding_matrix(model)
        keras_model(embedding_matrix, model.wv)
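
A note on keras_model: the merge layer it uses belongs to the Keras 1.x API and was removed in Keras 2. As a minimal sketch (not part of the original example), the same word-to-word cosine similarity can be expressed in Keras 2 with the Dot layer, where normalize=True yields cosine proximity; this assumes an embedding_matrix built by create_embedding_matrix above.

# Sketch of the same cosine-similarity model with the Keras 2 functional API.
# Assumes embedding_matrix was built by create_embedding_matrix above.
from keras.layers import Input, Embedding, Dot
from keras.models import Model

def keras2_similarity_model(embedding_matrix):
    valid_word = Input((1,), dtype='int32')
    other_word = Input((1,), dtype='int32')
    embeddings = Embedding(input_dim=embedding_matrix.shape[0],
                           output_dim=embedding_matrix.shape[1],
                           weights=[embedding_matrix], trainable=False)
    embedded_a = embeddings(valid_word)
    embedded_b = embeddings(other_word)
    # Dot with normalize=True computes cosine proximity over the embedding axis
    similarity = Dot(axes=2, normalize=True)([embedded_a, embedded_b])
    return Model(inputs=[valid_word, other_word], outputs=similarity)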
Week 10 Class Materials
LSTM multivariate prediction example
Beijing air pollution prediction
from math import sqrt
from numpy import concatenate
from matplotlib import pyplot
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.
    Arguments:
        data: Sequence of observations as a list or NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame of series framed for supervised learning.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg
# load dataset
dataset = read_csv('pollution.csv', header=0, index_col=0)
values = dataset.values
# integer encode direction
encoder = LabelEncoder()
values[:,4] = encoder.fit_transform(values[:,4])
# ensure all data is float
values = values.astype('float32')
# normalize features
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)
# specify the number of lag hours
n_hours = 3
n_features = 8
# frame as supervised learning
reframed = series_to_supervised(scaled, n_hours, 1)
print(reframed.shape)
# split into train and test sets
values = reframed.values
n_train_hours = 365 * 24
train = values[:n_train_hours, :]
test = values[n_train_hours:, :]
# split into input and outputs
n_obs = n_hours * n_features
train_X, train_y = train[:, :n_obs], train[:, -n_features]
test_X, test_y = test[:, :n_obs], test[:, -n_features]
print(train_X.shape, len(train_X), train_y.shape)
# reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], n_hours, n_features))
test_X = test_X.reshape((test_X.shape[0], n_hours, n_features))
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)
# design network
model = Sequential()
model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dense(1))
model.compile(loss='mae', optimizer='adam')
# fit network
history = model.fit(train_X, train_y, epochs=50, batch_size=72, validation_data=(test_X, test_y), verbose=2, shuffle=False)
# plot history
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='test')
pyplot.legend()
pyplot.show()
# make a prediction
yhat = model.predict(test_X)
test_X = test_X.reshape((test_X.shape[0], n_hours*n_features))
# invert scaling for forecast
inv_yhat = concatenate((yhat, test_X[:, -7:]), axis=1)
inv_yhat = scaler.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:,0]
# invert scaling for actual
test_y = test_y.reshape((len(test_y), 1))
inv_y = concatenate((test_y, test_X[:, -7:]), axis=1)
inv_y = scaler.inverse_transform(inv_y)
inv_y = inv_y[:,0]
# calculate RMSE
rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)
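
As a rough sanity check on the RMSE above, a naive persistence baseline (use the pollution value at hour t-1 as the forecast for hour t) can be computed on the same rescaled test series; a minimal sketch, assuming inv_y and the imports from the script above:

# Persistence baseline: forecast pollution(t) with pollution(t-1) on the test series.
# Uses inv_y, sqrt and mean_squared_error from the script above.
persistence_pred = inv_y[:-1]
persistence_true = inv_y[1:]
persistence_rmse = sqrt(mean_squared_error(persistence_true, persistence_pred))
print('Persistence RMSE: %.3f' % persistence_rmse)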
LSTM Example Data (Beijing Air Quality)
Beijing air quality data
Preprocessing code ===============================
from pandas import read_csv
from datetime import datetime
# load data
def parse(x):
    return datetime.strptime(x, '%Y %m %d %H')
dataset = read_csv('BeijingRaw.csv', parse_dates = [['year', 'month', 'day', 'hour']], index_col=0, date_parser=parse)
dataset.drop('No', axis=1, inplace=True)
# manually specify column names
dataset.columns = ['pollution', 'dew', 'temp', 'press', 'wnd_dir', 'wnd_spd', 'snow', 'rain']
dataset.index.name = 'date'
# mark all NA values with 0
dataset['pollution'].fillna(0, inplace=True)
# drop the first 24 hours
dataset = dataset[24:]
# summarize first 5 rows
print(dataset.head(5))
# save to file
dataset.to_csv('BeijingPollution.csv')
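
A quick check of the saved file can confirm that no missing values remain and summarize each column; a minimal sketch, assuming BeijingPollution.csv was written by the code above:

# Sanity check on the preprocessed file written above.
from pandas import read_csv
check = read_csv('BeijingPollution.csv', header=0, index_col=0)
print(check.isna().sum())   # remaining missing values per column
print(check.describe())     # basic statistics for the numeric columns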
Visualizing the preprocessed data ========================
from pandas import read_csv
from matplotlib import pyplot
# load dataset
dataset = read_csv('BeijingPollution.csv', header=0, index_col=0)
values = dataset.values
# specify columns to plot
groups = [0, 1, 2, 3, 5, 6, 7]
i = 1
# plot each column
pyplot.figure()
for group in groups:
    pyplot.subplot(len(groups), 1, i)
    pyplot.plot(values[:, group])
    pyplot.title(dataset.columns[group], y=0.5, loc='right')
    i += 1
pyplot.show()
conversion to supervised data ====================
# imports needed for this snippet
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg
# load dataset
dataset = read_csv('BeijingPollution.csv', header=0, index_col=0)
values = dataset.values
# integer encode direction
encoder = LabelEncoder()
values[:,4] = encoder.fit_transform(values[:,4])
# ensure all data is float
values = values.astype('float32')
# normalize features
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)
# frame as supervised learning
reframed = series_to_supervised(scaled, 1, 1)
# drop columns we don't want to predict
reframed.drop(reframed.columns[[9,10,11,12,13,14,15]], axis=1, inplace=True)
print(reframed.head())
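
To make the var%d(t-%d) column naming and the dropped rows concrete, here is a small illustrative run of series_to_supervised on a toy two-variable series (the values are made up purely for illustration):

# Toy example of series_to_supervised: two variables, one lag step, one forecast step.
import numpy as np
toy = np.array([[1, 10],
                [2, 20],
                [3, 30],
                [4, 40]], dtype='float32')
print(series_to_supervised(toy, 1, 1))
# Expected columns: var1(t-1), var2(t-1), var1(t), var2(t); the first row is
# dropped because the one-step shift leaves NaNs in the lag columns.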