From 3f69bcaea37a6e35d18a078cd6c07fbb3685d4c3 Mon Sep 17 00:00:00 2001 From: Kendall Chuang Date: Tue, 3 Jan 2017 10:17:04 -0800 Subject: [PATCH] Refactoring code in tf_model to be more readable and modularized using helper methods --- main.py | 2 - tests/test_train.py | 4 ++ wine_quality/tf_model.py | 134 +++++++++++++++++++++++---------------- 3 files changed, 83 insertions(+), 57 deletions(-) diff --git a/main.py b/main.py index e74c991..4ebcaf3 100644 --- a/main.py +++ b/main.py @@ -31,7 +31,6 @@ def test_parameters(): form = TestParameterForm(request.form) if request.method == 'POST' and form.validate(): print(form.__dict__) - # simple([[0.7, 0, 1.9, 0.076, 11, 34, 0.99780, 3.51, 0.56, 9.4]]) results = simple([[0.7, 0, 1.9, 0.076, 11, 34, 0.99780, 3.51, 0.56, 9.4]]) return render_template('test_parameters.html', form=form, result=results[0]) @@ -47,7 +46,6 @@ def upload(): batch_size = int(form.batch_size.data) filename = secure_filename(form.training_data.data.filename) print(form.__dict__) - # Save to Redis here form.training_data.data.save('wine_quality/data/' + filename) dataframe = pd.read_csv('wine_quality/data/' + filename, sep=',') my_model.train(dataframe, learning_rate, batch_size, model_name) diff --git a/tests/test_train.py b/tests/test_train.py index 997db4d..1d743f3 100644 --- a/tests/test_train.py +++ b/tests/test_train.py @@ -6,3 +6,7 @@ def test_train(): model = tf_model() dataframe = pd.read_csv('wine_quality/data/winequality-red.csv', sep=',') model.train(dataframe, learning_rate=0.5, batch_size=126, model_name='model_name') + + +if __name__ == '__main__': + test_train() diff --git a/wine_quality/tf_model.py b/wine_quality/tf_model.py index 701a02b..45dbdda 100644 --- a/wine_quality/tf_model.py +++ b/wine_quality/tf_model.py @@ -7,47 +7,87 @@ from boto.s3.connection import S3Connection -# Remove outliers -def _outliers(df, threshold, columns): +def _remove_outliers(wine_df, threshold, columns): + """ Removes the outliers from the dataframe based on value greater than threshold + number of standard deviations """ for col in columns: - mask = df[col] > float(threshold) * df[col].std() + df[col].mean() - df.loc[mask==True, col] = np.nan # noqa - mean_property = df.loc[:, col].mean() - df.loc[mask==True, col] = mean_property # noqa - return df + mask = wine_df[col] > float(threshold) * wine_df[col].std() + wine_df[col].mean() + wine_df.loc[mask==True, col] = np.nan # noqa + mean_property = wine_df.loc[:, col].mean() + wine_df.loc[mask==True, col] = mean_property # noqa + return wine_df -def _dense_to_one_hot(labels_dense, num_classes=2): - # Convert class labels from scalars to one-hot vectors - num_labels = len(labels_dense) +def _dense_to_one_hot(dense_labels, num_classes=2): + """ Converts dense label data to one-hot encoded vectors """ + num_labels = len(dense_labels) index_offset = np.arange(num_labels) * num_classes labels_one_hot = np.zeros((num_labels, num_classes)) - labels_one_hot.flat[index_offset + labels_dense] = 1 + labels_one_hot.flat[index_offset + dense_labels] = 1 return labels_one_hot +def _filter_wine_to_categories(wine_df, categories={'Bad', 'Good'}): + """ Filters out the rows for which categories are not in category set """ + return wine_df.ix[[item in categories for item in wine_df['category']],:] + + +def _create_and_separate_bins(wine_df, bins, new_bins, labels, new_labels, + categories={'Bad', 'Good'}): + """ Creates the categories to bin the data into and filters out the rows that + are not in the categories """ + wine_df['quality_bins'] = pd.cut(wine_df.quality, bins, labels, include_lowest=True) + wine_df['category'] = pd.cut(wine_df.quality, + new_bins, labels=new_labels, include_lowest=True) + wine_df = _filter_wine_to_categories(wine_df, categories) + + wine_df['quality_bins'] = pd.cut(wine_df.quality, + bins, labels=['Bad', 'Good'], include_lowest=True) + return wine_df + + +def _get_x_one_hot_values(wine_df): + """ Retrieves the one-hot feature data from the full dataframe """ + X_red_wine = wine_df.iloc[:, 1:-2].get_values() + return X_red_wine + + +def _get_y_one_hot_values(wine_df, labels_dict={'Bad': '1', 'Good': '1'}): + """ Retrieves the one-hot label data from the full dataframe """ + y_red_wine = wine_df[['quality_bins']].get_values() + + y_red_wine_raveled = y_red_wine.ravel() + for key, value in labels_dict.items(): + y_red_wine_integers = [y.replace(key, value) for y in y_red_wine_raveled] + y_red_wine_integers = [np.int(y) for y in y_red_wine_integers] + + y_one_hot = _dense_to_one_hot(y_red_wine_integers, num_classes=2) + return y_one_hot + + class tf_model(): + """ This is a class method to enable access to a TensorFlow model. It + provides helper methods to save and load the core model from local or AWS S3 + storage, and also allows training of the model with input parameters. """ def __init__(self): self.sess = tf.Session() - x = tf.placeholder("float", [None, 10]) with tf.variable_scope("softmax_regression"): self.X = tf.placeholder("float", [None, 10]) - self.y1, self.variables = softmax_regression(x) + self.y1, self.variables = softmax_regression(self.X) self.saver = tf.train.Saver(self.variables) - def run_model(self, x1): + def predict(self, x1): return self.sess.run(self.y1, feed_dict={self.X: x1}) - def load_locally(self): - # Load the data - self.saver.restore(self.sess, "wine_quality/data/softmax_regression.ckpt") - - def save_locally(self, filename): - path = self.saver.save(self.sess, os.path.join(os.path.dirname(__file__), filename)) + def save_locally(self, filename='softmax_regression.ckpt'): + path = self.saver.save(self.sess, '/tmp/wine_quality/' + filename) return path + def load_locally(self, filename='softmax_regression.ckpt'): + self.saver.restore(self.sess, '/tmp/wine_quality/' + filename) + def save_to_s3(self, filename, model_name): try: AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID') @@ -77,45 +117,31 @@ def load_from_s3(self, filename, model_name): return False return True - def train(self, training_df, learning_rate=0.001, batch_size=126, model_name="softmax_model"): + def train(self, training_df, learning_rate=0.001, batch_size=126, model_name="softmax_model", filename="data/softmax_regression.ckpt"): column_list = training_df.columns.tolist() threshold = 5 + training_df = _remove_outliers(training_df, threshold, column_list[0:-1]) - red_wine_cleaned = training_df.copy() - red_wine_cleaned = _outliers(red_wine_cleaned, threshold, column_list[0:-1]) - - # Bin the data bins = [3, 5, 6, 8] - red_wine_cleaned['category'] = pd.cut(red_wine_cleaned.quality, bins, labels=['Bad', 'Average', 'Good'], - include_lowest=True) + new_bins = [3, 5, 8] + labels = ['Bad', 'Average', 'Good'] + new_labels = ['Bad', 'Good'] + training_df = _create_and_separate_bins(training_df, bins, + new_bins, labels, new_labels) - # Only include 'Bad' and 'Good' categories - red_wine_newcats = red_wine_cleaned[red_wine_cleaned['category'].isin(['Bad', 'Good'])].copy() - - bins = [3, 5, 8] - red_wine_newcats['category'] = pd.cut(red_wine_newcats.quality, - bins, labels=['Bad', 'Good'], include_lowest=True) - - y_red_wine = red_wine_newcats[['category']].get_values() - - # Removing fixed_acidity and quality - X_red_wine = red_wine_newcats.iloc[:, 1:-2].get_values() - - y_red_wine_raveled = y_red_wine.ravel() - y_red_wine_integers = [y.replace('Bad', '1') for y in y_red_wine_raveled] - y_red_wine_integers = [y.replace('Good', '0') for y in y_red_wine_integers] - y_red_wine_integers = [np.int(y) for y in y_red_wine_integers] - - y_one_hot = _dense_to_one_hot(y_red_wine_integers, num_classes=2) + labels_dict = {'Bad': '1', 'Good': '0'} + y_red_wine = _get_y_one_hot_values(training_df, labels_dict) + X_red_wine = _get_x_one_hot_values(training_df) X_train, X_test, y_train, y_test = train_test_split(X_red_wine, y_one_hot, test_size=0.2, random_state=42) - # model + _train_with_gradient_descent(X_train, X_test, y_train, y_test) - with tf.variable_scope("softmax_regression"): - X = tf.placeholder("float", [None, 10]) - y, variables = softmax_regression(X) + path = self.save_locally(filename) + self.save_to_s3(path, model_name) + print("Saved:", path) - # train + def _train_with_gradient_descent(X_train, X_test, y_train, y_test): + # Initialize training setup for gradient descent y_ = tf.placeholder("float", [None, 2]) cost = -tf.reduce_mean(y_ * tf.log(y)) optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost) @@ -124,6 +150,8 @@ def train(self, training_df, learning_rate=0.001, batch_size=126, model_name="so init = tf.initialize_all_variables() self.sess.run(init) + + # Run gradient descent on parameters for i in range(100): average_cost = 0 number_of_batches = int(len(X_train) / batch_size) @@ -132,9 +160,5 @@ def train(self, training_df, learning_rate=0.001, batch_size=126, model_name="so # Compute average loss average_cost += self.sess.run(cost, feed_dict={X: X_train[start:end], y_: y_train[start:end]}) / number_of_batches - print(self.sess.run(accuracy, feed_dict={X: X_test, y_: y_test})) + print(self.sess.run(accuracy, feed_dict={self.X: X_test, y_: y_test})) - filename = "data/softmax_regression.ckpt" - path = self.save_locally(filename) - self.save_to_s3(path, model_name) - print("Saved:", path)