From 3f69bcaea37a6e35d18a078cd6c07fbb3685d4c3 Mon Sep 17 00:00:00 2001
From: Kendall Chuang <kendallc@gmail.com>
Date: Tue, 3 Jan 2017 10:17:04 -0800
Subject: [PATCH] Refactoring code in tf_model to be more readable and
 modularized using helper methods

---
 main.py                  |   2 -
 tests/test_train.py      |   4 ++
 wine_quality/tf_model.py | 134 +++++++++++++++++++++++----------------
 3 files changed, 83 insertions(+), 57 deletions(-)

diff --git a/main.py b/main.py
index e74c991..4ebcaf3 100644
--- a/main.py
+++ b/main.py
@@ -31,7 +31,6 @@ def test_parameters():
     form = TestParameterForm(request.form)
     if request.method == 'POST' and form.validate():
         print(form.__dict__)
-        # simple([[0.7, 0, 1.9, 0.076, 11, 34, 0.99780, 3.51, 0.56, 9.4]])
         results = simple([[0.7, 0, 1.9, 0.076, 11, 34, 0.99780, 3.51, 0.56, 9.4]])
         return render_template('test_parameters.html', form=form, result=results[0])
 
@@ -47,7 +46,6 @@ def upload():
         batch_size = int(form.batch_size.data)
         filename = secure_filename(form.training_data.data.filename)
         print(form.__dict__)
-        # Save to Redis here
         form.training_data.data.save('wine_quality/data/' + filename)
         dataframe = pd.read_csv('wine_quality/data/' + filename, sep=',')
         my_model.train(dataframe, learning_rate, batch_size, model_name)
diff --git a/tests/test_train.py b/tests/test_train.py
index 997db4d..1d743f3 100644
--- a/tests/test_train.py
+++ b/tests/test_train.py
@@ -6,3 +6,7 @@ def test_train():
     model = tf_model()
     dataframe = pd.read_csv('wine_quality/data/winequality-red.csv', sep=',')
     model.train(dataframe, learning_rate=0.5, batch_size=126, model_name='model_name')
+
+
+if __name__ == '__main__':
+    test_train()
diff --git a/wine_quality/tf_model.py b/wine_quality/tf_model.py
index 701a02b..45dbdda 100644
--- a/wine_quality/tf_model.py
+++ b/wine_quality/tf_model.py
@@ -7,47 +7,87 @@
 from boto.s3.connection import S3Connection
 
 
-# Remove outliers
-def _outliers(df, threshold, columns):
+def _remove_outliers(wine_df, threshold, columns):
+    """ Replaces, in place, values above mean + threshold * std in each of `columns`
+        with the mean of that column's remaining values, and returns the dataframe """
     for col in columns:
-        mask = df[col] > float(threshold) * df[col].std() + df[col].mean()
-        df.loc[mask==True, col] = np.nan  # noqa
-        mean_property = df.loc[:, col].mean()
-        df.loc[mask==True, col] = mean_property  # noqa
-    return df
+        mask = wine_df[col] > float(threshold) * wine_df[col].std() + wine_df[col].mean()
+        wine_df.loc[mask==True, col] = np.nan  # noqa
+        mean_property = wine_df.loc[:, col].mean()
+        wine_df.loc[mask==True, col] = mean_property  # noqa
+    return wine_df
 
 
-def _dense_to_one_hot(labels_dense, num_classes=2):
-    # Convert class labels from scalars to one-hot vectors
-    num_labels = len(labels_dense)
+def _dense_to_one_hot(dense_labels, num_classes=2):
+    """ Converts integer labels to a (num_labels, num_classes) one-hot array; assumes 0 <= label < num_classes """
+    num_labels = len(dense_labels)
     index_offset = np.arange(num_labels) * num_classes
     labels_one_hot = np.zeros((num_labels, num_classes))
-    labels_one_hot.flat[index_offset + labels_dense] = 1
+    labels_one_hot.flat[index_offset + dense_labels] = 1
     return labels_one_hot
 
 
+def _filter_wine_to_categories(wine_df, categories={'Bad', 'Good'}):
+    """ Returns a copy of the rows whose 'category' value is in the categories set """
+    return wine_df[wine_df['category'].isin(categories)].copy()
+
+
+def _create_and_separate_bins(wine_df, bins, new_bins, labels, new_labels,
+                              categories={'Bad', 'Good'}):
+    """ Bins quality with bins/labels into 'category', keeps only the rows whose category
+        is in categories, then bins the kept rows with new_bins/new_labels into 'quality_bins' """
+    # Filter on the fine-grained bins so that excluded classes (e.g. 'Average') are dropped
+    wine_df['category'] = pd.cut(wine_df.quality,
+                                 bins, labels=labels, include_lowest=True)
+    wine_df = _filter_wine_to_categories(wine_df, categories)
+
+    wine_df['quality_bins'] = pd.cut(wine_df.quality,
+                                     new_bins, labels=new_labels, include_lowest=True)
+    return wine_df
+
+
+def _get_x_one_hot_values(wine_df):
+    """ Retrieves the feature matrix: every column except the first one and the label columns """
+    label_columns = ['quality', 'quality_bins', 'category']  # dropped by name, not by position
+    return wine_df.drop(label_columns, axis=1, errors='ignore').iloc[:, 1:].values
+
+
+def _get_y_one_hot_values(wine_df, labels_dict={'Bad': '1', 'Good': '0'}):
+    """ Maps the 'quality_bins' labels through labels_dict and returns them one-hot encoded """
+    y_red_wine = wine_df[['quality_bins']].values
+
+    y_red_wine_raveled = y_red_wine.ravel()
+    # Look every label up in labels_dict; the previous loop restarted from the raw
+    # labels for each key, so only the last key was ever replaced
+    y_red_wine_integers = [int(labels_dict[y]) for y in y_red_wine_raveled]
+
+    y_one_hot = _dense_to_one_hot(y_red_wine_integers, num_classes=2)
+    return y_one_hot
+
+
 class tf_model():
+    """ This class provides access to a TensorFlow model. It provides helper
+        methods to save and load the core model from local or AWS S3 storage,
+        and also allows training of the model with input parameters. """
 
     def __init__(self):
         self.sess = tf.Session()
-        x = tf.placeholder("float", [None, 10])
 
         with tf.variable_scope("softmax_regression"):
             self.X = tf.placeholder("float", [None, 10])
-            self.y1, self.variables = softmax_regression(x)
+            self.y1, self.variables = softmax_regression(self.X)
             self.saver = tf.train.Saver(self.variables)
 
-    def run_model(self, x1):
+    def predict(self, x1):
         return self.sess.run(self.y1, feed_dict={self.X: x1})
 
-    def load_locally(self):
-        # Load the data
-        self.saver.restore(self.sess, "wine_quality/data/softmax_regression.ckpt")
-
-    def save_locally(self, filename):
-        path = self.saver.save(self.sess, os.path.join(os.path.dirname(__file__), filename))
+    def save_locally(self, filename='softmax_regression.ckpt'):
+        path = self.saver.save(self.sess, '/tmp/wine_quality/' + filename)
         return path
 
+    def load_locally(self, filename='softmax_regression.ckpt'):
+        self.saver.restore(self.sess, '/tmp/wine_quality/' + filename)
+
     def save_to_s3(self, filename, model_name):
         try:
             AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID')
@@ -77,45 +117,31 @@ def load_from_s3(self, filename, model_name):
             return False
         return True
 
-    def train(self, training_df, learning_rate=0.001, batch_size=126, model_name="softmax_model"):
+    def train(self, training_df, learning_rate=0.001, batch_size=126, model_name="softmax_model", filename="data/softmax_regression.ckpt"):
         column_list = training_df.columns.tolist()
         threshold = 5
+        training_df = _remove_outliers(training_df.copy(), threshold, column_list[0:-1])  # copy: keep caller's frame intact
 
-        red_wine_cleaned = training_df.copy()
-        red_wine_cleaned = _outliers(red_wine_cleaned, threshold, column_list[0:-1])
-
-        # Bin the data
         bins = [3, 5, 6, 8]
-        red_wine_cleaned['category'] = pd.cut(red_wine_cleaned.quality, bins, labels=['Bad', 'Average', 'Good'],
-                                              include_lowest=True)
+        new_bins = [3, 5, 8]
+        labels = ['Bad', 'Average', 'Good']
+        new_labels = ['Bad', 'Good']
+        training_df = _create_and_separate_bins(training_df, bins,
+                                                new_bins, labels, new_labels)
 
-        # Only include 'Bad' and 'Good' categories
-        red_wine_newcats = red_wine_cleaned[red_wine_cleaned['category'].isin(['Bad', 'Good'])].copy()
-
-        bins = [3, 5, 8]
-        red_wine_newcats['category'] = pd.cut(red_wine_newcats.quality,
-                                              bins, labels=['Bad', 'Good'], include_lowest=True)
-
-        y_red_wine = red_wine_newcats[['category']].get_values()
-
-        # Removing fixed_acidity and quality
-        X_red_wine = red_wine_newcats.iloc[:, 1:-2].get_values()
-
-        y_red_wine_raveled = y_red_wine.ravel()
-        y_red_wine_integers = [y.replace('Bad', '1') for y in y_red_wine_raveled]
-        y_red_wine_integers = [y.replace('Good', '0') for y in y_red_wine_integers]
-        y_red_wine_integers = [np.int(y) for y in y_red_wine_integers]
-
-        y_one_hot = _dense_to_one_hot(y_red_wine_integers, num_classes=2)
+        labels_dict = {'Bad': '1', 'Good': '0'}
+        y_one_hot = _get_y_one_hot_values(training_df, labels_dict)  # name must match the split call below
+        X_red_wine = _get_x_one_hot_values(training_df)
 
         X_train, X_test, y_train, y_test = train_test_split(X_red_wine, y_one_hot, test_size=0.2, random_state=42)
-        # model
+        self._train_with_gradient_descent(X_train, X_test, y_train, y_test, learning_rate, batch_size)
 
-        with tf.variable_scope("softmax_regression"):
-            X = tf.placeholder("float", [None, 10])
-            y, variables = softmax_regression(X)
+        path = self.save_locally(filename)
+        self.save_to_s3(path, model_name)
+        print("Saved:", path)
 
-        # train
+    def _train_with_gradient_descent(self, X_train, X_test, y_train, y_test, learning_rate=0.001, batch_size=126):
+        X, y = self.X, self.y1  # training setup reuses the placeholder and output built in __init__
         y_ = tf.placeholder("float", [None, 2])
         cost = -tf.reduce_mean(y_ * tf.log(y))
         optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
@@ -124,6 +150,8 @@ def train(self, training_df, learning_rate=0.001, batch_size=126, model_name="so
 
         init = tf.initialize_all_variables()
         self.sess.run(init)
+
+        # Run gradient descent on parameters
         for i in range(100):
             average_cost = 0
             number_of_batches = int(len(X_train) / batch_size)
@@ -132,9 +160,5 @@ def train(self, training_df, learning_rate=0.001, batch_size=126, model_name="so
                 # Compute average loss
                 average_cost += self.sess.run(cost, feed_dict={X: X_train[start:end],
                                               y_: y_train[start:end]}) / number_of_batches
-            print(self.sess.run(accuracy, feed_dict={X: X_test, y_: y_test}))
+            print(self.sess.run(accuracy, feed_dict={self.X: X_test, y_: y_test}))
 
-        filename = "data/softmax_regression.ckpt"
-        path = self.save_locally(filename)
-        self.save_to_s3(path, model_name)
-        print("Saved:", path)