mackcmillion February 2016

Tensorflow Training using input queue gets stuck

I am trying to build a NN training similar to the one in this tutorial.

My code looks as follows:

def train():
    init_op = tf.initialize_all_variables()
    sess = tf.Session()
    sess.run(init_op)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    step = 0

    try:
        while not coord.should_stop():
            step += 1
            print 'Training step %i' % step
            training = train_op()
            sess.run(training)

    except tf.errors.OutOfRangeError:
        print 'Done training - epoch limit reached.'
    finally:
        coord.request_stop()

    coord.join(threads)
    sess.close()

with

MIN_NUM_EXAMPLES_IN_QUEUE = 10
NUM_PRODUCING_THREADS = 1
NUM_CONSUMING_THREADS = 1

def train_op():
    images, true_labels = inputs()
    predictions = NET(images)
    true_labels = tf.cast(true_labels, tf.float32)
    loss = tf.nn.softmax_cross_entropy_with_logits(predictions, true_labels)
    return OPTIMIZER.minimize(loss)


def inputs():
    filenames = [os.path.join(FLAGS.train_dir, filename) 
        for filename in os.listdir(FLAGS.train_dir) 
        if os.path.isfile(os.path.join(FLAGS.train_dir, filename))]
    filename_queue = tf.train.string_input_producer(filenames,
        num_epochs=FLAGS.training_epochs, shuffle=True)

    example_list = [_read_and_preprocess_image(filename_queue) 
        for _ in xrange(NUM_CONSUMING_THREADS)]

    image_batch, label_batch = tf.train.shuffle_batch_join(
        example_list,
        batch_size=FLAGS.batch_size,
        capacity=MIN_NUM_EXAMPLES_IN_QUEUE + (NUM_CONSUMING_THREADS + 2) * FLAGS.batch_size,
        min_after_dequeue=MIN_NUM_EXAMPLES_IN_QUEUE)

    return image_batch, label_batch

The tutorial says

Answers


dga February 2016

You are redefining your network every time you try to run the training loop.

Remember that TensorFlow defines an execution graph, and then executes it. You want to call your train_op() outside of the run loop, and you need to define that graph BEFORE you call initialize_all_variables and tf.train.start_queue_runners

Post Status

Asked in February 2016
Viewed 3,796 times
Voted 6
Answered 1 times

Search




Leave an answer