;;; learning/agents/exploring-tdq-learner.lisp
;;; Exploratory reinforcement learning agent using temporal differences.
;;; Works without a model: stochastic sampling of observed transitions
;;; mirrors the effect of averaging over a known transition model.

(defun make-exploring-tdq-learner (actions)
  "Return an agent function (a closure over its own learning state) that
performs exploratory temporal-difference Q-learning over the list ACTIONS.
The agent is called with an MDP percept and returns the action it takes."
  (let ((i nil)           ;;; the previous state visited
	(a nil)           ;;; the last action taken
	(Q (make-hash-table :test #'equal))   ;; state -> alist of (action . Q-value)
	(N (make-hash-table :test #'equal))   ;; state -> alist of (action . visit-count)
	(Ri nil))         ;;; reward received in state i
    #'(lambda (e)
	(let ((terminalp (mdp-percept-terminalp e))
	      (j (mdp-percept-state e))          ;; the state just arrived in
	      (reward (mdp-percept-reward e)))   ;; reward received in j
	  ;; First visit to j: initialize its Q-values to the observed
	  ;; reward and its action counts to zero.
	  (unless (gethash j Q)
	    (setf (gethash j Q) 
		  (mapcar #'(lambda (a) (cons a reward)) actions))
	    (setf (gethash j N) 
		  (mapcar #'(lambda (a) (cons a 0)) actions)))
	  ;; If there was a previous state, count this (i,a) occurrence
	  ;; and apply the TD update to Q[i,a] using the transition i -> j.
	  (when i
	    (incf (cdr (assoc a (gethash i N) :test #'eq)))
	    (update-exploratory-Q Q a i j N Ri))
	  (cond (terminalp 
		 ;; Terminal state: forget the previous state and pin every
		 ;; Q-value of j to the terminal reward.
		 (setq i nil)
		 (setf (gethash j Q) 
		       (mapcar #'(lambda (a) (cons a reward)) actions)))
		;; Otherwise remember j and its reward for the next update.
		(t (setq i j Ri reward)))
	  ;; Choose (and return) the next action via the exploration policy.
	  (setq a (exploration-q-choice j Q N))))))


(defun update-exploratory-Q (Q a i j N Ri)
  "Apply the temporal-difference Q-learning update for the transition
from state I (where action A was taken and reward RI received) to state J:
   Q[i,a] += alpha(N[i,a]) * (Ri + max_a' Q[j,a'] - Q[i,a])
where the learning rate alpha is derived from the visit count N[i,a]."
  ;; Use :test #'equal, matching exploration-q-choice and the EQUAL hash
  ;; tables, so actions that are EQUAL but not EQ are still found.
  (let* ((alpha (current-alpha (cdr (assoc a (gethash i N) :test #'equal))))
	 (td-error (+ Ri
		      (- (apply #'max (all-q-entries j Q))
			 (q-entry Q a i)))))
    (incf (cdr (assoc a (gethash i Q) :test #'equal))
	  (* alpha td-error))))

(defun exploration-q-choice (s Q N)
  "Choose an action in state S, trading off the estimated value Q[s,a]
against the visit count N[s,a] via EXPLORATION-FUNCTION; ties among
equally attractive actions are broken at random."
  (flet ((optimistic-value (action)
	   (let ((visits (cdr (assoc action (gethash s N) :test #'equal))))
	     (exploration-function (q-entry Q action s) visits))))
    (the-biggest-random-tie #'optimistic-value (q-actions s Q))))

