;;; learning/algorithms/q-iteration.lisp
;;; Data structures and algorithms for calculating the Q-table for an
;;; MDP. Q(a,i) is the value of doing action a in state i.

(defun q-entry (Q a i) (cdr (assoc a (gethash i Q) :test #'eq)))

(defun all-q-entries (i Q) (mapcar #'cdr (gethash i Q)))

(defun q-actions (s Q) (mapcar #'car (gethash s Q)))

;;; Given an MDP, determine the q-values of the states.
;;; Q-iteration iterates on the Q-values instead of the U-values.
;;; Basic equation is Q(a,i) <- R(i) + sum_j M(a,i,j) max_a' Q(a',j)
;;; where Q(a',j) MUST be the old value not the new.
(defun q-iteration (mdp &optional (Qold nil)
			&key (epsilon 0.000001)
			&aux Qnew
			     (max-delta infinity) 
			     (M (mdp-model mdp))
			     (R (mdp-rewards mdp)))
  (unless Qold
    (setq Qold (make-hash-table :test #'equal))
    (maphash #'(lambda (s r)
		 (setf (gethash s Qold)
		       (mapcar #'(lambda (a) (cons a r)) (actions s M))))
	     R))
  (setq Qnew (copy-hash-table Qold #'identity))
  (do ()
      ((< max-delta epsilon) Qnew)
    (setq max-delta 0)
    (maphash #'(lambda (s a-qs) 
		 (dolist (a-q a-qs) (setf (cdr a-q) 
					  (q-entry Qnew (car a-q) s))))
	     Qold)
    (maphash 
     #'(lambda (i a-qs)
	 (let ((new-a-qs (gethash i Qnew)))
	   (dolist (a-q a-qs)
	     (let ((a (car a-q)) (q (cdr a-q)))
	       (unless (sink? i M)
		 (setf (cdr (assoc a new-a-qs :test #'eq))
		       (+ (gethash i R)
			  (average-successor-q a i Qold M))))
	       (setq max-delta (max max-delta 
				(abs (- (q-entry Qnew a i) q))))))))
     Qold)))

(defun average-successor-q (a i Q M &aux (sum 0))
  (dolist (transition (transitions a i M) sum)
    (let* ((j (transition-destination transition)) 
	   (p (transition-probability transition)) 
	   (qjs (gethash j Q)))
      (incf sum 
	    (if qjs
		(* p (cdr (the-biggest #'cdr qjs)))
	      0)))))




;;; Compute optimal policy from Q table
(defun q-optimal-policy (Q &aux (P (make-hash-table :test #'equal)))
  (maphash #'(lambda (s a-qs) (declare (ignore a-qs))
	       (setf (gethash s P) 
		     (list (list (q-dmax-choice s Q) 1.0))))
	   Q)
  P)


;;; Choice functions select an action under specific circumstances

;;; Pick a random action
(defun q-random-choice (s Q)
  (random-element (q-actions s Q)))

;;; Pick the currently best action
(defun q-dmax-choice (s Q)
  (car (the-biggest #'cdr (gethash s Q))))

;;; Pick the currently best action with tie-breaking
(defun q-max-choice (s Q)
  (car (the-biggest-random-tie #'cdr (gethash s Q))))