;;;; Code for performance assessment of DP and RL algorithms.

;;; Makes extensive use of global variables to minimize interference with the
;;; algorithms themselves.

(defvar *policy-fn*)   ;;; the policy used by the agent in acting
(defvar *correct-U*)
(defvar *correct-M*)
(defvar *correct-R*)

;;;; U2 is the correct utility table
;;;; assume U1, U2 have the same states
(defun u-rms-error (U1 U2 &aux (n 0) (e 0)) 
  (maphash #'(lambda (s u)
	       (incf n)
	       (incf e (square (- u (gethash s U2)))))
	   U1)
  (sqrt (/ e n)))

;;; The policy loss of a utility function U for an mdp is defined as the
;;; difference in utility between the corresponding policy and the optimal
;;; policy, for the agent's current state. Calculate using
;;; value determination wrt the current policy 

(defun loss (mdp U &aux (U2 (copy-hash-table U #'identity))
		        (M (mdp-model mdp))
			(R (mdp-rewards mdp)))
  (maphash #'(lambda (s md) (declare (ignore md))
	       (unless (gethash s U2) (setf (gethash s U2) 0)))
	   *correct-R*)       ;;; fill in missing entries if any
  (setq U2 (value-determination (funcall *policy-fn* U M R) 
				U2 *correct-M* *correct-R*))
  (- (gethash (mdp-initial-state mdp) *correct-U*)
     (gethash (mdp-initial-state mdp) U2)))