def eplison_bandit(K, P, V, R, T):
    """Play a K-armed bandit for T rounds with a decaying epsilon-greedy policy.

    In round t (0-based) the exploration rate is 1 / sqrt(t + 1). With that
    probability an arm is drawn uniformly from 0..K-1; otherwise the arm with
    the highest running-mean reward is pulled (np.argmax breaks ties in
    favour of the lowest index).

    Args:
        K: Number of arms.
        P: Passed through unchanged to R.
        V: Passed through unchanged to R.
        R: Reward callable, invoked as R(k, P, V) for the chosen arm k.
        T: Number of rounds to play.

    Returns:
        The sum of the rewards returned by R over all T rounds.
    """
    r = 0
    Q = np.zeros(K)      # running mean reward observed for each arm
    count = np.zeros(K)  # number of times each arm has been pulled
    for t in range(T):
        # Exploration rate shrinks over time, so early rounds explore more.
        epsilon = 1. / np.sqrt(t + 1)
        if random.random() < epsilon:
            # Explore: random.randint includes both endpoints.
            k = random.randint(0, K - 1)
        else:
            # Exploit the arm that currently looks best.
            k = np.argmax(Q)
        v = R(k, P, V)
        r += v
        # Incremental mean: avoids storing the full reward history per arm.
        Q[k] += (v - Q[k]) / (count[k] + 1)
        count[k] += 1
    return r
def main():
    """Run the epsilon-greedy bandit on a fixed 5-arm example and print the total.

    P and V are handed to the reward function R unchanged; presumably they
    are per-arm payout probabilities and payout values -- confirm against R,
    which is defined outside this block.
    """
    K = 5
    P = np.array([0.1, 0.9, 0.3, 0.2, 0.7])
    V = np.array([5, 3, 1, 7, 4])
    T = 1000000
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(eplison_bandit(K, P, V, R, T))
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
代码运行结果为:获得总价值 2795546(算法包含随机采样,每次运行得到的具体数值会略有不同)。
Softmax算法
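原理
Softmax 算法根据各摇臂当前的平均奖赏估计 $Q(k)$,按 Boltzmann 分布选择摇臂:$P(k) = \dfrac{e^{Q(k)/\tau}}{\sum_{i=1}^{K} e^{Q(i)/\tau}}$。其中 $\tau > 0$ 称为“温度”:$\tau$ 越小,平均奖赏高的摇臂被选中的概率越大(趋于“仅利用”);$\tau$ 越大,各摇臂被选中的概率越接近均匀(趋于“仅探索”)。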
实现代码
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# random drives arm sampling; numpy holds the per-arm estimates and counts.
import random

import numpy as np
def eplison_bandit(K, P, V, R, T, tau=0.1):
    """Play a K-armed bandit for T rounds with a softmax (Boltzmann) policy.

    Each round the running-mean rewards Q are divided by the temperature tau
    and passed to softmax(); an arm is then drawn by walking the cumulative
    sum of the resulting probabilities until it reaches a uniform random
    number.

    NOTE(review): the name looks inherited from the epsilon-greedy variant;
    it is kept unchanged so existing callers keep working. softmax() is not
    defined in this block -- it is assumed to return K probabilities that sum
    to (approximately) 1.

    Args:
        K: Number of arms.
        P: Passed through unchanged to R.
        V: Passed through unchanged to R.
        R: Reward callable, invoked as R(k, P, V) for the chosen arm k.
        T: Number of rounds to play.
        tau: Temperature dividing Q before softmax() is applied.

    Returns:
        The sum of the rewards returned by R over all T rounds.
    """
    r = 0
    Q = np.zeros(K)      # running mean reward observed for each arm
    count = np.zeros(K)  # number of times each arm has been pulled
    for _ in range(T):
        p = softmax(Q / tau)
        rand = random.random()
        # Fall back to the last arm: rounding can leave the cumulative sum a
        # hair below rand, which previously left k unassigned (or stale).
        k = K - 1
        total = 0.0
        for i in range(K):
            total += p[i]
            if total >= rand:
                k = i
                break
        v = R(k, P, V)
        r += v
        # Incremental mean: avoids storing the full reward history per arm.
        Q[k] += (v - Q[k]) / (count[k] + 1)
        count[k] += 1
    return r
def main():
    """Run the softmax bandit on a fixed 5-arm example and print the total.

    P and V are handed to the reward function R unchanged; presumably they
    are per-arm payout probabilities and payout values -- confirm against R,
    which is defined outside this block.
    """
    K = 5
    P = np.array([0.1, 0.9, 0.3, 0.2, 0.7])
    V = np.array([5, 3, 1, 7, 4])
    T = 1000000
    tau = 0.1
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(eplison_bandit(K, P, V, R, T, tau))