Pythonによる強化学習、ゲームAIの実装を目指して【プログラムあり】

強化学習は、何ができるのでしょうか?

このような疑問にお答えします。

強化学習（きょうかがくしゅう、英: reinforcement learning）とは、ある環境内におけるエージェントが、現在の状態を観測し、取るべき行動を決定する問題を扱う機械学習の一種。エージェントは行動を選択することで環境から報酬を得る。強化学習は一連の行動を通じて報酬が最も多く得られるような方策（policy）を学習する。(wikipediaより抜粋)

上記を整理すると、以下の手順で方策を学習することになります。

1. エージェントが時刻 t にて状態 s_t を観測
2. 観測した状態 s_t から行動 a_t を選択
3. エージェントが行動 a_t を実行
4. 行動 a_t の実行に伴い、状態が s_{t+1} に遷移
5. 遷移に応じた報酬 r_{t+1} を獲得
6. 状態、行動、報酬のパラメータに基づき学習を実施
7. 手順 1. から再度繰り返す

強化学習は、このような性質を利用して、ロボット制御、オセロ等のゲームに応用されています。

強化学習の問題を解くための方法として、モンテカルロ木探索法があります。この方法では、何らかの報酬が得られるまで行動を試みて、その報酬値を獲得してから、一連の状態と行動に対して、報酬を分配し、行動に対する価値基準を更新することができます。この手法では、数手先の未来を見てから一気に価値の更新を実施するため、数手先の行動の報酬も初期段階で把握することができる特徴を持ちます。

実際にリバーシに対してモンテカルロ木探索法を適用することを考えてみます。状態sは、ゲームの盤面、行動aは、ゲームの盤面に対して、可能な選択肢となります。

モンテカルロ木探索法の適用イメージは以下のようになります。

1. エージェント(プレイヤー)が時刻 t に状態 s_t (ゲーム盤の配置)を観測
2. 観測した状態 s_t から行動 a_t (可能な選択肢)を選択
3. エージェント(プレイヤー)が行動 a_t を実行
4. 行動 a_t の実行に伴い、状態(ゲーム盤)が s_{t+1} に遷移
5. ゲームの勝敗が決まるまで行動と状態遷移を繰り返す
6. 遷移に応じた報酬 r_{t+1} (勝敗)を獲得
7. 状態、行動、報酬のパラメータに基づき価値(勝率)の高い行動をプレイヤーが選択する
8. 手順 1. から再度繰り返す

以下、ソースコードを記載します。

このプログラムでは、可能な選択肢からランダムにゲームを行うAIと
勝率の高い選択肢を算出しゲームを行うAIを対戦させています。

# coding: utf-8

import numpy as np

class Reverci:
	"""Rules engine for Reversi on a square NumPy integer board.

	Each cell holds ``empty`` (0), ``black`` (1) or ``white`` (-1).
	Positions are ``(x, y)`` tuples and are used directly as the board
	index, i.e. ``board[x, y]``. All methods are pure with respect to the
	board they receive: ``fn_put`` works on a copy.
	"""

	def __init__(self):
		self.empty = 0
		self.black = 1
		self.white = -1

		self.board_size = 8

		self.board_range = range(self.board_size)
		self.board_positions = [(x, y) for x in self.board_range for y in self.board_range]
		# The eight neighbour offsets; the null offset (0, 0) is excluded.
		self.board_direction = [
			(dx, dy)
			for dx in (0, -1, 1)
			for dy in (0, -1, 1)
			if (dx, dy) != (0, 0)
		]

	def fn_create_board(self):
		"""Return a fresh board with the four central starting stones."""
		board = np.zeros([self.board_size, self.board_size], dtype=int)
		low = self.board_size // 2 - 1
		high = self.board_size // 2
		# White occupies one diagonal of the centre square, black the other.
		board[low, low] = self.white
		board[high, high] = self.white
		board[low, high] = self.black
		board[high, low] = self.black

		return board

	def fn_output_board(self, board):
		"""Print the board: columns are labelled 1-8, rows A-H.

		Black stones are drawn as ``$``, white stones as ``#`` and every
		other cell as ``.``.
		"""
		symbols = {1: " $", -1: " #"}
		header = "".join(" " + "12345678"[x] for x in range(self.board_size))
		print(" " + header)
		for y in range(self.board_size):
			cells = "".join(
				symbols.get(int(board[x, y]), " .") for x in range(self.board_size)
			)
			print("ABCDEFGH"[y] + cells)

	def fn_check_on_board(self, position):
		"""Return True when ``position`` lies inside the board."""
		return (
			0 <= position[0] < self.board_size
			and 0 <= position[1] < self.board_size
		)

	def _fn_flips(self, board, color, position, direction):
		"""Return the cells captured from ``position`` along ``direction``.

		Walks away from ``position`` over a run of non-empty cells that are
		not ``color``. The run counts only if it is terminated by a ``color``
		stone; leaving the board or reaching an empty cell yields ``[]``.
		"""
		captured = []
		pos = (position[0] + direction[0], position[1] + direction[1])
		while self.fn_check_on_board(pos) and board[pos] != self.empty:
			if board[pos] == color:
				return captured
			captured.append(pos)
			pos = (pos[0] + direction[0], pos[1] + direction[1])
		return []

	def fn_check_put(self, board, color=None, position=None):
		"""Return True when ``color`` may legally play at ``position``.

		A move is legal when the cell is on the board, empty, and captures
		at least one stone in some direction. Missing ``color`` or
		``position`` is reported as an illegal move.
		"""
		if color is None or position is None:
			return False
		if not self.fn_check_on_board(position) or board[position] != self.empty:
			return False

		return any(
			self._fn_flips(board, color, position, ds)
			for ds in self.board_direction
		)

	def fn_judge(self, board):
		"""Return the colour with more stones, or ``empty`` for a draw."""
		result = np.sum(board)
		if result > 0:
			return self.black
		if result < 0:
			return self.white
		return self.empty

	def fn_hands(self, board, color):
		"""Return every legal move for ``color`` in ``board_positions`` order."""
		return [
			pos for pos in self.board_positions
			if self.fn_check_put(board, color, pos)
		]

	def fn_put(self, board, color, position):
		"""Return a copy of ``board`` after ``color`` plays at ``position``.

		The input board is never modified. For a missing argument, an
		off-board position or an occupied cell the method returns ``False``
		instead of a board (kept for backward compatibility). It does not
		check that the move captures anything; use ``fn_check_put`` first.
		"""
		board = np.copy(board)
		if color is None or position is None:
			return False
		if not self.fn_check_on_board(position) or board[position] != self.empty:
			return False

		board[position[0], position[1]] = color
		# Rays in different directions never share a cell, so flipping one
		# direction cannot influence the scan of another.
		for ds in self.board_direction:
			for cell in self._fn_flips(board, color, position, ds):
				board[cell] = color

		return board
		
	
# Shared, module-level rules engine. The player and search classes defined
# below reference this global by name rather than receiving it as an argument.
reverci = Reverci()
# Prints the default object repr of the instance at import time.
print(reverci)


import math
import sys

# マニュアル操作用クラス定義
class ManualInput:
	"""Player that reads its moves from standard input."""

	def __init__(self):
		pass

	def choice(self, board, color):
		"""Return the move typed by the user, or None when ``color`` must pass."""
		legal_moves = reverci.fn_hands(board, color)
		if len(legal_moves) == 0:
			return None

		return self.check(board, color)

	def check(self, board, color):
		"""Prompt until the user enters a legal two-character move such as ``C4``.

		The first character selects the row letter (A-H) and the second the
		column digit (1-8). Input of any other length is ignored silently
		and the prompt is shown again.
		"""
		if color == reverci.black:
			print("$")
		else:
			print("#")

		while True:
			text = input("input position:ex C4")
			if len(text) != 2:
				continue

			# str.find yields -1 for an unknown character, which the
			# legality check rejects as off-board.
			row = "ABCDEFGH".find(text[0])
			column = "12345678".find(text[1])
			candidate = (column, row)

			if reverci.fn_check_put(board, color, candidate):
				return candidate

			print("Input_invalid")
					
	
# 一様分布に従い有効手をランダムに選択実行
class Uniform:
	"""Player that picks uniformly at random among the legal moves."""

	def __init__(self):
		pass

	def choice(self, board, color):
		"""Return a random legal move for ``color``, or None when it must pass."""
		return self.random_choice(board, color)

	def random_choice(self, board, color):
		"""Draw one index with ``np.random.randint`` and return that move."""
		candidates = reverci.fn_hands(board, color)
		if not candidates:
			return None

		picked = np.random.randint(len(candidates))
		return candidates[picked]
			

# モンテカルロ木探索時に生成するノード定義
class MonteNode:
	"""One node of the Monte Carlo search tree.

	A node stores the board it represents, the colour that is to move on
	that board (``color``), the move that led here from the parent
	(``move``, None for the root or for a pass), the moves not yet expanded
	(``hands``) and win/playout counters.

	``max_depth`` limits expansion: a node created with ``max_depth == 0``
	has no moves to expand; a negative value means no limit.
	"""

	def __init__(self, parent, board, color, max_depth=-1, move=None):
		self.parent = parent
		self.board = board
		self.color = color
		self.move = move
		self.max_depth = max_depth
		self.children = []
		if max_depth == 0:
			self.hands = []
		else:
			self.hands = reverci.fn_hands(self.board, self.color)
			if len(self.hands) == 0:
				# No legal move: the only option is to pass, encoded as None.
				self.hands.append(None)
			else:
				np.random.shuffle(self.hands)

		self.total_win = 0
		self.total_playout = 0

	def expand_child(self):
		"""Pop one unexpanded move, create its child node and return it.

		The child holds the board *after* the move has been played (the
		unchanged board for a pass) and the opposite colour to move.
		"""
		move = self.hands.pop()
		if move is None:
			board = self.board
		else:
			board = reverci.fn_put(self.board, self.color, move)

		# Fixed: the child previously received self.board, so the chosen
		# move was never applied to the position it was evaluated on.
		child = MonteNode(self, board, -self.color, self.max_depth - 1, move)
		self.children.append(child)

		return child

	def ucb(self, child):
		"""Return the UCB1 score of ``child`` as seen from this node.

		A child without playouts scores 0. Otherwise the score is the
		child's win rate plus ``c * sqrt(ln(N) / n)`` with ``c = 1``, where
		``N`` is this node's playout count and ``n`` the child's.
		"""
		c = 1
		if child.total_playout == 0:
			return 0

		win_rate = child.total_win / child.total_playout
		exploration = math.sqrt(math.log(self.total_playout) / child.total_playout)
		return win_rate + c * exploration

	def choice_node(self):
		"""Return the child with the highest UCB score (last one wins ties)."""
		max_score = -sys.maxsize - 1
		ret = None

		for child in self.children:
			score = self.ucb(child)
			if max_score <= score:
				max_score = score
				ret = child

		return ret

	def choice_move(self):
		"""Return the move of the child with the best win rate.

		May only be called on the root. Children without playouts are
		ignored; None is returned when no child qualifies, which also
		covers a root that has no children at all.

		Raises:
			Exception: when called on a node that has a parent.
		"""
		if self.parent is not None:
			raise Exception("operation error")

		# The original guarded this loop with `len(self.hands) == 9`, a typo
		# that could never match; an empty child list already yields None.
		ret = None
		max_score = -sys.maxsize - 1

		for child in self.children:
			if child.total_playout == 0:
				continue
			score = child.total_win / child.total_playout

			if score >= max_score:
				ret = child
				max_score = score

		if ret is None:
			return None

		return ret.move

	def learning(self, win):
		"""Propagate one playout result from this node up to the root.

		``win`` is the winning colour (0 for a draw). Every node on the path
		counts the playout; a node counts a win when the winner is the
		opponent of its ``color``, i.e. the side that moved into it.
		Nothing happens when ``win`` is None.
		"""
		if win is None:
			return

		node = self

		while node is not None:
			if node.color == -win:
				node.total_win += 1
			node.total_playout += 1
			node = node.parent

	def get_root(self):
		"""Return the root of the tree this node belongs to."""
		node = self
		while node.parent is not None:
			node = node.parent

		return node

	def output_route(self):
		"""Print the moves from this node back to the root on one line."""
		node = self
		while node.parent is not None:
			print(node.move, end="<-")
			# Fixed: was `node.parenet`, which raised AttributeError.
			node = node.parent
		# Terminate the line once, after the whole route has been printed.
		print("")

	def output_dump(self, ucb=0, pad=0):
		"""Placeholder for a tree dump; currently does nothing."""
		pass
		
# モンテカルロ木探索による勝率の高い有効手の選択実行
class MonteTree:
	"""Player that selects its move with Monte Carlo tree search."""

	def __init__(self):
		self.playout_cnt = 100
		self.max_depth = -1
		self.dump_nodes = False

	def monte_choice(self, board, color):
		"""Run ``playout_cnt`` search iterations and return the best root move.

		Each iteration descends through fully expanded nodes via
		``choice_node``, expands one pending move if the reached node has
		any, plays a random game from there and propagates the result.
		"""
		root = MonteNode(None, board, color, self.max_depth)

		for _ in range(self.playout_cnt):
			current = root

			# Selection: follow children while nothing is left to expand.
			while not current.hands and current.children:
				current = current.choice_node()

			# Expansion: open one pending move, if there is one.
			if current.hands:
				current = current.expand_child()

			# Simulation and back-propagation.
			winner = self.playout(current.board, current.color)
			current.learning(winner)

		return root.choice_move()

	def choice(self, board, color):
		"""Return the move chosen by the tree search for ``color``."""
		return self.monte_choice(board, color)

	def playout(self, board, color):
		"""Play random moves until neither side can move; return the winner.

		Works on a copy of ``board``. When the side to move has no legal
		move the turn passes to the opponent; the game ends when the
		opponent cannot move either.
		"""
		state = np.copy(board)
		side = color

		while True:
			candidates = reverci.fn_hands(state, side)
			np.random.shuffle(candidates)

			if candidates:
				state = reverci.fn_put(state, side, candidates[0])
				side = -side
				continue

			# The side to move must pass; see whether the opponent can play.
			side = -side
			candidates = reverci.fn_hands(state, side)
			np.random.shuffle(candidates)
			if not candidates:
				break

		return reverci.fn_judge(state)
		
	
import os, random

if __name__ == '__main__':
	# Randomly assign a colour to the random player (reported as "You");
	# the tree-search player takes the other colour.
	if random.randint(0, 1):
		player_color = reverci.black
	else:
		player_color = reverci.white

	players = {}
	players[player_color] = Uniform()
	players[-player_color] = MonteTree()

	board = reverci.fn_create_board()
	reverci.fn_output_board(board)
	turn = reverci.black

	# Counts consecutive passes; two in a row end the game.
	game_cnt = 0

	while True:
		hand = players[turn].choice(board, turn)

		if hand is None:
			game_cnt += 1
			if game_cnt >= 2:
				break
		else:
			board = reverci.fn_put(board, turn, hand)
			reverci.fn_output_board(board)
			game_cnt = 0

		turn = -turn

	print("finish")
	won = reverci.fn_judge(board)
	outcome = {player_color: "You win", -player_color: "You lose"}
	print(outcome.get(won, "draw"))

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

# coding: utf-8

import numpy as np

class Reverci:

def __init__(self):

self.empty = 0

self.black = 1

self.white = -1

self.board_size = 8

self.board_range = range(self.board_size)

self.board_positions = [(x, y) for x in self.board_range for y in self.board_range]

self.board_direction = [(dx, dy) for dx in [0, -1, 1] for dy in [0, -1, 1]][1:]

def fn_create_board(self):

# ゲーム盤初期化白2, 黒2初期位置設定

board = np.zeros([self.board_size, self.board_size], dtype=int)

board[self.board_size // 2-1, self.board_size // 2-1] = self.white

board[self.board_size // 2, self.board_size // 2] = self.white

board[self.board_size // 2-1, self.board_size // 2] = self.black

board[self.board_size // 2, self.board_size // 2-1] = self.black

return board

def fn_output_board(self, board):

for y in range(-1, self.board_size):

for x in range(-1, self.board_size):

if x == -1 and y == -1:

print(" ", end="")

elif x == -1:

print("ABCDEFGH"[y], end="")

elif y == -1:

print(" "+"12345678"[x], end="")

elif board[x, y] == 1:

print(" $", end="")

elif board[x, y] == -1:

print(" #", end="")

else:

print(" .", end="")

print("")

def fn_check_on_board(self, position):

# print(position)

return 0 <= position[0] and position[0] < self.board_size and 0 <= position[1] and position[1] < self.board_size

def fn_check_put(self, board, color=None, position=None):

if color is None:

return False

elif position is None:

return False

elif not self.fn_check_on_board(position) or board[position] != self.empty:

return False

else:

for ds in self.board_direction:

dif_cnt = 0

pos = position

while True:

upd_pos = (pos[0] + ds[0], pos[1] + ds[1])

if not self.fn_check_on_board(upd_pos): # ゲーム盤の範囲外?

break

if board[upd_pos] == self.empty:

break

elif board[upd_pos] == color:

if dif_cnt >= 1:

return True

else:

break

else:

dif_cnt += 1

pos = upd_pos

return False

def fn_judge(self, board):

result = np.sum(board)

if result > 0:

return self.black

elif result < 0:

return self.white

else:

return self.empty

def fn_hands(self, board, color):

hands = []

for pos in self.board_positions:

if self.fn_check_put(board, color, pos):

hands.append(pos)

return hands

def fn_put(self, board, color, position):

board = np.copy(board)

if color is None:

return False

elif position is None:

return False

elif not self.fn_check_on_board(position) or board[position] != self.empty:

return False

else:

board[position[0], position[1]] = color

poslist = []

for ds in self.board_direction:

dif_cnt = 0

pos = position

poslist.clear()

while True:

upd_pos = (pos[0] + ds[0], pos[1] + ds[1])

if not self.fn_check_on_board(upd_pos): # ゲーム盤の範囲内?

break

if board[upd_pos] == self.empty:

break

elif board[upd_pos] == color:

if dif_cnt >= 1:

for index in poslist:

board[index] = color

break

else:

break

else:

dif_cnt += 1

poslist.append(upd_pos)

pos = upd_pos

return board

reverci = Reverci()

print(reverci)

import math

import sys

# マニュアル操作用クラス定義

class ManualInput:

def __init__(self):

pass

def choice(self, board, color):

if len(reverci.fn_hands(board, color)) == 0:

return None

return self.check(board, color)

def check(self, board, color):

print("$" if color == reverci.black else "#")

while True:

input_pos = input("input position:ex C4")

if len(input_pos) == 2:

y = "ABCDEFGH".find(input_pos[0])

x = "12345678".find(input_pos[1])

if reverci.fn_check_put(board, color, (x, y)):

return (x, y)

else:

print("Input_invalid")

# 一様分布に従い有効手をランダムに選択実行

class Uniform:

def __init__(self):

pass

def choice(self, board, color):

return self.random_choice(board, color)

def random_choice(self, board, color):

hands = reverci.fn_hands(board, color)

if len(hands) == 0:

return None

else:

choice = np.random.randint(len(hands))

return hands[choice]

# モンテカルロ木探索時に生成するノード定義

class MonteNode:

def __init__(self, parent, board, color, max_depth=-1, move=None):

self.parent = parent

self.board = board

self.color = color

self.move = move

self.max_depth = max_depth

self.children = []

if max_depth == 0:

self.hands = []

else:

self.hands = reverci.fn_hands(self.board, self.color)

if len(self.hands) == 0:

self.hands.append(None)

else:

np.random.shuffle(self.hands)

self.total_win = 0

self.total_playout = 0

def expand_child(self):

move = self.hands.pop()

if move is None:

child = MonteNode(self, self.board, -self.color, self.max_depth-1, None)

else:

board = reverci.fn_put(self.board, self.color, move)

child = MonteNode(self, self.board, -self.color, self.max_depth-1, move)

self.children.append(child)

return child

def ucb(self, child):

c = 1

if child.total_playout == 0:

return 0

return child.total_win / child.total_playout \

+ c * np.sqrt(math.log(self.total_playout) / child.total_playout)

def choice_node(self):

max_score = -sys.maxsize - 1

ret = None

for child in self.children:

ucb = self.ucb(child)

if max_score <= ucb:

max_score = ucb

ret = child

return ret

def choice_move(self):

if not self.parent is None:

raise Exception("operation error")

if len(self.children) == 0 and len(self.hands) == 9:

return None

else:

ret = None

max_score = -sys.maxsize - 1

for child in self.children:

if child.total_playout == 0:

continue

score = child.total_win / child.total_playout

if score >= max_score:

ret = child

max_score = score

if ret is None:

return None

return ret.move

def learning(self, win):

if win is None:

return

node = self

while not node is None:

if node.color == -win:

node.total_win += 1

node.total_playout += 1

node = node.parent

def get_root(self):

node = self

while not node.parent is None:

node = node.parent

return node

def output_route(self):

node = self

while not node.parent is None:

print(node.move, end="<-")

node = node.parenet

print("")

def output_dump(self, ucb=0, pad=0):

pass

# モンテカルロ木探索による勝率の高い有効手の選択実行

class MonteTree:

def __init__(self):

self.playout_cnt = 100

self.max_depth = -1

self.dump_nodes = False

def monte_choice(self, board, color):

root_node = MonteNode(None, board, color, self.max_depth)

# print("root", root_node)

for index in range(self.playout_cnt):

node = root_node

while len(node.hands) == 0 and len(node.children) > 0:

node = node.choice_node()

if len(node.hands) > 0:

node = node.expand_child()

win = self.playout(node.board, node.color)

node.learning(win)

move = root_node.choice_move()

return move

def choice(self, board, color):

return self.monte_choice(board, color)

def playout(self, board, color):

board = np.copy(board)

while True:

hands = reverci.fn_hands(board, color)

np.random.shuffle(hands)

hand = hands[0] if len(hands) > 0 else None

if hand is None:

color = -color

hands = reverci.fn_hands(board, color)

np.random.shuffle(hands)

hand = hands[0] if len(hands) > 0 else None

if hand is None:

break

# if not reverci.fn_check_put(board, color, hand):

# break

continue

board = reverci.fn_put(board, color, hand)

color = -color

return reverci.fn_judge(board)

import os, random

if __name__ == '__main__':

player_color = reverci.black if random.randint(0, 1) else reverci.white

players = {player_color: Uniform(),

-player_color: MonteTree()}

board = reverci.fn_create_board()

reverci.fn_output_board(board)

turn = reverci.black

game_cnt = 0

while True:

hand = players[turn].choice(board, turn)

if hand is not None:

board = reverci.fn_put(board, turn, hand)

reverci.fn_output_board(board)

game_cnt = 0

else:

game_cnt += 1

if game_cnt >= 2:

break

turn = -turn

print("finish")

won = reverci.fn_judge(board)

if won == player_color:

print("You win")

elif won == -player_color:

print("You lose")

else:

print("draw")

最後に、強化学習を様々なタスクに手軽に導入することができるツールが近年では多く提供されています。強化学習の世界を楽しんでみませんか。