Working MCTS implementation

This is a basic working implementation of the MCTS algorithm. However, the
algorithm is currently slow compared with other implementations and makes
sub-optimal choices when playing tic-tac-toe, so some modifications are still
needed.
David Kruger 2025-06-23 13:46:04 -07:00
parent 197a46996a
commit 17884f4b90
18 changed files with 1416 additions and 0 deletions

Cargo.lock (generated, new file)
@@ -0,0 +1,154 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4
[[package]]
name = "cfg-if"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268"
[[package]]
name = "getrandom"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592"
dependencies = [
"cfg-if",
"libc",
"wasi",
]
[[package]]
name = "libc"
version = "0.2.174"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776"
[[package]]
name = "ppv-lite86"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
dependencies = [
"zerocopy",
]
[[package]]
name = "proc-macro2"
version = "1.0.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
"rand_chacha",
"rand_core",
]
[[package]]
name = "rand_chacha"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core",
]
[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
"getrandom",
]
[[package]]
name = "rustic_mcts"
version = "0.1.0"
dependencies = [
"rand",
"thiserror",
]
[[package]]
name = "syn"
version = "2.0.103"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e4307e30089d6fd6aff212f2da3a1f9e32f3223b1f010fb09b7c95f90f3ca1e8"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "thiserror"
version = "2.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "2.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "unicode-ident"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
[[package]]
name = "wasi"
version = "0.11.1+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
[[package]]
name = "zerocopy"
version = "0.8.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f"
dependencies = [
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.8.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181"
dependencies = [
"proc-macro2",
"quote",
"syn",
]

Cargo.toml (new file)
@@ -0,0 +1,16 @@
[package]
name = "rustic_mcts"
version = "0.1.0"
edition = "2021"
authors = ["David Kruger <david@krugerlabs.us>"]
description = "An extensible implementation of Monte Carlo Tree Search (MCTS) using an arena allocator."
license = "MIT"
repository = "https://gitlabs.krugerlabs.us/krugd/rustic_mcts"
readme = "README.md"
keywords = ["mcts", "rust", "monte_carlo", "tree", "ai", "ml"]
categories = ["algorithms", "data-structures"]
[dependencies]
rand = "~0.8"
thiserror = "~2.0"

examples/tic_tac_toe.rs (new file)
@@ -0,0 +1,298 @@
use std::collections::HashMap;
use std::fmt;
use std::io::{self, Write};
use rustic_mcts::policy::backprop::BackpropagationPolicy;
use rustic_mcts::policy::decision::DecisionPolicy;
use rustic_mcts::policy::selection::SelectionPolicy;
use rustic_mcts::policy::simulation::SimulationPolicy;
use rustic_mcts::{Action, GameState, MCTSConfig, RewardVal, MCTS};
fn main() {
println!("MCTS Tic-Tac-Toe Example");
println!("========================");
println!();
// Set up a new game
let mut game = TicTacToe::new();
// Create MCTS configuration
let config = MCTSConfig {
max_iterations: 10_000,
max_time: None,
tree_size_allocation: 10_000,
selection_policy: SelectionPolicy::UCB1Tuned(1.414),
simulation_policy: SimulationPolicy::Random,
backprop_policy: BackpropagationPolicy::Standard,
decision_policy: DecisionPolicy::MostVisits,
};
// Main game loop
while !game.is_terminal() {
// Display the board
println!("{}", game);
if game.current_player == Player::X {
// Human player (X)
println!("Your move (enter row column, e.g. '1 2'): ");
io::stdout().flush().unwrap();
let mut input = String::new();
io::stdin().read_line(&mut input).unwrap();
let coords: Vec<usize> = input
.trim()
.split_whitespace()
.filter_map(|s| s.parse::<usize>().ok())
.collect();
if coords.len() != 2 || coords[0] > 2 || coords[1] > 2 {
println!("Invalid move! Enter row and column (0-2).");
continue;
}
let row = coords[0];
let col = coords[1];
let move_index = row * 3 + col;
let action = Move { index: move_index };
if !game.is_legal_move(&action) {
println!("Illegal move! Try again.");
continue;
}
// Apply the human's move
game = game.state_after_action(&action);
} else {
// AI player (O)
println!("AI is thinking...");
// Create a new MCTS search
let mut mcts = MCTS::new(game.clone(), &config);
// Find the best move
match mcts.search() {
Ok(action) => {
println!(
"AI chooses: {} (row {}, col {})",
action.index,
action.index / 3,
action.index % 3
);
// Apply the AI's move
game = game.state_after_action(&action);
}
Err(e) => {
println!("Error: {:?}", e);
break;
}
}
}
}
// Display final state
println!("{}", game);
// Report the result
if let Some(winner) = game.get_winner() {
println!("Player {:?} wins!", winner);
} else {
println!("The game is a draw!");
}
}
/// Players in Tic-Tac-Toe
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum Player {
X,
O,
}
impl rustic_mcts::Player for Player {}
/// Tic-Tac-Toe move
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct Move {
/// Board position index (0-8)
index: usize,
}
impl Action for Move {
fn id(&self) -> usize {
self.index
}
}
/// Tic-Tac-Toe game state
#[derive(Clone)]
struct TicTacToe {
/// Board representation (None = empty, Some(Player) = occupied)
board: [Option<Player>; 9],
/// Current player's turn
current_player: Player,
/// Number of moves played so far
moves_played: usize,
}
impl TicTacToe {
/// Creates a new empty Tic-Tac-Toe board
fn new() -> Self {
TicTacToe {
board: [None; 9],
current_player: Player::X,
moves_played: 0,
}
}
/// Checks if a move is legal
fn is_legal_move(&self, action: &Move) -> bool {
if action.index >= 9 {
return false;
}
self.board[action.index].is_none()
}
/// Returns the winner of the game, if any
fn get_winner(&self) -> Option<Player> {
// Check rows
for row in 0..3 {
let i = row * 3;
if self.board[i].is_some()
&& self.board[i] == self.board[i + 1]
&& self.board[i] == self.board[i + 2]
{
return self.board[i];
}
}
// Check columns
for col in 0..3 {
if self.board[col].is_some()
&& self.board[col] == self.board[col + 3]
&& self.board[col] == self.board[col + 6]
{
return self.board[col];
}
}
// Check diagonals
if self.board[0].is_some()
&& self.board[0] == self.board[4]
&& self.board[0] == self.board[8]
{
return self.board[0];
}
if self.board[2].is_some()
&& self.board[2] == self.board[4]
&& self.board[2] == self.board[6]
{
return self.board[2];
}
None
}
}
impl GameState for TicTacToe {
type Action = Move;
type Player = Player;
fn get_legal_actions(&self) -> Vec<Self::Action> {
let mut actions = Vec::new();
for i in 0..9 {
if self.board[i].is_none() {
actions.push(Move { index: i });
}
}
actions
}
fn state_after_action(&self, action: &Self::Action) -> Self {
let mut new_state = self.clone();
// Make the move
new_state.board[action.index] = Some(self.current_player);
new_state.moves_played = self.moves_played + 1;
// Switch player
new_state.current_player = match self.current_player {
Player::X => Player::O,
Player::O => Player::X,
};
new_state
}
fn is_terminal(&self) -> bool {
self.get_winner().is_some() || self.moves_played == 9
}
fn reward_for_player(&self, player: &Self::Player) -> RewardVal {
if let Some(winner) = self.get_winner() {
if winner == *player {
return 1.0; // Win
} else {
return 0.0; // Loss
}
}
// Draw
0.5
}
fn rewards_for_players(&self) -> HashMap<Self::Player, RewardVal> {
HashMap::from_iter(vec![
(Player::X, self.reward_for_player(&Player::X)),
(Player::O, self.reward_for_player(&Player::O)),
])
}
fn get_current_player(&self) -> &Self::Player {
&self.current_player
}
}
impl fmt::Display for TicTacToe {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
writeln!(f, " 0 1 2")?;
for row in 0..3 {
write!(f, "{} ", row)?;
for col in 0..3 {
let index = row * 3 + col;
let symbol = match self.board[index] {
Some(Player::X) => "X",
Some(Player::O) => "O",
None => ".",
};
write!(f, "{} ", symbol)?;
}
writeln!(f)?;
}
writeln!(f, "\nPlayer {:?}'s turn", self.current_player)?;
Ok(())
}
}
impl fmt::Debug for TicTacToe {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "\n")?;
for row in 0..3 {
for col in 0..3 {
let index = row * 3 + col;
let symbol = match self.board[index] {
Some(Player::X) => "X",
Some(Player::O) => "O",
None => ".",
};
write!(f, "{} ", symbol)?;
}
writeln!(f)?;
}
Ok(())
}
}

src/config.rs (new file)
@@ -0,0 +1,67 @@
use crate::policy::backprop::BackpropagationPolicy;
use crate::policy::decision::DecisionPolicy;
use crate::policy::selection::SelectionPolicy;
use crate::policy::simulation::SimulationPolicy;
use crate::state::GameState;
use std::time::Duration;
/// Configuration for the MCTS algorithm
#[derive(Debug)]
pub struct MCTSConfig<S: GameState> {
/// The maximum number of iterations to run when searching
///
/// The search will stop after the given number of iterations, even if the
/// search time has not exceeded `max_time`.
pub max_iterations: usize,
/// The maximum time to run the search
///
/// If set, the search will stop after this duration even if the maximum
/// iterations hasn't been reached.
pub max_time: Option<Duration>,
/// The size to initially allocate for the search tree
///
/// This pre-allocates memory for the search tree which ensures contiguous
/// memory and improves performance by preventing the resizing of tree
/// as we explore.
pub tree_size_allocation: usize,
/// The selection policy
///
/// This dictates the path through which the game tree is searched. As such
/// the policy has a large impact on the overall algorithm execution.
pub selection_policy: SelectionPolicy<S>,
/// The simulation policy
///
/// This dictates the game simulation used when expanding and evaluating the
/// search tree. Random is generally a good default.
pub simulation_policy: SimulationPolicy<S>,
/// The backpropagation policy
///
/// This dictates how the results of the simulation playouts are propagated
/// back up the tree.
pub backprop_policy: BackpropagationPolicy<S>,
/// The decision policy
///
/// This dictates how the MCTS algorithm determines its final decision
/// after iterating through the search tree
pub decision_policy: DecisionPolicy,
}
impl<S: GameState> Default for MCTSConfig<S> {
fn default() -> Self {
MCTSConfig {
max_iterations: 10_000,
max_time: None,
tree_size_allocation: 10_000,
selection_policy: SelectionPolicy::UCB1Tuned(1.414),
simulation_policy: SimulationPolicy::Random,
backprop_policy: BackpropagationPolicy::Standard,
decision_policy: DecisionPolicy::MostVisits,
}
}
}
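As a quick illustration of how these fields compose, here is a hypothetical snippet (not part of this commit) that overrides the search limits while keeping the default policies; `TicTacToe` stands in for any `GameState` implementation such as the one in examples/tic_tac_toe.rs:

```rust
use std::time::Duration;

use rustic_mcts::MCTSConfig;

// Hypothetical helper; `TicTacToe` is assumed to implement GameState.
fn make_config() -> MCTSConfig<TicTacToe> {
    MCTSConfig {
        // Stop at whichever limit is reached first.
        max_iterations: 50_000,
        max_time: Some(Duration::from_millis(500)),
        // Keep the default selection, simulation, backprop, and decision policies.
        ..MCTSConfig::default()
    }
}
```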

src/lib.rs (new file)
@@ -0,0 +1,17 @@
//! # rustic_mcts
//!
//! An extensible implementation of Monte Carlo Tree Search (MCTS) using arena allocation and
//! configurable policies.
pub mod config;
pub mod mcts;
pub mod policy;
pub mod state;
pub mod tree;
pub use config::MCTSConfig;
pub use mcts::MCTS;
pub use state::Action;
pub use state::GameState;
pub use state::Player;
pub use tree::node::RewardVal;

src/mcts.rs (new file)
@@ -0,0 +1,147 @@
use crate::config::MCTSConfig;
use crate::policy::backprop::backpropagate_rewards;
use crate::policy::decision::decide_on_action;
use crate::policy::selection::select_best_child;
use crate::policy::simulation::simulate_reward;
use crate::state::GameState;
use crate::tree::arena::Arena;
use crate::tree::node::{Node, RewardVal};
use rand::prelude::SliceRandom;
use std::collections::HashMap;
use std::time::Instant;
/// Monte Carlo Tree Search implementation
///
/// This provides the interface for performing optimal searches on a tree using
/// the MCTS algorithm.
pub struct MCTS<'conf, S: GameState> {
/// The arena used for the tree
arena: Arena<S>,
/// The identifier of the root node of the search tree
root_id: usize,
/// The configuration used for the search
config: &'conf MCTSConfig<S>,
}
impl<'conf, S: GameState + std::fmt::Debug> MCTS<'conf, S> {
/// Creates a new instance with the given initial state and configuration
pub fn new(initial_state: S, config: &'conf MCTSConfig<S>) -> Self {
let mut arena: Arena<S> = Arena::new(config.tree_size_allocation);
let root: Node<S> = Node::new(initial_state.clone(), None, None);
let root_id: usize = arena.add_node(root);
MCTS {
arena,
root_id,
config,
}
}
/// Runs the MCTS algorithm, returning the "best" action
///
/// The search will stop once `max_iterations` or `max_time` from
/// the assigned configuration is reached.
pub fn search(&mut self) -> Result<S::Action> {
self.search_for_iterations(self.config.max_iterations)
}
/// Runs the MCTS algorithm, returning the "best" action after the given iterations
///
/// This ignores the `max_iterations` provided in the config, but will still
/// return early if `max_time` is specified and reached before the iterations are complete.
pub fn search_for_iterations(&mut self, iterations: usize) -> Result<S::Action> {
let start_time = Instant::now();
for _ in 0..iterations {
match self.config.max_time {
Some(max_time) => {
if start_time.elapsed() >= max_time {
break; // ending early due to time
}
}
None => {}
}
self.execute_iteration()?;
}
self.best_action()
}
/// Runs the MCTS algorithm for a single iteration
fn execute_iteration(&mut self) -> Result<()> {
let mut selected_id: usize = self.select();
let selected_node: &Node<S> = self.arena.get_node(selected_id);
if !selected_node.state.is_terminal() {
self.expand(selected_id);
let children: &Vec<usize> = &self.arena.get_node(selected_id).children;
let random_child: usize = *children.choose(&mut rand::thread_rng()).unwrap();
selected_id = random_child;
}
let rewards = self.simulate(selected_id);
self.backprop(selected_id, &rewards);
Ok(())
}
/// MCTS Phase 1: Selection - Find the "best" node to expand
fn select(&mut self) -> usize {
let mut current_id: usize = self.root_id;
loop {
let node = &self.arena.get_node(current_id);
if node.is_leaf() || node.state.is_terminal() {
return current_id;
}
current_id = select_best_child(&self.config.selection_policy, &node, &self.arena);
}
}
/// MCTS Phase 2: Expansion - Expand the selected node on the tree
fn expand(&mut self, id: usize) {
let parent: &Node<S> = self.arena.get_node_mut(id);
let legal_actions: Vec<S::Action> = parent.state.get_legal_actions();
let parent_state: S = parent.state.clone();
for action in legal_actions {
let state = parent_state.state_after_action(&action);
let new_node = Node::new(state, Some(action), Some(id));
let new_id = self.arena.add_node(new_node);
self.arena.get_node_mut(id).children.push(new_id);
}
}
fn simulate(&self, id: usize) -> HashMap<S::Player, RewardVal> {
let node = &self.arena.get_node(id);
simulate_reward(&self.config.simulation_policy, &node, &self.arena)
}
fn backprop(&mut self, selected_id: usize, rewards: &HashMap<S::Player, RewardVal>) {
backpropagate_rewards(
&self.config.backprop_policy,
selected_id,
&mut self.arena,
&rewards,
)
}
fn best_action(&self) -> Result<S::Action> {
let root_node: &Node<S> = self.arena.get_node(self.root_id);
match decide_on_action(&self.config.decision_policy, &root_node, &self.arena) {
Some(action) => Ok(action),
None => Err(MCTSError::NoBestAction),
}
}
}
/// Errors returned by the MCTS algorithm
#[derive(Debug, thiserror::Error)]
pub enum MCTSError {
/// The best action doesn't exist
#[error("Unable to determine a best action for the game")]
NoBestAction,
/// The search tree was exhausted without finding a terminal node
#[error("Search tree exhausted without finding terminal node")]
NonTerminalGame,
}
/// Result returned by the MCTS algorithm
pub type Result<T> = std::result::Result<T, MCTSError>;
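A minimal usage sketch of the API above (hypothetical, not part of the diff); `TicTacToe` and `Move` are assumed to be the types from examples/tic_tac_toe.rs:

```rust
use rustic_mcts::mcts::MCTSError;
use rustic_mcts::{MCTSConfig, MCTS};

// Hypothetical helper; `TicTacToe`/`Move` come from examples/tic_tac_toe.rs.
fn choose_move(game: TicTacToe) -> Result<Move, MCTSError> {
    let config = MCTSConfig::default();
    let mut mcts = MCTS::new(game, &config);
    // Run fewer iterations than the configured maximum; `max_time` still applies.
    mcts.search_for_iterations(1_000)
}
```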

@@ -0,0 +1,99 @@
use crate::state::GameState;
use crate::tree::arena::Arena;
use crate::tree::node::RewardVal;
use std::collections::HashMap;
/// The back propagation policy dictating the propagation of playout results
///
/// This policy drives how the backpropagation phase of the MCTS algorithm is
/// executed, allowing for some minor customization.
///
/// Typically the Standard policy, used by most implementations of MCTS, is
/// sufficient.
#[derive(Debug)]
pub enum BackpropagationPolicy<S: GameState> {
/// Standard back propagation
///
/// This increments the visitation count and adds the simulated rewards
/// results to the aggregate values.
///
/// This is the standard policy used in most MCTS implementations.
Standard,
/// Weighted back propagation
///
/// This weights the value of the simulated rewards based on the depth,
/// allowing us to put more-or-less influence on deeper branches
/// - Positive weight factor makes deeper nodes less influential
/// - Negative weight factor makes deeper nodes more influential
Weighted(f64),
/// Custom backpropagation policy
Custom(Box<dyn CustomBackpropagationPolicy<S>>),
}
/// Trait for an object implementing the backpropagation logic when exploring the MCTS
/// search tree.
pub trait CustomBackpropagationPolicy<S: GameState>: std::fmt::Debug {
/// Backpropagate the given rewards values from the node up the tree
fn backprop(
&self,
node_id: usize,
arena: &mut Arena<S>,
rewards: &HashMap<S::Player, RewardVal>,
);
}
pub fn backpropagate_rewards<S: GameState>(
policy: &BackpropagationPolicy<S>,
node_id: usize,
arena: &mut Arena<S>,
rewards: &HashMap<S::Player, RewardVal>,
) {
match policy {
BackpropagationPolicy::Standard => standard_backprop(node_id, arena, rewards),
BackpropagationPolicy::Weighted(depth_factor) => {
weighted_backprop(*depth_factor, node_id, arena, rewards)
}
BackpropagationPolicy::Custom(custom_policy) => {
custom_policy.backprop(node_id, arena, rewards)
}
}
}
fn standard_backprop<S: GameState>(
node_id: usize,
arena: &mut Arena<S>,
rewards: &HashMap<S::Player, RewardVal>,
) {
// TODO:
// - each node needs the perspective of the different players not just one view
// - e.g. reward_sum(player), reward_avg(player), rewards(player)[], visits(player)
// - we could make special version for 2-player zero-sum games like below
let mut current_id: usize = node_id;
loop {
let node = arena.get_node_mut(current_id);
let player = node.state.get_current_player().clone();
if let Some(reward) = rewards.get(&player) {
node.increment_visits();
node.record_player_reward(player, *reward);
}
// Walk up to the parent regardless of whether a reward was recorded for the
// current player, so the loop always terminates once the root is reached.
match node.parent {
Some(parent_id) => current_id = parent_id,
None => break,
}
}
}
fn weighted_backprop<S: GameState>(
_depth_factor: f64,
_node_id: usize,
_arena: &mut Arena<S>,
_rewards: &HashMap<S::Player, RewardVal>,
) {
// TODO
}
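Since `weighted_backprop` is still a stub, here is a hypothetical sketch of how the same idea could be expressed today through the `Custom` variant. It discounts rewards by `1 / (1 + depth_factor * depth)`, which is one possible reading of the doc comment above, not the crate's implementation:

```rust
use std::collections::HashMap;

use rustic_mcts::policy::backprop::CustomBackpropagationPolicy;
use rustic_mcts::tree::arena::Arena;
use rustic_mcts::{GameState, RewardVal};

// Hypothetical custom policy: a positive depth_factor shrinks the influence
// of deeper nodes, a negative one amplifies it.
#[derive(Debug)]
struct DepthDiscounted {
    depth_factor: f64,
}

impl<S: GameState> CustomBackpropagationPolicy<S> for DepthDiscounted {
    fn backprop(
        &self,
        node_id: usize,
        arena: &mut Arena<S>,
        rewards: &HashMap<S::Player, RewardVal>,
    ) {
        // Measure the depth of the starting node by walking up to the root.
        let mut depth = 0usize;
        let mut cursor = node_id;
        while let Some(parent_id) = arena.get_node(cursor).parent {
            depth += 1;
            cursor = parent_id;
        }
        // Walk up again, recording progressively less-discounted rewards.
        let mut current_id = node_id;
        loop {
            let weight = 1.0 / (1.0 + self.depth_factor * depth as f64);
            let node = arena.get_node_mut(current_id);
            let player = node.state.get_current_player().clone();
            if let Some(reward) = rewards.get(&player) {
                node.increment_visits();
                node.record_player_reward(player, *reward * weight);
            }
            match node.parent {
                Some(parent_id) => {
                    current_id = parent_id;
                    depth = depth.saturating_sub(1);
                }
                None => break,
            }
        }
    }
}
```

Such a policy would be plugged in as `BackpropagationPolicy::Custom(Box::new(DepthDiscounted { depth_factor: 0.1 }))`.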

@@ -0,0 +1,65 @@
use crate::state::GameState;
use crate::tree::arena::Arena;
use crate::tree::node::Node;
/// The decision policy when determining the action in final MCTS phase
///
/// This policy drives how the MCTS algorithm chooses which action is the
/// "best" from the exploration.
#[derive(Debug)]
pub enum DecisionPolicy {
/// Decide on the action with the most visits
///
/// This option relies on the statistical confidence driven by the MCTS
/// algorithm instead of the potentially more noisy value estimates.
///
/// This is the standard policy used in most MCTS implementations, and
/// is a good selection when not hyper-maximizing for potential gain
MostVisits,
/// Decide on the action with the highest average value
///
/// This is non-standard, but is more aggressive in attempting to gain
/// the highest value in a decision.
HighestValue,
}
pub fn decide_on_action<S: GameState>(
policy: &DecisionPolicy,
root_node: &Node<S>,
arena: &Arena<S>,
) -> Option<S::Action> {
match policy {
DecisionPolicy::MostVisits => most_visits(root_node, arena),
DecisionPolicy::HighestValue => highest_value(root_node, arena),
}
}
fn most_visits<S: GameState>(root_node: &Node<S>, arena: &Arena<S>) -> Option<S::Action> {
let best_child_id: &usize = root_node
.children
.iter()
.max_by(|&a, &b| {
let node_a_visits = arena.get_node(*a).visits;
let node_b_visits = arena.get_node(*b).visits;
node_a_visits.partial_cmp(&node_b_visits).unwrap()
})
.unwrap();
arena.get_node(*best_child_id).action.clone()
}
fn highest_value<S: GameState>(root_node: &Node<S>, arena: &Arena<S>) -> Option<S::Action> {
let player = root_node.state.get_current_player();
let best_child_id: &usize = root_node
.children
.iter()
.max_by(|&a, &b| {
let node_a_score = arena.get_node(*a).reward_average(player);
let node_b_score = arena.get_node(*b).reward_average(player);
node_a_score.partial_cmp(&node_b_score).unwrap()
})
.unwrap();
arena.get_node(*best_child_id).action.clone()
}

src/policy/mod.rs (new file)
@@ -0,0 +1,4 @@
pub mod backprop;
pub mod decision;
pub mod selection;
pub mod simulation;

@@ -0,0 +1,59 @@
mod ucb1;
mod ucb1_tuned;
use crate::state::GameState;
use crate::tree::arena::Arena;
use crate::tree::node::Node;
/// The selection policy used in the MCTS selection phase
///
/// This drives the selection of the nodes in the search tree, determining
/// which paths are explored and evaluated.
///
/// In general UCB1-Tuned or UCB1 should be effective; however, if necessary
/// a custom selection policy can be provided.
#[derive(Debug)]
pub enum SelectionPolicy<S: GameState> {
/// Upper Confidence Bound 1 (UCB1) with the given exploration constant
///
/// The exploration constant controls the balance between exploration and
/// exploitation. The higher the value, the more likely the search will
/// explore less-visited nodes. A standard value is √2 ≈ 1.414.
UCB1(f64),
/// Upper Confidence Bound 1 Tuned (UCB1-Tuned)
///
/// A tuned version of UCB1 instead using the empirical
/// standard deviation of the rewards to drive exploration.
///
/// Auer, P., Cesa-Bianchi, N. & Fischer, P. Finite-time Analysis of the Multiarmed Bandit Problem. Machine Learning 47, 235–256 (2002). https://doi.org/10.1023/A:1013689704352
UCB1Tuned(f64),
/// Custom selection policy
Custom(Box<dyn CustomSelectionPolicy<S>>),
}
/// Trait for an object implementing the selection logic when exploring the MCTS
/// search tree.
///
/// The policy should select the child of the given node which is "best" for the current player
pub trait CustomSelectionPolicy<S: GameState>: std::fmt::Debug {
/// Selects a child based on the policy, returning the node ID
fn select_child(&self, node: &Node<S>, arena: &Arena<S>) -> usize;
}
pub fn select_best_child<S: GameState>(
policy: &SelectionPolicy<S>,
node: &Node<S>,
arena: &Arena<S>,
) -> usize {
match policy {
SelectionPolicy::UCB1(exploration_constant) => {
ucb1::select_best_child(*exploration_constant, node, arena)
}
SelectionPolicy::UCB1Tuned(exploration_constant) => {
ucb1_tuned::select_best_child(*exploration_constant, node, arena)
}
SelectionPolicy::Custom(custom_policy) => custom_policy.select_child(node, arena),
}
}

@@ -0,0 +1,79 @@
//! Upper Confidence Bound 1 (UCB1) selection policy
//!
//! This is the classic selection policy for MCTS, which balances
//! exploration and exploitation using the UCB1 formula:
//!
//! ```text
//! UCB1 = average_reward + exploration_constant * sqrt(ln(parent_visits) / child_visits)
//! ```
//!
//! Where:
//! - `average_reward` is the average reward from simulations through this node
//! - `exploration_constant` controls the balance between exploration and exploitation
//! - `parent_visits` is the number of visits to the parent node
//! - `child_visits` is the number of visits to the child node
//!
//! Higher exploration constants favor exploration (trying less-visited nodes),
//! while lower values favor exploitation (choosing nodes with higher values).
//!
//! The commonly used value for the exploration constant is sqrt(2) ≈ 1.414,
//! which is the default in this implementation.
use crate::state::GameState;
use crate::tree::arena::Arena;
use crate::tree::node::{Node, RewardVal};
/// Selects the index of the "best" child using the UCB1 selection policy
pub fn select_best_child<S: GameState>(
exploration_constant: f64,
node: &Node<S>,
arena: &Arena<S>,
) -> usize {
if node.is_leaf() {
panic!("select_best_child called on leaf node");
}
let player = node.state.get_current_player();
let parent_visits = node.visits;
let best_child = node
.children
.iter()
.max_by(|&a, &b| {
let node_a = arena.get_node(*a);
let node_b = arena.get_node(*b);
let ucb_a = ucb1_value(
exploration_constant,
node_a.reward_average(player),
node_a.visits,
parent_visits,
);
let ucb_b = ucb1_value(
exploration_constant,
node_b.reward_average(player),
node_b.visits,
parent_visits,
);
ucb_a.partial_cmp(&ucb_b).unwrap()
})
.unwrap();
*best_child
}
/// Calculates the UCB1 value for a node
pub fn ucb1_value(
exploration_constant: f64,
child_value: RewardVal,
child_visits: u64,
parent_visits: u64,
) -> RewardVal {
if child_visits == 0 {
return f64::INFINITY; // Always explore nodes that have never been visited
}
// UCB1 formula: value + C * sqrt(ln(parent_visits) / child_visits)
let exploitation = child_value;
let exploration =
exploration_constant * ((parent_visits as f64).ln() / child_visits as f64).sqrt();
exploitation + exploration
}
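As a sanity check on the formula, a unit test along these lines could be added to this module (hypothetical, not part of the commit); the expected value is just the hand computation 0.6 + 1.414 * sqrt(ln(100) / 10) ≈ 1.56:

```rust
#[cfg(test)]
mod tests {
    use super::ucb1_value;

    #[test]
    fn ucb1_matches_hand_computation() {
        // 0.6 + 1.414 * sqrt(ln(100) / 10) ≈ 1.5596
        let v = ucb1_value(1.414, 0.6, 10, 100);
        assert!((v - 1.5596).abs() < 1e-3);
        // Unvisited children always get an infinite score so they are explored first.
        assert_eq!(ucb1_value(1.414, 0.0, 0, 100), f64::INFINITY);
    }
}
```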

@@ -0,0 +1,97 @@
//! Upper Confidence Bound 1 Tuned (UCB1-Tuned) selection policy
//!
//! This is a fine-tuned version of UCB which takes into account the
//! empirically measured variance of the rewards to drive the exploration.
//!
//! This has been found to perform substantially better than UCB1 in most
//! situations.
//!
//! Auer, P., Cesa-Bianchi, N. & Fischer, P.
//! Finite-time Analysis of the Multiarmed Bandit Problem.
//! Machine Learning 47, 235–256 (2002). https://doi.org/10.1023/A:1013689704352
use crate::state::GameState;
use crate::tree::arena::Arena;
use crate::tree::node::{Node, RewardVal};
/// Selects the index of the "best" child using the UCB1-Tuned selection policy
pub fn select_best_child<S: GameState>(
exploration_constant: f64,
node: &Node<S>,
arena: &Arena<S>,
) -> usize {
if node.is_leaf() {
panic!("select_best_child called on leaf node");
}
let player = node.state.get_current_player();
let parent_visits = node.visits;
let best_child = node
.children
.iter()
.max_by(|&a, &b| {
let node_a = arena.get_node(*a);
let node_b = arena.get_node(*b);
let ucb_a = ucb1_tuned_value(
exploration_constant,
parent_visits,
node_a.visits,
node_a.rewards(player),
node_a.reward_average(player),
);
let ucb_b = ucb1_tuned_value(
exploration_constant,
parent_visits,
node_b.visits,
node_b.rewards(player),
node_b.reward_average(player),
);
ucb_a.partial_cmp(&ucb_b).unwrap()
})
.unwrap();
*best_child
}
/// Calculates the UCB1-Tuned value for a node
pub fn ucb1_tuned_value(
exploration_constant: f64,
parent_visits: u64,
child_visits: u64,
child_rewards: Option<&Vec<RewardVal>>,
reward_avg: RewardVal,
) -> RewardVal {
match child_rewards {
None => {
RewardVal::INFINITY // Always explore nodes that have never been visited
}
Some(child_rewards) => {
if child_visits == 0 {
RewardVal::INFINITY // Always explore nodes that have never been visited
} else {
let parent_visits: RewardVal = parent_visits as RewardVal;
let child_visits: RewardVal = child_visits as RewardVal;
// N: number of visits to the parent node
// n: number of visits to the child node
// x_i: reward of the ith visit to the child node
// X: average reward of the child
// C: exploration constant
//
// UCB1-Tuned = X + C * sqrt(ln(N) / n * min(1/4, V(n)))
// V(n) = sum(x_i^2)/n - X^2 + sqrt(2*ln(N)/n)
let exploitation = reward_avg;
let mut variance = (child_rewards.iter().map(|&x| x * x).sum::<RewardVal>()
/ child_visits)
- (reward_avg * reward_avg)
+ (2.0 * parent_visits.ln() / child_visits).sqrt();
if variance > 0.25 {
variance = 0.25;
}
let exploration =
exploration_constant * (parent_visits.ln() / child_visits * variance).sqrt();
exploitation + exploration
}
}
}
}
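Similarly, a hypothetical test pinning the formula to a hand computation: with rewards [1, 0, 1, 1], 20 parent visits, and C = 1.414, the variance term exceeds 1/4 and is capped, giving 0.75 + 1.414 * sqrt(ln(20) / 4 * 0.25) ≈ 1.362.

```rust
#[cfg(test)]
mod tests {
    use super::ucb1_tuned_value;

    #[test]
    fn ucb1_tuned_matches_hand_computation() {
        // Average reward 0.75 over four playouts, variance term capped at 1/4.
        let rewards = vec![1.0, 0.0, 1.0, 1.0];
        let v = ucb1_tuned_value(1.414, 20, 4, Some(&rewards), 0.75);
        assert!((v - 1.3618).abs() < 1e-3);
        // A node with no recorded rewards is always explored first.
        assert_eq!(ucb1_tuned_value(1.414, 20, 0, None, 0.0), f64::INFINITY);
    }
}
```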

@@ -0,0 +1,44 @@
mod random;
use crate::state::GameState;
use crate::tree::arena::Arena;
use crate::tree::node::{Node, RewardVal};
use std::collections::HashMap;
/// The simulation policy used in the MCTS simulation phase
///
/// This policy drives the game simulations while evaluating the tree. While
/// a random policy generally works well, a game-specific policy can be provided
/// as a custom policy.
#[derive(Debug)]
pub enum SimulationPolicy<S: GameState> {
/// Random simulation policy
///
/// The sequential actions are selected randomly from the available actions
/// at each state until a terminal state is found.
Random,
/// Custom simulation policy
Custom(Box<dyn CustomSimulationPolicy<S>>),
}
/// Trait for an object implementing the simulation logic when exploring the MCTS
/// search tree.
pub trait CustomSimulationPolicy<S: GameState>: std::fmt::Debug {
/// Simulates the gameplay from the current node onward, returning the rewards
///
/// This should simulate the game until a terminal node is reached, returning
/// the final reward for each player at the terminal node
fn simulate(&self, node: &Node<S>, arena: &Arena<S>) -> HashMap<S::Player, RewardVal>;
}
pub fn simulate_reward<S: GameState>(
policy: &SimulationPolicy<S>,
node: &Node<S>,
arena: &Arena<S>,
) -> HashMap<S::Player, RewardVal> {
match policy {
SimulationPolicy::Random => random::simulate(node),
SimulationPolicy::Custom(custom_policy) => custom_policy.simulate(node, arena),
}
}
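For illustration, a hypothetical game-agnostic `Custom` policy that takes an immediately winning move when one exists and otherwise plays randomly; it assumes the reward convention described in state.rs, where 1.0 denotes a win:

```rust
use std::collections::HashMap;

use rand::prelude::SliceRandom;
use rustic_mcts::policy::simulation::CustomSimulationPolicy;
use rustic_mcts::tree::arena::Arena;
use rustic_mcts::tree::node::{Node, RewardVal};
use rustic_mcts::GameState;

// Hypothetical "win-if-you-can, otherwise random" rollout policy.
#[derive(Debug)]
struct GreedyRollout;

impl<S: GameState> CustomSimulationPolicy<S> for GreedyRollout {
    fn simulate(&self, node: &Node<S>, _arena: &Arena<S>) -> HashMap<S::Player, RewardVal> {
        let mut state = node.state.clone();
        while !state.is_terminal() {
            let player = state.get_current_player().clone();
            let actions = state.get_legal_actions();
            // Prefer an action that ends the game as a win for the player to move.
            let winning = actions.iter().find(|a| {
                let next = state.state_after_action(a);
                next.is_terminal() && next.reward_for_player(&player) >= 1.0
            });
            let action = winning
                .or_else(|| actions.choose(&mut rand::thread_rng()))
                .expect("non-terminal states must have at least one legal action");
            state = state.state_after_action(action);
        }
        state.rewards_for_players()
    }
}
```

It would then be selected via `SimulationPolicy::Custom(Box::new(GreedyRollout))`.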

@@ -0,0 +1,18 @@
//! Random play simulation policy
//!
//! Actions are chosen at random
use crate::state::GameState;
use crate::tree::node::{Node, RewardVal};
use rand::prelude::SliceRandom;
use std::collections::HashMap;
pub fn simulate<S: GameState>(node: &Node<S>) -> HashMap<S::Player, RewardVal> {
let mut state: S = node.state.clone();
while !state.is_terminal() {
let legal_actions = state.get_legal_actions();
let action = legal_actions.choose(&mut rand::thread_rng()).unwrap();
state = state.state_after_action(&action);
}
state.rewards_for_players()
}

src/state.rs (new file)
@@ -0,0 +1,90 @@
use crate::tree::node::RewardVal;
use std::collections::HashMap;
use std::fmt::Debug;
use std::hash::Hash;
/// Trait for the game state used in MCTS
///
/// When leveraging MCTS for your game, you must implement this trait to provide
/// the specifics for your game.
pub trait GameState: Clone {
/// The type of actions that can be taken in the game
type Action: Action;
/// The type of players in the game
type Player: Player;
/// Returns if the game state is terminal, i.e. the game is over
///
/// A game state is terminal when no other actions are possible. This can be
/// the result of a player winning, a draw, or because some other conditions
/// have been met leading to a game with no further possible states.
///
/// The default implementation returns true if `get_legal_actions()` returns
/// an empty list. It is recommended to override this for a more efficient
/// implementation if possible.
fn is_terminal(&self) -> bool {
let actions = self.get_legal_actions();
actions.is_empty()
}
/// Returns the list of legal actions for the game state
///
/// This method must return all possible actions that can be made from the
/// current game state.
fn get_legal_actions(&self) -> Vec<Self::Action>;
/// Returns the game state resulting from applying the action to the state
///
/// This function should not modify the current state directly, and
/// instead should modify a copy of the state and return that.
fn state_after_action(&self, action: &Self::Action) -> Self;
/// Returns the reward from the perspective of the given player for the game state
///
/// This evaluates the current state from the perspective of the given player, and
/// returns the reward indicating how good of a result the given state is for the
/// player.
///
/// This is used in the MCTS backpropagation and simulation phases to evaluate
/// the value of a given node in the search tree.
///
/// A general rule of thumb for values is:
/// - 1.0 => a win for the player
/// - 0.5 => a draw
/// - 0.0 => a loss for the player
///
/// Other values can be used for relative wins or losses
fn reward_for_player(&self, player: &Self::Player) -> RewardVal;
/// Returns the rewards for all players at the current state
fn rewards_for_players(&self) -> HashMap<Self::Player, RewardVal>;
/// Returns the player whose turn it is for the game state
///
/// This is used for evaluating the state, so for simultaneous games
/// consider the "current player" as the one from whose perspective we are
/// evaluating the game state.
fn get_current_player(&self) -> &Self::Player;
}
/// Trait used for actions that can be taken in a game
///
/// An action is dependent upon the specific game being defined, and includes
/// things like moves, attacks, and other decisions.
pub trait Action: Clone + Debug {
/// Returns a unique identifier for this action
fn id(&self) -> usize;
}
/// Trait used for players participating in a game
pub trait Player: Clone + Debug + PartialEq + Eq + Hash {}
/// Convenience implementation of a Player for usize
impl Player for usize {}
/// Convenience implementation of a Player for char
impl Player for char {}
/// Convenience implementation of a Player for String
impl Player for String {}

src/tree/arena.rs (new file)
@@ -0,0 +1,46 @@
use crate::state::GameState;
use crate::tree::node::Node;
/// An arena for Node allocation
///
/// We use an arena for node allocation to improve performance of our search.
/// The memory is contiguous which allows for faster movement through the tree,
/// as well as more efficient destruction as our MCTS search will destroy the
/// entire tree at once.
pub struct Arena<S: GameState> {
pub nodes: Vec<Node<S>>,
}
impl<S: GameState> Arena<S> {
/// Create a new Arena with the given initial capacity
///
/// The arena creates a contiguous block. By reserving an initial capacity
/// that is sufficient to encapsulate a full search tree we can reduce the
/// number of reallocs that are required. This number is highly game
/// dependent.
pub fn new(initial_capacity: usize) -> Self {
Arena {
nodes: Vec::with_capacity(initial_capacity),
}
}
/// Adds a node to the Arena, returning its identifier
///
/// This appends the node to the allocated Arena, and returns the nodes
/// index in the arena which is used as an identifier for later retrieval.
pub fn add_node(&mut self, node: Node<S>) -> usize {
let id = self.nodes.len();
self.nodes.push(node);
id
}
/// Retrieves a mutable reference to a Node in the Arena
pub fn get_node_mut(&mut self, id: usize) -> &mut Node<S> {
&mut self.nodes[id]
}
/// Retrieves a reference to a Node in the Arena
pub fn get_node(&self, id: usize) -> &Node<S> {
&self.nodes[id]
}
}
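A small, hypothetical round trip showing that `add_node` returns the index later used by the getters; `TicTacToe` again stands in for any `GameState` implementation:

```rust
use rustic_mcts::tree::arena::Arena;
use rustic_mcts::tree::node::Node;

// Hypothetical demo; `TicTacToe` is assumed to implement GameState.
fn arena_round_trip() {
    let mut arena: Arena<TicTacToe> = Arena::new(1024);
    // add_node returns the index that identifies the node from here on.
    let root_id = arena.add_node(Node::new(TicTacToe::new(), None, None));
    arena.get_node_mut(root_id).increment_visits();
    assert_eq!(arena.get_node(root_id).visits, 1);
}
```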

src/tree/mod.rs (new file)
@@ -0,0 +1,2 @@
pub mod arena;
pub mod node;

src/tree/node.rs (new file)
@@ -0,0 +1,114 @@
use std::collections::HashMap;
use std::fmt::Debug;
use crate::state::GameState;
/// The type used for reward values
pub type RewardVal = f64;
/// A node in the MCTS tree
///
/// A node represents a given game state and, using the path from the root node,
/// the actions that led to the given state. A node has a number of children
/// nodes representing the game states reachable from the given state, after
/// a given action. This creates the tree that MCTS iterates through.
///
/// This struct is not thread-safe, as the library does not provide for parallel
/// search.
#[derive(Debug)]
pub struct Node<S: GameState> {
/// The game state at the given node, after `action`
pub state: S,
/// The action that led to this state from its parent
pub action: Option<S::Action>,
/// The identifier of the parent Node
pub parent: Option<usize>,
/// The number of times this node has been visited
pub visits: u64,
/// The player's evaluation of the node
pub player_view: HashMap<S::Player, PlayerNodeView>,
/// The identifiers of children nodes, states reachable from this one
pub children: Vec<usize>,
}
impl<S: GameState> Node<S> {
pub fn new(state: S, action: Option<S::Action>, parent: Option<usize>) -> Self {
Node {
state,
action,
parent,
visits: 0,
player_view: HashMap::with_capacity(2),
children: Vec::new(),
}
}
pub fn is_leaf(&self) -> bool {
self.children.is_empty()
}
pub fn reward_sum(&self, player: &S::Player) -> RewardVal {
match self.player_view.get(player) {
Some(pv) => pv.reward_sum,
None => 0.0,
}
}
pub fn reward_average(&self, player: &S::Player) -> RewardVal {
match self.player_view.get(player) {
Some(pv) => pv.reward_average,
None => 0.0,
}
}
pub fn rewards(&self, player: &S::Player) -> Option<&Vec<RewardVal>> {
match self.player_view.get(player) {
Some(pv) => Some(&pv.rewards),
None => None,
}
}
pub fn increment_visits(&mut self) {
self.visits += 1
}
pub fn record_player_reward(&mut self, player: S::Player, reward: RewardVal) {
let pv = self
.player_view
.entry(player)
.or_insert(PlayerNodeView::default());
pv.rewards.push(reward);
pv.reward_sum += reward;
pv.reward_average = pv.reward_sum / pv.rewards.len() as f64;
}
}
/// A player's specific perspective of a node's value
///
/// Each player has their own idea of the value of a node.
#[derive(Debug)]
pub struct PlayerNodeView {
/// The total reward from simulations through this node
pub reward_sum: RewardVal,
/// The average reward from simulations through this node, often called the node value
pub reward_average: RewardVal,
/// The rewards we have gotten so far for simulations through this node
pub rewards: Vec<RewardVal>,
}
impl Default for PlayerNodeView {
fn default() -> Self {
PlayerNodeView {
reward_sum: 0.0,
reward_average: 0.0,
rewards: Vec::new(),
}
}
}