###########################################################################
#
# Q-learning
#
###########################################################################

#
# Tmat[s,a] - the state reached from state s by taking action a
# Rmat[s,a] - the reward the agent receives for taking action a in state s
# Fvec      - vector of terminal states
#
qlearning <- function(Tmat, Rmat, Fvec, gamma = 0.9) {
  # number of states and actions
  nstates <- nrow(Tmat)
  nactions <- ncol(Tmat)

  # initialize the Q matrix to 0
  Qmat <- matrix(0, nrow = nstates, ncol = nactions)

  alpha <- 1

  while (alpha > 0.1) {
    # pick a random starting state
    cur.state <- sample(1:nstates, 1)

    # until we reach a terminal state
    while (!(cur.state %in% Fvec)) {
      # which actions are available in the current state
      possible.actions <- which(!is.na(Tmat[cur.state,]))

      # pick a random action
      action <- possible.actions[sample(length(possible.actions), 1)]

      # the chosen action determines the next state
      next.state <- Tmat[cur.state, action]

      # update the Q matrix for the current state and the chosen action
      Qmat[cur.state, action] <- Qmat[cur.state, action] + alpha *
        (Rmat[cur.state, action] + gamma * max(Qmat[next.state,], na.rm = T) - Qmat[cur.state, action])

      cur.state <- next.state
    }

    alpha <- alpha * 0.999
  }

  # return the normalized Q matrix
  Qmat / max(Qmat)
}


###############################################################
#
# Maze with Q-learning
#
###############################################################

# transition matrix
# rows are maze cells (1..25)
# columns are moves (1 - desno/right, 2 - gor/up, 3 - levo/left, 4 - dol/down)
#
Tlab <- matrix(
  c( 2,  1,  1,  6,
     3,  2,  1,  2,
     4, 23,  2,  8,
     5,  4,  3,  4,
     5,  5,  4, 10,
     6,  1,  6, 11,
     8,  7,  7, 12,
     9,  3,  7,  8,
     9,  9,  8, 14,
    10,  5, 10, 15,
    11,  6, 15, 16,
    12,  7, 12, 17,
    13, 13, 13, 18,
    15,  9, 14, 19,
    11, 10, 14, 20,
    17, 11, 16, 21,
    18, 12, 16, 22,
    18, 13, 17, 23,
    19, 14, 19, 24,
    20, 15, 20, 25,
    21, 16, 21, 21,
    22, 17, 22, 22,
    23, 18, 23,  3,
    25, 19, 24, 24,
    25, 20, 24, 25),
  nrow = 25, ncol = 4, byrow = TRUE)

# reward matrix
WALL <- -1    # penalty for bumping into a wall
MOVE <- 0     # a plain move is neutral
BOMB <- -100  # penalty for stepping on a bomb
GOAL <- 100   # reward for reaching the exit

Rlab <- matrix(
  c(MOVE, WALL, WALL, BOMB,
    MOVE, WALL, MOVE, WALL,
    MOVE, MOVE, MOVE, MOVE,
    MOVE, WALL, MOVE, WALL,
    WALL, WALL, MOVE, MOVE,
    WALL, MOVE, WALL, MOVE,
    MOVE, WALL, WALL, BOMB,
    MOVE, MOVE, MOVE, WALL,
    WALL, WALL, MOVE, MOVE,
    WALL, MOVE, WALL, MOVE,
    WALL, BOMB, MOVE, MOVE,
    WALL, MOVE, WALL, MOVE,
    WALL, WALL, WALL, MOVE,
    MOVE, MOVE, WALL, MOVE,
    MOVE, MOVE, MOVE, BOMB,
    MOVE, MOVE, WALL, MOVE,
    MOVE, BOMB, MOVE, GOAL,
    WALL, MOVE, MOVE, MOVE,
    WALL, MOVE, WALL, MOVE,
    WALL, MOVE, WALL, MOVE,
    WALL, MOVE, WALL, WALL,
    WALL, MOVE, WALL, WALL,
    WALL, MOVE, WALL, MOVE,
    MOVE, MOVE, WALL, WALL,
    WALL, BOMB, MOVE, WALL),
  nrow = 25, ncol = 4, byrow = TRUE)

# state 22 (the exit) is the only terminal state
Qlab <- qlearning(Tlab, Rlab, 22)

smeri <- c("desno", "gor", "levo", "dol")  # right, up, left, down

# print the optimal policy (1 - desno/right, 2 - gor/up, 3 - levo/left, 4 - dol/down)
apply(Qlab, 1, function(x){smeri[which(x == max(x))]})
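
# Optional readability aid (a sketch, not part of the exercise): assuming the 25 states
# are laid out row by row in a 5x5 grid (moving "dol" adds 5 to the state index, as the
# transition matrix Tlab above suggests), the greedy policy can be printed in the shape
# of the maze. which.max() keeps only the first best action when several are tied.
policy <- sapply(1:nrow(Qlab), function(s) smeri[which.max(Qlab[s, ])])
matrix(policy, nrow = 5, byrow = TRUE)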
#
# check what happens to the policy in states 1 and 7
# if we use the following penalties / rewards...
#
# WALL <- -2
# MOVE <- -1
# BOMB <- -10
# GOAL <- 100
#

# the task can also be defined so that an episode ends as soon as the agent steps on a trap;
# episodes then terminate on the bomb cells (6, 12, 20) as well as on the exit (22)
Qlab <- qlearning(Tlab, Rlab, c(6, 12, 20, 22))

# print the optimal policy (1 - desno/right, 2 - gor/up, 3 - levo/left, 4 - dol/down)
apply(Qlab, 1, function(x){smeri[which(x == max(x))]})


#############################################################
#
# SIMULATED ENVIRONMENT
#
#############################################################

library(grid)

KEYPOS <- c(6,2)
EXITPOS <- c(4,10)

getLabyrinth <- function() {
  lab <- matrix(c(
    1, 1, 1, 1, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 1,
    1, 0, 1, 1, 1, 0, 1,
    1, 0, 1, 0, 0, 0, 1,
    1, 0, 1, 0, 1, 0, 1,
    1, 0, 1, 0, 1, 0, 1,
    1, 0, 1, 0, 1, 0, 1,
    1, 0, 0, 0, 0, 0, 1,
    1, 0, 1, 0, 1, 0, 1,
    1, 0, 0, 0, 0, 0, 1,
    1, 1, 1, 1, 1, 1, 1
  ), nrow=11, byrow=T)

  sel <- lab == 1
  lab[sel] <- rgb(0, 0, 0)   # wall
  lab[!sel] <- rgb(1, 1, 1)  # corridor

  lab
}

getNextPos <- function(lab, pos, dir) {
  nextpos <- pos

  if (dir == 1)       # right
    nextpos <- nextpos + c(1,0)
  else if (dir == 2)  # up
    nextpos <- nextpos + c(0,1)
  else if (dir == 3)  # left
    nextpos <- nextpos + c(-1,0)
  else if (dir == 4)  # down
    nextpos <- nextpos + c(0,-1)
  else
    stop("unknown dir")

  # the move is valid only if the target cell is a corridor
  if (lab[nextpos[2], nextpos[1]] == rgb(1, 1, 1))
    nextpos
  else
    pos
}

# is the given position a junction (more than two free directions)?
isAtJunction <- function(lab, pos) {
  n <- 0

  for (i in 1:4) {
    if (!identical(getNextPos(lab, pos, i), pos))
      n <- n+1
  }

  n > 2
}

# randomly pick one of the free directions from the given position
selectFreeDirection <- function(lab, pos) {
  dir <- vector()

  for (i in 1:4) {
    if (!identical(getNextPos(lab, pos, i), pos))
      dir <- c(dir, i)
  }

  sample(dir, 1, F)
}

# pick a random corridor cell
getRandomPosition <- function(lab) {
  pos <- vector()

  while(TRUE) {
    pos[1] <- sample(1:ncol(lab), 1, F)
    pos[2] <- sample(1:nrow(lab), 1, F)

    if (lab[pos[2], pos[1]] == rgb(1,1,1))
      return(pos)
  }
}

# map a multi-dimensional (column-major) grid index to a flat index
getFlattenInd <- function (gridInd, gridDim) {
  gridInd <- t(gridInd)
  pdim <- c(1, cumprod(gridDim[-length(gridDim)]))
  gridInd <- gridInd - 1
  colSums(apply(gridInd, 1, "*", pdim)) + 1
}
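
# Quick sanity check (an illustrative sketch; the array A and the dimensions c(7, 11, 2)
# are made-up example values): getFlattenInd() should agree with R's own column-major
# linearisation of array indices, so the two expressions below return the same number (108).
A <- array(seq_len(prod(c(7, 11, 2))), dim = c(7, 11, 2))  # A[i,j,k] equals its own linear index
getFlattenInd(c(3, 5, 2), dim(A))  # flat index computed by the helper
A[3, 5, 2]                         # the same value read via multi-dimensional indexing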
# choose the greedy action for the current state according to the Q array
selectAction <- function(simData, Q) {
  state <- getStateDesc(simData)
  len <- length(state)

  if (len > 1) {
    d <- dim(Q)
    n <- getFlattenInd(state, d[1:len])
  } else {
    n <- state[1]
  }

  # Q-values of all actions for the current state
  vals <- apply(Q, len+1, '[', n)
  m <- max(vals)
  candidates <- which(vals==m)

  # break ties randomly
  if (length(candidates) == 1)
    action <- candidates
  else
    action <- sample(candidates,1)

  action
}

prepareSimData <- function() {
  lab <- getLabyrinth()

  data <- list()
  data$lab <- lab
  data$playerPos <- getRandomPosition(lab)
  data$playerDir <- NA
  data$playerKey <- FALSE
  data$playerCaught <- FALSE
  data$playerEscaped <- FALSE
  data$hunterPos <- getRandomPosition(lab)
  data$hunterDir <- NA

  data
}

simulationDraw <- function(simData) {
  screen <- simData$lab
  screen[EXITPOS[2], EXITPOS[1]] <- rgb(0, 0, 1)  # exit

  if (!simData$playerKey)
    screen[KEYPOS[2], KEYPOS[1]] <- rgb(1, 1, 0)  # key

  screen[simData$playerPos[2], simData$playerPos[1]] <- rgb(0, 1, 0)  # agent
  screen[simData$hunterPos[2], simData$hunterPos[1]] <- rgb(1, 0, 0)  # hunter

  grid.newpage()
  grid.raster(screen, interpolate=F)
}

simulationStep <- function(simData, Q=NULL) {
  if (!is.null(Q))
    simData$playerDir <- selectAction(simData, Q)
  else if (is.na(simData$playerDir))
    simData$playerDir <- selectFreeDirection(simData$lab, simData$playerPos)

  nextPlayerPos <- getNextPos(simData$lab, simData$playerPos, simData$playerDir)

  if (identical(simData$playerPos, nextPlayerPos)) {
    # the move was blocked by a wall
    simData$playerDir <- NA
    nextPlayerPos <- simData$playerPos
  }

  # the hunter picks a new random direction at junctions or when blocked
  if (is.na(simData$hunterDir) || isAtJunction(simData$lab, simData$hunterPos))
    simData$hunterDir <- selectFreeDirection(simData$lab, simData$hunterPos)

  nextHunterPos <- getNextPos(simData$lab, simData$hunterPos, simData$hunterDir)

  if (identical(simData$hunterPos, nextHunterPos)) {
    simData$hunterDir <- NA
    nextHunterPos <- simData$hunterPos
  }

  if (abs(nextPlayerPos[1] - nextHunterPos[1]) + abs(nextPlayerPos[2] - nextHunterPos[2]) <= 1)
    simData$playerCaught <- TRUE
  else if (nextPlayerPos[1] == EXITPOS[1] && nextPlayerPos[2] == EXITPOS[2] && simData$playerKey)
    simData$playerEscaped <- TRUE
  else if (nextPlayerPos[1] == KEYPOS[1] && nextPlayerPos[2] == KEYPOS[2])
    simData$playerKey <- TRUE

  simData$playerPos <- nextPlayerPos
  simData$hunterPos <- nextHunterPos

  simData
}

simulation <- function(Q=NULL, maxSteps=500) {
  simData <- prepareSimData()
  steps <- 0

  while(!simData$playerCaught && !simData$playerEscaped && steps < maxSteps) {
    simData <- simulationStep(simData, Q)
    simulationDraw(simData)
    steps <- steps + 1
    Sys.sleep(0.1)
  }

  if (simData$playerEscaped)
    grid.text("WIN!", x=0.5, y=0.5, gp=gpar(fontsize=80, col="blue"))
  else if (simData$playerCaught)
    grid.text("CAUGHT!", x=0.5, y=0.5, gp=gpar(fontsize=80, col="red"))
}

# If we do not train the agent (no Q matrix is supplied), it picks a random action at every simulation step.
# Run 5 simulations.
set.seed(0)
for (i in 1:5) {
  simulation()
  Sys.sleep(1)
}


# The argument dimStateSpace is a vector of the same length as the state descriptions returned by getStateDesc.
# Each element of dimStateSpace gives the largest value that the corresponding element of a state description can take.
qlearning <- function(dimStateSpace, gamma = 0.9, maxtrials = 1000, maxsteps = 500) {
  dimQ <- c(dimStateSpace, 4)  # 4 actions (right, up, left, down)
  Q <- array(0, dim=dimQ)

  alpha <- 1
  ntrials <- 0

  while (alpha > 0.1 && ntrials < maxtrials) {
    simData <- prepareSimData()
    curState <- getStateDesc(simData)
    steps <- 0

    while (!simData$playerCaught && !simData$playerEscaped && steps < maxsteps) {
      # pick a random action; keep a local copy, because simulationStep()
      # resets playerDir to NA when the move is blocked by a wall
      action <- sample(1:4, 1, T)  # 4 actions (right, up, left, down)
      simData$playerDir <- action
      simData <- simulationStep(simData)

      nextState <- getStateDesc(simData)
      reward <- getReward(curState, action, nextState)

      # flat index of the (state, action) pair in the Q array
      curStatePos <- getFlattenInd(c(curState, action), dimQ)

      # Q-values of all actions in the next state
      len <- length(nextState)
      if (len > 1) {
        n <- getFlattenInd(nextState, dimStateSpace)
      } else {
        n <- nextState[1]
      }
      vals <- apply(Q, len+1, '[', n)

      # Q-learning update
      Q[curStatePos] <- Q[curStatePos] + alpha * (reward + gamma * max(vals, na.rm = T) - Q[curStatePos])

      curState <- nextState
      steps <- steps + 1
    }

    ntrials <- ntrials + 1
    alpha <- alpha * 0.9999

    print(paste("trial", ntrials, "out of max", maxtrials), quote=F)
    flush.console()
  }

  Q / max(Q)
}


# The set of states must be finite and discrete.
# Respect the following constraints:
#   - all states must be described by vectors of the same length
#   - every element of the description vector must be a positive integer
#
# For fast and reliable learning it is desirable to keep the number of distinct states as small as possible!

# A simple first idea: the state contains only the agent's current position
getStateDesc <- function(simData) {
  c(simData$playerPos[1], simData$playerPos[2])
}
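
# A small illustration (sketch; exampleSim is just a throwaway name): with this state
# description the state space has 7 * 11 = 77 states, so the Q array built by
# qlearning(c(7, 11)) has dimensions 7 x 11 x 4 and Q[x, y, ] holds the four action
# values for the cell (x, y).
exampleSim <- prepareSimData()
getStateDesc(exampleSim)  # two positive integers, as required above
prod(c(7, 11))            # number of distinct states
prod(c(7, 11, 4))         # number of entries in the Q array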
# The function returns the reward (or penalty) the agent receives in the described situation.
# The reward should encourage the agent to take useful actions and discourage harmful ones.
getReward <- function(oldstate, action, newstate) {
  # for now every action returns a reward of 1
  reward <- 1
  reward
}


# For the initial implementation of getStateDesc the parameter dimStateSpace is the two-element vector c(7, 11).
set.seed(0)
qmat <- qlearning(c(7, 11), gamma=0.99, maxtrials=1000)

for (i in 1:5) {
  simulation(qmat)
  Sys.sleep(1)
}


# Now, besides the agent's position, the state description also includes the hunter's position
# and a flag indicating whether the agent has picked up the key.
getStateDesc <- function(simData) {
  c(simData$playerPos[1], simData$playerPos[2],
    simData$hunterPos[1], simData$hunterPos[2],
    ifelse(simData$playerKey, 2, 1))
}

# The rewards are also handed out according to the rules of the game.
getReward <- function(oldstate, action, newstate) {
  reward <- 0

  if (abs(newstate[1] - newstate[3]) + abs(newstate[2] - newstate[4]) <= 1)        # the hunter caught the agent
    reward <- -1000
  else if (newstate[1] == KEYPOS[1] && newstate[2] == KEYPOS[2] && oldstate[5]==1) # the agent picked up the key
    reward <- 1000
  else if (newstate[1] == EXITPOS[1] && newstate[2] == EXITPOS[2] && newstate[5]==2) # the agent escaped
    reward <- 5000
  else if (newstate[1] == oldstate[1] && newstate[2] == oldstate[2])               # invalid move (the agent bumped into a wall)
    reward <- -1

  reward
}


# Now the problem space is much larger.
# If we run too few experiments, the agent will not learn the task completely.
set.seed(0)
qmat <- qlearning(c(7, 11, 7, 11, 2), gamma=0.9, maxtrials=1000)
# save(qmat, file="qmat1000.RData")
# load(file="qmat1000.RData")

set.seed(0)
for (i in 1:5) {
  simulation(qmat)
  Sys.sleep(1)
}


# Increase the number of learning iterations (experiments).
set.seed(0)
qmat <- qlearning(c(7, 11, 7, 11, 2), gamma=0.9, maxtrials=10000)
# save(qmat, file="qmat10000.RData")
# load(file="qmat10000.RData")

set.seed(0)
for (i in 1:10) {
  simulation(qmat)
  Sys.sleep(1)
}
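
# Optional inspection helper (a sketch, not part of the exercise): read the greedy action
# for a chosen situation directly out of the learned 5-D Q array. greedyAction and the
# example positions below are illustration-only assumptions; both positions must be
# corridor cells of the labyrinth.
greedyAction <- function(qmat, playerPos, hunterPos, hasKey) {
  vals <- qmat[playerPos[1], playerPos[2], hunterPos[1], hunterPos[2], ifelse(hasKey, 2, 1), ]
  smeri[which.max(vals)]  # first best direction ("desno", "gor", "levo", "dol")
}

# e.g. the recommended move when the agent stands just left of the key and the hunter is far away
greedyAction(qmat, playerPos = c(5, 2), hunterPos = c(2, 9), hasKey = FALSE)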