function [sigma, mu]=PolicyGradient(L, M, T, options, winx,winy)
  % POLICYGRADIENT  Train a linear-Gaussian torque policy by policy gradient.
  %
  % The action is sampled as  a = mu' * state + sigma * randn  and then
  % clipped to the torque limits. For each of L iterations, M episodes of T
  % steps are simulated, the gradient of the log-policy is accumulated per
  % episode, a variance-reducing baseline is subtracted from the discounted
  % returns, and mu / sigma are moved along the estimated gradient.
  %
  % Inputs:
  %   L        number of policy-update iterations
  %   M        number of episodes sampled per iteration
  %   T        number of simulation steps per episode
  %   options  struct with fields
  %              .gamma  discount factor applied as gamma^(t-1)
  %              .alpha  step size of the gradient update
  %   winx, winy  passed unchanged to startSimulation
  %               (presumably the window size -- confirm against the simulator)
  %
  % Outputs:
  %   sigma    scalar standard deviation of the exploration noise
  %   mu       (N-1)x1 weight vector of the linear policy mean
  %
  % Relies on simulator functions defined elsewhere: startSimulation,
  % resetSimulation, getJointState, getBaseState, stepSimulation, drawWorld.

  startSimulation (winx,winy);
  MaxTorque = 100;               % upper torque limit
  MinTorque = -100;              % lower torque limit
  N = 19;                        % number of policy parameters (mu: 18, sigma: 1)
  nState = N-1;                  % state dimension; must match length(mu)
  ibstate = getBaseState();      % base state right after start-up (reward reference)

  % Random initial policy parameters.
  mu = rand(nState,1)-0.5;
  sigma = rand*10;

  % Policy iteration.
  for l=1:L
    dr = 0;
    % NOTE(review): this seeds rand only; the exploration noise below is
    % drawn with randn, which may keep its own generator state -- confirm.
    rand('state',l);

    % Per-iteration buffers, allocated once instead of grown inside the loops.
    drs = zeros(1,M);            % discounted return of each episode
    der = zeros(M,N);            % per-episode gradient of the log-policy
    rewards = zeros(M,T);        % immediate reward at every step

    % Episodes.
    for m=1:M
      resetSimulation();

      for t=1:T
        % Observe the robot.
        jstate = getJointState();
        bstate = getBaseState();

        % Build the state vector. The original allocated N-2 entries and
        % relied on implicit growth to reach the 18 entries used here.
        state = zeros(nState,1);
        state(1:16) = jstate;   % joint observations (presumably 8 angles + 8 velocities)
        state(17) = bstate(3);  % base-state component 3 (presumably z position)
        state(18) = bstate(10); % base-state component 10 (presumably z velocity)

        % Sample an action from the Gaussian policy and clip it to the limits.
        action = randn*sigma + mu'*state;
        action = min(action,MaxTorque); % enforce the upper limit
        action = max(action,MinTorque); % enforce the lower limit

        % Apply the same torque to actuators 2, 4, 6 and 8; the rest stay at 0.
        u = zeros(1,8);
        u([2 4 6 8]) = action;

        stepSimulation (u,0.0005);
        % Redraw every 50th step. The original also tested t==0, which can
        % never be true because t starts at 1.
        if(mod(t,50)==0)
           drawWorld;
        end

        % Observe the base state after the step.
        abstate = getBaseState();

        % Deviation of the executed (clipped) action from the policy mean.
        dev = action - mu'*state;

        % Gradient of the log-policy with respect to mu.
        der(m,1:N-1) = der(m,1:N-1) + (dev*state/(sigma^2))';

        % Gradient of the log-policy with respect to sigma.
        der(m,N) = der(m,N) + (dev^2-sigma^2)/(sigma^3);

        % Reward: change of base-state component 1 relative to the start-up
        % state, accumulated with discounting.
        discount = options.gamma^(t-1);
        rewards(m,t) = abstate(1) - ibstate(1);
        drs(m) = drs(m) + discount*rewards(m,t);
        dr = dr + discount*rewards(m,t);
      end
    end

    % Variance-reducing baseline: returns weighted by the squared gradient
    % norm of each episode. sum(der.^2,2) equals diag(der*der') without
    % forming the MxM product; a zero denominator would otherwise yield NaN.
    sqnorm = sum(der.^2,2);
    denom = sum(sqnorm);
    if (denom > 0)
      b = drs * sqnorm / denom;
    else
      b = 0;
    end

    % Gradient estimate of the expected return.
    derJ = 1/M * ((drs-b) * der)';

    % Parameter update.
    mu = mu + options.alpha * derJ(1:N-1);
    sigma = sigma + options.alpha * derJ(N);

    printf("%d) Max=%.2f Min=%.2f Avg=%.2f Dsum=%.2f\n",l,max(max(rewards)), min(min(rewards)), mean(mean(rewards)), dr/M);
    fflush(stdout);
  end