#!/usr/bin/python -Wall

# ================================================================
# Some simple statistics routines.
#
# John Kerl
# kerl.john.r@gmail.com
# 2007-06-28
#
# ================================================================
# This software is released under the terms of the GNU GPL.
# Please see LICENSE.txt in the same directory as this file.
# ================================================================

from __future__ import division # 1/2 = 0.5, not 0.
import math
import copy
import sackmat_m
import normal_m
import sys

# ----------------------------------------------------------------
def make_histogram(farray, lo, hi, num_bins):
    bins = [0] * num_bins

    # Ensure they are floats.
    lo *= 1.0; hi *= 1.0

    # This is loop-invariant; hoist it out.
    mul = num_bins / (hi - lo)

    for element in farray:
        if ((element >= lo) and (element < hi)):
            idx = int((element-lo) * mul)
            bins[idx] += 1
    return bins

# ----------------------------------------------------------------
def make_histogram2(xy_pairs, lo_x, hi_x, num_x_bins, lo_y, hi_y, num_y_bins):
    bins = sackmat_m.make_zero_matrix(num_x_bins, num_y_bins)

    # Ensure they are floats.
    lo_x *= 1.0; hi_x *= 1.0
    lo_y *= 1.0; hi_y *= 1.0

    # These are loop-invariant; hoist them out.
    mul_x = num_x_bins / (hi_x - lo_x)
    mul_y = num_y_bins / (hi_y - lo_y)

    for [x, y] in xy_pairs:
        if ((x >= lo_x) and (x < hi_x) and (y >= lo_y) and (y < hi_y)):
            x_idx = int((x-lo_x) * mul_x)
            y_idx = int((y-lo_y) * mul_y)
            bins[x_idx][y_idx] += 1
    return bins

# ----------------------------------------------------------------
def make_histogram_labels(lo, hi, num_bins, center=0):
    labels = []
    delta = (hi-lo) / (1.0*num_bins)
    for i in range(0, num_bins):
        if (center):
            label = lo + (i + 0.5) * delta
        else:
            label = lo + i*delta
        labels.append(label)
    return labels

# ----------------------------------------------------------------
def find_min(farray):
    lo = farray[0]
    for element in farray:
        if (element < lo):
            lo = element
    return lo

# ----------------------------------------------------------------
def find_max(farray):
    hi = farray[0]
    for element in farray:
        if (element > hi):
            hi = element
    return hi

# ----------------------------------------------------------------
def find_bounds(farray):
    lo = farray[0]
    hi = farray[0]
    for element in farray:
        if (element < lo):
            lo = element
        if (element > hi):
            hi = element
    return [lo, hi]

# ----------------------------------------------------------------
# Scalar mean
def find_mean(farray):
    sum = 0.0
    for element in farray:
        sum += element
    return sum / len(farray)

# ----------------------------------------------------------------
# Scalar standard deviation (sample standard deviation, i.e. normalized
# by n-1).
def find_stddev(farray, mean):
    sum = 0.0
    for element in farray:
        sum += (element - mean) ** 2
    return math.sqrt(sum / (len(farray) - 1))
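
# ----------------------------------------------------------------
# A minimal usage sketch for the routines above.  This demo is an added
# illustration, not part of the original API; the expected values in the
# comments are hand-computed.
def _demo_basic_stats():
    data = [0.1, 0.2, 0.25, 0.8]
    print make_histogram(data, 0.0, 1.0, 4) # [2, 1, 0, 1]
    mean = find_mean(data)                  # 0.3375
    print mean, find_stddev(data, mean)     # sample stddev, approx. 0.3146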

# ----------------------------------------------------------------
# Standard error, i.e. standard deviation of the sample mean.
def find_stderr(farray, mean):
    s = find_stddev(farray, mean)
    n = len(farray)
    return s / math.sqrt(n)

# ----------------------------------------------------------------
# Population variance
def find_popvaraux(xs, mean):
    sum = 0.0
    for x in xs:
        sum += (x - mean) ** 2
    return sum / len(xs)

def find_popvar(xs):
    return find_popvaraux(xs, find_mean(xs))

# Sample variance
def find_sampvaraux(xs, mean):
    sum = 0.0
    for x in xs:
        sum += (x - mean) ** 2
    return sum / (len(xs) - 1)

def find_sampvar(xs):
    return find_sampvaraux(xs, find_mean(xs))

def find_var_of_sample_mean(xs):
    return find_sampvar(xs) / len(xs)

# ----------------------------------------------------------------
# Fourth-order cumulant:
#
#   U = 1 - <X^4> / (3 <X^2>^2)
#
# where X is taken to have zero mean.
def find_fourth_order_cumulant(farray):
    n = len(farray)
    sumx  = 0.0
    sumx2 = 0.0
    sumx4 = 0.0
    for x in farray:
        sumx += x
    meanx = sumx / n
    for y in farray:
        x  = y - meanx
        x2 = x*x
        x4 = x2*x2
        sumx2 += x2
        sumx4 += x4
    meanx2 = sumx2 / n
    meanx4 = sumx4 / n
    if meanx2 == 0.0:
        return 0.0
    else:
        return 1.0 - meanx4 / (3 * meanx2 * meanx2)

# Single-pass variant of the above:  (x - xbar)**4 and ((x - xbar)**2)**2 are
# expanded binomially in terms of power sums, so only one pass over the data
# is needed.
def find_fourth_order_cumulant_temp(farray):
    n = len(farray)
    sumx  = 0.0
    sumx2 = 0.0
    sumx3 = 0.0
    sumx4 = 0.0
    for x in farray:
        sumx  += x
        sumx2 += x*x
        sumx3 += x*x*x
        sumx4 += x*x*x*x
    xbar  = sumx / n
    xbar2 = xbar * xbar
    xbar3 = xbar * xbar2
    xbar4 = xbar * xbar3

    numer = \
        sumx4 \
        - 4 * xbar  * sumx3 \
        + 6 * xbar2 * sumx2 \
        - 4 * xbar3 * sumx \
        + n * xbar4
    denom = \
        sumx2 * sumx2 \
        + 4 * xbar2 * sumx * sumx \
        + n * n * xbar4 \
        + 2 * n * xbar2 * sumx2 \
        - 4 * xbar  * sumx * sumx2 \
        - 4 * n * xbar3 * sumx

    if denom == 0.0:
        return 0.0
    else:
        return 1.0 - (n * numer) / (3 * denom)

# ----------------------------------------------------------------
# Sample covariance of two lists.
def find_sample_covariance(xs, ys):
    N = len(xs)
    mean_x = find_mean(xs)
    mean_y = find_mean(ys)

    sum = 0.0
    for k in range(0, N):
        sum += (xs[k] - mean_x) * (ys[k] - mean_y)

    return sum / (N-1.0)

# ----------------------------------------------------------------
# Sample correlation of two lists:
#   Corr(X,Y) = Cov(X,Y) / (sigma_X sigma_Y).
def find_sample_correlation(xs, ys):
    N = len(xs)
    cov = find_sample_covariance(xs, ys)
    mean_x = find_mean(xs)
    mean_y = find_mean(ys)
    stddev_x = find_stddev(xs, mean_x)
    stddev_y = find_stddev(ys, mean_y)
    return cov / (stddev_x * stddev_y)

# ----------------------------------------------------------------
# Integrated autocorrelation time, as a running sum over the sample
# autocorrelations:  tauint[t] = 1 + 2 sum_{s=1}^{t} autocorr[s].
def find_sample_tauint(xs, numk=0):
    N = len(xs)
    if numk == 0:
        numk = N
    tauint = [0.0] * numk
    autocorr = find_sample_autocorr(xs, numk)
    sum = 1.0
    tauint[0] = sum
    for t in range(1, numk):
        sum += 2 * autocorr[t]
        tauint[t] = sum
    return tauint
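
# ----------------------------------------------------------------
# A small sanity-check sketch for the variance/covariance helpers defined
# earlier in this file (added illustration, not part of the original API).
# For xs = [1, 2, 3, 4] the sample variance is 5/3, and the sample covariance
# of a list with itself equals its sample variance.
def _demo_variance():
    xs = [1.0, 2.0, 3.0, 4.0]
    print find_sampvar(xs)               # 1.6666...
    print find_sample_covariance(xs, xs) # same value
    print find_var_of_sample_mean(xs)    # 5/12 = 0.41666...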

# ----------------------------------------------------------------
# Computes a running sum of $\hat{\tau}_{\mathrm{int}}(t)$.
# Stop once a flat spot has been reached, detected via a turning point,
# i.e. the first negative autocorrelation term.
# Be clever about re-using previous sums.
#
# See my dissertation for details.  Or, Berg's _Markov Chain Monte Carlo
# Simulations and Their Statistical Analysis_.
def find_sample_tauint_flat_spot(xs):
    N = len(xs)

    # Within-window sums sumi, sumi2, sumj, and sumj2 contain terms which can
    # be re-used across k.  The cross-sums sumij are different for each k and
    # must be recomputed.
    #
    # Here, compute the full-window sums.
    sumi = 0.0; sumi2 = 0.0
    for k in range(0, N):
        x = xs[k]
        sumi  += x
        sumi2 += x*x
    sumj = sumi; sumj2 = sumi2

    tauint = 1.0

    # Go up only to t=N-2.  The autocorr estimator for t=N-1 doesn't work
    # (only one sample), and if we haven't found a flat spot of tauint by
    # then, we aren't going to.
    for t in range(1, N-1):
        winsz = N - t
        winszm1 = winsz - 1
        denom = winszm1
        if winszm1 == 0:
            denom = 1
        i = t; j = N-t-1

        # Update the within-window sums.
        xim = xs[i-1]; xjp = xs[j+1]
        sumi -= xim; sumi2 -= xim*xim
        sumj -= xjp; sumj2 -= xjp*xjp

        # Compute the cross-sum.
        sumij = 0.0
        for k in range(0, winsz):
            sumij += xs[k] * xs[t+k]

        # Compute the autocorrelation term for this t.
        meani = sumi / winsz; meanj = sumj / winsz
        stdi = math.sqrt((sumi2 - (sumi**2 / winsz)) / denom)
        stdj = math.sqrt((sumj2 - (sumj**2 / winsz)) / denom)
        autocorr_t = 0.0
        if (stdi != 0.0) and (stdj != 0.0):
            autocorr_t = (sumij / winsz - (meani*meanj)) / (stdi*stdj)

        tauint += 2.0 * autocorr_t
        #print '%d %11.7f %11.7f' % (t, autocorr_t, tauint)
        if autocorr_t < 0.0:
            return tauint
    return tauint

# ----------------------------------------------------------------
# Autocorrelation estimator at lag k = j-i:
#
#   E[X_i X_j] - mu^2
#   -----------------
#        sigma^2
#
# More efficient autocorrelation:
#
# The cross-window sums must be computed for each k, because they differ for
# each k.  But the within-window sums may be accumulated, as long as we start
# with k at the end and work our way backward to the beginning.
#
# Example with N = 8:
#
#   [ 0 ]                      k = 7
#   [ o ]
#                 [ o ]
#                 [ 7 ]
#
#   [ 0 1 ]                    k = 6
#   [ o o ]
#               [ o o ]
#               [ 6 7 ]
#   ...
#   [ 0 1 2 3 4 5 6 ]          k = 1
#   [ o o o o o o o ]
#     [ o o o o o o o ]
#     [ 1 2 3 4 5 6 7 ]
#
#   [ 0 1 2 3 4 5 6 7 ]        k = 0
#   [ o o o o o o o o ]
#   [ o o o o o o o o ]
#   [ 0 1 2 3 4 5 6 7 ]
# ----------------------------------------------------------------
# Pass numk=0 or numk=N to compute all possible autocorrelations.  Pass numk =
# something smaller to compute only the first numk autocorrelations.
def find_sample_autocorr(xs, numk=0):
    N = len(xs)
    if numk == 0:
        numk = N
    autocorr = [0.0] * numk

    if (numk <= 1) or (numk > N):
        print >> sys.stderr, \
            "find_sample_autocorr: numk must be > 1 and <= N=%d; got %d." \
            % (N, numk)
        sys.exit(1)

    # Sums over first and second windows
    sumi = 0.0; sumi2 = 0.0
    sumj = 0.0; sumj2 = 0.0

    # k = N-1 down to numk: accumulate the within-window sums only.  These
    # lags are either not requested, or (at k = N-1) have windows with only
    # one sample each, for which the standard deviations would get a
    # division by zero.
    i = 0; j = N-1
    for k in range(N-1, numk-1, -1):
        xi = xs[i]; xj = xs[j]
        sumi += xi; sumi2 += xi**2
        sumj += xj; sumj2 += xj**2
        i += 1; j -= 1

    # k = numk-1 down to 1.
    for k in range(numk-1, 0, -1):
        xi = xs[i]; xj = xs[j]
        sumi += xi; sumi2 += xi**2
        sumj += xj; sumj2 += xj**2
        winsz = N-k; winszm1 = winsz - 1

        if k == N-1:
            # Leave autocorr[N-1] = 0.0 to avoid division by zero.
            i += 1; j -= 1
            continue

        sumij = 0.0
        for ell in range(0, winsz):
            sumij += xs[ell] * xs[ell+k]

        meani = sumi / winsz; meanj = sumj / winsz
        stdi = math.sqrt((sumi2 - (sumi**2 / winsz)) / winszm1)
        stdj = math.sqrt((sumj2 - (sumj**2 / winsz)) / winszm1)

        if (stdi == 0.0) or (stdj == 0.0):
            autocorr[k] = 0.0
        else:
            autocorr[k] = (sumij / winsz - (meani*meanj)) / (stdi*stdj)
        i += 1; j -= 1

    # k = 0: the two windows coincide, so sumij, sumi2, and sumj2 are all
    # the same.
    k = 0
    winsz = N; winszm1 = N-1
    xi = xs[N-1]
    sumi  += xi
    sumi2 += xi**2
    meani = sumi / N
    stdi = math.sqrt((sumi2 - (sumi**2 / winsz)) / winszm1)
    autocorr[0] = (sumi2 / N - (meani*meani)) / (stdi*stdi)

    return autocorr
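
# ----------------------------------------------------------------
# Sketch of the lag-zero normalization (added illustration, not part of the
# original API).  Note that autocorr[0] comes out as (N-1)/N rather than
# exactly 1.0, since the numerator is a population variance while the
# denominator uses sample standard deviations.
def _demo_autocorr_lag_zero():
    xs = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]
    print find_sample_autocorr(xs)[0] # (N-1)/N = 0.875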

# ----------------------------------------------------------------
# Straightforward reference version of the same estimator:
#
#   E[X_i X_j] - mu^2
#   -----------------
#        sigma^2
#
# It recomputes the window means and stddevs from scratch for each lag k.
#
# Example:
#
#   N = 8, k = 2
#
#   0 1 2 3 4 5
#   o o o o o o
#       o o o o o o
#       2 3 4 5 6 7
def find_sample_autocorr_old1(xs):
    N = len(xs)
    autocorr = [0.0] * N
    for k in range(0, N-1): # k = j-i
        window1 = xs[0:N-k]
        window2 = xs[k:N]
        mean1 = find_mean(window1)
        mean2 = find_mean(window2)
        std1 = find_stddev(window1, mean1)
        std2 = find_stddev(window2, mean2)
        cross_sum = 0.0
        for i in range(0, N-k):
            cross_sum += xs[i] * xs[i+k]
        cross_mean = cross_sum / (N-k)
        autocorr[k] = (cross_mean - mean1*mean2) / (std1 * std2)
    return autocorr
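
# ----------------------------------------------------------------
# Consistency sketch (added illustration, not part of the original API):
# the windowed-sum estimator (find_sample_autocorr) and the straightforward
# version (find_sample_autocorr_old1) should agree elementwise, up to
# floating-point roundoff.
def _demo_autocorr_agreement():
    xs = [0.3, -1.2, 2.5, 0.7, -0.4, 1.1, 0.0, -2.2]
    a = find_sample_autocorr(xs)
    b = find_sample_autocorr_old1(xs)
    for k in range(0, len(xs)):
        assert abs(a[k] - b[k]) < 1e-9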

# ----------------------------------------------------------------
# Vector mean:  varray is a list of N vectors, each having n elements.  This
# could be done more elegantly using calls to sackmat_m's vecadd, but it
# seems tighter performance-wise this way.
def find_vector_mean(varray):
    N = len(varray)
    n = len(varray[0])
    sum = [0.0] * n
    for vector in varray:
        for j in range(0, n):
            sum[j] += vector[j]
    for j in range(0, n):
        sum[j] *= (1.0/N)
    return sum

# ----------------------------------------------------------------
# Vector standard deviation:  varray is a list of N vectors, each having n
# elements.  This could be done more elegantly using calls to sackmat
# routines, but it seems tighter performance-wise this way.
def find_vector_stddev(varray, vmean):
    N = len(varray)
    n = len(varray[0])
    sum = [0.0] * n
    for vector in varray:
        for j in range(0, n):
            sum[j] += (vector[j] - vmean[j])**2
    denom = 1.0 / (N-1)
    for j in range(0, n):
        sum[j] = math.sqrt(sum[j] * denom)
    return sum

# ----------------------------------------------------------------
# find_sample_covariance_matrix
# ----------------------------------------------------------------
#
# There are nx vectors x, each having dim elements.
# The sample covariance matrix Q is dim x dim with elements
#
#   q[i][j] = (1/(nx-1)) sum_{k=1}^{nx} (x[k][i] - mu[i]) (x[k][j] - mu[j])
#
# where k indexes the set of x's and i,j index elements within the k'th
# vector.  I.e. the ij'th entry of Q is the (scalar) sample covariance of xi
# and xj.  In particular, the diagonal entries are the variances of the xi.
#
# Example:  nx = 3 and dim = 2.
#
#   x0 = [ 1 ]   x1 = [ 2 ]   x2 = [ 3 ]
#        [-4 ]        [-5 ]        [-6 ]
#
#   mu = [ 2 ]
#        [-5 ]
#
#   q[0][0] = (x[0][0] - mu[0]) * (x[0][0] - mu[0])
#           + (x[1][0] - mu[0]) * (x[1][0] - mu[0])
#           + (x[2][0] - mu[0]) * (x[2][0] - mu[0])
#
#           = ( 1 - 2) * ( 1 - 2)
#           + ( 2 - 2) * ( 2 - 2)
#           + ( 3 - 2) * ( 3 - 2)
#
#           = 1 + 0 + 1 = 2
#
#   q[0][1] = (x[0][0] - mu[0]) * (x[0][1] - mu[1])
#           + (x[1][0] - mu[0]) * (x[1][1] - mu[1])
#           + (x[2][0] - mu[0]) * (x[2][1] - mu[1])
#
#           = ( 1 - 2) * (-4 + 5)
#           + ( 2 - 2) * (-5 + 5)
#           + ( 3 - 2) * (-6 + 5)
#
#           = -1 + 0 - 1 = -2
#
#   q[1][0] = (x[0][1] - mu[1]) * (x[0][0] - mu[0])
#           + (x[1][1] - mu[1]) * (x[1][0] - mu[0])
#           + (x[2][1] - mu[1]) * (x[2][0] - mu[0])
#
#           = (-4 + 5) * ( 1 - 2)
#           + (-5 + 5) * ( 2 - 2)
#           + (-6 + 5) * ( 3 - 2)
#
#           = -1 + 0 - 1 = -2
#
#   q[1][1] = (x[0][1] - mu[1]) * (x[0][1] - mu[1])
#           + (x[1][1] - mu[1]) * (x[1][1] - mu[1])
#           + (x[2][1] - mu[1]) * (x[2][1] - mu[1])
#
#           = (-4 + 5) * (-4 + 5)
#           + (-5 + 5) * (-5 + 5)
#           + (-6 + 5) * (-6 + 5)
#
#           = 1 + 0 + 1 = 2
#
# With the 1/(nx-1) factor applied:
#
#   Q = [ 2 -2 ] * (1/2)  =  [ 1 -1 ]
#       [-2  2 ]             [-1  1 ]

def find_sample_covariance_matrix(xs):
    nx  = len(xs)
    dim = len(xs[0])
    mean = find_vector_mean(xs)
    Q = sackmat_m.make_zero_matrix(dim, dim)
    for i in range(0, dim):
        for j in range(0, dim):
            sum = 0.0
            for k in range(0, nx):
                sum += (xs[k][i] - mean[i]) * (xs[k][j] - mean[j])
            Q[i][j] = sum / (nx-1.0)
    return Q

# ----------------------------------------------------------------
#   Corr(X,Y)   = Cov(X,Y)   / (sigma_X  sigma_Y)
#   Corr(Xi,Xj) = Cov(Xi,Xj) / (sigma_Xi sigma_Xj)
# The ij'th entry of the correlation matrix is the correlation of Xi and Xj.
def find_sample_correlation_matrix(xs):
    nx  = len(xs)
    dim = len(xs[0])
    cov = find_sample_covariance_matrix(xs)
    vmean   = find_vector_mean(xs)
    vstddev = find_vector_stddev(xs, vmean)
    for i in range(0, dim):
        for j in range(0, dim):
            cov[i][j] /= vstddev[i] * vstddev[j]
    return cov
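
# ----------------------------------------------------------------
# The worked example from the comment block above, as runnable code (added
# illustration, not part of the original API).
def _demo_covariance_matrix():
    xs = [[1.0, -4.0], [2.0, -5.0], [3.0, -6.0]]
    print find_sample_covariance_matrix(xs) # [[1.0, -1.0], [-1.0, 1.0]]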

# ----------------------------------------------------------------
# Univariate linear regression
# ----------------------------------------------------------------
# There are N (xi, yi) pairs.  Minimize the squared error
#
#   E = sum (yi - m xi - b)^2
#
#   dE/dm = sum 2 (yi - m xi - b) (-xi) = 0
#   dE/db = sum 2 (yi - m xi - b) (-1)  = 0
#
#   sum (yi - m xi - b) (xi) = 0
#   sum (yi - m xi - b)      = 0
#
#   sum (xi yi - m xi^2 - b xi) = 0
#   sum (yi - m xi - b)         = 0
#
#   m sum(xi^2) + b sum(xi) = sum(xi yi)
#   m sum(xi)   + b N       = sum(yi)
#
#   [ sum(xi^2)  sum(xi) ] [ m ] = [ sum(xi yi) ]
#   [ sum(xi)    N       ] [ b ]   [ sum(yi)    ]
#
#   [ m ] = [ sum(xi^2)  sum(xi) ]^-1 [ sum(xi yi) ]
#   [ b ]   [ sum(xi)    N       ]    [ sum(yi)    ]
#
#         = [  N         -sum(xi)   ] [ sum(xi yi) ] * 1/D
#           [ -sum(xi)    sum(xi^2) ] [ sum(yi)    ]
#
# where
#
#   D = N sum(xi^2) - sum(xi)^2.
#
# So
#
#       N sum(xi yi) - sum(xi) sum(yi)
#   m = ------------------------------
#                     D
#
#       -sum(xi) sum(xi yi) + sum(xi^2) sum(yi)
#   b = ---------------------------------------
#                     D

def linear_regression(xs, ys):
    sumxi   = 0.0
    sumyi   = 0.0
    sumxiyi = 0.0
    sumxi2  = 0.0
    N = len(xs)
    for i in range(0, N):
        x = xs[i]
        y = ys[i]
        sumxi   += x
        sumyi   += y
        sumxiyi += x*y
        sumxi2  += x*x

    D = N * sumxi2 - sumxi**2
    m = (N * sumxiyi - sumxi * sumyi) / D
    b = (-sumxi * sumxiyi + sumxi2 * sumyi) / D

    # Young 1962, pp. 122-124.  Compute the sample variance of the linear
    # approximations, then the variances of m and b.
    var_z = 0.0
    for i in range(0, N):
        var_z += (m * xs[i] + b - ys[i])**2
    var_z /= N

    var_m = (N * var_z) / D
    var_b = (var_z * sumxi2) / D

    return [m, b, math.sqrt(var_m), math.sqrt(var_b)]

# ----------------------------------------------------------------
# PCA (total-least-squares) fit:  form the sample covariance matrix of the
# (x, y) pairs and take the principal axis, i.e. the eigenvector belonging to
# the eigenvalue of larger absolute value.  The fitted line has the slope of
# that axis and passes through the centroid (mean x, mean y).  Returns
# [m, b, quality], where quality = 1 - |lambda2|/|lambda1| is close to 1.0
# when the points are nearly collinear.
def linear_regression_pca(xs, ys):
    xy_pairs = []
    n = len(xs)
    for i in xrange(0, n):
        xy_pairs.append([xs[i], ys[i]])
    Q = find_sample_covariance_matrix(xy_pairs)
    [L, V] = sackmat_m.rs_eigensystem(Q)
    #print "Q:"; print Q
    #print "V:"; print V
    #print "L:"; print L

    lambda1 = L[0]
    lambda2 = L[1]
    abs1 = abs(lambda1)
    abs2 = abs(lambda2)

    a0 = 0.0
    a1 = 0.0
    if (abs1 > abs2):
        a0 = V[0][0]
        a1 = V[1][0]
    else:
        [abs1, abs2] = [abs2, abs1]
        [lambda1, lambda2] = [lambda2, lambda1]
        a0 = V[0][1]
        a1 = V[1][1]
    #print "lambda1: ",; print lambda1
    #print "abs1:    ",; print abs1

    quality = 0.0
    if abs2 > 0.0:
        quality = 1.0 - abs2/abs1

    mu = find_mean(xs)
    nu = find_mean(ys)
    #print "mu: ",; print mu
    #print "nu: ",; print nu

    m = a1/a0
    b = nu - m * mu
    #print "b:  ",; print b

    return [m, b, quality]
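
# ----------------------------------------------------------------
# A tiny check of the least-squares fit (added illustration, not part of the
# original API).  The data lie exactly on y = 2x + 1, so the fitted slope and
# intercept are 2 and 1, with zero error bars.
def _demo_linear_regression():
    xs = [0.0, 1.0, 2.0, 3.0]
    ys = [1.0, 3.0, 5.0, 7.0]
    print linear_regression(xs, ys) # [2.0, 1.0, 0.0, 0.0]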

# ----------------------------------------------------------------
def get_corr_coeff(xs, ys):
    sumxi   = 0.0
    sumyi   = 0.0
    sumxiyi = 0.0
    sumxi2  = 0.0
    sumyi2  = 0.0
    N = len(xs)
    for i in range(0, N):
        x = xs[i]
        y = ys[i]
        sumxi   += x
        sumyi   += y
        sumxiyi += x*y
        sumxi2  += x*x
        sumyi2  += y*y

    # Young 1962, p. 130.
    a = N*sumxiyi - sumxi*sumyi
    b = N*sumxi2  - sumxi**2
    c = N*sumyi2  - sumyi**2
    corr_coeff = a / math.sqrt(b*c)
    return corr_coeff

# ----------------------------------------------------------------
# Logistic regression.
#
#   log(p/(1-p)) = m*x + b
#
#   p(x, m, b) = 1 / (1 + exp(-m*x-b))
#              = exp(m*x+b) / (1 + exp(m*x+b))
#   1-p        = exp(-m*x-b) / (1 + exp(-m*x-b))
#              = 1 / (1 + exp(m*x+b))
#
#   dp/dm = -1 / [1 + exp(-m*x-b)]**2 * (-x) * exp(-m*x-b)
#         = [x exp(-m*x-b)] / [1 + exp(-m*x-b)]**2
#         = x * p * (1-p)
#
#   dp/db = -1 / [1 + exp(-m*x-b)]**2 * (-1) * exp(-m*x-b)
#         = [exp(-m*x-b)] / [1 + exp(-m*x-b)]**2
#         = p * (1-p)
#
# Write p_i for p(x_i, m, b).  The likelihood and log-likelihood are
#
#   L(m, b)   = prod_{i=0}^{N-1} [ p_i**y_i * (1-p_i)**(1-y_i) ]
#
#   ell(m, b) = sum_{i=0}^{N-1} [ y_i log(p_i) + (1-y_i) log(1-p_i) ]
#             = sum_{i=0}^{N-1} [ log(1-p_i) + y_i log(p_i/(1-p_i)) ]
#             = sum_{i=0}^{N-1} [ log(1-p_i) + y_i*(m*x_i+b) ]
#
#   d ell/dm = sum_{i=0}^{N-1} [ -1/(1-p_i) dp_i/dm + x_i*y_i ]
#            = sum_{i=0}^{N-1} [ -1/(1-p_i) x_i*p_i*(1-p_i) + x_i*y_i ]
#            = sum_{i=0}^{N-1} [ x_i*(y_i - p_i) ]
#
#   d ell/db = sum_{i=0}^{N-1} [ -1/(1-p_i) dp_i/db + y_i ]
#            = sum_{i=0}^{N-1} [ -1/(1-p_i) p_i*(1-p_i) + y_i ]
#            = sum_{i=0}^{N-1} [ y_i - p_i ]
#
#   d2ell/dm2  = sum_{i=0}^{N-1} [ -x_i dp_i/dm ]
#              = sum_{i=0}^{N-1} [ -x_i**2 * p_i * (1-p_i) ]
#
#   d2ell/dmdb = sum_{i=0}^{N-1} [ -x_i dp_i/db ]
#              = sum_{i=0}^{N-1} [ -x_i * p_i * (1-p_i) ]
#
#   d2ell/dbdm = sum_{i=0}^{N-1} [ -dp_i/dm ]
#              = sum_{i=0}^{N-1} [ -x_i * p_i * (1-p_i) ]
#
#   d2ell/db2  = sum_{i=0}^{N-1} [ -dp_i/db ]
#              = sum_{i=0}^{N-1} [ -p_i * (1-p_i) ]
#
# Newton-Raphson to maximize ell(m, b):
# * Pick m0, b0.
# * [m_{j+1}, b_{j+1}] = [m_j, b_j] - H^{-1} grad ell(m_j, b_j)
# * H = Hessian of ell =
#     [ d2ell/dm2   d2ell/dmdb ]
#     [ d2ell/dmdb  d2ell/db2  ]
# * grad ell =
#     [ dell/dm ]
#     [ dell/db ]
# ----------------------------------------------------------------

# p(x) and q(x) = 1 - p(x) for the logistic model.
def lrp(x, m, b):
    return 1.0 / (1.0 + math.exp(-m*x-b))

def lrq(x, m, b):
    return 1.0 / (1.0 + math.exp(m*x+b))
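
# A one-line sanity check (added illustration):  lrp and lrq are
# complementary probabilities, so they sum to 1 for any x, m, b.
def _demo_lrp_lrq():
    print lrp(0.5, 2.0, -1.0) + lrq(0.5, 2.0, -1.0) # 1.0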

def logistic_regression_aux(xs, ys, m0, b0, tol, maxits):
    N = len(xs)
    its = 0
    done = False
    while not done:
        dldm    = 0.0
        dldb    = 0.0
        d2ldm2  = 0.0
        d2ldmdb = 0.0
        d2ldb2  = 0.0
        for i in xrange(0, N):
            xi = xs[i]
            yi = ys[i]
            pi = lrp(xi, m0, b0)
            #print "x[%d]=%.12f," % (i, xi),
            #print "y[%d]=%.12f," % (i, yi),
            #print "p[%d]=%.12f" % (i, pi)
            qi = lrq(xi, m0, b0)

            dldm += xi*(yi - pi)
            dldb += yi - pi

            piqi = pi * qi; xipiqi = xi*piqi
            xi2piqi = xi*xipiqi
            d2ldm2  -= xi2piqi
            d2ldmdb -= xipiqi
            d2ldb2  -= piqi

        # L = 1.0
        # ell = 0.0
        # for i in xrange(0, N):
        #     xi = xs[i]
        #     yi = ys[i]
        #     pi = lrp(xi, m0, b0)
        #     qi = lrq(xi, m0, b0)
        #     L *= pi**yi * qi**(1-yi)
        #     ell += math.log(qi) + yi*(m0*xi+b0)
        #print
        #print "L=%.12f" % L
        #print "l=%.12f" % ell
        #print "its=%d" % its

        # Invert the 2x2 Hessian directly.
        ha = d2ldm2
        hb = d2ldmdb
        hc = d2ldmdb
        hd = d2ldb2
        D = ha*hd - hb*hc
        #print "H:"
        #print "%.12f" % ha,
        #print "%.12f" % hb
        #print "%.12f" % hc,
        #print "%.12f" % hd
        #print "D=%.12f" % D

        Hinva =  hd/D
        Hinvb = -hb/D
        Hinvc = -hc/D
        Hinvd =  ha/D
        #print "Hinv:"
        #print "%.12f" % Hinva,
        #print "%.12f" % Hinvb
        #print "%.12f" % Hinvc,
        #print "%.12f" % Hinvd

        Hinvgradm = Hinva*dldm + Hinvb*dldb
        Hinvgradb = Hinvc*dldm + Hinvd*dldb
        #print "grad ell:"
        #print "%.12f" % dldm
        #print "%.12f" % dldb
        #print "-Hinv grad ell:"
        #print "%.12f" % (-Hinvgradm)
        #print "%.12f" % (-Hinvgradb)

        m = m0 - Hinvgradm
        b = b0 - Hinvgradb
        #print "m0=%.12f," % m0,
        #print "m=%.12f" % m
        #print "b0=%.12f," % b0,
        #print "b=%.12f" % b

        err = math.sqrt(Hinvgradm**2 + Hinvgradb**2)
        #print "err=%.12f" % err
        if err < tol:
            done = True
        its += 1
        if its > maxits:
            raise RuntimeError(
                "logistic_regression_aux: Newton-Raphson convergence "
                "failed after %d iterations" % maxits)
        m0 = m
        b0 = b
    return [m, b]

def logistic_regression(xs, ys):
    return logistic_regression_aux(xs, ys, -0.001, 0.002, 1e-9, 50)

# ----------------------------------------------------------------
def zscore(x):
    return normal_m.invnorm(x)

# ----------------------------------------------------------------
# The z's depend only on the number of data points; the caller should sort
# the x's before pairing them with these quantiles.
def get_normal_quantiles_zs(xarray):
    n = len(xarray)
    zarray = []
    for i in range(0, n):
        z = zscore((1.0*(i+0.5))/n)
        zarray.append(z)
    return zarray

# ----------------------------------------------------------------
def print_normal_quantiles_zx(xarray):
    xs = copy.copy(xarray)
    xs.sort()
    zs = get_normal_quantiles_zs(xs)
    n = len(xs)
    for i in range(0, n):
        print zs[i], xs[i]

# ----------------------------------------------------------------
# Suppose x_1, ..., x_N are IID samples of a random variable X.  The kernel
# density estimator of the PDF of X, with bandwidth parameter h, is the
# function
#
#   KDE(x) = (1/(N h)) sum_i K((x - x_i)/h)
#
# where the kernel K may be taken to be the PDF of the standard normal.
def kernel_density_estimator(x, xarray, h):
    N = len(xarray)
    sum = 0.0
    for xi in xarray:
        sum += normal_m.normalpdf((x-xi)/h)
    return sum/N/h

def plot_kde(xarray, h, xlo, xhi, nx):
    dx = (xhi - xlo) / (1.0*nx)
    for i in range(0, nx+1):
        x = xlo + i*dx
        y = kernel_density_estimator(x, xarray, h)
        print x, y
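
# ----------------------------------------------------------------
# A minimal smoke test for the KDE plot (added illustration, not part of the
# original API):  prints (x, KDE(x)) pairs for three sample points with
# bandwidth 0.3, suitable for piping into a plotting tool.
def _demo_kde():
    plot_kde([0.0, 0.5, 1.0], 0.3, -1.0, 2.0, 30)

if __name__ == '__main__':
    _demo_kde()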